diff --git a/CLAUDE.md b/CLAUDE.md index a8b25d85..704f4be1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -150,6 +150,10 @@ Multiple Claude Code instances run concurrently in this repo. **Every session mu - Never add AI co-authorship lines (`Co-Authored-By` or similar) to commit messages. - Never add "Built with Claude Code", "Generated with Claude Code", or any variation referencing Claude Code or Anthropic to commit messages, PR descriptions, code comments, or any other output. +## PR Reviews (Greptile) + +This repo uses [Greptile](https://greptile.com) for automated PR reviews. After pushing fixes that address review feedback, trigger a re-review by commenting `@greptileai` on the PR. Do **not** use the GitHub "re-request review" API — Greptile only responds to the comment trigger. + ## Node Version Requires Node >= 20. diff --git a/README.md b/README.md index 91f98c77..cdc82d77 100644 --- a/README.md +++ b/README.md @@ -583,15 +583,16 @@ const { results: fused } = await multiSearchData( ## 🗺️ Roadmap -See **[ROADMAP.md](ROADMAP.md)** for the full development roadmap. Current plan: +See **[ROADMAP.md](ROADMAP.md)** for the full development roadmap and **[STABILITY.md](STABILITY.md)** for the stability policy and versioning guarantees. Current plan: 1. ~~**Rust Core**~~ — **Complete** (v1.3.0) — native tree-sitter parsing via napi-rs, parallel multi-core parsing, incremental re-parsing, import resolution & cycle detection in Rust 2. ~~**Foundation Hardening**~~ — **Complete** (v1.4.0) — parser registry, 12-tool MCP server with multi-repo support, test coverage 62%→75%, `apiKeyCommand` secret resolution, global repo registry -3. **Intelligent Embeddings** — LLM-generated descriptions, hybrid search -4. **Natural Language Queries** — `codegraph ask` command, conversational sessions -5. **Expanded Language Support** — 8 new languages (12 → 20) -6. **GitHub Integration & CI** — reusable GitHub Action, PR review, SARIF output -7. 
**Visualization & Advanced** — web UI, dead code detection, monorepo support, agentic search +3. **Architectural Refactoring** — parser plugin system, repository pattern, pipeline builder, engine strategy, domain errors, curated API +4. **Intelligent Embeddings** — LLM-generated descriptions, hybrid search +5. **Natural Language Queries** — `codegraph ask` command, conversational sessions +6. **Expanded Language Support** — 8 new languages (12 → 20) +7. **GitHub Integration & CI** — reusable GitHub Action, PR review, SARIF output +8. **Visualization & Advanced** — web UI, dead code detection, monorepo support, agentic search ## 🤝 Contributing diff --git a/ROADMAP.md b/ROADMAP.md index 3e606b52..32620e5f 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,7 +2,7 @@ > **Current version:** 1.4.0 | **Status:** Active development | **Updated:** February 2026 -Codegraph is a strong local-first code graph CLI. This roadmap describes planned improvements across eight phases — closing gaps with commercial code intelligence platforms while preserving codegraph's core strengths: fully local, open source, zero cloud dependency by default. +Codegraph is a strong local-first code graph CLI. This roadmap describes planned improvements across seven phases — closing gaps with commercial code intelligence platforms while preserving codegraph's core strengths: fully local, open source, zero cloud dependency by default. **LLM strategy:** All LLM-powered features are **optional enhancements**. Everything works without an API key. When configured (OpenAI, Anthropic, Ollama, or any OpenAI-compatible endpoint), users unlock richer semantic search and natural language queries. @@ -14,23 +14,21 @@ Codegraph is a strong local-first code graph CLI. 
This roadmap describes planned |-------|-------|-----------------|--------| | [**1**](#phase-1--rust-core) | Rust Core | Rust parsing engine via napi-rs, parallel parsing, incremental tree-sitter, JS orchestration layer | **Complete** (v1.3.0) | | [**2**](#phase-2--foundation-hardening) | Foundation Hardening | Parser registry, complete MCP, test coverage, enhanced config, multi-repo MCP | **Complete** (v1.4.0) | -| [**3**](#phase-3--architectural-refactoring) | Architectural Refactoring | Parser plugin system, repository pattern, pipeline builder, engine strategy, analysis/formatting split, domain errors, CLI commands, composable MCP, curated API | Planned | -| [**4**](#phase-4--intelligent-embeddings) | Intelligent Embeddings | LLM-generated descriptions, hybrid search | Planned | -| [**5**](#phase-5--natural-language-queries) | Natural Language Queries | `ask` command, conversational sessions | Planned | -| [**6**](#phase-6--expanded-language-support) | Expanded Language Support | 8 new languages (12 → 20), parser utilities | Planned | -| [**7**](#phase-7--github-integration--ci) | GitHub Integration & CI | Reusable GitHub Action, PR review, SARIF output | Planned | -| [**8**](#phase-8--interactive-visualization--advanced-features) | Visualization & Advanced | Web UI, dead code detection, monorepo, agentic search | Planned | +| [**3**](#phase-3--intelligent-embeddings) | Intelligent Embeddings | LLM-generated descriptions, hybrid search | Planned | +| [**4**](#phase-4--natural-language-queries) | Natural Language Queries | `ask` command, conversational sessions | Planned | +| [**5**](#phase-5--expanded-language-support) | Expanded Language Support | 8 new languages (12 → 20), parser utilities | Planned | +| [**6**](#phase-6--github-integration--ci) | GitHub Integration & CI | Reusable GitHub Action, PR review, SARIF output | Planned | +| [**7**](#phase-7--interactive-visualization--advanced-features) | Visualization & Advanced | Web UI, dead code detection, 
monorepo, agentic search | Planned | ### Dependency graph ``` Phase 1 (Rust Core) └──→ Phase 2 (Foundation Hardening) - └──→ Phase 3 (Architectural Refactoring) - ├──→ Phase 4 (Embeddings) ──→ Phase 5 (NL Queries) - ├──→ Phase 6 (Languages) - └──→ Phase 7 (GitHub/CI) -Phases 1-5 ──→ Phase 8 (Visualization & Advanced) + ├──→ Phase 3 (Embeddings) ──→ Phase 4 (NL Queries) + ├──→ Phase 5 (Languages) + └──→ Phase 6 (GitHub/CI) +Phases 1-4 ──→ Phase 7 (Visualization & Advanced) ``` --- @@ -189,297 +187,11 @@ Support querying multiple codebases from a single MCP server instance. --- -## Phase 3 — Architectural Refactoring - -**Goal:** Restructure the codebase for modularity, testability, and long-term maintainability. These are internal improvements — no new user-facing features, but they make every subsequent phase easier to build and maintain. - -> Reference: [generated/architecture.md](generated/architecture.md) — full analysis with code examples and rationale. - -### 3.1 — Parser Plugin System - -Split `parser.js` (2,200+ lines) into a modular directory structure with isolated per-language extractors. - -``` -src/parser/ - index.js # Public API: parseFileAuto, parseFilesAuto - registry.js # LANGUAGE_REGISTRY + extension mapping - engine.js # Native/WASM init, engine resolution, grammar loading - tree-utils.js # findChild, findParentClass, walkTree helpers - base-extractor.js # Shared walk loop + accumulator framework - extractors/ - javascript.js # JS/TS/TSX - python.js - go.js - rust.js - java.js - csharp.js - ruby.js - php.js - hcl.js -``` - -Introduce a `BaseExtractor` that owns the tree walk loop. Each language extractor declares a `nodeType → handler` map instead of reimplementing the traversal. Eliminates repeated walk-and-switch boilerplate across 9+ extractors. 
- -**Affected files:** `src/parser.js` → split into `src/parser/` - -### 3.2 — Repository Pattern for Data Access - -Consolidate all SQL into a single `Repository` class. Currently SQL is scattered across `builder.js`, `queries.js`, `embedder.js`, `watcher.js`, and `cycles.js`. - -``` -src/db/ - connection.js # Open, WAL mode, pragma tuning - migrations.js # Schema versions - repository.js # ALL data access methods (reads + writes) -``` - -All prepared statements, index tuning, and schema knowledge live in one place. Consumers never see SQL. Enables an `InMemoryRepository` for fast unit tests. - -**Affected files:** `src/db.js` → split into `src/db/`, SQL extracted from `builder.js`, `queries.js`, `embedder.js`, `watcher.js`, `cycles.js` - -### 3.3 — Analysis / Formatting Separation - -Split `queries.js` (800+ lines) into pure analysis modules and presentation formatters. - -``` -src/analysis/ # Pure data: take repository, return typed results - impact.js - call-chain.js - diff-impact.js - module-map.js - class-hierarchy.js - -src/formatters/ # Presentation: take data, produce strings - cli-formatter.js - json-formatter.js - table-formatter.js -``` - -Analysis modules return pure data. The CLI, MCP server, and programmatic API each pick their own formatter (or none). Eliminates the `*Data()` / `*()` dual-function pattern. - -**Affected files:** `src/queries.js` → split into `src/analysis/` + `src/formatters/` - -### 3.4 — Builder Pipeline Architecture - -Refactor `buildGraph()` from a monolithic mega-function into explicit, independently testable pipeline stages. 
- -```js -const pipeline = [ - collectFiles, // (rootDir, config) => filePaths[] - detectChanges, // (filePaths, db) => { changed, removed, isFullBuild } - parseFiles, // (filePaths, engineOpts) => Map - insertNodes, // (symbolMap, db) => nodeIndex - resolveImports, // (symbolMap, rootDir, aliases) => importEdges[] - buildCallEdges, // (symbolMap, nodeIndex) => callEdges[] - buildClassEdges, // (symbolMap, nodeIndex) => classEdges[] - resolveBarrels, // (edges, symbolMap) => resolvedEdges[] - insertEdges, // (allEdges, db) => stats -] -``` - -Watch mode reuses the same stages (triggered per-file instead of per-project), eliminating the divergence between `watcher.js` and `builder.js` where bug fixes must be applied separately. - -**Affected files:** `src/builder.js`, `src/watcher.js` - -### 3.5 — Unified Engine Interface - -Replace scattered `engine.name === 'native'` branching with a Strategy pattern. Every consumer receives an engine object with the same API regardless of backend. - -```js -const engine = createEngine(opts) // returns same interface for native or WASM -engine.parseFile(path, source) -engine.resolveImports(batch, rootDir, aliases) -engine.detectCycles(db) -``` - -Consumers never branch on native vs WASM. Adding a third backend (e.g., remote parsing service) requires zero consumer changes. - -**Affected files:** `src/parser.js`, `src/resolve.js`, `src/cycles.js`, `src/builder.js`, `src/native.js` - -### 3.6 — Qualified Names & Hierarchical Scoping - -Enrich the node model with scope information to reduce ambiguity. - -```sql -ALTER TABLE nodes ADD COLUMN qualified_name TEXT; -- 'DateHelper.format' -ALTER TABLE nodes ADD COLUMN scope TEXT; -- 'DateHelper' -ALTER TABLE nodes ADD COLUMN visibility TEXT; -- 'public' | 'private' | 'protected' -``` - -Enables queries like "all methods of class X" without traversing edges. Reduces reliance on heuristic confidence scoring for name collisions. 
- -**Affected files:** `src/db.js`, `src/parser.js` (extractors), `src/queries.js`, `src/builder.js` - -### 3.7 — Composable MCP Tool Registry - -Replace the monolithic `TOOLS` array + `switch` dispatch in `mcp.js` with self-contained tool modules. - -``` -src/mcp/ - server.js # MCP server setup, transport, lifecycle - tool-registry.js # Dynamic tool registration + auto-discovery - tools/ - query-function.js # { schema, handler } per tool - file-deps.js - impact-analysis.js - ... -``` - -Adding a new MCP tool = adding a file. No other files change. - -**Affected files:** `src/mcp.js` → split into `src/mcp/` - -### 3.8 — CLI Command Objects - -Move from inline Commander chains in `cli.js` to self-contained command modules. - -``` -src/cli/ - index.js # Commander setup, auto-discover commands - commands/ - build.js # { name, description, options, validate, execute } - query.js - impact.js - ... -``` - -Each command is independently testable by calling `execute()` directly. The CLI index auto-discovers and registers them. - -**Affected files:** `src/cli.js` → split into `src/cli/` - -### 3.9 — Domain Error Hierarchy - -Replace ad-hoc error handling (mix of thrown `Error`, returned `null`, `logger.warn()`, `process.exit(1)`) with structured domain errors. - -```js -class CodegraphError extends Error { constructor(message, { code, file, cause }) { ... } } -class ParseError extends CodegraphError { code = 'PARSE_FAILED' } -class DbError extends CodegraphError { code = 'DB_ERROR' } -class ConfigError extends CodegraphError { code = 'CONFIG_INVALID' } -class ResolutionError extends CodegraphError { code = 'RESOLUTION_FAILED' } -class EngineError extends CodegraphError { code = 'ENGINE_UNAVAILABLE' } -``` - -CLI catches domain errors and formats for humans. MCP returns structured error responses. No more `process.exit()` from library code. 
- -**New file:** `src/errors.js` - -### 3.10 — Curated Public API Surface - -Reduce `index.js` from ~40 re-exports to a curated public API. Use `package.json` `exports` field to enforce module boundaries. - -```json -{ "exports": { ".": "./src/index.js", "./cli": "./src/cli.js" } } -``` - -Internal modules become truly internal. Consumers can only import from documented entry points. - -**Affected files:** `src/index.js`, `package.json` - -### 3.11 — Embedder Subsystem Extraction - -Restructure `embedder.js` (525 lines) into a standalone subsystem with pluggable vector storage. - -``` -src/embeddings/ - index.js # Public API - model-registry.js # Model definitions, batch sizes, loading - generator.js # Source → text preparation → batch embedding - store.js # Vector storage (pluggable: SQLite blob, HNSW index) - search.js # Similarity search, RRF multi-query fusion -``` - -Decouples embedding schema from the graph DB. The pluggable store interface enables future O(log n) ANN search (e.g., `hnswlib-node`) when symbol counts reach 50K+. - -**Affected files:** `src/embedder.js` → split into `src/embeddings/` - -### 3.12 — Testing Pyramid - -Add proper unit test layer below the existing integration tests. - -- Pure unit tests for extractors (pass AST node, assert symbols — no file I/O) -- Pure unit tests for BFS/Tarjan algorithms (pass adjacency list, assert result) -- Pure unit tests for confidence scoring (pass parameters, assert score) -- Repository mock for query tests (in-memory data, no SQLite) -- E2E tests that invoke the CLI binary and assert exit codes + stdout - -The repository pattern (3.2) directly enables this: unit tests use `InMemoryRepository`, integration tests use `SqliteRepository`. - -### 3.13 — Event-Driven Pipeline - -Add an event/streaming architecture to the build pipeline for progress reporting, cancellation, and large-repo support. 
- -```js -pipeline.on('file:parsed', (file, symbols) => { /* progress */ }) -pipeline.on('file:indexed', (file, nodeCount) => { /* progress */ }) -pipeline.on('build:complete', (stats) => { /* summary */ }) -pipeline.on('error', (file, err) => { /* continue or abort */ }) -await pipeline.run(rootDir) -``` - -Unifies build and watch code paths. Large builds stream results to the DB incrementally instead of buffering in memory. - -**Affected files:** `src/builder.js`, `src/watcher.js`, `src/cli.js` - -### 3.14 — Subgraph Export Filtering - -Add focus/filter options to the export module so visualizations are usable for real projects. - -```bash -codegraph export --format dot --focus src/builder.js --depth 2 -codegraph export --format mermaid --filter "src/api/**" --kind function -codegraph export --format json --changed -``` - -The export module receives a subgraph specification (focus node + depth, file pattern, kind filter) and extracts the relevant subgraph before formatting. - -**Affected files:** `src/export.js`, `src/cli.js` - -### 3.15 — Transitive Import-Aware Confidence - -Before falling back to proximity heuristics, walk the import graph from the caller file. If any import path (even indirect through barrel files) reaches a candidate, score it 0.9. Only fall back to proximity when no import path exists. - -**Affected files:** `src/resolve.js`, `src/builder.js` - -### 3.16 — Query Result Caching - -Add a TTL/LRU cache between the analysis layer and the repository. Particularly valuable for MCP where an agent session may repeatedly query related symbols. - -```js -class QueryCache { - constructor(db, maxAge = 60_000) { ... } - get(key) { ... } // key = query name + args hash - set(key, value) { ... } - invalidate() { ... } // called after any DB mutation -} -``` - -### 3.17 — Configuration Profiles - -Support profile-based configuration for monorepos with multiple services. 
- -```json -{ - "profiles": { - "backend": { "include": ["services/api/**"], "build": { "dbPath": ".codegraph/api.db" } }, - "frontend": { "include": ["apps/web/**"], "build": { "dbPath": ".codegraph/web.db" } } - } -} -``` - -```bash -codegraph build --profile backend -``` - -**Affected files:** `src/config.js`, `src/cli.js` - ---- - -## Phase 4 — Intelligent Embeddings +## Phase 3 — Intelligent Embeddings **Goal:** Dramatically improve semantic search quality by embedding natural-language descriptions instead of raw code. -### 4.1 — LLM Description Generator +### 3.1 — LLM Description Generator For each function/method/class node, generate a concise natural-language description: @@ -507,7 +219,7 @@ For each function/method/class node, generate a concise natural-language descrip **New file:** `src/describer.js` -### 4.2 — Enhanced Embedding Pipeline +### 3.2 — Enhanced Embedding Pipeline - When descriptions exist, embed the description text instead of raw code - Keep raw code as fallback when no description is available @@ -518,7 +230,7 @@ For each function/method/class node, generate a concise natural-language descrip **Affected files:** `src/embedder.js` -### 4.3 — Hybrid Search +### 3.3 — Hybrid Search Combine vector similarity with keyword matching. @@ -533,11 +245,11 @@ Combine vector similarity with keyword matching. --- -## Phase 5 — Natural Language Queries +## Phase 4 — Natural Language Queries **Goal:** Allow developers to ask questions about their codebase in plain English. -### 5.1 — Query Engine +### 4.1 — Query Engine ```bash codegraph ask "How does the authentication flow work?" @@ -563,7 +275,7 @@ codegraph ask "How does the authentication flow work?" **New file:** `src/nlquery.js` -### 5.2 — Conversational Sessions +### 4.2 — Conversational Sessions Multi-turn conversations with session memory. 
@@ -577,7 +289,7 @@ codegraph sessions clear - Store conversation history in SQLite table `sessions` - Include prior Q&A pairs in subsequent prompts -### 5.3 — MCP Integration +### 4.3 — MCP Integration New MCP tool: `ask_codebase` — natural language query via MCP. @@ -587,11 +299,11 @@ Enables AI coding agents (Claude Code, Cursor, etc.) to ask codegraph questions --- -## Phase 6 — Expanded Language Support +## Phase 5 — Expanded Language Support **Goal:** Go from 12 → 20 supported languages. -### 6.1 — Batch 1: High Demand +### 5.1 — Batch 1: High Demand | Language | Extensions | Grammar | Effort | |----------|-----------|---------|--------| @@ -600,7 +312,7 @@ Enables AI coding agents (Claude Code, Cursor, etc.) to ask codegraph questions | Kotlin | `.kt`, `.kts` | `tree-sitter-kotlin` | Low | | Swift | `.swift` | `tree-sitter-swift` | Medium | -### 6.2 — Batch 2: Growing Ecosystems +### 5.2 — Batch 2: Growing Ecosystems | Language | Extensions | Grammar | Effort | |----------|-----------|---------|--------| @@ -609,7 +321,7 @@ Enables AI coding agents (Claude Code, Cursor, etc.) to ask codegraph questions | Lua | `.lua` | `tree-sitter-lua` | Low | | Zig | `.zig` | `tree-sitter-zig` | Low | -### 6.3 — Parser Abstraction Layer +### 5.3 — Parser Abstraction Layer Extract shared patterns from existing extractors into reusable helpers. @@ -625,11 +337,11 @@ Extract shared patterns from existing extractors into reusable helpers. --- -## Phase 7 — GitHub Integration & CI +## Phase 6 — GitHub Integration & CI **Goal:** Bring codegraph's analysis into pull request workflows. -### 7.1 — Reusable GitHub Action +### 6.1 — Reusable GitHub Action A reusable GitHub Action that runs on PRs: @@ -651,7 +363,7 @@ A reusable GitHub Action that runs on PRs: **New file:** `.github/actions/codegraph-ci/action.yml` -### 7.2 — PR Review Integration +### 6.2 — PR Review Integration ```bash codegraph review --pr @@ -667,7 +379,7 @@ Requires `gh` CLI. 
For each changed function: **New file:** `src/github.js` -### 7.3 — SARIF Output +### 6.3 — SARIF Output Add SARIF output format for cycle detection. SARIF integrates with GitHub Code Scanning, showing issues inline in the PR. @@ -675,9 +387,9 @@ Add SARIF output format for cycle detection. SARIF integrates with GitHub Code S --- -## Phase 8 — Interactive Visualization & Advanced Features +## Phase 7 — Interactive Visualization & Advanced Features -### 8.1 — Interactive Web Visualization +### 7.1 — Interactive Web Visualization ```bash codegraph viz @@ -697,7 +409,7 @@ Opens a local web UI at `localhost:3000` with: **New file:** `src/visualizer.js` -### 8.2 — Dead Code Detection +### 7.2 — Dead Code Detection ```bash codegraph dead @@ -708,7 +420,7 @@ Find functions/methods/classes with zero incoming edges (never called). Filters **Affected files:** `src/queries.js` -### 8.3 — Cross-Repository Support (Monorepo) +### 7.3 — Cross-Repository Support (Monorepo) Support multi-package monorepos with cross-package edges. @@ -718,7 +430,7 @@ Support multi-package monorepos with cross-package edges. - `codegraph build --workspace` to scan all packages - Impact analysis across package boundaries -### 8.4 — Agentic Search +### 7.4 — Agentic Search Recursive reference-following search that traces connections. 
@@ -750,12 +462,11 @@ Each phase includes targeted verification: |-------|-------------| | **1** | Benchmark native vs WASM parsing on a large repo, verify identical output from both engines | | **2** | `npm test`, manual MCP client test for all tools, config loading tests | -| **3** | All existing tests pass after refactoring; new unit tests for each extracted module; zero behavior changes | -| **4** | Compare `codegraph search` quality before/after descriptions on a real repo | -| **5** | `codegraph ask "How does import resolution work?"` against codegraph itself | -| **6** | Parse sample files for each new language, verify definitions/calls/imports | -| **7** | Test PR in a fork, verify GitHub Action comment is posted | -| **8** | `codegraph viz` loads, nodes are interactive, search works | +| **3** | Compare `codegraph search` quality before/after descriptions on a real repo | +| **4** | `codegraph ask "How does import resolution work?"` against codegraph itself | +| **5** | Parse sample files for each new language, verify definitions/calls/imports | +| **6** | Test PR in a fork, verify GitHub Action comment is posted | +| **7** | `codegraph viz` loads, nodes are interactive, search works | **Full integration test** after all phases: diff --git a/STABILITY.md b/STABILITY.md new file mode 100644 index 00000000..f9a97ccc --- /dev/null +++ b/STABILITY.md @@ -0,0 +1,125 @@ +# Stability Policy + +> **Status: Anticipated — not yet active.** +> This policy describes the stability guarantees codegraph *will* provide once the public API surface stabilizes after [Phase 3 — Architectural Refactoring](ROADMAP.md). Until then, breaking changes may still land in minor releases as the internal architecture is restructured. 
+ +--- + +## Signal Status + +| Signal | Current Status | Planned | +|--------|---------------|---------| +| Stability policy | This document (anticipated) | Active after Phase 3 | +| Deprecation warnings | Not yet | Phase 3+ | +| Migration guides | Partial (v1→v2: "rebuild required") | Every major going forward | +| Semantic versioning policy | SemVer followed, no support window | Phase 3+ | +| LTS / release tracks | No | When adoption warrants | +| API reference docs | CLI `--help` only | Phase 3+ | +| `@deprecated` annotations | No | Phase 3+ | +| MCP tool schema versioning | No | Phase 3+ | + +--- + +## 1. Semantic Versioning + +Codegraph follows [SemVer 2.0.0](https://semver.org/). Once this policy is active, version bumps will be governed by these rules: + +### What counts as breaking (requires major bump) + +- Removing or renaming CLI commands or flags +- Changing MCP tool names, required parameters, or response shapes +- Removing or renaming programmatic exports from `index.js` +- DB schema changes that require a full rebuild (without automatic migration) + +### What is NOT breaking + +- Internal function signatures (anything not exported from `index.js`) +- Output formatting tweaks (column widths, colors, human-readable text) +- Performance improvements +- New additive features (new commands, new optional flags, new MCP tools) +- Bug fixes that correct previously incorrect behavior + +### Support window + +**TBD.** The plan is to support at least one previous major version with critical bug and security fixes after a new major is released. The exact window will be defined when this policy activates. + +--- + +## 2. Deprecation Policy + +Before removing any public API surface, codegraph will provide advance notice: + +1. **`@deprecated` JSDoc annotation** on the function or method, with a message pointing to the replacement. +2. 
**Runtime `console.warn`** on first use per process, e.g.: + ``` + [codegraph] DEPRECATED: queryNameData() will be removed in v4.0. Use querySymbol() instead. + ``` +3. **Minimum deprecation window:** one minor release cycle before removal. The deprecation notice ships in version N.x, the removal lands no earlier than version (N+1).0. + +### Scope + +Deprecation notices apply to: + +- Exported functions and classes in `index.js` +- CLI commands and flags +- MCP tool schemas (tool names, parameter names, response properties) + +Internal functions not exported from `index.js` may be changed or removed without deprecation notices. + +--- + +## 3. Migration Guides + +Starting with the next major version, every major release will ship with a migration guide covering: + +- What changed and why +- Step-by-step upgrade instructions +- Before/after code examples for breaking API changes +- DB migration steps (if any) + +Migration guides will be published in `docs/` alongside the release. + +### Retroactive acknowledgment + +**v1 → v2** required a full `codegraph build` to regenerate the graph database. No migration guide was published at the time. Going forward, this gap will not recur. + +--- + +## 4. Release Tracks + +### Current (active now) + +The **Current** track receives all new features and improvements. Breaking changes land in major versions. This is the only active track today. + +### LTS (planned, not yet active) + +An **LTS** (Long-Term Support) track is anticipated when adoption warrants it. When activated: + +- LTS releases receive security fixes and critical bug fixes only +- LTS support window: N months after the next major version ships (exact duration TBD) +- LTS releases will not receive new features + +LTS will be activated based on community adoption and demand — there is no fixed date. + +--- + +## 5. 
API Reference + +| Surface | Current State | Planned | +|---------|--------------|---------| +| CLI commands | Documented via `--help` and [README](README.md#-commands) | No change needed | +| Programmatic API (`index.js`) | Documented in README examples | Auto-generated JSDoc reference (Phase 3+) | +| MCP tools | Documented in [AI Agent Guide](docs/ai-agent-guide.md) | Versioned schema reference (Phase 3+) | + +The planned auto-generated reference will cover all public exports from `index.js` with full type signatures, parameter descriptions, and usage examples. + +--- + +## 6. MCP Tool Schema Versioning + +MCP tool schemas — tool names, parameter shapes (names, types, required/optional), and response shapes — are part of the public API. Once this policy is active: + +- **Breaking schema changes** (renaming a tool, removing a parameter, changing a response shape) require a **major version bump**. +- **Additive changes** (new optional parameters, new tools, new response fields) are **non-breaking** and may land in minor versions. + +This ensures that AI agents relying on codegraph's MCP tools will not break silently on upgrade. diff --git a/docs/dogfooding-guide.md b/docs/dogfooding-guide.md deleted file mode 100644 index 14c71e19..00000000 --- a/docs/dogfooding-guide.md +++ /dev/null @@ -1,102 +0,0 @@ -# Codegraph Dogfooding Guide - -Codegraph analyzing its own codebase. This guide documents findings from self-analysis and lists improvements — both automated fixes already applied and items requiring human judgment. - -## Running the Self-Analysis - -```bash -# Build the graph (from repo root) -node src/cli.js build . 
- -# Core analysis commands -node src/cli.js cycles # Circular dependency check -node src/cli.js cycles --functions # Function-level cycles -node src/cli.js map --limit 20 --json # Module coupling overview -node src/cli.js diff-impact main --json # Impact of current branch -node src/cli.js deps src/.js # File dependency inspection -node src/cli.js fn # Function call chain trace -node src/cli.js fn-impact # What breaks if function changes -``` - -## Action Items - -These findings require human judgment to address properly: - -### HIGH PRIORITY - -#### 1. parser.js is a 2200+ line monolith (47 function definitions) -**Found by:** `codegraph deps src/parser.js` and `codegraph map` - -`parser.js` has the highest fan-in (14 files import it) and contains extractors for **all 11 languages** in a single file. Each language extractor (Python, Go, Rust, Java, C#, PHP, Ruby, HCL) has its own `walk()` function, creating duplicate names that confuse function-level analysis. - -**Recommendation:** Split per-language extractors into separate files under `src/extractors/`: -``` -src/extractors/ - javascript.js # JS/TS/TSX extractor (currently inline) - python.js # extractPythonSymbols + findPythonParentClass + walk - go.js # extractGoSymbols + walk - rust.js # extractRustSymbols + extractRustUsePath + walk - java.js # extractJavaSymbols + findJavaParentClass + walk - csharp.js # extractCSharpSymbols + extractCSharpBaseTypes + walk - ruby.js # extractRubySymbols + findRubyParentClass + walk - php.js # extractPHPSymbols + findPHPParentClass + walk - hcl.js # extractHCLSymbols + walk -``` -**Impact:** Would improve codegraph's own function-level analysis (no more ambiguous `walk` matches), make each extractor independently testable, and reduce the cognitive load of the file. - -**Trade-off:** The Rust native engine already has this structure (`crates/codegraph-core/src/extractors/`). Aligning the WASM extractors would create parity. - - -### MEDIUM PRIORITY - -#### 3. 
builder.js has the highest fan-out (7 dependencies) -**Found by:** `codegraph map` - -`builder.js` imports from 7 modules: config, constants, db, logger, parser, resolve, and structure. As the build orchestrator this is somewhat expected, but it also means any change to builder.js has wide blast radius. - -**Recommendation:** Consider whether the `structure.js` integration (already lazy-loaded via dynamic import) pattern could apply to other optional post-build steps. - -#### 4. watcher.js fan-out vs fan-in imbalance (5 out, 2 in) -**Found by:** `codegraph map` - -The watcher depends on 5 modules but only 2 modules reference it. This suggests it might be pulling in more than it needs. - -**Recommendation:** Review whether watcher.js can use more targeted imports or lazy-load some dependencies. - -#### 5. diff-impact runs git in temp directories (test fragility) -**Found by:** Integration test output showing `git diff --no-index` errors in temp directories - -The `diff-impact` command runs `git diff` which fails in non-git temp directories used by tests. The error output is noisy but doesn't fail the test. - -**Recommendation:** Guard the git call or skip gracefully when not in a git repo. - -### LOW PRIORITY - -#### 6. Consider adding a `codegraph stats` command -There's no single command that shows a quick overview of graph health: node/edge counts, cycle count, top coupling hotspots, fan-out outliers. Currently you need to run `map`, `cycles`, and read the build output separately. - -#### 7. Embed and search the codebase itself -Running `codegraph embed .` and then `codegraph search "build dependency graph"` on the codegraph repo would exercise the embedding pipeline and could surface naming/discoverability issues in the API. - -## Known Environment Issue - -On this workstation, changes to files not already tracked as modified on the current git branch (`docs/architecture-audit`) get reverted by an external process (likely a VS Code extension). 
If you're applying the parser.js cycle fix, do it from a fresh branch or commit immediately. - -## Periodic Self-Check Routine - -Run this after significant changes: - -```bash -# 1. Rebuild the graph -node src/cli.js build . - -# 2. Check for regressions -node src/cli.js cycles # Should be 0 file-level cycles -node src/cli.js map --limit 10 # Verify no new coupling hotspots - -# 3. Check impact of your changes -node src/cli.js diff-impact main - -# 4. Run tests -npm test -``` diff --git a/src/builder.js b/src/builder.js index 01d10225..3b7d3c3d 100644 --- a/src/builder.js +++ b/src/builder.js @@ -1,6 +1,5 @@ import { createHash } from 'node:crypto'; import fs from 'node:fs'; -import os from 'node:os'; import path from 'node:path'; import { loadConfig } from './config.js'; import { EXTENSIONS, IGNORE_DIRS, normalizePath } from './constants.js'; @@ -830,7 +829,8 @@ export async function buildGraph(rootDir, opts = {}) { writeJournalHeader(rootDir, Date.now()); if (!opts.skipRegistry) { - const tmpDir = path.resolve(os.tmpdir()); + const { tmpdir } = await import('node:os'); + const tmpDir = path.resolve(tmpdir()); const resolvedRoot = path.resolve(rootDir); if (resolvedRoot.startsWith(tmpDir)) { debug(`Skipping auto-registration for temp directory: ${resolvedRoot}`); diff --git a/tests/unit/builder.test.js b/tests/unit/builder.test.js index 6408f02f..63c60a4c 100644 --- a/tests/unit/builder.test.js +++ b/tests/unit/builder.test.js @@ -8,8 +8,8 @@ import fs from 'node:fs'; import os from 'node:os'; import path from 'node:path'; -import { afterAll, beforeAll, describe, expect, it } from 'vitest'; -import { collectFiles, loadPathAliases } from '../../src/builder.js'; +import { afterAll, beforeAll, describe, expect, it, vi } from 'vitest'; +import { collectFiles, loadPathAliases, readFileSafe } from '../../src/builder.js'; let tmpDir; @@ -109,6 +109,25 @@ describe('collectFiles', () => { const files = collectFiles(path.join(tmpDir, 'does-not-exist')); 
expect(files).toEqual([]); }); + + it('detects symlink loops without infinite recursion', () => { + const loopDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-symloop-')); + const subDir = path.join(loopDir, 'sub'); + fs.mkdirSync(subDir); + fs.writeFileSync(path.join(subDir, 'a.js'), 'export const a = 1;'); + try { + fs.symlinkSync(loopDir, path.join(subDir, 'loop'), 'junction'); + } catch { + // Symlinks may require elevated privileges on Windows — skip gracefully + fs.rmSync(loopDir, { recursive: true, force: true }); + return; + } + // Should complete without stack overflow + const files = collectFiles(loopDir); + const basenames = files.map((f) => path.basename(f)); + expect(basenames).toContain('a.js'); + fs.rmSync(loopDir, { recursive: true, force: true }); + }); }); // ─── loadPathAliases ────────────────────────────────────────────── @@ -204,3 +223,52 @@ describe('loadPathAliases', () => { fs.rmSync(dir, { recursive: true, force: true }); }); }); + +// ─── readFileSafe ───────────────────────────────────────────────── + +describe('readFileSafe', () => { + it('reads a file normally', () => { + const filePath = path.join(tmpDir, 'src', 'app.js'); + const content = readFileSafe(filePath); + expect(content).toBe('export default {}'); + }); + + it('throws ENOENT for non-existent file without retrying', () => { + const spy = vi.spyOn(fs, 'readFileSync'); + expect(() => readFileSafe(path.join(tmpDir, 'nope.js'))).toThrow(); + // ENOENT is not transient — should only be called once (no retries) + expect(spy).toHaveBeenCalledTimes(1); + spy.mockRestore(); + }); + + it('retries on transient errors and succeeds', () => { + const filePath = path.join(tmpDir, 'src', 'app.js'); + const realContent = fs.readFileSync(filePath, 'utf-8'); + let callCount = 0; + const spy = vi.spyOn(fs, 'readFileSync').mockImplementation(() => { + callCount++; + if (callCount <= 2) { + const err = new Error('resource busy'); + err.code = 'EBUSY'; + throw err; + } + return
realContent; + }); + const content = readFileSafe(filePath); + expect(content).toBe(realContent); + expect(callCount).toBe(3); // 2 transient failures + 1 success + spy.mockRestore(); + }); + + it('throws after exhausting retries on transient errors', () => { + const spy = vi.spyOn(fs, 'readFileSync').mockImplementation(() => { + const err = new Error('permission denied'); + err.code = 'EACCES'; + throw err; + }); + expect(() => readFileSafe(path.join(tmpDir, 'src', 'app.js'), 1)).toThrow('permission denied'); + // 1 retry = 2 total attempts (initial + 1 retry) + expect(spy).toHaveBeenCalledTimes(2); + spy.mockRestore(); + }); +});