// ── crates/codegraph-core/src/extractors/helpers.rs ─────────────────────────

/// Truncate `s` to at most `max` UTF-16 code units, ending in `…` when cut.
///
/// Counterpart of the JavaScript helper at `ast.js:51-54`. Lengths are counted
/// in UTF-16 code units (what JavaScript's `String.length` reports) instead of
/// UTF-8 bytes, so non-ASCII text that fits the limit is left untouched and a
/// truncated result never exceeds `max` units.
/// NOTE(review): `ast.js` is not visible from here — confirm it measures with
/// `String.length` if exact parity between engines matters.
///
/// * `s`   – text to shorten.
/// * `max` – maximum length of the result, in UTF-16 code units.
///
/// Returns `s` unchanged when it fits. Otherwise returns the longest prefix of
/// whole characters that fits in `max - 1` units, followed by `…` (U+2026).
/// With `max == 0` and a non-empty `s` the result is just `…`. Never panics:
/// the cut always lands on a `char` boundary.
pub fn truncate(s: &str, max: usize) -> String {
    const ELLIPSIS: char = '\u{2026}';

    // A char never needs more UTF-16 units than UTF-8 bytes, so the byte
    // length is a cheap upper bound that settles the common short case.
    if s.len() <= max || s.encode_utf16().count() <= max {
        return s.to_string();
    }

    // Reserve room for the ellipsis; `max == 0` leaves no room for text.
    let budget = max.saturating_sub(ELLIPSIS.len_utf16());
    let mut units = 0;
    let mut cut = 0;
    for (idx, ch) in s.char_indices() {
        units += ch.len_utf16();
        if units > budget {
            break;
        }
        // Byte offset just past `ch`: always a valid slice boundary.
        cut = idx + ch.len_utf8();
    }

    let mut out = String::with_capacity(cut + ELLIPSIS.len_utf8());
    out.push_str(&s[..cut]);
    out.push(ELLIPSIS);
    out
}

// ── crates/codegraph-core/src/extractors/javascript.rs ──────────────────────
//
// `JsExtractor::extract` gains one call, placed right after
// `walk_node(&tree.root_node(), source, &mut symbols)`:
//
//     walk_ast_nodes(&tree.root_node(), source, &mut symbols.ast_nodes);

// ── AST node extraction (new / throw / await / string / regex) ──────────────

/// Length limit handed to `truncate` for the `text` captured per AST node.
const TEXT_MAX: usize = 200;
+fn walk_ast_nodes(node: &Node, source: &[u8], ast_nodes: &mut Vec) { + match node.kind() { + "new_expression" => { + let name = extract_new_name(node, source); + let text = truncate(node_text(node, source), TEXT_MAX); + ast_nodes.push(AstNode { + kind: "new".to_string(), + name, + line: start_line(node), + text: Some(text), + receiver: None, + }); + // Don't recurse — we already captured this node + return; + } + "throw_statement" => { + let name = extract_throw_name(node, source); + let text = extract_expression_text(node, source); + ast_nodes.push(AstNode { + kind: "throw".to_string(), + name, + line: start_line(node), + text, + receiver: None, + }); + // Don't recurse — prevents double-counting `throw new Error` + return; + } + "await_expression" => { + let name = extract_await_name(node, source); + let text = extract_expression_text(node, source); + ast_nodes.push(AstNode { + kind: "await".to_string(), + name, + line: start_line(node), + text, + receiver: None, + }); + // Don't recurse + return; + } + "string" | "template_string" => { + let raw = node_text(node, source); + // Strip quotes to get content + let content = raw + .trim_start_matches(|c| c == '\'' || c == '"' || c == '`') + .trim_end_matches(|c| c == '\'' || c == '"' || c == '`'); + if content.len() < 2 { + // Still recurse children (template_string may have nested expressions) + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + walk_ast_nodes(&child, source, ast_nodes); + } + } + return; + } + let name = truncate(content, 100); + let text = truncate(raw, TEXT_MAX); + ast_nodes.push(AstNode { + kind: "string".to_string(), + name, + line: start_line(node), + text: Some(text), + receiver: None, + }); + // Do recurse children for strings + } + "regex" => { + let raw = node_text(node, source); + let name = if raw.is_empty() { "?".to_string() } else { raw.to_string() }; + let text = truncate(raw, TEXT_MAX); + ast_nodes.push(AstNode { + kind: "regex".to_string(), + name, + line: 
start_line(node), + text: Some(text), + receiver: None, + }); + // Do recurse children for regex + } + _ => {} + } + + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + walk_ast_nodes(&child, source, ast_nodes); + } + } +} + +/// Extract constructor name from a `new_expression` node. +/// Handles `new Foo()`, `new a.Foo()`, `new Foo.Bar()`. +fn extract_new_name(node: &Node, source: &[u8]) -> String { + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + if child.kind() == "identifier" { + return node_text(&child, source).to_string(); + } + if child.kind() == "member_expression" { + return node_text(&child, source).to_string(); + } + } + } + // Fallback: text before '(' minus 'new ' + let raw = node_text(node, source); + raw.split('(') + .next() + .unwrap_or(raw) + .replace("new ", "") + .trim() + .to_string() +} + +/// Extract name from a `throw_statement`. +/// `throw new Error(...)` → "Error"; `throw x` → "x" +fn extract_throw_name(node: &Node, source: &[u8]) -> String { + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + match child.kind() { + "new_expression" => return extract_new_name(&child, source), + "call_expression" => { + if let Some(fn_node) = child.child_by_field_name("function") { + return node_text(&fn_node, source).to_string(); + } + let text = node_text(&child, source); + return text.split('(').next().unwrap_or("?").to_string(); + } + "identifier" => return node_text(&child, source).to_string(), + _ => {} + } + } + } + truncate(node_text(node, source), TEXT_MAX) +} + +/// Extract name from an `await_expression`. 
+/// `await fetch(...)` → "fetch"; `await this.foo()` → "this.foo" +fn extract_await_name(node: &Node, source: &[u8]) -> String { + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + match child.kind() { + "call_expression" => { + if let Some(fn_node) = child.child_by_field_name("function") { + return node_text(&fn_node, source).to_string(); + } + let text = node_text(&child, source); + return text.split('(').next().unwrap_or("?").to_string(); + } + "identifier" | "member_expression" => { + return node_text(&child, source).to_string(); + } + _ => {} + } + } + } + truncate(node_text(node, source), TEXT_MAX) +} + +/// Extract expression text from throw/await — skip the keyword child. +fn extract_expression_text(node: &Node, source: &[u8]) -> Option { + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + // Skip the keyword token itself + if child.kind() != "throw" && child.kind() != "await" { + return Some(truncate(node_text(&child, source), TEXT_MAX)); + } + } + } + Some(truncate(node_text(node, source), TEXT_MAX)) +} + // ── Extended kinds helpers ────────────────────────────────────────────────── fn extract_js_parameters(node: &Node, source: &[u8]) -> Vec { @@ -1033,4 +1211,94 @@ mod tests { let f = s.definitions.iter().find(|d| d.name == "fn").unwrap(); assert_eq!(f.kind, "function"); } + + // ── AST node extraction tests ──────────────────────────────────────────── + + #[test] + fn ast_extracts_new_expression() { + let s = parse_js("function f() { const m = new Map(); const s = new Set(); }"); + let new_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "new").collect(); + assert_eq!(new_nodes.len(), 2); + let names: Vec<&str> = new_nodes.iter().map(|n| n.name.as_str()).collect(); + assert!(names.contains(&"Map")); + assert!(names.contains(&"Set")); + } + + #[test] + fn ast_extracts_new_member_expression() { + let s = parse_js("const e = new errors.NotFoundError();"); + let new_nodes: Vec<_> = 
s.ast_nodes.iter().filter(|n| n.kind == "new").collect(); + assert_eq!(new_nodes.len(), 1); + assert_eq!(new_nodes[0].name, "errors.NotFoundError"); + } + + #[test] + fn ast_extracts_throw_statement() { + let s = parse_js("function f() { throw new Error('bad'); }"); + let throw_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "throw").collect(); + assert_eq!(throw_nodes.len(), 1); + assert_eq!(throw_nodes[0].name, "Error"); + } + + #[test] + fn ast_throw_no_double_count_new() { + // `throw new Error(...)` should produce one throw node, NOT also a new node + let s = parse_js("function f() { throw new Error('fail'); }"); + let new_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "new").collect(); + let throw_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "throw").collect(); + assert_eq!(throw_nodes.len(), 1); + assert_eq!(new_nodes.len(), 0, "throw new Error should not also emit a new node"); + } + + #[test] + fn ast_extracts_await_expression() { + let s = parse_js("async function f() { const d = await fetch('/api'); }"); + let await_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "await").collect(); + assert_eq!(await_nodes.len(), 1); + assert_eq!(await_nodes[0].name, "fetch"); + } + + #[test] + fn ast_extracts_await_member_expression() { + let s = parse_js("async function f() { await this.load(); }"); + let await_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "await").collect(); + assert_eq!(await_nodes.len(), 1); + assert_eq!(await_nodes[0].name, "this.load"); + } + + #[test] + fn ast_extracts_string_literals() { + let s = parse_js("const x = 'hello world'; const y = \"foo bar\";"); + let str_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "string").collect(); + assert_eq!(str_nodes.len(), 2); + let names: Vec<&str> = str_nodes.iter().map(|n| n.name.as_str()).collect(); + assert!(names.contains(&"hello world")); + assert!(names.contains(&"foo bar")); + } + + #[test] + fn ast_skips_trivial_strings() { + // Single char 
or empty strings should be skipped + let s = parse_js("const a = ''; const b = 'x'; const c = 'ok';"); + let str_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "string").collect(); + // Only "ok" has content length >= 2 + assert_eq!(str_nodes.len(), 1); + assert_eq!(str_nodes[0].name, "ok"); + } + + #[test] + fn ast_extracts_regex() { + let s = parse_js("const re = /^[a-z]+$/i;"); + let regex_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "regex").collect(); + assert_eq!(regex_nodes.len(), 1); + assert!(regex_nodes[0].name.contains("[a-z]")); + } + + #[test] + fn ast_extracts_template_string() { + let s = parse_js("const msg = `hello template`;"); + let str_nodes: Vec<_> = s.ast_nodes.iter().filter(|n| n.kind == "string").collect(); + assert_eq!(str_nodes.len(), 1); + assert!(str_nodes[0].name.contains("hello template")); + } } diff --git a/crates/codegraph-core/src/types.rs b/crates/codegraph-core/src/types.rs index ed299f0c..1b219c7c 100644 --- a/crates/codegraph-core/src/types.rs +++ b/crates/codegraph-core/src/types.rs @@ -134,6 +134,16 @@ pub struct ExportInfo { pub line: u32, } +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AstNode { + pub kind: String, + pub name: String, + pub line: u32, + pub text: Option, + pub receiver: Option, +} + #[napi(object)] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FileSymbols { @@ -143,6 +153,8 @@ pub struct FileSymbols { pub imports: Vec, pub classes: Vec, pub exports: Vec, + #[napi(js_name = "astNodes")] + pub ast_nodes: Vec, pub line_count: Option, } @@ -155,6 +167,7 @@ impl FileSymbols { imports: Vec::new(), classes: Vec::new(), exports: Vec::new(), + ast_nodes: Vec::new(), line_count: None, } } diff --git a/src/ast.js b/src/ast.js index 8c349667..e0ccecc9 100644 --- a/src/ast.js +++ b/src/ast.js @@ -195,10 +195,32 @@ export async function buildAstNodes(db, fileSymbols, _rootDir, _engineOpts) { // 2. 
// ── src/ast.js · buildAstNodes(), step 2 (excerpt; the enclosing function is
//    only partly visible in this change) ─────────────────────────────────────

// 2. AST walk for JS/TS/TSX — extract new, throw, await, string, regex
const ext = path.extname(relPath).toLowerCase();
if (WALK_EXTENSIONS.has(ext)) {
  if (symbols._tree) {
    // WASM path: walk the tree-sitter AST
    const astRows = [];
    walkAst(symbols._tree.rootNode, defs, relPath, astRows, getNodeId);
    rows.push(...astRows);
  } else if (symbols.astNodes?.length) {
    // Native path: use pre-extracted AST nodes from Rust
    for (const n of symbols.astNodes) {
      // Attach each node to the definition that encloses its line, if any.
      const parentDef = findParentDef(defs, n.line);
      let parentNodeId = null;
      if (parentDef) {
        const row = getNodeId.get(parentDef.name, parentDef.kind, relPath, parentDef.line);
        if (row) parentNodeId = row.id;
      }
      rows.push({
        file: relPath,
        line: n.line,
        kind: n.kind,
        name: n.name,
        text: n.text || null,
        receiver: n.receiver || null,
        parentNodeId,
      });
    }
  }
}

// ── src/parser.js · normalizeNativeSymbols(result) ───────────────────────────
// The enclosing function is only partly visible. The object it returns gains
// this property after `exports`; both the camelCase and snake_case field names
// are accepted, and a missing field becomes an empty list:
//
//     astNodes: (result.astNodes ?? result.ast_nodes ?? []).map((n) => ({
//       kind: n.kind,
//       name: n.name,
//       line: n.line,
//       text: n.text ?? null,
//       receiver: n.receiver ?? null,
//     })),

// ── tests/parsers/ast-nodes.test.js ──────────────────────────────────────────

// Added to the existing import block of the test file.
import { loadNative } from '../../src/native.js';

// ─── Native engine AST node extraction ───────────────────────────────

/**
 * Check if the native addon is available AND supports ast_nodes.
 * Old prebuilt binaries return FileSymbols without the ast_nodes field.
 *
 * Parses a one-line probe file and reports whether any AST nodes came back.
 * The probe directory is removed on every exit path, including when the
 * native parser throws.
 *
 * @returns {boolean} true when the native engine yields AST nodes.
 */
function nativeSupportsAstNodes() {
  const native = loadNative();
  if (!native) return false;
  let tmpCheck;
  try {
    tmpCheck = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-ast-check-'));
    const srcCheck = path.join(tmpCheck, 'src');
    fs.mkdirSync(srcCheck, { recursive: true });
    const checkPath = path.join(srcCheck, 'check.js');
    fs.writeFileSync(checkPath, 'const x = new Map();');
    const first = native.parseFiles([checkPath], tmpCheck)?.[0];
    return (first?.astNodes?.length ?? 0) > 0 || (first?.ast_nodes?.length ?? 0) > 0;
  } catch {
    return false;
  } finally {
    if (tmpCheck) {
      try {
        fs.rmSync(tmpCheck, { recursive: true, force: true });
      } catch {
        // Best-effort cleanup: a leftover temp dir must not fail the probe.
      }
    }
  }
}

const canTestNative = nativeSupportsAstNodes();

describe.skipIf(!canTestNative)('buildAstNodes — native engine', () => {
  let nativeTmpDir, nativeDbPath, nativeDb;

  function queryNativeAstNodes(kind) {
    return nativeDb.prepare('SELECT * FROM ast_nodes WHERE kind = ? ORDER BY line').all(kind);
  }

  function queryAllNativeAstNodes() {
    return nativeDb.prepare('SELECT * FROM ast_nodes ORDER BY line').all();
  }

  beforeAll(async () => {
    nativeTmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-ast-native-'));
    const srcDir = path.join(nativeTmpDir, 'src');
    fs.mkdirSync(srcDir, { recursive: true });
    fs.mkdirSync(path.join(nativeTmpDir, '.codegraph'));

    const fixturePath = path.join(srcDir, 'fixture.js');
    fs.writeFileSync(fixturePath, FIXTURE_CODE);

    const allSymbols = await parseFilesAuto([fixturePath], nativeTmpDir, { engine: 'native' });
    const symbols = allSymbols.get('src/fixture.js');
    if (!symbols) throw new Error('Failed to parse fixture file with native engine');

    nativeDbPath = path.join(nativeTmpDir, '.codegraph', 'graph.db');
    nativeDb = new Database(nativeDbPath);
    nativeDb.pragma('journal_mode = WAL');
    initSchema(nativeDb);

    // Definitions must exist as rows in `nodes` so parent ids can be resolved.
    const insertNode = nativeDb.prepare(
      'INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)',
    );
    for (const def of symbols.definitions) {
      insertNode.run(def.name, def.kind, 'src/fixture.js', def.line, def.endLine);
    }

    await buildAstNodes(nativeDb, allSymbols, nativeTmpDir);
  });

  afterAll(() => {
    if (nativeDb) nativeDb.close();
    if (nativeTmpDir) fs.rmSync(nativeTmpDir, { recursive: true, force: true });
  });

  test('captures new_expression as kind:new', () => {
    const nodes = queryNativeAstNodes('new');
    expect(nodes.length).toBeGreaterThanOrEqual(1);
    expect(nodes.map((n) => n.name)).toContain('Map');
  });

  test('captures throw as kind:throw', () => {
    const nodes = queryNativeAstNodes('throw');
    expect(nodes.length).toBeGreaterThanOrEqual(1);
    expect(nodes.some((n) => n.name === 'Error')).toBe(true);
  });

  test('captures await as kind:await', () => {
    const nodes = queryNativeAstNodes('await');
    expect(nodes.length).toBeGreaterThanOrEqual(1);
    expect(nodes.some((n) => n.name.includes('fetch'))).toBe(true);
  });

  test('captures string literals as kind:string', () => {
    const nodes = queryNativeAstNodes('string');
    expect(nodes.length).toBeGreaterThanOrEqual(1);
    expect(nodes.some((n) => n.name.includes('hello world'))).toBe(true);
  });

  test('captures regex as kind:regex', () => {
    const nodes = queryNativeAstNodes('regex');
    expect(nodes.length).toBeGreaterThanOrEqual(1);
    expect(nodes.some((n) => n.name.includes('[a-z]') || n.name.includes('\\d'))).toBe(true);
  });

  test('no double-count for throw new Error', () => {
    const newNodes = queryNativeAstNodes('new');
    // "Error" should NOT appear as a new node — it's captured under throw
    expect(newNodes.every((n) => n.name !== 'Error')).toBe(true);
  });

  test('skips trivial strings', () => {
    const nodes = queryNativeAstNodes('string');
    for (const node of nodes) {
      expect(node.name.length).toBeGreaterThanOrEqual(2);
    }
  });

  test('all nodes have valid kinds', () => {
    const all = queryAllNativeAstNodes();
    const validKinds = new Set(['call', 'new', 'string', 'regex', 'throw', 'await']);
    for (const node of all) {
      expect(validKinds.has(node.kind)).toBe(true);
    }
  });

  test('parent_node_id is resolved', () => {
    const all = queryAllNativeAstNodes();
    const withParent = all.filter((n) => n.parent_node_id != null);
    expect(withParent.length).toBeGreaterThan(0);
  });
});