Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/codegraph-core/src/extractors/csharp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ impl SymbolExtractor for CSharpExtractor {
fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
let mut symbols = FileSymbols::new(file_path.to_string());
walk_node(&tree.root_node(), source, &mut symbols);
walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &CSHARP_AST_CONFIG);
symbols
}
}
Expand Down
1 change: 1 addition & 0 deletions crates/codegraph-core/src/extractors/go.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ impl SymbolExtractor for GoExtractor {
fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
let mut symbols = FileSymbols::new(file_path.to_string());
walk_node(&tree.root_node(), source, &mut symbols);
walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &GO_AST_CONFIG);
symbols
}
}
Expand Down
307 changes: 306 additions & 1 deletion crates/codegraph-core/src/extractors/helpers.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use tree_sitter::Node;
use crate::types::Definition;
use crate::types::{AstNode, Definition};

/// Get the text of a node from the source bytes.
pub fn node_text<'a>(node: &Node, source: &'a [u8]) -> &'a str {
Expand Down Expand Up @@ -88,3 +88,308 @@ pub fn truncate(s: &str, max: usize) -> String {
}
format!("{}\u{2026}", &s[..end])
}

// ── AST node extraction (shared across all languages) ────────────────────────

/// Max length for the AST `text` field — matches `TEXT_MAX` in `ast.js`.
pub const AST_TEXT_MAX: usize = 200;

/// Language-specific AST node type configuration.
pub struct LangAstConfig {
/// Node types mapping to `"new"` kind (e.g. `new_expression`, `object_creation_expression`)
pub new_types: &'static [&'static str],
/// Node types mapping to `"throw"` kind (e.g. `throw_statement`, `raise_statement`)
pub throw_types: &'static [&'static str],
/// Node types mapping to `"await"` kind (e.g. `await_expression`, `await`)
pub await_types: &'static [&'static str],
/// Node types mapping to `"string"` kind (e.g. `string`, `string_literal`)
pub string_types: &'static [&'static str],
/// Node types mapping to `"regex"` kind (e.g. `regex`)
pub regex_types: &'static [&'static str],
/// Characters to strip from string delimiters when extracting content.
pub quote_chars: &'static [char],
/// Single-char prefixes that can appear before string quotes (e.g. `r`, `b`, `f`, `u` for Python).
/// Multi-char combos like `rb`, `fr` are handled by stripping each char in sequence.
pub string_prefixes: &'static [char],
}

// ── Per-language configs ─────────────────────────────────────────────────────

pub const PYTHON_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &[],
throw_types: &["raise_statement"],
await_types: &["await"],
string_types: &["string"],
regex_types: &[],
quote_chars: &['\'', '"'],
string_prefixes: &['r', 'b', 'f', 'u', 'R', 'B', 'F', 'U'],
};

pub const GO_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &[],
throw_types: &[],
await_types: &[],
string_types: &["interpreted_string_literal", "raw_string_literal"],
regex_types: &[],
quote_chars: &['"', '`'],
string_prefixes: &[],
};

pub const RUST_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &[],
throw_types: &[],
await_types: &["await_expression"],
string_types: &["string_literal", "raw_string_literal"],
regex_types: &[],
quote_chars: &['"'],
string_prefixes: &[],
};

pub const JAVA_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &["object_creation_expression"],
throw_types: &["throw_statement"],
await_types: &[],
string_types: &["string_literal"],
regex_types: &[],
quote_chars: &['"'],
string_prefixes: &[],
};

pub const CSHARP_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &["object_creation_expression"],
throw_types: &["throw_statement", "throw_expression"],
await_types: &["await_expression"],
string_types: &["string_literal", "verbatim_string_literal"],
regex_types: &[],
quote_chars: &['"'],
string_prefixes: &[],
};

pub const RUBY_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &[],
throw_types: &[],
await_types: &[],
string_types: &["string"],
regex_types: &["regex"],
quote_chars: &['\'', '"'],
string_prefixes: &[],
};

pub const PHP_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &["object_creation_expression"],
throw_types: &["throw_expression"],
await_types: &[],
string_types: &["string", "encapsed_string"],
regex_types: &[],
quote_chars: &['\'', '"'],
string_prefixes: &[],
};

// ── Generic AST node walker ──────────────────────────────────────────────────

/// Node types that represent identifiers across languages.
const IDENT_TYPES: &[&str] = &[
"identifier", "type_identifier", "name", "qualified_name",
"scoped_identifier", "qualified_identifier",
"member_expression", "member_access_expression",
"field_expression", "attribute", "scoped_type_identifier",
];

/// Node types that represent function/method calls across languages.
const CALL_TYPES: &[&str] = &[
"call_expression", "call", "invocation_expression",
"method_invocation", "function_call_expression",
"member_call_expression", "scoped_call_expression",
];

/// Walk the tree collecting AST nodes using language-specific config.
/// Generic version of `walk_ast_nodes()` in `javascript.rs`.
pub fn walk_ast_nodes_with_config(
node: &Node,
source: &[u8],
ast_nodes: &mut Vec<AstNode>,
config: &LangAstConfig,
) {
let kind = node.kind();

if config.new_types.contains(&kind) {
let name = extract_constructor_name(node, source);
let text = truncate(node_text(node, source), AST_TEXT_MAX);
ast_nodes.push(AstNode {
kind: "new".to_string(),
name,
line: start_line(node),
text: Some(text),
receiver: None,
});
// Fall through to recurse children (e.g. string args inside `new`)
} else if config.throw_types.contains(&kind) {
let name = extract_throw_target(node, source, config);
let text = extract_child_expression_text(node, source);
ast_nodes.push(AstNode {
kind: "throw".to_string(),
name,
line: start_line(node),
text,
receiver: None,
});
// Fall through to recurse children (e.g. `new` inside `throw new ...`)
} else if config.await_types.contains(&kind) {
let name = extract_awaited_name(node, source);
let text = extract_child_expression_text(node, source);
ast_nodes.push(AstNode {
kind: "await".to_string(),
name,
line: start_line(node),
text,
receiver: None,
});
// Fall through to recurse children
} else if config.string_types.contains(&kind) {
let raw = node_text(node, source);
let is_raw_string = kind.contains("raw_string");
// Strip language prefix modifiers before quote chars:
// - C# verbatim `@"..."`
// - Rust raw strings `r"..."`, `r#"..."#`
// - Python prefixes: r, b, f, u and combos like rb, fr
let without_prefix = raw.trim_start_matches('@')
.trim_start_matches(|c: char| config.string_prefixes.contains(&c))
.trim_start_matches('r');
// Only strip `#` delimiters for raw string node types (e.g. Rust `r#"..."#`)
let without_prefix = if is_raw_string {
without_prefix.trim_start_matches('#')
} else {
without_prefix
};
let content = without_prefix
.trim_start_matches(|c: char| config.quote_chars.contains(&c));
let content = if is_raw_string {
content.trim_end_matches('#')
} else {
content
};
let content = content
.trim_end_matches(|c: char| config.quote_chars.contains(&c));
if content.chars().count() < 2 {
for i in 0..node.child_count() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

content.len() < 2 counts bytes instead of characters, causing inconsistency with JS

Rust's len() returns byte length, while JS's .length uses UTF-16 code unit count. This causes divergence between native (Rust) and WASM (JS) paths:

  • For non-ASCII characters like "é" (U+00E9):
    • Rust: 2 bytes → passes the filter (>= 2) → string is extracted
    • JS: 1 UTF-16 unit → fails the filter (< 2) → string is skipped

This means the native extractor captures single non-ASCII character strings that the WASM path filters out, producing different output between the two extraction modes.

Fix: Use character count instead of byte count:

Suggested change
for i in 0..node.child_count() {
if content.chars().count() < 2 {

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 4c78a4d — changed content.len() < 2 to content.chars().count() < 2 for character-level counting matching JS .length behavior.

if let Some(child) = node.child(i) {
walk_ast_nodes_with_config(&child, source, ast_nodes, config);
}
}
return;
}
let name = truncate(content, 100);
let text = truncate(raw, AST_TEXT_MAX);
ast_nodes.push(AstNode {
kind: "string".to_string(),
name,
line: start_line(node),
text: Some(text),
receiver: None,
});
// Fall through to recurse children (template strings may have nested expressions)
} else if config.regex_types.contains(&kind) {
let raw = node_text(node, source);
let name = if raw.is_empty() { "?".to_string() } else { raw.to_string() };
let text = truncate(raw, AST_TEXT_MAX);
ast_nodes.push(AstNode {
kind: "regex".to_string(),
name,
line: start_line(node),
text: Some(text),
receiver: None,
});
// Fall through to recurse children
}

for i in 0..node.child_count() {
if let Some(child) = node.child(i) {
walk_ast_nodes_with_config(&child, source, ast_nodes, config);
}
}
}

// ── Name extraction helpers ──────────────────────────────────────────────────

/// Extract constructor name from a `new`/`object_creation_expression` node.
fn extract_constructor_name(node: &Node, source: &[u8]) -> String {
// Try common field names for the constructed type
for field in &["type", "class", "constructor"] {
if let Some(child) = node.child_by_field_name(field) {
return node_text(&child, source).to_string();
}
}
for i in 0..node.child_count() {
if let Some(child) = node.child(i) {
if IDENT_TYPES.contains(&child.kind()) {
return node_text(&child, source).to_string();
}
}
}
let raw = node_text(node, source);
raw.split('(')
.next()
.unwrap_or(raw)
.replace("new ", "")
.trim()
.to_string()
}

/// Extract name from a throw/raise statement.
fn extract_throw_target(node: &Node, source: &[u8], config: &LangAstConfig) -> String {
for i in 0..node.child_count() {
if let Some(child) = node.child(i) {
let ck = child.kind();
if config.new_types.contains(&ck) {
return extract_constructor_name(&child, source);
}
if CALL_TYPES.contains(&ck) {
return extract_call_name(&child, source);
}
if IDENT_TYPES.contains(&ck) {
return node_text(&child, source).to_string();
}
}
}
truncate(node_text(node, source), AST_TEXT_MAX)
}

/// Extract name from an await expression.
fn extract_awaited_name(node: &Node, source: &[u8]) -> String {
for i in 0..node.child_count() {
if let Some(child) = node.child(i) {
let ck = child.kind();
if CALL_TYPES.contains(&ck) {
return extract_call_name(&child, source);
}
if IDENT_TYPES.contains(&ck) {
return node_text(&child, source).to_string();
}
}
}
truncate(node_text(node, source), AST_TEXT_MAX)
}

/// Extract function name from a call node.
fn extract_call_name(node: &Node, source: &[u8]) -> String {
for field in &["function", "method", "name"] {
if let Some(fn_node) = node.child_by_field_name(field) {
return node_text(&fn_node, source).to_string();
}
}
let text = node_text(node, source);
text.split('(').next().unwrap_or("?").to_string()
}

/// Extract expression text from throw/await — skip the keyword child.
fn extract_child_expression_text(node: &Node, source: &[u8]) -> Option<String> {
const KEYWORDS: &[&str] = &["throw", "raise", "await", "new"];
for i in 0..node.child_count() {
if let Some(child) = node.child(i) {
if !KEYWORDS.contains(&child.kind()) {
return Some(truncate(node_text(&child, source), AST_TEXT_MAX));
}
}
}
Some(truncate(node_text(node, source), AST_TEXT_MAX))
}
1 change: 1 addition & 0 deletions crates/codegraph-core/src/extractors/java.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ impl SymbolExtractor for JavaExtractor {
fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
let mut symbols = FileSymbols::new(file_path.to_string());
walk_node(&tree.root_node(), source, &mut symbols);
walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &JAVA_AST_CONFIG);
symbols
}
}
Expand Down
1 change: 1 addition & 0 deletions crates/codegraph-core/src/extractors/php.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ impl SymbolExtractor for PhpExtractor {
fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
let mut symbols = FileSymbols::new(file_path.to_string());
walk_node(&tree.root_node(), source, &mut symbols);
walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &PHP_AST_CONFIG);
symbols
}
}
Expand Down
1 change: 1 addition & 0 deletions crates/codegraph-core/src/extractors/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ impl SymbolExtractor for PythonExtractor {
fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
let mut symbols = FileSymbols::new(file_path.to_string());
walk_node(&tree.root_node(), source, &mut symbols);
walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &PYTHON_AST_CONFIG);
symbols
}
}
Expand Down
1 change: 1 addition & 0 deletions crates/codegraph-core/src/extractors/ruby.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ impl SymbolExtractor for RubyExtractor {
fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
let mut symbols = FileSymbols::new(file_path.to_string());
walk_node(&tree.root_node(), source, &mut symbols);
walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &RUBY_AST_CONFIG);
symbols
}
}
Expand Down
1 change: 1 addition & 0 deletions crates/codegraph-core/src/extractors/rust_lang.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ impl SymbolExtractor for RustExtractor {
fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
let mut symbols = FileSymbols::new(file_path.to_string());
walk_node(&tree.root_node(), source, &mut symbols);
walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &RUST_AST_CONFIG);
symbols
}
}
Expand Down
Loading