From a7a61e1126dfbcf9a39858798579a9cf97213563 Mon Sep 17 00:00:00 2001 From: Edin Dagasan Date: Sun, 22 Mar 2026 23:35:01 +0100 Subject: [PATCH] feat: add Scala language support Adds Scala as the 15th supported language using the tree-sitter-scala grammar. Covers classes, traits, objects, enums, functions, imports, inheritance (extends/with, including generic types), and call expressions including instance_expression and generic_function nodes. --- CLAUDE.md | 4 +-- README.md | 6 ++--- code_review_graph/parser.py | 48 +++++++++++++++++++++++++++++++++ docs/FEATURES.md | 2 +- docs/LLM-OPTIMIZED-REFERENCE.md | 2 +- docs/USAGE.md | 2 +- skills/build-graph/SKILL.md | 2 +- tests/fixtures/sample.scala | 37 +++++++++++++++++++++++++ tests/test_multilang.py | 47 ++++++++++++++++++++++++++++++++ 9 files changed, 141 insertions(+), 9 deletions(-) create mode 100644 tests/fixtures/sample.scala diff --git a/CLAUDE.md b/CLAUDE.md index 3d4c9481..619e3cbb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,7 +7,7 @@ ## Architecture - **Core Package**: `code_review_graph/` (Python 3.10+) - - `parser.py` — Tree-sitter multi-language AST parser (14 languages including Vue SFC and Solidity) + - `parser.py` — Tree-sitter multi-language AST parser (15 languages including Vue SFC and Solidity) - `graph.py` — SQLite-backed graph store (nodes, edges, BFS impact analysis) - `tools.py` — 9 MCP tool implementations - `incremental.py` — Git-based change detection, file watching @@ -64,7 +64,7 @@ uv run code-review-graph serve # Start MCP server - `tests/test_tools.py` — MCP tool integration tests - `tests/test_visualization.py` — Export, HTML generation, C++ resolution - `tests/test_incremental.py` — Build, update, migration, git ops -- `tests/test_multilang.py` — 14 language parsing tests (including Vue and Solidity) +- `tests/test_multilang.py` — 15 language parsing tests (including Vue and Solidity) - `tests/test_embeddings.py` — Vector encode/decode, similarity, store - `tests/fixtures/` — 
Sample files for each supported language diff --git a/README.md b/README.md index fea1735e..e5151d22 100644 --- a/README.md +++ b/README.md @@ -84,10 +84,10 @@ On every git commit or file save, a hook fires. The graph diffs changed files, f
-14 supported languages +15 supported languages
-Python, TypeScript, JavaScript, Vue, Go, Rust, Java, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ +Python, TypeScript, JavaScript, Vue, Go, Rust, Java, Scala, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ Each language has full Tree-sitter grammar support for functions, classes, imports, call sites, inheritance, and test detection. @@ -210,7 +210,7 @@ Claude uses these automatically once the graph is built. | Feature | Details | |---------|---------| | **Incremental updates** | Re-parses only changed files. Subsequent updates complete in under 2 seconds. | -| **14 languages** | Python, TypeScript, JavaScript, Vue, Go, Rust, Java, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ | +| **15 languages** | Python, TypeScript, JavaScript, Vue, Go, Rust, Java, Scala, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ | | **Blast-radius analysis** | Shows exactly which functions, classes, and files are affected by any change | | **Auto-update hooks** | Graph updates on every file edit and git commit without manual intervention | | **Semantic search** | Optional vector embeddings via sentence-transformers | diff --git a/code_review_graph/parser.py b/code_review_graph/parser.py index debe4a98..dc1f5c8d 100644 --- a/code_review_graph/parser.py +++ b/code_review_graph/parser.py @@ -72,6 +72,7 @@ class EdgeInfo: ".kt": "kotlin", ".swift": "swift", ".php": "php", + ".scala": "scala", ".sol": "solidity", ".vue": "vue", } @@ -96,6 +97,9 @@ class EdgeInfo: "kotlin": ["class_declaration", "object_declaration"], "swift": ["class_declaration", "struct_declaration", "protocol_declaration"], "php": ["class_declaration", "interface_declaration"], + "scala": [ + "class_definition", "trait_definition", "object_definition", "enum_definition", + ], "solidity": [ "contract_declaration", "interface_declaration", "library_declaration", "struct_declaration", "enum_declaration", "error_declaration", @@ -118,6 +122,7 @@ class EdgeInfo: "kotlin": ["function_declaration"], "swift": 
["function_declaration"], "php": ["function_definition", "method_declaration"], + "scala": ["function_definition", "function_declaration"], # Solidity: events and modifiers use kind="Function" because the graph # schema has no dedicated kind for them. State variables are also modeled # as Function nodes (public ones auto-generate getters) and distinguished @@ -143,6 +148,7 @@ class EdgeInfo: "kotlin": ["import_header"], "swift": ["import_declaration"], "php": ["namespace_use_declaration"], + "scala": ["import_declaration"], "solidity": ["import_directive"], } @@ -161,6 +167,7 @@ class EdgeInfo: "kotlin": ["call_expression"], "swift": ["call_expression"], "php": ["function_call_expression", "member_call_expression"], + "scala": ["call_expression", "instance_expression", "generic_function"], "solidity": ["call_expression"], } @@ -1030,6 +1037,19 @@ def _get_bases(self, node, language: str, source: bytes) -> list[str]: ): text = child.text.decode("utf-8", errors="replace") bases.append(text) + elif language == "scala": + for child in node.children: + if child.type == "extends_clause": + for sub in child.children: + if sub.type == "type_identifier": + bases.append(sub.text.decode("utf-8", errors="replace")) + elif sub.type == "generic_type": + for ident in sub.children: + if ident.type == "type_identifier": + bases.append( + ident.text.decode("utf-8", errors="replace") + ) + break elif language == "cpp": # C++: base_class_clause contains type_identifiers for child in node.children: @@ -1123,6 +1143,27 @@ def _extract_import(self, node, language: str, source: bytes) -> list[str]: val = child.text.decode("utf-8", errors="replace").strip('"') if val: imports.append(val) + elif language == "scala": + parts = [] + selectors = [] + is_wildcard = False + for child in node.children: + if child.type == "identifier": + parts.append(child.text.decode("utf-8", errors="replace")) + elif child.type == "namespace_selectors": + for sub in child.children: + if sub.type == "identifier": 
+ selectors.append(sub.text.decode("utf-8", errors="replace")) + elif child.type == "namespace_wildcard": + is_wildcard = True + base = ".".join(parts) + if selectors: + for name in selectors: + imports.append(f"{base}.{name}") + elif is_wildcard: + imports.append(f"{base}.*") + elif base: + imports.append(base) elif language == "ruby": # require 'module' or require_relative 'path' if "require" in text: @@ -1142,6 +1183,13 @@ def _get_call_name(self, node, language: str, source: bytes) -> Optional[str]: first = node.children[0] + # Scala: instance_expression (new Foo(...)) – extract the type name + if node.type == "instance_expression": + for child in node.children: + if child.type in ("type_identifier", "identifier"): + return child.text.decode("utf-8", errors="replace") + return None + # Solidity wraps call targets in an 'expression' node – unwrap it if language == "solidity" and first.type == "expression" and first.children: first = first.children[0] diff --git a/docs/FEATURES.md b/docs/FEATURES.md index 508dd918..6782433b 100644 --- a/docs/FEATURES.md +++ b/docs/FEATURES.md @@ -5,7 +5,7 @@ - **Call target resolution**: Bare call targets are resolved to qualified names using same-file definitions, improving `callers_of`/`callees_of` accuracy. - **Impact radius pagination**: `get_impact_radius` returns `truncated` flag and `total_impacted` count; `max_results` parameter controls output size. - **`find_large_functions_tool`**: New MCP tool to find functions, classes, or files exceeding a line-count threshold. -- **14 languages**: Added Vue SFC and Solidity support. +- **15 languages**: Added Vue SFC, Solidity, and Scala support. - **Documentation overhaul**: All docs updated with accurate language/tool counts, version references, and VS Code extension parity. 
## v1.8.3 diff --git a/docs/LLM-OPTIMIZED-REFERENCE.md b/docs/LLM-OPTIMIZED-REFERENCE.md index 9aec28c3..77efda8d 100644 --- a/docs/LLM-OPTIMIZED-REFERENCE.md +++ b/docs/LLM-OPTIMIZED-REFERENCE.md @@ -44,7 +44,7 @@ Model: all-MiniLM-L6-v2 (384-dim, fast).
-Supported: Python, TypeScript, JavaScript, Vue, Go, Rust, Java, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ +Supported: Python, TypeScript, JavaScript, Vue, Go, Rust, Java, Scala, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ Parser: Tree-sitter via tree-sitter-language-pack
diff --git a/docs/USAGE.md b/docs/USAGE.md index b4b49833..6ec5b599 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -61,7 +61,7 @@ Then use `embed_graph_tool` to compute vectors. `semantic_search_nodes_tool` aut ## Supported Languages -Python, TypeScript, JavaScript, Vue, Go, Rust, Java, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ +Python, TypeScript, JavaScript, Vue, Go, Rust, Java, Scala, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ ## What Gets Indexed diff --git a/skills/build-graph/SKILL.md b/skills/build-graph/SKILL.md index a9d49278..8dc8d4c6 100644 --- a/skills/build-graph/SKILL.md +++ b/skills/build-graph/SKILL.md @@ -35,4 +35,4 @@ Build or incrementally update the persistent code knowledge graph for this repos - The graph is stored as a SQLite database (`.code-review-graph/graph.db`) in the repo root - Binary files, generated files, and patterns in `.code-review-graphignore` are skipped -- Supported languages: Python, TypeScript/JavaScript, Vue, Go, Rust, Java, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ +- Supported languages: Python, TypeScript/JavaScript, Vue, Go, Rust, Java, Scala, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++ diff --git a/tests/fixtures/sample.scala b/tests/fixtures/sample.scala new file mode 100644 index 00000000..3b2a3329 --- /dev/null +++ b/tests/fixtures/sample.scala @@ -0,0 +1,37 @@ +package com.example.auth + +import scala.collection.mutable +import scala.collection.mutable.{HashMap, ListBuffer} +import scala.util.Try +import scala.concurrent._ + +trait Repository[T]: + def findById(id: Int): Option[T] + def save(entity: T): Unit + +case class User(id: Int, name: String, email: String) + +class InMemoryRepo extends Repository[User] with Serializable: + private val users = mutable.HashMap[Int, User]() + + override def findById(id: Int): Option[User] = + users.get(id) + + override def save(user: User): Unit = + users.put(user.id, user) + println(s"Saved user ${user.id}") + +class UserService(repo: Repository[User]): + def 
createUser(name: String, email: String): User = + val user = User(1, name, email) + repo.save(user) + user + + def getUser(id: Int): Option[User] = + repo.findById(id) + +object UserService: + def apply(repo: Repository[User]): UserService = new UserService(repo) + +enum Color: + case Red, Green, Blue diff --git a/tests/test_multilang.py b/tests/test_multilang.py index 06d92d31..d26b24a5 100644 --- a/tests/test_multilang.py +++ b/tests/test_multilang.py @@ -269,6 +269,53 @@ def test_finds_functions(self): assert "createUser" in names or "findById" in names or "save" in names +class TestScalaParsing: + def setup_method(self): + self.parser = CodeParser() + self.nodes, self.edges = self.parser.parse_file(FIXTURES / "sample.scala") + + def test_detects_language(self): + assert self.parser.detect_language(Path("Main.scala")) == "scala" + + def test_finds_classes_traits_objects(self): + classes = [n for n in self.nodes if n.kind == "Class"] + names = {c.name for c in classes} + assert "Repository" in names + assert "User" in names + assert "InMemoryRepo" in names + assert "UserService" in names + assert "Color" in names + + def test_finds_functions(self): + funcs = [n for n in self.nodes if n.kind == "Function"] + names = {f.name for f in funcs} + assert "findById" in names + assert "save" in names + assert "createUser" in names + assert "getUser" in names + assert "apply" in names + + def test_finds_imports(self): + imports = [e for e in self.edges if e.kind == "IMPORTS_FROM"] + targets = {e.target for e in imports} + assert "scala.util.Try" in targets + assert "scala.collection.mutable" in targets + assert "scala.collection.mutable.HashMap" in targets + assert "scala.collection.mutable.ListBuffer" in targets + assert "scala.concurrent.*" in targets + assert len(imports) >= 3 + + def test_finds_inheritance(self): + inherits = [e for e in self.edges if e.kind == "INHERITS"] + targets = {e.target for e in inherits} + assert "Repository" in targets + assert "Serializable" 
in targets + + def test_finds_calls(self): + calls = [e for e in self.edges if e.kind == "CALLS"] + assert len(calls) >= 3 + + class TestSolidityParsing: def setup_method(self): self.parser = CodeParser()