Summary
Add py-tree-sitter with Python and TypeScript grammars as dependencies. Create a LanguageRegistry that abstracts grammar loading and a FileIndex that walks the file tree once for all miners to share. All miners use these abstractions — none import tree-sitter or walk the filesystem directly.
Depends on: #124
New dependencies
# pyproject.toml
"tree-sitter>=0.23",
"tree-sitter-python>=0.23",
"tree-sitter-typescript>=0.23",
New files
src/specleft/discovery/language_registry.py
from specleft.discovery.models import SupportedLanguage
SUPPORTED_EXTENSIONS: dict[str, SupportedLanguage] = {
".py": SupportedLanguage.PYTHON,
".ts": SupportedLanguage.TYPESCRIPT,
".tsx": SupportedLanguage.TYPESCRIPT,
".js": SupportedLanguage.JAVASCRIPT,
".jsx": SupportedLanguage.JAVASCRIPT,
".mjs": SupportedLanguage.JAVASCRIPT,
}
class LanguageRegistry:
def detect_language(self, file_path: Path) -> SupportedLanguage | None:
"""Returns SupportedLanguage enum member or None for unsupported extensions."""
def parse(self, file_path: Path) -> tuple[tree_sitter.Node, SupportedLanguage] | None:
"""Returns (root_node, language) or None if unsupported or parse error."""
def parse_source(self, source: bytes, language: SupportedLanguage) -> tree_sitter.Node:
"""Parse raw bytes directly — used in tests."""
Grammar instances are cached on first load. Parse errors are caught, and parse() returns None in that case; miners must handle None gracefully.
src/specleft/discovery/file_index.py
from specleft.discovery.models import SupportedLanguage
DEFAULT_EXCLUDE_DIRS: frozenset[str] = frozenset({
".git", "node_modules", "__pycache__", ".venv", "venv",
"dist", "build", ".next", ".mypy_cache", ".pytest_cache",
".tox", ".eggs", "*.egg-info",
})
class FileIndex:
"""
Walks the project tree once and provides filtered views.
Built by the pipeline and shared with all miners via MinerContext.
Miners should never walk the filesystem directly — always use the index.
"""
def __init__(
self,
root: Path,
exclude_dirs: frozenset[str] = DEFAULT_EXCLUDE_DIRS,
) -> None:
self._root = root
self._files: list[Path] = [] # all discovered files
self._by_language: dict[SupportedLanguage, list[Path]] = {}
self._by_extension: dict[str, list[Path]] = {}
self._build(exclude_dirs)
@property
def root(self) -> Path: ...
@property
def total_files(self) -> int: ...
def files_by_language(self, lang: SupportedLanguage) -> list[Path]:
"""Return all files for a given language."""
def files_by_extension(self, *exts: str) -> list[Path]:
"""Return files matching any of the given extensions (e.g. '.py', '.ts')."""
def files_matching(self, *patterns: str) -> list[Path]:
"""Return files whose name matches any of the given glob patterns
(e.g. 'test_*.py', '*.spec.ts')."""
def files_under(self, *dirs: str) -> list[Path]:
"""Return files under specific directories relative to root
(e.g. 'src', 'lib', 'app'). Used by the docstring miner."""
def _build(self, exclude_dirs: frozenset[str]) -> None:
"""Walk once, populate all internal indices."""
src/specleft/discovery/language_detect.py
from specleft.discovery.models import SupportedLanguage
def detect_project_languages(
file_index: FileIndex,
threshold: float = 0.01,
) -> list[SupportedLanguage]:
"""
Count files per language from the pre-built FileIndex.
Return languages above the threshold (default 1% of total files).
"""
Note: detect_project_languages now takes a FileIndex instead of a root: Path, avoiding a second filesystem walk.
Test fixtures (create these)
tests/fixtures/discovery/
sample.py # module with plain functions and a class
sample.ts # module with typed functions
sample_api.py # FastAPI/Flask route definitions
sample_api.ts # Express route definitions
sample_tests.py # pytest test functions (3: plain, parametrized, TestCase)
sample_tests.ts # Jest/Vitest test blocks (describe + it + it.todo)
Acceptance criteria
Summary
Add py-tree-sitter with Python and TypeScript grammars as dependencies. Create a LanguageRegistry that abstracts grammar loading and a FileIndex that walks the file tree once for all miners to share. All miners use these abstractions — none import tree-sitter or walk the filesystem directly.
Depends on: #124
New dependencies
New files
src/specleft/discovery/language_registry.py
Grammar instances are cached on first load. Parse errors are caught and return None; miners must handle None gracefully.
src/specleft/discovery/file_index.py
src/specleft/discovery/language_detect.py
Note: detect_project_languages now takes a FileIndex instead of a root: Path, avoiding a second filesystem walk.
Test fixtures (create these)
Acceptance criteria
- LanguageRegistry().parse(path_to_py_file) returns (node, SupportedLanguage.PYTHON)
- LanguageRegistry().parse(path_to_ts_file) returns (node, SupportedLanguage.TYPESCRIPT)
- detect_language() returns SupportedLanguage members, not raw strings
- Unsupported extension (e.g. .rb) returns None without raising
- Parse error returns None without raising
- FileIndex walks the tree once — total_files is correct
- FileIndex.files_by_language(SupportedLanguage.PYTHON) returns only .py files
- FileIndex.files_matching("test_*.py") returns only matching files
- FileIndex.files_under("src") returns only files under src/
- Directories in DEFAULT_EXCLUDE_DIRS are skipped
- detect_project_languages(file_index) returns [SupportedLanguage.PYTHON] on the specleft repo
- Tests in tests/discovery/test_language_registry.py and tests/discovery/test_file_index.py
- Update features/feature-spec-discovery.md to cover the functionality introduced by this issue