From 66d1a48e240ed16e41632ce826fcee1652c0ec89 Mon Sep 17 00:00:00 2001 From: sergio-sisternes-epam Date: Wed, 4 Mar 2026 15:50:56 +0000 Subject: [PATCH 1/2] perf: exclude apm_modules from compilation scanning and cache Set[Path] Add apm_modules to the hardcoded exclusion list in _analyze_project_structure(), _should_exclude_subdir(), and _get_all_files() so that installed dependency trees are not scanned during compilation. Cache the Set[Path] conversion in _file_matches_pattern() to avoid recreating large sets on every call. Before: ~821s instruction processing, ~1096s total on a project with apm_modules. After: ~17s instruction processing, ~18s total (~62x improvement). Fixes #154 --- src/apm_cli/compilation/context_optimizer.py | 16 +-- .../compilation/test_context_optimizer.py | 116 ++++++++++++++++++ 2 files changed, 125 insertions(+), 7 deletions(-) diff --git a/src/apm_cli/compilation/context_optimizer.py b/src/apm_cli/compilation/context_optimizer.py index 66a9aea2..be258b62 100644 --- a/src/apm_cli/compilation/context_optimizer.py +++ b/src/apm_cli/compilation/context_optimizer.py @@ -162,8 +162,8 @@ def _get_all_files(self) -> List[Path]: if self._file_list_cache is None: self._file_list_cache = [] for root, dirs, files in os.walk(self.base_dir): - # Skip hidden directories for performance - dirs[:] = [d for d in dirs if not d.startswith('.')] + # Skip hidden and excluded directories for performance + dirs[:] = [d for d in dirs if not d.startswith('.') and d not in ('node_modules', '__pycache__', 'dist', 'build', 'apm_modules')] for file in files: if not file.startswith('.'): self._file_list_cache.append(Path(root) / file) @@ -423,7 +423,7 @@ def _analyze_project_structure(self) -> None: continue # Default hardcoded exclusions for backwards compatibility - if any(ignore in str(current_path) for ignore in ['node_modules', '__pycache__', '.git', 'dist', 'build']): + if any(ignore in str(current_path) for ignore in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'apm_modules']): continue # Apply configurable exclusion patterns @@ -475,7 +475,7 @@ def _should_exclude_subdir(self, path: Path) -> bool: # Also check if subdirectory is a default exclusion dir_name = path.name - if dir_name in ['node_modules', '__pycache__', '.git', 'dist', 'build']: + if dir_name in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'apm_modules']: return True # Skip hidden directories @@ -786,9 +786,11 @@ def _file_matches_pattern(self, file_path: Path, pattern: str) -> bool: # Use cached glob results instead of repeated glob calls matches = self._cached_glob(expanded_pattern) - # Convert to Path objects for comparison - match_paths = {Path(match) for match in matches} - if rel_path in match_paths: + # Use cached Set[Path] to avoid recreating on every call + cache_key = f"_set_{expanded_pattern}" + if cache_key not in self._glob_cache: + self._glob_cache[cache_key] = {Path(match) for match in matches} + if rel_path in self._glob_cache[cache_key]: return True except (ValueError, OSError): pass diff --git a/tests/unit/compilation/test_context_optimizer.py b/tests/unit/compilation/test_context_optimizer.py index 9ec96799..496bb903 100644 --- a/tests/unit/compilation/test_context_optimizer.py +++ b/tests/unit/compilation/test_context_optimizer.py @@ -708,6 +708,122 @@ def test_single_item_brace_group(self): result = optimizer._expand_glob_pattern("**/*.{ts}") assert result == ["**/*.ts"] + def test_three_brace_groups(self): + """Test expansion of three brace groups.""" + optimizer = ContextOptimizer(base_dir="/tmp") + result = optimizer._expand_glob_pattern("{src,lib}/*.{test,spec}.{ts,js}") + expected = [ + "src/*.test.ts", "src/*.test.js", + "src/*.spec.ts", "src/*.spec.js", + "lib/*.test.ts", "lib/*.test.js", + "lib/*.spec.ts", "lib/*.spec.js", + ] + assert sorted(result) == sorted(expected) + + +class TestApmModulesExclusion: + """Test that apm_modules/ is excluded from project scanning (Fixes #154).""" + + @pytest.fixture + def project_with_apm_modules(self): + """Create a project with a realistic apm_modules/ tree.""" + with tempfile.TemporaryDirectory() as temp_dir: + base = Path(temp_dir) + + # Real project directories + for d in ["src", "src/components", "tests", "docs"]: + (base / d).mkdir(parents=True, exist_ok=True) + + # Real project files + for f in [ + "src/index.ts", "src/components/App.tsx", + "tests/app.test.ts", "docs/README.md", + ]: + (base / f).touch() + + # Simulate apm_modules with many nested packages + for i in range(50): + pkg_dir = base / "apm_modules" / f"org/package-{i}" / "sub" + pkg_dir.mkdir(parents=True, exist_ok=True) + (pkg_dir / "SKILL.md").touch() + (pkg_dir / "instructions.md").touch() + + yield base + + def test_apm_modules_excluded_from_directory_cache(self, project_with_apm_modules): + """apm_modules directories must not appear in _directory_cache.""" + optimizer = ContextOptimizer(base_dir=str(project_with_apm_modules)) + optimizer._analyze_project_structure() + + cached_paths = set(optimizer._directory_cache.keys()) + # Resolve to handle macOS /var -> /private/var symlink + resolved_base = project_with_apm_modules.resolve() + + # Real dirs are cached + assert resolved_base / "src" in cached_paths + assert resolved_base / "tests" in cached_paths + + # No apm_modules path is cached + apm_paths = [p for p in cached_paths if "apm_modules" in str(p)] + assert apm_paths == [], f"apm_modules leaked into cache: {apm_paths}" + + def test_cache_size_unaffected_by_apm_modules(self, project_with_apm_modules): + """Directory cache size should reflect only project dirs, not apm_modules.""" + optimizer = ContextOptimizer(base_dir=str(project_with_apm_modules)) + optimizer._analyze_project_structure() + + # 50 packages × sub-directory each = 150+ dirs if not excluded. + # Real project has at most ~5 dirs (root, src, src/components, tests, docs). + assert len(optimizer._directory_cache) <= 10, ( + f"Cache has {len(optimizer._directory_cache)} dirs — " + f"apm_modules likely not excluded" + ) + + def test_os_walk_prunes_apm_modules(self, project_with_apm_modules): + """os.walk must not descend into apm_modules subdirectories.""" + optimizer = ContextOptimizer(base_dir=str(project_with_apm_modules)) + optimizer._analyze_project_structure() + + # _should_exclude_subdir must flag apm_modules + apm_modules_path = project_with_apm_modules / "apm_modules" + assert optimizer._should_exclude_subdir(apm_modules_path) is True + + def test_find_matching_dirs_ignores_apm_modules(self, project_with_apm_modules): + """Pattern matching must not return directories inside apm_modules.""" + optimizer = ContextOptimizer(base_dir=str(project_with_apm_modules)) + optimizer._analyze_project_structure() + + matching = optimizer._find_matching_directories("**/*.md") + apm_matches = [p for p in matching if "apm_modules" in str(p)] + assert apm_matches == [], f"apm_modules dirs matched: {apm_matches}" + + +class TestGlobCacheReuse: + """Test that glob results are cached and Set[Path] conversion is not repeated.""" + + def test_set_path_cached_across_calls(self): + """_file_matches_pattern must cache the Set[Path] conversion.""" + with tempfile.TemporaryDirectory() as temp_dir: + base = Path(temp_dir) + (base / "src").mkdir() + (base / "src" / "a.ts").touch() + (base / "src" / "b.ts").touch() + + optimizer = ContextOptimizer(base_dir=str(base)) + optimizer._analyze_project_structure() + + file_a = base / "src" / "a.ts" + file_b = base / "src" / "b.ts" + + # First call populates cache + optimizer._file_matches_pattern(file_a, "**/*.ts") + assert "_set_**/*.ts" in optimizer._glob_cache + + # Second call should reuse the cached set (not recreate it) + cached_set_id = id(optimizer._glob_cache["_set_**/*.ts"]) + optimizer._file_matches_pattern(file_b, "**/*.ts") + assert id(optimizer._glob_cache["_set_**/*.ts"]) == cached_set_id + if __name__ == "__main__": pytest.main([__file__]) \ No newline at end of file From a704d1d69ea9f18c5402c1947d2d02d062760288 Mon Sep 17 00:00:00 2001 From: sergio-sisternes-epam Date: Wed, 4 Mar 2026 16:14:23 +0000 Subject: [PATCH 2/2] refactor: address PR #157 review feedback - Extract DEFAULT_EXCLUDED_DIRNAMES frozenset constant to eliminate duplication across _analyze_project_structure, _should_exclude_subdir, and _get_all_files (review comment 1) - Use dedicated _glob_set_cache: Dict[str, Set[Path]] instead of overloading _glob_cache with '_set_' prefixed keys (review comment 2) - Update docs/cli-reference.md and docs/compilation.md to list apm_modules in default exclusions (review comment 4) --- docs/cli-reference.md | 2 +- docs/compilation.md | 1 + src/apm_cli/compilation/context_optimizer.py | 20 ++++++++++++------- .../compilation/test_context_optimizer.py | 6 +++--- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 6daacf95..ce9a3120 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -725,7 +725,7 @@ Use the `exclude` field to skip directories during compilation. This improves pe - `projects/**/apm/**` - Complex nested matching with `**` **Default exclusions** (always applied): -- `node_modules`, `__pycache__`, `.git`, `dist`, `build` +- `node_modules`, `__pycache__`, `.git`, `dist`, `build`, `apm_modules` - Hidden directories (starting with `.`) Command-line options always override `apm.yml` settings. Priority order: diff --git a/docs/compilation.md b/docs/compilation.md index 5f7fa76b..49dbc9a5 100644 --- a/docs/compilation.md +++ b/docs/compilation.md @@ -296,6 +296,7 @@ APM always excludes these directories (no configuration needed): - `.git` - `dist` - `build` +- `apm_modules` - Hidden directories (starting with `.`) ## Advanced Optimization Features diff --git a/src/apm_cli/compilation/context_optimizer.py b/src/apm_cli/compilation/context_optimizer.py index be258b62..07e85f5f 100644 --- a/src/apm_cli/compilation/context_optimizer.py +++ b/src/apm_cli/compilation/context_optimizer.py @@ -29,6 +29,12 @@ list = builtins.list dict = builtins.dict +# Default directory names excluded from compilation scanning. +# Shared across _analyze_project_structure, _should_exclude_subdir, and _get_all_files. +DEFAULT_EXCLUDED_DIRNAMES = frozenset({ + 'node_modules', '__pycache__', '.git', 'dist', 'build', 'apm_modules', +}) + @dataclass class DirectoryAnalysis: @@ -113,6 +119,7 @@ def __init__(self, base_dir: str = ".", exclude_patterns: Optional[List[str]] = # Performance optimization caches self._glob_cache: Dict[str, List[str]] = {} + self._glob_set_cache: Dict[str, Set[Path]] = {} self._file_list_cache: Optional[List[Path]] = None self._timing_enabled = False self._phase_timings: Dict[str, float] = {} @@ -163,7 +170,7 @@ def _get_all_files(self) -> List[Path]: self._file_list_cache = [] for root, dirs, files in os.walk(self.base_dir): # Skip hidden and excluded directories for performance - dirs[:] = [d for d in dirs if not d.startswith('.') and d not in ('node_modules', '__pycache__', 'dist', 'build', 'apm_modules')] + dirs[:] = [d for d in dirs if not d.startswith('.') and d not in DEFAULT_EXCLUDED_DIRNAMES] for file in files: if not file.startswith('.'): self._file_list_cache.append(Path(root) / file) @@ -423,7 +430,7 @@ def _analyze_project_structure(self) -> None: continue # Default hardcoded exclusions for backwards compatibility - if any(ignore in str(current_path) for ignore in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'apm_modules']): + if any(ignore in str(current_path) for ignore in DEFAULT_EXCLUDED_DIRNAMES): continue # Apply configurable exclusion patterns @@ -475,7 +482,7 @@ def _should_exclude_subdir(self, path: Path) -> bool: # Also check if subdirectory is a default exclusion dir_name = path.name - if dir_name in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'apm_modules']: + if dir_name in DEFAULT_EXCLUDED_DIRNAMES: return True # Skip hidden directories @@ -787,10 +794,9 @@ def _file_matches_pattern(self, file_path: Path, pattern: str) -> bool: # Use cached glob results instead of repeated glob calls matches = self._cached_glob(expanded_pattern) # Use cached Set[Path] to avoid recreating on every call - cache_key = f"_set_{expanded_pattern}" - if cache_key not in self._glob_cache: - self._glob_cache[cache_key] = {Path(match) for match in matches} - if rel_path in self._glob_cache[cache_key]: + if expanded_pattern not in self._glob_set_cache: + self._glob_set_cache[expanded_pattern] = {Path(match) for match in matches} + if rel_path in self._glob_set_cache[expanded_pattern]: return True except (ValueError, OSError): pass diff --git a/tests/unit/compilation/test_context_optimizer.py b/tests/unit/compilation/test_context_optimizer.py index 496bb903..10da05cb 100644 --- a/tests/unit/compilation/test_context_optimizer.py +++ b/tests/unit/compilation/test_context_optimizer.py @@ -817,12 +817,12 @@ def test_set_path_cached_across_calls(self): # First call populates cache optimizer._file_matches_pattern(file_a, "**/*.ts") - assert "_set_**/*.ts" in optimizer._glob_cache + assert "**/*.ts" in optimizer._glob_set_cache # Second call should reuse the cached set (not recreate it) - cached_set_id = id(optimizer._glob_cache["_set_**/*.ts"]) + cached_set_id = id(optimizer._glob_set_cache["**/*.ts"]) optimizer._file_matches_pattern(file_b, "**/*.ts") - assert id(optimizer._glob_cache["_set_**/*.ts"]) == cached_set_id + assert id(optimizer._glob_set_cache["**/*.ts"]) == cached_set_id if __name__ == "__main__":