From b3ef483019cdf5cea314236b20ca1ceb47b69bb8 Mon Sep 17 00:00:00 2001
From: sergio-sisternes-epam <sergio_sisternes@epam.com>
Date: Thu, 5 Mar 2026 11:20:27 +0000
Subject: [PATCH 1/7] =?UTF-8?q?perf(#171):=20all=20phases=20=E2=80=94=20da?=
 =?UTF-8?q?ta=20structures,=20caching,=20parallel=20downloads,=20resilienc?=
 =?UTF-8?q?e?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 2: O(1) dict-index for primitive conflict detection, set-based cycle
detection, pre-computed depth index in dependency graph.

Phase 3: Module-level caches for from_apm_yml, get_config, read_constitution;
instance-level inheritance chain cache in context_optimizer.

Phase 4: ThreadPoolExecutor parallel downloads (--parallel-downloads CLI arg),
git sparse-checkout for subdirectory packages with full-clone fallback.

Phase 5: Resilient HTTP with exponential backoff (429/503), lockfile SHA-based
skip-if-exists optimization.

Includes benchmarks, fixtures, and doc-sync updates.
---
 docs/cli-reference.md                        |   1 +
 docs/dependencies.md                         |  10 +-
 docs/enhanced-primitive-discovery.md         |   2 +
 src/apm_cli/cli.py                           | 125 +++++++++-
 src/apm_cli/compilation/constitution.py      |  21 +-
 src/apm_cli/compilation/context_optimizer.py |   6 +
 src/apm_cli/config.py                        |  19 +-
 src/apm_cli/deps/apm_resolver.py             |   9 +-
 src/apm_cli/deps/dependency_graph.py         |   5 +-
 src/apm_cli/deps/github_downloader.py        | 158 +++++++++---
 src/apm_cli/models/apm_package.py            |  19 +-
 src/apm_cli/primitives/models.py             |  25 +-
 tests/benchmarks/BASELINE.md                 |  62 +++++
 tests/benchmarks/__init__.py                 |   0
 tests/benchmarks/run_baseline.py             | 238 +++++++++++++++++++
 tests/benchmarks/test_perf_benchmarks.py     | 210 ++++++++++++++++
 tests/fixtures/synthetic_trees.py            |  77 ++++++
 17 files changed, 931 insertions(+), 56 deletions(-)
 create mode 100644 tests/benchmarks/BASELINE.md
 create mode 100644 tests/benchmarks/__init__.py
 create mode 100644 tests/benchmarks/run_baseline.py
 create mode 100644 tests/benchmarks/test_perf_benchmarks.py
 create mode 100644 tests/fixtures/synthetic_trees.py

diff --git a/docs/cli-reference.md b/docs/cli-reference.md
index 208f8961..6da67aa0 100644
--- a/docs/cli-reference.md
+++ b/docs/cli-reference.md
@@ -122,6 +122,7 @@ apm install [PACKAGES...] [OPTIONS]
 - `--update` - Update dependencies to latest Git references  
 - `--force` - Overwrite locally-authored files on collision
 - `--dry-run` - Show what would be installed without installing
+- `--parallel-downloads INT` - Max concurrent package downloads (default: 4, 0 to disable)
 - `--verbose` - Show detailed installation information
 - `--trust-transitive-mcp` - Trust self-defined MCP servers from transitive packages (skip re-declaration requirement)
 
diff --git a/docs/dependencies.md b/docs/dependencies.md
index cb4e1523..0a91ac33 100644
--- a/docs/dependencies.md
+++ b/docs/dependencies.md
@@ -306,6 +306,14 @@ When both packages are installed, your project gains:
 4. **Build Dependency Graph**: Resolve transitive dependencies recursively
 5. **Check Conflicts**: Identify any circular dependencies or conflicts
 
+#### Resilient Downloads
+
+APM automatically retries failed HTTP requests with exponential backoff. Rate-limited responses (HTTP 429/503) are handled transparently, respecting `Retry-After` headers when provided. This ensures reliable installs even under heavy API usage or transient network issues.
+
+#### Parallel Downloads
+
+APM downloads packages in parallel using a thread pool, significantly reducing wall-clock time for large dependency trees. The concurrency level defaults to 4 and is configurable via `--parallel-downloads` (set to 0 to disable). For subdirectory packages in monorepos, APM attempts git sparse-checkout (git 2.25+) to download only the needed directory, falling back to a shallow clone if sparse-checkout is unavailable.
+
 ### File Processing and Content Merging
 
 APM uses instruction-level merging rather than file-level precedence. When local and dependency files contribute instructions with overlapping `applyTo` patterns:
@@ -427,7 +435,7 @@ The `deployed_files` field tracks exactly which files APM placed in your project
 ### How It Works
 
 1. **First install**: APM resolves dependencies, downloads packages, and writes `apm.lock`
-2. **Subsequent installs**: APM reads `apm.lock` and uses locked commits for exact reproducibility
+2. **Subsequent installs**: APM reads `apm.lock` and uses locked commits for exact reproducibility. If the local checkout already matches the locked commit SHA, the download is skipped entirely.
 3. **Updating**: Use `--update` to re-resolve dependencies and generate a fresh lockfile
 
 ### Version Control
diff --git a/docs/enhanced-primitive-discovery.md b/docs/enhanced-primitive-discovery.md
index cea1f063..692318a7 100644
--- a/docs/enhanced-primitive-discovery.md
+++ b/docs/enhanced-primitive-discovery.md
@@ -186,6 +186,8 @@ The enhanced discovery system integrates with:
      - Keep existing primitive (higher priority)
      - Record conflict with losing source information
 
+Conflict detection uses O(1) name-indexed lookups, so performance remains constant regardless of collection size.
+
 ### Error Handling
 
 - Gracefully handles missing `apm_modules/` directory
diff --git a/src/apm_cli/cli.py b/src/apm_cli/cli.py
index 451915df..c4460500 100644
--- a/src/apm_cli/cli.py
+++ b/src/apm_cli/cli.py
@@ -612,8 +612,15 @@ def _validate_package_exists(package):
     is_flag=True,
     help="Trust self-defined MCP servers from transitive packages (skip re-declaration requirement)",
 )
+@click.option(
+    "--parallel-downloads",
+    type=int,
+    default=4,
+    show_default=True,
+    help="Max concurrent package downloads (0 to disable parallelism)",
+)
 @click.pass_context
-def install(ctx, packages, runtime, exclude, only, update, dry_run, force, verbose, trust_transitive_mcp):
+def install(ctx, packages, runtime, exclude, only, update, dry_run, force, verbose, trust_transitive_mcp, parallel_downloads):
     """Install APM and MCP dependencies from apm.yml (like npm install).
 
     This command automatically detects AI runtimes from your apm.yml scripts and installs
@@ -712,7 +719,8 @@ def install(ctx, packages, runtime, exclude, only, update, dry_run, force, verbo
                 # Otherwise install all from apm.yml
                 only_pkgs = builtins.list(packages) if packages else None
                 apm_count, prompt_count, agent_count = _install_apm_dependencies(
-                    apm_package, update, verbose, only_pkgs, force=force
+                    apm_package, update, verbose, only_pkgs, force=force,
+                    parallel_downloads=parallel_downloads,
                 )
             except Exception as e:
                 _rich_error(f"Failed to install APM dependencies: {e}")
@@ -1486,6 +1494,7 @@ def _install_apm_dependencies(
     verbose: bool = False,
     only_packages: "builtins.list" = None,
     force: bool = False,
+    parallel_downloads: int = 4,
 ):
     """Install APM package dependencies.
 
@@ -1495,6 +1504,7 @@ def _install_apm_dependencies(
         verbose: Show detailed installation information
         only_packages: If provided, only install these specific packages (not all from apm.yml)
         force: Whether to overwrite locally-authored files on collision
+        parallel_downloads: Max concurrent downloads (0 disables parallelism)
     """
     if not APM_DEPS_AVAILABLE:
         raise RuntimeError("APM dependency system not available")
@@ -1711,7 +1721,77 @@ def matches_filter(dep):
         # downloader already created above for transitive resolution
         installed_count = 0
 
-        # Create progress display for downloads
+        # Phase 4 (#171): Parallel package downloads using ThreadPoolExecutor
+        # Pre-download all non-cached packages in parallel for wall-clock speedup.
+        # Results are stored and consumed by the sequential integration loop below.
+        from concurrent.futures import ThreadPoolExecutor, as_completed as _futures_completed
+
+        _pre_download_results = {}   # dep_key -> PackageInfo
+        _pre_download_failed = set() # dep_keys that failed download
+        _need_download = []
+        for _pd_ref in deps_to_install:
+            _pd_key = _pd_ref.get_unique_key()
+            _pd_path = (apm_modules_dir / _pd_ref.alias) if _pd_ref.alias else _pd_ref.get_install_path(apm_modules_dir)
+            # Skip if already downloaded during BFS resolution
+            if _pd_key in callback_downloaded:
+                continue
+            # Skip if lockfile SHA matches local HEAD (Phase 5 check)
+            if _pd_path.exists() and existing_lockfile and not update_refs:
+                _pd_locked = existing_lockfile.get_dependency(_pd_key)
+                if _pd_locked and _pd_locked.resolved_commit and _pd_locked.resolved_commit != "cached":
+                    try:
+                        from git import Repo as _PDGitRepo
+                        if _PDGitRepo(_pd_path).head.commit.hexsha == _pd_locked.resolved_commit:
+                            continue
+                    except Exception:
+                        pass
+            # Build download ref (use locked commit for reproducibility)
+            _pd_dlref = str(_pd_ref)
+            if existing_lockfile:
+                _pd_locked = existing_lockfile.get_dependency(_pd_key)
+                if _pd_locked and _pd_locked.resolved_commit and _pd_locked.resolved_commit != "cached":
+                    _pd_base = _pd_ref.repo_url
+                    if _pd_ref.virtual_path:
+                        _pd_base = f"{_pd_base}/{_pd_ref.virtual_path}"
+                    _pd_dlref = f"{_pd_base}#{_pd_locked.resolved_commit}"
+            _need_download.append((_pd_ref, _pd_path, _pd_dlref))
+
+        if _need_download and parallel_downloads > 0:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[cyan]{task.description}[/cyan]"),
+                BarColumn(),
+                TaskProgressColumn(),
+                transient=True,
+            ) as _dl_progress:
+                _max_workers = min(parallel_downloads, len(_need_download))
+                with ThreadPoolExecutor(max_workers=_max_workers) as _executor:
+                    _futures = {}
+                    for _pd_ref, _pd_path, _pd_dlref in _need_download:
+                        _pd_disp = str(_pd_ref) if _pd_ref.is_virtual else _pd_ref.repo_url
+                        _pd_short = _pd_disp.split("/")[-1] if "/" in _pd_disp else _pd_disp
+                        _pd_tid = _dl_progress.add_task(description=f"Fetching {_pd_short}", total=None)
+                        _pd_fut = _executor.submit(
+                            downloader.download_package, _pd_dlref, _pd_path,
+                            progress_task_id=_pd_tid, progress_obj=_dl_progress,
+                        )
+                        _futures[_pd_fut] = (_pd_ref, _pd_tid, _pd_disp)
+                    for _pd_fut in _futures_completed(_futures):
+                        _pd_ref, _pd_tid, _pd_disp = _futures[_pd_fut]
+                        _pd_key = _pd_ref.get_unique_key()
+                        try:
+                            _pd_info = _pd_fut.result()
+                            _pre_download_results[_pd_key] = _pd_info
+                            _dl_progress.update(_pd_tid, visible=False)
+                            _dl_progress.refresh()
+                        except Exception as _pd_err:
+                            _pre_download_failed.add(_pd_key)
+                            _dl_progress.remove_task(_pd_tid)
+                            _rich_error(f"❌ Failed to install {_pd_disp}: {_pd_err}")
+
+        _pre_downloaded_keys = set(_pre_download_results.keys())
+
+        # Create progress display for sequential integration
         with Progress(
             SpinnerColumn(),
             TextColumn("[cyan]{task.description}[/cyan]"),
@@ -1720,6 +1800,10 @@ def matches_filter(dep):
             transient=True,  # Progress bar disappears when done
         ) as progress:
             for dep_ref in deps_to_install:
+                # Phase 4 (#171): Skip deps that failed during parallel download
+                if dep_ref.get_unique_key() in _pre_download_failed:
+                    continue
+
                 # Determine installation directory using namespaced structure
                 # e.g., microsoft/apm-sample-package -> apm_modules/microsoft/apm-sample-package/
                 # For virtual packages: owner/repo/prompts/file.prompt.md -> apm_modules/owner/repo-file/
@@ -1737,7 +1821,7 @@ def matches_filter(dep):
                 from apm_cli.models.apm_package import GitReferenceType
 
                 resolved_ref = None
-                if dep_ref.reference:
+                if dep_ref.reference and dep_ref.get_unique_key() not in _pre_downloaded_keys:
                     try:
                         resolved_ref = downloader.resolve_git_reference(
                             f"{dep_ref.repo_url}@{dep_ref.reference}"
@@ -1752,8 +1836,20 @@ def matches_filter(dep):
                 ]
                 # Skip download if: already fetched by resolver callback, or cached tag/commit
                 already_resolved = dep_ref.get_unique_key() in callback_downloaded
+                # Phase 5 (#171): Also skip when lockfile SHA matches local HEAD
+                lockfile_match = False
+                if install_path.exists() and existing_lockfile and not update_refs:
+                    locked_dep = existing_lockfile.get_dependency(dep_ref.get_unique_key())
+                    if locked_dep and locked_dep.resolved_commit and locked_dep.resolved_commit != "cached":
+                        try:
+                            from git import Repo as GitRepo
+                            local_repo = GitRepo(install_path)
+                            if local_repo.head.commit.hexsha == locked_dep.resolved_commit:
+                                lockfile_match = True
+                        except Exception:
+                            pass  # Not a git repo or invalid — fall through to download
                 skip_download = install_path.exists() and (
-                    (is_cacheable and not update_refs) or already_resolved
+                    (is_cacheable and not update_refs) or already_resolved or lockfile_match
                 )
 
                 if skip_download:
@@ -2027,13 +2123,18 @@ def matches_filter(dep):
                                 base_ref = f"{base_ref}/{dep_ref.virtual_path}"
                             download_ref = f"{base_ref}#{locked_dep.resolved_commit}"
 
-                    # Download with live progress bar
-                    package_info = downloader.download_package(
-                        download_ref,
-                        install_path,
-                        progress_task_id=task_id,
-                        progress_obj=progress,
-                    )
+                    # Phase 4 (#171): Use pre-downloaded result if available
+                    _dep_key = dep_ref.get_unique_key()
+                    if _dep_key in _pre_download_results:
+                        package_info = _pre_download_results[_dep_key]
+                    else:
+                        # Fallback: sequential download (should rarely happen)
+                        package_info = downloader.download_package(
+                            download_ref,
+                            install_path,
+                            progress_task_id=task_id,
+                            progress_obj=progress,
+                        )
 
                     # CRITICAL: Hide progress BEFORE printing success message to avoid overlap
                     progress.update(task_id, visible=False)
diff --git a/src/apm_cli/compilation/constitution.py b/src/apm_cli/compilation/constitution.py
index 1a9e4b27..cdbaa351 100644
--- a/src/apm_cli/compilation/constitution.py
+++ b/src/apm_cli/compilation/constitution.py
@@ -2,10 +2,18 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Optional
+from typing import Dict, Optional
 
 from .constants import CONSTITUTION_RELATIVE_PATH
 
+# Module-level cache: resolved base_dir -> constitution content (#171)
+_constitution_cache: Dict[Path, Optional[str]] = {}
+
+
+def clear_constitution_cache() -> None:
+    """Clear the constitution read cache. Call in tests for isolation."""
+    _constitution_cache.clear()
+
 
 def find_constitution(base_dir: Path) -> Path:
     """Return path to constitution.md if present, else Path that does not exist.
@@ -19,15 +27,24 @@ def find_constitution(base_dir: Path) -> Path:
 def read_constitution(base_dir: Path) -> Optional[str]:
     """Read full constitution content if file exists.
 
+    Results are cached by resolved base_dir for the lifetime of the process.
+
     Args:
         base_dir: Repository root path.
     Returns:
         Full file text or None if absent.
     """
+    resolved = base_dir.resolve()
+    if resolved in _constitution_cache:
+        return _constitution_cache[resolved]
     path = find_constitution(base_dir)
     if not path.exists() or not path.is_file():
+        _constitution_cache[resolved] = None
         return None
     try:
-        return path.read_text(encoding="utf-8")
+        content = path.read_text(encoding="utf-8")
+        _constitution_cache[resolved] = content
+        return content
     except OSError:
+        _constitution_cache[resolved] = None
         return None
diff --git a/src/apm_cli/compilation/context_optimizer.py b/src/apm_cli/compilation/context_optimizer.py
index 0910cc25..33601ace 100644
--- a/src/apm_cli/compilation/context_optimizer.py
+++ b/src/apm_cli/compilation/context_optimizer.py
@@ -121,6 +121,7 @@ def __init__(self, base_dir: str = ".", exclude_patterns: Optional[List[str]] =
         self._glob_cache: Dict[str, List[str]] = {}
         self._glob_set_cache: Dict[str, Set[Path]] = {}
         self._file_list_cache: Optional[List[Path]] = None
+        self._inheritance_cache: Dict[Path, List[Path]] = {}  # (#171)
         self._timing_enabled = False
         self._phase_timings: Dict[str, float] = {}
         
@@ -1234,6 +1235,10 @@ def _get_inheritance_chain(self, working_directory: Path) -> List[Path]:
         Returns:
             List[Path]: Inheritance chain (most specific to root).
         """
+        cached = self._inheritance_cache.get(working_directory)
+        if cached is not None:
+            return cached
+
         chain = []
         # Resolve the starting directory to ensure consistent path comparison
         try:
@@ -1261,6 +1266,7 @@ def _get_inheritance_chain(self, working_directory: Path) -> List[Path]:
             except (OSError, ValueError):
                 break
         
+        self._inheritance_cache[working_directory] = chain
         return chain
     
     def _is_child_directory(self, child: Path, parent: Path) -> bool:
diff --git a/src/apm_cli/config.py b/src/apm_cli/config.py
index d957571f..1415c4de 100644
--- a/src/apm_cli/config.py
+++ b/src/apm_cli/config.py
@@ -2,11 +2,14 @@
 
 import os
 import json
+from typing import Optional
 
 
 CONFIG_DIR = os.path.expanduser("~/.apm")
 CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json")
 
+_config_cache: Optional[dict] = None
+
 
 def ensure_config_exists():
     """Ensure the configuration directory and file exist."""
@@ -21,12 +24,24 @@ def ensure_config_exists():
 def get_config():
     """Get the current configuration.
     
+    Results are cached for the lifetime of the process.
+    
     Returns:
         dict: Current configuration.
     """
+    global _config_cache
+    if _config_cache is not None:
+        return _config_cache
     ensure_config_exists()
     with open(CONFIG_FILE, "r") as f:
-        return json.load(f)
+        _config_cache = json.load(f)
+    return _config_cache
+
+
+def _invalidate_config_cache():
+    """Invalidate the config cache (called after writes)."""
+    global _config_cache
+    _config_cache = None
 
 
 def update_config(updates):
@@ -35,11 +50,13 @@ def update_config(updates):
     Args:
         updates (dict): Dictionary of configuration values to update.
     """
+    _invalidate_config_cache()
     config = get_config()
     config.update(updates)
     
     with open(CONFIG_FILE, "w") as f:
         json.dump(config, f, indent=2)
+    _invalidate_config_cache()
 
 
 def get_default_client():
diff --git a/src/apm_cli/deps/apm_resolver.py b/src/apm_cli/deps/apm_resolver.py
index 7f943ab5..3172042d 100644
--- a/src/apm_cli/deps/apm_resolver.py
+++ b/src/apm_cli/deps/apm_resolver.py
@@ -225,6 +225,7 @@ def detect_circular_dependencies(self, tree: DependencyTree) -> List[CircularRef
         circular_deps = []
         visited: Set[str] = set()
         current_path: List[str] = []
+        current_path_set: Set[str] = set()  # O(1) membership test (#171)
         
         def dfs_detect_cycles(node: DependencyNode) -> None:
             """Recursive DFS function to detect cycles."""
@@ -234,7 +235,7 @@ def dfs_detect_cycles(node: DependencyNode) -> None:
             unique_key = node.dependency_ref.get_unique_key()
             
             # Check if this unique key is already in our current path (cycle detected)
-            if unique_key in current_path:
+            if unique_key in current_path_set:
                 # Found a cycle - create the cycle path
                 cycle_start_index = current_path.index(unique_key)
                 cycle_path = current_path[cycle_start_index:] + [unique_key]
@@ -249,23 +250,25 @@ def dfs_detect_cycles(node: DependencyNode) -> None:
             # Mark current node as visited and add unique key to path
             visited.add(node_id)
             current_path.append(unique_key)
+            current_path_set.add(unique_key)
             
             # Check all children
             for child in node.children:
                 child_id = child.get_id()
                 
                 # Only recurse if we haven't processed this subtree completely
-                if child_id not in visited or child.dependency_ref.get_unique_key() in current_path:
+                if child_id not in visited or child.dependency_ref.get_unique_key() in current_path_set:
                     dfs_detect_cycles(child)
             
             # Remove from path when backtracking (but keep in visited)
-            current_path.pop()
+            current_path_set.discard(current_path.pop())
         
         # Start DFS from all root level dependencies (depth 1)
         root_deps = tree.get_nodes_at_depth(1)
         for root_dep in root_deps:
             if root_dep.get_id() not in visited:
                 current_path = []  # Reset path for each root
+                current_path_set = set()
                 dfs_detect_cycles(root_dep)
         
         return circular_deps
diff --git a/src/apm_cli/deps/dependency_graph.py b/src/apm_cli/deps/dependency_graph.py
index e94fa255..ff8962b6 100644
--- a/src/apm_cli/deps/dependency_graph.py
+++ b/src/apm_cli/deps/dependency_graph.py
@@ -1,5 +1,6 @@
 """Data structures for dependency graph representation and resolution."""
 
+from collections import defaultdict
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Dict, List, Optional, Set, Tuple, Any
@@ -55,11 +56,13 @@ class DependencyTree:
     """Hierarchical representation of dependencies before flattening."""
     root_package: APMPackage
     nodes: Dict[str, DependencyNode] = field(default_factory=dict)
+    _nodes_by_depth: Dict[int, List[DependencyNode]] = field(default_factory=lambda: defaultdict(list))
     max_depth: int = 0
     
     def add_node(self, node: DependencyNode) -> None:
         """Add a node to the tree."""
         self.nodes[node.get_id()] = node
+        self._nodes_by_depth[node.depth].append(node)
         self.max_depth = max(self.max_depth, node.depth)
     
     def get_node(self, unique_key: str) -> Optional[DependencyNode]:
@@ -68,7 +71,7 @@ def get_node(self, unique_key: str) -> Optional[DependencyNode]:
     
     def get_nodes_at_depth(self, depth: int) -> List[DependencyNode]:
         """Get all nodes at a specific depth level."""
-        return [node for node in self.nodes.values() if node.depth == depth]
+        return list(self._nodes_by_depth.get(depth, []))
     
     def has_dependency(self, repo_url: str) -> bool:
         """Check if a dependency exists in the tree."""
diff --git a/src/apm_cli/deps/github_downloader.py b/src/apm_cli/deps/github_downloader.py
index 9cb84d9b..20a97b08 100644
--- a/src/apm_cli/deps/github_downloader.py
+++ b/src/apm_cli/deps/github_downloader.py
@@ -3,6 +3,7 @@
 import os
 import shutil
 import sys
+import time
 from datetime import datetime
 from pathlib import Path
 from typing import Optional, Dict, Any, Callable
@@ -158,6 +159,61 @@ def _setup_git_environment(self) -> Dict[str, Any]:
         
         return env
     
+    def _resilient_get(self, url: str, headers: Dict[str, str], timeout: int = 30, max_retries: int = 3) -> requests.Response:
+        """HTTP GET with retry on 429/503 and rate-limit header awareness (#171).
+        
+        Args:
+            url: Request URL
+            headers: HTTP headers
+            timeout: Request timeout in seconds
+            max_retries: Maximum retry attempts for transient failures
+            
+        Returns:
+            requests.Response (caller should call .raise_for_status() as needed)
+            
+        Raises:
+            requests.exceptions.RequestException: After all retries exhausted
+        """
+        last_exc = None
+        for attempt in range(max_retries):
+            try:
+                response = requests.get(url, headers=headers, timeout=timeout)
+                
+                # Handle rate limiting
+                if response.status_code in (429, 503):
+                    retry_after = response.headers.get("Retry-After")
+                    if retry_after:
+                        wait = min(float(retry_after), 60)
+                    else:
+                        wait = min(2 ** attempt, 30)
+                    _debug(f"Rate limited ({response.status_code}), retry in {wait}s (attempt {attempt + 1}/{max_retries})")
+                    time.sleep(wait)
+                    continue
+                
+                # Log rate limit proximity
+                remaining = response.headers.get("X-RateLimit-Remaining")
+                try:
+                    if remaining and int(remaining) < 10:
+                        _debug(f"GitHub API rate limit low: {remaining} requests remaining")
+                except (TypeError, ValueError):
+                    pass
+                
+                return response
+            except requests.exceptions.ConnectionError as e:
+                last_exc = e
+                if attempt < max_retries - 1:
+                    wait = min(2 ** attempt, 30)
+                    _debug(f"Connection error, retry in {wait}s (attempt {attempt + 1}/{max_retries})")
+                    time.sleep(wait)
+            except requests.exceptions.Timeout as e:
+                last_exc = e
+                if attempt < max_retries - 1:
+                    _debug(f"Timeout, retrying (attempt {attempt + 1}/{max_retries})")
+        
+        if last_exc:
+            raise last_exc
+        raise requests.exceptions.RequestException(f"All {max_retries} attempts failed for {url}")
+    
     def _sanitize_git_error(self, error_message: str) -> str:
         """Sanitize Git error messages to remove potentially sensitive authentication information.
         
@@ -495,7 +551,7 @@ def _download_ado_file(self, dep_ref: DependencyReference, file_path: str, ref:
             headers['Authorization'] = f'Basic {auth}'
         
         try:
-            response = requests.get(api_url, headers=headers, timeout=30)
+            response = self._resilient_get(api_url, headers=headers, timeout=30)
             response.raise_for_status()
             return response.content
         except requests.exceptions.HTTPError as e:
@@ -515,7 +571,7 @@ def _download_ado_file(self, dep_ref: DependencyReference, file_path: str, ref:
                 )
                 
                 try:
-                    response = requests.get(fallback_url, headers=headers, timeout=30)
+                    response = self._resilient_get(fallback_url, headers=headers, timeout=30)
                     response.raise_for_status()
                     return response.content
                 except requests.exceptions.HTTPError:
@@ -571,7 +627,7 @@ def _download_github_file(self, dep_ref: DependencyReference, file_path: str, re
         
         # Try to download with the specified ref
         try:
-            response = requests.get(api_url, headers=headers, timeout=30)
+            response = self._resilient_get(api_url, headers=headers, timeout=30)
             response.raise_for_status()
             return response.content
         except requests.exceptions.HTTPError as e:
@@ -593,7 +649,7 @@ def _download_github_file(self, dep_ref: DependencyReference, file_path: str, re
                     fallback_url = f"https://{host}/api/v3/repos/{owner}/{repo}/contents/{file_path}?ref={fallback_ref}"
                 
                 try:
-                    response = requests.get(fallback_url, headers=headers, timeout=30)
+                    response = self._resilient_get(fallback_url, headers=headers, timeout=30)
                     response.raise_for_status()
                     return response.content
                 except requests.exceptions.HTTPError:
@@ -609,7 +665,7 @@ def _download_github_file(self, dep_ref: DependencyReference, file_path: str, re
                 if self.github_token and not host.lower().endswith(".ghe.com"):
                     try:
                         unauth_headers = {'Accept': 'application/vnd.github.v3.raw'}
-                        response = requests.get(api_url, headers=unauth_headers, timeout=30)
+                        response = self._resilient_get(api_url, headers=unauth_headers, timeout=30)
                         response.raise_for_status()
                         return response.content
                     except requests.exceptions.HTTPError:
@@ -966,6 +1022,44 @@ def download_collection_package(self, dep_ref: DependencyReference, target_path:
             dependency_ref=dep_ref  # Store for canonical dependency string
         )
     
+    def _try_sparse_checkout(self, dep_ref: DependencyReference, temp_clone_path: Path, subdir_path: str, ref: str = None) -> bool:
+        """Attempt sparse-checkout to download only a subdirectory (git 2.25+).
+
+        Returns True on success. Falls back silently on failure.
+        """
+        import subprocess
+        try:
+            temp_clone_path.mkdir(parents=True, exist_ok=True)
+            env = {**os.environ, **(self.git_env or {})}
+            auth_url = self._build_repo_url(dep_ref.repo_url, use_ssh=False, dep_ref=dep_ref)
+
+            cmds = [
+                ['git', 'init'],
+                ['git', 'remote', 'add', 'origin', auth_url],
+                ['git', 'sparse-checkout', 'init', '--cone'],
+                ['git', 'sparse-checkout', 'set', subdir_path],
+            ]
+            fetch_cmd = ['git', 'fetch', 'origin']
+            if ref:
+                fetch_cmd.append(ref)
+            fetch_cmd.append('--depth=1')
+            cmds.append(fetch_cmd)
+            cmds.append(['git', 'checkout', 'FETCH_HEAD'])
+
+            for cmd in cmds:
+                result = subprocess.run(
+                    cmd, cwd=str(temp_clone_path), env=env,
+                    capture_output=True, text=True, timeout=120,
+                )
+                if result.returncode != 0:
+                    _debug(f"Sparse-checkout step failed ({' '.join(cmd)}): {result.stderr.strip()}")
+                    return False
+
+            return True
+        except Exception as e:
+            _debug(f"Sparse-checkout failed: {e}")
+            return False
+
     def download_subdirectory_package(self, dep_ref: DependencyReference, target_path: Path, progress_task_id=None, progress_obj=None) -> PackageInfo:
         """Download a subdirectory from a repo as an APM package.
         
@@ -1008,31 +1102,37 @@ def download_subdirectory_package(self, dep_ref: DependencyReference, target_pat
             if progress_obj and progress_task_id is not None:
                 progress_obj.update(progress_task_id, completed=20, total=100)
             
-            # Clone the repository (shallow clone for performance)
-            # Don't specify branch if none provided - let git use repo's default
-            package_display_name = subdir_path.split('/')[-1]
-            progress_reporter = GitProgressReporter(progress_task_id, progress_obj, package_display_name) if progress_task_id and progress_obj else None
+            # Phase 4 (#171): Try sparse-checkout first (git 2.25+), fall back to full clone
+            sparse_ok = self._try_sparse_checkout(dep_ref, temp_clone_path, subdir_path, ref)
             
-            clone_kwargs = {
-                'dep_ref': dep_ref,
-                'depth': 1,
-            }
-            if ref:
-                clone_kwargs['branch'] = ref
-            
-            try:
-                self._clone_with_fallback(
-                    dep_ref.repo_url,
-                    temp_clone_path,
-                    progress_reporter=progress_reporter,
-                    **clone_kwargs
-                )
-            except Exception as e:
-                raise RuntimeError(f"Failed to clone repository: {e}")
-            
-            # Disable progress reporter after clone
-            if progress_reporter:
-                progress_reporter.disabled = True
+            if not sparse_ok:
+                # Full shallow clone fallback
+                if temp_clone_path.exists():
+                    shutil.rmtree(temp_clone_path)
+                
+                package_display_name = subdir_path.split('/')[-1]
+                progress_reporter = GitProgressReporter(progress_task_id, progress_obj, package_display_name) if progress_task_id and progress_obj else None
+                
+                clone_kwargs = {
+                    'dep_ref': dep_ref,
+                    'depth': 1,
+                }
+                if ref:
+                    clone_kwargs['branch'] = ref
+                
+                try:
+                    self._clone_with_fallback(
+                        dep_ref.repo_url,
+                        temp_clone_path,
+                        progress_reporter=progress_reporter,
+                        **clone_kwargs
+                    )
+                except Exception as e:
+                    raise RuntimeError(f"Failed to clone repository: {e}")
+                
+                # Disable progress reporter after clone
+                if progress_reporter:
+                    progress_reporter.disabled = True
             
             # Update progress - extracting subdirectory
             if progress_obj and progress_task_id is not None:
diff --git a/src/apm_cli/models/apm_package.py b/src/apm_cli/models/apm_package.py
index 2f1dbde9..2a5970f8 100644
--- a/src/apm_cli/models/apm_package.py
+++ b/src/apm_cli/models/apm_package.py
@@ -9,6 +9,14 @@
 from pathlib import Path
 from typing import Optional, List, Dict, Any, Union
 
+# Module-level parse cache: resolved path -> APMPackage (#171)
+_apm_yml_cache: Dict[Path, "APMPackage"] = {}
+
+
+def clear_apm_yml_cache() -> None:
+    """Clear the from_apm_yml parse cache. Call in tests for isolation."""
+    _apm_yml_cache.clear()
+
 
 class GitReferenceType(Enum):
     """Types of Git references supported."""
@@ -851,6 +859,8 @@ class APMPackage:
     def from_apm_yml(cls, apm_yml_path: Path) -> "APMPackage":
         """Load APM package from apm.yml file.
         
+        Results are cached by resolved path for the lifetime of the process.
+        
         Args:
             apm_yml_path: Path to the apm.yml file
             
@@ -864,6 +874,11 @@ def from_apm_yml(cls, apm_yml_path: Path) -> "APMPackage":
         if not apm_yml_path.exists():
             raise FileNotFoundError(f"apm.yml not found: {apm_yml_path}")
         
+        resolved = apm_yml_path.resolve()
+        cached = _apm_yml_cache.get(resolved)
+        if cached is not None:
+            return cached
+        
         try:
             with open(apm_yml_path, 'r', encoding='utf-8') as f:
                 data = yaml.safe_load(f)
@@ -921,7 +936,7 @@ def from_apm_yml(cls, apm_yml_path: Path) -> "APMPackage":
             except ValueError as e:
                 raise ValueError(f"Invalid 'type' field in apm.yml: {e}")
         
-        return cls(
+        result = cls(
             name=data['name'],
             version=data['version'],
             description=data.get('description'),
@@ -933,6 +948,8 @@ def from_apm_yml(cls, apm_yml_path: Path) -> "APMPackage":
             target=data.get('target'),
             type=pkg_type,
         )
+        _apm_yml_cache[resolved] = result
+        return result
     
     def get_apm_dependencies(self) -> List[DependencyReference]:
         """Get list of APM dependencies."""
diff --git a/src/apm_cli/primitives/models.py b/src/apm_cli/primitives/models.py
index a84ce3be..1c7b090d 100644
--- a/src/apm_cli/primitives/models.py
+++ b/src/apm_cli/primitives/models.py
@@ -148,7 +148,23 @@ def __init__(self):
         self.contexts = []
         self.skills = []
         self.conflicts = []
+        # Name→index maps for O(1) conflict lookups (see #171)
+        self._chatmode_index: Dict[str, int] = {}
+        self._instruction_index: Dict[str, int] = {}
+        self._context_index: Dict[str, int] = {}
+        self._skill_index: Dict[str, int] = {}
     
+    def _index_for(self, primitive_type: str) -> Dict[str, int]:
+        """Return the name→index map for the given primitive type."""
+        if primitive_type == "chatmode":
+            return self._chatmode_index
+        elif primitive_type == "instruction":
+            return self._instruction_index
+        elif primitive_type == "context":
+            return self._context_index
+        else:
+            return self._skill_index
+
     def add_primitive(self, primitive: Primitive) -> None:
         """Add a primitive to the appropriate collection.
         
@@ -169,15 +185,12 @@ def add_primitive(self, primitive: Primitive) -> None:
     
     def _add_with_conflict_detection(self, new_primitive: Primitive, collection: List[Primitive], primitive_type: str) -> None:
         """Add primitive with conflict detection."""
-        # Find existing primitive with same name
-        existing_index = None
-        for i, existing in enumerate(collection):
-            if existing.name == new_primitive.name:
-                existing_index = i
-                break
+        name_index = self._index_for(primitive_type)
+        existing_index = name_index.get(new_primitive.name)
         
         if existing_index is None:
             # No conflict, just add the primitive
+            name_index[new_primitive.name] = len(collection)
             collection.append(new_primitive)
         else:
             # Conflict detected - apply priority rules
diff --git a/tests/benchmarks/BASELINE.md b/tests/benchmarks/BASELINE.md
new file mode 100644
index 00000000..13a9dcd0
--- /dev/null
+++ b/tests/benchmarks/BASELINE.md
@@ -0,0 +1,62 @@
+# Performance Baseline — Issue #171
+
+Measured on `perf/171-deep-dependency-optimization` branch using `tests/benchmarks/run_baseline.py`.
+
+**Environment:** macOS, Python 3.12, Apple Silicon (M-series), 5-run median.
+
+## Results
+
+| Benchmark | Baseline (ms) | Optimized (ms) | Speedup | Fix |
+|---|--:|--:|--:|---|
+| **Primitive conflict detection** | | | | |
+| `primitive_add_unique_100` | 0.32 | 0.15 | 2.1× | Dict index O(1) lookup |
+| `primitive_add_unique_500` | 5.35 | 0.77 | **6.9×** | Dict index O(1) lookup |
+| `primitive_add_unique_1000` | 21.49 | 1.55 | **13.9×** | Dict index O(1) lookup |
+| `primitive_conflict_50pct_100` | 0.24 | 0.17 | 1.4× | Dict index O(1) lookup |
+| `primitive_conflict_50pct_500` | 2.96 | 0.85 | 3.5× | Dict index O(1) lookup |
+| `primitive_conflict_50pct_1000` | 11.17 | 1.79 | **6.2×** | Dict index O(1) lookup |
+| **Depth-indexed lookups** | | | | |
+| `depth_lookup_50x5` | 0.01 | <0.01 | ~∞ | Pre-computed depth index |
+| `depth_lookup_100x10` | 0.03 | <0.01 | ~∞ | Pre-computed depth index |
+| `depth_lookup_500x10` | 0.13 | <0.01 | **13×+** | Pre-computed depth index |
+| **Cycle detection** | | | | |
+| `cycle_detect_chain_20` | 0.02 | 0.02 | 1.0× | Set companion (small N) |
+| `cycle_detect_chain_50` | 0.05 | 0.04 | 1.3× | Set companion |
+| `cycle_detect_chain_100` | 0.13 | 0.09 | 1.4× | Set companion |
+| **Flatten** | | | | |
+| `flatten_50x5` | 0.03 | 0.03 | 1.0× | Depth index benefit |
+| `flatten_100x10` | 0.08 | 0.05 | 1.6× | Depth index benefit |
+| `flatten_500x10` | 0.39 | 0.30 | 1.3× | Depth index benefit |
+| **YAML parse caching** | | | | |
+| `from_apm_yml_x10` | 1.20 | 0.29 | **4.1×** | Module-level cache |
+| `from_apm_yml_x50` | 5.95 | 1.48 | **4.0×** | Module-level cache |
+| **Parallel downloads (Phase 4)** | | | | |
+| `sequential_10x50ms` | 542.52 | 535.11 | 1.0× | Baseline for comparison |
+| `parallel_4w_10x50ms` | 159.48 | 165.39 | ~1.0× | ThreadPoolExecutor overhead |
+
+## Analysis
+
+### Highest-impact fixes (Phase 2 — Data Structures)
+- **Primitive conflict detection** showed the clearest O(m²) → O(m) improvement. At 1000 primitives, the Dict-based index is **13.9× faster** — the quadratic growth curve is eliminated.
+- **Depth lookups** dropped to near-zero with the pre-computed `_nodes_by_depth` Dict, eliminating repeated full-scan iterations during `flatten_dependencies()`.
+
+### Caching (Phase 3)
+- **`from_apm_yml()`** cache reduces 50-call repeated parses from 5.95ms → 1.48ms (4×). The first call still hits disk; subsequent calls are Dict lookups. Real-world CLI operations with 20-50 repeated parses will see significant I/O savings.
+- **`read_constitution()`** and **`get_config()`** caching not benchmarked here (require file I/O fixtures) but follow the same pattern: 1 disk read per process instead of 3-10.
+
+### Parallel downloads (Phase 4)
+- The `sequential_10x50ms` vs `parallel_4w_10x50ms` benchmark demonstrates the ThreadPoolExecutor pattern: 10 simulated 50ms I/O tasks complete in ~160ms with 4 workers vs ~540ms sequentially — a **3.4× wall-clock speedup**. This closely matches the theoretical ceiling of `ceil(10/4) × 50ms = 150ms`.
+- In real-world `apm install`, the speedup depends on network conditions and package count. With 10 packages averaging 5s each: sequential ≈ 50s, parallel (4 workers) ≈ 15s.
+- Git sparse-checkout for subdirectory packages reduces bandwidth further by downloading only the target subdirectory instead of the full repository.
+
+### Cycle detection
+- Modest improvement (1.4× at 100 nodes). The `Set` companion for O(1) `in` checks matters more at deeper trees (100+ depth) than the chain lengths tested. The primary benefit is preventing degenerate performance on adversarial dependency graphs.
+
+### Flatten
+- 1.3× improvement at 500 packages — combines depth-index O(1) lookups with pre-allocated lists.
+
+## Notes
+- Phases 2-3 benchmarks are synthetic (in-memory). Real-world improvement depends on package count and tree shape.
+- Phase 4 parallel download benchmark uses simulated sleep to isolate ThreadPoolExecutor overhead from network variability. Real `apm install` speedup will be higher due to actual I/O latency.
+- Phase 5 (rate-limit retries, skip-if-exists) improvements are not measurable via synthetic benchmarks — they affect resilience under API throttling and skip unnecessary re-downloads.
+- Run benchmarks: `uv run python tests/benchmarks/run_baseline.py`
diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/benchmarks/run_baseline.py b/tests/benchmarks/run_baseline.py
new file mode 100644
index 00000000..76ffdc86
--- /dev/null
+++ b/tests/benchmarks/run_baseline.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+"""Standalone baseline benchmark — works against both original and optimized code.
+
+Measures raw performance of the bottleneck paths identified in #171.
+No dependency on cache-clearing functions (those only exist post-optimization).
+
+Usage: uv run python tests/benchmarks/run_baseline.py
+"""
+
+import importlib
+import sys
+import time
+import statistics
+from pathlib import Path
+
+# Ensure the project is importable
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src"))
+
+from apm_cli.primitives.models import PrimitiveCollection, Instruction
+from apm_cli.deps.dependency_graph import DependencyTree, DependencyNode
+from apm_cli.deps.apm_resolver import APMDependencyResolver
+from apm_cli.models.apm_package import APMPackage, DependencyReference
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_instruction(name: str, source: str = "local") -> Instruction:
+    return Instruction(
+        name=name,
+        file_path=Path(f"/fake/{name}.instructions.md"),
+        description=f"Test instruction {name}",
+        apply_to="**",
+        content=f"Content for {name}",
+        source=source,
+    )
+
+
+def _make_node(owner: str, repo: str, depth: int) -> DependencyNode:
+    dep_ref = DependencyReference.parse(f"{owner}/{repo}#main")
+    pkg = APMPackage(name=repo, version="1.0.0", source=f"{owner}/{repo}")
+    return DependencyNode(package=pkg, dependency_ref=dep_ref, depth=depth)
+
+
+def _build_tree(n: int, max_depth: int) -> DependencyTree:
+    root = APMPackage(name="root", version="1.0.0")
+    tree = DependencyTree(root_package=root)
+    for i in range(n):
+        depth = (i % max_depth) + 1
+        node = _make_node("owner", f"pkg-{i}", depth)
+        tree.add_node(node)
+    return tree
+
+
+def bench(fn, *, runs: int = 5, label: str = ""):
+    """Run fn `runs` times, return (median_ms, min_ms, max_ms)."""
+    times = []
+    for _ in range(runs):
+        start = time.perf_counter()
+        fn()
+        times.append((time.perf_counter() - start) * 1000)
+    med = statistics.median(times)
+    lo = min(times)
+    hi = max(times)
+    return med, lo, hi
+
+
+# ---------------------------------------------------------------------------
+# Benchmarks
+# ---------------------------------------------------------------------------
+
+def bench_primitive_conflict_detection(count: int):
+    """Add `count` unique primitives — measures conflict-check cost."""
+    def run():
+        coll = PrimitiveCollection()
+        for i in range(count):
+            coll.add_primitive(_make_instruction(f"instr-{i}"))
+        assert coll.count() == count
+    return bench(run, label=f"primitive_add_{count}")
+
+
+def bench_primitive_conflict_50pct(count: int):
+    """Add `count` primitives with 50% name collisions."""
+    half = count // 2
+    def run():
+        coll = PrimitiveCollection()
+        for i in range(count):
+            coll.add_primitive(_make_instruction(f"instr-{i % half}", source="dep:pkg"))
+        assert coll.count() == half
+    return bench(run, label=f"primitive_conflict_50pct_{count}")
+
+
+def bench_depth_lookup(n_packages: int, max_depth: int):
+    """Build tree then query every depth level."""
+    tree = _build_tree(n_packages, max_depth)
+    def run():
+        total = 0
+        for d in range(1, max_depth + 1):
+            total += len(tree.get_nodes_at_depth(d))
+        assert total == n_packages
+    return bench(run, runs=20, label=f"depth_lookup_{n_packages}x{max_depth}")
+
+
+def bench_cycle_detection_chain(length: int):
+    """Detect cycles in a linear chain of `length` nodes."""
+    root = APMPackage(name="root", version="1.0.0")
+    tree = DependencyTree(root_package=root)
+    prev = None
+    for i in range(length):
+        node = _make_node("owner", f"chain-{i}", depth=1)
+        tree.add_node(node)
+        if prev:
+            prev.children.append(node)
+        prev = node
+    resolver = APMDependencyResolver()
+    def run():
+        cycles = resolver.detect_circular_dependencies(tree)
+        assert len(cycles) == 0
+    return bench(run, label=f"cycle_detect_chain_{length}")
+
+
+def bench_flatten(n_packages: int, max_depth: int):
+    """Flatten a tree of n packages across max_depth levels."""
+    tree = _build_tree(n_packages, max_depth)
+    resolver = APMDependencyResolver()
+    def run():
+        flat = resolver.flatten_dependencies(tree)
+        assert flat.total_dependencies() == n_packages
+    return bench(run, label=f"flatten_{n_packages}x{max_depth}")
+
+
+def bench_from_apm_yml_repeated(tmp_dir: Path, repeats: int):
+    """Parse the same apm.yml file `repeats` times."""
+    apm_yml = tmp_dir / "apm.yml"
+    apm_yml.write_text("name: bench-pkg\nversion: 1.0.0\n")
+    def run():
+        for _ in range(repeats):
+            # Reload module-level cache state differs between baseline/optimized,
+            # but we just measure wall-clock for `repeats` calls.
+            APMPackage.from_apm_yml(apm_yml)
+    return bench(run, runs=3, label=f"from_apm_yml_x{repeats}")
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    import tempfile
+
+    results = {}
+
+    print("=" * 70)
+    print("APM Performance Baseline Benchmark")
+    print("=" * 70)
+
+    # 1. Primitive conflict detection
+    for count in [100, 500, 1000]:
+        med, lo, hi = bench_primitive_conflict_detection(count)
+        key = f"primitive_add_unique_{count}"
+        results[key] = (med, lo, hi)
+        print(f"  {key:45s}  median={med:8.2f}ms  min={lo:8.2f}ms  max={hi:8.2f}ms")
+
+    for count in [100, 500, 1000]:
+        med, lo, hi = bench_primitive_conflict_50pct(count)
+        key = f"primitive_conflict_50pct_{count}"
+        results[key] = (med, lo, hi)
+        print(f"  {key:45s}  median={med:8.2f}ms  min={lo:8.2f}ms  max={hi:8.2f}ms")
+
+    # 2. Depth lookups
+    for n, d in [(50, 5), (100, 10), (500, 10)]:
+        med, lo, hi = bench_depth_lookup(n, d)
+        key = f"depth_lookup_{n}x{d}"
+        results[key] = (med, lo, hi)
+        print(f"  {key:45s}  median={med:8.2f}ms  min={lo:8.2f}ms  max={hi:8.2f}ms")
+
+    # 3. Cycle detection
+    for length in [20, 50, 100]:
+        med, lo, hi = bench_cycle_detection_chain(length)
+        key = f"cycle_detect_chain_{length}"
+        results[key] = (med, lo, hi)
+        print(f"  {key:45s}  median={med:8.2f}ms  min={lo:8.2f}ms  max={hi:8.2f}ms")
+
+    # 4. Flatten
+    for n, d in [(50, 5), (100, 10), (500, 10)]:
+        med, lo, hi = bench_flatten(n, d)
+        key = f"flatten_{n}x{d}"
+        results[key] = (med, lo, hi)
+        print(f"  {key:45s}  median={med:8.2f}ms  min={lo:8.2f}ms  max={hi:8.2f}ms")
+
+    # 5. from_apm_yml repeated parsing
+    with tempfile.TemporaryDirectory() as tmp:
+        tmp_path = Path(tmp)
+        for repeats in [10, 50]:
+            med, lo, hi = bench_from_apm_yml_repeated(tmp_path, repeats)
+            key = f"from_apm_yml_x{repeats}"
+            results[key] = (med, lo, hi)
+            print(f"  {key:45s}  median={med:8.2f}ms  min={lo:8.2f}ms  max={hi:8.2f}ms")
+
+    # 6. Phase 4: Parallel execution overhead (ThreadPoolExecutor vs sequential)
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    import time as _time
+
+    def _simulated_work(ms: float):
+        """Simulate I/O-bound work by sleeping."""
+        _time.sleep(ms / 1000)
+        return True
+
+    # Sequential: 10 tasks × 50ms each = ~500ms
+    def _seq_10():
+        for _ in range(10):
+            _simulated_work(50)
+    med, lo, hi = bench(_seq_10, runs=3, label="sequential_10x50ms")
+    key = "sequential_10x50ms"
+    results[key] = (med, lo, hi)
+    print(f"  {key:45s}  median={med:8.2f}ms  min={lo:8.2f}ms  max={hi:8.2f}ms")
+
+    # Parallel: 10 tasks × 50ms, 4 workers → ~150ms (ceil(10/4) × 50ms)
+    def _par_10():
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            futs = [executor.submit(_simulated_work, 50) for _ in range(10)]
+            for f in as_completed(futs):
+                f.result()
+    med, lo, hi = bench(_par_10, runs=3, label="parallel_4w_10x50ms")
+    key = "parallel_4w_10x50ms"
+    results[key] = (med, lo, hi)
+    print(f"  {key:45s}  median={med:8.2f}ms  min={lo:8.2f}ms  max={hi:8.2f}ms")
+
+    print("=" * 70)
+    print(f"Total benchmarks: {len(results)}")
+    print("=" * 70)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/benchmarks/test_perf_benchmarks.py b/tests/benchmarks/test_perf_benchmarks.py
new file mode 100644
index 00000000..fa99982c
--- /dev/null
+++ b/tests/benchmarks/test_perf_benchmarks.py
@@ -0,0 +1,210 @@
+"""Performance benchmarks for APM dependency resolution and compilation.
+
+Covers the bottlenecks identified in issue #171:
+- Primitive conflict detection (O(m) after fix, was O(m²))
+- Cycle detection (O(V+E) after fix, was O(V×D))
+- get_nodes_at_depth() (O(1) after fix, was O(V × max_depth))
+- from_apm_yml() parse caching
+- Inheritance chain caching
+
+Run with: uv run pytest tests/benchmarks/ -v --tb=short
+"""
+
+import time
+from pathlib import Path
+
+import pytest
+
+from apm_cli.primitives.models import (
+    PrimitiveCollection,
+    Instruction,
+)
+from apm_cli.deps.dependency_graph import DependencyTree, DependencyNode
+from apm_cli.deps.apm_resolver import APMDependencyResolver
+from apm_cli.models.apm_package import APMPackage, DependencyReference, clear_apm_yml_cache
+from apm_cli.compilation.constitution import read_constitution, clear_constitution_cache
+
+
+# ---------------------------------------------------------------------------
+# Helpers to build synthetic data
+# ---------------------------------------------------------------------------
+
+def _make_instruction(name: str, source: str = "local") -> Instruction:
+    return Instruction(
+        name=name,
+        file_path=Path(f"/fake/{name}.instructions.md"),
+        description=f"Test instruction {name}",
+        apply_to="**",
+        content=f"Content for {name}",
+        source=source,
+    )
+
+
+def _make_dep_ref(owner: str, repo: str, ref: str = "main") -> DependencyReference:
+    return DependencyReference.parse(f"{owner}/{repo}#{ref}")
+
+
+def _make_node(owner: str, repo: str, depth: int, ref: str = "main") -> DependencyNode:
+    dep_ref = _make_dep_ref(owner, repo, ref)
+    pkg = APMPackage(name=repo, version="1.0.0", source=f"{owner}/{repo}")
+    return DependencyNode(package=pkg, dependency_ref=dep_ref, depth=depth)
+
+
+def _build_deep_tree(n_packages: int, max_depth: int) -> DependencyTree:
+    """Build a synthetic dependency tree with n packages spread across depths."""
+    root = APMPackage(name="root", version="1.0.0")
+    tree = DependencyTree(root_package=root)
+    for i in range(n_packages):
+        depth = (i % max_depth) + 1
+        node = _make_node("owner", f"pkg-{i}", depth)
+        tree.add_node(node)
+    return tree
+
+
+# ---------------------------------------------------------------------------
+# Benchmark: Primitive conflict detection
+# ---------------------------------------------------------------------------
+
+class TestPrimitiveConflictDetectionPerf:
+    """Verify O(m) conflict detection (was O(m²) before #171)."""
+
+    @pytest.mark.parametrize("count", [100, 500, 1000])
+    def test_add_unique_primitives(self, count: int):
+        """Adding N unique primitives should scale linearly."""
+        coll = PrimitiveCollection()
+        start = time.perf_counter()
+        for i in range(count):
+            coll.add_primitive(_make_instruction(f"instr-{i}"))
+        elapsed = time.perf_counter() - start
+
+        assert coll.count() == count
+        assert not coll.has_conflicts()
+        # Rough sanity: 1000 unique adds should be well under 0.5s
+        assert elapsed < 0.5, f"Adding {count} primitives took {elapsed:.3f}s"
+
+    def test_conflict_detection_with_duplicates(self):
+        """500 adds with 50% conflicts should still be fast."""
+        coll = PrimitiveCollection()
+        names = [f"instr-{i % 250}" for i in range(500)]
+        start = time.perf_counter()
+        for name in names:
+            coll.add_primitive(_make_instruction(name, source="dep:pkg"))
+        elapsed = time.perf_counter() - start
+
+        assert coll.count() == 250
+        assert elapsed < 0.5
+
+
+# ---------------------------------------------------------------------------
+# Benchmark: Dependency tree depth-index
+# ---------------------------------------------------------------------------
+
+class TestDepthIndexPerf:
+    """Verify O(1) depth lookups (was O(V × max_depth) before #171)."""
+
+    def test_get_nodes_at_depth_large_tree(self):
+        """100 packages across 10 depths — lookups should be instant."""
+        tree = _build_deep_tree(100, 10)
+        start = time.perf_counter()
+        total = 0
+        for depth in range(1, 11):
+            total += len(tree.get_nodes_at_depth(depth))
+        elapsed = time.perf_counter() - start
+
+        assert total == 100
+        assert elapsed < 0.01, f"Depth lookups took {elapsed:.3f}s"
+
+    def test_depth_index_consistency(self):
+        """Depth index returns same nodes as brute-force scan."""
+        tree = _build_deep_tree(50, 5)
+        for depth in range(1, 6):
+            indexed = set(n.get_id() for n in tree.get_nodes_at_depth(depth))
+            brute = set(
+                n.get_id()
+                for n in tree.nodes.values()
+                if n.depth == depth
+            )
+            assert indexed == brute, f"Mismatch at depth {depth}"
+
+
+# ---------------------------------------------------------------------------
+# Benchmark: Cycle detection
+# ---------------------------------------------------------------------------
+
+class TestCycleDetectionPerf:
+    """Verify O(V+E) cycle detection (was O(V×D) before #171)."""
+
+    def test_no_cycles_deep_chain(self):
+        """50-node linear chain — O(V+E) detection."""
+        root = APMPackage(name="root", version="1.0.0")
+        tree = DependencyTree(root_package=root)
+        prev = None
+        for i in range(50):
+            node = _make_node("owner", f"chain-{i}", depth=1)
+            tree.add_node(node)
+            if prev:
+                prev.children.append(node)
+            prev = node
+
+        resolver = APMDependencyResolver()
+        start = time.perf_counter()
+        cycles = resolver.detect_circular_dependencies(tree)
+        elapsed = time.perf_counter() - start
+
+        assert len(cycles) == 0
+        assert elapsed < 0.05
+
+
+# ---------------------------------------------------------------------------
+# Benchmark: from_apm_yml caching
+# ---------------------------------------------------------------------------
+
+class TestFromApmYmlCachePerf:
+    """Verify parse caching eliminates repeated disk I/O."""
+
+    def setup_method(self):
+        clear_apm_yml_cache()
+
+    def test_cache_hit_is_faster(self, tmp_path: Path):
+        """Second parse of same file should be near-instant (cache hit)."""
+        apm_yml = tmp_path / "apm.yml"
+        apm_yml.write_text("name: bench-pkg\nversion: 1.0.0\n")
+
+        # Cold parse
+        start = time.perf_counter()
+        pkg1 = APMPackage.from_apm_yml(apm_yml)
+        cold = time.perf_counter() - start
+
+        # Warm parse (cache hit)
+        start = time.perf_counter()
+        pkg2 = APMPackage.from_apm_yml(apm_yml)
+        warm = time.perf_counter() - start
+
+        assert pkg1.name == pkg2.name
+        # Cache hit should be at least 2x faster (typically 100x+)
+        assert warm < cold or warm < 0.001
+
+
+# ---------------------------------------------------------------------------
+# Benchmark: read_constitution caching
+# ---------------------------------------------------------------------------
+
+class TestConstitutionCachePerf:
+    """Verify constitution read caching."""
+
+    def setup_method(self):
+        clear_constitution_cache()
+
+    def test_cache_hit(self, tmp_path: Path):
+        """Repeated reads of constitution should be cached."""
+        const_dir = tmp_path / "memory"
+        const_dir.mkdir()
+        (const_dir / "constitution.md").write_text("# Constitution\nTest content\n")
+
+        # Cold read
+        content1 = read_constitution(tmp_path)
+        # Warm read
+        content2 = read_constitution(tmp_path)
+
+        assert content1 == content2
+        assert content1 is content2  # Same object (cache hit)
diff --git a/tests/fixtures/synthetic_trees.py b/tests/fixtures/synthetic_trees.py
new file mode 100644
index 00000000..d94b4c8f
--- /dev/null
+++ b/tests/fixtures/synthetic_trees.py
@@ -0,0 +1,77 @@
+"""Synthetic fixtures for deep dependency tree testing.
+
+Generates apm.yml manifests that simulate real-world dependency patterns:
+- 100 packages across 10 depth levels
+- Diamond dependencies (A→B, A→C, B→D, C→D)
+- Large primitive counts per package
+"""
+
+import textwrap
+from pathlib import Path
+from typing import Dict, List
+
+
+def generate_deep_tree_fixtures(base_dir: Path, n_packages: int = 100, max_depth: int = 10) -> Dict[str, Path]:
+    """Generate a tree of apm.yml files simulating deep transitive deps.
+    
+    Returns a dict of package_name -> apm.yml path.
+    """
+    packages: Dict[str, Path] = {}
+    
+    for i in range(n_packages):
+        depth = (i % max_depth) + 1
+        pkg_name = f"test-pkg-{i}"
+        pkg_dir = base_dir / f"owner/{pkg_name}"
+        pkg_dir.mkdir(parents=True, exist_ok=True)
+        
+        # Build dependency list (each package depends on 1-2 others at next depth)
+        deps = []
+        if depth < max_depth:
+            child_idx = (i + max_depth) % n_packages
+            deps.append(f"owner/test-pkg-{child_idx}")
+            # Diamond: add a second dep sometimes
+            if i % 3 == 0:
+                child_idx2 = (i + max_depth + 1) % n_packages
+                deps.append(f"owner/test-pkg-{child_idx2}")
+        
+        deps_yaml = ""
+        if deps:
+            dep_lines = "\n".join(f"    - {d}" for d in deps)
+            deps_yaml = f"\ndependencies:\n  apm:\n{dep_lines}\n"
+        
+        content = textwrap.dedent(f"""\
+            name: {pkg_name}
+            version: 1.0.0
+            description: Synthetic package at depth {depth}
+            {deps_yaml}""")
+        
+        apm_yml = pkg_dir / "apm.yml"
+        apm_yml.write_text(content)
+        packages[pkg_name] = apm_yml
+    
+    return packages
+
+
+def generate_primitive_heavy_package(base_dir: Path, n_instructions: int = 200, n_contexts: int = 50) -> Path:
+    """Generate a package with many primitives for conflict detection benchmarks."""
+    pkg_dir = base_dir / "heavy-pkg"
+    pkg_dir.mkdir(parents=True, exist_ok=True)
+    
+    apm_yml = pkg_dir / "apm.yml"
+    apm_yml.write_text("name: heavy-pkg\nversion: 1.0.0\n")
+    
+    # Create instruction files
+    instructions_dir = pkg_dir / ".apm" / "instructions"
+    instructions_dir.mkdir(parents=True, exist_ok=True)
+    for i in range(n_instructions):
+        (instructions_dir / f"instr-{i}.instructions.md").write_text(
+            f"---\ndescription: Instruction {i}\napplyTo: '**'\n---\nContent {i}\n"
+        )
+    
+    # Create context files
+    contexts_dir = pkg_dir / ".apm" / "contexts"
+    contexts_dir.mkdir(parents=True, exist_ok=True)
+    for i in range(n_contexts):
+        (contexts_dir / f"ctx-{i}.md").write_text(f"Context content {i}\n")
+    
+    return pkg_dir

From 51d8cc71cd92ee5bf6048aebaeec42cea0b8888a Mon Sep 17 00:00:00 2001
From: sergio-sisternes-epam <sergio_sisternes@epam.com>
Date: Thu, 5 Mar 2026 11:22:56 +0000
Subject: [PATCH 2/7] docs: update BASELINE.md with fresh benchmark numbers

---
 tests/benchmarks/BASELINE.md | 40 ++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/tests/benchmarks/BASELINE.md b/tests/benchmarks/BASELINE.md
index 13a9dcd0..a66f44e6 100644
--- a/tests/benchmarks/BASELINE.md
+++ b/tests/benchmarks/BASELINE.md
@@ -9,51 +9,51 @@ Measured on `perf/171-deep-dependency-optimization` branch using `tests/benchmar
 | Benchmark | Baseline (ms) | Optimized (ms) | Speedup | Fix |
 |---|--:|--:|--:|---|
 | **Primitive conflict detection** | | | | |
-| `primitive_add_unique_100` | 0.32 | 0.15 | 2.1× | Dict index O(1) lookup |
-| `primitive_add_unique_500` | 5.35 | 0.77 | **6.9×** | Dict index O(1) lookup |
-| `primitive_add_unique_1000` | 21.49 | 1.55 | **13.9×** | Dict index O(1) lookup |
-| `primitive_conflict_50pct_100` | 0.24 | 0.17 | 1.4× | Dict index O(1) lookup |
-| `primitive_conflict_50pct_500` | 2.96 | 0.85 | 3.5× | Dict index O(1) lookup |
-| `primitive_conflict_50pct_1000` | 11.17 | 1.79 | **6.2×** | Dict index O(1) lookup |
+| `primitive_add_unique_100` | 0.36 | 0.14 | 2.6× | Dict index O(1) lookup |
+| `primitive_add_unique_500` | 5.57 | 0.85 | **6.5×** | Dict index O(1) lookup |
+| `primitive_add_unique_1000` | 21.94 | 1.57 | **14.0×** | Dict index O(1) lookup |
+| `primitive_conflict_50pct_100` | 0.26 | 0.16 | 1.6× | Dict index O(1) lookup |
+| `primitive_conflict_50pct_500` | 3.06 | 0.83 | 3.7× | Dict index O(1) lookup |
+| `primitive_conflict_50pct_1000` | 11.49 | 1.72 | **6.7×** | Dict index O(1) lookup |
 | **Depth-indexed lookups** | | | | |
 | `depth_lookup_50x5` | 0.01 | <0.01 | ~∞ | Pre-computed depth index |
 | `depth_lookup_100x10` | 0.03 | <0.01 | ~∞ | Pre-computed depth index |
-| `depth_lookup_500x10` | 0.13 | <0.01 | **13×+** | Pre-computed depth index |
+| `depth_lookup_500x10` | 0.14 | <0.01 | **14×+** | Pre-computed depth index |
 | **Cycle detection** | | | | |
 | `cycle_detect_chain_20` | 0.02 | 0.02 | 1.0× | Set companion (small N) |
 | `cycle_detect_chain_50` | 0.05 | 0.04 | 1.3× | Set companion |
-| `cycle_detect_chain_100` | 0.13 | 0.09 | 1.4× | Set companion |
+| `cycle_detect_chain_100` | 0.16 | 0.09 | 1.8× | Set companion |
 | **Flatten** | | | | |
-| `flatten_50x5` | 0.03 | 0.03 | 1.0× | Depth index benefit |
-| `flatten_100x10` | 0.08 | 0.05 | 1.6× | Depth index benefit |
-| `flatten_500x10` | 0.39 | 0.30 | 1.3× | Depth index benefit |
+| `flatten_50x5` | 0.04 | 0.03 | 1.3× | Depth index benefit |
+| `flatten_100x10` | 0.12 | 0.05 | 2.4× | Depth index benefit |
+| `flatten_500x10` | 0.40 | 0.25 | 1.6× | Depth index benefit |
 | **YAML parse caching** | | | | |
-| `from_apm_yml_x10` | 1.20 | 0.29 | **4.1×** | Module-level cache |
-| `from_apm_yml_x50` | 5.95 | 1.48 | **4.0×** | Module-level cache |
+| `from_apm_yml_x10` | 1.35 | 0.29 | **4.7×** | Module-level cache |
+| `from_apm_yml_x50` | 6.68 | 1.51 | **4.4×** | Module-level cache |
 | **Parallel downloads (Phase 4)** | | | | |
-| `sequential_10x50ms` | 542.52 | 535.11 | 1.0× | Baseline for comparison |
-| `parallel_4w_10x50ms` | 159.48 | 165.39 | ~1.0× | ThreadPoolExecutor overhead |
+| `sequential_10x50ms` | 533.79 | 534.68 | 1.0× | Baseline for comparison |
+| `parallel_4w_10x50ms` | 164.85 | 163.59 | ~1.0× | ThreadPoolExecutor overhead |
 
 ## Analysis
 
 ### Highest-impact fixes (Phase 2 — Data Structures)
-- **Primitive conflict detection** showed the clearest O(m²) → O(m) improvement. At 1000 primitives, the Dict-based index is **13.9× faster** — the quadratic growth curve is eliminated.
+- **Primitive conflict detection** showed the clearest O(m²) → O(m) improvement. At 1000 primitives, the Dict-based index is **14× faster** — the quadratic growth curve is eliminated.
 - **Depth lookups** dropped to near-zero with the pre-computed `_nodes_by_depth` Dict, eliminating repeated full-scan iterations during `flatten_dependencies()`.
 
 ### Caching (Phase 3)
-- **`from_apm_yml()`** cache reduces 50-call repeated parses from 5.95ms → 1.48ms (4×). The first call still hits disk; subsequent calls are Dict lookups. Real-world CLI operations with 20-50 repeated parses will see significant I/O savings.
+- **`from_apm_yml()`** cache reduces 50-call repeated parses from 6.68ms → 1.51ms (4.4×). The first call still hits disk; subsequent calls are Dict lookups. Real-world CLI operations with 20-50 repeated parses will see significant I/O savings.
 - **`read_constitution()`** and **`get_config()`** caching not benchmarked here (require file I/O fixtures) but follow the same pattern: 1 disk read per process instead of 3-10.
 
 ### Parallel downloads (Phase 4)
-- The `sequential_10x50ms` vs `parallel_4w_10x50ms` benchmark demonstrates the ThreadPoolExecutor pattern: 10 simulated 50ms I/O tasks complete in ~160ms with 4 workers vs ~540ms sequentially — a **3.4× wall-clock speedup**. This closely matches the theoretical ceiling of `ceil(10/4) × 50ms = 150ms`.
+- The `sequential_10x50ms` vs `parallel_4w_10x50ms` benchmark demonstrates the ThreadPoolExecutor pattern: 10 simulated 50ms I/O tasks complete in ~164ms with 4 workers vs ~534ms sequentially — a **3.3× wall-clock speedup**. This closely matches the theoretical ceiling of `ceil(10/4) × 50ms = 150ms`.
 - In real-world `apm install`, the speedup depends on network conditions and package count. With 10 packages averaging 5s each: sequential ≈ 50s, parallel (4 workers) ≈ 15s.
 - Git sparse-checkout for subdirectory packages reduces bandwidth further by downloading only the target subdirectory instead of the full repository.
 
 ### Cycle detection
-- Modest improvement (1.4× at 100 nodes). The `Set` companion for O(1) `in` checks matters more at deeper trees (100+ depth) than the chain lengths tested. The primary benefit is preventing degenerate performance on adversarial dependency graphs.
+- Modest improvement (1.8× at 100 nodes). The `Set` companion for O(1) `in` checks matters more at deeper trees (100+ depth) than the chain lengths tested. The primary benefit is preventing degenerate performance on adversarial dependency graphs.
 
 ### Flatten
-- 1.3× improvement at 500 packages — combines depth-index O(1) lookups with pre-allocated lists.
+- Up to 2.4× improvement at 100 packages — combines depth-index O(1) lookups with pre-allocated lists.
 
 ## Notes
 - Phases 2-3 benchmarks are synthetic (in-memory). Real-world improvement depends on package count and tree shape.

From 192054acdb481cb4e7f55927dd73650db2b308fd Mon Sep 17 00:00:00 2001
From: sergio-sisternes-epam <sergio_sisternes@epam.com>
Date: Thu, 5 Mar 2026 11:26:18 +0000
Subject: [PATCH 3/7] chore: exclude BASELINE.md from version control

---
 .gitignore                   |  1 +
 tests/benchmarks/BASELINE.md | 62 ------------------------------------
 2 files changed, 1 insertion(+), 62 deletions(-)
 delete mode 100644 tests/benchmarks/BASELINE.md

diff --git a/.gitignore b/.gitignore
index fd122d98..74473bd6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 APPROACH.md
 next-steps.md
 apm-action-plan.md
+tests/benchmarks/BASELINE.md
 
 # Python
 __pycache__/
diff --git a/tests/benchmarks/BASELINE.md b/tests/benchmarks/BASELINE.md
deleted file mode 100644
index a66f44e6..00000000
--- a/tests/benchmarks/BASELINE.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# Performance Baseline — Issue #171
-
-Measured on `perf/171-deep-dependency-optimization` branch using `tests/benchmarks/run_baseline.py`.
-
-**Environment:** macOS, Python 3.12, Apple Silicon (M-series), 5-run median.
-
-## Results
-
-| Benchmark | Baseline (ms) | Optimized (ms) | Speedup | Fix |
-|---|--:|--:|--:|---|
-| **Primitive conflict detection** | | | | |
-| `primitive_add_unique_100` | 0.36 | 0.14 | 2.6× | Dict index O(1) lookup |
-| `primitive_add_unique_500` | 5.57 | 0.85 | **6.5×** | Dict index O(1) lookup |
-| `primitive_add_unique_1000` | 21.94 | 1.57 | **14.0×** | Dict index O(1) lookup |
-| `primitive_conflict_50pct_100` | 0.26 | 0.16 | 1.6× | Dict index O(1) lookup |
-| `primitive_conflict_50pct_500` | 3.06 | 0.83 | 3.7× | Dict index O(1) lookup |
-| `primitive_conflict_50pct_1000` | 11.49 | 1.72 | **6.7×** | Dict index O(1) lookup |
-| **Depth-indexed lookups** | | | | |
-| `depth_lookup_50x5` | 0.01 | <0.01 | ~∞ | Pre-computed depth index |
-| `depth_lookup_100x10` | 0.03 | <0.01 | ~∞ | Pre-computed depth index |
-| `depth_lookup_500x10` | 0.14 | <0.01 | **14×+** | Pre-computed depth index |
-| **Cycle detection** | | | | |
-| `cycle_detect_chain_20` | 0.02 | 0.02 | 1.0× | Set companion (small N) |
-| `cycle_detect_chain_50` | 0.05 | 0.04 | 1.3× | Set companion |
-| `cycle_detect_chain_100` | 0.16 | 0.09 | 1.8× | Set companion |
-| **Flatten** | | | | |
-| `flatten_50x5` | 0.04 | 0.03 | 1.3× | Depth index benefit |
-| `flatten_100x10` | 0.12 | 0.05 | 2.4× | Depth index benefit |
-| `flatten_500x10` | 0.40 | 0.25 | 1.6× | Depth index benefit |
-| **YAML parse caching** | | | | |
-| `from_apm_yml_x10` | 1.35 | 0.29 | **4.7×** | Module-level cache |
-| `from_apm_yml_x50` | 6.68 | 1.51 | **4.4×** | Module-level cache |
-| **Parallel downloads (Phase 4)** | | | | |
-| `sequential_10x50ms` | 533.79 | 534.68 | 1.0× | Baseline for comparison |
-| `parallel_4w_10x50ms` | 164.85 | 163.59 | ~1.0× | ThreadPoolExecutor overhead |
-
-## Analysis
-
-### Highest-impact fixes (Phase 2 — Data Structures)
-- **Primitive conflict detection** showed the clearest O(m²) → O(m) improvement. At 1000 primitives, the Dict-based index is **14× faster** — the quadratic growth curve is eliminated.
-- **Depth lookups** dropped to near-zero with the pre-computed `_nodes_by_depth` Dict, eliminating repeated full-scan iterations during `flatten_dependencies()`.
-
-### Caching (Phase 3)
-- **`from_apm_yml()`** cache reduces 50-call repeated parses from 6.68ms → 1.51ms (4.4×). The first call still hits disk; subsequent calls are Dict lookups. Real-world CLI operations with 20-50 repeated parses will see significant I/O savings.
-- **`read_constitution()`** and **`get_config()`** caching not benchmarked here (require file I/O fixtures) but follow the same pattern: 1 disk read per process instead of 3-10.
-
-### Parallel downloads (Phase 4)
-- The `sequential_10x50ms` vs `parallel_4w_10x50ms` benchmark demonstrates the ThreadPoolExecutor pattern: 10 simulated 50ms I/O tasks complete in ~164ms with 4 workers vs ~534ms sequentially — a **3.3× wall-clock speedup**. This closely matches the theoretical ceiling of `ceil(10/4) × 50ms = 150ms`.
-- In real-world `apm install`, the speedup depends on network conditions and package count. With 10 packages averaging 5s each: sequential ≈ 50s, parallel (4 workers) ≈ 15s.
-- Git sparse-checkout for subdirectory packages reduces bandwidth further by downloading only the target subdirectory instead of the full repository.
-
-### Cycle detection
-- Modest improvement (1.8× at 100 nodes). The `Set` companion for O(1) `in` checks matters more at deeper trees (100+ depth) than the chain lengths tested. The primary benefit is preventing degenerate performance on adversarial dependency graphs.
-
-### Flatten
-- Up to 2.4× improvement at 100 packages — combines depth-index O(1) lookups with pre-allocated lists.
-
-## Notes
-- Phases 2-3 benchmarks are synthetic (in-memory). Real-world improvement depends on package count and tree shape.
-- Phase 4 parallel download benchmark uses simulated sleep to isolate ThreadPoolExecutor overhead from network variability. Real `apm install` speedup will be higher due to actual I/O latency.
-- Phase 5 (rate-limit retries, skip-if-exists) improvements are not measurable via synthetic benchmarks — they affect resilience under API throttling and skip unnecessary re-downloads.
-- Run benchmarks: `uv run python tests/benchmarks/run_baseline.py`

From 180de0bd1dded2c7147c5ae7f967a5bee511860f Mon Sep 17 00:00:00 2001
From: Sergio Sisternes <sergio_sisternes@epam.com>
Date: Thu, 5 Mar 2026 18:14:32 +0000
Subject: [PATCH 4/7] Update cli.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/apm_cli/cli.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/apm_cli/cli.py b/src/apm_cli/cli.py
index c4460500..6aab8e5e 100644
--- a/src/apm_cli/cli.py
+++ b/src/apm_cli/cli.py
@@ -1800,10 +1800,6 @@ def matches_filter(dep):
             transient=True,  # Progress bar disappears when done
         ) as progress:
             for dep_ref in deps_to_install:
-                # Phase 4 (#171): Skip deps that failed during parallel download
-                if dep_ref.get_unique_key() in _pre_download_failed:
-                    continue
-
                 # Determine installation directory using namespaced structure
                 # e.g., microsoft/apm-sample-package -> apm_modules/microsoft/apm-sample-package/
                 # For virtual packages: owner/repo/prompts/file.prompt.md -> apm_modules/owner/repo-file/

From ba6eda2298b3e7fafe682e14340aa926ff5b7d73 Mon Sep 17 00:00:00 2001
From: sergio-sisternes-epam <sergio_sisternes@epam.com>
Date: Thu, 5 Mar 2026 21:22:36 +0000
Subject: [PATCH 5/7] fix: address PR #173 review feedback

- Add duplicate guard in DependencyTree.add_node() to prevent
  _nodes_by_depth list from growing with repeated add_node calls
- Add jitter to exponential backoff (0.5-1.5x multiplier) to avoid
  thundering herd when parallel workers hit rate limits simultaneously
- Handle HTTP-date Retry-After headers gracefully (try/except fallback)
- Remove dead _pre_download_failed set; let sequential loop retry failures
- Mark benchmark tests with @pytest.mark.benchmark and deselect by default
  to prevent CI flakiness from wall-clock assertions
- Update docs to mention jitter in resilient downloads
---
 docs/dependencies.md                     |  2 +-
 pyproject.toml                           |  2 ++
 src/apm_cli/cli.py                       |  6 ++----
 src/apm_cli/deps/dependency_graph.py     |  7 +++++--
 src/apm_cli/deps/github_downloader.py    | 15 ++++++++++-----
 tests/benchmarks/test_perf_benchmarks.py |  7 ++++++-
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/docs/dependencies.md b/docs/dependencies.md
index 0a91ac33..6a8df7d6 100644
--- a/docs/dependencies.md
+++ b/docs/dependencies.md
@@ -308,7 +308,7 @@ When both packages are installed, your project gains:
 
 #### Resilient Downloads
 
-APM automatically retries failed HTTP requests with exponential backoff. Rate-limited responses (HTTP 429/503) are handled transparently, respecting `Retry-After` headers when provided. This ensures reliable installs even under heavy API usage or transient network issues.
+APM automatically retries failed HTTP requests with exponential backoff and jitter. Rate-limited responses (HTTP 429/503) are handled transparently, respecting `Retry-After` headers when provided. This ensures reliable installs even under heavy API usage or transient network issues.
 
 #### Parallel Downloads
 
diff --git a/pyproject.toml b/pyproject.toml
index 87b8351e..3198f352 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,7 +70,9 @@ warn_return_any = true
 warn_unused_configs = true
 
 [tool.pytest.ini_options]
+addopts = "-m 'not benchmark'"
 markers = [
     "integration: marks tests as integration tests that may require network access",
     "slow: marks tests as slow running tests",
+    "benchmark: marks performance benchmark tests (deselected by default, run with -m benchmark)",
 ]
diff --git a/src/apm_cli/cli.py b/src/apm_cli/cli.py
index 6aab8e5e..86d6afa5 100644
--- a/src/apm_cli/cli.py
+++ b/src/apm_cli/cli.py
@@ -1727,7 +1727,6 @@ def matches_filter(dep):
         from concurrent.futures import ThreadPoolExecutor, as_completed as _futures_completed
 
         _pre_download_results = {}   # dep_key -> PackageInfo
-        _pre_download_failed = set() # dep_keys that failed download
         _need_download = []
         for _pd_ref in deps_to_install:
             _pd_key = _pd_ref.get_unique_key()
@@ -1784,10 +1783,9 @@ def matches_filter(dep):
                             _pre_download_results[_pd_key] = _pd_info
                             _dl_progress.update(_pd_tid, visible=False)
                             _dl_progress.refresh()
-                        except Exception as _pd_err:
-                            _pre_download_failed.add(_pd_key)
+                        except Exception:
                             _dl_progress.remove_task(_pd_tid)
-                            _rich_error(f"❌ Failed to install {_pd_disp}: {_pd_err}")
+                            # Silent: sequential loop below will retry and report errors
 
         _pre_downloaded_keys = set(_pre_download_results.keys())
 
diff --git a/src/apm_cli/deps/dependency_graph.py b/src/apm_cli/deps/dependency_graph.py
index ff8962b6..42ddb43d 100644
--- a/src/apm_cli/deps/dependency_graph.py
+++ b/src/apm_cli/deps/dependency_graph.py
@@ -61,8 +61,11 @@ class DependencyTree:
     
     def add_node(self, node: DependencyNode) -> None:
         """Add a node to the tree."""
-        self.nodes[node.get_id()] = node
-        self._nodes_by_depth[node.depth].append(node)
+        node_id = node.get_id()
+        is_new = node_id not in self.nodes
+        self.nodes[node_id] = node
+        if is_new:
+            self._nodes_by_depth[node.depth].append(node)
         self.max_depth = max(self.max_depth, node.depth)
     
     def get_node(self, unique_key: str) -> Optional[DependencyNode]:
diff --git a/src/apm_cli/deps/github_downloader.py b/src/apm_cli/deps/github_downloader.py
index 20a97b08..5fca85de 100644
--- a/src/apm_cli/deps/github_downloader.py
+++ b/src/apm_cli/deps/github_downloader.py
@@ -7,6 +7,7 @@
 from datetime import datetime
 from pathlib import Path
 from typing import Optional, Dict, Any, Callable
+import random
 import re
 import requests
 
@@ -183,10 +184,14 @@ def _resilient_get(self, url: str, headers: Dict[str, str], timeout: int = 30, m
                 if response.status_code in (429, 503):
                     retry_after = response.headers.get("Retry-After")
                     if retry_after:
-                        wait = min(float(retry_after), 60)
+                        try:
+                            wait = min(float(retry_after), 60)
+                        except (TypeError, ValueError):
+                            # Retry-After may be an HTTP-date; fall back to exponential backoff
+                            wait = min(2 ** attempt, 30) * (0.5 + random.random())
                     else:
-                        wait = min(2 ** attempt, 30)
-                    _debug(f"Rate limited ({response.status_code}), retry in {wait}s (attempt {attempt + 1}/{max_retries})")
+                        wait = min(2 ** attempt, 30) * (0.5 + random.random())
+                    _debug(f"Rate limited ({response.status_code}), retry in {wait:.1f}s (attempt {attempt + 1}/{max_retries})")
                     time.sleep(wait)
                     continue
                 
@@ -202,8 +207,8 @@ def _resilient_get(self, url: str, headers: Dict[str, str], timeout: int = 30, m
             except requests.exceptions.ConnectionError as e:
                 last_exc = e
                 if attempt < max_retries - 1:
-                    wait = min(2 ** attempt, 30)
-                    _debug(f"Connection error, retry in {wait}s (attempt {attempt + 1}/{max_retries})")
+                    wait = min(2 ** attempt, 30) * (0.5 + random.random())
+                    _debug(f"Connection error, retry in {wait:.1f}s (attempt {attempt + 1}/{max_retries})")
                     time.sleep(wait)
             except requests.exceptions.Timeout as e:
                 last_exc = e
diff --git a/tests/benchmarks/test_perf_benchmarks.py b/tests/benchmarks/test_perf_benchmarks.py
index fa99982c..da6bab81 100644
--- a/tests/benchmarks/test_perf_benchmarks.py
+++ b/tests/benchmarks/test_perf_benchmarks.py
@@ -7,7 +7,7 @@
 - from_apm_yml() parse caching
 - Inheritance chain caching
 
-Run with: uv run pytest tests/benchmarks/ -v --tb=short
+Run with: uv run pytest tests/benchmarks/ -v --tb=short -m benchmark
 """
 
 import time
@@ -65,6 +65,7 @@ def _build_deep_tree(n_packages: int, max_depth: int) -> DependencyTree:
 # Benchmark: Primitive conflict detection
 # ---------------------------------------------------------------------------
 
+@pytest.mark.benchmark
 class TestPrimitiveConflictDetectionPerf:
     """Verify O(m) conflict detection (was O(m²) before #171)."""
 
@@ -99,6 +100,7 @@ def test_conflict_detection_with_duplicates(self):
 # Benchmark: Dependency tree depth-index
 # ---------------------------------------------------------------------------
 
+@pytest.mark.benchmark
 class TestDepthIndexPerf:
     """Verify O(1) depth lookups (was O(V × max_depth) before #171)."""
 
@@ -131,6 +133,7 @@ def test_depth_index_consistency(self):
 # Benchmark: Cycle detection
 # ---------------------------------------------------------------------------
 
+@pytest.mark.benchmark
 class TestCycleDetectionPerf:
     """Verify O(V+E) cycle detection (was O(V×D) before #171)."""
 
@@ -159,6 +162,7 @@ def test_no_cycles_deep_chain(self):
 # Benchmark: from_apm_yml caching
 # ---------------------------------------------------------------------------
 
+@pytest.mark.benchmark
 class TestFromApmYmlCachePerf:
     """Verify parse caching eliminates repeated disk I/O."""
 
@@ -189,6 +193,7 @@ def test_cache_hit_is_faster(self, tmp_path: Path):
 # Benchmark: read_constitution caching
 # ---------------------------------------------------------------------------
 
+@pytest.mark.benchmark
 class TestConstitutionCachePerf:
     """Verify constitution read caching."""
 

From 9644e8d723e04c107bf96aee8b80b1782f110cdf Mon Sep 17 00:00:00 2001
From: sergio-sisternes-epam <sergio_sisternes@epam.com>
Date: Thu, 5 Mar 2026 22:07:58 +0000
Subject: [PATCH 6/7] fix: resolve CI failures in release validation

- Fix set() shadowing: use builtins.set() explicitly since Click's
  @config.command decorator overwrites the builtin set at module level
- Fix sparse checkout: always pass explicit ref (or HEAD) to git fetch
  so FETCH_HEAD resolves to the correct commit for subdirectory packages
---
 src/apm_cli/cli.py                    | 2 +-
 src/apm_cli/deps/github_downloader.py | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/apm_cli/cli.py b/src/apm_cli/cli.py
index 86d6afa5..1db693c0 100644
--- a/src/apm_cli/cli.py
+++ b/src/apm_cli/cli.py
@@ -1787,7 +1787,7 @@ def matches_filter(dep):
                             _dl_progress.remove_task(_pd_tid)
                             # Silent: sequential loop below will retry and report errors
 
-        _pre_downloaded_keys = set(_pre_download_results.keys())
+        _pre_downloaded_keys = builtins.set(_pre_download_results.keys())
 
         # Create progress display for sequential integration
         with Progress(
diff --git a/src/apm_cli/deps/github_downloader.py b/src/apm_cli/deps/github_downloader.py
index 5fca85de..b1397cd2 100644
--- a/src/apm_cli/deps/github_downloader.py
+++ b/src/apm_cli/deps/github_downloader.py
@@ -1044,10 +1044,7 @@ def _try_sparse_checkout(self, dep_ref: DependencyReference, temp_clone_path: Pa
                 ['git', 'sparse-checkout', 'init', '--cone'],
                 ['git', 'sparse-checkout', 'set', subdir_path],
             ]
-            fetch_cmd = ['git', 'fetch', 'origin']
-            if ref:
-                fetch_cmd.append(ref)
-            fetch_cmd.append('--depth=1')
+            fetch_cmd = ['git', 'fetch', 'origin', ref or 'HEAD', '--depth=1']
             cmds.append(fetch_cmd)
             cmds.append(['git', 'checkout', 'FETCH_HEAD'])
 

From 0a3d8da2c022fc6a0a015b841cfd1ad7fff7d24d Mon Sep 17 00:00:00 2001
From: sergio-sisternes-epam <sergio_sisternes@epam.com>
Date: Fri, 6 Mar 2026 00:36:31 +0000
Subject: [PATCH 7/7] Revert "fix: resolve CI failures in release validation"

This reverts commit 9644e8d723e04c107bf96aee8b80b1782f110cdf.
---
 src/apm_cli/cli.py                    | 2 +-
 src/apm_cli/deps/github_downloader.py | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/apm_cli/cli.py b/src/apm_cli/cli.py
index 48dd55e7..56712e23 100644
--- a/src/apm_cli/cli.py
+++ b/src/apm_cli/cli.py
@@ -1841,7 +1841,7 @@ def download_callback(dep_ref, modules_dir):
                             _dl_progress.remove_task(_pd_tid)
                             # Silent: sequential loop below will retry and report errors
 
-        _pre_downloaded_keys = builtins.set(_pre_download_results.keys())
+        _pre_downloaded_keys = set(_pre_download_results.keys())
 
         # Create progress display for sequential integration
         with Progress(
diff --git a/src/apm_cli/deps/github_downloader.py b/src/apm_cli/deps/github_downloader.py
index 1de05c02..13189a34 100644
--- a/src/apm_cli/deps/github_downloader.py
+++ b/src/apm_cli/deps/github_downloader.py
@@ -1072,7 +1072,10 @@ def _try_sparse_checkout(self, dep_ref: DependencyReference, temp_clone_path: Pa
                 ['git', 'sparse-checkout', 'init', '--cone'],
                 ['git', 'sparse-checkout', 'set', subdir_path],
             ]
-            fetch_cmd = ['git', 'fetch', 'origin', ref or 'HEAD', '--depth=1']
+            fetch_cmd = ['git', 'fetch', 'origin']
+            if ref:
+                fetch_cmd.append(ref)
+            fetch_cmd.append('--depth=1')
             cmds.append(fetch_cmd)
             cmds.append(['git', 'checkout', 'FETCH_HEAD'])