feat: Implement Smart Retry Logic with Exponential Backoff for Installations #658
@@ -9,6 +9,7 @@
 from enum import Enum
 from typing import Any
 
+from cortex.utils.retry import SmartRetry
 from cortex.validators import DANGEROUS_PATTERNS
 
 logger = logging.getLogger(__name__)
@@ -60,13 +61,15 @@ def __init__(
         enable_rollback: bool = False,
         log_file: str | None = None,
         progress_callback: Callable[[int, int, InstallationStep], None] | None = None,
+        max_retries: int = 5,
     ):
         """Initialize an installation run with optional logging and rollback."""
         self.timeout = timeout
         self.stop_on_error = stop_on_error
         self.enable_rollback = enable_rollback
         self.log_file = log_file
         self.progress_callback = progress_callback
+        self.max_retries = max_retries
 
         if descriptions and len(descriptions) != len(commands):
             raise ValueError("Number of descriptions must match number of commands")
@@ -90,6 +93,7 @@ def from_plan(
         enable_rollback: bool | None = None,
         log_file: str | None = None,
         progress_callback: Callable[[int, int, InstallationStep], None] | None = None,
+        max_retries: int = 5,
     ) -> "InstallationCoordinator":
         """Create a coordinator from a structured plan produced by an LLM.
 
@@ -124,6 +128,7 @@ def from_plan(
             ),
             log_file=log_file,
             progress_callback=progress_callback,
+            max_retries=max_retries,
         )
 
         for rollback_cmd in rollback_commands:
@@ -174,14 +179,27 @@ def _execute_command(self, step: InstallationStep) -> bool:
             self._log(f"Command blocked: {step.command} - {error}")
             return False
 
-        try:
+        def run_cmd() -> subprocess.CompletedProcess[str]:
             # Use shell=True carefully - commands are validated first
             # For complex shell commands (pipes, redirects), shell=True is needed
             # Simple commands could use shlex.split() with shell=False
-            result = subprocess.run(
+            return subprocess.run(
                 step.command, shell=True, capture_output=True, text=True, timeout=self.timeout
             )
 
+        def status_callback(msg: str) -> None:
+            self._log(msg)
+            # Also print to stdout so the user sees the retry happening
+            print(msg)
Comment on lines +190 to +193
Collaborator
If progress_callback is also set, user may see duplicate output. Consider checking if a callback exists before printing.
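A minimal sketch of that guard (hypothetical, not part of the diff; it reuses the self.progress_callback attribute set in __init__ above):

        def status_callback(msg: str) -> None:
            self._log(msg)
            # Only fall back to printing when no progress callback is registered,
            # so retry messages are not surfaced to the user twice.
            if self.progress_callback is None:
                print(msg)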
+
+        retry_handler = SmartRetry(
+            max_retries=self.max_retries,
+            status_callback=status_callback,
+        )
+
+        try:
+            result = retry_handler.run(run_cmd)
+
             step.return_code = result.returncode
             step.output = result.stdout
             step.error = result.stderr
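For context, a hypothetical call site for the new parameter (the plan argument and the value 3 are illustrative, not taken from the PR):

    coordinator = InstallationCoordinator.from_plan(
        plan,
        max_retries=3,  # forwarded to SmartRetry when each command is executed
    )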
@@ -0,0 +1,126 @@
import logging
import time
from collections.abc import Callable
from typing import Any

from cortex.error_parser import ErrorCategory, ErrorParser
Collaborator
ErrorParser class is unused, remove that.
logger = logging.getLogger(__name__)


class SmartRetry:
    """
    Implements smart retry logic with exponential backoff.
    Uses ErrorParser to distinguish between transient and permanent errors.
    """

    def __init__(
        self,
        max_retries: int = 5,
        backoff_factor: float = 1.0,
        status_callback: Callable[[str], None] | None = None,
    ):
        if not isinstance(max_retries, int) or max_retries < 0:
            raise ValueError("max_retries must be a non-negative integer")
        if not isinstance(backoff_factor, (int, float)) or backoff_factor < 0:
            raise ValueError("backoff_factor must be a non-negative number")

Collaborator
Should reject backoff_factor=0 as well since it defeats the purpose of backoff.
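A possible tightening along those lines (a sketch only, not part of the diff):

        if not isinstance(backoff_factor, (int, float)) or backoff_factor <= 0:
            raise ValueError("backoff_factor must be a positive number")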
        self.max_retries = max_retries
        self.backoff_factor = float(backoff_factor)
        self.status_callback = status_callback
        self.error_parser = ErrorParser()

    def run(self, func: Callable[[], Any]) -> Any:
        """
        Run a function with smart retry logic.

        Args:
            func: The function to execute. Expected to return a result object
                that has `returncode`, `stdout`, and `stderr` attributes
                (like subprocess.CompletedProcess), or raise an exception.

        Returns:
            The result of the function call.
        """
        attempt = 0
        last_exception = None
        last_result = None

        while attempt <= self.max_retries:
            try:
                result = func()
                last_result = result

                # If result indicates success (returncode 0), return immediately
                if hasattr(result, "returncode") and result.returncode == 0:
                    return result

                # If result indicates failure, analyze it
                error_msg = ""
                if hasattr(result, "stderr") and result.stderr:
                    error_msg = result.stderr
                elif hasattr(result, "stdout") and result.stdout:
                    error_msg = result.stdout
Comment on lines +60 to +63
Collaborator
Stdout may contain success output, not error info. This could cause false retry classification. So fix it.
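One way to address that (a sketch, assuming this branch is only reached when returncode != 0): classify on stderr only, since stdout may hold ordinary progress output even when the command fails.

                # Sketch: rely on stderr for classification; an empty string then
                # falls through to the conservative "no error message" handling below.
                error_msg = (getattr(result, "stderr", "") or "").strip()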

                if not self._should_retry(error_msg):
                    return result

            except Exception as e:
                last_exception = e
                if not self._should_retry(str(e)):
                    raise

            # If we are here, we need to retry (unless max retries reached)
            if attempt == self.max_retries:
                break

            attempt += 1
            sleep_time = self.backoff_factor * (2 ** (attempt - 1))

            msg = f"⚠️ Transient error detected. Retrying in {sleep_time}s... (Attempt {attempt}/{self.max_retries})"
Collaborator
On first retry, attempt=1, so message shows "Attempt 1/5" but it's actually the 2nd execution. Consider clarifying: "Retry 1/5" instead.
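A sketch of the suggested wording (illustrative only):

            msg = (
                f"⚠️ Transient error detected. Retrying in {sleep_time}s... "
                f"(Retry {attempt}/{self.max_retries})"
            )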

            logger.warning(msg)
            if self.status_callback:
                self.status_callback(msg)

            time.sleep(sleep_time)

        if last_exception:
            raise last_exception
        return last_result

    def _should_retry(self, error_message: str) -> bool:
        """
        Determine if we should retry based on the error message.
        """
        if not error_message:
            # If no error message, assume it's a generic failure that might be transient
            return True
Comment on lines +95 to +97
Collaborator
This is risky. A command could fail with returncode != 0 and empty stderr/stdout for permanent reasons. Consider logging a warning here.
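A minimal sketch of that warning (hypothetical wording):

        if not error_message:
            # Sketch: make the blind retry visible instead of silently assuming transience.
            logger.warning("Command failed with no error output; retrying without classification.")
            return True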

        analysis = self.error_parser.parse_error(error_message)
        category = analysis.primary_category

        # If the error is explicitly marked as not fixable, fail fast
        if not analysis.is_fixable:
            return False

        # Retry on network errors, lock errors, or unknown errors (conservative)
        if category in [
            ErrorCategory.NETWORK_ERROR,
            ErrorCategory.LOCK_ERROR,
            ErrorCategory.UNKNOWN,
        ]:
            return True

        # Fail fast on permanent errors
        if category in [
            ErrorCategory.PERMISSION_DENIED,
            ErrorCategory.PACKAGE_NOT_FOUND,
            ErrorCategory.CONFIGURATION_ERROR,
            ErrorCategory.DEPENDENCY_MISSING,
            ErrorCategory.CONFLICT,
            ErrorCategory.DISK_SPACE,
        ]:
            return False

        # Default to retry for safety if not explicitly categorized as permanent
        return True
Collaborator
Do not use magic numbers; add a constant instead, something like DEFAULT_MAX_RETRIES = 5.
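A sketch of how that might look in the retry module (only the name DEFAULT_MAX_RETRIES comes from the comment above; the placement is an assumption):

DEFAULT_MAX_RETRIES = 5


class SmartRetry:
    def __init__(
        self,
        max_retries: int = DEFAULT_MAX_RETRIES,
        backoff_factor: float = 1.0,
        status_callback: Callable[[str], None] | None = None,
    ):
        ...

The same constant could back the max_retries: int = 5 defaults added to InstallationCoordinator in this PR, so the value lives in one place.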