From 8f7bcc13687a05d3796bb5515a6170975f66f315 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 12:20:17 +0530 Subject: [PATCH 01/20] Implement retry policy and enhance errored state handling - Added a retry policy model to manage state retries with configurable methods (fixed, linear, exponential). - Updated the errored state function to create a retry state if the maximum retries have not been reached, improving error recovery. - Enhanced the ErroredResponseModel to include a flag indicating whether a retry state was created. - Modified the GraphTemplate and State models to incorporate retry policy attributes, ensuring better state management. - Improved validation and error handling in the upsert_graph_template function to accommodate the new retry policy structure. --- state-manager/app/controller/errored_state.py | 39 ++++++++++++++++++- .../app/controller/upsert_graph_template.py | 7 +++- .../app/models/db/graph_template_model.py | 3 +- state-manager/app/models/db/state.py | 2 + state-manager/app/models/errored_models.py | 3 +- state-manager/app/models/graph_models.py | 3 ++ .../app/models/retry_policy_model.py | 12 ++++++ 7 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 state-manager/app/models/retry_policy_model.py diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py index b59e2573..889c01ad 100644 --- a/state-manager/app/controller/errored_state.py +++ b/state-manager/app/controller/errored_state.py @@ -5,9 +5,22 @@ from app.models.db.state import State from app.models.state_status_enum import StateStatusEnum from app.singletons.logs_manager import LogsManager +from app.models.retry_policy_model import RetryPolicyModel, RetryMethod +from app.models.db.graph_template_model import GraphTemplate logger = LogsManager().get_logger() +def _calculate_enqueue_after(retry_policy: RetryPolicyModel, retry_count: int) -> int: + # convert seconds to milliseconds + if retry_policy.method == RetryMethod.FIXED: + return (retry_policy.backoff_factor * 1000) + elif retry_policy.method == RetryMethod.LINEAR: + return (retry_policy.backoff_factor * retry_count) * 1000 + elif retry_policy.method == RetryMethod.EXPONENTIAL: + return (retry_policy.backoff_factor ** retry_count) * 1000 + else: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid retry method") + async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: ErroredRequestModel, x_exosphere_request_id: str) -> ErroredResponseModel: try: @@ -23,11 +36,35 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E if state.status == StateStatusEnum.EXECUTED: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="State is already executed") + graph_template = await GraphTemplate.get(namespace_name, state.graph_name) + + retry_created = False + + if state.retry_count < graph_template.retry_policy.max_retries: + retry_state = State( + node_name=state.node_name, + namespace_name=state.namespace_name, + identifier=state.identifier, + graph_name=state.graph_name, + run_id=state.run_id, + status=StateStatusEnum.CREATED, + inputs=state.inputs, + outputs=state.outputs, + error=body.error, + parents=state.parents, + does_unites=state.does_unites, + enqueue_after=state.enqueue_after + _calculate_enqueue_after(graph_template.retry_policy, state.retry_count + 1), + retry_count=state.retry_count + 1 + ) + retry_state = await retry_state.insert() + logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id) + retry_created = True + state.status = StateStatusEnum.ERRORED state.error = body.error await state.save() - return ErroredResponseModel(status=StateStatusEnum.ERRORED) + return ErroredResponseModel(status=StateStatusEnum.ERRORED, retry_created=retry_created) except Exception as e: logger.error(f"Error errored state {state_id} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id, error=e) diff --git a/state-manager/app/controller/upsert_graph_template.py b/state-manager/app/controller/upsert_graph_template.py index 99a178ae..16882018 100644 --- a/state-manager/app/controller/upsert_graph_template.py +++ b/state-manager/app/controller/upsert_graph_template.py @@ -27,7 +27,8 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse Set({ GraphTemplate.nodes: body.nodes, # type: ignore GraphTemplate.validation_status: GraphTemplateValidationStatus.PENDING, # type: ignore - GraphTemplate.validation_errors: [] # type: ignore + GraphTemplate.validation_errors: [], # type: ignore + GraphTemplate.retry_policy: body.retry_policy # type: ignore }) ) @@ -44,7 +45,8 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse namespace=namespace_name, nodes=body.nodes, validation_status=GraphTemplateValidationStatus.PENDING, - validation_errors=[] + validation_errors=[], + retry_policy=body.retry_policy ).set_secrets(body.secrets) ) except ValueError as e: @@ -58,6 +60,7 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse validation_status=graph_template.validation_status, validation_errors=graph_template.validation_errors, secrets={secret_name: True for secret_name in graph_template.get_secrets().keys()}, + retry_policy=graph_template.retry_policy, created_at=graph_template.created_at, updated_at=graph_template.updated_at ) diff --git a/state-manager/app/models/db/graph_template_model.py b/state-manager/app/models/db/graph_template_model.py index 999d5389..4b3a4731 100644 --- a/state-manager/app/models/db/graph_template_model.py +++ b/state-manager/app/models/db/graph_template_model.py @@ -11,7 +11,7 @@ from ..node_template_model import NodeTemplate from app.utils.encrypter import get_encrypter from app.models.dependent_string import DependentString - +from app.models.retry_policy_model import RetryPolicyModel class GraphTemplate(BaseDatabaseModel): name: str = Field(..., description="Name of the graph") @@ -20,6 +20,7 @@ class GraphTemplate(BaseDatabaseModel): validation_status: GraphTemplateValidationStatus = Field(..., description="Validation status of the graph") validation_errors: List[str] = Field(default_factory=list, description="Validation errors of the graph") secrets: Dict[str, str] = Field(default_factory=dict, description="Secrets of the graph") + retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph") _node_by_identifier: Dict[str, NodeTemplate] | None = PrivateAttr(default=None) _parents_by_identifier: Dict[str, set[str]] | None = PrivateAttr(default=None) # type: ignore diff --git a/state-manager/app/models/db/state.py b/state-manager/app/models/db/state.py index e37326b4..28efc613 100644 --- a/state-manager/app/models/db/state.py +++ b/state-manager/app/models/db/state.py @@ -24,6 +24,7 @@ class State(BaseDatabaseModel): does_unites: bool = Field(default=False, description="Whether this state unites other states") state_fingerprint: str = Field(default="", description="Fingerprint of the state") enqueue_after: int = Field(default_factory=lambda: int(time.time() * 1000), gt=0, description="Unix time in milliseconds after which the state should be enqueued") + retry_count: int = Field(default=0, description="Number of times the state has been retried") @before_event([Insert, Replace, Save]) def _generate_fingerprint(self): @@ -37,6 +38,7 @@ def _generate_fingerprint(self): "identifier": self.identifier, "graph_name": self.graph_name, "run_id": self.run_id, + "retry_count": self.retry_count, "parents": {k: str(v) for k, v in self.parents.items()}, } payload = json.dumps( diff --git a/state-manager/app/models/errored_models.py b/state-manager/app/models/errored_models.py index 5acfaa34..8814d56a 100644 --- a/state-manager/app/models/errored_models.py +++ b/state-manager/app/models/errored_models.py @@ -7,4 +7,5 @@ class ErroredRequestModel(BaseModel): class ErroredResponseModel(BaseModel): - status: StateStatusEnum = Field(..., description="Status of the state") \ No newline at end of file + status: StateStatusEnum = Field(..., description="Status of the state") + retry_created: bool = Field(default=False, description="Whether a retry state was created") \ No newline at end of file diff --git a/state-manager/app/models/graph_models.py b/state-manager/app/models/graph_models.py index 1fd80bd1..8e67cb2d 100644 --- a/state-manager/app/models/graph_models.py +++ b/state-manager/app/models/graph_models.py @@ -3,16 +3,19 @@ from typing import Dict, List, Optional from datetime import datetime from .graph_template_validation_status import GraphTemplateValidationStatus +from .retry_policy_model import RetryPolicyModel class UpsertGraphTemplateRequest(BaseModel): secrets: Dict[str, str] = Field(..., description="Dictionary of secrets that are used while graph execution") nodes: List[NodeTemplate] = Field(..., description="List of node templates that define the graph structure") + retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph") class UpsertGraphTemplateResponse(BaseModel): nodes: List[NodeTemplate] = Field(..., description="List of node templates that define the graph structure") secrets: Dict[str, bool] = Field(..., description="Dictionary of secrets that are used while graph execution") + retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph") created_at: datetime = Field(..., description="Timestamp when the graph template was created") updated_at: datetime = Field(..., description="Timestamp when the graph template was last updated") validation_status: GraphTemplateValidationStatus = Field(..., description="Current validation status of the graph template") diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py new file mode 100644 index 00000000..2b9d4923 --- /dev/null +++ b/state-manager/app/models/retry_policy_model.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, Field +from enum import Enum + +class RetryMethod(str, Enum): + EXPONENTIAL = "EXPONENTIAL" + LINEAR = "LINEAR" + FIXED = "FIXED" + +class RetryPolicyModel(BaseModel): + max_retries: int = Field(default=3, description="The maximum number of retries", ge=0) + method: RetryMethod = Field(default=RetryMethod.EXPONENTIAL, description="The method of retry") + backoff_factor: int = Field(default=2, description="The backoff factor in seconds (default: 2 = 2 seconds)", gt=0) \ No newline at end of file From f6a3bb2f06fb6a2a81eaae009219835f43d25296 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 13:12:43 +0530 Subject: [PATCH 02/20] Refactor retry policy and errored state handling - Introduced a new RetryStrategy enum with additional strategies for retrying operations, enhancing flexibility in retry mechanisms. - Updated the RetryPolicyModel to include a compute_delay method for calculating delays based on the selected strategy. - Refactored the errored_state function to utilize the new retry policy structure, improving error handling and state management. - Removed the previous _calculate_enqueue_after function, streamlining the code and enhancing clarity. --- state-manager/app/controller/errored_state.py | 20 ++----- .../app/models/retry_policy_model.py | 53 +++++++++++++++++-- 2 files changed, 55 insertions(+), 18 deletions(-) diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py index 889c01ad..8ceb522d 100644 --- a/state-manager/app/controller/errored_state.py +++ b/state-manager/app/controller/errored_state.py @@ -1,3 +1,5 @@ +import time + from app.models.errored_models import ErroredRequestModel, ErroredResponseModel from fastapi import HTTPException, status from beanie import PydanticObjectId @@ -5,22 +7,10 @@ from app.models.db.state import State from app.models.state_status_enum import StateStatusEnum from app.singletons.logs_manager import LogsManager -from app.models.retry_policy_model import RetryPolicyModel, RetryMethod from app.models.db.graph_template_model import GraphTemplate logger = LogsManager().get_logger() -def _calculate_enqueue_after(retry_policy: RetryPolicyModel, retry_count: int) -> int: - # convert seconds to milliseconds - if retry_policy.method == RetryMethod.FIXED: - return (retry_policy.backoff_factor * 1000) - elif retry_policy.method == RetryMethod.LINEAR: - return (retry_policy.backoff_factor * retry_count) * 1000 - elif retry_policy.method == RetryMethod.EXPONENTIAL: - return (retry_policy.backoff_factor ** retry_count) * 1000 - else: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid retry method") - async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: ErroredRequestModel, x_exosphere_request_id: str) -> ErroredResponseModel: try: @@ -49,11 +39,11 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E run_id=state.run_id, status=StateStatusEnum.CREATED, inputs=state.inputs, - outputs=state.outputs, - error=body.error, + outputs={}, + error=None, parents=state.parents, does_unites=state.does_unites, - enqueue_after=state.enqueue_after + _calculate_enqueue_after(graph_template.retry_policy, state.retry_count + 1), + enqueue_after= int(time.time() * 1000) + graph_template.retry_policy.compute_delay(state.retry_count + 1), retry_count=state.retry_count + 1 ) retry_state = await retry_state.insert() diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py index 2b9d4923..070b4528 100644 --- a/state-manager/app/models/retry_policy_model.py +++ b/state-manager/app/models/retry_policy_model.py @@ -1,12 +1,59 @@ from pydantic import BaseModel, Field from enum import Enum +import random -class RetryMethod(str, Enum): +class RetryStrategy(str, Enum): EXPONENTIAL = "EXPONENTIAL" + EXPONENTIAL_FULL_JITTER = "EXPONENTIAL_FULL_JITTER" + EXPONENTIAL_EQUAL_JITTER = "EXPONENTIAL_EQUAL_JITTER" + LINEAR = "LINEAR" + LINEAR_FULL_JITTER = "LINEAR_FULL_JITTER" + LINEAR_EQUAL_JITTER = "LINEAR_EQUAL_JITTER" + FIXED = "FIXED" + FIXED_FULL_JITTER = "FIXED_FULL_JITTER" + FIXED_EQUAL_JITTER = "FIXED_EQUAL_JITTER" class RetryPolicyModel(BaseModel): max_retries: int = Field(default=3, description="The maximum number of retries", ge=0) - method: RetryMethod = Field(default=RetryMethod.EXPONENTIAL, description="The method of retry") - backoff_factor: int = Field(default=2, description="The backoff factor in seconds (default: 2 = 2 seconds)", gt=0) \ No newline at end of file + strategy: RetryStrategy = Field(default=RetryStrategy.EXPONENTIAL, description="The method of retry") + backoff_factor: int = Field(default=2000, description="The backoff factor in milliseconds (default: 2000 = 2 seconds)", gt=0) + exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0) + + def compute_delay(self, retry_count: int) -> int: + if self.strategy == RetryStrategy.EXPONENTIAL: + return (self.backoff_factor * (self.exponent ** retry_count)) + + elif self.strategy == RetryStrategy.EXPONENTIAL_FULL_JITTER: + base = self.backoff_factor * (self.exponent ** retry_count) + return int(random.uniform(0, base)) + + elif self.strategy == RetryStrategy.EXPONENTIAL_EQUAL_JITTER: + base = self.backoff_factor * (self.exponent ** retry_count) + return int(base/2 + random.uniform(0, base / 2)) + + elif self.strategy == RetryStrategy.LINEAR: + return (self.backoff_factor * retry_count) + + elif self.strategy == RetryStrategy.LINEAR_FULL_JITTER: + base = self.backoff_factor * retry_count + return int(random.uniform(0, base)) + + elif self.strategy == RetryStrategy.LINEAR_EQUAL_JITTER: + base = self.backoff_factor * retry_count + return int(base/2 + random.uniform(0, base / 2)) + + elif self.strategy == RetryStrategy.FIXED: + return self.backoff_factor + + elif self.strategy == RetryStrategy.FIXED_FULL_JITTER: + base = self.backoff_factor + return int(random.uniform(0, base)) + + elif self.strategy == RetryStrategy.FIXED_EQUAL_JITTER: + base = self.backoff_factor + return int(base/2 + random.uniform(0, base / 2)) + + else: + raise Exception("Invalid retry strategy") \ No newline at end of file From 66c794a3937ad2f9536d288ea136d757333a7bca Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 13:24:31 +0530 Subject: [PATCH 03/20] Add retry policy documentation and integrate into graph configuration - Introduced a new documentation file for the Retry Policy feature, detailing its configuration and usage within Exosphere. - Updated the `create-graph.md` file to include a section on retry policies, explaining their structure and providing examples. - Modified `mkdocs.yml` to include the new Retry Policy documentation in the navigation, enhancing accessibility for users. --- docs/docs/exosphere/create-graph.md | 25 ++- docs/docs/exosphere/retry-policy.md | 276 ++++++++++++++++++++++++++++ docs/mkdocs.yml | 2 + 3 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 docs/docs/exosphere/retry-policy.md diff --git a/docs/docs/exosphere/create-graph.md b/docs/docs/exosphere/create-graph.md index 11d20395..f5ceeaa1 100644 --- a/docs/docs/exosphere/create-graph.md +++ b/docs/docs/exosphere/create-graph.md @@ -51,7 +51,13 @@ One can define a graph on Exosphere through a simple json config, which specifie }, "next_nodes": [] } - ] + ], + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2 + } } ``` @@ -126,6 +132,23 @@ Use the `${{ ... }}` syntax to map outputs from previous nodes: - **`${{ node_identifier.outputs.field_name }}`**: Maps output from a specific node - **`initial`**: Static value provided when the graph is triggered - **Direct values**: String values. In v1, numbers/booleans must be string-encoded (e.g., "42", "true"). + +### Retry Policy + +Graphs can include a retry policy to handle transient failures automatically. The retry policy is configured at the graph level and applies to all nodes within the graph. + +```json +{ + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2 + } +} +``` + +For detailed information about retry policies, including all available strategies and configuration options, see the [Retry Policy](retry-policy.md) documentation. ## Creating Graph Templates The recommended way to create graph templates is using the Exosphere Python SDK, which provides a clean interface to the State Manager API. diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md new file mode 100644 index 00000000..1dbc6430 --- /dev/null +++ b/docs/docs/exosphere/retry-policy.md @@ -0,0 +1,276 @@ +# Retry Policy + +!!! beta "Beta Feature" + Retry Policy is currently available in beta. The API and functionality may change in future releases. + +The Retry Policy feature in Exosphere provides sophisticated retry mechanisms for handling transient failures in your workflow nodes. When a node execution fails, the retry policy automatically determines when and how to retry the execution based on configurable strategies. + +## Overview + +Retry policies are configured at the graph level and apply to all nodes within that graph. When a node fails with an error, the state manager automatically creates a retry state with a calculated delay before the next execution attempt. + +## Configuration + +Retry policies are defined in your graph template configuration: + +```json +{ + "secrets": { + "api_key": "your-api-key" + }, + "nodes": [ + { + "node_name": "MyNode", + "namespace": "MyProject", + "identifier": "my_node", + "inputs": { + "data": "initial" + }, + "next_nodes": [] + } + ], + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2 + } +} +``` + +## Parameters + +### max_retries +- **Type**: `int` +- **Default**: `3` +- **Description**: The maximum number of retry attempts before giving up +- **Constraints**: Must be >= 0 + +### strategy +- **Type**: `string` +- **Default**: `"EXPONENTIAL"` +- **Description**: The retry strategy to use for calculating delays +- **Options**: See [Retry Strategies](#retry-strategies) below + +### backoff_factor +- **Type**: `int` +- **Default**: `2000` (2 seconds) +- **Description**: The base delay factor in milliseconds +- **Constraints**: Must be > 0 + +### exponent +- **Type**: `int` +- **Default**: `2` +- **Description**: The exponent used for exponential strategies +- **Constraints**: Must be > 0 + +## Retry Strategies + +Exosphere supports three main categories of retry strategies, each with jitter variants to prevent thundering herd problems. + +### Exponential Strategies + +Exponential strategies increase the delay exponentially with each retry attempt. + +#### EXPONENTIAL +Standard exponential backoff without jitter. + +**Formula**: `backoff_factor * (exponent ^ retry_count)` + +**Example**: +- Retry 1: 2000ms (2 seconds) +- Retry 2: 4000ms (4 seconds) +- Retry 3: 8000ms (8 seconds) + +#### EXPONENTIAL_FULL_JITTER +Exponential backoff with full jitter (random delay between 0 and calculated delay). + +**Formula**: `random(0, backoff_factor * (exponent ^ retry_count))` + +**Example**: +- Retry 1: 0-2000ms (random) +- Retry 2: 0-4000ms (random) +- Retry 3: 0-8000ms (random) + +#### EXPONENTIAL_EQUAL_JITTER +Exponential backoff with equal jitter (random delay around half the calculated delay). + +**Formula**: `(backoff_factor * (exponent ^ retry_count)) / 2 + random(0, (backoff_factor * (exponent ^ retry_count)) / 2)` + +**Example**: +- Retry 1: 1000-2000ms (random) +- Retry 2: 2000-4000ms (random) +- Retry 3: 4000-8000ms (random) + +### Linear Strategies + +Linear strategies increase the delay linearly with each retry attempt. + +#### LINEAR +Standard linear backoff without jitter. + +**Formula**: `backoff_factor * retry_count` + +**Example**: +- Retry 1: 2000ms (2 seconds) +- Retry 2: 4000ms (4 seconds) +- Retry 3: 6000ms (6 seconds) + +#### LINEAR_FULL_JITTER +Linear backoff with full jitter. + +**Formula**: `random(0, backoff_factor * retry_count)` + +**Example**: +- Retry 1: 0-2000ms (random) +- Retry 2: 0-4000ms (random) +- Retry 3: 0-6000ms (random) + +#### LINEAR_EQUAL_JITTER +Linear backoff with equal jitter. + +**Formula**: `(backoff_factor * retry_count) / 2 + random(0, (backoff_factor * retry_count) / 2)` + +**Example**: +- Retry 1: 1000-2000ms (random) +- Retry 2: 2000-4000ms (random) +- Retry 3: 3000-6000ms (random) + +### Fixed Strategies + +Fixed strategies use a constant delay for all retry attempts. + +#### FIXED +Standard fixed delay without jitter. + +**Formula**: `backoff_factor` + +**Example**: +- Retry 1: 2000ms (2 seconds) +- Retry 2: 2000ms (2 seconds) +- Retry 3: 2000ms (2 seconds) + +#### FIXED_FULL_JITTER +Fixed delay with full jitter. + +**Formula**: `random(0, backoff_factor)` + +**Example**: +- Retry 1: 0-2000ms (random) +- Retry 2: 0-2000ms (random) +- Retry 3: 0-2000ms (random) + +#### FIXED_EQUAL_JITTER +Fixed delay with equal jitter. + +**Formula**: `backoff_factor / 2 + random(0, backoff_factor / 2)` + +**Example**: +- Retry 1: 1000-2000ms (random) +- Retry 2: 1000-2000ms (random) +- Retry 3: 1000-2000ms (random) + +## Usage Examples + +### Basic Exponential Retry +```json +{ + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 1000, + "exponent": 2 + } +} +``` + +### Aggressive Retry with Jitter +```json +{ + "retry_policy": { + "max_retries": 5, + "strategy": "EXPONENTIAL_FULL_JITTER", + "backoff_factor": 500, + "exponent": 3 + } +} +``` + +### Conservative Linear Retry +```json +{ + "retry_policy": { + "max_retries": 2, + "strategy": "LINEAR", + "backoff_factor": 5000 + } +} +``` + +### Fixed Retry for Rate Limiting +```json +{ + "retry_policy": { + "max_retries": 10, + "strategy": "FIXED_EQUAL_JITTER", + "backoff_factor": 1000 + } +} +``` + +## When Retries Are Triggered + +Retries are automatically triggered when: + +1. A node execution fails with an error +2. The current retry count is less than `max_retries` +3. The state status is `QUEUED` or `EXECUTED` + +The retry mechanism: +- Creates a new state with `retry_count` incremented by 1 +- Sets `enqueue_after` to the current time plus the calculated delay +- Sets the original state status to `ERRORED` with the error message + +## Best Practices + +### Choose the Right Strategy +- **EXPONENTIAL**: Best for most transient failures (network issues, temporary service unavailability) +- **LINEAR**: Good for predictable, consistent delays +- **FIXED**: Useful for rate limiting scenarios + +### Use Jitter for High Concurrency +- **FULL_JITTER**: Best for high concurrency to prevent thundering herd +- **EQUAL_JITTER**: Good balance between predictability and randomization +- **No Jitter**: Use only when you need deterministic behavior + +### Set Appropriate Limits +- **max_retries**: Consider the nature of your failures and downstream dependencies +- **backoff_factor**: Balance between responsiveness and resource usage +- **exponent**: Higher values create more aggressive backoff + +### Monitor Retry Patterns +- Track retry counts in your monitoring system +- Set up alerts for graphs with high retry rates +- Analyze retry patterns to identify systemic issues + +## Limitations + +- Retry policies apply to all nodes in a graph uniformly +- Individual node-level retry policies are not supported +- Retry delays are calculated in milliseconds +- Maximum delay is not capped (consider using reasonable `backoff_factor` and `exponent` values) + +## Error Handling + +If a retry policy configuration is invalid: +- The graph template validation will fail +- An error will be returned during graph creation +- The graph will not be saved until the configuration is corrected + +## Integration with Signals + +Retry policies work alongside Exosphere's signal system: + +- Nodes can still raise `PruneSignal` to stop retries immediately +- Nodes can raise `ReQueueAfterSignal` to re-queue after sometime, this will not mark nodes as failure. +- The retry count is preserved when using signals \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 51df18d8..5ebcf23f 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -101,6 +101,7 @@ plugins: - exosphere/register-node.md - exosphere/create-runtime.md - exosphere/create-graph.md + - exosphere/retry-policy.md - exosphere/trigger-graph.md - exosphere/dashboard.md - exosphere/signals.md @@ -129,6 +130,7 @@ nav: - Register Node: exosphere/register-node.md - Create Runtime: exosphere/create-runtime.md - Create Graph: exosphere/create-graph.md + - Retry Policy: exosphere/retry-policy.md - Trigger Graph: exosphere/trigger-graph.md - Dashboard: exosphere/dashboard.md - Signals: exosphere/signals.md From 728b225395ae04ebfb43a2a637943fc90a66eaf9 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 13:32:57 +0530 Subject: [PATCH 04/20] Enhance retry policy error handling and validation - Added validation to ensure the retry count is greater than 0 in the compute_delay method of RetryPolicyModel, raising a ValueError for invalid inputs. - Updated the compute_delay method to correctly calculate delays based on the retry count, adjusting the exponentiation logic. - Refined error handling in the GraphTemplate model by replacing ValueError with HTTPException for better integration with FastAPI, ensuring a 404 response when a graph template is not found. --- state-manager/app/models/db/graph_template_model.py | 4 +++- state-manager/app/models/retry_policy_model.py | 11 +++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/state-manager/app/models/db/graph_template_model.py b/state-manager/app/models/db/graph_template_model.py index 4b3a4731..ee0323d8 100644 --- a/state-manager/app/models/db/graph_template_model.py +++ b/state-manager/app/models/db/graph_template_model.py @@ -5,6 +5,8 @@ from pymongo import IndexModel from pydantic import Field, field_validator, PrivateAttr, model_validator from typing import List, Self, Dict +from fastapi.exceptions import HTTPException +from fastapi import status from .base import BaseDatabaseModel from ..graph_template_validation_status import GraphTemplateValidationStatus @@ -303,7 +305,7 @@ def get_path_by_identifier(self, identifier: str) -> set[str]: async def get(namespace: str, graph_name: str) -> "GraphTemplate": graph_template = await GraphTemplate.find_one(GraphTemplate.namespace == namespace, GraphTemplate.name == graph_name) if not graph_template: - raise ValueError(f"Graph template not found for namespace: {namespace} and graph name: {graph_name}") + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found") return graph_template @staticmethod diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py index 070b4528..5fe92409 100644 --- a/state-manager/app/models/retry_policy_model.py +++ b/state-manager/app/models/retry_policy_model.py @@ -22,15 +22,18 @@ class RetryPolicyModel(BaseModel): exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0) def compute_delay(self, retry_count: int) -> int: + if retry_count < 1: + raise ValueError(f"Retry count must be greater than 1, got {retry_count}") + if self.strategy == RetryStrategy.EXPONENTIAL: - return (self.backoff_factor * (self.exponent ** retry_count)) + return (self.backoff_factor * (self.exponent ** (retry_count - 1))) elif self.strategy == RetryStrategy.EXPONENTIAL_FULL_JITTER: - base = self.backoff_factor * (self.exponent ** retry_count) + base = self.backoff_factor * (self.exponent ** (retry_count - 1)) return int(random.uniform(0, base)) elif self.strategy == RetryStrategy.EXPONENTIAL_EQUAL_JITTER: - base = self.backoff_factor * (self.exponent ** retry_count) + base = self.backoff_factor * (self.exponent ** (retry_count - 1)) return int(base/2 + random.uniform(0, base / 2)) elif self.strategy == RetryStrategy.LINEAR: @@ -56,4 +59,4 @@ def compute_delay(self, retry_count: int) -> int: return int(base/2 + random.uniform(0, base / 2)) else: - raise Exception("Invalid retry strategy") \ No newline at end of file + raise ValueError(f"Invalid retry strategy: {self.strategy}") \ No newline at end of file From ba1378589e994df72b3c1073896e4e3034ae2d8f Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 13:40:54 +0530 Subject: [PATCH 05/20] Update retry policy documentation and examples - Clarified the backoff_factor parameter in the retry policy documentation to specify its unit as milliseconds. - Added retry policy examples in the create-graph.md file to demonstrate its usage in graph template creation and updates. - Ensured consistency in the retry policy structure across documentation, enhancing user understanding and implementation. --- docs/docs/exosphere/create-graph.md | 19 ++++++++-- docs/docs/exosphere/retry-policy.md | 58 +++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 10 deletions(-) diff --git a/docs/docs/exosphere/create-graph.md b/docs/docs/exosphere/create-graph.md index f5ceeaa1..178debad 100644 --- a/docs/docs/exosphere/create-graph.md +++ b/docs/docs/exosphere/create-graph.md @@ -142,13 +142,14 @@ Graphs can include a retry policy to handle transient failures automatically. Th "retry_policy": { "max_retries": 3, "strategy": "EXPONENTIAL", - "backoff_factor": 2000, + "backoff_factor": 2000, // milliseconds "exponent": 2 } } ``` For detailed information about retry policies, including all available strategies and configuration options, see the [Retry Policy](retry-policy.md) documentation. + ## Creating Graph Templates The recommended way to create graph templates is using the Exosphere Python SDK, which provides a clean interface to the State Manager API. @@ -179,7 +180,13 @@ async def create_graph_template(): result = await state_manager.upsert_graph( graph_name="my-workflow", graph_nodes=graph_nodes, - secrets=secrets + secrets=secrets, + retry_policy={ + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2 + } ) print("Graph template created successfully!") print(f"Validation status: {result['validation_status']}") @@ -291,7 +298,13 @@ The state manager validates your graph template: result = await state_manager.upsert_graph( graph_name="my-workflow", graph_nodes=updated_nodes, - secrets=updated_secrets + secrets=updated_secrets, + retry_policy={ + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2 + } ) print("Graph template updated successfully!") print(f"Validation status: {result['validation_status']}") diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md index 1dbc6430..a8079d87 100644 --- a/docs/docs/exosphere/retry-policy.md +++ b/docs/docs/exosphere/retry-policy.md @@ -41,24 +41,28 @@ Retry policies are defined in your graph template configuration: ## Parameters ### max_retries + - **Type**: `int` - **Default**: `3` - **Description**: The maximum number of retry attempts before giving up - **Constraints**: Must be >= 0 ### strategy + - **Type**: `string` - **Default**: `"EXPONENTIAL"` - **Description**: The retry strategy to use for calculating delays - **Options**: See [Retry Strategies](#retry-strategies) below ### backoff_factor -- **Type**: `int` + +- **Type**: `int` (milliseconds) - **Default**: `2000` (2 seconds) - **Description**: The base delay factor in milliseconds - **Constraints**: Must be > 0 ### exponent + - **Type**: `int` - **Default**: `2` - **Description**: The exponent used for exponential strategies @@ -73,31 +77,41 @@ Exosphere supports three main categories of retry strategies, each with jitter v Exponential strategies increase the delay exponentially with each retry attempt. #### EXPONENTIAL + Standard exponential backoff without jitter. -**Formula**: `backoff_factor * (exponent ^ retry_count)` +**Formula**: `backoff_factor * (exponent ^ (retry_count - 1))` **Example**: + - Retry 1: 2000ms (2 seconds) - Retry 2: 4000ms (4 seconds) - Retry 3: 8000ms (8 seconds) #### EXPONENTIAL_FULL_JITTER + Exponential backoff with full jitter (random delay between 0 and calculated delay). -**Formula**: `random(0, backoff_factor * (exponent ^ retry_count))` +**Formula**: `random(0, backoff_factor * (exponent ^ (retry_count - 1)))` + +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* **Example**: + - Retry 1: 0-2000ms (random) - Retry 2: 0-4000ms (random) - Retry 3: 0-8000ms (random) #### EXPONENTIAL_EQUAL_JITTER + Exponential backoff with equal jitter (random delay around half the calculated delay). -**Formula**: `(backoff_factor * (exponent ^ retry_count)) / 2 + random(0, (backoff_factor * (exponent ^ retry_count)) / 2)` +**Formula**: `(backoff_factor * (exponent ^ (retry_count - 1))) / 2 + random(0, (backoff_factor * (exponent ^ (retry_count - 1))) / 2)` + +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* **Example**: + - Retry 1: 1000-2000ms (random) - Retry 2: 2000-4000ms (random) - Retry 3: 4000-8000ms (random) @@ -107,31 +121,41 @@ Exponential backoff with equal jitter (random delay around half the calculated d Linear strategies increase the delay linearly with each retry attempt. #### LINEAR + Standard linear backoff without jitter. **Formula**: `backoff_factor * retry_count` **Example**: + - Retry 1: 2000ms (2 seconds) - Retry 2: 4000ms (4 seconds) - Retry 3: 6000ms (6 seconds) #### LINEAR_FULL_JITTER + Linear backoff with full jitter. **Formula**: `random(0, backoff_factor * retry_count)` +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + **Example**: + - Retry 1: 0-2000ms (random) - Retry 2: 0-4000ms (random) - Retry 3: 0-6000ms (random) #### LINEAR_EQUAL_JITTER + Linear backoff with equal jitter. **Formula**: `(backoff_factor * retry_count) / 2 + random(0, (backoff_factor * retry_count) / 2)` +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + **Example**: + - Retry 1: 1000-2000ms (random) - Retry 2: 2000-4000ms (random) - Retry 3: 3000-6000ms (random) @@ -141,31 +165,41 @@ Linear backoff with equal jitter. Fixed strategies use a constant delay for all retry attempts. #### FIXED + Standard fixed delay without jitter. **Formula**: `backoff_factor` **Example**: + - Retry 1: 2000ms (2 seconds) - Retry 2: 2000ms (2 seconds) - Retry 3: 2000ms (2 seconds) #### FIXED_FULL_JITTER + Fixed delay with full jitter. **Formula**: `random(0, backoff_factor)` +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + **Example**: + - Retry 1: 0-2000ms (random) - Retry 2: 0-2000ms (random) - Retry 3: 0-2000ms (random) #### FIXED_EQUAL_JITTER + Fixed delay with equal jitter. **Formula**: `backoff_factor / 2 + random(0, backoff_factor / 2)` +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + **Example**: + - Retry 1: 1000-2000ms (random) - Retry 2: 1000-2000ms (random) - Retry 3: 1000-2000ms (random) @@ -173,6 +207,7 @@ Fixed delay with equal jitter. ## Usage Examples ### Basic Exponential Retry + ```json { "retry_policy": { @@ -185,6 +220,7 @@ Fixed delay with equal jitter. ``` ### Aggressive Retry with Jitter + ```json { "retry_policy": { @@ -197,6 +233,7 @@ Fixed delay with equal jitter. ``` ### Conservative Linear Retry + ```json { "retry_policy": { @@ -208,6 +245,7 @@ Fixed delay with equal jitter. ``` ### Fixed Retry for Rate Limiting + ```json { "retry_policy": { @@ -227,6 +265,7 @@ Retries are automatically triggered when: 3. The state status is `QUEUED` or `EXECUTED` The retry mechanism: + - Creates a new state with `retry_count` incremented by 1 - Sets `enqueue_after` to the current time plus the calculated delay - Sets the original state status to `ERRORED` with the error message @@ -234,21 +273,25 @@ The retry mechanism: ## Best Practices ### Choose the Right Strategy + - **EXPONENTIAL**: Best for most transient failures (network issues, temporary service unavailability) - **LINEAR**: Good for predictable, consistent delays - **FIXED**: Useful for rate limiting scenarios ### Use Jitter for High Concurrency + - **FULL_JITTER**: Best for high concurrency to prevent thundering herd - **EQUAL_JITTER**: Good balance between predictability and randomization - **No Jitter**: Use only when you need deterministic behavior ### Set Appropriate Limits + - **max_retries**: Consider the nature of your failures and downstream dependencies - **backoff_factor**: Balance between responsiveness and resource usage - **exponent**: Higher values create more aggressive backoff ### Monitor Retry Patterns + - Track retry counts in your monitoring system - Set up alerts for graphs with high retry rates - Analyze retry patterns to identify systemic issues @@ -263,14 +306,15 @@ The retry mechanism: ## Error Handling If a retry policy configuration is invalid: + - The graph template validation will fail - An error will be returned during graph creation - The graph will not be saved until the configuration is corrected ## Integration with Signals -Retry policies work alongside Exosphere's signal system: +Retry policies work alongside Exosphere's signaling system: - Nodes can still raise `PruneSignal` to stop retries immediately -- Nodes can raise `ReQueueAfterSignal` to re-queue after sometime, this will not mark nodes as failure. -- The retry count is preserved when using signals \ No newline at end of file +- Nodes can raise `ReQueueAfterSignal` to re-queue after some time. This will not mark nodes as failures. +- The retry count is preserved when using signals. \ No newline at end of file From 6e02d5022023336e421dc67a31665a0e7b2f080c Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 13:51:21 +0530 Subject: [PATCH 06/20] Enhance retry policy implementation and documentation - Added the `max_delay` parameter to the retry policy model, allowing users to cap the maximum delay for retry attempts. - Updated the documentation to include detailed explanations of the `max_delay` parameter and its usage in retry strategies. - Improved error handling in the `errored_state` function to log errors when fetching graph templates and raise appropriate HTTP exceptions. - Refactored the `GraphTemplate` model to raise a ValueError instead of an HTTPException when a graph template is not found, enhancing error handling consistency. - Updated the `compute_delay` method in the `RetryPolicyModel` to apply the new delay capping logic across all retry strategies. --- docs/docs/exosphere/retry-policy.md | 79 ++++++++++++++++++- state-manager/app/controller/errored_state.py | 8 +- .../app/models/db/graph_template_model.py | 4 +- .../app/models/retry_policy_model.py | 25 +++--- 4 files changed, 101 insertions(+), 15 deletions(-) diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md index a8079d87..d1b0aef3 100644 --- a/docs/docs/exosphere/retry-policy.md +++ b/docs/docs/exosphere/retry-policy.md @@ -33,7 +33,8 @@ Retry policies are defined in your graph template configuration: "max_retries": 3, "strategy": "EXPONENTIAL", "backoff_factor": 2000, - "exponent": 2 + "exponent": 2, + "max_delay": 3600000 } } ``` @@ -68,6 +69,14 @@ Retry policies are defined in your graph template configuration: - **Description**: The exponent used for exponential strategies - **Constraints**: Must be > 0 +### max_delay + +- **Type**: `int | null` (milliseconds) +- **Default**: `null` (no maximum delay) +- **Description**: The maximum delay in milliseconds that any retry attempt can have. When set, all calculated delays are capped at this value using the `_cap` function +- **Constraints**: Must be > 0 when not null +- **Example**: `3600000` (1 hour) would cap all delays to a maximum of 1 hour + ## Retry Strategies Exosphere supports three main categories of retry strategies, each with jitter variants to prevent thundering herd problems. @@ -204,6 +213,43 @@ Fixed delay with equal jitter. - Retry 2: 1000-2000ms (random) - Retry 3: 1000-2000ms (random) +## Delay Capping + +The retry policy includes a built-in delay capping mechanism through the `_cap` function and `max_delay` parameter. This ensures that retry delays never exceed a specified maximum value, even with aggressive exponential backoff strategies. + +### How Delay Capping Works + +The `_cap` function is applied to all calculated delays: + +```python +def _cap(value: int) -> int: + if self.max_delay is not None: + return min(value, self.max_delay) + return value +``` + +**Behavior:** +- If `max_delay` is set, all calculated delays are capped at this value +- If `max_delay` is `null` (default), no capping is applied +- The capping is applied after all strategy calculations. + +### Example with Delay Capping + +Consider an exponential strategy with `backoff_factor: 2000`, `exponent: 2`, and `max_delay: 10000`: + +**With capping:** +- Retry 1: 2000ms +- Retry 2: 4000ms +- Retry 3: 8000ms +- Retry 4: 10000ms (capped at max_delay) + +### When to Use Delay Capping + +- **Long-running workflows**: Prevent excessive delays that could impact overall workflow completion time +- **User-facing applications**: Ensure retries don't create unacceptable wait times +- **Resource management**: Control resource consumption by limiting retry delays +- **Predictable behavior**: Create more predictable retry patterns for monitoring and alerting + ## Usage Examples ### Basic Exponential Retry @@ -256,6 +302,34 @@ Fixed delay with equal jitter. } ``` +### Exponential Retry with Delay Capping + +```json +{ + "retry_policy": { + "max_retries": 5, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2, + "max_delay": 30000 + } +} +``` + +### Conservative Retry with Maximum Delay + +```json +{ + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL_FULL_JITTER", + "backoff_factor": 1000, + "exponent": 3, + "max_delay": 60000 + } +} +``` + ## When Retries Are Triggered Retries are automatically triggered when: @@ -289,6 +363,7 @@ The retry mechanism: - **max_retries**: Consider the nature of your failures and downstream dependencies - **backoff_factor**: Balance between responsiveness and resource usage - **exponent**: Higher values create more aggressive backoff +- **max_delay**: Set a reasonable maximum delay to prevent excessive wait times, especially for exponential strategies ### Monitor Retry Patterns @@ -301,7 +376,7 @@ The retry mechanism: - Retry policies apply to all nodes in a graph uniformly - Individual node-level retry policies are not supported - Retry delays are calculated in milliseconds -- Maximum delay is not capped (consider using reasonable `backoff_factor` and `exponent` values) +- Maximum delay can be capped using the `max_delay` parameter (recommended for long-running workflows) ## Error Handling diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py index 8ceb522d..1fd56021 100644 --- a/state-manager/app/controller/errored_state.py +++ b/state-manager/app/controller/errored_state.py @@ -26,7 +26,13 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E if state.status == StateStatusEnum.EXECUTED: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="State is already executed") - graph_template = await GraphTemplate.get(namespace_name, state.graph_name) + try: + graph_template = await GraphTemplate.get(namespace_name, state.graph_name) + except Exception as e: + logger.error(f"Error getting graph template {state.graph_name} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id, error=e) + if "Graph template not found" in str(e): + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found") + raise e retry_created = False diff --git a/state-manager/app/models/db/graph_template_model.py b/state-manager/app/models/db/graph_template_model.py index ee0323d8..4b3a4731 100644 --- a/state-manager/app/models/db/graph_template_model.py +++ b/state-manager/app/models/db/graph_template_model.py @@ -5,8 +5,6 @@ from pymongo import IndexModel from pydantic import Field, field_validator, PrivateAttr, model_validator from typing import List, Self, Dict -from fastapi.exceptions import HTTPException -from fastapi import status from .base import BaseDatabaseModel from ..graph_template_validation_status import GraphTemplateValidationStatus @@ -305,7 +303,7 @@ def get_path_by_identifier(self, identifier: str) -> set[str]: async def get(namespace: str, graph_name: str) -> "GraphTemplate": graph_template = await GraphTemplate.find_one(GraphTemplate.namespace == namespace, GraphTemplate.name == graph_name) if not graph_template: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found") + raise ValueError(f"Graph template not found for namespace: {namespace} and graph name: {graph_name}") return graph_template @staticmethod diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py index 5fe92409..9c695bfe 100644 --- a/state-manager/app/models/retry_policy_model.py +++ b/state-manager/app/models/retry_policy_model.py @@ -20,43 +20,50 @@ class RetryPolicyModel(BaseModel): strategy: RetryStrategy = Field(default=RetryStrategy.EXPONENTIAL, description="The method of retry") backoff_factor: int = Field(default=2000, description="The backoff factor in milliseconds (default: 2000 = 2 seconds)", gt=0) exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0) + max_delay: int | None = Field(default=None, description="The maximum delay in milliseconds (default: 3600000 = 1 hour)", gt=0) def compute_delay(self, retry_count: int) -> int: + + def _cap(value: int) -> int: + if self.max_delay is not None: + return min(value, self.max_delay) + return value + if retry_count < 1: raise ValueError(f"Retry count must be greater than 1, got {retry_count}") if self.strategy == RetryStrategy.EXPONENTIAL: - return (self.backoff_factor * (self.exponent ** (retry_count - 1))) + return _cap(self.backoff_factor * (self.exponent ** (retry_count - 1))) elif self.strategy == RetryStrategy.EXPONENTIAL_FULL_JITTER: base = self.backoff_factor * (self.exponent ** (retry_count - 1)) - return int(random.uniform(0, base)) + return _cap(int(random.uniform(0, base))) elif self.strategy == RetryStrategy.EXPONENTIAL_EQUAL_JITTER: base = self.backoff_factor * (self.exponent ** (retry_count - 1)) - return int(base/2 + random.uniform(0, base / 2)) + return _cap(int(base/2 + random.uniform(0, base / 2))) elif self.strategy == RetryStrategy.LINEAR: - return (self.backoff_factor * retry_count) + return _cap(self.backoff_factor * retry_count) elif self.strategy == RetryStrategy.LINEAR_FULL_JITTER: base = self.backoff_factor * retry_count - return int(random.uniform(0, base)) + return _cap(int(random.uniform(0, base))) elif self.strategy == RetryStrategy.LINEAR_EQUAL_JITTER: base = self.backoff_factor * retry_count - return int(base/2 + random.uniform(0, base / 2)) + return _cap(int(base/2 + random.uniform(0, base / 2))) elif self.strategy == RetryStrategy.FIXED: - return self.backoff_factor + return _cap(self.backoff_factor) elif self.strategy == RetryStrategy.FIXED_FULL_JITTER: base = self.backoff_factor - return int(random.uniform(0, base)) + return _cap(int(random.uniform(0, base))) elif self.strategy == RetryStrategy.FIXED_EQUAL_JITTER: base = self.backoff_factor - return int(base/2 + random.uniform(0, base / 2)) + return _cap(int(base/2 + random.uniform(0, base / 2))) else: raise ValueError(f"Invalid retry strategy: {self.strategy}") \ No newline at end of file From 64592d8a4bc7e236dc06b5ba3af1956e2ca6a158 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 14:07:57 +0530 Subject: [PATCH 07/20] Enhance errored state handling with retry state management - Added error handling for duplicate retry states in the `errored_state` function, logging when a retry state already exists. - Introduced a new `fanout_id` field in the `State` model to support unique identification of retry states. - Updated the database index to enforce uniqueness on the combination of relevant state fields, improving data integrity and query performance. --- state-manager/app/controller/errored_state.py | 42 +++++++++++-------- state-manager/app/models/db/state.py | 17 +++++++- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py index 1fd56021..47f1db50 100644 --- a/state-manager/app/controller/errored_state.py +++ b/state-manager/app/controller/errored_state.py @@ -3,6 +3,7 @@ from app.models.errored_models import ErroredRequestModel, ErroredResponseModel from fastapi import HTTPException, status from beanie import PydanticObjectId +from pymongo.errors import DuplicateKeyError from app.models.db.state import State from app.models.state_status_enum import StateStatusEnum @@ -37,24 +38,29 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E retry_created = False if state.retry_count < graph_template.retry_policy.max_retries: - retry_state = State( - node_name=state.node_name, - namespace_name=state.namespace_name, - identifier=state.identifier, - graph_name=state.graph_name, - run_id=state.run_id, - status=StateStatusEnum.CREATED, - inputs=state.inputs, - outputs={}, - error=None, - parents=state.parents, - does_unites=state.does_unites, - enqueue_after= int(time.time() * 1000) + graph_template.retry_policy.compute_delay(state.retry_count + 1), - retry_count=state.retry_count + 1 - ) - retry_state = await retry_state.insert() - logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id) - retry_created = True + try: + retry_state = State( + node_name=state.node_name, + namespace_name=state.namespace_name, + identifier=state.identifier, + graph_name=state.graph_name, + run_id=state.run_id, + status=StateStatusEnum.CREATED, + inputs=state.inputs, + outputs={}, + error=None, + parents=state.parents, + does_unites=state.does_unites, + enqueue_after= int(time.time() * 1000) + graph_template.retry_policy.compute_delay(state.retry_count + 1), + retry_count=state.retry_count + 1, + fanout_id=state.fanout_id + ) + retry_state = await retry_state.insert() + logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id) + retry_created = True + except DuplicateKeyError: + logger.info(f"Retry state {retry_state.id} already exists for state {state_id}", x_exosphere_request_id=x_exosphere_request_id) + retry_created = True state.status = StateStatusEnum.ERRORED state.error = body.error diff --git a/state-manager/app/models/db/state.py b/state-manager/app/models/db/state.py index b883ec65..0747d408 100644 --- a/state-manager/app/models/db/state.py +++ b/state-manager/app/models/db/state.py @@ -8,6 +8,7 @@ import hashlib import json import time +import uuid class State(BaseDatabaseModel): node_name: str = Field(..., description="Name of the node of the state") @@ -25,7 +26,8 @@ class State(BaseDatabaseModel): state_fingerprint: str = Field(default="", description="Fingerprint of the state") enqueue_after: int = Field(default_factory=lambda: int(time.time() * 1000), gt=0, description="Unix time in milliseconds after which the state should be enqueued") retry_count: int = Field(default=0, description="Number of times the state has been retried") - + fanout_id: str = Field(default=str(uuid.uuid4()), description="Fanout ID of the state") + @before_event([Insert, Replace, Save]) def _generate_fingerprint(self): if not self.does_unites: @@ -78,5 +80,18 @@ class Settings: ("node_name", 1), ], name="enqueue_query" + ), + IndexModel( + [ + ("node_name", 1), + ("namespace_name", 1), + ("graph_name", 1), + ("identifier", 1), + ("run_id", 1), + ("retry_count", 1), + ("fanout_id", 1), + ], + unique=True, + name="uniq_fanout_retry" ) ] \ No newline at end of file From 33e75d3fd200f79e81e54e8d366281011888f7cc Mon Sep 17 00:00:00 2001 From: Nivedit Jain Date: Sun, 31 Aug 2025 14:11:11 +0530 Subject: [PATCH 08/20] Update state-manager/app/models/db/state.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- state-manager/app/models/db/state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/state-manager/app/models/db/state.py b/state-manager/app/models/db/state.py index 0747d408..05441ec3 100644 --- a/state-manager/app/models/db/state.py +++ b/state-manager/app/models/db/state.py @@ -26,7 +26,7 @@ class State(BaseDatabaseModel): state_fingerprint: str = Field(default="", description="Fingerprint of the state") enqueue_after: int = Field(default_factory=lambda: int(time.time() * 1000), gt=0, description="Unix time in milliseconds after which the state should be enqueued") retry_count: int = Field(default=0, description="Number of times the state has been retried") - fanout_id: str = Field(default=str(uuid.uuid4()), description="Fanout ID of the state") + fanout_id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Fanout ID of the state") @before_event([Insert, Replace, Save]) def _generate_fingerprint(self): From 0c16bd75eb5655cb2e3c1fb81b253957659ef9ca Mon Sep 17 00:00:00 2001 From: Nivedit Jain Date: Sun, 31 Aug 2025 14:11:25 +0530 Subject: [PATCH 09/20] Update docs/docs/exosphere/retry-policy.md Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/docs/exosphere/retry-policy.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md index d1b0aef3..8089502b 100644 --- a/docs/docs/exosphere/retry-policy.md +++ b/docs/docs/exosphere/retry-policy.md @@ -336,7 +336,7 @@ Retries are automatically triggered when: 1. A node execution fails with an error 2. The current retry count is less than `max_retries` -3. The state status is `QUEUED` or `EXECUTED` +3. The state status is `QUEUED` The retry mechanism: From b0cabb02a96f3bd29e66f5e88bfdbbb023de87f6 Mon Sep 17 00:00:00 2001 From: Nivedit Jain Date: Sun, 31 Aug 2025 14:11:49 +0530 Subject: [PATCH 10/20] Update docs/docs/exosphere/retry-policy.md Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/docs/exosphere/retry-policy.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md index 8089502b..87257bcd 100644 --- a/docs/docs/exosphere/retry-policy.md +++ b/docs/docs/exosphere/retry-policy.md @@ -392,4 +392,4 @@ Retry policies work alongside Exosphere's signaling system: - Nodes can still raise `PruneSignal` to stop retries immediately - Nodes can raise `ReQueueAfterSignal` to re-queue after some time. This will not mark nodes as failures. -- The retry count is preserved when using signals. \ No newline at end of file +- When a node is re-queued using `ReQueueAfterSignal`, the `retry_count` is not incremented. The existing count is carried over to the new state. \ No newline at end of file From 74da1b8c8f342a53f57db3c31c43f07cd004905f Mon Sep 17 00:00:00 2001 From: Nivedit Jain Date: Sun, 31 Aug 2025 14:12:03 +0530 Subject: [PATCH 11/20] Update state-manager/app/controller/errored_state.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- state-manager/app/controller/errored_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py index 47f1db50..2faab448 100644 --- a/state-manager/app/controller/errored_state.py +++ b/state-manager/app/controller/errored_state.py @@ -31,7 +31,7 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E graph_template = await GraphTemplate.get(namespace_name, state.graph_name) except Exception as e: logger.error(f"Error getting graph template {state.graph_name} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id, error=e) - if "Graph template not found" in str(e): + if isinstance(e, ValueError) and "Graph template not found" in str(e): raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found") raise e From 06ee0a3642ada9954e7f8b77e573a73aeb7afeb7 Mon Sep 17 00:00:00 2001 From: Nivedit Jain Date: Sun, 31 Aug 2025 14:12:14 +0530 Subject: [PATCH 12/20] Update state-manager/app/controller/errored_state.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- state-manager/app/controller/errored_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py index 2faab448..6645dac9 100644 --- a/state-manager/app/controller/errored_state.py +++ b/state-manager/app/controller/errored_state.py @@ -59,7 +59,7 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id) retry_created = True except DuplicateKeyError: - logger.info(f"Retry state {retry_state.id} already exists for state {state_id}", x_exosphere_request_id=x_exosphere_request_id) + logger.info(f"Duplicate retry state detected for state {state_id}. A retry state with the same unique key already exists.", x_exosphere_request_id=x_exosphere_request_id) retry_created = True state.status = StateStatusEnum.ERRORED From fded75b13c7bb6c968975d8937ff98c1eb792c03 Mon Sep 17 00:00:00 2001 From: Nivedit Jain Date: Sun, 31 Aug 2025 14:12:22 +0530 Subject: [PATCH 13/20] Update state-manager/app/models/retry_policy_model.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- state-manager/app/models/retry_policy_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py index 9c695bfe..61da87a3 100644 --- a/state-manager/app/models/retry_policy_model.py +++ b/state-manager/app/models/retry_policy_model.py @@ -30,7 +30,7 @@ def _cap(value: int) -> int: return value if retry_count < 1: - raise ValueError(f"Retry count must be greater than 1, got {retry_count}") + raise ValueError(f"Retry count must be greater than or equal to 1, got {retry_count}") if self.strategy == RetryStrategy.EXPONENTIAL: return _cap(self.backoff_factor * (self.exponent ** (retry_count - 1))) From e56d5f5da2ba2bc5b2b3a81278c19f807c85229a Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 14:26:03 +0530 Subject: [PATCH 14/20] Update max_delay description in RetryPolicyModel to clarify behavior when set to None --- state-manager/app/models/retry_policy_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py index 61da87a3..be719176 100644 --- a/state-manager/app/models/retry_policy_model.py +++ b/state-manager/app/models/retry_policy_model.py @@ -20,7 +20,7 @@ class RetryPolicyModel(BaseModel): strategy: RetryStrategy = Field(default=RetryStrategy.EXPONENTIAL, description="The method of retry") backoff_factor: int = Field(default=2000, description="The backoff factor in milliseconds (default: 2000 = 2 seconds)", gt=0) exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0) - max_delay: int | None = Field(default=None, description="The maximum delay in milliseconds (default: 3600000 = 1 hour)", gt=0) + max_delay: int | None = Field(default=None, description="The maximum delay in milliseconds (no default limit when None)", gt=0) def compute_delay(self, retry_count: int) -> int: From 42efd6850a647b42c6c60ac60c6b255f999cab4d Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 14:32:23 +0530 Subject: [PATCH 15/20] Refine documentation for retry policy and errored state handling - Added missing newlines in the retry policy documentation for better readability. - Removed redundant assignment in the errored state handling to streamline the code logic. --- docs/docs/exosphere/retry-policy.md | 4 +++- state-manager/app/controller/errored_state.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md index 87257bcd..639306dc 100644 --- a/docs/docs/exosphere/retry-policy.md +++ b/docs/docs/exosphere/retry-policy.md @@ -229,6 +229,7 @@ def _cap(value: int) -> int: ``` **Behavior:** + - If `max_delay` is set, all calculated delays are capped at this value - If `max_delay` is `null` (default), no capping is applied - The capping is applied after all strategy calculations. @@ -238,6 +239,7 @@ def _cap(value: int) -> int: Consider an exponential strategy with `backoff_factor: 2000`, `exponent: 2`, and `max_delay: 10000`: **With capping:** + - Retry 1: 2000ms - Retry 2: 4000ms - Retry 3: 8000ms @@ -392,4 +394,4 @@ Retry policies work alongside Exosphere's signaling system: - Nodes can still raise `PruneSignal` to stop retries immediately - Nodes can raise `ReQueueAfterSignal` to re-queue after some time. This will not mark nodes as failures. -- When a node is re-queued using `ReQueueAfterSignal`, the `retry_count` is not incremented. The existing count is carried over to the new state. \ No newline at end of file +- When a node is re-queued using `ReQueueAfterSignal`, the `retry_count` is not incremented. The existing count is carried over to the new state. diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py index 6645dac9..f798cec8 100644 --- a/state-manager/app/controller/errored_state.py +++ b/state-manager/app/controller/errored_state.py @@ -60,7 +60,6 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E retry_created = True except DuplicateKeyError: logger.info(f"Duplicate retry state detected for state {state_id}. A retry state with the same unique key already exists.", x_exosphere_request_id=x_exosphere_request_id) - retry_created = True state.status = StateStatusEnum.ERRORED state.error = body.error From 4ec30abd4c6ca679c398eb035045ef761a617e5a Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 14:54:02 +0530 Subject: [PATCH 16/20] Enhance tests for errored state and upsert graph template - Added retry policy integration in the test cases for errored state handling and upsert graph template. - Updated test fixtures to include additional state attributes for better simulation of state behavior. - Improved error handling assertions in the errored state tests to ensure proper HTTP exceptions are raised. - Mocked retry policy in graph template tests to validate retry behavior during upsert operations. --- .../unit/controller/test_errored_state.py | 94 +++-- .../controller/test_upsert_graph_template.py | 33 ++ .../unit/models/test_retry_policy_model.py | 378 ++++++++++++++++++ 3 files changed, 478 insertions(+), 27 deletions(-) create mode 100644 state-manager/tests/unit/models/test_retry_policy_model.py diff --git a/state-manager/tests/unit/controller/test_errored_state.py b/state-manager/tests/unit/controller/test_errored_state.py index b1fc7df5..df1d5526 100644 --- a/state-manager/tests/unit/controller/test_errored_state.py +++ b/state-manager/tests/unit/controller/test_errored_state.py @@ -34,6 +34,16 @@ def mock_state_queued(self): state = MagicMock() state.id = PydanticObjectId() state.status = StateStatusEnum.QUEUED + state.graph_name = "test_graph" + state.retry_count = 0 + state.node_name = "test_node" + state.namespace_name = "test_namespace" + state.identifier = "test_identifier" + state.run_id = "test_run_id" + state.inputs = {} + state.parents = [] + state.does_unites = False + state.fanout_id = None return state @pytest.fixture @@ -41,11 +51,23 @@ def mock_state_executed(self): state = MagicMock() state.id = PydanticObjectId() state.status = StateStatusEnum.EXECUTED + state.graph_name = "test_graph" + state.retry_count = 0 + state.node_name = "test_node" + state.namespace_name = "test_namespace" + state.identifier = "test_identifier" + state.run_id = "test_run_id" + state.inputs = {} + state.parents = [] + state.does_unites = False + state.fanout_id = None return state @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') async def test_errored_state_success_queued( self, + mock_graph_template_class, mock_state_class, mock_namespace, mock_state_id, @@ -55,10 +77,18 @@ async def test_errored_state_success_queued( ): """Test successful error marking of queued state""" - mock_state_queued.save = AsyncMock() + # Mock GraphTemplate.get to return a valid graph template + mock_graph_template = MagicMock() + mock_graph_template.retry_policy.max_retries = 3 + mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000) + mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template) - mock_state_queued.status = StateStatusEnum.QUEUED - mock_state_queued.save = AsyncMock() + # Mock State constructor and insert method + mock_retry_state = MagicMock() + mock_retry_state.insert = AsyncMock(return_value=mock_retry_state) + mock_state_class.return_value = mock_retry_state + + mock_state_queued.save = AsyncMock() mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) # Act @@ -75,8 +105,10 @@ async def test_errored_state_success_queued( @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') async def test_errored_state_success_executed( self, + mock_graph_template_class, mock_state_class, mock_namespace, mock_state_id, @@ -84,26 +116,22 @@ async def test_errored_state_success_executed( mock_state_executed, mock_request_id ): - """Test successful error marking of executed state""" - - mock_state_executed.save = AsyncMock() - - mock_state_executed.status = StateStatusEnum.QUEUED - mock_state_executed.save = AsyncMock() + """Test that executed states cannot be marked as errored""" + # Arrange + mock_state_executed.status = StateStatusEnum.EXECUTED mock_state_class.find_one = AsyncMock(return_value=mock_state_executed) - # Act - result = await errored_state( - mock_namespace, - mock_state_id, - mock_errored_request, - mock_request_id - ) - - # Assert - assert result.status == StateStatusEnum.ERRORED - assert mock_state_class.find_one.call_count == 1 # Called once for finding + # Act & Assert + with pytest.raises(HTTPException) as exc_info: + await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + assert exc_info.value.status_code == status.HTTP_400_BAD_REQUEST + assert exc_info.value.detail == "State is already executed" @patch('app.controller.errored_state.State') async def test_errored_state_not_found( @@ -197,6 +225,7 @@ async def test_errored_state_already_executed( # Arrange mock_state = MagicMock() mock_state.status = StateStatusEnum.EXECUTED + mock_state.graph_name = "test_graph" mock_state_class.find_one = AsyncMock(return_value=mock_state) # Act & Assert @@ -222,7 +251,7 @@ async def test_errored_state_database_error( ): """Test handling of database errors""" # Arrange - mock_state_class.find_one = MagicMock(side_effect=Exception("Database error")) + mock_state_class.find_one = AsyncMock(side_effect=Exception("Database error")) # Act & Assert with pytest.raises(Exception) as exc_info: @@ -236,31 +265,42 @@ async def test_errored_state_database_error( assert str(exc_info.value) == "Database error" @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') async def test_errored_state_with_different_error_message( self, + mock_graph_template_class, mock_state_class, mock_namespace, mock_state_id, + mock_errored_request, mock_state_queued, mock_request_id ): """Test error marking with different error message""" # Arrange - errored_request = ErroredRequestModel( + different_error_request = ErroredRequestModel( error="Different error message" - ) + ) + + # Mock GraphTemplate.get to return a valid graph template + mock_graph_template = MagicMock() + mock_graph_template.retry_policy.max_retries = 3 + mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000) + mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template) + + # Mock State constructor and insert method + mock_retry_state = MagicMock() + mock_retry_state.insert = AsyncMock(return_value=mock_retry_state) + mock_state_class.return_value = mock_retry_state mock_state_queued.save = AsyncMock() - - mock_state_queued.status = StateStatusEnum.QUEUED - mock_state_queued.set = AsyncMock() mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) # Act result = await errored_state( mock_namespace, mock_state_id, - errored_request, + different_error_request, mock_request_id ) diff --git a/state-manager/tests/unit/controller/test_upsert_graph_template.py b/state-manager/tests/unit/controller/test_upsert_graph_template.py index 2d309376..00832b7f 100644 --- a/state-manager/tests/unit/controller/test_upsert_graph_template.py +++ b/state-manager/tests/unit/controller/test_upsert_graph_template.py @@ -7,6 +7,7 @@ from app.models.graph_models import UpsertGraphTemplateRequest from app.models.graph_template_validation_status import GraphTemplateValidationStatus from app.models.node_template_model import NodeTemplate +from app.models.retry_policy_model import RetryPolicyModel class TestUpsertGraphTemplate: @@ -74,6 +75,14 @@ def mock_existing_template(self, mock_nodes, mock_secrets): template.updated_at = datetime(2023, 1, 2, 12, 0, 0) template.get_secrets.return_value = mock_secrets template.set_secrets.return_value = template + + # Add proper retry_policy using real RetryPolicyModel + template.retry_policy = RetryPolicyModel( + max_retries=3, + backoff_factor=1000, + max_delay=30000 + ) + return template @patch('app.controller.upsert_graph_template.GraphTemplate') @@ -148,6 +157,14 @@ async def test_upsert_graph_template_create_new( mock_new_template.get_secrets.return_value = mock_upsert_request.secrets mock_new_template.set_secrets.return_value = mock_new_template + # Add proper retry_policy mock + mock_retry_policy = RetryPolicyModel( + max_retries=3, + backoff_factor=1000, + max_delay=30000 + ) + mock_new_template.retry_policy = mock_retry_policy + mock_graph_template_class.insert = AsyncMock(return_value=mock_new_template) # Act @@ -225,6 +242,14 @@ async def test_upsert_graph_template_with_empty_nodes( mock_existing_template.get_secrets.return_value = {} mock_existing_template.set_secrets.return_value = mock_existing_template + # Add proper retry_policy mock + mock_retry_policy = RetryPolicyModel( + max_retries=3, + backoff_factor=1000, + max_delay=30000 + ) + mock_existing_template.retry_policy = mock_retry_policy + mock_existing_template.update = AsyncMock() mock_graph_template_class.find_one = AsyncMock(return_value=mock_existing_template) @@ -269,6 +294,14 @@ async def test_upsert_graph_template_with_validation_errors( mock_existing_template.get_secrets.return_value = mock_upsert_request.secrets mock_existing_template.set_secrets.return_value = mock_existing_template + # Add proper retry_policy mock + mock_retry_policy = RetryPolicyModel( + max_retries=3, + backoff_factor=1000, + max_delay=30000 + ) + mock_existing_template.retry_policy = mock_retry_policy + mock_existing_template.update = AsyncMock() mock_graph_template_class.find_one = AsyncMock(return_value=mock_existing_template) diff --git a/state-manager/tests/unit/models/test_retry_policy_model.py b/state-manager/tests/unit/models/test_retry_policy_model.py new file mode 100644 index 00000000..0ec7c5b8 --- /dev/null +++ b/state-manager/tests/unit/models/test_retry_policy_model.py @@ -0,0 +1,378 @@ +import pytest +import random +from app.models.retry_policy_model import RetryPolicyModel, RetryStrategy + + +class TestRetryPolicyModel: + """Test cases for RetryPolicyModel""" + + def test_default_initialization(self): + """Test RetryPolicyModel with default values""" + policy = RetryPolicyModel() + + assert policy.max_retries == 3 + assert policy.strategy == RetryStrategy.EXPONENTIAL + assert policy.backoff_factor == 2000 + assert policy.exponent == 2 + assert policy.max_delay is None + + def test_custom_initialization(self): + """Test RetryPolicyModel with custom values""" + policy = RetryPolicyModel( + max_retries=5, + strategy=RetryStrategy.LINEAR, + backoff_factor=1000, + exponent=3, + max_delay=10000 + ) + + assert policy.max_retries == 5 + assert policy.strategy == RetryStrategy.LINEAR + assert policy.backoff_factor == 1000 + assert policy.exponent == 3 + assert policy.max_delay == 10000 + + def test_exponential_strategy(self): + """Test exponential retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # 1000 * 2^0 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 2000 # 1000 * 2^1 + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 4000 # 1000 * 2^2 + + def test_exponential_strategy_with_max_delay(self): + """Test exponential retry strategy with max delay cap""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2, + max_delay=3000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # 1000 * 2^0 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 2000 # 1000 * 2^1 + + # Test retry count 3 (should be capped at max_delay) + delay = policy.compute_delay(3) + assert delay == 3000 # Capped at max_delay + + def test_linear_strategy(self): + """Test linear retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.LINEAR, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # 1000 * 1 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 2000 # 1000 * 2 + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 3000 # 1000 * 3 + + def test_fixed_strategy(self): + """Test fixed retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.FIXED, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # Always 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 1000 # Always 1000 + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 1000 # Always 1000 + + def test_exponential_full_jitter_strategy(self): + """Test exponential full jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER, + backoff_factor=1000, + exponent=2 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 0 <= delay <= 1000 # Random between 0 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 0 <= delay <= 2000 # Random between 0 and 2000 + + def test_exponential_equal_jitter_strategy(self): + """Test exponential equal jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL_EQUAL_JITTER, + backoff_factor=1000, + exponent=2 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 500 <= delay <= 1000 # Random between 500 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 1000 <= delay <= 2000 # Random between 1000 and 2000 + + def test_linear_full_jitter_strategy(self): + """Test linear full jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.LINEAR_FULL_JITTER, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 0 <= delay <= 1000 # Random between 0 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 0 <= delay <= 2000 # Random between 0 and 2000 + + def test_linear_equal_jitter_strategy(self): + """Test linear equal jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.LINEAR_EQUAL_JITTER, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 500 <= delay <= 1000 # Random between 500 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 1000 <= delay <= 2000 # Random between 1000 and 2000 + + def test_fixed_full_jitter_strategy(self): + """Test fixed full jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.FIXED_FULL_JITTER, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 0 <= delay <= 1000 # Random between 0 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 0 <= delay <= 1000 # Random between 0 and 1000 + + def test_fixed_equal_jitter_strategy(self): + """Test fixed equal jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.FIXED_EQUAL_JITTER, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 500 <= delay <= 1000 # Random between 500 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 500 <= delay <= 1000 # Random between 500 and 1000 + + def test_invalid_retry_count(self): + """Test that invalid retry count raises ValueError""" + policy = RetryPolicyModel() + + # Test retry count 0 + with pytest.raises(ValueError, match="Retry count must be greater than or equal to 1"): + policy.compute_delay(0) + + # Test retry count -1 + with pytest.raises(ValueError, match="Retry count must be greater than or equal to 1"): + policy.compute_delay(-1) + + def test_max_delay_capping(self): + """Test that max_delay properly caps the delay""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2, + max_delay=1500 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # Not capped + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 1500 # Capped at max_delay + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 1500 # Capped at max_delay + + def test_jitter_strategies_with_max_delay(self): + """Test jitter strategies with max delay capping""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER, + backoff_factor=1000, + exponent=2, + max_delay=1500 + ) + + # Test multiple calls to ensure max_delay is respected + for _ in range(10): + delay = policy.compute_delay(3) + assert delay <= 1500 # Should never exceed max_delay + + def test_different_exponents(self): + """Test different exponent values""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=3 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # 1000 * 3^0 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 3000 # 1000 * 3^1 + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 9000 # 1000 * 3^2 + + def test_different_backoff_factors(self): + """Test different backoff factor values""" + policy = RetryPolicyModel( + strategy=RetryStrategy.LINEAR, + backoff_factor=500 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 500 # 500 * 1 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 1000 # 500 * 2 + + def test_model_validation(self): + """Test Pydantic model validation""" + # Test valid model + policy = RetryPolicyModel( + max_retries=5, + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2, + max_delay=10000 + ) + + # Test invalid max_retries (negative) + with pytest.raises(ValueError): + RetryPolicyModel(max_retries=-1) + + # Test invalid backoff_factor (non-positive) + with pytest.raises(ValueError): + RetryPolicyModel(backoff_factor=0) + + # Test invalid exponent (non-positive) + with pytest.raises(ValueError): + RetryPolicyModel(exponent=0) + + # Test invalid max_delay (non-positive) + with pytest.raises(ValueError): + RetryPolicyModel(max_delay=0) + + def test_strategy_enum_values(self): + """Test all RetryStrategy enum values""" + strategies = [ + RetryStrategy.EXPONENTIAL, + RetryStrategy.EXPONENTIAL_FULL_JITTER, + RetryStrategy.EXPONENTIAL_EQUAL_JITTER, + RetryStrategy.LINEAR, + RetryStrategy.LINEAR_FULL_JITTER, + RetryStrategy.LINEAR_EQUAL_JITTER, + RetryStrategy.FIXED, + RetryStrategy.FIXED_FULL_JITTER, + RetryStrategy.FIXED_EQUAL_JITTER + ] + + for strategy in strategies: + policy = RetryPolicyModel(strategy=strategy) + assert policy.strategy == strategy + # Should not raise any exceptions + delay = policy.compute_delay(1) + assert isinstance(delay, int) + assert delay >= 0 + + def test_edge_case_large_numbers(self): + """Test edge cases with large numbers""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000000, + exponent=10 + ) + + # Test that large numbers don't cause overflow + delay = policy.compute_delay(3) + assert isinstance(delay, int) + assert delay > 0 + + def test_consistency_across_calls(self): + """Test that non-jitter strategies are consistent""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2 + ) + + # Multiple calls should return the same result for non-jitter strategies + delay1 = policy.compute_delay(2) + delay2 = policy.compute_delay(2) + assert delay1 == delay2 + + def test_jitter_variability(self): + """Test that jitter strategies produce different results""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER, + backoff_factor=1000, + exponent=2 + ) + + # Multiple calls should return different results for jitter strategies + delays = set() + for _ in range(100): + delay = policy.compute_delay(2) + delays.add(delay) + + # Should have multiple different values (not all the same) + assert len(delays) > 1 \ No newline at end of file From b1b85d187b8e268c5b37bafe4fc5e0941ceb0461 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 14:55:49 +0530 Subject: [PATCH 17/20] Refactor test for RetryPolicyModel by removing unnecessary import - Removed the unused 'random' import from the test file for cleaner code. - Updated the instantiation of RetryPolicyModel to improve readability. --- state-manager/tests/unit/models/test_retry_policy_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/state-manager/tests/unit/models/test_retry_policy_model.py b/state-manager/tests/unit/models/test_retry_policy_model.py index 0ec7c5b8..038f3adc 100644 --- a/state-manager/tests/unit/models/test_retry_policy_model.py +++ b/state-manager/tests/unit/models/test_retry_policy_model.py @@ -1,5 +1,4 @@ import pytest -import random from app.models.retry_policy_model import RetryPolicyModel, RetryStrategy @@ -288,7 +287,7 @@ def test_different_backoff_factors(self): def test_model_validation(self): """Test Pydantic model validation""" # Test valid model - policy = RetryPolicyModel( + RetryPolicyModel( max_retries=5, strategy=RetryStrategy.EXPONENTIAL, backoff_factor=1000, From 3806d2684d758387ba80dd458c36d9b7b5d5961e Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 15:04:35 +0530 Subject: [PATCH 18/20] Add comprehensive tests for errored state handling in graph templates - Introduced multiple test cases to cover scenarios where graph templates are not found, encounter other errors, or trigger DuplicateKeyError during state creation. - Enhanced error handling assertions to ensure proper HTTP exceptions are raised for missing graph templates and other exceptions. - Validated behavior when maximum retries are reached, ensuring no new state is created in such cases. - Improved overall test coverage for errored state functionality. --- .../unit/controller/test_errored_state.py | 181 ++++++++++++++++++ 1 file changed, 181 insertions(+) diff --git a/state-manager/tests/unit/controller/test_errored_state.py b/state-manager/tests/unit/controller/test_errored_state.py index df1d5526..31338db3 100644 --- a/state-manager/tests/unit/controller/test_errored_state.py +++ b/state-manager/tests/unit/controller/test_errored_state.py @@ -2,6 +2,7 @@ from unittest.mock import AsyncMock, MagicMock, patch from fastapi import HTTPException, status from beanie import PydanticObjectId +from pymongo.errors import DuplicateKeyError from app.controller.errored_state import errored_state from app.models.errored_models import ErroredRequestModel @@ -309,3 +310,183 @@ async def test_errored_state_with_different_error_message( assert mock_state_class.find_one.call_count == 1 # Called once for finding assert mock_state_queued.error == "Different error message" + @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') + async def test_errored_state_graph_template_not_found( + self, + mock_graph_template_class, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_state_queued, + mock_request_id + ): + """Test when graph template is not found""" + # Arrange + mock_state_queued.save = AsyncMock() + mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) + + # Mock GraphTemplate.get to raise ValueError with "Graph template not found" + mock_graph_template_class.get = AsyncMock(side_effect=ValueError("Graph template not found")) + + # Act & Assert + with pytest.raises(HTTPException) as exc_info: + await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + assert exc_info.value.status_code == status.HTTP_404_NOT_FOUND + assert exc_info.value.detail == "Graph template not found" + + @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') + async def test_errored_state_graph_template_other_error( + self, + mock_graph_template_class, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_state_queued, + mock_request_id + ): + """Test when graph template raises other exceptions""" + # Arrange + mock_state_queued.save = AsyncMock() + mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) + + # Mock GraphTemplate.get to raise a different exception + mock_graph_template_class.get = AsyncMock(side_effect=Exception("Database connection error")) + + # Act & Assert + with pytest.raises(Exception) as exc_info: + await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + assert str(exc_info.value) == "Database connection error" + + @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') + async def test_errored_state_duplicate_key_error( + self, + mock_graph_template_class, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_state_queued, + mock_request_id + ): + """Test when creating retry state encounters DuplicateKeyError""" + # Arrange + mock_state_queued.save = AsyncMock() + mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) + + # Mock GraphTemplate.get to return a valid graph template + mock_graph_template = MagicMock() + mock_graph_template.retry_policy.max_retries = 3 + mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000) + mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template) + + # Mock State constructor and insert method to raise DuplicateKeyError + mock_retry_state = MagicMock() + mock_retry_state.insert = AsyncMock(side_effect=DuplicateKeyError("Duplicate key error")) + mock_state_class.return_value = mock_retry_state + + # Act + result = await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + # Assert + assert result.status == StateStatusEnum.ERRORED + assert result.retry_created == False + assert mock_state_queued.status == StateStatusEnum.ERRORED + assert mock_state_queued.error == mock_errored_request.error + + @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') + async def test_errored_state_max_retries_reached( + self, + mock_graph_template_class, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ): + """Test when state has reached max retries""" + # Arrange + mock_state = MagicMock() + mock_state.id = PydanticObjectId() + mock_state.status = StateStatusEnum.QUEUED + mock_state.graph_name = "test_graph" + mock_state.retry_count = 3 # Already at max retries + mock_state.node_name = "test_node" + mock_state.namespace_name = "test_namespace" + mock_state.identifier = "test_identifier" + mock_state.run_id = "test_run_id" + mock_state.inputs = {} + mock_state.parents = [] + mock_state.does_unites = False + mock_state.fanout_id = None + mock_state.save = AsyncMock() + + mock_state_class.find_one = AsyncMock(return_value=mock_state) + + # Mock GraphTemplate.get to return a valid graph template with max_retries = 3 + mock_graph_template = MagicMock() + mock_graph_template.retry_policy.max_retries = 3 + mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template) + + # Act + result = await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + # Assert + assert result.status == StateStatusEnum.ERRORED + assert result.retry_created == False + assert mock_state.status == StateStatusEnum.ERRORED + assert mock_state.error == mock_errored_request.error + # Verify that State constructor was not called (no retry created) + mock_state_class.assert_not_called() + + @patch('app.controller.errored_state.State') + async def test_errored_state_general_exception( + self, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ): + """Test handling of general exceptions in the main try-catch block""" + # Arrange + mock_state_class.find_one = AsyncMock(side_effect=Exception("Unexpected error")) + + # Act & Assert + with pytest.raises(Exception) as exc_info: + await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + assert str(exc_info.value) == "Unexpected error" + From 4048228e97a1d31308238f179453babe7a9a222a Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 15:06:27 +0530 Subject: [PATCH 19/20] Refactor assertions in errored state tests for clarity - Updated assertions in the TestErroredState class to use 'not' instead of '== False' for improved readability. - Ensured consistency in the test code style across multiple test cases. --- state-manager/tests/unit/controller/test_errored_state.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/state-manager/tests/unit/controller/test_errored_state.py b/state-manager/tests/unit/controller/test_errored_state.py index 31338db3..032571bf 100644 --- a/state-manager/tests/unit/controller/test_errored_state.py +++ b/state-manager/tests/unit/controller/test_errored_state.py @@ -411,7 +411,7 @@ async def test_errored_state_duplicate_key_error( # Assert assert result.status == StateStatusEnum.ERRORED - assert result.retry_created == False + assert not result.retry_created assert mock_state_queued.status == StateStatusEnum.ERRORED assert mock_state_queued.error == mock_errored_request.error @@ -460,7 +460,7 @@ async def test_errored_state_max_retries_reached( # Assert assert result.status == StateStatusEnum.ERRORED - assert result.retry_created == False + assert not result.retry_created assert mock_state.status == StateStatusEnum.ERRORED assert mock_state.error == mock_errored_request.error # Verify that State constructor was not called (no retry created) From 462cfdefbb905fe7aa1ca24f5a4c957867fdc2a9 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Sun, 31 Aug 2025 15:08:39 +0530 Subject: [PATCH 20/20] Remove Kubernetes deployment steps from the publish workflow - Eliminated the deploy-to-k8s job from the publish-state-manager workflow to streamline the CI/CD process. - This change focuses on publishing the image without the deployment step, simplifying the workflow configuration. --- .github/workflows/publish-state-mangaer.yml | 27 --------------------- 1 file changed, 27 deletions(-) diff --git a/.github/workflows/publish-state-mangaer.yml b/.github/workflows/publish-state-mangaer.yml index 883fd9b8..b3926336 100644 --- a/.github/workflows/publish-state-mangaer.yml +++ b/.github/workflows/publish-state-mangaer.yml @@ -111,30 +111,3 @@ jobs: push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - - deploy-to-k8s: - needs: publish-image - runs-on: ubuntu-latest - - steps: - - name: Deploy to K8s - run: | - echo "${{ secrets.KUBE_CONFIG }}" | base64 -d > kubeconfig.yaml - export KUBECONFIG=$PWD/kubeconfig.yaml - kubectl get nodes - - echo "selected image: ${{ fromJson(needs.publish-image.outputs.json).tags[1] }}" - - kubectl set image deployment/exosphere-state-manager exosphere-state-manager=${{fromJson(needs.publish-image.outputs.json).tags[1]}} - - kubectl rollout status deployment/exosphere-state-manager - - status=$(kubectl rollout status deployment/exosphere-state-manager) - - echo "$status" - - if [[ "$status" != *"successfully rolled out"* ]]; then - kubectl rollout undo deployment/exosphere-state-manager - echo "❌ Deployment failed. Rolled back." >&2 - exit 1 - fi \ No newline at end of file