From 8f7bcc13687a05d3796bb5515a6170975f66f315 Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 12:20:17 +0530
Subject: [PATCH 01/20] Implement retry policy and enhance errored state
 handling

- Added a retry policy model to manage state retries with configurable methods (fixed, linear, exponential).
- Updated the errored state function to create a retry state if the maximum retries have not been reached, improving error recovery.
- Enhanced the ErroredResponseModel to include a flag indicating whether a retry state was created.
- Modified the GraphTemplate and State models to incorporate retry policy attributes, ensuring better state management.
- Improved validation and error handling in the upsert_graph_template function to accommodate the new retry policy structure.
---
 state-manager/app/controller/errored_state.py | 39 ++++++++++++++++++-
 .../app/controller/upsert_graph_template.py   |  7 +++-
 .../app/models/db/graph_template_model.py     |  3 +-
 state-manager/app/models/db/state.py          |  2 +
 state-manager/app/models/errored_models.py    |  3 +-
 state-manager/app/models/graph_models.py      |  3 ++
 .../app/models/retry_policy_model.py          | 12 ++++++
 7 files changed, 64 insertions(+), 5 deletions(-)
 create mode 100644 state-manager/app/models/retry_policy_model.py

diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py
index b59e2573..889c01ad 100644
--- a/state-manager/app/controller/errored_state.py
+++ b/state-manager/app/controller/errored_state.py
@@ -5,9 +5,22 @@
 from app.models.db.state import State
 from app.models.state_status_enum import StateStatusEnum
 from app.singletons.logs_manager import LogsManager
+from app.models.retry_policy_model import RetryPolicyModel, RetryMethod
+from app.models.db.graph_template_model import GraphTemplate
 
 logger = LogsManager().get_logger()
 
+def _calculate_enqueue_after(retry_policy: RetryPolicyModel, retry_count: int) -> int:
+    # convert seconds to milliseconds
+    if retry_policy.method == RetryMethod.FIXED:
+        return (retry_policy.backoff_factor * 1000)
+    elif retry_policy.method == RetryMethod.LINEAR:
+        return (retry_policy.backoff_factor * retry_count) * 1000
+    elif retry_policy.method == RetryMethod.EXPONENTIAL:
+        return (retry_policy.backoff_factor ** retry_count) * 1000
+    else:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid retry method")
+
 async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: ErroredRequestModel, x_exosphere_request_id: str) -> ErroredResponseModel:
 
     try:
@@ -23,11 +36,35 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E
         if state.status == StateStatusEnum.EXECUTED:
             raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="State is already executed")
         
+        graph_template = await GraphTemplate.get(namespace_name, state.graph_name)
+
+        retry_created = False
+
+        if state.retry_count < graph_template.retry_policy.max_retries:
+            retry_state = State(
+                node_name=state.node_name,
+                namespace_name=state.namespace_name,
+                identifier=state.identifier,
+                graph_name=state.graph_name,
+                run_id=state.run_id,
+                status=StateStatusEnum.CREATED,
+                inputs=state.inputs,
+                outputs=state.outputs,
+                error=body.error,
+                parents=state.parents,
+                does_unites=state.does_unites,
+                enqueue_after=state.enqueue_after + _calculate_enqueue_after(graph_template.retry_policy, state.retry_count + 1),
+                retry_count=state.retry_count + 1
+            )
+            retry_state = await retry_state.insert()
+            logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id)
+            retry_created = True
+
         state.status = StateStatusEnum.ERRORED
         state.error = body.error
         await state.save()
 
-        return ErroredResponseModel(status=StateStatusEnum.ERRORED)
+        return ErroredResponseModel(status=StateStatusEnum.ERRORED, retry_created=retry_created)
 
     except Exception as e:
         logger.error(f"Error errored state {state_id} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id, error=e)
diff --git a/state-manager/app/controller/upsert_graph_template.py b/state-manager/app/controller/upsert_graph_template.py
index 99a178ae..16882018 100644
--- a/state-manager/app/controller/upsert_graph_template.py
+++ b/state-manager/app/controller/upsert_graph_template.py
@@ -27,7 +27,8 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse
                     Set({
                         GraphTemplate.nodes: body.nodes, # type: ignore
                         GraphTemplate.validation_status: GraphTemplateValidationStatus.PENDING, # type: ignore
-                        GraphTemplate.validation_errors: [] # type: ignore
+                        GraphTemplate.validation_errors: [], # type: ignore
+                        GraphTemplate.retry_policy: body.retry_policy # type: ignore
                     })
                 )
                 
@@ -44,7 +45,8 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse
                         namespace=namespace_name,
                         nodes=body.nodes,
                         validation_status=GraphTemplateValidationStatus.PENDING,
-                        validation_errors=[]
+                        validation_errors=[],
+                        retry_policy=body.retry_policy
                     ).set_secrets(body.secrets)
                 )
         except ValueError as e:
@@ -58,6 +60,7 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse
             validation_status=graph_template.validation_status,
             validation_errors=graph_template.validation_errors,
             secrets={secret_name: True for secret_name in graph_template.get_secrets().keys()},
+            retry_policy=graph_template.retry_policy,
             created_at=graph_template.created_at,
             updated_at=graph_template.updated_at
         )
diff --git a/state-manager/app/models/db/graph_template_model.py b/state-manager/app/models/db/graph_template_model.py
index 999d5389..4b3a4731 100644
--- a/state-manager/app/models/db/graph_template_model.py
+++ b/state-manager/app/models/db/graph_template_model.py
@@ -11,7 +11,7 @@
 from ..node_template_model import NodeTemplate
 from app.utils.encrypter import get_encrypter
 from app.models.dependent_string import DependentString
-
+from app.models.retry_policy_model import RetryPolicyModel
 
 class GraphTemplate(BaseDatabaseModel):
     name: str = Field(..., description="Name of the graph")
@@ -20,6 +20,7 @@ class GraphTemplate(BaseDatabaseModel):
     validation_status: GraphTemplateValidationStatus = Field(..., description="Validation status of the graph")
     validation_errors: List[str] = Field(default_factory=list, description="Validation errors of the graph")
     secrets: Dict[str, str] = Field(default_factory=dict, description="Secrets of the graph")
+    retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph")
 
     _node_by_identifier: Dict[str, NodeTemplate] | None = PrivateAttr(default=None)
     _parents_by_identifier: Dict[str, set[str]] | None = PrivateAttr(default=None) # type: ignore
diff --git a/state-manager/app/models/db/state.py b/state-manager/app/models/db/state.py
index e37326b4..28efc613 100644
--- a/state-manager/app/models/db/state.py
+++ b/state-manager/app/models/db/state.py
@@ -24,6 +24,7 @@ class State(BaseDatabaseModel):
     does_unites: bool = Field(default=False, description="Whether this state unites other states")
     state_fingerprint: str = Field(default="", description="Fingerprint of the state")
     enqueue_after: int = Field(default_factory=lambda: int(time.time() * 1000), gt=0, description="Unix time in milliseconds after which the state should be enqueued")
+    retry_count: int = Field(default=0, description="Number of times the state has been retried")
     
     @before_event([Insert, Replace, Save])
     def _generate_fingerprint(self):
@@ -37,6 +38,7 @@ def _generate_fingerprint(self):
             "identifier": self.identifier,
             "graph_name": self.graph_name,
             "run_id": self.run_id,
+            "retry_count": self.retry_count,
             "parents": {k: str(v) for k, v in self.parents.items()},
         }
         payload = json.dumps(
diff --git a/state-manager/app/models/errored_models.py b/state-manager/app/models/errored_models.py
index 5acfaa34..8814d56a 100644
--- a/state-manager/app/models/errored_models.py
+++ b/state-manager/app/models/errored_models.py
@@ -7,4 +7,5 @@ class ErroredRequestModel(BaseModel):
 
 
 class ErroredResponseModel(BaseModel):
-    status: StateStatusEnum = Field(..., description="Status of the state")
\ No newline at end of file
+    status: StateStatusEnum = Field(..., description="Status of the state")
+    retry_created: bool = Field(default=False, description="Whether a retry state was created")
\ No newline at end of file
diff --git a/state-manager/app/models/graph_models.py b/state-manager/app/models/graph_models.py
index 1fd80bd1..8e67cb2d 100644
--- a/state-manager/app/models/graph_models.py
+++ b/state-manager/app/models/graph_models.py
@@ -3,16 +3,19 @@
 from typing import Dict, List, Optional
 from datetime import datetime
 from .graph_template_validation_status import GraphTemplateValidationStatus
+from .retry_policy_model import RetryPolicyModel
 
 
 class UpsertGraphTemplateRequest(BaseModel):
     secrets: Dict[str, str] = Field(..., description="Dictionary of secrets that are used while graph execution")
     nodes: List[NodeTemplate] = Field(..., description="List of node templates that define the graph structure")
+    retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph")
 
 
 class UpsertGraphTemplateResponse(BaseModel):
     nodes: List[NodeTemplate] = Field(..., description="List of node templates that define the graph structure")
     secrets: Dict[str, bool] = Field(..., description="Dictionary of secrets that are used while graph execution")
+    retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph")
     created_at: datetime = Field(..., description="Timestamp when the graph template was created")
     updated_at: datetime = Field(..., description="Timestamp when the graph template was last updated")
     validation_status: GraphTemplateValidationStatus = Field(..., description="Current validation status of the graph template")
diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py
new file mode 100644
index 00000000..2b9d4923
--- /dev/null
+++ b/state-manager/app/models/retry_policy_model.py
@@ -0,0 +1,12 @@
+from pydantic import BaseModel, Field
+from enum import Enum
+
+class RetryMethod(str, Enum):
+    EXPONENTIAL = "EXPONENTIAL"
+    LINEAR = "LINEAR"
+    FIXED = "FIXED"
+
+class RetryPolicyModel(BaseModel):
+    max_retries: int = Field(default=3, description="The maximum number of retries", ge=0)
+    method: RetryMethod = Field(default=RetryMethod.EXPONENTIAL, description="The method of retry")
+    backoff_factor: int = Field(default=2, description="The backoff factor in seconds (default: 2 = 2 seconds)", gt=0)
\ No newline at end of file

From f6a3bb2f06fb6a2a81eaae009219835f43d25296 Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 13:12:43 +0530
Subject: [PATCH 02/20] Refactor retry policy and errored state handling

- Introduced a new RetryStrategy enum with additional strategies for retrying operations, enhancing flexibility in retry mechanisms.
- Updated the RetryPolicyModel to include a compute_delay method for calculating delays based on the selected strategy.
- Refactored the errored_state function to utilize the new retry policy structure, improving error handling and state management.
- Removed the previous _calculate_enqueue_after function, streamlining the code and enhancing clarity.
---
 state-manager/app/controller/errored_state.py | 20 ++-----
 .../app/models/retry_policy_model.py          | 53 +++++++++++++++++--
 2 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py
index 889c01ad..8ceb522d 100644
--- a/state-manager/app/controller/errored_state.py
+++ b/state-manager/app/controller/errored_state.py
@@ -1,3 +1,5 @@
+import time
+
 from app.models.errored_models import ErroredRequestModel, ErroredResponseModel
 from fastapi import HTTPException, status
 from beanie import PydanticObjectId
@@ -5,22 +7,10 @@
 from app.models.db.state import State
 from app.models.state_status_enum import StateStatusEnum
 from app.singletons.logs_manager import LogsManager
-from app.models.retry_policy_model import RetryPolicyModel, RetryMethod
 from app.models.db.graph_template_model import GraphTemplate
 
 logger = LogsManager().get_logger()
 
-def _calculate_enqueue_after(retry_policy: RetryPolicyModel, retry_count: int) -> int:
-    # convert seconds to milliseconds
-    if retry_policy.method == RetryMethod.FIXED:
-        return (retry_policy.backoff_factor * 1000)
-    elif retry_policy.method == RetryMethod.LINEAR:
-        return (retry_policy.backoff_factor * retry_count) * 1000
-    elif retry_policy.method == RetryMethod.EXPONENTIAL:
-        return (retry_policy.backoff_factor ** retry_count) * 1000
-    else:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid retry method")
-
 async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: ErroredRequestModel, x_exosphere_request_id: str) -> ErroredResponseModel:
 
     try:
@@ -49,11 +39,11 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E
                 run_id=state.run_id,
                 status=StateStatusEnum.CREATED,
                 inputs=state.inputs,
-                outputs=state.outputs,
-                error=body.error,
+                outputs={},
+                error=None,
                 parents=state.parents,
                 does_unites=state.does_unites,
-                enqueue_after=state.enqueue_after + _calculate_enqueue_after(graph_template.retry_policy, state.retry_count + 1),
+                enqueue_after= int(time.time() * 1000) + graph_template.retry_policy.compute_delay(state.retry_count + 1),
                 retry_count=state.retry_count + 1
             )
             retry_state = await retry_state.insert()
diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py
index 2b9d4923..070b4528 100644
--- a/state-manager/app/models/retry_policy_model.py
+++ b/state-manager/app/models/retry_policy_model.py
@@ -1,12 +1,59 @@
 from pydantic import BaseModel, Field
 from enum import Enum
+import random
 
-class RetryMethod(str, Enum):
+class RetryStrategy(str, Enum):
     EXPONENTIAL = "EXPONENTIAL"
+    EXPONENTIAL_FULL_JITTER = "EXPONENTIAL_FULL_JITTER"
+    EXPONENTIAL_EQUAL_JITTER = "EXPONENTIAL_EQUAL_JITTER"
+
     LINEAR = "LINEAR"
+    LINEAR_FULL_JITTER = "LINEAR_FULL_JITTER"
+    LINEAR_EQUAL_JITTER = "LINEAR_EQUAL_JITTER"
+
     FIXED = "FIXED"
+    FIXED_FULL_JITTER = "FIXED_FULL_JITTER"
+    FIXED_EQUAL_JITTER = "FIXED_EQUAL_JITTER"
 
 class RetryPolicyModel(BaseModel):
     max_retries: int = Field(default=3, description="The maximum number of retries", ge=0)
-    method: RetryMethod = Field(default=RetryMethod.EXPONENTIAL, description="The method of retry")
-    backoff_factor: int = Field(default=2, description="The backoff factor in seconds (default: 2 = 2 seconds)", gt=0)
\ No newline at end of file
+    strategy: RetryStrategy = Field(default=RetryStrategy.EXPONENTIAL, description="The method of retry")
+    backoff_factor: int = Field(default=2000, description="The backoff factor in milliseconds (default: 2000 = 2 seconds)", gt=0)
+    exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0)
+
+    def compute_delay(self, retry_count: int) -> int:
+        if self.strategy == RetryStrategy.EXPONENTIAL:
+            return (self.backoff_factor * (self.exponent ** retry_count))
+        
+        elif self.strategy == RetryStrategy.EXPONENTIAL_FULL_JITTER:
+            base = self.backoff_factor * (self.exponent ** retry_count)
+            return int(random.uniform(0, base))
+        
+        elif self.strategy == RetryStrategy.EXPONENTIAL_EQUAL_JITTER:
+            base = self.backoff_factor * (self.exponent ** retry_count)
+            return int(base/2 + random.uniform(0, base / 2))
+        
+        elif self.strategy == RetryStrategy.LINEAR:
+            return (self.backoff_factor * retry_count)
+        
+        elif self.strategy == RetryStrategy.LINEAR_FULL_JITTER:
+            base = self.backoff_factor * retry_count
+            return int(random.uniform(0, base))
+
+        elif self.strategy == RetryStrategy.LINEAR_EQUAL_JITTER:
+            base = self.backoff_factor * retry_count
+            return int(base/2 + random.uniform(0, base / 2))
+
+        elif self.strategy == RetryStrategy.FIXED:
+            return self.backoff_factor
+        
+        elif self.strategy == RetryStrategy.FIXED_FULL_JITTER:
+            base = self.backoff_factor
+            return int(random.uniform(0, base))
+
+        elif self.strategy == RetryStrategy.FIXED_EQUAL_JITTER:
+            base = self.backoff_factor
+            return int(base/2 + random.uniform(0, base / 2))
+
+        else:
+            raise Exception("Invalid retry strategy")
\ No newline at end of file

From 66c794a3937ad2f9536d288ea136d757333a7bca Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 13:24:31 +0530
Subject: [PATCH 03/20] Add retry policy documentation and integrate into graph
 configuration

- Introduced a new documentation file for the Retry Policy feature, detailing its configuration and usage within Exosphere.
- Updated the `create-graph.md` file to include a section on retry policies, explaining their structure and providing examples.
- Modified `mkdocs.yml` to include the new Retry Policy documentation in the navigation, enhancing accessibility for users.
---
 docs/docs/exosphere/create-graph.md |  25 ++-
 docs/docs/exosphere/retry-policy.md | 276 ++++++++++++++++++++++++++++
 docs/mkdocs.yml                     |   2 +
 3 files changed, 302 insertions(+), 1 deletion(-)
 create mode 100644 docs/docs/exosphere/retry-policy.md

diff --git a/docs/docs/exosphere/create-graph.md b/docs/docs/exosphere/create-graph.md
index 11d20395..f5ceeaa1 100644
--- a/docs/docs/exosphere/create-graph.md
+++ b/docs/docs/exosphere/create-graph.md
@@ -51,7 +51,13 @@ One can define a graph on Exosphere through a simple json config, which specifie
       },
       "next_nodes": []
     }
-  ]
+  ],
+  "retry_policy": {
+    "max_retries": 3,
+    "strategy": "EXPONENTIAL",
+    "backoff_factor": 2000,
+    "exponent": 2
+  }
 }
 ```
 
@@ -126,6 +132,23 @@ Use the `${{ ... }}` syntax to map outputs from previous nodes:
 - **`${{ node_identifier.outputs.field_name }}`**: Maps output from a specific node
 - **`initial`**: Static value provided when the graph is triggered
 - **Direct values**: String values. In v1, numbers/booleans must be string-encoded (e.g., "42", "true").
+
+### Retry Policy
+
+Graphs can include a retry policy to handle transient failures automatically. The retry policy is configured at the graph level and applies to all nodes within the graph.
+
+```json
+{
+  "retry_policy": {
+    "max_retries": 3,
+    "strategy": "EXPONENTIAL",
+    "backoff_factor": 2000,
+    "exponent": 2
+  }
+}
+```
+
+For detailed information about retry policies, including all available strategies and configuration options, see the [Retry Policy](retry-policy.md) documentation.
 ## Creating Graph Templates
 
 The recommended way to create graph templates is using the Exosphere Python SDK, which provides a clean interface to the State Manager API.
diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md
new file mode 100644
index 00000000..1dbc6430
--- /dev/null
+++ b/docs/docs/exosphere/retry-policy.md
@@ -0,0 +1,276 @@
+# Retry Policy
+
+!!! beta "Beta Feature"
+    Retry Policy is currently available in beta. The API and functionality may change in future releases.
+
+The Retry Policy feature in Exosphere provides sophisticated retry mechanisms for handling transient failures in your workflow nodes. When a node execution fails, the retry policy automatically determines when and how to retry the execution based on configurable strategies.
+
+## Overview
+
+Retry policies are configured at the graph level and apply to all nodes within that graph. When a node fails with an error, the state manager automatically creates a retry state with a calculated delay before the next execution attempt.
+
+## Configuration
+
+Retry policies are defined in your graph template configuration:
+
+```json
+{
+  "secrets": {
+    "api_key": "your-api-key"
+  },
+  "nodes": [
+    {
+      "node_name": "MyNode",
+      "namespace": "MyProject",
+      "identifier": "my_node",
+      "inputs": {
+        "data": "initial"
+      },
+      "next_nodes": []
+    }
+  ],
+  "retry_policy": {
+    "max_retries": 3,
+    "strategy": "EXPONENTIAL",
+    "backoff_factor": 2000,
+    "exponent": 2
+  }
+}
+```
+
+## Parameters
+
+### max_retries
+- **Type**: `int`
+- **Default**: `3`
+- **Description**: The maximum number of retry attempts before giving up
+- **Constraints**: Must be >= 0
+
+### strategy
+- **Type**: `string`
+- **Default**: `"EXPONENTIAL"`
+- **Description**: The retry strategy to use for calculating delays
+- **Options**: See [Retry Strategies](#retry-strategies) below
+
+### backoff_factor
+- **Type**: `int`
+- **Default**: `2000` (2 seconds)
+- **Description**: The base delay factor in milliseconds
+- **Constraints**: Must be > 0
+
+### exponent
+- **Type**: `int`
+- **Default**: `2`
+- **Description**: The exponent used for exponential strategies
+- **Constraints**: Must be > 0
+
+## Retry Strategies
+
+Exosphere supports three main categories of retry strategies, each with jitter variants to prevent thundering herd problems.
+
+### Exponential Strategies
+
+Exponential strategies increase the delay exponentially with each retry attempt.
+
+#### EXPONENTIAL
+Standard exponential backoff without jitter.
+
+**Formula**: `backoff_factor * (exponent ^ retry_count)`
+
+**Example**:
+- Retry 1: 2000ms (2 seconds)
+- Retry 2: 4000ms (4 seconds)
+- Retry 3: 8000ms (8 seconds)
+
+#### EXPONENTIAL_FULL_JITTER
+Exponential backoff with full jitter (random delay between 0 and calculated delay).
+
+**Formula**: `random(0, backoff_factor * (exponent ^ retry_count))`
+
+**Example**:
+- Retry 1: 0-2000ms (random)
+- Retry 2: 0-4000ms (random)
+- Retry 3: 0-8000ms (random)
+
+#### EXPONENTIAL_EQUAL_JITTER
+Exponential backoff with equal jitter (random delay around half the calculated delay).
+
+**Formula**: `(backoff_factor * (exponent ^ retry_count)) / 2 + random(0, (backoff_factor * (exponent ^ retry_count)) / 2)`
+
+**Example**:
+- Retry 1: 1000-2000ms (random)
+- Retry 2: 2000-4000ms (random)
+- Retry 3: 4000-8000ms (random)
+
+### Linear Strategies
+
+Linear strategies increase the delay linearly with each retry attempt.
+
+#### LINEAR
+Standard linear backoff without jitter.
+
+**Formula**: `backoff_factor * retry_count`
+
+**Example**:
+- Retry 1: 2000ms (2 seconds)
+- Retry 2: 4000ms (4 seconds)
+- Retry 3: 6000ms (6 seconds)
+
+#### LINEAR_FULL_JITTER
+Linear backoff with full jitter.
+
+**Formula**: `random(0, backoff_factor * retry_count)`
+
+**Example**:
+- Retry 1: 0-2000ms (random)
+- Retry 2: 0-4000ms (random)
+- Retry 3: 0-6000ms (random)
+
+#### LINEAR_EQUAL_JITTER
+Linear backoff with equal jitter.
+
+**Formula**: `(backoff_factor * retry_count) / 2 + random(0, (backoff_factor * retry_count) / 2)`
+
+**Example**:
+- Retry 1: 1000-2000ms (random)
+- Retry 2: 2000-4000ms (random)
+- Retry 3: 3000-6000ms (random)
+
+### Fixed Strategies
+
+Fixed strategies use a constant delay for all retry attempts.
+
+#### FIXED
+Standard fixed delay without jitter.
+
+**Formula**: `backoff_factor`
+
+**Example**:
+- Retry 1: 2000ms (2 seconds)
+- Retry 2: 2000ms (2 seconds)
+- Retry 3: 2000ms (2 seconds)
+
+#### FIXED_FULL_JITTER
+Fixed delay with full jitter.
+
+**Formula**: `random(0, backoff_factor)`
+
+**Example**:
+- Retry 1: 0-2000ms (random)
+- Retry 2: 0-2000ms (random)
+- Retry 3: 0-2000ms (random)
+
+#### FIXED_EQUAL_JITTER
+Fixed delay with equal jitter.
+
+**Formula**: `backoff_factor / 2 + random(0, backoff_factor / 2)`
+
+**Example**:
+- Retry 1: 1000-2000ms (random)
+- Retry 2: 1000-2000ms (random)
+- Retry 3: 1000-2000ms (random)
+
+## Usage Examples
+
+### Basic Exponential Retry
+```json
+{
+  "retry_policy": {
+    "max_retries": 3,
+    "strategy": "EXPONENTIAL",
+    "backoff_factor": 1000,
+    "exponent": 2
+  }
+}
+```
+
+### Aggressive Retry with Jitter
+```json
+{
+  "retry_policy": {
+    "max_retries": 5,
+    "strategy": "EXPONENTIAL_FULL_JITTER",
+    "backoff_factor": 500,
+    "exponent": 3
+  }
+}
+```
+
+### Conservative Linear Retry
+```json
+{
+  "retry_policy": {
+    "max_retries": 2,
+    "strategy": "LINEAR",
+    "backoff_factor": 5000
+  }
+}
+```
+
+### Fixed Retry for Rate Limiting
+```json
+{
+  "retry_policy": {
+    "max_retries": 10,
+    "strategy": "FIXED_EQUAL_JITTER",
+    "backoff_factor": 1000
+  }
+}
+```
+
+## When Retries Are Triggered
+
+Retries are automatically triggered when:
+
+1. A node execution fails with an error
+2. The current retry count is less than `max_retries`
+3. The state status is `QUEUED` or `EXECUTED`
+
+The retry mechanism:
+- Creates a new state with `retry_count` incremented by 1
+- Sets `enqueue_after` to the current time plus the calculated delay
+- Sets the original state status to `ERRORED` with the error message
+
+## Best Practices
+
+### Choose the Right Strategy
+- **EXPONENTIAL**: Best for most transient failures (network issues, temporary service unavailability)
+- **LINEAR**: Good for predictable, consistent delays
+- **FIXED**: Useful for rate limiting scenarios
+
+### Use Jitter for High Concurrency
+- **FULL_JITTER**: Best for high concurrency to prevent thundering herd
+- **EQUAL_JITTER**: Good balance between predictability and randomization
+- **No Jitter**: Use only when you need deterministic behavior
+
+### Set Appropriate Limits
+- **max_retries**: Consider the nature of your failures and downstream dependencies
+- **backoff_factor**: Balance between responsiveness and resource usage
+- **exponent**: Higher values create more aggressive backoff
+
+### Monitor Retry Patterns
+- Track retry counts in your monitoring system
+- Set up alerts for graphs with high retry rates
+- Analyze retry patterns to identify systemic issues
+
+## Limitations
+
+- Retry policies apply to all nodes in a graph uniformly
+- Individual node-level retry policies are not supported
+- Retry delays are calculated in milliseconds
+- Maximum delay is not capped (consider using reasonable `backoff_factor` and `exponent` values)
+
+## Error Handling
+
+If a retry policy configuration is invalid:
+- The graph template validation will fail
+- An error will be returned during graph creation
+- The graph will not be saved until the configuration is corrected
+
+## Integration with Signals
+
+Retry policies work alongside Exosphere's signal system:
+
+- Nodes can still raise `PruneSignal` to stop retries immediately
+- Nodes can raise `ReQueueAfterSignal` to re-queue after sometime, this will not mark nodes as failure.
+- The retry count is preserved when using signals 
\ No newline at end of file
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 51df18d8..5ebcf23f 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -101,6 +101,7 @@ plugins:
           - exosphere/register-node.md
           - exosphere/create-runtime.md
           - exosphere/create-graph.md
+          - exosphere/retry-policy.md
           - exosphere/trigger-graph.md
           - exosphere/dashboard.md
           - exosphere/signals.md
@@ -129,6 +130,7 @@ nav:
   - Register Node: exosphere/register-node.md 
   - Create Runtime: exosphere/create-runtime.md
   - Create Graph: exosphere/create-graph.md
+  - Retry Policy: exosphere/retry-policy.md
   - Trigger Graph: exosphere/trigger-graph.md  
   - Dashboard: exosphere/dashboard.md
   - Signals: exosphere/signals.md

From 728b225395ae04ebfb43a2a637943fc90a66eaf9 Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 13:32:57 +0530
Subject: [PATCH 04/20] Enhance retry policy error handling and validation

- Added validation to ensure the retry count is greater than 0 in the compute_delay method of RetryPolicyModel, raising a ValueError for invalid inputs.
- Updated the compute_delay method to correctly calculate delays based on the retry count, adjusting the exponentiation logic.
- Refined error handling in the GraphTemplate model by replacing ValueError with HTTPException for better integration with FastAPI, ensuring a 404 response when a graph template is not found.
---
 state-manager/app/models/db/graph_template_model.py |  4 +++-
 state-manager/app/models/retry_policy_model.py      | 11 +++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/state-manager/app/models/db/graph_template_model.py b/state-manager/app/models/db/graph_template_model.py
index 4b3a4731..ee0323d8 100644
--- a/state-manager/app/models/db/graph_template_model.py
+++ b/state-manager/app/models/db/graph_template_model.py
@@ -5,6 +5,8 @@
 from pymongo import IndexModel
 from pydantic import Field, field_validator, PrivateAttr, model_validator
 from typing import List, Self, Dict
+from fastapi.exceptions import HTTPException
+from fastapi import status
 
 from .base import BaseDatabaseModel
 from ..graph_template_validation_status import GraphTemplateValidationStatus
@@ -303,7 +305,7 @@ def get_path_by_identifier(self, identifier: str) -> set[str]:
     async def get(namespace: str, graph_name: str) -> "GraphTemplate":
         graph_template = await GraphTemplate.find_one(GraphTemplate.namespace == namespace, GraphTemplate.name == graph_name)
         if not graph_template:
-            raise ValueError(f"Graph template not found for namespace: {namespace} and graph name: {graph_name}")
+            raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found")
         return graph_template
     
     @staticmethod
diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py
index 070b4528..5fe92409 100644
--- a/state-manager/app/models/retry_policy_model.py
+++ b/state-manager/app/models/retry_policy_model.py
@@ -22,15 +22,18 @@ class RetryPolicyModel(BaseModel):
     exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0)
 
     def compute_delay(self, retry_count: int) -> int:
+        if retry_count < 1:
+            raise ValueError(f"Retry count must be greater than 1, got {retry_count}")
+        
         if self.strategy == RetryStrategy.EXPONENTIAL:
-            return (self.backoff_factor * (self.exponent ** retry_count))
+            return (self.backoff_factor * (self.exponent ** (retry_count - 1)))
         
         elif self.strategy == RetryStrategy.EXPONENTIAL_FULL_JITTER:
-            base = self.backoff_factor * (self.exponent ** retry_count)
+            base = self.backoff_factor * (self.exponent ** (retry_count - 1))
             return int(random.uniform(0, base))
         
         elif self.strategy == RetryStrategy.EXPONENTIAL_EQUAL_JITTER:
-            base = self.backoff_factor * (self.exponent ** retry_count)
+            base = self.backoff_factor * (self.exponent ** (retry_count - 1))
             return int(base/2 + random.uniform(0, base / 2))
         
         elif self.strategy == RetryStrategy.LINEAR:
@@ -56,4 +59,4 @@ def compute_delay(self, retry_count: int) -> int:
             return int(base/2 + random.uniform(0, base / 2))
 
         else:
-            raise Exception("Invalid retry strategy")
\ No newline at end of file
+            raise ValueError(f"Invalid retry strategy: {self.strategy}")
\ No newline at end of file

From ba1378589e994df72b3c1073896e4e3034ae2d8f Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 13:40:54 +0530
Subject: [PATCH 05/20] Update retry policy documentation and examples

- Clarified the backoff_factor parameter in the retry policy documentation to specify its unit as milliseconds.
- Added retry policy examples in the create-graph.md file to demonstrate its usage in graph template creation and updates.
- Ensured consistency in the retry policy structure across documentation, enhancing user understanding and implementation.
---
 docs/docs/exosphere/create-graph.md | 19 ++++++++--
 docs/docs/exosphere/retry-policy.md | 58 +++++++++++++++++++++++++----
 2 files changed, 67 insertions(+), 10 deletions(-)

diff --git a/docs/docs/exosphere/create-graph.md b/docs/docs/exosphere/create-graph.md
index f5ceeaa1..178debad 100644
--- a/docs/docs/exosphere/create-graph.md
+++ b/docs/docs/exosphere/create-graph.md
@@ -142,13 +142,14 @@ Graphs can include a retry policy to handle transient failures automatically. Th
   "retry_policy": {
     "max_retries": 3,
     "strategy": "EXPONENTIAL",
-    "backoff_factor": 2000,
+    "backoff_factor": 2000, // milliseconds
     "exponent": 2
   }
 }
 ```
 
 For detailed information about retry policies, including all available strategies and configuration options, see the [Retry Policy](retry-policy.md) documentation.
+
 ## Creating Graph Templates
 
 The recommended way to create graph templates is using the Exosphere Python SDK, which provides a clean interface to the State Manager API.
@@ -179,7 +180,13 @@ async def create_graph_template():
         result = await state_manager.upsert_graph(
             graph_name="my-workflow",
             graph_nodes=graph_nodes,
-            secrets=secrets
+            secrets=secrets,
+            retry_policy={
+                "max_retries": 3,
+                "strategy": "EXPONENTIAL",
+                "backoff_factor": 2000,
+                "exponent": 2
+            }
         )
         print("Graph template created successfully!")
         print(f"Validation status: {result['validation_status']}")
@@ -291,7 +298,13 @@ The state manager validates your graph template:
             result = await state_manager.upsert_graph(
                 graph_name="my-workflow",
                 graph_nodes=updated_nodes,
-                secrets=updated_secrets
+                secrets=updated_secrets,
+                retry_policy={
+                    "max_retries": 3,
+                    "strategy": "EXPONENTIAL",
+                    "backoff_factor": 2000,
+                    "exponent": 2
+                }
             )
             print("Graph template updated successfully!")
             print(f"Validation status: {result['validation_status']}")
diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md
index 1dbc6430..a8079d87 100644
--- a/docs/docs/exosphere/retry-policy.md
+++ b/docs/docs/exosphere/retry-policy.md
@@ -41,24 +41,28 @@ Retry policies are defined in your graph template configuration:
 ## Parameters
 
 ### max_retries
+
 - **Type**: `int`
 - **Default**: `3`
 - **Description**: The maximum number of retry attempts before giving up
 - **Constraints**: Must be >= 0
 
 ### strategy
+
 - **Type**: `string`
 - **Default**: `"EXPONENTIAL"`
 - **Description**: The retry strategy to use for calculating delays
 - **Options**: See [Retry Strategies](#retry-strategies) below
 
 ### backoff_factor
-- **Type**: `int`
+
+- **Type**: `int` (milliseconds)
 - **Default**: `2000` (2 seconds)
 - **Description**: The base delay factor in milliseconds
 - **Constraints**: Must be > 0
 
 ### exponent
+
 - **Type**: `int`
 - **Default**: `2`
 - **Description**: The exponent used for exponential strategies
@@ -73,31 +77,41 @@ Exosphere supports three main categories of retry strategies, each with jitter v
 Exponential strategies increase the delay exponentially with each retry attempt.
 
 #### EXPONENTIAL
+
 Standard exponential backoff without jitter.
 
-**Formula**: `backoff_factor * (exponent ^ retry_count)`
+**Formula**: `backoff_factor * (exponent ^ (retry_count - 1))`
 
 **Example**:
+
 - Retry 1: 2000ms (2 seconds)
 - Retry 2: 4000ms (4 seconds)
 - Retry 3: 8000ms (8 seconds)
 
 #### EXPONENTIAL_FULL_JITTER
+
 Exponential backoff with full jitter (random delay between 0 and calculated delay).
 
-**Formula**: `random(0, backoff_factor * (exponent ^ retry_count))`
+**Formula**: `random(0, backoff_factor * (exponent ^ (retry_count - 1)))`
+
+*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].*
 
 **Example**:
+
 - Retry 1: 0-2000ms (random)
 - Retry 2: 0-4000ms (random)
 - Retry 3: 0-8000ms (random)
 
 #### EXPONENTIAL_EQUAL_JITTER
+
 Exponential backoff with equal jitter (random delay around half the calculated delay).
 
-**Formula**: `(backoff_factor * (exponent ^ retry_count)) / 2 + random(0, (backoff_factor * (exponent ^ retry_count)) / 2)`
+**Formula**: `(backoff_factor * (exponent ^ (retry_count - 1))) / 2 + random(0, (backoff_factor * (exponent ^ (retry_count - 1))) / 2)`
+
+*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].*
 
 **Example**:
+
 - Retry 1: 1000-2000ms (random)
 - Retry 2: 2000-4000ms (random)
 - Retry 3: 4000-8000ms (random)
@@ -107,31 +121,41 @@ Exponential backoff with equal jitter (random delay around half the calculated d
 Linear strategies increase the delay linearly with each retry attempt.
 
 #### LINEAR
+
 Standard linear backoff without jitter.
 
 **Formula**: `backoff_factor * retry_count`
 
 **Example**:
+
 - Retry 1: 2000ms (2 seconds)
 - Retry 2: 4000ms (4 seconds)
 - Retry 3: 6000ms (6 seconds)
 
 #### LINEAR_FULL_JITTER
+
 Linear backoff with full jitter.
 
 **Formula**: `random(0, backoff_factor * retry_count)`
 
+*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].*
+
 **Example**:
+
 - Retry 1: 0-2000ms (random)
 - Retry 2: 0-4000ms (random)
 - Retry 3: 0-6000ms (random)
 
 #### LINEAR_EQUAL_JITTER
+
 Linear backoff with equal jitter.
 
 **Formula**: `(backoff_factor * retry_count) / 2 + random(0, (backoff_factor * retry_count) / 2)`
 
+*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].*
+
 **Example**:
+
 - Retry 1: 1000-2000ms (random)
 - Retry 2: 2000-4000ms (random)
 - Retry 3: 3000-6000ms (random)
@@ -141,31 +165,41 @@ Linear backoff with equal jitter.
 Fixed strategies use a constant delay for all retry attempts.
 
 #### FIXED
+
 Standard fixed delay without jitter.
 
 **Formula**: `backoff_factor`
 
 **Example**:
+
 - Retry 1: 2000ms (2 seconds)
 - Retry 2: 2000ms (2 seconds)
 - Retry 3: 2000ms (2 seconds)
 
 #### FIXED_FULL_JITTER
+
 Fixed delay with full jitter.
 
 **Formula**: `random(0, backoff_factor)`
 
+*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].*
+
 **Example**:
+
 - Retry 1: 0-2000ms (random)
 - Retry 2: 0-2000ms (random)
 - Retry 3: 0-2000ms (random)
 
 #### FIXED_EQUAL_JITTER
+
 Fixed delay with equal jitter.
 
 **Formula**: `backoff_factor / 2 + random(0, backoff_factor / 2)`
 
+*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].*
+
 **Example**:
+
 - Retry 1: 1000-2000ms (random)
 - Retry 2: 1000-2000ms (random)
 - Retry 3: 1000-2000ms (random)
@@ -173,6 +207,7 @@ Fixed delay with equal jitter.
 ## Usage Examples
 
 ### Basic Exponential Retry
+
 ```json
 {
   "retry_policy": {
@@ -185,6 +220,7 @@ Fixed delay with equal jitter.
 ```
 
 ### Aggressive Retry with Jitter
+
 ```json
 {
   "retry_policy": {
@@ -197,6 +233,7 @@ Fixed delay with equal jitter.
 ```
 
 ### Conservative Linear Retry
+
 ```json
 {
   "retry_policy": {
@@ -208,6 +245,7 @@ Fixed delay with equal jitter.
 ```
 
 ### Fixed Retry for Rate Limiting
+
 ```json
 {
   "retry_policy": {
@@ -227,6 +265,7 @@ Retries are automatically triggered when:
 3. The state status is `QUEUED` or `EXECUTED`
 
 The retry mechanism:
+
 - Creates a new state with `retry_count` incremented by 1
 - Sets `enqueue_after` to the current time plus the calculated delay
 - Sets the original state status to `ERRORED` with the error message
@@ -234,21 +273,25 @@ The retry mechanism:
 ## Best Practices
 
 ### Choose the Right Strategy
+
 - **EXPONENTIAL**: Best for most transient failures (network issues, temporary service unavailability)
 - **LINEAR**: Good for predictable, consistent delays
 - **FIXED**: Useful for rate limiting scenarios
 
 ### Use Jitter for High Concurrency
+
 - **FULL_JITTER**: Best for high concurrency to prevent thundering herd
 - **EQUAL_JITTER**: Good balance between predictability and randomization
 - **No Jitter**: Use only when you need deterministic behavior
 
 ### Set Appropriate Limits
+
 - **max_retries**: Consider the nature of your failures and downstream dependencies
 - **backoff_factor**: Balance between responsiveness and resource usage
 - **exponent**: Higher values create more aggressive backoff
 
 ### Monitor Retry Patterns
+
 - Track retry counts in your monitoring system
 - Set up alerts for graphs with high retry rates
 - Analyze retry patterns to identify systemic issues
@@ -263,14 +306,15 @@ The retry mechanism:
 ## Error Handling
 
 If a retry policy configuration is invalid:
+
 - The graph template validation will fail
 - An error will be returned during graph creation
 - The graph will not be saved until the configuration is corrected
 
 ## Integration with Signals
 
-Retry policies work alongside Exosphere's signal system:
+Retry policies work alongside Exosphere's signaling system:
 
 - Nodes can still raise `PruneSignal` to stop retries immediately
-- Nodes can raise `ReQueueAfterSignal` to re-queue after sometime, this will not mark nodes as failure.
-- The retry count is preserved when using signals 
\ No newline at end of file
+- Nodes can raise `ReQueueAfterSignal` to re-queue after some time. This will not mark nodes as failures.
+- The retry count is preserved when using signals. 
\ No newline at end of file

From 6e02d5022023336e421dc67a31665a0e7b2f080c Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 13:51:21 +0530
Subject: [PATCH 06/20] Enhance retry policy implementation and documentation

- Added the `max_delay` parameter to the retry policy model, allowing users to cap the maximum delay for retry attempts.
- Updated the documentation to include detailed explanations of the `max_delay` parameter and its usage in retry strategies.
- Improved error handling in the `errored_state` function to log errors when fetching graph templates and raise appropriate HTTP exceptions.
- Refactored the `GraphTemplate` model to raise a ValueError instead of an HTTPException when a graph template is not found, enhancing error handling consistency.
- Updated the `compute_delay` method in the `RetryPolicyModel` to apply the new delay capping logic across all retry strategies.
---
 docs/docs/exosphere/retry-policy.md           | 79 ++++++++++++++++++-
 state-manager/app/controller/errored_state.py |  8 +-
 .../app/models/db/graph_template_model.py     |  4 +-
 .../app/models/retry_policy_model.py          | 25 +++---
 4 files changed, 101 insertions(+), 15 deletions(-)

diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md
index a8079d87..d1b0aef3 100644
--- a/docs/docs/exosphere/retry-policy.md
+++ b/docs/docs/exosphere/retry-policy.md
@@ -33,7 +33,8 @@ Retry policies are defined in your graph template configuration:
     "max_retries": 3,
     "strategy": "EXPONENTIAL",
     "backoff_factor": 2000,
-    "exponent": 2
+    "exponent": 2,
+    "max_delay": 3600000
   }
 }
 ```
@@ -68,6 +69,14 @@ Retry policies are defined in your graph template configuration:
 - **Description**: The exponent used for exponential strategies
 - **Constraints**: Must be > 0
 
+### max_delay
+
+- **Type**: `int | null` (milliseconds)
+- **Default**: `null` (no maximum delay)
+- **Description**: The maximum delay in milliseconds that any retry attempt can have. When set, all calculated delays are capped at this value using the `_cap` function
+- **Constraints**: Must be > 0 when not null
+- **Example**: `3600000` (1 hour) would cap all delays to a maximum of 1 hour
+
 ## Retry Strategies
 
 Exosphere supports three main categories of retry strategies, each with jitter variants to prevent thundering herd problems.
@@ -204,6 +213,43 @@ Fixed delay with equal jitter.
 - Retry 2: 1000-2000ms (random)
 - Retry 3: 1000-2000ms (random)
 
+## Delay Capping
+
+The retry policy includes a built-in delay capping mechanism through the `_cap` function and `max_delay` parameter. This ensures that retry delays never exceed a specified maximum value, even with aggressive exponential backoff strategies.
+
+### How Delay Capping Works
+
+The `_cap` function is applied to all calculated delays:
+
+```python
+def _cap(value: int) -> int:
+    if self.max_delay is not None:
+        return min(value, self.max_delay)
+    return value
+```
+
+**Behavior:**
+- If `max_delay` is set, all calculated delays are capped at this value
+- If `max_delay` is `null` (default), no capping is applied
+- The capping is applied after all strategy calculations.
+
+### Example with Delay Capping
+
+Consider an exponential strategy with `backoff_factor: 2000`, `exponent: 2`, and `max_delay: 10000`:
+
+**With capping:**
+- Retry 1: 2000ms
+- Retry 2: 4000ms
+- Retry 3: 8000ms
+- Retry 4: 10000ms (capped at max_delay)
+
+### When to Use Delay Capping
+
+- **Long-running workflows**: Prevent excessive delays that could impact overall workflow completion time
+- **User-facing applications**: Ensure retries don't create unacceptable wait times
+- **Resource management**: Control resource consumption by limiting retry delays
+- **Predictable behavior**: Create more predictable retry patterns for monitoring and alerting
+
 ## Usage Examples
 
 ### Basic Exponential Retry
@@ -256,6 +302,34 @@ Fixed delay with equal jitter.
 }
 ```
 
+### Exponential Retry with Delay Capping
+
+```json
+{
+  "retry_policy": {
+    "max_retries": 5,
+    "strategy": "EXPONENTIAL",
+    "backoff_factor": 2000,
+    "exponent": 2,
+    "max_delay": 30000
+  }
+}
+```
+
+### Conservative Retry with Maximum Delay
+
+```json
+{
+  "retry_policy": {
+    "max_retries": 3,
+    "strategy": "EXPONENTIAL_FULL_JITTER",
+    "backoff_factor": 1000,
+    "exponent": 3,
+    "max_delay": 60000
+  }
+}
+```
+
 ## When Retries Are Triggered
 
 Retries are automatically triggered when:
@@ -289,6 +363,7 @@ The retry mechanism:
 - **max_retries**: Consider the nature of your failures and downstream dependencies
 - **backoff_factor**: Balance between responsiveness and resource usage
 - **exponent**: Higher values create more aggressive backoff
+- **max_delay**: Set a reasonable maximum delay to prevent excessive wait times, especially for exponential strategies
 
 ### Monitor Retry Patterns
 
@@ -301,7 +376,7 @@ The retry mechanism:
 - Retry policies apply to all nodes in a graph uniformly
 - Individual node-level retry policies are not supported
 - Retry delays are calculated in milliseconds
-- Maximum delay is not capped (consider using reasonable `backoff_factor` and `exponent` values)
+- Maximum delay can be capped using the `max_delay` parameter (recommended for long-running workflows)
 
 ## Error Handling
 
diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py
index 8ceb522d..1fd56021 100644
--- a/state-manager/app/controller/errored_state.py
+++ b/state-manager/app/controller/errored_state.py
@@ -26,7 +26,13 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E
         if state.status == StateStatusEnum.EXECUTED:
             raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="State is already executed")
         
-        graph_template = await GraphTemplate.get(namespace_name, state.graph_name)
+        try:
+            graph_template = await GraphTemplate.get(namespace_name, state.graph_name)
+        except Exception as e:
+            logger.error(f"Error getting graph template {state.graph_name} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id, error=e)
+            if "Graph template not found" in str(e):
+                raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found")
+            raise e
 
         retry_created = False
 
diff --git a/state-manager/app/models/db/graph_template_model.py b/state-manager/app/models/db/graph_template_model.py
index ee0323d8..4b3a4731 100644
--- a/state-manager/app/models/db/graph_template_model.py
+++ b/state-manager/app/models/db/graph_template_model.py
@@ -5,8 +5,6 @@
 from pymongo import IndexModel
 from pydantic import Field, field_validator, PrivateAttr, model_validator
 from typing import List, Self, Dict
-from fastapi.exceptions import HTTPException
-from fastapi import status
 
 from .base import BaseDatabaseModel
 from ..graph_template_validation_status import GraphTemplateValidationStatus
@@ -305,7 +303,7 @@ def get_path_by_identifier(self, identifier: str) -> set[str]:
     async def get(namespace: str, graph_name: str) -> "GraphTemplate":
         graph_template = await GraphTemplate.find_one(GraphTemplate.namespace == namespace, GraphTemplate.name == graph_name)
         if not graph_template:
-            raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found")
+            raise ValueError(f"Graph template not found for namespace: {namespace} and graph name: {graph_name}")
         return graph_template
     
     @staticmethod
diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py
index 5fe92409..9c695bfe 100644
--- a/state-manager/app/models/retry_policy_model.py
+++ b/state-manager/app/models/retry_policy_model.py
@@ -20,43 +20,50 @@ class RetryPolicyModel(BaseModel):
     strategy: RetryStrategy = Field(default=RetryStrategy.EXPONENTIAL, description="The method of retry")
     backoff_factor: int = Field(default=2000, description="The backoff factor in milliseconds (default: 2000 = 2 seconds)", gt=0)
     exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0)
+    max_delay: int | None = Field(default=None, description="The maximum delay in milliseconds (default: 3600000 = 1 hour)", gt=0)
 
     def compute_delay(self, retry_count: int) -> int:
+
+        def _cap(value: int) -> int:
+            if self.max_delay is not None:
+                return min(value, self.max_delay)
+            return value
+
         if retry_count < 1:
             raise ValueError(f"Retry count must be greater than 1, got {retry_count}")
         
         if self.strategy == RetryStrategy.EXPONENTIAL:
-            return (self.backoff_factor * (self.exponent ** (retry_count - 1)))
+            return _cap(self.backoff_factor * (self.exponent ** (retry_count - 1)))
         
         elif self.strategy == RetryStrategy.EXPONENTIAL_FULL_JITTER:
             base = self.backoff_factor * (self.exponent ** (retry_count - 1))
-            return int(random.uniform(0, base))
+            return _cap(int(random.uniform(0, base)))
         
         elif self.strategy == RetryStrategy.EXPONENTIAL_EQUAL_JITTER:
             base = self.backoff_factor * (self.exponent ** (retry_count - 1))
-            return int(base/2 + random.uniform(0, base / 2))
+            return _cap(int(base/2 + random.uniform(0, base / 2)))
         
         elif self.strategy == RetryStrategy.LINEAR:
-            return (self.backoff_factor * retry_count)
+            return _cap(self.backoff_factor * retry_count)
         
         elif self.strategy == RetryStrategy.LINEAR_FULL_JITTER:
             base = self.backoff_factor * retry_count
-            return int(random.uniform(0, base))
+            return _cap(int(random.uniform(0, base)))
 
         elif self.strategy == RetryStrategy.LINEAR_EQUAL_JITTER:
             base = self.backoff_factor * retry_count
-            return int(base/2 + random.uniform(0, base / 2))
+            return _cap(int(base/2 + random.uniform(0, base / 2)))
 
         elif self.strategy == RetryStrategy.FIXED:
-            return self.backoff_factor
+            return _cap(self.backoff_factor)
         
         elif self.strategy == RetryStrategy.FIXED_FULL_JITTER:
             base = self.backoff_factor
-            return int(random.uniform(0, base))
+            return _cap(int(random.uniform(0, base)))
 
         elif self.strategy == RetryStrategy.FIXED_EQUAL_JITTER:
             base = self.backoff_factor
-            return int(base/2 + random.uniform(0, base / 2))
+            return _cap(int(base/2 + random.uniform(0, base / 2)))
 
         else:
             raise ValueError(f"Invalid retry strategy: {self.strategy}")
\ No newline at end of file

From 64592d8a4bc7e236dc06b5ba3af1956e2ca6a158 Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 14:07:57 +0530
Subject: [PATCH 07/20] Enhance errored state handling with retry state
 management

- Added error handling for duplicate retry states in the `errored_state` function, logging when a retry state already exists.
- Introduced a new `fanout_id` field in the `State` model to support unique identification of retry states.
- Updated the database index to enforce uniqueness on the combination of relevant state fields, improving data integrity and query performance.
---
 state-manager/app/controller/errored_state.py | 42 +++++++++++--------
 state-manager/app/models/db/state.py          | 17 +++++++-
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py
index 1fd56021..47f1db50 100644
--- a/state-manager/app/controller/errored_state.py
+++ b/state-manager/app/controller/errored_state.py
@@ -3,6 +3,7 @@
 from app.models.errored_models import ErroredRequestModel, ErroredResponseModel
 from fastapi import HTTPException, status
 from beanie import PydanticObjectId
+from pymongo.errors import DuplicateKeyError
 
 from app.models.db.state import State
 from app.models.state_status_enum import StateStatusEnum
@@ -37,24 +38,29 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E
         retry_created = False
 
         if state.retry_count < graph_template.retry_policy.max_retries:
-            retry_state = State(
-                node_name=state.node_name,
-                namespace_name=state.namespace_name,
-                identifier=state.identifier,
-                graph_name=state.graph_name,
-                run_id=state.run_id,
-                status=StateStatusEnum.CREATED,
-                inputs=state.inputs,
-                outputs={},
-                error=None,
-                parents=state.parents,
-                does_unites=state.does_unites,
-                enqueue_after= int(time.time() * 1000) + graph_template.retry_policy.compute_delay(state.retry_count + 1),
-                retry_count=state.retry_count + 1
-            )
-            retry_state = await retry_state.insert()
-            logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id)
-            retry_created = True
+            try:
+                retry_state = State(
+                    node_name=state.node_name,
+                    namespace_name=state.namespace_name,
+                    identifier=state.identifier,
+                    graph_name=state.graph_name,
+                    run_id=state.run_id,
+                    status=StateStatusEnum.CREATED,
+                    inputs=state.inputs,
+                    outputs={},
+                    error=None,
+                    parents=state.parents,
+                    does_unites=state.does_unites,
+                    enqueue_after= int(time.time() * 1000) + graph_template.retry_policy.compute_delay(state.retry_count + 1),
+                    retry_count=state.retry_count + 1,
+                    fanout_id=state.fanout_id
+                )
+                retry_state = await retry_state.insert()
+                logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id)
+                retry_created = True
+            except DuplicateKeyError:
+                logger.info(f"Retry state {retry_state.id} already exists for state {state_id}", x_exosphere_request_id=x_exosphere_request_id)
+                retry_created = True
 
         state.status = StateStatusEnum.ERRORED
         state.error = body.error
diff --git a/state-manager/app/models/db/state.py b/state-manager/app/models/db/state.py
index b883ec65..0747d408 100644
--- a/state-manager/app/models/db/state.py
+++ b/state-manager/app/models/db/state.py
@@ -8,6 +8,7 @@
 import hashlib
 import json
 import time
+import uuid
 
 class State(BaseDatabaseModel):
     node_name: str = Field(..., description="Name of the node of the state")
@@ -25,7 +26,8 @@ class State(BaseDatabaseModel):
     state_fingerprint: str = Field(default="", description="Fingerprint of the state")
     enqueue_after: int = Field(default_factory=lambda: int(time.time() * 1000), gt=0, description="Unix time in milliseconds after which the state should be enqueued")
     retry_count: int = Field(default=0, description="Number of times the state has been retried")
-    
+    fanout_id: str = Field(default=str(uuid.uuid4()), description="Fanout ID of the state")
+
     @before_event([Insert, Replace, Save])
     def _generate_fingerprint(self):
         if not self.does_unites:
@@ -78,5 +80,18 @@ class Settings:
                     ("node_name", 1),
                 ],
                 name="enqueue_query"
+            ),
+            IndexModel(
+                [
+                    ("node_name", 1),
+                    ("namespace_name", 1),
+                    ("graph_name", 1),
+                    ("identifier", 1),
+                    ("run_id", 1),
+                    ("retry_count", 1),
+                    ("fanout_id", 1),
+                ],
+                unique=True,
+                name="uniq_fanout_retry"
             )
         ]
\ No newline at end of file

From 33e75d3fd200f79e81e54e8d366281011888f7cc Mon Sep 17 00:00:00 2001
From: Nivedit Jain <niveditjain@outlook.com>
Date: Sun, 31 Aug 2025 14:11:11 +0530
Subject: [PATCH 08/20] Update state-manager/app/models/db/state.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 state-manager/app/models/db/state.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/state-manager/app/models/db/state.py b/state-manager/app/models/db/state.py
index 0747d408..05441ec3 100644
--- a/state-manager/app/models/db/state.py
+++ b/state-manager/app/models/db/state.py
@@ -26,7 +26,7 @@ class State(BaseDatabaseModel):
     state_fingerprint: str = Field(default="", description="Fingerprint of the state")
     enqueue_after: int = Field(default_factory=lambda: int(time.time() * 1000), gt=0, description="Unix time in milliseconds after which the state should be enqueued")
     retry_count: int = Field(default=0, description="Number of times the state has been retried")
-    fanout_id: str = Field(default=str(uuid.uuid4()), description="Fanout ID of the state")
+    fanout_id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Fanout ID of the state")
 
     @before_event([Insert, Replace, Save])
     def _generate_fingerprint(self):

From 0c16bd75eb5655cb2e3c1fb81b253957659ef9ca Mon Sep 17 00:00:00 2001
From: Nivedit Jain <niveditjain@outlook.com>
Date: Sun, 31 Aug 2025 14:11:25 +0530
Subject: [PATCH 09/20] Update docs/docs/exosphere/retry-policy.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 docs/docs/exosphere/retry-policy.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md
index d1b0aef3..8089502b 100644
--- a/docs/docs/exosphere/retry-policy.md
+++ b/docs/docs/exosphere/retry-policy.md
@@ -336,7 +336,7 @@ Retries are automatically triggered when:
 
 1. A node execution fails with an error
 2. The current retry count is less than `max_retries`
-3. The state status is `QUEUED` or `EXECUTED`
+3. The state status is `QUEUED`
 
 The retry mechanism:
 

From b0cabb02a96f3bd29e66f5e88bfdbbb023de87f6 Mon Sep 17 00:00:00 2001
From: Nivedit Jain <niveditjain@outlook.com>
Date: Sun, 31 Aug 2025 14:11:49 +0530
Subject: [PATCH 10/20] Update docs/docs/exosphere/retry-policy.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 docs/docs/exosphere/retry-policy.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md
index 8089502b..87257bcd 100644
--- a/docs/docs/exosphere/retry-policy.md
+++ b/docs/docs/exosphere/retry-policy.md
@@ -392,4 +392,4 @@ Retry policies work alongside Exosphere's signaling system:
 
 - Nodes can still raise `PruneSignal` to stop retries immediately
 - Nodes can raise `ReQueueAfterSignal` to re-queue after some time. This will not mark nodes as failures.
-- The retry count is preserved when using signals. 
\ No newline at end of file
+- When a node is re-queued using `ReQueueAfterSignal`, the `retry_count` is not incremented. The existing count is carried over to the new state.
\ No newline at end of file

From 74da1b8c8f342a53f57db3c31c43f07cd004905f Mon Sep 17 00:00:00 2001
From: Nivedit Jain <niveditjain@outlook.com>
Date: Sun, 31 Aug 2025 14:12:03 +0530
Subject: [PATCH 11/20] Update state-manager/app/controller/errored_state.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 state-manager/app/controller/errored_state.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py
index 47f1db50..2faab448 100644
--- a/state-manager/app/controller/errored_state.py
+++ b/state-manager/app/controller/errored_state.py
@@ -31,7 +31,7 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E
             graph_template = await GraphTemplate.get(namespace_name, state.graph_name)
         except Exception as e:
             logger.error(f"Error getting graph template {state.graph_name} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id, error=e)
-            if "Graph template not found" in str(e):
+            if isinstance(e, ValueError) and "Graph template not found" in str(e):
                 raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found")
             raise e
 

From 06ee0a3642ada9954e7f8b77e573a73aeb7afeb7 Mon Sep 17 00:00:00 2001
From: Nivedit Jain <niveditjain@outlook.com>
Date: Sun, 31 Aug 2025 14:12:14 +0530
Subject: [PATCH 12/20] Update state-manager/app/controller/errored_state.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 state-manager/app/controller/errored_state.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py
index 2faab448..6645dac9 100644
--- a/state-manager/app/controller/errored_state.py
+++ b/state-manager/app/controller/errored_state.py
@@ -59,7 +59,7 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E
                 logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id)
                 retry_created = True
             except DuplicateKeyError:
-                logger.info(f"Retry state {retry_state.id} already exists for state {state_id}", x_exosphere_request_id=x_exosphere_request_id)
+                logger.info(f"Duplicate retry state detected for state {state_id}. A retry state with the same unique key already exists.", x_exosphere_request_id=x_exosphere_request_id)
                 retry_created = True
 
         state.status = StateStatusEnum.ERRORED

From fded75b13c7bb6c968975d8937ff98c1eb792c03 Mon Sep 17 00:00:00 2001
From: Nivedit Jain <niveditjain@outlook.com>
Date: Sun, 31 Aug 2025 14:12:22 +0530
Subject: [PATCH 13/20] Update state-manager/app/models/retry_policy_model.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 state-manager/app/models/retry_policy_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py
index 9c695bfe..61da87a3 100644
--- a/state-manager/app/models/retry_policy_model.py
+++ b/state-manager/app/models/retry_policy_model.py
@@ -30,7 +30,7 @@ def _cap(value: int) -> int:
             return value
 
         if retry_count < 1:
-            raise ValueError(f"Retry count must be greater than 1, got {retry_count}")
+            raise ValueError(f"Retry count must be greater than or equal to 1, got {retry_count}")
         
         if self.strategy == RetryStrategy.EXPONENTIAL:
             return _cap(self.backoff_factor * (self.exponent ** (retry_count - 1)))

From e56d5f5da2ba2bc5b2b3a81278c19f807c85229a Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 14:26:03 +0530
Subject: [PATCH 14/20] Update max_delay description in RetryPolicyModel to
 clarify behavior when set to None

---
 state-manager/app/models/retry_policy_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py
index 61da87a3..be719176 100644
--- a/state-manager/app/models/retry_policy_model.py
+++ b/state-manager/app/models/retry_policy_model.py
@@ -20,7 +20,7 @@ class RetryPolicyModel(BaseModel):
     strategy: RetryStrategy = Field(default=RetryStrategy.EXPONENTIAL, description="The method of retry")
     backoff_factor: int = Field(default=2000, description="The backoff factor in milliseconds (default: 2000 = 2 seconds)", gt=0)
     exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0)
-    max_delay: int | None = Field(default=None, description="The maximum delay in milliseconds (default: 3600000 = 1 hour)", gt=0)
+    max_delay: int | None = Field(default=None, description="The maximum delay in milliseconds (no default limit when None)", gt=0)
 
     def compute_delay(self, retry_count: int) -> int:
 

From 42efd6850a647b42c6c60ac60c6b255f999cab4d Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 14:32:23 +0530
Subject: [PATCH 15/20] Refine documentation for retry policy and errored state
 handling

- Added missing newlines in the retry policy documentation for better readability.
- Removed redundant assignment in the errored state handling to streamline the code logic.
---
 docs/docs/exosphere/retry-policy.md           | 4 +++-
 state-manager/app/controller/errored_state.py | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md
index 87257bcd..639306dc 100644
--- a/docs/docs/exosphere/retry-policy.md
+++ b/docs/docs/exosphere/retry-policy.md
@@ -229,6 +229,7 @@ def _cap(value: int) -> int:
 ```
 
 **Behavior:**
+
 - If `max_delay` is set, all calculated delays are capped at this value
 - If `max_delay` is `null` (default), no capping is applied
 - The capping is applied after all strategy calculations.
@@ -238,6 +239,7 @@ def _cap(value: int) -> int:
 Consider an exponential strategy with `backoff_factor: 2000`, `exponent: 2`, and `max_delay: 10000`:
 
 **With capping:**
+
 - Retry 1: 2000ms
 - Retry 2: 4000ms
 - Retry 3: 8000ms
@@ -392,4 +394,4 @@ Retry policies work alongside Exosphere's signaling system:
 
 - Nodes can still raise `PruneSignal` to stop retries immediately
 - Nodes can raise `ReQueueAfterSignal` to re-queue after some time. This will not mark nodes as failures.
-- When a node is re-queued using `ReQueueAfterSignal`, the `retry_count` is not incremented. The existing count is carried over to the new state.
\ No newline at end of file
+- When a node is re-queued using `ReQueueAfterSignal`, the `retry_count` is not incremented. The existing count is carried over to the new state.
diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py
index 6645dac9..f798cec8 100644
--- a/state-manager/app/controller/errored_state.py
+++ b/state-manager/app/controller/errored_state.py
@@ -60,7 +60,6 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E
                 retry_created = True
             except DuplicateKeyError:
                 logger.info(f"Duplicate retry state detected for state {state_id}. A retry state with the same unique key already exists.", x_exosphere_request_id=x_exosphere_request_id)
-                retry_created = True
 
         state.status = StateStatusEnum.ERRORED
         state.error = body.error

From 4ec30abd4c6ca679c398eb035045ef761a617e5a Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 14:54:02 +0530
Subject: [PATCH 16/20] Enhance tests for errored state and upsert graph
 template

- Added retry policy integration in the test cases for errored state handling and upsert graph template.
- Updated test fixtures to include additional state attributes for better simulation of state behavior.
- Improved error handling assertions in the errored state tests to ensure proper HTTP exceptions are raised.
- Mocked retry policy in graph template tests to validate retry behavior during upsert operations.
---
 .../unit/controller/test_errored_state.py     |  94 +++--
 .../controller/test_upsert_graph_template.py  |  33 ++
 .../unit/models/test_retry_policy_model.py    | 378 ++++++++++++++++++
 3 files changed, 478 insertions(+), 27 deletions(-)
 create mode 100644 state-manager/tests/unit/models/test_retry_policy_model.py

diff --git a/state-manager/tests/unit/controller/test_errored_state.py b/state-manager/tests/unit/controller/test_errored_state.py
index b1fc7df5..df1d5526 100644
--- a/state-manager/tests/unit/controller/test_errored_state.py
+++ b/state-manager/tests/unit/controller/test_errored_state.py
@@ -34,6 +34,16 @@ def mock_state_queued(self):
         state = MagicMock()
         state.id = PydanticObjectId()
         state.status = StateStatusEnum.QUEUED
+        state.graph_name = "test_graph"
+        state.retry_count = 0
+        state.node_name = "test_node"
+        state.namespace_name = "test_namespace"
+        state.identifier = "test_identifier"
+        state.run_id = "test_run_id"
+        state.inputs = {}
+        state.parents = []
+        state.does_unites = False
+        state.fanout_id = None
         return state
 
     @pytest.fixture
@@ -41,11 +51,23 @@ def mock_state_executed(self):
         state = MagicMock()
         state.id = PydanticObjectId()
         state.status = StateStatusEnum.EXECUTED
+        state.graph_name = "test_graph"
+        state.retry_count = 0
+        state.node_name = "test_node"
+        state.namespace_name = "test_namespace"
+        state.identifier = "test_identifier"
+        state.run_id = "test_run_id"
+        state.inputs = {}
+        state.parents = []
+        state.does_unites = False
+        state.fanout_id = None
         return state
 
     @patch('app.controller.errored_state.State')
+    @patch('app.controller.errored_state.GraphTemplate')
     async def test_errored_state_success_queued(
         self,
+        mock_graph_template_class,
         mock_state_class,
         mock_namespace,
         mock_state_id,
@@ -55,10 +77,18 @@ async def test_errored_state_success_queued(
     ):
         """Test successful error marking of queued state"""      
         
-        mock_state_queued.save = AsyncMock()     
+        # Mock GraphTemplate.get to return a valid graph template
+        mock_graph_template = MagicMock()
+        mock_graph_template.retry_policy.max_retries = 3
+        mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000)
+        mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template)
         
-        mock_state_queued.status = StateStatusEnum.QUEUED
-        mock_state_queued.save = AsyncMock()
+        # Mock State constructor and insert method
+        mock_retry_state = MagicMock()
+        mock_retry_state.insert = AsyncMock(return_value=mock_retry_state)
+        mock_state_class.return_value = mock_retry_state
+        
+        mock_state_queued.save = AsyncMock()     
         mock_state_class.find_one = AsyncMock(return_value=mock_state_queued)
 
         # Act
@@ -75,8 +105,10 @@ async def test_errored_state_success_queued(
         
 
     @patch('app.controller.errored_state.State')
+    @patch('app.controller.errored_state.GraphTemplate')
     async def test_errored_state_success_executed(
         self,
+        mock_graph_template_class,
         mock_state_class,
         mock_namespace,
         mock_state_id,
@@ -84,26 +116,22 @@ async def test_errored_state_success_executed(
         mock_state_executed,
         mock_request_id
     ):
-        """Test successful error marking of executed state"""
-      
-        mock_state_executed.save = AsyncMock() 
-
-        mock_state_executed.status = StateStatusEnum.QUEUED
-        mock_state_executed.save = AsyncMock()
+        """Test that executed states cannot be marked as errored"""
+        # Arrange
+        mock_state_executed.status = StateStatusEnum.EXECUTED
         mock_state_class.find_one = AsyncMock(return_value=mock_state_executed)
 
-        # Act
-        result = await errored_state(
-            mock_namespace,
-            mock_state_id,
-            mock_errored_request,
-            mock_request_id
-        )
-
-        # Assert
-        assert result.status == StateStatusEnum.ERRORED
-        assert mock_state_class.find_one.call_count == 1  # Called once for finding
+        # Act & Assert
+        with pytest.raises(HTTPException) as exc_info:
+            await errored_state(
+                mock_namespace,
+                mock_state_id,
+                mock_errored_request,
+                mock_request_id
+            )
         
+        assert exc_info.value.status_code == status.HTTP_400_BAD_REQUEST
+        assert exc_info.value.detail == "State is already executed"
 
     @patch('app.controller.errored_state.State')
     async def test_errored_state_not_found(
@@ -197,6 +225,7 @@ async def test_errored_state_already_executed(
         # Arrange
         mock_state = MagicMock()
         mock_state.status = StateStatusEnum.EXECUTED
+        mock_state.graph_name = "test_graph"
         mock_state_class.find_one = AsyncMock(return_value=mock_state)
 
         # Act & Assert
@@ -222,7 +251,7 @@ async def test_errored_state_database_error(
     ):
         """Test handling of database errors"""
         # Arrange
-        mock_state_class.find_one = MagicMock(side_effect=Exception("Database error"))
+        mock_state_class.find_one = AsyncMock(side_effect=Exception("Database error"))
 
         # Act & Assert
         with pytest.raises(Exception) as exc_info:
@@ -236,31 +265,42 @@ async def test_errored_state_database_error(
         assert str(exc_info.value) == "Database error"
 
     @patch('app.controller.errored_state.State')
+    @patch('app.controller.errored_state.GraphTemplate')
     async def test_errored_state_with_different_error_message(
         self,
+        mock_graph_template_class,
         mock_state_class,
         mock_namespace,
         mock_state_id,
+        mock_errored_request,
         mock_state_queued,
         mock_request_id
     ):
         """Test error marking with different error message"""
         # Arrange
-        errored_request = ErroredRequestModel(
+        different_error_request = ErroredRequestModel(
             error="Different error message"
-        )       
+        )
+        
+        # Mock GraphTemplate.get to return a valid graph template
+        mock_graph_template = MagicMock()
+        mock_graph_template.retry_policy.max_retries = 3
+        mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000)
+        mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template)
+        
+        # Mock State constructor and insert method
+        mock_retry_state = MagicMock()
+        mock_retry_state.insert = AsyncMock(return_value=mock_retry_state)
+        mock_state_class.return_value = mock_retry_state
         
         mock_state_queued.save = AsyncMock()
-
-        mock_state_queued.status = StateStatusEnum.QUEUED
-        mock_state_queued.set = AsyncMock()
         mock_state_class.find_one = AsyncMock(return_value=mock_state_queued)
 
         # Act
         result = await errored_state(
             mock_namespace,
             mock_state_id,
-            errored_request,
+            different_error_request,
             mock_request_id
         )
 
diff --git a/state-manager/tests/unit/controller/test_upsert_graph_template.py b/state-manager/tests/unit/controller/test_upsert_graph_template.py
index 2d309376..00832b7f 100644
--- a/state-manager/tests/unit/controller/test_upsert_graph_template.py
+++ b/state-manager/tests/unit/controller/test_upsert_graph_template.py
@@ -7,6 +7,7 @@
 from app.models.graph_models import UpsertGraphTemplateRequest
 from app.models.graph_template_validation_status import GraphTemplateValidationStatus
 from app.models.node_template_model import NodeTemplate
+from app.models.retry_policy_model import RetryPolicyModel
 
 
 class TestUpsertGraphTemplate:
@@ -74,6 +75,14 @@ def mock_existing_template(self, mock_nodes, mock_secrets):
         template.updated_at = datetime(2023, 1, 2, 12, 0, 0)
         template.get_secrets.return_value = mock_secrets
         template.set_secrets.return_value = template
+        
+        # Add proper retry_policy using real RetryPolicyModel
+        template.retry_policy = RetryPolicyModel(
+            max_retries=3,
+            backoff_factor=1000,
+            max_delay=30000
+        )
+        
         return template
 
     @patch('app.controller.upsert_graph_template.GraphTemplate')
@@ -148,6 +157,14 @@ async def test_upsert_graph_template_create_new(
         mock_new_template.get_secrets.return_value = mock_upsert_request.secrets
         mock_new_template.set_secrets.return_value = mock_new_template
         
+        # Add proper retry_policy mock
+        mock_retry_policy = RetryPolicyModel(
+            max_retries=3,
+            backoff_factor=1000,
+            max_delay=30000
+        )
+        mock_new_template.retry_policy = mock_retry_policy
+        
         mock_graph_template_class.insert = AsyncMock(return_value=mock_new_template)
 
         # Act
@@ -225,6 +242,14 @@ async def test_upsert_graph_template_with_empty_nodes(
         mock_existing_template.get_secrets.return_value = {}
         mock_existing_template.set_secrets.return_value = mock_existing_template
         
+        # Add proper retry_policy mock
+        mock_retry_policy = RetryPolicyModel(
+            max_retries=3,
+            backoff_factor=1000,
+            max_delay=30000
+        )
+        mock_existing_template.retry_policy = mock_retry_policy
+        
         mock_existing_template.update = AsyncMock()
 
         mock_graph_template_class.find_one = AsyncMock(return_value=mock_existing_template)
@@ -269,6 +294,14 @@ async def test_upsert_graph_template_with_validation_errors(
         mock_existing_template.get_secrets.return_value = mock_upsert_request.secrets
         mock_existing_template.set_secrets.return_value = mock_existing_template
 
+        # Add proper retry_policy mock
+        mock_retry_policy = RetryPolicyModel(
+            max_retries=3,
+            backoff_factor=1000,
+            max_delay=30000
+        )
+        mock_existing_template.retry_policy = mock_retry_policy
+
         mock_existing_template.update = AsyncMock()
         
         mock_graph_template_class.find_one = AsyncMock(return_value=mock_existing_template)
diff --git a/state-manager/tests/unit/models/test_retry_policy_model.py b/state-manager/tests/unit/models/test_retry_policy_model.py
new file mode 100644
index 00000000..0ec7c5b8
--- /dev/null
+++ b/state-manager/tests/unit/models/test_retry_policy_model.py
@@ -0,0 +1,378 @@
+import pytest
+import random
+from app.models.retry_policy_model import RetryPolicyModel, RetryStrategy
+
+
+class TestRetryPolicyModel:
+    """Test cases for RetryPolicyModel"""
+
+    def test_default_initialization(self):
+        """Test RetryPolicyModel with default values"""
+        policy = RetryPolicyModel()
+        
+        assert policy.max_retries == 3
+        assert policy.strategy == RetryStrategy.EXPONENTIAL
+        assert policy.backoff_factor == 2000
+        assert policy.exponent == 2
+        assert policy.max_delay is None
+
+    def test_custom_initialization(self):
+        """Test RetryPolicyModel with custom values"""
+        policy = RetryPolicyModel(
+            max_retries=5,
+            strategy=RetryStrategy.LINEAR,
+            backoff_factor=1000,
+            exponent=3,
+            max_delay=10000
+        )
+        
+        assert policy.max_retries == 5
+        assert policy.strategy == RetryStrategy.LINEAR
+        assert policy.backoff_factor == 1000
+        assert policy.exponent == 3
+        assert policy.max_delay == 10000
+
+    def test_exponential_strategy(self):
+        """Test exponential retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL,
+            backoff_factor=1000,
+            exponent=2
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert delay == 1000  # 1000 * 2^0
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert delay == 2000  # 1000 * 2^1
+        
+        # Test retry count 3
+        delay = policy.compute_delay(3)
+        assert delay == 4000  # 1000 * 2^2
+
+    def test_exponential_strategy_with_max_delay(self):
+        """Test exponential retry strategy with max delay cap"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL,
+            backoff_factor=1000,
+            exponent=2,
+            max_delay=3000
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert delay == 1000  # 1000 * 2^0
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert delay == 2000  # 1000 * 2^1
+        
+        # Test retry count 3 (should be capped at max_delay)
+        delay = policy.compute_delay(3)
+        assert delay == 3000  # Capped at max_delay
+
+    def test_linear_strategy(self):
+        """Test linear retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.LINEAR,
+            backoff_factor=1000
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert delay == 1000  # 1000 * 1
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert delay == 2000  # 1000 * 2
+        
+        # Test retry count 3
+        delay = policy.compute_delay(3)
+        assert delay == 3000  # 1000 * 3
+
+    def test_fixed_strategy(self):
+        """Test fixed retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.FIXED,
+            backoff_factor=1000
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert delay == 1000  # Always 1000
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert delay == 1000  # Always 1000
+        
+        # Test retry count 3
+        delay = policy.compute_delay(3)
+        assert delay == 1000  # Always 1000
+
+    def test_exponential_full_jitter_strategy(self):
+        """Test exponential full jitter retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER,
+            backoff_factor=1000,
+            exponent=2
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert 0 <= delay <= 1000  # Random between 0 and 1000
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert 0 <= delay <= 2000  # Random between 0 and 2000
+
+    def test_exponential_equal_jitter_strategy(self):
+        """Test exponential equal jitter retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL_EQUAL_JITTER,
+            backoff_factor=1000,
+            exponent=2
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert 500 <= delay <= 1000  # Random between 500 and 1000
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert 1000 <= delay <= 2000  # Random between 1000 and 2000
+
+    def test_linear_full_jitter_strategy(self):
+        """Test linear full jitter retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.LINEAR_FULL_JITTER,
+            backoff_factor=1000
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert 0 <= delay <= 1000  # Random between 0 and 1000
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert 0 <= delay <= 2000  # Random between 0 and 2000
+
+    def test_linear_equal_jitter_strategy(self):
+        """Test linear equal jitter retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.LINEAR_EQUAL_JITTER,
+            backoff_factor=1000
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert 500 <= delay <= 1000  # Random between 500 and 1000
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert 1000 <= delay <= 2000  # Random between 1000 and 2000
+
+    def test_fixed_full_jitter_strategy(self):
+        """Test fixed full jitter retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.FIXED_FULL_JITTER,
+            backoff_factor=1000
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert 0 <= delay <= 1000  # Random between 0 and 1000
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert 0 <= delay <= 1000  # Random between 0 and 1000
+
+    def test_fixed_equal_jitter_strategy(self):
+        """Test fixed equal jitter retry strategy"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.FIXED_EQUAL_JITTER,
+            backoff_factor=1000
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert 500 <= delay <= 1000  # Random between 500 and 1000
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert 500 <= delay <= 1000  # Random between 500 and 1000
+
+    def test_invalid_retry_count(self):
+        """Test that invalid retry count raises ValueError"""
+        policy = RetryPolicyModel()
+        
+        # Test retry count 0
+        with pytest.raises(ValueError, match="Retry count must be greater than or equal to 1"):
+            policy.compute_delay(0)
+        
+        # Test retry count -1
+        with pytest.raises(ValueError, match="Retry count must be greater than or equal to 1"):
+            policy.compute_delay(-1)
+
+    def test_max_delay_capping(self):
+        """Test that max_delay properly caps the delay"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL,
+            backoff_factor=1000,
+            exponent=2,
+            max_delay=1500
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert delay == 1000  # Not capped
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert delay == 1500  # Capped at max_delay
+        
+        # Test retry count 3
+        delay = policy.compute_delay(3)
+        assert delay == 1500  # Capped at max_delay
+
+    def test_jitter_strategies_with_max_delay(self):
+        """Test jitter strategies with max delay capping"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER,
+            backoff_factor=1000,
+            exponent=2,
+            max_delay=1500
+        )
+        
+        # Test multiple calls to ensure max_delay is respected
+        for _ in range(10):
+            delay = policy.compute_delay(3)
+            assert delay <= 1500  # Should never exceed max_delay
+
+    def test_different_exponents(self):
+        """Test different exponent values"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL,
+            backoff_factor=1000,
+            exponent=3
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert delay == 1000  # 1000 * 3^0
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert delay == 3000  # 1000 * 3^1
+        
+        # Test retry count 3
+        delay = policy.compute_delay(3)
+        assert delay == 9000  # 1000 * 3^2
+
+    def test_different_backoff_factors(self):
+        """Test different backoff factor values"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.LINEAR,
+            backoff_factor=500
+        )
+        
+        # Test retry count 1
+        delay = policy.compute_delay(1)
+        assert delay == 500  # 500 * 1
+        
+        # Test retry count 2
+        delay = policy.compute_delay(2)
+        assert delay == 1000  # 500 * 2
+
+    def test_model_validation(self):
+        """Test Pydantic model validation"""
+        # Test valid model
+        policy = RetryPolicyModel(
+            max_retries=5,
+            strategy=RetryStrategy.EXPONENTIAL,
+            backoff_factor=1000,
+            exponent=2,
+            max_delay=10000
+        )
+        
+        # Test invalid max_retries (negative)
+        with pytest.raises(ValueError):
+            RetryPolicyModel(max_retries=-1)
+        
+        # Test invalid backoff_factor (non-positive)
+        with pytest.raises(ValueError):
+            RetryPolicyModel(backoff_factor=0)
+        
+        # Test invalid exponent (non-positive)
+        with pytest.raises(ValueError):
+            RetryPolicyModel(exponent=0)
+        
+        # Test invalid max_delay (non-positive)
+        with pytest.raises(ValueError):
+            RetryPolicyModel(max_delay=0)
+
+    def test_strategy_enum_values(self):
+        """Test all RetryStrategy enum values"""
+        strategies = [
+            RetryStrategy.EXPONENTIAL,
+            RetryStrategy.EXPONENTIAL_FULL_JITTER,
+            RetryStrategy.EXPONENTIAL_EQUAL_JITTER,
+            RetryStrategy.LINEAR,
+            RetryStrategy.LINEAR_FULL_JITTER,
+            RetryStrategy.LINEAR_EQUAL_JITTER,
+            RetryStrategy.FIXED,
+            RetryStrategy.FIXED_FULL_JITTER,
+            RetryStrategy.FIXED_EQUAL_JITTER
+        ]
+        
+        for strategy in strategies:
+            policy = RetryPolicyModel(strategy=strategy)
+            assert policy.strategy == strategy
+            # Should not raise any exceptions
+            delay = policy.compute_delay(1)
+            assert isinstance(delay, int)
+            assert delay >= 0
+
+    def test_edge_case_large_numbers(self):
+        """Test edge cases with large numbers"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL,
+            backoff_factor=1000000,
+            exponent=10
+        )
+        
+        # Test that large numbers don't cause overflow
+        delay = policy.compute_delay(3)
+        assert isinstance(delay, int)
+        assert delay > 0
+
+    def test_consistency_across_calls(self):
+        """Test that non-jitter strategies are consistent"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL,
+            backoff_factor=1000,
+            exponent=2
+        )
+        
+        # Multiple calls should return the same result for non-jitter strategies
+        delay1 = policy.compute_delay(2)
+        delay2 = policy.compute_delay(2)
+        assert delay1 == delay2
+
+    def test_jitter_variability(self):
+        """Test that jitter strategies produce different results"""
+        policy = RetryPolicyModel(
+            strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER,
+            backoff_factor=1000,
+            exponent=2
+        )
+        
+        # Multiple calls should return different results for jitter strategies
+        delays = set()
+        for _ in range(100):
+            delay = policy.compute_delay(2)
+            delays.add(delay)
+        
+        # Should have multiple different values (not all the same)
+        assert len(delays) > 1 
\ No newline at end of file

From b1b85d187b8e268c5b37bafe4fc5e0941ceb0461 Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 14:55:49 +0530
Subject: [PATCH 17/20] Refactor test for RetryPolicyModel by removing
 unnecessary import

- Removed the unused 'random' import from the test file for cleaner code.
- Updated the instantiation of RetryPolicyModel to improve readability.
---
 state-manager/tests/unit/models/test_retry_policy_model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/state-manager/tests/unit/models/test_retry_policy_model.py b/state-manager/tests/unit/models/test_retry_policy_model.py
index 0ec7c5b8..038f3adc 100644
--- a/state-manager/tests/unit/models/test_retry_policy_model.py
+++ b/state-manager/tests/unit/models/test_retry_policy_model.py
@@ -1,5 +1,4 @@
 import pytest
-import random
 from app.models.retry_policy_model import RetryPolicyModel, RetryStrategy
 
 
@@ -288,7 +287,7 @@ def test_different_backoff_factors(self):
     def test_model_validation(self):
         """Test Pydantic model validation"""
         # Test valid model
-        policy = RetryPolicyModel(
+        RetryPolicyModel(
             max_retries=5,
             strategy=RetryStrategy.EXPONENTIAL,
             backoff_factor=1000,

From 3806d2684d758387ba80dd458c36d9b7b5d5961e Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 15:04:35 +0530
Subject: [PATCH 18/20] Add comprehensive tests for errored state handling in
 graph templates

- Introduced multiple test cases to cover scenarios where graph templates are not found, encounter other errors, or trigger DuplicateKeyError during state creation.
- Enhanced error handling assertions to ensure proper HTTP exceptions are raised for missing graph templates and other exceptions.
- Validated behavior when maximum retries are reached, ensuring no new state is created in such cases.
- Improved overall test coverage for errored state functionality.
---
 .../unit/controller/test_errored_state.py     | 181 ++++++++++++++++++
 1 file changed, 181 insertions(+)

diff --git a/state-manager/tests/unit/controller/test_errored_state.py b/state-manager/tests/unit/controller/test_errored_state.py
index df1d5526..31338db3 100644
--- a/state-manager/tests/unit/controller/test_errored_state.py
+++ b/state-manager/tests/unit/controller/test_errored_state.py
@@ -2,6 +2,7 @@
 from unittest.mock import AsyncMock, MagicMock, patch
 from fastapi import HTTPException, status
 from beanie import PydanticObjectId
+from pymongo.errors import DuplicateKeyError
 
 from app.controller.errored_state import errored_state
 from app.models.errored_models import ErroredRequestModel
@@ -309,3 +310,183 @@ async def test_errored_state_with_different_error_message(
         assert mock_state_class.find_one.call_count == 1  # Called once for finding
         assert mock_state_queued.error == "Different error message"
 
+    @patch('app.controller.errored_state.State')
+    @patch('app.controller.errored_state.GraphTemplate')
+    async def test_errored_state_graph_template_not_found(
+        self,
+        mock_graph_template_class,
+        mock_state_class,
+        mock_namespace,
+        mock_state_id,
+        mock_errored_request,
+        mock_state_queued,
+        mock_request_id
+    ):
+        """Test when graph template is not found"""
+        # Arrange
+        mock_state_queued.save = AsyncMock()
+        mock_state_class.find_one = AsyncMock(return_value=mock_state_queued)
+        
+        # Mock GraphTemplate.get to raise ValueError with "Graph template not found"
+        mock_graph_template_class.get = AsyncMock(side_effect=ValueError("Graph template not found"))
+
+        # Act & Assert
+        with pytest.raises(HTTPException) as exc_info:
+            await errored_state(
+                mock_namespace,
+                mock_state_id,
+                mock_errored_request,
+                mock_request_id
+            )
+        
+        assert exc_info.value.status_code == status.HTTP_404_NOT_FOUND
+        assert exc_info.value.detail == "Graph template not found"
+
+    @patch('app.controller.errored_state.State')
+    @patch('app.controller.errored_state.GraphTemplate')
+    async def test_errored_state_graph_template_other_error(
+        self,
+        mock_graph_template_class,
+        mock_state_class,
+        mock_namespace,
+        mock_state_id,
+        mock_errored_request,
+        mock_state_queued,
+        mock_request_id
+    ):
+        """Test when graph template raises other exceptions"""
+        # Arrange
+        mock_state_queued.save = AsyncMock()
+        mock_state_class.find_one = AsyncMock(return_value=mock_state_queued)
+        
+        # Mock GraphTemplate.get to raise a different exception
+        mock_graph_template_class.get = AsyncMock(side_effect=Exception("Database connection error"))
+
+        # Act & Assert
+        with pytest.raises(Exception) as exc_info:
+            await errored_state(
+                mock_namespace,
+                mock_state_id,
+                mock_errored_request,
+                mock_request_id
+            )
+        
+        assert str(exc_info.value) == "Database connection error"
+
+    @patch('app.controller.errored_state.State')
+    @patch('app.controller.errored_state.GraphTemplate')
+    async def test_errored_state_duplicate_key_error(
+        self,
+        mock_graph_template_class,
+        mock_state_class,
+        mock_namespace,
+        mock_state_id,
+        mock_errored_request,
+        mock_state_queued,
+        mock_request_id
+    ):
+        """Test when creating retry state encounters DuplicateKeyError"""
+        # Arrange
+        mock_state_queued.save = AsyncMock()
+        mock_state_class.find_one = AsyncMock(return_value=mock_state_queued)
+        
+        # Mock GraphTemplate.get to return a valid graph template
+        mock_graph_template = MagicMock()
+        mock_graph_template.retry_policy.max_retries = 3
+        mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000)
+        mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template)
+        
+        # Mock State constructor and insert method to raise DuplicateKeyError
+        mock_retry_state = MagicMock()
+        mock_retry_state.insert = AsyncMock(side_effect=DuplicateKeyError("Duplicate key error"))
+        mock_state_class.return_value = mock_retry_state
+
+        # Act
+        result = await errored_state(
+            mock_namespace,
+            mock_state_id,
+            mock_errored_request,
+            mock_request_id
+        )
+
+        # Assert
+        assert result.status == StateStatusEnum.ERRORED
+        assert result.retry_created == False
+        assert mock_state_queued.status == StateStatusEnum.ERRORED
+        assert mock_state_queued.error == mock_errored_request.error
+
+    @patch('app.controller.errored_state.State')
+    @patch('app.controller.errored_state.GraphTemplate')
+    async def test_errored_state_max_retries_reached(
+        self,
+        mock_graph_template_class,
+        mock_state_class,
+        mock_namespace,
+        mock_state_id,
+        mock_errored_request,
+        mock_request_id
+    ):
+        """Test when state has reached max retries"""
+        # Arrange
+        mock_state = MagicMock()
+        mock_state.id = PydanticObjectId()
+        mock_state.status = StateStatusEnum.QUEUED
+        mock_state.graph_name = "test_graph"
+        mock_state.retry_count = 3  # Already at max retries
+        mock_state.node_name = "test_node"
+        mock_state.namespace_name = "test_namespace"
+        mock_state.identifier = "test_identifier"
+        mock_state.run_id = "test_run_id"
+        mock_state.inputs = {}
+        mock_state.parents = []
+        mock_state.does_unites = False
+        mock_state.fanout_id = None
+        mock_state.save = AsyncMock()
+        
+        mock_state_class.find_one = AsyncMock(return_value=mock_state)
+        
+        # Mock GraphTemplate.get to return a valid graph template with max_retries = 3
+        mock_graph_template = MagicMock()
+        mock_graph_template.retry_policy.max_retries = 3
+        mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template)
+
+        # Act
+        result = await errored_state(
+            mock_namespace,
+            mock_state_id,
+            mock_errored_request,
+            mock_request_id
+        )
+
+        # Assert
+        assert result.status == StateStatusEnum.ERRORED
+        assert result.retry_created == False
+        assert mock_state.status == StateStatusEnum.ERRORED
+        assert mock_state.error == mock_errored_request.error
+        # Verify that State constructor was not called (no retry created)
+        mock_state_class.assert_not_called()
+
+    @patch('app.controller.errored_state.State')
+    async def test_errored_state_general_exception(
+        self,
+        mock_state_class,
+        mock_namespace,
+        mock_state_id,
+        mock_errored_request,
+        mock_request_id
+    ):
+        """Test handling of general exceptions in the main try-catch block"""
+        # Arrange
+        mock_state_class.find_one = AsyncMock(side_effect=Exception("Unexpected error"))
+
+        # Act & Assert
+        with pytest.raises(Exception) as exc_info:
+            await errored_state(
+                mock_namespace,
+                mock_state_id,
+                mock_errored_request,
+                mock_request_id
+            )
+        
+        assert str(exc_info.value) == "Unexpected error"
+

From 4048228e97a1d31308238f179453babe7a9a222a Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 15:06:27 +0530
Subject: [PATCH 19/20] Refactor assertions in errored state tests for clarity

- Updated assertions in the TestErroredState class to use 'not' instead of '== False' for improved readability.
- Ensured consistency in the test code style across multiple test cases.
---
 state-manager/tests/unit/controller/test_errored_state.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/state-manager/tests/unit/controller/test_errored_state.py b/state-manager/tests/unit/controller/test_errored_state.py
index 31338db3..032571bf 100644
--- a/state-manager/tests/unit/controller/test_errored_state.py
+++ b/state-manager/tests/unit/controller/test_errored_state.py
@@ -411,7 +411,7 @@ async def test_errored_state_duplicate_key_error(
 
         # Assert
         assert result.status == StateStatusEnum.ERRORED
-        assert result.retry_created == False
+        assert not result.retry_created
         assert mock_state_queued.status == StateStatusEnum.ERRORED
         assert mock_state_queued.error == mock_errored_request.error
 
@@ -460,7 +460,7 @@ async def test_errored_state_max_retries_reached(
 
         # Assert
         assert result.status == StateStatusEnum.ERRORED
-        assert result.retry_created == False
+        assert not result.retry_created
         assert mock_state.status == StateStatusEnum.ERRORED
         assert mock_state.error == mock_errored_request.error
         # Verify that State constructor was not called (no retry created)

From 462cfdefbb905fe7aa1ca24f5a4c957867fdc2a9 Mon Sep 17 00:00:00 2001
From: NiveditJain <nivedit@aikin.club>
Date: Sun, 31 Aug 2025 15:08:39 +0530
Subject: [PATCH 20/20] Remove Kubernetes deployment steps from the publish
 workflow

- Eliminated the deploy-to-k8s job from the publish-state-manager workflow to streamline the CI/CD process.
- This change focuses on publishing the image without the deployment step, simplifying the workflow configuration.
---
 .github/workflows/publish-state-mangaer.yml | 27 ---------------------
 1 file changed, 27 deletions(-)

diff --git a/.github/workflows/publish-state-mangaer.yml b/.github/workflows/publish-state-mangaer.yml
index 883fd9b8..b3926336 100644
--- a/.github/workflows/publish-state-mangaer.yml
+++ b/.github/workflows/publish-state-mangaer.yml
@@ -111,30 +111,3 @@ jobs:
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
-
-  deploy-to-k8s:
-    needs: publish-image
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Deploy to K8s
-        run: |
-          echo "${{ secrets.KUBE_CONFIG }}" | base64 -d > kubeconfig.yaml
-          export KUBECONFIG=$PWD/kubeconfig.yaml
-          kubectl get nodes
-
-          echo "selected image: ${{ fromJson(needs.publish-image.outputs.json).tags[1] }}"
-
-          kubectl set image deployment/exosphere-state-manager exosphere-state-manager=${{fromJson(needs.publish-image.outputs.json).tags[1]}}
-
-          kubectl rollout status deployment/exosphere-state-manager
-
-          status=$(kubectl rollout status deployment/exosphere-state-manager)
-
-          echo "$status"
-
-          if [[ "$status" != *"successfully rolled out"* ]]; then
-            kubectl rollout undo deployment/exosphere-state-manager
-            echo "❌ Deployment failed. Rolled back." >&2
-            exit 1
-          fi
\ No newline at end of file