From b1f2bb0447c1467b8e46ac69661888d58e05a61f Mon Sep 17 00:00:00 2001 From: Ritwik G Date: Fri, 13 Mar 2026 11:39:06 +0530 Subject: [PATCH 1/5] fix: replace permanent Redis failure flag with time-based retry When Redis connections failed (e.g., stale TCP connections dropped by k8s networking), the _redis_available flag was permanently set to False, disabling caching for the entire process lifetime. This caused all VFS operations to hit the database directly, leading to slow MCP responses and 502 backend_timeout errors at the load balancer. Changes: - Replace permanent _redis_available flag with time-based backoff (30s) - Add health_check_interval=15 to detect stale connections before use - Add retry_on_timeout=True for transient timeout errors - Reset client on ConnectionError so pool can create fresh connections Co-Authored-By: Claude Opus 4.6 --- backend/app/services/analytics_cache.py | 50 ++++++++++++++++--------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/backend/app/services/analytics_cache.py b/backend/app/services/analytics_cache.py index 47679fb..b752143 100644 --- a/backend/app/services/analytics_cache.py +++ b/backend/app/services/analytics_cache.py @@ -9,6 +9,7 @@ import hashlib import json import logging +import time from typing import Any, Callable, Optional, TypeVar import redis @@ -21,7 +22,8 @@ # Module-level Redis client singleton _redis_client: Optional[redis.Redis] = None -_redis_available: Optional[bool] = None +_redis_unavailable_until: float = 0 +_RETRY_BACKOFF_SECONDS = 30 def get_redis_client() -> Optional[redis.Redis]: @@ -29,43 +31,49 @@ def get_redis_client() -> Optional[redis.Redis]: Get or create a Redis client. Returns None if Redis is not configured or unavailable. - Uses lazy initialization and caches the result. + Uses lazy initialization with time-based retry on failure. """ - global _redis_client, _redis_available + global _redis_client, _redis_unavailable_until - # If we've already determined Redis is unavailable, don't retry - if _redis_available is False: + # Check if Redis is configured + if not settings.redis_url: + return None + + # If recently failed, skip until backoff expires + if _redis_unavailable_until > time.monotonic(): return None # If we already have a client, return it if _redis_client is not None: return _redis_client - # Check if Redis is configured - if not settings.redis_url: - logger.debug("Redis URL not configured, caching disabled") - _redis_available = False - return None - try: _redis_client = redis.from_url( settings.redis_url, decode_responses=True, socket_connect_timeout=2, socket_timeout=2, + health_check_interval=15, + retry_on_timeout=True, ) - # Test connection _redis_client.ping() - _redis_available = True - logger.info("Redis connection established for analytics caching") + _redis_unavailable_until = 0 + logger.info("Redis connection established for caching") return _redis_client except Exception as e: - logger.warning(f"Redis connection failed, caching disabled: {e}") - _redis_available = False + logger.warning(f"Redis connection failed, will retry in {_RETRY_BACKOFF_SECONDS}s: {e}") + _redis_unavailable_until = time.monotonic() + _RETRY_BACKOFF_SECONDS _redis_client = None return None +def _mark_redis_unavailable() -> None: + """Mark Redis as temporarily unavailable so we don't retry on every request.""" + global _redis_client, _redis_unavailable_until + _redis_client = None + _redis_unavailable_until = time.monotonic() + _RETRY_BACKOFF_SECONDS + + def _generate_cache_key(prefix: str, *args: Any, **kwargs: Any) -> str: """ Generate a cache key from prefix and arguments. @@ -126,6 +134,9 @@ def wrapper(*args: Any, **kwargs: Any) -> T: if cached is not None: logger.debug(f"Cache hit for {cache_key}") return json.loads(cached) + except redis.ConnectionError as e: + logger.warning(f"Redis connection lost during cache read: {e}") + _mark_redis_unavailable() except Exception as e: logger.warning(f"Cache read failed for {cache_key}: {e}") @@ -142,6 +153,9 @@ def wrapper(*args: Any, **kwargs: Any) -> T: client.setex(cache_key, ttl_seconds, cache_value) logger.debug(f"Cache set for {cache_key} (TTL: {ttl_seconds}s)") + except redis.ConnectionError as e: + logger.warning(f"Redis connection lost during cache write: {e}") + _mark_redis_unavailable() except Exception as e: logger.warning(f"Cache write failed for {cache_key}: {e}") @@ -192,6 +206,6 @@ def reset_redis_connection() -> None: Useful for testing or when Redis becomes available after being down. """ - global _redis_client, _redis_available + global _redis_client, _redis_unavailable_until _redis_client = None - _redis_available = None + _redis_unavailable_until = 0 From 032a725e37947b0dac52c78e0290aa755c8b977a Mon Sep 17 00:00:00 2001 From: Ritwik G Date: Fri, 13 Mar 2026 11:49:34 +0530 Subject: [PATCH 2/5] fix: handle ConnectionError in invalidate + skip redundant write after read failure - Add ConnectionError handling to invalidate_analytics_cache so it resets the stale client instead of leaving it for subsequent callers. - Skip cache write attempt if read already marked Redis unavailable, avoiding a redundant 2s timeout on the dead connection. Co-Authored-By: Claude Opus 4.6 --- backend/app/services/analytics_cache.py | 33 ++++++++++++++----------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/backend/app/services/analytics_cache.py b/backend/app/services/analytics_cache.py index b752143..2d9ccc4 100644 --- a/backend/app/services/analytics_cache.py +++ b/backend/app/services/analytics_cache.py @@ -143,21 +143,20 @@ def wrapper(*args: Any, **kwargs: Any) -> T: # Execute function result = func(*args, **kwargs) - try: - # Store in cache - # Convert Pydantic models to dict for JSON serialization - if hasattr(result, "model_dump"): - cache_value = json.dumps(result.model_dump(mode="json")) - else: - cache_value = json.dumps(result) - - client.setex(cache_key, ttl_seconds, cache_value) - logger.debug(f"Cache set for {cache_key} (TTL: {ttl_seconds}s)") - except redis.ConnectionError as e: - logger.warning(f"Redis connection lost during cache write: {e}") - _mark_redis_unavailable() - except Exception as e: - logger.warning(f"Cache write failed for {cache_key}: {e}") + if _redis_client is not None: + try: + if hasattr(result, "model_dump"): + cache_value = json.dumps(result.model_dump(mode="json")) + else: + cache_value = json.dumps(result) + + client.setex(cache_key, ttl_seconds, cache_value) + logger.debug(f"Cache set for {cache_key} (TTL: {ttl_seconds}s)") + except redis.ConnectionError as e: + logger.warning(f"Redis connection lost during cache write: {e}") + _mark_redis_unavailable() + except Exception as e: + logger.warning(f"Cache write failed for {cache_key}: {e}") return result @@ -195,6 +194,10 @@ def invalidate_analytics_cache(org_id: Optional[str] = None) -> int: logger.info(f"Invalidated {deleted} analytics cache entries") return deleted return 0 + except redis.ConnectionError as e: + logger.warning(f"Redis connection lost during cache invalidation: {e}") + _mark_redis_unavailable() + return 0 except Exception as e: logger.warning(f"Cache invalidation failed: {e}") return 0 From 1cb286d0642bcfe3057275cc38a10d97faf43a07 Mon Sep 17 00:00:00 2001 From: Ritwik G Date: Fri, 13 Mar 2026 11:57:39 +0530 Subject: [PATCH 3/5] fix: re-fetch client before write, treat TimeoutError like ConnectionError MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Write path now calls get_redis_client() for a fresh reference instead of reusing the potentially stale local `client` from the read phase. Prevents a reconnected client from being killed by a stale reference. - Catch redis.TimeoutError alongside ConnectionError in all three sites (read, write, invalidate). TimeoutError is not a ConnectionError subclass, so it was falling through to the generic handler without triggering backoff — causing 4s penalty per request under load. Co-Authored-By: Claude Opus 4.6 --- backend/app/services/analytics_cache.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/app/services/analytics_cache.py b/backend/app/services/analytics_cache.py index 2d9ccc4..57c5622 100644 --- a/backend/app/services/analytics_cache.py +++ b/backend/app/services/analytics_cache.py @@ -134,7 +134,7 @@ def wrapper(*args: Any, **kwargs: Any) -> T: if cached is not None: logger.debug(f"Cache hit for {cache_key}") return json.loads(cached) - except redis.ConnectionError as e: + except (redis.ConnectionError, redis.TimeoutError) as e: logger.warning(f"Redis connection lost during cache read: {e}") _mark_redis_unavailable() except Exception as e: @@ -143,16 +143,17 @@ def wrapper(*args: Any, **kwargs: Any) -> T: # Execute function result = func(*args, **kwargs) - if _redis_client is not None: + write_client = get_redis_client() + if write_client is not None: try: if hasattr(result, "model_dump"): cache_value = json.dumps(result.model_dump(mode="json")) else: cache_value = json.dumps(result) - client.setex(cache_key, ttl_seconds, cache_value) + write_client.setex(cache_key, ttl_seconds, cache_value) logger.debug(f"Cache set for {cache_key} (TTL: {ttl_seconds}s)") - except redis.ConnectionError as e: + except (redis.ConnectionError, redis.TimeoutError) as e: logger.warning(f"Redis connection lost during cache write: {e}") _mark_redis_unavailable() except Exception as e: @@ -194,7 +195,7 @@ def invalidate_analytics_cache(org_id: Optional[str] = None) -> int: logger.info(f"Invalidated {deleted} analytics cache entries") return deleted return 0 - except redis.ConnectionError as e: + except (redis.ConnectionError, redis.TimeoutError) as e: logger.warning(f"Redis connection lost during cache invalidation: {e}") _mark_redis_unavailable() return 0 From e3a67cf2544fbf1f1d11e84d394125aa330c6ede Mon Sep 17 00:00:00 2001 From: Ritwik G Date: Fri, 13 Mar 2026 12:04:09 +0530 Subject: [PATCH 4/5] fix: disable retry_on_timeout to avoid 4s blocking per request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With retry_on_timeout=True, timed-out commands block for 4s (2s × 2) before fallback. health_check_interval=15 already detects stale connections proactively, making the retry redundant and harmful. Co-Authored-By: Claude Opus 4.6 --- backend/app/services/analytics_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/services/analytics_cache.py b/backend/app/services/analytics_cache.py index 57c5622..66b3822 100644 --- a/backend/app/services/analytics_cache.py +++ b/backend/app/services/analytics_cache.py @@ -54,7 +54,7 @@ def get_redis_client() -> Optional[redis.Redis]: socket_connect_timeout=2, socket_timeout=2, health_check_interval=15, - retry_on_timeout=True, + retry_on_timeout=False, ) _redis_client.ping() _redis_unavailable_until = 0 From b1a3099a03ac6f1253c6622768e56dd24730a5c5 Mon Sep 17 00:00:00 2001 From: Ritwik G Date: Fri, 13 Mar 2026 12:14:44 +0530 Subject: [PATCH 5/5] fix: restore debug log when Redis URL is not configured Co-Authored-By: Claude Opus 4.6 --- backend/app/services/analytics_cache.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/app/services/analytics_cache.py b/backend/app/services/analytics_cache.py index 66b3822..a88c928 100644 --- a/backend/app/services/analytics_cache.py +++ b/backend/app/services/analytics_cache.py @@ -37,6 +37,7 @@ def get_redis_client() -> Optional[redis.Redis]: # Check if Redis is configured if not settings.redis_url: + logger.debug("Redis URL not configured, caching disabled") return None # If recently failed, skip until backoff expires