From a47452ee6c4d7307644f1ff0c7ab4c9accfd0a92 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Sun, 15 Jun 2025 19:07:36 -0400 Subject: [PATCH 01/68] sync PPAF --- .../azure-cosmos/azure/cosmos/_constants.py | 3 + .../azure/cosmos/_cosmos_client_connection.py | 7 +- .../_endpoint_discovery_retry_policy.py | 11 +- ...tition_endpoint_manager_circuit_breaker.py | 2 +- ...n_endpoint_manager_circuit_breaker_core.py | 5 +- ...anager_per_partition_automatic_failover.py | 143 +++++++++++++ .../azure/cosmos/_location_cache.py | 8 +- .../azure/cosmos/_request_object.py | 2 +- .../azure/cosmos/_retry_utility.py | 5 +- .../azure/cosmos/_synchronized_request.py | 5 +- .../cosmos/_timeout_failover_retry_policy.py | 11 +- .../azure-cosmos/azure/cosmos/documents.py | 1 + sdk/cosmos/azure-cosmos/pytest.ini | 1 + .../tests/_fault_injection_transport.py | 102 ++++++++-- .../azure-cosmos/tests/test_location_cache.py | 6 +- .../test_per_partition_automatic_failover.py | 191 ++++++++++++++++++ ...st_per_partition_circuit_breaker_sm_mrr.py | 3 +- .../test_service_retry_policies_async.py | 2 +- sdk/cosmos/live-platform-matrix.json | 17 ++ sdk/cosmos/test-resources.bicep | 4 + 20 files changed, 493 insertions(+), 36 deletions(-) create mode 100644 sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py create mode 100644 sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py index d0e0f54ae04c..c2812f3481f8 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py @@ -40,6 +40,7 @@ class _Constants: DatabaseAccountEndpoint: Literal["databaseAccountEndpoint"] = "databaseAccountEndpoint" DefaultEndpointsRefreshTime: int = 5 * 60 * 1000 # milliseconds UnavailableEndpointDBATimeouts: int = 1 # seconds + 
EnablePerPartitionFailoverBehavior: Literal["enablePerPartitionFailoverBehavior"] = "enablePerPartitionFailoverBehavior" #pylint: disable=line-too-long # ServiceDocument Resource EnableMultipleWritableLocations: Literal["enableMultipleWriteLocations"] = "enableMultipleWriteLocations" @@ -53,6 +54,8 @@ class _Constants: MAX_ITEM_BUFFER_VS_CONFIG_DEFAULT: int = 50000 CIRCUIT_BREAKER_ENABLED_CONFIG: str = "AZURE_COSMOS_ENABLE_CIRCUIT_BREAKER" CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT: str = "False" + PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG: str = "AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER" + PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG_DEFAULT: str = "False" # Only applicable when circuit breaker is enabled ------------------------- CONSECUTIVE_ERROR_COUNT_TOLERATED_FOR_READ: str = "AZURE_COSMOS_CONSECUTIVE_ERROR_COUNT_TOLERATED_FOR_READ" CONSECUTIVE_ERROR_COUNT_TOLERATED_FOR_READ_DEFAULT: int = 10 diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py index 3ee11eeccdd8..aeda60b03ce3 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py @@ -48,7 +48,7 @@ HttpResponse # pylint: disable=no-legacy-azure-core-http-response-import from . import _base as base -from ._global_partition_endpoint_manager_circuit_breaker import _GlobalPartitionEndpointManagerForCircuitBreaker +from ._global_partition_endpoint_manager_per_partition_automatic_failover import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover # pylint: disable=line-too-long from . import _query_iterable as query_iterable from . import _runtime_constants as runtime_constants from . 
import _session @@ -168,7 +168,7 @@ def __init__( # pylint: disable=too-many-statements self.last_response_headers: CaseInsensitiveDict = CaseInsensitiveDict() self.UseMultipleWriteLocations = False - self._global_endpoint_manager = _GlobalPartitionEndpointManagerForCircuitBreaker(self) + self._global_endpoint_manager = _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover(self) retry_policy = None if isinstance(self.connection_policy.ConnectionRetryConfiguration, HTTPPolicy): @@ -2623,6 +2623,9 @@ def GetDatabaseAccount( database_account._EnableMultipleWritableLocations = result[ Constants.EnableMultipleWritableLocations ] + # TODO: PPAF - Verify that this is the correct variable from the service + if Constants.EnablePerPartitionFailoverBehavior in result: + database_account._EnablePerPartitionFailoverBehavior = result[Constants.EnablePerPartitionFailoverBehavior] self.UseMultipleWriteLocations = ( self.connection_policy.UseMultipleWriteLocations and database_account._EnableMultipleWritableLocations diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index aa7bc67d2137..f562df2a7189 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -43,8 +43,9 @@ class EndpointDiscoveryRetryPolicy(object): Max_retry_attempt_count = 120 Retry_after_in_milliseconds = 1000 - def __init__(self, connection_policy, global_endpoint_manager, *args): + def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, *args): self.global_endpoint_manager = global_endpoint_manager + self.pk_range_wrapper = pk_range_wrapper self._max_retry_attempt_count = EndpointDiscoveryRetryPolicy.Max_retry_attempt_count self.failover_retry_count = 0 self.retry_after_in_milliseconds = EndpointDiscoveryRetryPolicy.Retry_after_in_milliseconds @@ -85,6 +86,14 @@ 
def ShouldRetry(self, exception): # pylint: disable=unused-argument # refreshed with new writable and readable locations self.global_endpoint_manager.refresh_needed = True + # If per partition automatic failover is applicable, we mark the current endpoint as unavailable + # and resolve the service endpoint for the partition range - otherwise, continue with the default retry logic + if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): + partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] + partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) + self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) + return True + # clear previous location-based routing directive self.request.clear_route_to_location() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py index 2eda20c926d0..00e247701dc7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py @@ -100,7 +100,7 @@ def record_failure( if pk_range_wrapper: self.global_partition_endpoint_manager_core.record_failure(request, pk_range_wrapper) - def resolve_service_endpoint_for_partition( + def _resolve_service_endpoint_for_partition_circuit_breaker( self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py index 93faf9b7a8c5..f5335fc447ff 100644 --- 
a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py @@ -59,8 +59,9 @@ def is_circuit_breaker_applicable(self, request: RequestObject) -> bool: if not request: return False - circuit_breaker_enabled = os.environ.get(Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, - Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT) == "True" + circuit_breaker_enabled = os.environ.get(Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG, + os.environ.get(Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, + Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT)).lower() == "true" if not circuit_breaker_enabled: return False diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py new file mode 100644 index 000000000000..03a3080cdefe --- /dev/null +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -0,0 +1,143 @@ +# The MIT License (MIT) +# Copyright (c) 2025 Microsoft Corporation + +"""Class for global endpoint manager for per partition automatic failover. This class inherits the circuit breaker +endpoint manager, since enabling per partition automatic failover also enables the circuit breaker logic. 
+""" +import logging +import os +import threading + +from typing import Dict, List, Set, TYPE_CHECKING, Optional + +from azure.cosmos.http_constants import ResourceType +from azure.cosmos._constants import _Constants as Constants +from azure.cosmos._global_partition_endpoint_manager_circuit_breaker import \ + _GlobalPartitionEndpointManagerForCircuitBreaker +from azure.cosmos.documents import _OperationType + +from azure.cosmos._request_object import RequestObject +from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper + +if TYPE_CHECKING: + from azure.cosmos._cosmos_client_connection import CosmosClientConnection + +logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") + +class PartitionLevelFailoverInfo: + """ + Holds information about the partition level regional failover. + It is used to track the partition key range and the regions where it is available. + """ + def __init__(self): + self.unavailable_regional_endpoints = set() + self.current_regional_endpoint = None + self._lock = threading.Lock() + + def try_move_to_next_location(self, available_account_regional_endpoints: Set[str], request: RequestObject) -> bool: + with self._lock: + failed_regional_endpoint = request.location_endpoint_to_route + if failed_regional_endpoint != self.current_regional_endpoint: + logger.info("Moving to next available regional endpoint: %s", self.current_regional_endpoint) + request.route_to_location(self.current_regional_endpoint) + return True + + for regional_endpoint in available_account_regional_endpoints: + if regional_endpoint == self.current_regional_endpoint: + continue + + if regional_endpoint in self.unavailable_regional_endpoints: + continue + + self.current_regional_endpoint = regional_endpoint + logger.info("Moving to next available regional endpoint: %s", self.current_regional_endpoint) + request.route_to_location(self.current_regional_endpoint) + return True + + return False + +class 
_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover(_GlobalPartitionEndpointManagerForCircuitBreaker): + """ + This internal class implements the logic for partition endpoint management for + geo-replicated database accounts. + """ + def __init__(self, client: "CosmosClientConnection"): + super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover, self).__init__(client) + self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} + + def is_per_partition_automatic_failover_applicable(self, request: RequestObject) -> bool: + if not request: + return False + + if (self.location_cache.can_use_multiple_write_locations_for_request(request) + or _OperationType.IsReadOnlyOperation(request.operation_type)): + return False + + per_partition_automatic_failover_config_enabled = os.environ.get(Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG, + Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG_DEFAULT).lower() == "true" + + # TODO: PPAF - This check here needs to be verified once we test against a live account with the config enabled. 
+ if not per_partition_automatic_failover_config_enabled or not self._database_account_cache._EnablePerPartitionFailoverBehavior: + return False + + # if we have at most one region available in the account, we cannot do per partition automatic failover + available_regions = self.compute_available_preferred_regions(request) + if len(available_regions) <= 1: + return False + + # if the request is not for a document or if the request is not executing a stored procedure, return False + if (request.resource_type != ResourceType.Document and + request.operation_type != _OperationType.ExecuteJavaScript): + return False + + return True + + def resolve_service_endpoint_for_partition( + self, + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] + ) -> str: + if self.is_per_partition_automatic_failover_applicable(request) and pk_range_wrapper: + # If per partition automatic failover is applicable, we check partition unavailability + if pk_range_wrapper in self.partition_range_to_failover_info: + print("Resolving service endpoint for partition with per partition automatic failover enabled.") + partition_failover_info = self.partition_range_to_failover_info[pk_range_wrapper] + if request.location_endpoint_to_route is not None: + if request.location_endpoint_to_route in partition_failover_info.unavailable_regional_endpoints: + # If the current region is unavailable, we try to move to the next available region + if not partition_failover_info.try_move_to_next_location( + self.compute_available_preferred_regions(request), + request): + logger.info("All available regions for partition are unavailable. 
Refreshing cache.") + # If no other region is available, we invalidate the cache and start once again from our + # main write region in the account configurations + self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() + request.clear_route_to_location() + return self._resolve_service_endpoint(request) + else: + # Update the current regional endpoint to whatever the request is routing to + partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route + else: + partition_failover_info = PartitionLevelFailoverInfo() + partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route + self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info + + return self._resolve_service_endpoint(request) + else: + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) + + def compute_available_preferred_regions( + self, + request: RequestObject + ) -> Set[str]: + """ + Computes the available regional endpoints for the request based on customer-set preferred and excluded regions. 
+ """ + excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations + preferred_locations = self.PreferredLocations + available_regions = [item for item in preferred_locations if item not in excluded_locations] + available_regional_endpoints = { + self.location_cache.account_read_regional_routing_contexts_by_location[region].primary_endpoint + for region in available_regions + } + return available_regional_endpoints diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py index 90578c63e5dd..5363a31f3b30 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py @@ -210,13 +210,11 @@ def get_ordered_read_locations(self): def _get_configured_excluded_locations(self, request: RequestObject) -> List[str]: # If excluded locations were configured on request, use request level excluded locations. excluded_locations = request.excluded_locations - if excluded_locations is None: + if len(excluded_locations) == 0: if self.connection_policy.ExcludedLocations: # If excluded locations were only configured on client(connection_policy), use client level # make copy of excluded locations to avoid modifying the original list excluded_locations = list(self.connection_policy.ExcludedLocations) - else: - excluded_locations = [] for excluded_location in request.excluded_locations_circuit_breaker: if excluded_location not in excluded_locations: excluded_locations.append(excluded_location) @@ -445,7 +443,7 @@ def update_location_cache(self, write_locations=None, read_locations=None, enabl ) def get_preferred_regional_routing_contexts( - self, endpoints_by_location, orderedLocations, expected_available_operation, fallback_endpoint + self, endpoints_by_location, ordered_locations, expected_available_operation, fallback_endpoint ): regional_endpoints = [] # if enableEndpointDiscovery is false, we always 
use the defaultEndpoint that @@ -475,7 +473,7 @@ def get_preferred_regional_routing_contexts( regional_endpoints.extend(unavailable_endpoints) else: - for location in orderedLocations: + for location in ordered_locations: if location and location in endpoints_by_location: # location is empty during manual failover regional_endpoint = endpoints_by_location[location] diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py index d20eedb40148..d43407a40a72 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py @@ -40,7 +40,7 @@ def __init__( self.location_index_to_route: Optional[int] = None self.location_endpoint_to_route: Optional[str] = None self.last_routed_location_endpoint_within_region: Optional[str] = None - self.excluded_locations: Optional[List[str]] = None + self.excluded_locations: List[str] = [] self.excluded_locations_circuit_breaker: List[str] = [] self.healthy_tentative_location: Optional[str] = None diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 91145ef217ba..50c26e87cb62 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -63,11 +63,12 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin :rtype: tuple of (dict, dict) """ pk_range_wrapper = None - if args and global_endpoint_manager.is_circuit_breaker_applicable(args[0]): + if args and (global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0]) or + global_endpoint_manager.is_circuit_breaker_applicable(args[0])): pk_range_wrapper = global_endpoint_manager.create_pk_range_wrapper(args[0]) # instantiate all retry policies here to be applied for each request execution endpointDiscovery_retry_policy = 
_endpoint_discovery_retry_policy.EndpointDiscoveryRetryPolicy( - client.connection_policy, global_endpoint_manager, *args + client.connection_policy, global_endpoint_manager, pk_range_wrapper, *args ) database_account_retry_policy = _database_account_retry_policy.DatabaseAccountRetryPolicy( client.connection_policy diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py index e41881429b20..bb338f443dca 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py @@ -107,8 +107,9 @@ def _Request(global_endpoint_manager, request_params, connection_policy, pipelin base_url = request_params.endpoint_override else: pk_range_wrapper = None - if global_endpoint_manager.is_circuit_breaker_applicable(request_params): - # Circuit breaker is applicable, so we need to use the endpoint from the request + if (global_endpoint_manager.is_circuit_breaker_applicable(request_params) or + global_endpoint_manager.is_per_partition_automatic_failover_applicable(request_params)): + # Circuit breaker or per-partition failover are applicable, so we need to use the endpoint from the request pk_range_wrapper = global_endpoint_manager.create_pk_range_wrapper(request_params) base_url = global_endpoint_manager.resolve_service_endpoint_for_partition(request_params, pk_range_wrapper) if not request.url.startswith(base_url): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py index b77ce1a69f13..70f6fdd2e299 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py @@ -27,8 +27,8 @@ def ShouldRetry(self, _exception): :returns: a boolean stating whether the request should be retried :rtype: bool """ - # we don't retry on write 
operations for timeouts or any internal server errors - if self.request and (not _OperationType.IsReadOnlyOperation(self.request.operation_type)): + if self.request and (not _OperationType.IsReadOnlyOperation(self.request.operation_type) and + not self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request)): return False if not self.connection_policy.EnableEndpointDiscovery: @@ -46,6 +46,13 @@ def ShouldRetry(self, _exception): # This function prepares the request to go to the next region def resolve_next_region_service_endpoint(self): + if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): + # If per partition automatic failover is applicable, we mark the current endpoint as unavailable + # and resolve the service endpoint for the partition range - otherwise, continue with the default retry logic + partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] + partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) + return self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) + # clear previous location-based routing directive self.request.clear_route_to_location() # clear the last routed endpoint within same region since we are going to a new region now diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/documents.py b/sdk/cosmos/azure-cosmos/azure/cosmos/documents.py index a0e55077aefa..3008bed9b349 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/documents.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/documents.py @@ -78,6 +78,7 @@ def __init__(self) -> None: self._WritableLocations: List[dict] = [] self._ReadableLocations: List[dict] = [] self._EnableMultipleWritableLocations = False + self._EnablePerPartitionFailoverBehavior = False @property def WritableLocations(self) -> List[Dict[Any, Any]]: diff --git a/sdk/cosmos/azure-cosmos/pytest.ini 
b/sdk/cosmos/azure-cosmos/pytest.ini index aabe78b51f08..a5e006ea027e 100644 --- a/sdk/cosmos/azure-cosmos/pytest.ini +++ b/sdk/cosmos/azure-cosmos/pytest.ini @@ -7,3 +7,4 @@ markers = cosmosMultiRegion: marks tests running on a Cosmos DB live account with multi-region and multi-write enabled. cosmosCircuitBreaker: marks tests running on Cosmos DB live account with per partition circuit breaker enabled and multi-write enabled. cosmosCircuitBreakerMultiRegion: marks tests running on Cosmos DB live account with one write region and multiple read regions and per partition circuit breaker enabled. + cosmosPerPartitionAutomaticFailover: marks tests running on Cosmos DB live account with one write region and multiple read regions and per partition automatic failover enabled. diff --git a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py index 2386e54fd882..127a21cb0bd4 100644 --- a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py +++ b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py @@ -26,7 +26,7 @@ import logging import sys from time import sleep -from typing import Callable, Optional, Any, Dict, List, MutableMapping +from typing import Callable, Optional, Any, Dict, List, Mapping, MutableMapping, Tuple, Sequence from azure.core.pipeline.transport import HttpRequest, HttpResponse from azure.core.pipeline.transport._requests_basic import RequestsTransport, RequestsTransportResponse @@ -63,8 +63,29 @@ def error_with_counter(self, error: Exception) -> Exception: self.counters[ERROR_WITH_COUNTER] += 1 return error - def add_fault(self, predicate: Callable[[HttpRequest], bool], fault_factory: Callable[[HttpRequest], Exception]): - self.faults.append({"predicate": predicate, "apply": fault_factory}) + def add_fault(self, + predicate: Callable[[HttpRequest], bool], + fault_factory: Callable[[HttpRequest], Exception], + max_inner_count: Optional[int] = None, + after_max_count: 
Optional[Callable[[HttpRequest], RequestsTransportResponse]] = None): + """ Adds a fault to the transport that will be applied when the predicate matches the request. + :param Callable predicate: A callable that takes an HttpRequest and returns True if the fault should be applied. + :param Callable fault_factory: A callable that takes an HttpRequest and returns an Exception to be raised. + :param int max_inner_count: Optional maximum number of times the fault can be applied for one request. + If None, the fault will be applied every time the predicate matches. + :param Callable after_max_count: Optional callable that takes an HttpRequest and returns a + RequestsTransportResponse. Used to return a different response after the maximum number of faults has + been applied. Can only be used if `max_inner_count` is not None. + """ + if max_inner_count is not None: + if after_max_count is not None: + self.faults.append({"predicate": predicate, "apply": fault_factory, "after_max_count": after_max_count, + "max_count": max_inner_count, "current_count": 0}) + else: + self.faults.append({"predicate": predicate, "apply": fault_factory, + "max_count": max_inner_count, "current_count": 0}) + else: + self.faults.append({"predicate": predicate, "apply": fault_factory}) def add_response_transformation(self, predicate: Callable[[HttpRequest], bool], response_transformation: Callable[[HttpRequest, Callable[[HttpRequest], RequestsTransportResponse]], RequestsTransportResponse]): self.responseTransformations.append({ @@ -85,6 +106,16 @@ def send(self, request: HttpRequest, *, proxies: Optional[MutableMapping[str, st # find the first fault Factory with matching predicate if any first_fault_factory = FaultInjectionTransport.__first_item(iter(self.faults), lambda f: f["predicate"](request)) if first_fault_factory: + if "max_count" in first_fault_factory: + FaultInjectionTransport.logger.info(f"Found fault factory with max count {first_fault_factory['max_count']}") + if 
first_fault_factory["current_count"] >= first_fault_factory["max_count"]: + first_fault_factory["current_count"] = 0 # reset counter + if "after_max_count" in first_fault_factory: + FaultInjectionTransport.logger.info("Max count reached, returning after_max_count") + return first_fault_factory["after_max_count"] + FaultInjectionTransport.logger.info("Max count reached, skipping fault injection") + return super().send(request, proxies=proxies, **kwargs) + first_fault_factory["current_count"] += 1 FaultInjectionTransport.logger.info("--> FaultInjectionTransport.ApplyFaultInjection") injected_error = first_fault_factory["apply"](request) FaultInjectionTransport.logger.info("Found to-be-injected error {}".format(injected_error)) @@ -132,12 +163,21 @@ def print_call_stack(): frame = frame.f_back @staticmethod - def predicate_req_payload_contains_id(r: HttpRequest, id_value: str): + def predicate_req_payload_contains_id(r: HttpRequest, id_value: str) -> bool: if r.body is None: return False return '"id":"{}"'.format(id_value) in r.body + @staticmethod + def predicate_req_payload_contains_field(r: HttpRequest, field_name: str, field_value: Optional[str]) -> bool: + if r.body is None: + return False + if field_value is None: + return '"{}":"'.format(field_name) in r.body + else: + return '"{}":"{}"'.format(field_name, field_value) in r.body + @staticmethod def predicate_req_for_document_with_id(r: HttpRequest, id_value: str) -> bool: return (FaultInjectionTransport.predicate_url_contains_id(r, id_value) @@ -163,15 +203,8 @@ def predicate_is_resource_type(r: HttpRequest, resource_type: str) -> bool: @staticmethod def predicate_is_operation_type(r: HttpRequest, operation_type: str) -> bool: is_operation_type = r.headers.get(HttpHeaders.ThinClientProxyOperationType) == operation_type - return is_operation_type - @staticmethod - def predicate_is_resource_type(r: HttpRequest, resource_type: str) -> bool: - is_resource_type = 
r.headers.get(HttpHeaders.ThinClientProxyResourceType) == resource_type - - return is_resource_type - @staticmethod def predicate_is_write_operation(r: HttpRequest, uri_prefix: str) -> bool: is_write_document_operation = documents._OperationType.IsWriteOperation( @@ -209,7 +242,8 @@ def error_service_response() -> Exception: def transform_topology_swr_mrr( write_region_name: str, read_region_name: str, - inner: Callable[[], RequestsTransportResponse]) -> RequestsTransportResponse: + inner: Callable[[], RequestsTransportResponse], + enable_per_partition_failover: bool = False) -> RequestsTransportResponse: response = inner() if not FaultInjectionTransport.predicate_is_database_account_call(response.request): @@ -225,6 +259,31 @@ def transform_topology_swr_mrr( writable_locations[0]["name"] = write_region_name readable_locations.append({"name": read_region_name, "databaseAccountEndpoint" : test_config.TestConfig.local_host}) FaultInjectionTransport.logger.info("Transformed Account Topology: {}".format(result)) + # TODO: PPAF - need to verify below behavior against actual Cosmos DB service response + if enable_per_partition_failover: + result["enablePerPartitionFailoverBehavior"] = True + request: HttpRequest = response.request + return FaultInjectionTransport.MockHttpResponse(request, 200, result) + + return response + + @staticmethod + def transform_topology_ppaf_enabled( + inner: Callable[[], RequestsTransportResponse], + enable_per_partition_failover: bool = False) -> RequestsTransportResponse: + + response = inner() + if not FaultInjectionTransport.predicate_is_database_account_call(response.request): + return response + + data = response.body() + if response.status_code == 200 and data: + data = data.decode("utf-8") + result = json.loads(data) + FaultInjectionTransport.logger.info("Transformed Account Topology: {}".format(result)) + # TODO: PPAF - need to verify below behavior against actual Cosmos DB service response + if enable_per_partition_failover: + 
result["enablePerPartitionFailoverBehavior"] = True request: HttpRequest = response.request return FaultInjectionTransport.MockHttpResponse(request, 200, result) @@ -267,8 +326,25 @@ def transform_topology_mwr( return response + class MockHttpRequest(HttpRequest): + def __init__( + self, + url: str, + method: str = "GET", + headers: Optional[Mapping[str, str]] = None, + files: Optional[Any] = None, + data: Optional[Any] = None, + ) -> None: + self.method = method + self.url = url + self.headers: Optional[MutableMapping[str, str]] = headers + self.files: Optional[Any] = files + self.data: Optional[Any] = data + self.multipart_mixed_info: Optional[ + Tuple[Sequence[Any], Sequence[Any], Optional[str], Dict[str, Any]]] = None + class MockHttpResponse(RequestsTransportResponse): - def __init__(self, request: HttpRequest, status_code: int, content:Optional[Dict[str, Any]]): + def __init__(self, request: HttpRequest, status_code: int, content: Optional[Any] = None): self.request: HttpRequest = request # This is actually never None, and set by all implementations after the call to # __init__ of this class. 
This class is also a legacy impl, so it's risky to change it diff --git a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py index 887be44f2273..4194fc5672b7 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py +++ b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py @@ -215,9 +215,9 @@ def test_get_applicable_regional_endpoints_excluded_regions(self, test_type): location_cache.perform_on_database_account_read(database_account) # Init requests and set excluded regions on requests - write_doc_request = RequestObject(ResourceType.Document, _OperationType.Create, None) + write_doc_request = RequestObject(ResourceType.Document, _OperationType.Create, {}) write_doc_request.excluded_locations = excluded_locations_on_requests - read_doc_request = RequestObject(ResourceType.Document, _OperationType.Read, None) + read_doc_request = RequestObject(ResourceType.Document, _OperationType.Read, {}) read_doc_request.excluded_locations = excluded_locations_on_requests # Test if read endpoints were correctly filtered on client level @@ -247,7 +247,7 @@ def test_set_excluded_locations_for_requests(self): options: Mapping[str, Any] = {"excludedLocations": excluded_locations} expected_excluded_locations = excluded_locations - read_doc_request = RequestObject(ResourceType.Document, _OperationType.Create, None) + read_doc_request = RequestObject(ResourceType.Document, _OperationType.Create, {}) read_doc_request.set_excluded_location_from_options(options) actual_excluded_locations = read_doc_request.excluded_locations assert actual_excluded_locations == expected_excluded_locations diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py new file mode 100644 index 000000000000..4b805e38c78d --- /dev/null +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -0,0 +1,191 @@ +# The MIT License 
(MIT) +# Copyright (c) Microsoft Corporation. All rights reserved. +import os +import unittest +import uuid +from time import sleep + +import pytest +from azure.core.exceptions import ServiceResponseError + +import test_config +from azure.cosmos import CosmosClient, _partition_health_tracker, _location_cache, PartitionKey +from azure.cosmos.exceptions import CosmosHttpResponseError +from _fault_injection_transport import FaultInjectionTransport +from test_per_partition_circuit_breaker_mm import create_doc, operations, REGION_1, \ + REGION_2, PK_VALUE, CREATE, UPSERT, REPLACE, DELETE, PATCH, BATCH, DELETE_ALL_ITEMS_BY_PARTITION_KEY + +@pytest.fixture(scope="class", autouse=True) +def setup_teardown(): + os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "True" + yield + os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "False" + + +def create_errors(): + errors = [] + error_codes = [403, 408, 500, 502, 503] + for error_code in error_codes: + if error_code == 403: + errors.append(CosmosHttpResponseError( + status_code=error_code, + message="Some injected error.", + sub_status=3)) + else: + errors.append(CosmosHttpResponseError( + status_code=error_code, + message="Some injected error.")) + return errors + +def write_operations_and_errors(): + write_operations = [CREATE, UPSERT, REPLACE, DELETE, PATCH, BATCH] + errors = create_errors() + params = [] + for write_operation in write_operations: + for error in errors: + params.append((write_operation, error)) + + return params + +def perform_write_operation(operation, container, fault_injection_container, doc_id, pk): + resp = None + doc = {'id': doc_id, + 'pk': pk, + 'name': 'sample document', + 'key': 'value'} + if operation == CREATE: + resp = fault_injection_container.create_item(body=doc) + elif operation == UPSERT: + resp = fault_injection_container.upsert_item(body=doc) + elif operation == REPLACE: + container.create_item(body=doc) + sleep(1) + new_doc = {'id': doc_id, + 'pk': 
pk, + 'name': 'sample document' + str(uuid), + 'key': 'value'} + resp = fault_injection_container.replace_item(item=doc['id'], body=new_doc) + elif operation == DELETE: + container.create_item(body=doc) + sleep(1) + resp = fault_injection_container.delete_item(item=doc['id'], partition_key=doc['pk']) + elif operation == PATCH: + container.create_item(body=doc) + sleep(1) + patch_ops = [{"op": "incr", "path": "/company", "value": 3}] + resp = fault_injection_container.patch_item(item=doc['id'], partition_key=doc['pk'], patch_operations=patch_ops) + elif operation == BATCH: + batch_operations = [ + ("create", (doc,)), + ("upsert", (doc,)), + ("upsert", (doc,)), + ("upsert", (doc,)), + ] + resp = fault_injection_container.execute_item_batch(batch_operations, partition_key=doc['pk']) + # this will need to be emulator only + elif operation == DELETE_ALL_ITEMS_BY_PARTITION_KEY: + container.create_item(body=doc) + resp = fault_injection_container.delete_all_items_by_partition_key(pk) + return resp + +# These tests assume that the configured live account has one main write region and one secondary read region. 
+ +@pytest.mark.cosmosPerPartitionAutomaticFailover +class TestPerPartitionAutomaticFailover: + host = test_config.TestConfig.host + master_key = test_config.TestConfig.masterKey + connectionPolicy = test_config.TestConfig.connectionPolicy + TEST_DATABASE_ID = "test_config.TestConfig.TEST_DATABASE_ID" + TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID + + def setup_method_with_custom_transport(self, custom_transport, default_endpoint=host, **kwargs): + container_id = kwargs.pop("container_id", None) + if not container_id: + container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID + client = CosmosClient(default_endpoint, self.master_key, consistency_level="Session", + preferred_locations=[REGION_1, REGION_2], + transport=custom_transport, **kwargs) + db = client.create_database_if_not_exists(self.TEST_DATABASE_ID) + container = db.create_container_if_not_exists(container_id, PartitionKey(path="/pk"),) + return {"client": client, "db": db, "col": container} + + def setup_info(self, error, max_count=None, is_batch=False, **kwargs): + custom_transport = FaultInjectionTransport() + # two documents targeted to same partition, one will always fail and the other will succeed + doc_fail_id = str(uuid.uuid4()) + doc_success_id = str(uuid.uuid4()) + predicate = lambda r: FaultInjectionTransport.predicate_req_for_document_with_id(r, doc_fail_id) + # The MockRequest only gets used to create the MockHttpResponse + mock_request = FaultInjectionTransport.MockHttpRequest(url=self.host) + if is_batch: + success_response = FaultInjectionTransport.MockHttpResponse(mock_request, 200, [{"statusCode": 200}],) + else: + success_response = FaultInjectionTransport.MockHttpResponse(mock_request, 200) + custom_transport.add_fault(predicate=predicate, fault_factory=error, max_inner_count=max_count, + after_max_count=success_response) + is_get_account_predicate = lambda r: FaultInjectionTransport.predicate_is_database_account_call(r) + # Set the 
database account response to have PPAF enabled + ppaf_enabled_database_account = \ + lambda r, inner: FaultInjectionTransport.transform_topology_ppaf_enabled( + inner=inner, + enable_per_partition_failover=True) + custom_transport.add_response_transformation( + is_get_account_predicate, + ppaf_enabled_database_account) + setup = self.setup_method_with_custom_transport(None, default_endpoint=self.host, **kwargs) + custom_setup = self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, **kwargs) + return setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate + + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors()) + def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): + # This test validates that the partition info cache is updated correctly upon failures, and that the + # per-partition automatic failover logic routes requests to the next available regional endpoint + error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, 1, write_operation == BATCH) + container = setup['col'] + fault_injection_container = custom_setup['col'] + global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager + + # Create a document to populate the per-partition GEM partition range info cache + fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, + 'name': 'sample document', 'key': 'value'}) + pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] + initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint + + # Based on our configuration, we should have had one error followed by a success - marking only the previous endpoint as unavailable + perform_write_operation( + write_operation, + 
container, + fault_injection_container, + doc_fail_id, + PK_VALUE) + partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + # Verify that the partition is marked as unavailable, and that the current regional endpoint is not the same + assert len(partition_info.unavailable_regional_endpoints) == 1 + assert initial_endpoint in partition_info.unavailable_regional_endpoints + assert initial_endpoint != partition_info.current_regional_endpoint # west us 3 != west us + + # Now we run another request to see how the cache gets updated + perform_write_operation( + write_operation, + container, + fault_injection_container, + str(uuid.uuid4()), + PK_VALUE) + partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + # Verify that the cache is empty, since the request going to the second regional endpoint failed + # Once we reach the point of all available regions being marked as unavailable, the cache is cleared + assert len(partition_info.unavailable_regional_endpoints) == 0 + assert initial_endpoint not in partition_info.unavailable_regional_endpoints + assert partition_info.current_regional_endpoint is None + + + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors()) + def test_ppaf_exclude_regions(self, write_operation, error): + # TODO: PPAF - finish this test + return + + + +if __name__ == '__main__': + unittest.main() diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py index f8e7369716a2..bda43946a544 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py @@ -15,7 +15,7 @@ from azure.cosmos.exceptions import CosmosHttpResponseError from _fault_injection_transport import FaultInjectionTransport from test_per_partition_circuit_breaker_mm import 
create_doc, write_operations_and_errors, operations, REGION_1, \ - REGION_2, PK_VALUE, perform_write_operation, perform_read_operation + REGION_2, PK_VALUE, perform_write_operation, perform_read_operation, READ, CREATE, validate_stats COLLECTION = "created_collection" @@ -38,6 +38,7 @@ def validate_unhealthy_partitions(global_endpoint_manager, assert unhealthy_partitions == expected_unhealthy_partitions @pytest.mark.cosmosCircuitBreakerMultiRegion +@pytest.mark.cosmosPerPartitionAutomaticFailover class TestPerPartitionCircuitBreakerSmMrr: host = test_config.TestConfig.host master_key = test_config.TestConfig.masterKey diff --git a/sdk/cosmos/azure-cosmos/tests/test_service_retry_policies_async.py b/sdk/cosmos/azure-cosmos/tests/test_service_retry_policies_async.py index b0b60ef7730c..208d9d4291e4 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_service_retry_policies_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_service_retry_policies_async.py @@ -10,7 +10,7 @@ from azure.core.exceptions import ServiceRequestError, ServiceResponseError import test_config -from azure.cosmos import DatabaseAccount, _location_cache +from azure.cosmos import DatabaseAccount from azure.cosmos._location_cache import RegionalRoutingContext from azure.cosmos.aio import CosmosClient, _retry_utility_async, _global_endpoint_manager_async from azure.cosmos.exceptions import CosmosHttpResponseError diff --git a/sdk/cosmos/live-platform-matrix.json b/sdk/cosmos/live-platform-matrix.json index 8edaff6fe68d..fb21bacff0e1 100644 --- a/sdk/cosmos/live-platform-matrix.json +++ b/sdk/cosmos/live-platform-matrix.json @@ -59,6 +59,23 @@ } } }, + { + "PerPartitionAutomaticFailoverTestConfig": { + "Ubuntu2004_313_ppaf": { + "OSVmImage": "env:LINUXVMIMAGE", + "Pool": "env:LINUXPOOL", + "PythonVersion": "3.13", + "CoverageArg": "--disablecov", + "TestSamples": "false", + "TestMarkArgument": "cosmosPerPartitionAutomaticFailover" + } + }, + "ArmConfig": { + "MultiRegion": { + "ArmTemplateParameters": "@{ 
defaultConsistencyLevel = 'Session'; enableMultipleRegions = $true; perPartitionAutomaticFailoverEnabled = 'True' }" + } + } + }, { "MacTestConfig": { "macos311_search_query": { diff --git a/sdk/cosmos/test-resources.bicep b/sdk/cosmos/test-resources.bicep index 735c1a0e66ee..f6088ec122a6 100644 --- a/sdk/cosmos/test-resources.bicep +++ b/sdk/cosmos/test-resources.bicep @@ -15,6 +15,9 @@ param location string = resourceGroup().location @description('Whether Per Partition Circuit Breaker should be enabled.') param circuitBreakerEnabled string = 'False' +@description('Whether Per Partition Automatic Failover should be enabled.') +param perPartitionAutomaticFailoverEnabled string = 'False' + @description('The api version to be used by Bicep to create resources') param apiVersion string = '2023-04-15' @@ -105,5 +108,6 @@ resource accountName_roleAssignmentId 'Microsoft.DocumentDB/databaseAccounts/sql } output AZURE_COSMOS_ENABLE_CIRCUIT_BREAKER string = circuitBreakerEnabled +output AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER string = perPartitionAutomaticFailoverEnabled output ACCOUNT_HOST string = reference(resourceId, apiVersion).documentEndpoint output ACCOUNT_KEY string = listKeys(resourceId, apiVersion).primaryMasterKey From b8228e7e7c91dd3f34a026da24c48db9008b6058 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Sun, 15 Jun 2025 20:50:39 -0400 Subject: [PATCH 02/68] async changes --- .../azure/cosmos/_cosmos_client_connection.py | 11 +- ...anager_per_partition_automatic_failover.py | 22 ++- .../cosmos/_timeout_failover_retry_policy.py | 5 +- .../azure/cosmos/aio/_asynchronous_request.py | 5 +- .../aio/_cosmos_client_connection_async.py | 11 +- ..._endpoint_manager_circuit_breaker_async.py | 2 +- ..._per_partition_automatic_failover_async.py | 149 ++++++++++++++++++ .../azure/cosmos/aio/_retry_utility_async.py | 5 +- .../tests/_fault_injection_transport.py | 12 +- 
.../tests/_fault_injection_transport_async.py | 57 ++++++- .../test_per_partition_automatic_failover.py | 75 ++------- ..._per_partition_automatic_failover_async.py | 136 ++++++++++++++++ .../test_per_partition_circuit_breaker_mm.py | 11 +- ..._per_partition_circuit_breaker_mm_async.py | 4 +- ..._partition_circuit_breaker_sm_mrr_async.py | 1 + 15 files changed, 402 insertions(+), 104 deletions(-) create mode 100644 sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py create mode 100644 sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py index aeda60b03ce3..c135277f6c9d 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py @@ -2621,15 +2621,16 @@ def GetDatabaseAccount( database_account._ReadableLocations = result[Constants.ReadableLocations] if Constants.EnableMultipleWritableLocations in result: database_account._EnableMultipleWritableLocations = result[ - Constants.EnableMultipleWritableLocations - ] - # TODO: PPAF - Verify that this is the correct variable from the service - if Constants.EnablePerPartitionFailoverBehavior in result: - database_account._EnablePerPartitionFailoverBehavior = result[Constants.EnablePerPartitionFailoverBehavior] + Constants.EnableMultipleWritableLocations] self.UseMultipleWriteLocations = ( self.connection_policy.UseMultipleWriteLocations and database_account._EnableMultipleWritableLocations ) + + # TODO: Verify that this is the correct variable from the service + if Constants.EnablePerPartitionFailoverBehavior in result: + database_account._EnablePerPartitionFailoverBehavior = result[Constants.EnablePerPartitionFailoverBehavior] + if response_hook: response_hook(last_response_headers, result) return 
database_account diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 03a3080cdefe..bbce867d4b53 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -8,7 +8,7 @@ import os import threading -from typing import Dict, List, Set, TYPE_CHECKING, Optional +from typing import Dict, Set, TYPE_CHECKING, Optional from azure.cosmos.http_constants import ResourceType from azure.cosmos._constants import _Constants as Constants @@ -24,10 +24,12 @@ logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") +# pylint: disable=name-too-long, protected-access + class PartitionLevelFailoverInfo: """ Holds information about the partition level regional failover. - It is used to track the partition key range and the regions where it is available. + Used to track the partition key range and the regions where it is available. 
""" def __init__(self): self.unavailable_regional_endpoints = set() @@ -73,11 +75,13 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) or _OperationType.IsReadOnlyOperation(request.operation_type)): return False - per_partition_automatic_failover_config_enabled = os.environ.get(Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG, - Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG_DEFAULT).lower() == "true" + per_partition_automatic_failover_config_enabled = ( + os.environ.get(Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG, + Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG_DEFAULT).lower() == "true") - # TODO: PPAF - This check here needs to be verified once we test against a live account with the config enabled. - if not per_partition_automatic_failover_config_enabled or not self._database_account_cache._EnablePerPartitionFailoverBehavior: + # TODO: This check here needs to be verified once we test against a live account with the config enabled. 
+ if (not per_partition_automatic_failover_config_enabled or + not self._database_account_cache._EnablePerPartitionFailoverBehavior): return False # if we have at most one region available in the account, we cannot do per partition automatic failover @@ -100,7 +104,7 @@ def resolve_service_endpoint_for_partition( if self.is_per_partition_automatic_failover_applicable(request) and pk_range_wrapper: # If per partition automatic failover is applicable, we check partition unavailability if pk_range_wrapper in self.partition_range_to_failover_info: - print("Resolving service endpoint for partition with per partition automatic failover enabled.") + logger.info("Resolving service endpoint for partition with per partition automatic failover enabled.") partition_failover_info = self.partition_range_to_failover_info[pk_range_wrapper] if request.location_endpoint_to_route is not None: if request.location_endpoint_to_route in partition_failover_info.unavailable_regional_endpoints: @@ -121,7 +125,6 @@ def resolve_service_endpoint_for_partition( partition_failover_info = PartitionLevelFailoverInfo() partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info - return self._resolve_service_endpoint(request) else: return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) @@ -132,6 +135,9 @@ def compute_available_preferred_regions( ) -> Set[str]: """ Computes the available regional endpoints for the request based on customer-set preferred and excluded regions. + :param RequestObject request: The request object containing the routing context. + :return: A set of available regional endpoints. 
+ :rtype: Set[str] """ excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations preferred_locations = self.PreferredLocations diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py index 70f6fdd2e299..8e60e0f7dcf6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py @@ -48,10 +48,11 @@ def ShouldRetry(self, _exception): def resolve_next_region_service_endpoint(self): if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): # If per partition automatic failover is applicable, we mark the current endpoint as unavailable - # and resolve the service endpoint for the partition range - otherwise, continue with the default retry logic + # and resolve the service endpoint for the partition range - otherwise, continue with default retry logic partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) - return self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) + return self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, + self.pk_range_wrapper) # clear previous location-based routing directive self.request.clear_route_to_location() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 79e674eaa31c..1cd2a22039b4 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -76,8 +76,9 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p 
base_url = request_params.endpoint_override else: pk_range_wrapper = None - if global_endpoint_manager.is_circuit_breaker_applicable(request_params): - # Circuit breaker is applicable, so we need to use the endpoint from the request + if (global_endpoint_manager.is_circuit_breaker_applicable(request_params) or + global_endpoint_manager.is_per_partition_automatic_failover_applicable(request_params)): + # Circuit breaker or per-partition failover are applicable, so we need to use the endpoint from the request pk_range_wrapper = await global_endpoint_manager.create_pk_range_wrapper(request_params) base_url = global_endpoint_manager.resolve_service_endpoint_for_partition(request_params, pk_range_wrapper) if not request.url.startswith(base_url): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_cosmos_client_connection_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_cosmos_client_connection_async.py index cbcd3ccafba7..a56cb5777406 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_cosmos_client_connection_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_cosmos_client_connection_async.py @@ -48,8 +48,8 @@ DistributedTracingPolicy, ProxyPolicy) from azure.core.utils import CaseInsensitiveDict -from azure.cosmos.aio._global_partition_endpoint_manager_circuit_breaker_async import ( - _GlobalPartitionEndpointManagerForCircuitBreakerAsync) +from azure.cosmos.aio._global_partition_endpoint_manager_per_partition_automatic_failover_async import ( + _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync) from .. import _base as base from .._base import _build_properties_cache @@ -174,7 +174,7 @@ def __init__( # pylint: disable=too-many-statements # Keeps the latest response headers from the server. 
self.last_response_headers: CaseInsensitiveDict = CaseInsensitiveDict() self.UseMultipleWriteLocations = False - self._global_endpoint_manager = _GlobalPartitionEndpointManagerForCircuitBreakerAsync(self) + self._global_endpoint_manager = _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync(self) retry_policy = None if isinstance(self.connection_policy.ConnectionRetryConfiguration, AsyncHTTPPolicy): @@ -452,6 +452,11 @@ async def GetDatabaseAccount( self.UseMultipleWriteLocations = ( self.connection_policy.UseMultipleWriteLocations and database_account._EnableMultipleWritableLocations ) + + # TODO: Verify that this is the correct variable from the service + if Constants.EnablePerPartitionFailoverBehavior in result: + database_account._EnablePerPartitionFailoverBehavior = result[Constants.EnablePerPartitionFailoverBehavior] + return database_account async def _GetDatabaseAccountCheck( diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_circuit_breaker_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_circuit_breaker_async.py index 78e8b551ee7a..0150fd1dd025 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_circuit_breaker_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_circuit_breaker_async.py @@ -101,7 +101,7 @@ async def record_failure( if pk_range_wrapper: self.global_partition_endpoint_manager_core.record_failure(request, pk_range_wrapper) - def resolve_service_endpoint_for_partition( + def _resolve_service_endpoint_for_partition_circuit_breaker( self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py new file mode 
100644 index 000000000000..a9b974483614 --- /dev/null +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -0,0 +1,149 @@ +# The MIT License (MIT) +# Copyright (c) 2025 Microsoft Corporation + +"""Class for global endpoint manager for per partition automatic failover. This class inherits the circuit breaker +endpoint manager, since enabling per partition automatic failover also enables the circuit breaker logic. +""" +import logging +import os +import threading + +from typing import Dict, Set, TYPE_CHECKING, Optional + +from azure.cosmos.http_constants import ResourceType +from azure.cosmos._constants import _Constants as Constants +from azure.cosmos.aio._global_partition_endpoint_manager_circuit_breaker_async import \ + _GlobalPartitionEndpointManagerForCircuitBreakerAsync +from azure.cosmos.documents import _OperationType + +from azure.cosmos._request_object import RequestObject +from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper + +if TYPE_CHECKING: + from azure.cosmos._cosmos_client_connection import CosmosClientConnection + +logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") + +# pylint: disable=name-too-long, protected-access + +class PartitionLevelFailoverInfo: + """ + Holds information about the partition level regional failover. + Used to track the partition key range and the regions where it is available. 
+ """ + def __init__(self): + self.unavailable_regional_endpoints = set() + self.current_regional_endpoint = None + self._lock = threading.Lock() + + def try_move_to_next_location(self, available_account_regional_endpoints: Set[str], request: RequestObject) -> bool: + with self._lock: + failed_regional_endpoint = request.location_endpoint_to_route + if failed_regional_endpoint != self.current_regional_endpoint: + logger.info("Moving to next available regional endpoint: %s", self.current_regional_endpoint) + request.route_to_location(self.current_regional_endpoint) + return True + + for regional_endpoint in available_account_regional_endpoints: + if regional_endpoint == self.current_regional_endpoint: + continue + + if regional_endpoint in self.unavailable_regional_endpoints: + continue + + self.current_regional_endpoint = regional_endpoint + logger.info("Moving to next available regional endpoint: %s", self.current_regional_endpoint) + request.route_to_location(self.current_regional_endpoint) + return True + + return False + +class _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync(_GlobalPartitionEndpointManagerForCircuitBreakerAsync): + """ + This internal class implements the logic for partition endpoint management for + geo-replicated database accounts. 
+ """ + def __init__(self, client: "CosmosClientConnection"): + super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, self).__init__(client) + self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} + + def is_per_partition_automatic_failover_applicable(self, request: RequestObject) -> bool: + if not request: + return False + + if (self.location_cache.can_use_multiple_write_locations_for_request(request) + or _OperationType.IsReadOnlyOperation(request.operation_type)): + return False + + per_partition_automatic_failover_config_enabled = ( + os.environ.get(Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG, + Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG_DEFAULT).lower() == "true") + + # TODO: This check here needs to be verified once we test against a live account with the config enabled. + if (not per_partition_automatic_failover_config_enabled or + not self._database_account_cache._EnablePerPartitionFailoverBehavior): + return False + + # if we have at most one region available in the account, we cannot do per partition automatic failover + available_regions = self.compute_available_preferred_regions(request) + if len(available_regions) <= 1: + return False + + # if the request is not for a document or if the request is not executing a stored procedure, return False + if (request.resource_type != ResourceType.Document and + request.operation_type != _OperationType.ExecuteJavaScript): + return False + + return True + + def resolve_service_endpoint_for_partition( + self, + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] + ) -> str: + if self.is_per_partition_automatic_failover_applicable(request) and pk_range_wrapper: + # If per partition automatic failover is applicable, we check partition unavailability + if pk_range_wrapper in self.partition_range_to_failover_info: + logger.info("Resolving service endpoint for partition with per partition 
automatic failover enabled.") + partition_failover_info = self.partition_range_to_failover_info[pk_range_wrapper] + if request.location_endpoint_to_route is not None: + if request.location_endpoint_to_route in partition_failover_info.unavailable_regional_endpoints: + # If the current region is unavailable, we try to move to the next available region + if not partition_failover_info.try_move_to_next_location( + self.compute_available_preferred_regions(request), + request): + logger.info("All available regions for partition are unavailable. Refreshing cache.") + # If no other region is available, we invalidate the cache and start once again from our + # main write region in the account configurations + self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() + request.clear_route_to_location() + return self._resolve_service_endpoint(request) + else: + # Update the current regional endpoint to whatever the request is routing to + partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route + else: + partition_failover_info = PartitionLevelFailoverInfo() + partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route + self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info + return self._resolve_service_endpoint(request) + else: + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) + + def compute_available_preferred_regions( + self, + request: RequestObject + ) -> Set[str]: + """ + Computes the available regional endpoints for the request based on customer-set preferred and excluded regions. + :param RequestObject request: The request object containing the routing context. + :return: A set of available regional endpoints. 
+ :rtype: Set[str] + """ + excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations + preferred_locations = self.PreferredLocations + available_regions = [item for item in preferred_locations if item not in excluded_locations] + available_regional_endpoints = { + self.location_cache.account_read_regional_routing_contexts_by_location[region].primary_endpoint + for region in available_regions + } + return available_regional_endpoints diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 33b9c0785b38..7884f9060183 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -64,11 +64,12 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg :rtype: tuple of (dict, dict) """ pk_range_wrapper = None - if args and global_endpoint_manager.is_circuit_breaker_applicable(args[0]): + if args and (global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0]) or + global_endpoint_manager.is_circuit_breaker_applicable(args[0])): pk_range_wrapper = await global_endpoint_manager.create_pk_range_wrapper(args[0]) # instantiate all retry policies here to be applied for each request execution endpointDiscovery_retry_policy = _endpoint_discovery_retry_policy.EndpointDiscoveryRetryPolicy( - client.connection_policy, global_endpoint_manager, *args + client.connection_policy, global_endpoint_manager, pk_range_wrapper, *args ) database_account_retry_policy = _database_account_retry_policy.DatabaseAccountRetryPolicy( client.connection_policy diff --git a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py index 127a21cb0bd4..705f5e4090ee 100644 --- a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py +++ 
b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py @@ -259,7 +259,7 @@ def transform_topology_swr_mrr( writable_locations[0]["name"] = write_region_name readable_locations.append({"name": read_region_name, "databaseAccountEndpoint" : test_config.TestConfig.local_host}) FaultInjectionTransport.logger.info("Transformed Account Topology: {}".format(result)) - # TODO: PPAF - need to verify below behavior against actual Cosmos DB service response + # TODO: need to verify below behavior against actual Cosmos DB service response if enable_per_partition_failover: result["enablePerPartitionFailoverBehavior"] = True request: HttpRequest = response.request @@ -268,9 +268,8 @@ def transform_topology_swr_mrr( return response @staticmethod - def transform_topology_ppaf_enabled( - inner: Callable[[], RequestsTransportResponse], - enable_per_partition_failover: bool = False) -> RequestsTransportResponse: + def transform_topology_ppaf_enabled( # cspell:disable-line + inner: Callable[[], RequestsTransportResponse]) -> RequestsTransportResponse: response = inner() if not FaultInjectionTransport.predicate_is_database_account_call(response.request): @@ -280,10 +279,9 @@ def transform_topology_ppaf_enabled( if response.status_code == 200 and data: data = data.decode("utf-8") result = json.loads(data) + # TODO: need to verify below behavior against actual Cosmos DB service response + result["enablePerPartitionFailoverBehavior"] = True FaultInjectionTransport.logger.info("Transformed Account Topology: {}".format(result)) - # TODO: PPAF - need to verify below behavior against actual Cosmos DB service response - if enable_per_partition_failover: - result["enablePerPartitionFailoverBehavior"] = True request: HttpRequest = response.request return FaultInjectionTransport.MockHttpResponse(request, 200, result) diff --git a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport_async.py b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport_async.py index 
994357323b81..08d0c47e15a5 100644 --- a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport_async.py +++ b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport_async.py @@ -60,8 +60,29 @@ async def error_with_counter(self, error: Exception) -> Exception: self.counters[ERROR_WITH_COUNTER] += 1 return error - def add_fault(self, predicate: Callable[[HttpRequest], bool], fault_factory: Callable[[HttpRequest], Awaitable[Exception]]): - self.faults.append({"predicate": predicate, "apply": fault_factory}) + def add_fault(self, + predicate: Callable[[HttpRequest], bool], + fault_factory: Callable[[HttpRequest], Awaitable[Exception]], + max_inner_count: Optional[int] = None, + after_max_count: Optional[Callable[[HttpRequest], AioHttpTransportResponse]] = None): + """ Adds a fault to the transport that will be applied when the predicate matches the request. + :param Callable predicate: A callable that takes an HttpRequest and returns True if the fault should be applied. + :param Callable fault_factory: A callable that takes an HttpRequest and returns an Exception to be raised. + :param int max_inner_count: Optional maximum number of times the fault can be applied for one request. + If None, the fault will be applied every time the predicate matches. + :param Callable after_max_count: Optional callable that takes an HttpRequest and returns a + AioHttpTransportResponse. Used to return a different response after the maximum number of faults has + been applied. Can only be used if `max_inner_count` is not None. 
+ """ + if max_inner_count is not None: + if after_max_count is not None: + self.faults.append({"predicate": predicate, "apply": fault_factory, "after_max_count": after_max_count, + "max_count": max_inner_count, "current_count": 0}) + else: + self.faults.append({"predicate": predicate, "apply": fault_factory, + "max_count": max_inner_count, "current_count": 0}) + else: + self.faults.append({"predicate": predicate, "apply": fault_factory}) def add_response_transformation(self, predicate: Callable[[HttpRequest], bool], response_transformation: Callable[[HttpRequest, Callable[[HttpRequest], AioHttpTransportResponse]], AioHttpTransportResponse]): self.responseTransformations.append({ @@ -82,6 +103,16 @@ async def send(self, request: HttpRequest, *, stream: bool = False, proxies: Opt # find the first fault Factory with matching predicate if any first_fault_factory = FaultInjectionTransportAsync.__first_item(iter(self.faults), lambda f: f["predicate"](request)) if first_fault_factory: + if "max_count" in first_fault_factory: + FaultInjectionTransportAsync.logger.info(f"Found fault factory with max count {first_fault_factory['max_count']}") + if first_fault_factory["current_count"] >= first_fault_factory["max_count"]: + first_fault_factory["current_count"] = 0 # reset counter + if "after_max_count" in first_fault_factory: + FaultInjectionTransportAsync.logger.info("Max count reached, returning after_max_count") + return first_fault_factory["after_max_count"] + FaultInjectionTransportAsync.logger.info("Max count reached, skipping fault injection") + return await super().send(request, proxies=proxies, **config) + first_fault_factory["current_count"] += 1 FaultInjectionTransportAsync.logger.info("--> FaultInjectionTransportAsync.ApplyFaultInjection") injected_error = await first_fault_factory["apply"](request) FaultInjectionTransportAsync.logger.info("Found to-be-injected error {}".format(injected_error)) @@ -222,6 +253,26 @@ async def transform_topology_swr_mrr( return 
response + @staticmethod + async def transform_topology_ppaf_enabled( # cspell:disable-line + inner: Callable[[], Awaitable[AioHttpTransportResponse]]) -> AioHttpTransportResponse: + + response = await inner() + if not FaultInjectionTransportAsync.predicate_is_database_account_call(response.request): + return response + + data = response.body() + if response.status_code == 200 and data: + data = data.decode("utf-8") + result = json.loads(data) + # TODO: need to verify below behavior against actual Cosmos DB service response + result["enablePerPartitionFailoverBehavior"] = True + FaultInjectionTransportAsync.logger.info("Transformed Account Topology: {}".format(result)) + request: HttpRequest = response.request + return FaultInjectionTransportAsync.MockHttpResponse(request, 200, result) + + return response + @staticmethod async def transform_topology_mwr( first_region_name: str, @@ -260,7 +311,7 @@ async def transform_topology_mwr( return response class MockHttpResponse(AioHttpTransportResponse): - def __init__(self, request: HttpRequest, status_code: int, content:Optional[Dict[str, Any]]): + def __init__(self, request: HttpRequest, status_code: int, content: Optional[Any]=None): self.request: HttpRequest = request # This is actually never None, and set by all implementations after the call to # __init__ of this class. 
This class is also a legacy impl, so it's risky to change it diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 4b805e38c78d..3c88e8c59911 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -3,17 +3,17 @@ import os import unittest import uuid -from time import sleep import pytest -from azure.core.exceptions import ServiceResponseError import test_config -from azure.cosmos import CosmosClient, _partition_health_tracker, _location_cache, PartitionKey +from azure.cosmos import CosmosClient from azure.cosmos.exceptions import CosmosHttpResponseError from _fault_injection_transport import FaultInjectionTransport -from test_per_partition_circuit_breaker_mm import create_doc, operations, REGION_1, \ - REGION_2, PK_VALUE, CREATE, UPSERT, REPLACE, DELETE, PATCH, BATCH, DELETE_ALL_ITEMS_BY_PARTITION_KEY +from test_per_partition_circuit_breaker_mm import (REGION_1, REGION_2, PK_VALUE, BATCH, + write_operations_and_errors, perform_write_operation) + +# cspell:disable-file @pytest.fixture(scope="class", autouse=True) def setup_teardown(): @@ -37,57 +37,6 @@ def create_errors(): message="Some injected error.")) return errors -def write_operations_and_errors(): - write_operations = [CREATE, UPSERT, REPLACE, DELETE, PATCH, BATCH] - errors = create_errors() - params = [] - for write_operation in write_operations: - for error in errors: - params.append((write_operation, error)) - - return params - -def perform_write_operation(operation, container, fault_injection_container, doc_id, pk): - resp = None - doc = {'id': doc_id, - 'pk': pk, - 'name': 'sample document', - 'key': 'value'} - if operation == CREATE: - resp = fault_injection_container.create_item(body=doc) - elif operation == UPSERT: - resp = fault_injection_container.upsert_item(body=doc) - elif operation 
== REPLACE: - container.create_item(body=doc) - sleep(1) - new_doc = {'id': doc_id, - 'pk': pk, - 'name': 'sample document' + str(uuid), - 'key': 'value'} - resp = fault_injection_container.replace_item(item=doc['id'], body=new_doc) - elif operation == DELETE: - container.create_item(body=doc) - sleep(1) - resp = fault_injection_container.delete_item(item=doc['id'], partition_key=doc['pk']) - elif operation == PATCH: - container.create_item(body=doc) - sleep(1) - patch_ops = [{"op": "incr", "path": "/company", "value": 3}] - resp = fault_injection_container.patch_item(item=doc['id'], partition_key=doc['pk'], patch_operations=patch_ops) - elif operation == BATCH: - batch_operations = [ - ("create", (doc,)), - ("upsert", (doc,)), - ("upsert", (doc,)), - ("upsert", (doc,)), - ] - resp = fault_injection_container.execute_item_batch(batch_operations, partition_key=doc['pk']) - # this will need to be emulator only - elif operation == DELETE_ALL_ITEMS_BY_PARTITION_KEY: - container.create_item(body=doc) - resp = fault_injection_container.delete_all_items_by_partition_key(pk) - return resp - # These tests assume that the configured live account has one main write region and one secondary read region. 
@pytest.mark.cosmosPerPartitionAutomaticFailover @@ -105,8 +54,8 @@ def setup_method_with_custom_transport(self, custom_transport, default_endpoint= client = CosmosClient(default_endpoint, self.master_key, consistency_level="Session", preferred_locations=[REGION_1, REGION_2], transport=custom_transport, **kwargs) - db = client.create_database_if_not_exists(self.TEST_DATABASE_ID) - container = db.create_container_if_not_exists(container_id, PartitionKey(path="/pk"),) + db = client.get_database_client(self.TEST_DATABASE_ID) + container = db.get_container_client(container_id) return {"client": client, "db": db, "col": container} def setup_info(self, error, max_count=None, is_batch=False, **kwargs): @@ -126,9 +75,7 @@ def setup_info(self, error, max_count=None, is_batch=False, **kwargs): is_get_account_predicate = lambda r: FaultInjectionTransport.predicate_is_database_account_call(r) # Set the database account response to have PPAF enabled ppaf_enabled_database_account = \ - lambda r, inner: FaultInjectionTransport.transform_topology_ppaf_enabled( - inner=inner, - enable_per_partition_failover=True) + lambda r, inner: FaultInjectionTransport.transform_topology_ppaf_enabled(inner=inner) custom_transport.add_response_transformation( is_get_account_predicate, ppaf_enabled_database_account) @@ -136,7 +83,7 @@ def setup_info(self, error, max_count=None, is_batch=False, **kwargs): custom_setup = self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, **kwargs) return setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors()) + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): # This test validates that the partition info cache is updated correctly upon failures, and that the # per-partition automatic failover logic 
routes requests to the next available regional endpoint @@ -180,9 +127,9 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): assert partition_info.current_regional_endpoint is None - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors()) + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) def test_ppaf_exclude_regions(self, write_operation, error): - # TODO: PPAF - finish this test + # TODO: finish this test return diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py new file mode 100644 index 000000000000..7a133d1f34cc --- /dev/null +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -0,0 +1,136 @@ +# The MIT License (MIT) +# Copyright (c) Microsoft Corporation. All rights reserved. +import os +import unittest +import uuid + +import asyncio +import pytest +from typing import Dict, Any + +import test_config +from azure.core.pipeline.transport._aiohttp import AioHttpTransport +from azure.cosmos.aio import CosmosClient +from _fault_injection_transport import FaultInjectionTransport +from _fault_injection_transport_async import FaultInjectionTransportAsync +from test_per_partition_automatic_failover import create_errors +from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors +from test_per_partition_circuit_breaker_mm_async import perform_write_operation + +# cspell:disable-file + +@pytest.fixture(scope="class", autouse=True) +def setup_teardown(): + os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "True" + yield + os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "False" + +# These tests assume that the configured live account has one main write region and one secondary read region. 
+ +@pytest.mark.cosmosPerPartitionAutomaticFailover +@pytest.mark.asyncio +class TestPerPartitionAutomaticFailoverAsync: + host = test_config.TestConfig.host + master_key = test_config.TestConfig.masterKey + connectionPolicy = test_config.TestConfig.connectionPolicy + TEST_DATABASE_ID = "test_config.TestConfig.TEST_DATABASE_ID" + TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID + + async def setup_method_with_custom_transport(self, custom_transport: AioHttpTransport, default_endpoint=host, **kwargs): + container_id = kwargs.pop("container_id", None) + if not container_id: + container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID + client = CosmosClient(default_endpoint, self.master_key, consistency_level="Session", + preferred_locations=[REGION_1, REGION_2], + transport=custom_transport, **kwargs) + db = client.get_database_client(self.TEST_DATABASE_ID) + container = db.get_container_client(container_id) + return {"client": client, "db": db, "col": container} + + @staticmethod + async def cleanup_method(initialized_objects: Dict[str, Any]): + method_client: CosmosClient = initialized_objects["client"] + await method_client.close() + + async def setup_info(self, error, max_count=None, is_batch=False, **kwargs): + custom_transport = FaultInjectionTransportAsync() + # two documents targeted to same partition, one will always fail and the other will succeed + doc_fail_id = str(uuid.uuid4()) + doc_success_id = str(uuid.uuid4()) + predicate = lambda r: FaultInjectionTransportAsync.predicate_req_for_document_with_id(r, doc_fail_id) + # The MockRequest only gets used to create the MockHttpResponse + mock_request = FaultInjectionTransport.MockHttpRequest(url=self.host) + if is_batch: + success_response = FaultInjectionTransportAsync.MockHttpResponse(mock_request, 200, [{"statusCode": 200}],) + else: + success_response = FaultInjectionTransportAsync.MockHttpResponse(mock_request, 200) + custom_transport.add_fault(predicate=predicate, 
fault_factory=error, max_inner_count=max_count, + after_max_count=success_response) + is_get_account_predicate = lambda r: FaultInjectionTransportAsync.predicate_is_database_account_call(r) + # Set the database account response to have PPAF enabled + ppaf_enabled_database_account = \ + lambda r, inner: FaultInjectionTransportAsync.transform_topology_ppaf_enabled(inner=inner) + custom_transport.add_response_transformation( + is_get_account_predicate, + ppaf_enabled_database_account) + setup = await self.setup_method_with_custom_transport(None, default_endpoint=self.host, **kwargs) + custom_setup = await self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, **kwargs) + return setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate + + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) + async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation, error): + os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "True" + # This test validates that the partition info cache is updated correctly upon failures, and that the + # per-partition automatic failover logic routes requests to the next available regional endpoint + error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay( + 0, + error + )) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, 1, write_operation == BATCH) + container = setup['col'] + fault_injection_container = custom_setup['col'] + global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager + + # Create a document to populate the per-partition GEM partition range info cache + await fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, + 'name': 'sample document', 'key': 'value'}) + pk_range_wrapper = 
list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] + initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint + + # Based on our configuration, we should have had one error followed by a success - marking only the previous endpoint as unavailable + await perform_write_operation( + write_operation, + container, + fault_injection_container, + doc_fail_id, + PK_VALUE) + partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + # Verify that the partition is marked as unavailable, and that the current regional endpoint is not the same + assert len(partition_info.unavailable_regional_endpoints) == 1 + assert initial_endpoint in partition_info.unavailable_regional_endpoints + assert initial_endpoint != partition_info.current_regional_endpoint # west us 3 != west us + + # Now we run another request to see how the cache gets updated + await perform_write_operation( + write_operation, + container, + fault_injection_container, + str(uuid.uuid4()), + PK_VALUE) + partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + # Verify that the cache is empty, since the request going to the second regional endpoint failed + # Once we reach the point of all available regions being marked as unavailable, the cache is cleared + assert len(partition_info.unavailable_regional_endpoints) == 0 + assert initial_endpoint not in partition_info.unavailable_regional_endpoints + assert partition_info.current_regional_endpoint is None + + + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) + async def test_ppaf_exclude_regions_async(self, write_operation, error): + # TODO: finish this test + return + + + +if __name__ == '__main__': + unittest.main() diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py 
b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py index 8e0d7f2a3402..3ed516a0a59c 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py @@ -50,9 +50,9 @@ def read_operations_and_errors(): return params -def write_operations_and_errors(): +def write_operations_and_errors(error_list=None): write_operations = [CREATE, UPSERT, REPLACE, DELETE, PATCH, BATCH] - errors = create_errors() + errors = error_list or create_errors() params = [] for write_operation in write_operations: for error in errors: @@ -69,7 +69,7 @@ def operations(): return operations -def create_errors(): +def create_errors(errors=None): errors = [] error_codes = [408, 500, 502, 503] for error_code in error_codes: @@ -97,7 +97,8 @@ def validate_unhealthy_partitions(global_endpoint_manager, def validate_response_uri(response, expected_uri): request = response.get_response_headers()["_request"] assert request.url.startswith(expected_uri) -def perform_write_operation(operation, container, fault_injection_container, doc_id, pk, expected_uri): + +def perform_write_operation(operation, container, fault_injection_container, doc_id, pk, expected_uri=None): doc = {'id': doc_id, 'pk': pk, 'name': 'sample document', @@ -135,7 +136,7 @@ def perform_write_operation(operation, container, fault_injection_container, doc elif operation == DELETE_ALL_ITEMS_BY_PARTITION_KEY: container.create_item(body=doc) resp = fault_injection_container.delete_all_items_by_partition_key(pk) - if resp: + if resp and expected_uri: validate_response_uri(resp, expected_uri) def perform_read_operation(operation, container, doc_id, pk, expected_uri): diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py index 8f4da45908f6..e67e3eb4e265 100644 --- 
a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py @@ -23,7 +23,7 @@ COLLECTION = "created_collection" -async def perform_write_operation(operation, container, fault_injection_container, doc_id, pk, expected_uri): +async def perform_write_operation(operation, container, fault_injection_container, doc_id, pk, expected_uri=None): doc = {'id': doc_id, 'pk': pk, 'name': 'sample document', @@ -61,7 +61,7 @@ async def perform_write_operation(operation, container, fault_injection_containe elif operation == DELETE_ALL_ITEMS_BY_PARTITION_KEY: await container.create_item(body=doc) resp = await fault_injection_container.delete_all_items_by_partition_key(pk) - if resp: + if resp and expected_uri: validate_response_uri(resp, expected_uri) async def perform_read_operation(operation, container, doc_id, pk, expected_uri): diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py index 9779b9c68362..d1925988fca6 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py @@ -23,6 +23,7 @@ COLLECTION = "created_collection" @pytest.mark.cosmosCircuitBreakerMultiRegion +@pytest.mark.cosmosPerPartitionAutomaticFailover @pytest.mark.asyncio class TestPerPartitionCircuitBreakerSmMrrAsync: host = test_config.TestConfig.host From 151a2facf819d7e2495ff1b8ef94ceafa8414ac7 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Sun, 15 Jun 2025 20:53:10 -0400 Subject: [PATCH 03/68] Update test_per_partition_automatic_failover_async.py --- .../tests/test_per_partition_automatic_failover_async.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 7a133d1f34cc..9720830b742e 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -79,7 +79,6 @@ async def setup_info(self, error, max_count=None, is_batch=False, **kwargs): @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation, error): - os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "True" # This test validates that the partition info cache is updated correctly upon failures, and that the # per-partition automatic failover logic routes requests to the next available regional endpoint error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay( From b9e0a081bf572f9bb5fb002f20aca9edfad6ac95 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 16 Jun 2025 09:08:59 -0400 Subject: [PATCH 04/68] CI fixes --- ...ion_endpoint_manager_per_partition_automatic_failover.py | 5 ++--- ...dpoint_manager_per_partition_automatic_failover_async.py | 6 +++--- .../tests/test_per_partition_automatic_failover.py | 2 +- .../tests/test_per_partition_automatic_failover_async.py | 2 +- sdk/cosmos/live-platform-matrix.json | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index bbce867d4b53..2a4905013503 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ 
b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -20,7 +20,7 @@ from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper if TYPE_CHECKING: - from azure.cosmos._cosmos_client_connection import CosmosClientConnection + from azure.cosmos.aio._cosmos_client_connection_async import CosmosClientConnection logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") @@ -126,8 +126,7 @@ def resolve_service_endpoint_for_partition( partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info return self._resolve_service_endpoint(request) - else: - return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) def compute_available_preferred_regions( self, diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index a9b974483614..d177a50603d7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -58,7 +58,8 @@ def try_move_to_next_location(self, available_account_regional_endpoints: Set[st return False -class _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync(_GlobalPartitionEndpointManagerForCircuitBreakerAsync): +class _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync( + _GlobalPartitionEndpointManagerForCircuitBreakerAsync): """ This internal class implements the logic for partition 
endpoint management for geo-replicated database accounts. @@ -126,8 +127,7 @@ def resolve_service_endpoint_for_partition( partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info return self._resolve_service_endpoint(request) - else: - return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) def compute_available_preferred_regions( self, diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 3c88e8c59911..77857c20d325 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -13,7 +13,7 @@ from test_per_partition_circuit_breaker_mm import (REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors, perform_write_operation) -# cspell:disable-file +# cspell:disable @pytest.fixture(scope="class", autouse=True) def setup_teardown(): diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 9720830b742e..d9536d99192b 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -17,7 +17,7 @@ from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors from test_per_partition_circuit_breaker_mm_async import perform_write_operation -# cspell:disable-file +# cspell:disable @pytest.fixture(scope="class", autouse=True) def setup_teardown(): diff --git a/sdk/cosmos/live-platform-matrix.json b/sdk/cosmos/live-platform-matrix.json index 
fb21bacff0e1..08bc50407192 100644 --- a/sdk/cosmos/live-platform-matrix.json +++ b/sdk/cosmos/live-platform-matrix.json @@ -61,7 +61,7 @@ }, { "PerPartitionAutomaticFailoverTestConfig": { - "Ubuntu2004_313_ppaf": { + "Ubuntu2004_313_partition_automatic_failover": { "OSVmImage": "env:LINUXVMIMAGE", "Pool": "env:LINUXPOOL", "PythonVersion": "3.13", From e4d7046a747d1c942f5a374f8c7c141b2c62e3f3 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 16 Jun 2025 09:36:35 -0400 Subject: [PATCH 05/68] changelog --- sdk/cosmos/azure-cosmos/CHANGELOG.md | 1 + ...rtition_endpoint_manager_per_partition_automatic_failover.py | 2 +- ...n_endpoint_manager_per_partition_automatic_failover_async.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index ba4f51186488..5fde0181b7a3 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -3,6 +3,7 @@ ### 4.13.0b2 (Unreleased) #### Features Added +* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-configure-per-partition-automatic-failover) and enable through the environment variable `AZURE_COSMOS_ENABLE_PER_PARTITION_FAILOVER`. See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). 
#### Breaking Changes diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 2a4905013503..25c868357d62 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -20,7 +20,7 @@ from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper if TYPE_CHECKING: - from azure.cosmos.aio._cosmos_client_connection_async import CosmosClientConnection + from azure.cosmos._cosmos_client_connection import CosmosClientConnection logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index d177a50603d7..b078d4b21e27 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -20,7 +20,7 @@ from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper if TYPE_CHECKING: - from azure.cosmos._cosmos_client_connection import CosmosClientConnection + from azure.cosmos.aio._cosmos_client_connection_async import CosmosClientConnection logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") From 09e7163c30f4d8e4167d51a51b74f1b0285131f7 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 16 Jun 2025 10:18:31 -0400 Subject: 
[PATCH 06/68] broken link --- sdk/cosmos/azure-cosmos/CHANGELOG.md | 2 +- .../tests/test_per_partition_automatic_failover.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index 5fde0181b7a3..e30a8bbbe11f 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -3,7 +3,7 @@ ### 4.13.0b2 (Unreleased) #### Features Added -* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-configure-per-partition-automatic-failover) and enable through the environment variable `AZURE_COSMOS_ENABLE_PER_PARTITION_FAILOVER`. See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). +* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover) and enable through the environment variable `AZURE_COSMOS_ENABLE_PER_PARTITION_FAILOVER`. See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). 
#### Breaking Changes diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 77857c20d325..2cc21fea6045 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -15,11 +15,11 @@ # cspell:disable -@pytest.fixture(scope="class", autouse=True) -def setup_teardown(): - os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "True" - yield - os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "False" +# @pytest.fixture(scope="class", autouse=True) +# def setup_teardown(): +# os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "True" +# yield +# os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "False" def create_errors(): From 4e28f66d583490a0c60a5ab38ddc6d1cd18b7e1d Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 16 Jun 2025 11:40:48 -0400 Subject: [PATCH 07/68] Update test_location_cache.py --- sdk/cosmos/azure-cosmos/tests/test_location_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py index 4194fc5672b7..41cf287fc0d8 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py +++ b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py @@ -161,7 +161,7 @@ def test_get_applicable_regional_endpoints_excluded_regions(self, test_type): [location4_name], [], ] - excluded_locations_on_requests_list = [None] * 5 + excluded_locations_on_requests_list = [[]] * 5 elif test_type == "OnRequest": excluded_locations_on_client_list = [[]] * 5 excluded_locations_on_requests_list = [ From c5319e872135e1762a53be6e5662472652c76227 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> 
Date: Mon, 16 Jun 2025 18:49:28 -0400 Subject: [PATCH 08/68] change PPAF detection logic --- sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py | 2 -- ...obal_partition_endpoint_manager_circuit_breaker_core.py | 7 ++++--- ...on_endpoint_manager_per_partition_automatic_failover.py | 7 +------ ...point_manager_per_partition_automatic_failover_async.py | 7 +------ .../tests/test_per_partition_automatic_failover.py | 7 ------- .../tests/test_per_partition_automatic_failover_async.py | 6 ------ 6 files changed, 6 insertions(+), 30 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py index c2812f3481f8..dd076002b4f9 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py @@ -54,8 +54,6 @@ class _Constants: MAX_ITEM_BUFFER_VS_CONFIG_DEFAULT: int = 50000 CIRCUIT_BREAKER_ENABLED_CONFIG: str = "AZURE_COSMOS_ENABLE_CIRCUIT_BREAKER" CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT: str = "False" - PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG: str = "AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER" - PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG_DEFAULT: str = "False" # Only applicable when circuit breaker is enabled ------------------------- CONSECUTIVE_ERROR_COUNT_TOLERATED_FOR_READ: str = "AZURE_COSMOS_CONSECUTIVE_ERROR_COUNT_TOLERATED_FOR_READ" CONSECUTIVE_ERROR_COUNT_TOLERATED_FOR_READ_DEFAULT: int = 10 diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py index f5335fc447ff..9586a7032ff7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py @@ -59,9 +59,10 @@ def is_circuit_breaker_applicable(self, request: RequestObject) -> 
bool: if not request: return False - circuit_breaker_enabled = os.environ.get(Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG, - os.environ.get(Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, - Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT)).lower() == "true" + circuit_breaker_enabled = \ + (self.client._global_endpoint_manager._database_account_cache._EnablePerPartitionFailoverBehavior == True or + os.environ.get(os.environ.get(Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, + Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT)).lower() == "true") if not circuit_breaker_enabled: return False diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 25c868357d62..63904eef582e 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -75,13 +75,8 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) or _OperationType.IsReadOnlyOperation(request.operation_type)): return False - per_partition_automatic_failover_config_enabled = ( - os.environ.get(Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG, - Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG_DEFAULT).lower() == "true") - # TODO: This check here needs to be verified once we test against a live account with the config enabled. 
- if (not per_partition_automatic_failover_config_enabled or - not self._database_account_cache._EnablePerPartitionFailoverBehavior): + if not self._database_account_cache._EnablePerPartitionFailoverBehavior: return False # if we have at most one region available in the account, we cannot do per partition automatic failover diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index b078d4b21e27..8b18f90c62ef 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -76,13 +76,8 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) or _OperationType.IsReadOnlyOperation(request.operation_type)): return False - per_partition_automatic_failover_config_enabled = ( - os.environ.get(Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG, - Constants.PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED_CONFIG_DEFAULT).lower() == "true") - # TODO: This check here needs to be verified once we test against a live account with the config enabled. 
- if (not per_partition_automatic_failover_config_enabled or - not self._database_account_cache._EnablePerPartitionFailoverBehavior): + if not self._database_account_cache._EnablePerPartitionFailoverBehavior: return False # if we have at most one region available in the account, we cannot do per partition automatic failover diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 2cc21fea6045..0ea7f25e0867 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -15,13 +15,6 @@ # cspell:disable -# @pytest.fixture(scope="class", autouse=True) -# def setup_teardown(): -# os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "True" -# yield -# os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "False" - - def create_errors(): errors = [] error_codes = [403, 408, 500, 502, 503] diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index d9536d99192b..5b8554a93c31 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -19,12 +19,6 @@ # cspell:disable -@pytest.fixture(scope="class", autouse=True) -def setup_teardown(): - os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "True" - yield - os.environ["AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER"] = "False" - # These tests assume that the configured live account has one main write region and one secondary read region. 
@pytest.mark.cosmosPerPartitionAutomaticFailover From eba60933d13bb1e4fea804b9895a72ba34108870 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 16 Jun 2025 19:26:06 -0400 Subject: [PATCH 09/68] Update _global_partition_endpoint_manager_circuit_breaker_core.py --- ..._global_partition_endpoint_manager_circuit_breaker_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py index 9586a7032ff7..09c5b838f931 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py @@ -61,8 +61,8 @@ def is_circuit_breaker_applicable(self, request: RequestObject) -> bool: circuit_breaker_enabled = \ (self.client._global_endpoint_manager._database_account_cache._EnablePerPartitionFailoverBehavior == True or - os.environ.get(os.environ.get(Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, - Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT)).lower() == "true") + os.environ.get(Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, + Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT).lower() == "true") if not circuit_breaker_enabled: return False From 2ec5c5d6e72a38df5813abfbe5c48bed30fe3c52 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 17 Jun 2025 09:27:28 -0400 Subject: [PATCH 10/68] Update _global_partition_endpoint_manager_circuit_breaker_core.py --- ...al_partition_endpoint_manager_circuit_breaker_core.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py 
b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py index 09c5b838f931..8059ede8a316 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py @@ -59,10 +59,11 @@ def is_circuit_breaker_applicable(self, request: RequestObject) -> bool: if not request: return False - circuit_breaker_enabled = \ - (self.client._global_endpoint_manager._database_account_cache._EnablePerPartitionFailoverBehavior == True or - os.environ.get(Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, - Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT).lower() == "true") + circuit_breaker_enabled = os.environ.get(Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, + Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT).lower() == "true" + if not circuit_breaker_enabled and self.client._global_endpoint_manager is not None: + if self.client._global_endpoint_manager._database_account_cache is not None: + circuit_breaker_enabled = self.client._global_endpoint_manager._database_account_cache._EnablePerPartitionFailoverBehavior == True # pylint: disable=line-too-long if not circuit_breaker_enabled: return False From 62d7be0d1704ba7d02a9d18dbfebee303d5f8c6c Mon Sep 17 00:00:00 2001 From: tvaron3 Date: Tue, 17 Jun 2025 22:29:31 -0700 Subject: [PATCH 11/68] fix tests and remove environment variable --- sdk/cosmos/azure-cosmos/CHANGELOG.md | 2 +- .../azure/cosmos/_endpoint_discovery_retry_policy.py | 9 --------- ..._endpoint_manager_per_partition_automatic_failover.py | 2 +- ...int_manager_per_partition_automatic_failover_async.py | 4 +--- .../tests/test_per_partition_automatic_failover.py | 3 +-- .../tests/test_per_partition_automatic_failover_async.py | 9 ++++----- sdk/cosmos/live-platform-matrix.json | 2 +- sdk/cosmos/test-resources.bicep | 4 ---- 8 files changed, 9 insertions(+), 26 deletions(-) diff --git 
a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index e30a8bbbe11f..ac4d91ec8545 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -3,7 +3,7 @@ ### 4.13.0b2 (Unreleased) #### Features Added -* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover) and enable through the environment variable `AZURE_COSMOS_ENABLE_PER_PARTITION_FAILOVER`. See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). +* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover). See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). #### Breaking Changes diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index f562df2a7189..aabf247936fc 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -23,17 +23,8 @@ Azure Cosmos database service. 
""" -import logging from azure.cosmos.documents import _OperationType -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -log_formatter = logging.Formatter("%(levelname)s:%(message)s") -log_handler = logging.StreamHandler() -log_handler.setFormatter(log_formatter) -logger.addHandler(log_handler) - - class EndpointDiscoveryRetryPolicy(object): """The endpoint discovery retry policy class used for geo-replicated database accounts to handle the write forbidden exceptions due to writable/readable location changes diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 63904eef582e..f329bc0e16ed 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -76,7 +76,7 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) return False # TODO: This check here needs to be verified once we test against a live account with the config enabled. 
- if not self._database_account_cache._EnablePerPartitionFailoverBehavior: + if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: return False # if we have at most one region available in the account, we cannot do per partition automatic failover diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 8b18f90c62ef..fca16249ffa2 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -5,13 +5,11 @@ endpoint manager, since enabling per partition automatic failover also enables the circuit breaker logic. """ import logging -import os import threading from typing import Dict, Set, TYPE_CHECKING, Optional from azure.cosmos.http_constants import ResourceType -from azure.cosmos._constants import _Constants as Constants from azure.cosmos.aio._global_partition_endpoint_manager_circuit_breaker_async import \ _GlobalPartitionEndpointManagerForCircuitBreakerAsync from azure.cosmos.documents import _OperationType @@ -77,7 +75,7 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) return False # TODO: This check here needs to be verified once we test against a live account with the config enabled. 
- if not self._database_account_cache._EnablePerPartitionFailoverBehavior: + if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: return False # if we have at most one region available in the account, we cannot do per partition automatic failover diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 0ea7f25e0867..fa5337514292 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -36,8 +36,7 @@ def create_errors(): class TestPerPartitionAutomaticFailover: host = test_config.TestConfig.host master_key = test_config.TestConfig.masterKey - connectionPolicy = test_config.TestConfig.connectionPolicy - TEST_DATABASE_ID = "test_config.TestConfig.TEST_DATABASE_ID" + TEST_DATABASE_ID = test_config.TestConfig.TEST_DATABASE_ID TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID def setup_method_with_custom_transport(self, custom_transport, default_endpoint=host, **kwargs): diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 5b8554a93c31..0321a63fa799 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -1,12 +1,12 @@ # The MIT License (MIT) # Copyright (c) Microsoft Corporation. All rights reserved. 
-import os import unittest import uuid import asyncio + import pytest -from typing import Dict, Any +from typing import Dict, Any, Optional import test_config from azure.core.pipeline.transport._aiohttp import AioHttpTransport @@ -26,11 +26,10 @@ class TestPerPartitionAutomaticFailoverAsync: host = test_config.TestConfig.host master_key = test_config.TestConfig.masterKey - connectionPolicy = test_config.TestConfig.connectionPolicy - TEST_DATABASE_ID = "test_config.TestConfig.TEST_DATABASE_ID" + TEST_DATABASE_ID = test_config.TestConfig.TEST_DATABASE_ID TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID - async def setup_method_with_custom_transport(self, custom_transport: AioHttpTransport, default_endpoint=host, **kwargs): + async def setup_method_with_custom_transport(self, custom_transport: Optional[AioHttpTransport], default_endpoint=host, **kwargs): container_id = kwargs.pop("container_id", None) if not container_id: container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID diff --git a/sdk/cosmos/live-platform-matrix.json b/sdk/cosmos/live-platform-matrix.json index 08bc50407192..36a685e54b0e 100644 --- a/sdk/cosmos/live-platform-matrix.json +++ b/sdk/cosmos/live-platform-matrix.json @@ -72,7 +72,7 @@ }, "ArmConfig": { "MultiRegion": { - "ArmTemplateParameters": "@{ defaultConsistencyLevel = 'Session'; enableMultipleRegions = $true; perPartitionAutomaticFailoverEnabled = 'True' }" + "ArmTemplateParameters": "@{ defaultConsistencyLevel = 'Session'; enableMultipleRegions = $true;}" } } }, diff --git a/sdk/cosmos/test-resources.bicep b/sdk/cosmos/test-resources.bicep index f6088ec122a6..735c1a0e66ee 100644 --- a/sdk/cosmos/test-resources.bicep +++ b/sdk/cosmos/test-resources.bicep @@ -15,9 +15,6 @@ param location string = resourceGroup().location @description('Whether Per Partition Circuit Breaker should be enabled.') param circuitBreakerEnabled string = 'False' -@description('Whether Per Partition Automatic Failover should 
be enabled.') -param perPartitionAutomaticFailoverEnabled string = 'False' - @description('The api version to be used by Bicep to create resources') param apiVersion string = '2023-04-15' @@ -108,6 +105,5 @@ resource accountName_roleAssignmentId 'Microsoft.DocumentDB/databaseAccounts/sql } output AZURE_COSMOS_ENABLE_CIRCUIT_BREAKER string = circuitBreakerEnabled -output AZURE_COSMOS_ENABLE_PER_PARTITION_AUTOMATIC_FAILOVER string = perPartitionAutomaticFailoverEnabled output ACCOUNT_HOST string = reference(resourceId, apiVersion).documentEndpoint output ACCOUNT_KEY string = listKeys(resourceId, apiVersion).primaryMasterKey From 24b84153791c481adec4f62e1db17776b74418cd Mon Sep 17 00:00:00 2001 From: tvaron3 Date: Mon, 23 Jun 2025 13:09:52 -0400 Subject: [PATCH 12/68] fix tests --- sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py index d43407a40a72..c966ed3d4218 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py @@ -40,7 +40,8 @@ def __init__( self.location_index_to_route: Optional[int] = None self.location_endpoint_to_route: Optional[str] = None self.last_routed_location_endpoint_within_region: Optional[str] = None - self.excluded_locations: List[str] = [] + # fix this + self.excluded_locations: Optional[List[str]] = None self.excluded_locations_circuit_breaker: List[str] = [] self.healthy_tentative_location: Optional[str] = None From 9595327e8b61ed19e23b3cc25793af908be6fdbd Mon Sep 17 00:00:00 2001 From: tvaron3 Date: Wed, 2 Jul 2025 14:55:58 -0700 Subject: [PATCH 13/68] revert excluded locations change --- sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py | 4 +++- sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py index 5363a31f3b30..52b7e8e0fc44 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py @@ -210,11 +210,13 @@ def get_ordered_read_locations(self): def _get_configured_excluded_locations(self, request: RequestObject) -> List[str]: # If excluded locations were configured on request, use request level excluded locations. excluded_locations = request.excluded_locations - if len(excluded_locations) == 0: + if excluded_locations is None: if self.connection_policy.ExcludedLocations: # If excluded locations were only configured on client(connection_policy), use client level # make copy of excluded locations to avoid modifying the original list excluded_locations = list(self.connection_policy.ExcludedLocations) + else: + excluded_locations = [] for excluded_location in request.excluded_locations_circuit_breaker: if excluded_location not in excluded_locations: excluded_locations.append(excluded_location) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py index c966ed3d4218..d20eedb40148 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_request_object.py @@ -40,7 +40,6 @@ def __init__( self.location_index_to_route: Optional[int] = None self.location_endpoint_to_route: Optional[str] = None self.last_routed_location_endpoint_within_region: Optional[str] = None - # fix this self.excluded_locations: Optional[List[str]] = None self.excluded_locations_circuit_breaker: List[str] = [] self.healthy_tentative_location: Optional[str] = None From 8911ef524fe3853a7ec0e20964c43a04f576331a Mon Sep 17 00:00:00 2001 From: tvaron3 Date: Thu, 3 Jul 2025 13:41:45 -0700 Subject: [PATCH 14/68] fix analyze --- ..._global_partition_endpoint_manager_circuit_breaker_core.py | 
4 +++- ...ition_endpoint_manager_per_partition_automatic_failover.py | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py index 8059ede8a316..91fd67805a18 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker_core.py @@ -19,6 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +# pylint: disable=protected-access + """Internal class for global endpoint manager for circuit breaker. """ import logging @@ -63,7 +65,7 @@ def is_circuit_breaker_applicable(self, request: RequestObject) -> bool: Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT).lower() == "true" if not circuit_breaker_enabled and self.client._global_endpoint_manager is not None: if self.client._global_endpoint_manager._database_account_cache is not None: - circuit_breaker_enabled = self.client._global_endpoint_manager._database_account_cache._EnablePerPartitionFailoverBehavior == True # pylint: disable=line-too-long + circuit_breaker_enabled = self.client._global_endpoint_manager._database_account_cache._EnablePerPartitionFailoverBehavior is True # pylint: disable=line-too-long if not circuit_breaker_enabled: return False diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index f329bc0e16ed..6dfb759a88f8 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ 
b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -5,13 +5,11 @@ endpoint manager, since enabling per partition automatic failover also enables the circuit breaker logic. """ import logging -import os import threading from typing import Dict, Set, TYPE_CHECKING, Optional from azure.cosmos.http_constants import ResourceType -from azure.cosmos._constants import _Constants as Constants from azure.cosmos._global_partition_endpoint_manager_circuit_breaker import \ _GlobalPartitionEndpointManagerForCircuitBreaker from azure.cosmos.documents import _OperationType From 25dbeb3a40266bf7fe83369eae7cec569c5a15bd Mon Sep 17 00:00:00 2001 From: tvaron3 Date: Mon, 7 Jul 2025 14:30:50 -0700 Subject: [PATCH 15/68] test excluded locations --- .../azure-cosmos/tests/test_excluded_locations_async.py | 1 - sdk/cosmos/azure-cosmos/tests/test_location_cache.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_excluded_locations_async.py b/sdk/cosmos/azure-cosmos/tests/test_excluded_locations_async.py index 1b2928de217e..d4d01de83e80 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_excluded_locations_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_excluded_locations_async.py @@ -10,7 +10,6 @@ import pytest_asyncio from azure.cosmos.aio import CosmosClient -from azure.cosmos.partition_key import PartitionKey from test_excluded_locations import (TestDataType, set_test_data_type, read_item_test_data, write_item_test_data, read_and_write_item_test_data, verify_endpoint) diff --git a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py index 41cf287fc0d8..52797696a5d2 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py +++ b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py @@ -150,7 +150,7 @@ def test_resolve_request_endpoint_preferred_regions(self): assert read_resolved == write_resolved assert 
read_resolved == default_endpoint - @pytest.mark.parametrize("test_type",["OnClient", "OnRequest", "OnBoth"]) + @pytest.mark.parametrize("test_type",["OnClient"]) def test_get_applicable_regional_endpoints_excluded_regions(self, test_type): # Init test data if test_type == "OnClient": @@ -161,7 +161,7 @@ def test_get_applicable_regional_endpoints_excluded_regions(self, test_type): [location4_name], [], ] - excluded_locations_on_requests_list = [[]] * 5 + excluded_locations_on_requests_list = [None] * 5 elif test_type == "OnRequest": excluded_locations_on_client_list = [[]] * 5 excluded_locations_on_requests_list = [ From d61a9a9ca30de998b08ef46cb02cdce631dc573c Mon Sep 17 00:00:00 2001 From: tvaron3 Date: Mon, 7 Jul 2025 23:32:19 -0700 Subject: [PATCH 16/68] Add different error handling for 503 and 408s, update README --- sdk/cosmos/azure-cosmos/README.md | 5 ++ .../azure-cosmos/azure/cosmos/_constants.py | 5 ++ .../azure/cosmos/_cosmos_client_connection.py | 1 - ...anager_per_partition_automatic_failover.py | 10 +-- .../azure/cosmos/_retry_utility.py | 7 ++- .../_service_unavailable_retry_policy.py | 61 +++++++++++++++++++ .../cosmos/_timeout_failover_retry_policy.py | 10 +-- .../aio/_cosmos_client_connection_async.py | 1 - ..._per_partition_automatic_failover_async.py | 5 +- .../azure/cosmos/aio/_retry_utility_async.py | 7 ++- .../azure/cosmos/http_constants.py | 1 + .../azure-cosmos/tests/test_location_cache.py | 2 +- .../test_per_partition_automatic_failover.py | 3 +- 13 files changed, 95 insertions(+), 23 deletions(-) create mode 100644 sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py diff --git a/sdk/cosmos/azure-cosmos/README.md b/sdk/cosmos/azure-cosmos/README.md index 17bd8a7560e2..e993fee96c1c 100644 --- a/sdk/cosmos/azure-cosmos/README.md +++ b/sdk/cosmos/azure-cosmos/README.md @@ -909,6 +909,11 @@ requests to another region: - `AZURE_COSMOS_FAILURE_PERCENTAGE_TOLERATED`: Default is a `90` percent failure rate. 
- After a partition reaches a 90 percent failure rate for all requests, the SDK will send requests routed to that partition to another region. +### Per Partition Automatic Failover (Public Preview) +Per partition automatic failover enables the SDK to automatically redirect write requests at the partition level to another region based on service-side signals. This feature is available +only for single write region accounts that have at least one read-only region. When per partition automatic failover is enabled, per partition circuit breaker and hedging are enabled by default, meaning +all of their configurable options also apply to per partition automatic failover. To enable this feature, follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover). + ## Troubleshooting ### General diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py index dd076002b4f9..1f9a7d81eef7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py @@ -25,6 +25,7 @@ from typing import Dict from typing_extensions import Literal +# cspell:disable-line class _Constants: @@ -62,6 +63,10 @@ class _Constants: FAILURE_PERCENTAGE_TOLERATED = "AZURE_COSMOS_FAILURE_PERCENTAGE_TOLERATED" FAILURE_PERCENTAGE_TOLERATED_DEFAULT: int = 90 # ------------------------------------------------------------------------- + # Only applicable when per partition automatic failover is enabled -------- + TIMEOUT_ERROR_THRESHOLD_PPAF = "AZURE_COSMOS_TIMEOUT_ERROR_THRESHOLD_FOR_PPAF" + TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT: int = 10 + # ------------------------------------------------------------------------- # Error code translations ERROR_TRANSLATIONS: Dict[int, str] = { diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py index
c135277f6c9d..1756849613f1 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py @@ -2627,7 +2627,6 @@ def GetDatabaseAccount( self.connection_policy.UseMultipleWriteLocations and database_account._EnableMultipleWritableLocations ) - # TODO: Verify that this is the correct variable from the service if Constants.EnablePerPartitionFailoverBehavior in result: database_account._EnablePerPartitionFailoverBehavior = result[Constants.EnablePerPartitionFailoverBehavior] diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 6dfb759a88f8..643ec509c852 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -73,7 +73,6 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) or _OperationType.IsReadOnlyOperation(request.operation_type)): return False - # TODO: This check here needs to be verified once we test against a live account with the config enabled. 
if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: return False @@ -110,7 +109,7 @@ def resolve_service_endpoint_for_partition( # main write region in the account configurations self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() request.clear_route_to_location() - return self._resolve_service_endpoint(request) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) else: # Update the current regional endpoint to whatever the request is routing to partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route @@ -118,7 +117,7 @@ def resolve_service_endpoint_for_partition( partition_failover_info = PartitionLevelFailoverInfo() partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info - return self._resolve_service_endpoint(request) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) def compute_available_preferred_regions( @@ -131,7 +130,10 @@ def compute_available_preferred_regions( :return: A set of available regional endpoints. 
:rtype: Set[str] """ - excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations + if request.excluded_locations: + excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations + else: + excluded_locations = self.location_cache.connection_policy.ExcludedLocations preferred_locations = self.PreferredLocations available_regions = [item for item in preferred_locations if item not in excluded_locations] available_regional_endpoints = { diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 50c26e87cb62..c18f580f52ab 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -29,7 +29,7 @@ from azure.core.pipeline import PipelineRequest from azure.core.pipeline.policies import RetryPolicy -from . import _container_recreate_retry_policy, _database_account_retry_policy +from . import _container_recreate_retry_policy, _database_account_retry_policy, _service_unavailable_retry_policy from . import _default_retry_policy from . import _endpoint_discovery_retry_policy from . 
import _gone_retry_policy @@ -95,6 +95,8 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin service_request_retry_policy = _service_request_retry_policy.ServiceRequestRetryPolicy( client.connection_policy, global_endpoint_manager, pk_range_wrapper, *args, ) + service_unavailable_retry_policy = _service_unavailable_retry_policy._ServiceUnavailableRetryPolicy( + client.connection_policy, global_endpoint_manager, pk_range_wrapper, *args) # HttpRequest we would need to modify for Container Recreate Retry Policy request = None if args and len(args) > 3: @@ -181,9 +183,12 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin retry_policy.container_rid = cached_container["_rid"] request.headers[retry_policy._intended_headers] = retry_policy.container_rid + elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: + retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: # record the failure for circuit breaker tracking + # TODO: change this to track errors for ppaf global_endpoint_manager.record_failure(args[0]) retry_policy = timeout_failover_retry_policy else: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py new file mode 100644 index 000000000000..3dc3df1aac70 --- /dev/null +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -0,0 +1,61 @@ +# The MIT License (MIT) +# Copyright (c) Microsoft Corporation. All rights reserved. + +"""Internal class for service unavailable retry policy implementation in the Azure +Cosmos database service. 
+""" + +class _ServiceUnavailableRetryPolicy(object): + + def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, *args): + self.retry_after_in_milliseconds = 500 + self.global_endpoint_manager = global_endpoint_manager + self.pk_range_wrapper = pk_range_wrapper + # If an account only has 1 region, then we still want to retry once on the same region + self._max_retry_attempt_count = (len(self.global_endpoint_manager.location_cache.read_regional_routing_contexts) + + 1) + self.retry_count = 0 + self.connection_policy = connection_policy + self.request = args[0] if args else None + + def ShouldRetry(self, _exception): + """Returns true if the request should retry based on the passed-in exception. + + :param exceptions.CosmosHttpResponseError _exception: + :returns: a boolean stating whether the request should be retried + :rtype: bool + """ + # writes are retried for 503s + if not self.connection_policy.EnableEndpointDiscovery: + return False + + self.retry_count += 1 + # Check if the next retry about to be done is safe + if self.retry_count >= self._max_retry_attempt_count: + return False + + if self.request: + location_endpoint = self.resolve_next_region_service_endpoint() + self.request.route_to_location(location_endpoint) + return True + + # This function prepares the request to go to the next region + def resolve_next_region_service_endpoint(self): + if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): + # If per partition automatic failover is applicable, we mark the current endpoint as unavailable + # and resolve the service endpoint for the partition range - otherwise, continue with default retry logic + partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] + partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) + return 
self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, + self.pk_range_wrapper) + + # clear previous location-based routing directive + self.request.clear_route_to_location() + # clear the last routed endpoint within same region since we are going to a new region now + self.request.last_routed_location_endpoint_within_region = None + # set location-based routing directive based on retry count + # ensuring usePreferredLocations is set to True for retry + self.request.route_to_location_with_preferred_location_flag(self.retry_count, True) + # Resolve the endpoint for the request and pin the resolution to the resolved endpoint + # This enables marking the endpoint unavailability on endpoint failover/unreachability + return self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py index 8e60e0f7dcf6..434d15f51bb4 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py @@ -27,8 +27,7 @@ def ShouldRetry(self, _exception): :returns: a boolean stating whether the request should be retried :rtype: bool """ - if self.request and (not _OperationType.IsReadOnlyOperation(self.request.operation_type) and - not self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request)): + if self.request and not _OperationType.IsReadOnlyOperation(self.request.operation_type): return False if not self.connection_policy.EnableEndpointDiscovery: @@ -46,13 +45,6 @@ def ShouldRetry(self, _exception): # This function prepares the request to go to the next region def resolve_next_region_service_endpoint(self): - if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): - # If per partition automatic failover 
is applicable, we mark the current endpoint as unavailable - # and resolve the service endpoint for the partition range - otherwise, continue with default retry logic - partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] - partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) - return self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, - self.pk_range_wrapper) # clear previous location-based routing directive self.request.clear_route_to_location() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_cosmos_client_connection_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_cosmos_client_connection_async.py index a56cb5777406..89d6b23c06d3 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_cosmos_client_connection_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_cosmos_client_connection_async.py @@ -453,7 +453,6 @@ async def GetDatabaseAccount( self.connection_policy.UseMultipleWriteLocations and database_account._EnableMultipleWritableLocations ) - # TODO: Verify that this is the correct variable from the service if Constants.EnablePerPartitionFailoverBehavior in result: database_account._EnablePerPartitionFailoverBehavior = result[Constants.EnablePerPartitionFailoverBehavior] diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index fca16249ffa2..0192548506c7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -74,7 +74,6 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) or 
_OperationType.IsReadOnlyOperation(request.operation_type)): return False - # TODO: This check here needs to be verified once we test against a live account with the config enabled. if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: return False @@ -111,7 +110,7 @@ def resolve_service_endpoint_for_partition( # main write region in the account configurations self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() request.clear_route_to_location() - return self._resolve_service_endpoint(request) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request) else: # Update the current regional endpoint to whatever the request is routing to partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route @@ -119,7 +118,7 @@ def resolve_service_endpoint_for_partition( partition_failover_info = PartitionLevelFailoverInfo() partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info - return self._resolve_service_endpoint(request) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request) return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) def compute_available_preferred_regions( diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 7884f9060183..c1b3d6f3eea8 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -28,7 +28,7 @@ from azure.core.exceptions import AzureError, ClientAuthenticationError, ServiceRequestError, ServiceResponseError from azure.core.pipeline.policies import AsyncRetryPolicy -from .. import _default_retry_policy, _database_account_retry_policy +from .. 
import _default_retry_policy, _database_account_retry_policy, _service_unavailable_retry_policy from .. import _endpoint_discovery_retry_policy from .. import _gone_retry_policy from .. import _resource_throttle_retry_policy @@ -94,6 +94,8 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg service_request_retry_policy = _service_request_retry_policy.ServiceRequestRetryPolicy( client.connection_policy, global_endpoint_manager, pk_range_wrapper, *args, ) + service_unavailable_retry_policy = _service_unavailable_retry_policy._ServiceUnavailableRetryPolicy( + client.connection_policy, global_endpoint_manager, pk_range_wrapper, *args) # HttpRequest we would need to modify for Container Recreate Retry Policy request = None if args and len(args) > 3: @@ -180,10 +182,13 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg retry_policy.container_rid = cached_container["_rid"] request.headers[retry_policy._intended_headers] = retry_policy.container_rid + elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: + retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: # record the failure for circuit breaker tracking if args: await global_endpoint_manager.record_failure(args[0]) + # TODO: change this to track errors for ppaf retry_policy = timeout_failover_retry_policy else: retry_policy = defaultRetry_policy diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/http_constants.py b/sdk/cosmos/azure-cosmos/azure/cosmos/http_constants.py index 81804f71dfef..7324649a5d43 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/http_constants.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/http_constants.py @@ -401,6 +401,7 @@ class StatusCodes: RETRY_WITH = 449 INTERNAL_SERVER_ERROR = 500 + SERVICE_UNAVAILABLE = 503 # Operation pause and cancel. These are FAKE status codes for QOS logging purpose only. 
OPERATION_PAUSED = 1200 diff --git a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py index 52797696a5d2..4194fc5672b7 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py +++ b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py @@ -150,7 +150,7 @@ def test_resolve_request_endpoint_preferred_regions(self): assert read_resolved == write_resolved assert read_resolved == default_endpoint - @pytest.mark.parametrize("test_type",["OnClient"]) + @pytest.mark.parametrize("test_type",["OnClient", "OnRequest", "OnBoth"]) def test_get_applicable_regional_endpoints_excluded_regions(self, test_type): # Init test data if test_type == "OnClient": diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index fa5337514292..69d18595e123 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -1,6 +1,5 @@ # The MIT License (MIT) # Copyright (c) Microsoft Corporation. All rights reserved. 
-import os import unittest import uuid @@ -17,7 +16,7 @@ def create_errors(): errors = [] - error_codes = [403, 408, 500, 502, 503] + error_codes = [403, 503] for error_code in error_codes: if error_code == 403: errors.append(CosmosHttpResponseError( From f1c69ed5f2f3ae395b7b3600147d06607262b615 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:34:20 -0400 Subject: [PATCH 17/68] mypy, cspell, pylint --- sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py | 2 +- ...tion_endpoint_manager_per_partition_automatic_failover.py | 3 ++- sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py | 1 + ...ndpoint_manager_per_partition_automatic_failover_async.py | 5 +++-- .../azure-cosmos/azure/cosmos/aio/_retry_utility_async.py | 1 + 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py index c3a75b734c08..304be31411be 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py @@ -25,7 +25,7 @@ from typing import Dict from typing_extensions import Literal -# cspell:disable-line +# cspell:ignore PPAF class _Constants: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 643ec509c852..46d5cf207285 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -109,7 +109,8 @@ def resolve_service_endpoint_for_partition( # main write region in the account configurations self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() request.clear_route_to_location() - 
return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, + pk_range_wrapper) else: # Update the current regional endpoint to whatever the request is routing to partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 272ad3339ef3..63b24a16b94f 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -44,6 +44,7 @@ # pylint: disable=protected-access, disable=too-many-lines, disable=too-many-statements, disable=too-many-branches +# cspell:ignore ppaf # args [0] is the request object # args [1] is the connection policy diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 0192548506c7..c356f386e9f3 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -110,7 +110,8 @@ def resolve_service_endpoint_for_partition( # main write region in the account configurations self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() request.clear_route_to_location() - return self._resolve_service_endpoint_for_partition_circuit_breaker(request) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, + pk_range_wrapper) else: # Update the current regional endpoint to whatever the request is routing to partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route @@ 
-118,7 +119,7 @@ def resolve_service_endpoint_for_partition( partition_failover_info = PartitionLevelFailoverInfo() partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info - return self._resolve_service_endpoint_for_partition_circuit_breaker(request) + return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) def compute_available_preferred_regions( diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index dfa181e8976a..1c4d44b647b5 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -45,6 +45,7 @@ # pylint: disable=protected-access, disable=too-many-lines, disable=too-many-statements, disable=too-many-branches +# cspell:ignore ppaf # args [0] is the request object # args [1] is the connection policy From 9306d15f493dacf20256334f154375260556bdab Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 31 Jul 2025 19:04:16 -0400 Subject: [PATCH 18/68] remove tag from tests since config is service based --- .../tests/test_per_partition_circuit_breaker_sm_mrr.py | 1 - .../tests/test_per_partition_circuit_breaker_sm_mrr_async.py | 1 - 2 files changed, 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py index a43003ed22b4..7c63e8e82897 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py @@ -38,7 +38,6 @@ def 
validate_unhealthy_partitions(global_endpoint_manager, assert unhealthy_partitions == expected_unhealthy_partitions @pytest.mark.cosmosCircuitBreakerMultiRegion -@pytest.mark.cosmosPerPartitionAutomaticFailover class TestPerPartitionCircuitBreakerSmMrr: host = test_config.TestConfig.host master_key = test_config.TestConfig.masterKey diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py index d1925988fca6..9779b9c68362 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py @@ -23,7 +23,6 @@ COLLECTION = "created_collection" @pytest.mark.cosmosCircuitBreakerMultiRegion -@pytest.mark.cosmosPerPartitionAutomaticFailover @pytest.mark.asyncio class TestPerPartitionCircuitBreakerSmMrrAsync: host = test_config.TestConfig.host From bd07d8351909f136acceab05ea79c906f71bdf9e Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 7 Aug 2025 16:45:49 -0400 Subject: [PATCH 19/68] add threshold-based retries for 408, 5xx errors --- ...tition_endpoint_manager_circuit_breaker.py | 20 +++++++------- ...anager_per_partition_automatic_failover.py | 26 +++++++++++++++++++ .../azure/cosmos/_partition_health_tracker.py | 25 ++++++++++++++++++ .../azure/cosmos/_retry_utility.py | 9 +++---- .../azure/cosmos/_synchronized_request.py | 17 +++++++++--- .../cosmos/_timeout_failover_retry_policy.py | 16 ++++++++++++ .../azure/cosmos/aio/_asynchronous_request.py | 13 +++++++++- ..._endpoint_manager_circuit_breaker_async.py | 20 +++++++------- ..._per_partition_automatic_failover_async.py | 26 +++++++++++++++++++ .../azure/cosmos/aio/_retry_utility_async.py | 8 +++--- .../test_per_partition_automatic_failover.py | 3 ++- 11 files changed, 150 insertions(+), 33 deletions(-) diff --git 
a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py index d188d7713cb7..94fc4eafb98e 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py @@ -89,12 +89,13 @@ def create_pk_range_wrapper(self, request: RequestObject) -> Optional[PartitionK return PartitionKeyRangeWrapper(partition_range, container_rid) - def record_failure( + def record_ppcb_failure( self, - request: RequestObject - ) -> None: + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None)-> None: if self.is_circuit_breaker_applicable(request): - pk_range_wrapper = self.create_pk_range_wrapper(request) + if pk_range_wrapper is None: + pk_range_wrapper = self.create_pk_range_wrapper(request) if pk_range_wrapper: self.global_partition_endpoint_manager_core.record_failure(request, pk_range_wrapper) @@ -109,11 +110,12 @@ def _resolve_service_endpoint_for_partition_circuit_breaker( pk_range_wrapper) return self._resolve_service_endpoint(request) - def record_success( + def record_ppcb_success( self, - request: RequestObject - ) -> None: - if self.global_partition_endpoint_manager_core.is_circuit_breaker_applicable(request): - pk_range_wrapper = self.create_pk_range_wrapper(request) + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: + if self.is_circuit_breaker_applicable(request): + if pk_range_wrapper is None: + pk_range_wrapper = self.create_pk_range_wrapper(request) if pk_range_wrapper: self.global_partition_endpoint_manager_core.record_success(request, pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py 
b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 46d5cf207285..a1bcae1da3d8 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -12,6 +12,7 @@ from azure.cosmos.http_constants import ResourceType from azure.cosmos._global_partition_endpoint_manager_circuit_breaker import \ _GlobalPartitionEndpointManagerForCircuitBreaker +from azure.cosmos._partition_health_tracker import _PPAFPartitionThresholdsTracker from azure.cosmos.documents import _OperationType from azure.cosmos._request_object import RequestObject @@ -64,6 +65,7 @@ class _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover(_GlobalPar def __init__(self, client: "CosmosClientConnection"): super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover, self).__init__(client) self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} + self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() def is_per_partition_automatic_failover_applicable(self, request: RequestObject) -> bool: if not request: @@ -142,3 +144,27 @@ def compute_available_preferred_regions( for region in available_regions } return available_regional_endpoints + + def record_failure(self, + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: + """Records a failure for the given partition key range and request.""" + if self.is_per_partition_automatic_failover_applicable(request): + if pk_range_wrapper is None: + pk_range_wrapper = self.create_pk_range_wrapper(request) + if pk_range_wrapper: + self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper) + else: + self.record_ppcb_failure(request, pk_range_wrapper) + + def record_success(self, + request: RequestObject, + pk_range_wrapper: 
Optional[PartitionKeyRangeWrapper] = None) -> None: + """Records a success for the given partition key range and request.""" + if self.is_per_partition_automatic_failover_applicable(request): + if pk_range_wrapper is None: + pk_range_wrapper = self.create_pk_range_wrapper(request) + if pk_range_wrapper: + self.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) + else: + self.record_ppcb_success(request, pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py index 0fc10fcc2bce..e7c44d01120f 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py @@ -44,6 +44,8 @@ LAST_UNAVAILABILITY_CHECK_TIME_STAMP = "lastUnavailabilityCheckTimeStamp" HEALTH_STATUS = "healthStatus" +#cspell:ignore PPAF + class _PartitionHealthInfo(object): """ This internal class keeps the health and statistics for a partition. @@ -290,3 +292,26 @@ def _reset_partition_health_tracker_stats(self) -> None: for locations in self.pk_range_wrapper_to_health_info.values(): for health_info in locations.values(): health_info.reset_failure_rate_health_stats() + +class _PPAFPartitionThresholdsTracker(object): + """ + This internal class implements the logic for tracking consecutive failure thresholds for a partition + in the context of per-partition automatic failover. This tracker is only used in the context of 408, 5xx and + ServiceResponseError errors as a defensive measure to avoid failing over too early without confirmation + from the service.
+ """ + + def __init__(self) -> None: + self.pk_range_wrapper_to_failure_count: Dict[PartitionKeyRangeWrapper, int] = {} + + def add_failure(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> None: + if pk_range_wrapper not in self.pk_range_wrapper_to_failure_count: + self.pk_range_wrapper_to_failure_count[pk_range_wrapper] = 0 + self.pk_range_wrapper_to_failure_count[pk_range_wrapper] += 1 + + def clear_pk_failures(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> None: + if pk_range_wrapper in self.pk_range_wrapper_to_failure_count: + del self.pk_range_wrapper_to_failure_count[pk_range_wrapper] + + def get_pk_failures(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> int: + return self.pk_range_wrapper_to_failure_count.get(pk_range_wrapper, 0) \ No newline at end of file diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 63b24a16b94f..ee5ded634082 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -115,7 +115,7 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin try: if args: result = ExecuteFunction(function, global_endpoint_manager, *args, **kwargs) - global_endpoint_manager.record_success(args[0]) + global_endpoint_manager.record_success(args[0], pk_range_wrapper) else: result = ExecuteFunction(function, *args, **kwargs) if not client.last_response_headers: @@ -195,8 +195,7 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: # record the failure for circuit breaker tracking - # TODO: change this to track errors for ppaf - global_endpoint_manager.record_failure(args[0]) + global_endpoint_manager.record_failure(args[0], pk_range_wrapper) retry_policy = timeout_failover_retry_policy else: retry_policy = defaultRetry_policy 
@@ -230,7 +229,7 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin raise e else: if args: - global_endpoint_manager.record_failure(args[0]) + global_endpoint_manager.record_failure(args[0], pk_range_wrapper) _handle_service_request_retries(client, service_request_retry_policy, e, *args) except ServiceResponseError as e: @@ -239,7 +238,7 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin raise e else: if args: - global_endpoint_manager.record_failure(args[0]) + global_endpoint_manager.record_failure(args[0], pk_range_wrapper) _handle_service_response_retries(request, client, service_response_retry_policy, e, *args) def ExecuteFunction(function, *args, **kwargs): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py index bb338f443dca..8e70f443e052 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py @@ -27,10 +27,13 @@ from urllib.parse import urlparse from azure.core.exceptions import DecodeError # type: ignore +from azure.core import PipelineClient +from typing import Any -from . import exceptions -from . import http_constants -from . import _retry_utility +from . 
import exceptions, http_constants, _retry_utility +from .documents import ConnectionPolicy +from ._request_object import RequestObject +from ._global_partition_endpoint_manager_per_partition_automatic_failover import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover def _is_readable_stream(obj): @@ -65,7 +68,13 @@ def _request_body_from_data(data): return None -def _Request(global_endpoint_manager, request_params, connection_policy, pipeline_client, request, **kwargs): # pylint: disable=too-many-statements +def _Request( + global_endpoint_manager: _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover, + request_params: RequestObject, + connection_policy: ConnectionPolicy, + pipeline_client: PipelineClient, + request: Any, + **kwargs): # pylint: disable=too-many-statements """Makes one http request using the requests module. :param _GlobalEndpointManager global_endpoint_manager: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py index 7506500b463a..33b2596ccb5a 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py @@ -4,8 +4,11 @@ """Internal class for timeout failover retry policy implementation in the Azure Cosmos database service. 
""" +import os from azure.cosmos.documents import _OperationType +from azure.cosmos._constants import _Constants as Constants +# cspell:ignore PPAF, ppaf class _TimeoutFailoverRetryPolicy(object): @@ -37,6 +40,19 @@ def ShouldRetry(self, _exception): :returns: a boolean stating whether the request should be retried :rtype: bool """ + # PPAF will have its own retry logic based on consecutive failures before failing over to the next region + if self.request and self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): + if (self.global_endpoint_manager.ppaf_thresholds_tracker.get_pk_failures(self.pk_range_wrapper) + >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, + Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): + # If the PPAF threshold is reached, we reset the count and retry to the next region + self.global_endpoint_manager.ppaf_thresholds_tracker.clear_pk_failures(self.pk_range_wrapper) + partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[ + self.pk_range_wrapper] + partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) + self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) + return True + # we retry only if the request is a read operation or if it is a write operation with retry enabled if self.request and not self.is_operation_retryable(): return False diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 1cd2a22039b4..310998728ed6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -24,17 +24,28 @@ import copy import json import time +from typing import Any from urllib.parse import urlparse +from azure.core import AsyncPipelineClient from azure.core.exceptions import DecodeError # type: 
ignore from .. import exceptions from .. import http_constants from . import _retry_utility_async +from ..documents import ConnectionPolicy +from .._request_object import RequestObject +from ._global_partition_endpoint_manager_per_partition_automatic_failover_async import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync from .._synchronized_request import _request_body_from_data, _replace_url_prefix -async def _Request(global_endpoint_manager, request_params, connection_policy, pipeline_client, request, **kwargs): # pylint: disable=too-many-statements +async def _Request( + global_endpoint_manager: _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, + request_params: RequestObject, + connection_policy: ConnectionPolicy, + pipeline_client: AsyncPipelineClient, + request: Any, + **kwargs): # pylint: disable=too-many-statements """Makes one http request using the requests module. :param _GlobalEndpointManager global_endpoint_manager: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_circuit_breaker_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_circuit_breaker_async.py index c229d37082f0..3918a12622d0 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_circuit_breaker_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_circuit_breaker_async.py @@ -35,7 +35,7 @@ if TYPE_CHECKING: from azure.cosmos.aio._cosmos_client_connection_async import CosmosClientConnection - +# cspell:ignore ppcb # pylint: disable=protected-access class _GlobalPartitionEndpointManagerForCircuitBreakerAsync(_GlobalEndpointManager): """ @@ -90,12 +90,13 @@ async def create_pk_range_wrapper(self, request: RequestObject) -> Optional[Part def is_circuit_breaker_applicable(self, request: RequestObject) -> bool: return self.global_partition_endpoint_manager_core.is_circuit_breaker_applicable(request) - async def 
record_failure( + async def record_ppcb_failure( self, - request: RequestObject - ) -> None: + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: if self.is_circuit_breaker_applicable(request): - pk_range_wrapper = await self.create_pk_range_wrapper(request) + if pk_range_wrapper is None: + pk_range_wrapper = await self.create_pk_range_wrapper(request) if pk_range_wrapper: self.global_partition_endpoint_manager_core.record_failure(request, pk_range_wrapper) @@ -110,11 +111,12 @@ def _resolve_service_endpoint_for_partition_circuit_breaker( pk_range_wrapper) return self._resolve_service_endpoint(request) - async def record_success( + async def record_ppcb_success( self, - request: RequestObject - ) -> None: + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: if self.is_circuit_breaker_applicable(request): - pk_range_wrapper = await self.create_pk_range_wrapper(request) + if pk_range_wrapper is None: + pk_range_wrapper = await self.create_pk_range_wrapper(request) if pk_range_wrapper: self.global_partition_endpoint_manager_core.record_success(request, pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index c356f386e9f3..175a3df025b6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -14,6 +14,7 @@ _GlobalPartitionEndpointManagerForCircuitBreakerAsync from azure.cosmos.documents import _OperationType +from azure.cosmos._partition_health_tracker import _PPAFPartitionThresholdsTracker from azure.cosmos._request_object import RequestObject from 
azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper @@ -65,6 +66,7 @@ class _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync( def __init__(self, client: "CosmosClientConnection"): super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, self).__init__(client) self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} + self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() def is_per_partition_automatic_failover_applicable(self, request: RequestObject) -> bool: if not request: @@ -140,3 +142,27 @@ def compute_available_preferred_regions( for region in available_regions } return available_regional_endpoints + + async def record_failure(self, + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: + """Records a failure for the given partition key range and request.""" + if self.is_per_partition_automatic_failover_applicable(request): + if pk_range_wrapper is None: + pk_range_wrapper = await self.create_pk_range_wrapper(request) + if pk_range_wrapper: + self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper) + else: + await self.record_ppcb_failure(request, pk_range_wrapper) + + async def record_success(self, + request: RequestObject, + pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: + """Records a failure for the given partition key range and request.""" + if self.is_per_partition_automatic_failover_applicable(request): + if pk_range_wrapper is None: + pk_range_wrapper = await self.create_pk_range_wrapper(request) + if pk_range_wrapper: + self.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) + else: + await self.record_ppcb_success(request, pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 1c4d44b647b5..5b8e041541d3 100644 --- 
a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -114,7 +114,7 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg try: if args: result = await ExecuteFunctionAsync(function, global_endpoint_manager, *args, **kwargs) - await global_endpoint_manager.record_success(args[0]) + await global_endpoint_manager.record_success(args[0], pk_range_wrapper) else: result = await ExecuteFunctionAsync(function, *args, **kwargs) if not client.last_response_headers: @@ -193,7 +193,7 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: # record the failure for circuit breaker tracking if args: - await global_endpoint_manager.record_failure(args[0]) + await global_endpoint_manager.record_failure(args[0], pk_range_wrapper) # TODO: change this to track errors for ppaf retry_policy = timeout_failover_retry_policy else: @@ -242,12 +242,12 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg _handle_service_request_retries(client, service_request_retry_policy, e, *args) else: if args: - await global_endpoint_manager.record_failure(args[0]) + await global_endpoint_manager.record_failure(args[0], pk_range_wrapper) _handle_service_response_retries(request, client, service_response_retry_policy, e, *args) # in case customer is not using aiohttp except ImportError: if args: - await global_endpoint_manager.record_failure(args[0]) + await global_endpoint_manager.record_failure(args[0], pk_range_wrapper) _handle_service_response_retries(request, client, service_response_retry_policy, e, *args) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 69d18595e123..efdb71b8d9ea 100644 --- 
a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -16,7 +16,7 @@ def create_errors(): errors = [] - error_codes = [403, 503] + error_codes = [403, 408, 500, 502, 503, 504] for error_code in error_codes: if error_code == 403: errors.append(CosmosHttpResponseError( @@ -91,6 +91,7 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint # Based on our configuration, we should have had one error followed by a success - marking only the previous endpoint as unavailable + # TODO: add logic here to deal with consecutive failures case perform_write_operation( write_operation, container, From 2e5838cdb22bb1df9b6a9c982f5614e2af5dd63d Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:50:42 -0400 Subject: [PATCH 20/68] update constant use, rollback session token PR change --- sdk/cosmos/azure-cosmos/azure/cosmos/documents.py | 2 +- sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/documents.py b/sdk/cosmos/azure-cosmos/azure/cosmos/documents.py index 4698c7378dee..8d314179ef38 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/documents.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/documents.py @@ -361,7 +361,7 @@ def __init__(self) -> None: self.ProxyConfiguration: Optional[ProxyConfiguration] = None self.EnableEndpointDiscovery: bool = True self.PreferredLocations: List[str] = [] - self.ExcludedLocations: Optional[List[str]] = None + self.ExcludedLocations: List[str] = [] self.RetryOptions: RetryOptions = RetryOptions() self.DisableSSLVerification: bool = False self.UseMultipleWriteLocations: bool = False diff --git 
a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py index 49981dba1db9..2adff2e79327 100644 --- a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py +++ b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py @@ -33,6 +33,7 @@ from requests import Session from azure.cosmos import documents +from azure.cosmos._constants import _Constants as Constants import test_config from azure.cosmos.exceptions import CosmosHttpResponseError @@ -295,8 +296,7 @@ def transform_topology_ppaf_enabled( # cspell:disable-line if response.status_code == 200 and data: data = data.decode("utf-8") result = json.loads(data) - # TODO: need to verify below behavior against actual Cosmos DB service response - result["enablePerPartitionFailoverBehavior"] = True + result[Constants.EnablePerPartitionFailoverBehavior] = True FaultInjectionTransport.logger.info("Transformed Account Topology: {}".format(result)) request: HttpRequest = response.request return FaultInjectionTransport.MockHttpResponse(request, 200, result) From 8b7d1819eb7b780c1c3e61c859ebee4acbb98697 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 18 Aug 2025 19:50:31 -0400 Subject: [PATCH 21/68] threshold based retries --- sdk/cosmos/azure-cosmos/azure/cosmos/_base.py | 21 ++++ .../_endpoint_discovery_retry_policy.py | 22 ++-- ...anager_per_partition_automatic_failover.py | 1 - .../azure/cosmos/_retry_utility.py | 9 +- .../cosmos/_service_response_retry_policy.py | 6 +- .../_service_unavailable_retry_policy.py | 15 +-- .../cosmos/_timeout_failover_retry_policy.py | 16 +-- ..._per_partition_automatic_failover_async.py | 5 +- .../azure/cosmos/aio/_retry_utility_async.py | 6 +- .../test_per_partition_automatic_failover.py | 100 +++++++++++++++--- ..._per_partition_automatic_failover_async.py | 71 +++++++++++-- .../test_per_partition_circuit_breaker_mm.py | 8 +- 
..._per_partition_circuit_breaker_mm_async.py | 8 +- 13 files changed, 211 insertions(+), 77 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py index 7a5df84f8816..d040d03c18c9 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py @@ -28,6 +28,7 @@ import uuid import re import binascii +import os from typing import Dict, Any, List, Mapping, Optional, Sequence, Union, Tuple, TYPE_CHECKING from urllib.parse import quote as urllib_quote @@ -45,7 +46,10 @@ if TYPE_CHECKING: from ._cosmos_client_connection import CosmosClientConnection from .aio._cosmos_client_connection_async import CosmosClientConnection as AsyncClientConnection + from ._global_partition_endpoint_manager_per_partition_automatic_failover import ( + _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover) from ._request_object import RequestObject + from ._routing.routing_range import PartitionKeyRangeWrapper # pylint: disable=protected-access @@ -933,3 +937,20 @@ def _build_properties_cache(properties: Dict[str, Any], container_link: str) -> "_self": properties.get("_self", None), "_rid": properties.get("_rid", None), "partitionKey": properties.get("partitionKey", None), "container_link": container_link } + +def try_ppaf_failover_threshold( + global_endpoint_manager: "_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover", + pk_range_wrapper: "PartitionKeyRangeWrapper", + request: "RequestObject"): + """Check if the PPAF threshold is reached for the current partition range, and mark endpoint unavailable if so. 
+ """ + # If PPAF is enabled, we track consecutive failures for certain exceptions, and only fail over at a partition + # level after the threshold is reached + if request and global_endpoint_manager.is_per_partition_automatic_failover_applicable(request): + if (global_endpoint_manager.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) + >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, + Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): + # If the PPAF threshold is reached, we reset the count and retry to the next region + global_endpoint_manager.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) + partition_level_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + partition_level_info.unavailable_regional_endpoints.add(request.location_endpoint_to_route) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index aabf247936fc..f29daf770891 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -62,17 +62,6 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument self.failover_retry_count += 1 - if self.request.location_endpoint_to_route: - if _OperationType.IsReadOnlyOperation(self.request.operation_type): - # Mark current read endpoint as unavailable - self.global_endpoint_manager.mark_endpoint_unavailable_for_read( - self.request.location_endpoint_to_route, - True) - else: - self.global_endpoint_manager.mark_endpoint_unavailable_for_write( - self.request.location_endpoint_to_route, - True) - # set the refresh_needed flag to ensure that endpoint list is # refreshed with new writable and readable locations self.global_endpoint_manager.refresh_needed = True @@ -85,6 +74,17 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument 
self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) return True + if self.request.location_endpoint_to_route: + if _OperationType.IsReadOnlyOperation(self.request.operation_type): + # Mark current read endpoint as unavailable + self.global_endpoint_manager.mark_endpoint_unavailable_for_read( + self.request.location_endpoint_to_route, + True) + else: + self.global_endpoint_manager.mark_endpoint_unavailable_for_write( + self.request.location_endpoint_to_route, + True) + # clear previous location-based routing directive self.request.clear_route_to_location() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index a1bcae1da3d8..33a0bccae4d3 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -120,7 +120,6 @@ def resolve_service_endpoint_for_partition( partition_failover_info = PartitionLevelFailoverInfo() partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info - return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) def compute_available_preferred_regions( diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index ee5ded634082..ba9800ba223f 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -44,7 +44,7 @@ # pylint: disable=protected-access, disable=too-many-lines, 
disable=too-many-statements, disable=too-many-branches -# cspell:ignore ppaf +# cspell:ignore PPAF,ppaf # args [0] is the request object # args [1] is the connection policy @@ -191,10 +191,12 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin retry_policy.container_rid = cached_container["_rid"] request.headers[retry_policy._intended_headers] = retry_policy.container_rid elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: + if args and global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0]): + global_endpoint_manager.record_failure(args[0], pk_range_wrapper) retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: - # record the failure for circuit breaker tracking + # record the failure for ppaf/circuit breaker tracking global_endpoint_manager.record_failure(args[0], pk_range_wrapper) retry_policy = timeout_failover_retry_policy else: @@ -275,7 +277,8 @@ def _handle_service_request_retries( raise exception def _handle_service_response_retries(request, client, response_retry_policy, exception, *args): - if request and (_has_read_retryable_headers(request.headers) or (args and is_write_retryable(args[0], client))): + if request and (_has_read_retryable_headers(request.headers) or (args and is_write_retryable(args[0], client)) or + (args and client._global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0]))): # we resolve the request endpoint to the next preferred region # once we are out of preferred regions we stop retrying retry_policy = response_retry_policy diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_response_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_response_retry_policy.py index b49b1dc35994..31f6e800d5e1 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_response_retry_policy.py +++ 
b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_response_retry_policy.py @@ -6,9 +6,11 @@ from the service, and as such we do not know what the output of the operation was. As such, we only do cross regional retries for read operations. """ +#cspell:ignore PPAF, ppaf import logging from azure.cosmos.documents import _OperationType +from azure.cosmos._base import try_ppaf_failover_threshold class ServiceResponseRetryPolicy(object): @@ -47,7 +49,9 @@ def ShouldRetry(self): return False if self.request: - + # We track consecutive failures for per partition automatic failover, and only fail over at a partition + # level after the threshold is reached + try_ppaf_failover_threshold(self.global_endpoint_manager, self.pk_range_wrapper, self.request) if not _OperationType.IsReadOnlyOperation(self.request.operation_type) and not self.request.retry_write: return False if self.request.retry_write and self.failover_retry_count + 1 >= self.max_write_retry_count: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py index 3dc3df1aac70..b07a1b910034 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -4,6 +4,7 @@ """Internal class for service unavailable retry policy implementation in the Azure Cosmos database service. 
""" +from azure.cosmos._base import try_ppaf_failover_threshold class _ServiceUnavailableRetryPolicy(object): @@ -11,12 +12,11 @@ def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, self.retry_after_in_milliseconds = 500 self.global_endpoint_manager = global_endpoint_manager self.pk_range_wrapper = pk_range_wrapper - # If an account only has 1 region, then we still want to retry once on the same region - self._max_retry_attempt_count = (len(self.global_endpoint_manager.location_cache.read_regional_routing_contexts) - + 1) self.retry_count = 0 self.connection_policy = connection_policy self.request = args[0] if args else None + # If an account only has 1 region, then we still want to retry once on the same region + self._max_retry_attempt_count = max(2, (len(self.global_endpoint_manager.location_cache.read_regional_routing_contexts))) def ShouldRetry(self, _exception): """Returns true if the request should retry based on the passed-in exception. @@ -35,20 +35,13 @@ def ShouldRetry(self, _exception): return False if self.request: + try_ppaf_failover_threshold(self.global_endpoint_manager, self.pk_range_wrapper, self.request) location_endpoint = self.resolve_next_region_service_endpoint() self.request.route_to_location(location_endpoint) return True # This function prepares the request to go to the next region def resolve_next_region_service_endpoint(self): - if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): - # If per partition automatic failover is applicable, we mark the current endpoint as unavailable - # and resolve the service endpoint for the partition range - otherwise, continue with default retry logic - partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] - partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) - return 
self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, - self.pk_range_wrapper) - # clear previous location-based routing directive self.request.clear_route_to_location() # clear the last routed endpoint within same region since we are going to a new region now diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py index 33b2596ccb5a..c7ee31026bc3 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py @@ -4,9 +4,8 @@ """Internal class for timeout failover retry policy implementation in the Azure Cosmos database service. """ -import os from azure.cosmos.documents import _OperationType -from azure.cosmos._constants import _Constants as Constants +from azure.cosmos._base import try_ppaf_failover_threshold # cspell:ignore PPAF, ppaf @@ -40,18 +39,7 @@ def ShouldRetry(self, _exception): :returns: a boolean stating whether the request should be retried :rtype: bool """ - # PPAF will have its own retry logic based on consecutive failures before failing over to the next region - if self.request and self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): - if (self.global_endpoint_manager.ppaf_thresholds_tracker.get_pk_failures(self.pk_range_wrapper) - >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, - Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): - # If the PPAF threshold is reached, we reset the count and retry to the next region - self.global_endpoint_manager.ppaf_thresholds_tracker.clear_pk_failures(self.pk_range_wrapper) - partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[ - self.pk_range_wrapper] - partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) - 
self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) - return True + try_ppaf_failover_threshold(self.global_endpoint_manager, self.pk_range_wrapper, self.request) # we retry only if the request is a read operation or if it is a write operation with retry enabled if self.request and not self.is_operation_retryable(): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 175a3df025b6..4fac0d1159d4 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -134,7 +134,10 @@ def compute_available_preferred_regions( :return: A set of available regional endpoints. 
:rtype: Set[str] """ - excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations + if request.excluded_locations: + excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations + else: + excluded_locations = self.location_cache.connection_policy.ExcludedLocations preferred_locations = self.PreferredLocations available_regions = [item for item in preferred_locations if item not in excluded_locations] available_regional_endpoints = { diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 5b8e041541d3..3a764676b817 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -189,12 +189,14 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg retry_policy.container_rid = cached_container["_rid"] request.headers[retry_policy._intended_headers] = retry_policy.container_rid elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: + # if ppaf is applicable, we record the failure + if args and global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0]): + await global_endpoint_manager.record_failure(args[0], pk_range_wrapper) retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: - # record the failure for circuit breaker tracking if args: + # record the failure for ppaf/circuit breaker tracking await global_endpoint_manager.record_failure(args[0], pk_range_wrapper) - # TODO: change this to track errors for ppaf retry_policy = timeout_failover_retry_policy else: retry_policy = defaultRetry_policy diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py 
index efdb71b8d9ea..743659e92d5b 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -4,8 +4,8 @@ import uuid import pytest - import test_config +from azure.core.exceptions import ServiceResponseError from azure.cosmos import CosmosClient from azure.cosmos.exceptions import CosmosHttpResponseError from _fault_injection_transport import FaultInjectionTransport @@ -14,19 +14,24 @@ # cspell:disable -def create_errors(): +def create_failover_errors(): errors = [] - error_codes = [403, 408, 500, 502, 503, 504] + error_codes = [403] for error_code in error_codes: - if error_code == 403: - errors.append(CosmosHttpResponseError( - status_code=error_code, - message="Some injected error.", - sub_status=3)) - else: - errors.append(CosmosHttpResponseError( - status_code=error_code, - message="Some injected error.")) + errors.append(CosmosHttpResponseError( + status_code=error_code, + message="Some injected error.", + sub_status=3)) + return errors + +def create_threshold_errors(): + errors = [] + error_codes = [408, 500, 502, 503, 504] + for error_code in error_codes: + errors.append(CosmosHttpResponseError( + status_code=error_code, + message="Some injected error.")) + errors.append(ServiceResponseError(message="Injected Service Response Error.")) return errors # These tests assume that the configured live account has one main write region and one secondary read region. 
@@ -74,10 +79,10 @@ def setup_info(self, error, max_count=None, is_batch=False, **kwargs): custom_setup = self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, **kwargs) return setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): # This test validates that the partition info cache is updated correctly upon failures, and that the - # per-partition automatic failover logic routes requests to the next available regional endpoint + # per-partition automatic failover logic routes requests to the next available regional endpoint on 403.3 errors. error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, 1, write_operation == BATCH) container = setup['col'] @@ -91,7 +96,6 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint # Based on our configuration, we should have had one error followed by a success - marking only the previous endpoint as unavailable - # TODO: add logic here to deal with consecutive failures case perform_write_operation( write_operation, container, @@ -119,12 +123,74 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): assert partition_info.current_regional_endpoint is None - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_threshold_errors())) + def 
test_ppaf_partition_thresholds_and_routing(self, write_operation, error): + # This test validates the consecutive failures logic is properly handled for per-partition automatic failover, + # and that the per-partition automatic failover logic routes requests to the next available regional endpoint + # after enough consecutive failures have occurred. + error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda) + container = setup['col'] + fault_injection_container = custom_setup['col'] + global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager + + # Create a document to populate the per-partition GEM partition range info cache + fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, + 'name': 'sample document', 'key': 'value'}) + pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] + initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint + + + is_503 = hasattr(error, 'status_code') and error.status_code == 503 + # Since 503 errors are retried by default, we each request counts as two failures + consecutive_failures = 3 if is_503 else 6 + + for i in range(consecutive_failures): + # We perform the write operation multiple times to check the consecutive failures logic + with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: + perform_write_operation(write_operation, + container, + fault_injection_container, + doc_fail_id, + PK_VALUE) + assert exc_info.value == error + # Verify that the threshold for consecutive failures is updated + pk_range_wrappers = list(global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count.keys()) + assert len(pk_range_wrappers) == 1 + failure_count = 
global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] + assert failure_count == 6 + # Run some more requests to the same partition to trigger the failover logic + for i in range(consecutive_failures): + with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: + perform_write_operation(write_operation, + container, + fault_injection_container, + doc_fail_id, + PK_VALUE) + assert exc_info.value == error + # We should have marked the previous endpoint as unavailable after 10 successive failures + partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + # Verify that the partition is marked as unavailable, and that the current regional endpoint is not the same + assert len(partition_info.unavailable_regional_endpoints) == 1 + assert initial_endpoint in partition_info.unavailable_regional_endpoints + assert initial_endpoint != partition_info.current_regional_endpoint # west us 3 != west us + + # Since we are failing every request, even though we retried to the next region, that retry should have failed as well + # This means we should have one extra failure - verify that the value makes sense + failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] + assert failure_count == 1 if is_503 else 3 + + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) def test_ppaf_exclude_regions(self, write_operation, error): # TODO: finish this test return + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) + def test_ppaf_invalid_configs(self, write_operation, error): + # TODO: finish this test + return + if __name__ == '__main__': - unittest.main() + unittest.main() \ No newline at end of file diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py 
b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 0321a63fa799..99e433bc6af4 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -10,10 +10,12 @@ import test_config from azure.core.pipeline.transport._aiohttp import AioHttpTransport +from azure.core.exceptions import ServiceResponseError +from azure.cosmos.exceptions import CosmosHttpResponseError from azure.cosmos.aio import CosmosClient from _fault_injection_transport import FaultInjectionTransport from _fault_injection_transport_async import FaultInjectionTransportAsync -from test_per_partition_automatic_failover import create_errors +from test_per_partition_automatic_failover import create_failover_errors, create_threshold_errors from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors from test_per_partition_circuit_breaker_mm_async import perform_write_operation @@ -70,14 +72,11 @@ async def setup_info(self, error, max_count=None, is_batch=False, **kwargs): custom_setup = await self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, **kwargs) return setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation, error): # This test validates that the partition info cache is updated correctly upon failures, and that the # per-partition automatic failover logic routes requests to the next available regional endpoint - error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay( - 0, - error - )) + error_lambda = lambda r: 
asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, 1, write_operation == BATCH) container = setup['col'] fault_injection_container = custom_setup['col'] @@ -107,7 +106,7 @@ async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation write_operation, container, fault_injection_container, - str(uuid.uuid4()), + doc_fail_id, PK_VALUE) partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] # Verify that the cache is empty, since the request going to the second regional endpoint failed @@ -116,8 +115,64 @@ async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation assert initial_endpoint not in partition_info.unavailable_regional_endpoints assert partition_info.current_regional_endpoint is None + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_threshold_errors())) + async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation, error): + # This test validates that the partition info cache is updated correctly upon failures, and that the + # per-partition automatic failover logic routes requests to the next available regional endpoint + error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda) + container = setup['col'] + fault_injection_container = custom_setup['col'] + global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager + + # Create a document to populate the per-partition GEM partition range info cache + await fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, + 'name': 'sample document', 'key': 'value'}) + pk_range_wrapper = 
list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] + initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint + + is_503 = hasattr(error, 'status_code') and error.status_code == 503 + # Since 503 errors are retried by default, we each request counts as two failures + consecutive_failures = 3 if is_503 else 6 + + for i in range(consecutive_failures): + # We perform the write operation multiple times to check the consecutive failures logic + with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: + await perform_write_operation(write_operation, + container, + fault_injection_container, + doc_fail_id, + PK_VALUE) + assert exc_info.value == error + + # Verify that the threshold for consecutive failures is updated + pk_range_wrappers = list(global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count.keys()) + assert len(pk_range_wrappers) == 1 + failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] + assert failure_count == 6 + # Run some more requests to the same partition to trigger the failover logic + for i in range(consecutive_failures): + with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: + await perform_write_operation(write_operation, + container, + fault_injection_container, + doc_fail_id, + PK_VALUE) + assert exc_info.value == error + # We should have marked the previous endpoint as unavailable after 10 successive failures + partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + # Verify that the partition is marked as unavailable, and that the current regional endpoint is not the same + assert len(partition_info.unavailable_regional_endpoints) == 1 + assert initial_endpoint in partition_info.unavailable_regional_endpoints + assert initial_endpoint != partition_info.current_regional_endpoint # west 
us 3 != west us + + # Since we are failing every request, even though we retried to the next region, that retry should have failed as well + # This means we should have one extra failure - verify that the value makes sense + failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] + assert failure_count == 1 if is_503 else 3 + - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_errors())) + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_threshold_errors())) async def test_ppaf_exclude_regions_async(self, write_operation, error): # TODO: finish this test return diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py index 3ed516a0a59c..c741d0b48af2 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py @@ -108,7 +108,7 @@ def perform_write_operation(operation, container, fault_injection_container, doc elif operation == UPSERT: resp = fault_injection_container.upsert_item(body=doc) elif operation == REPLACE: - container.create_item(body=doc) + container.upsert_item(body=doc) sleep(1) new_doc = {'id': doc_id, 'pk': pk, @@ -116,11 +116,11 @@ def perform_write_operation(operation, container, fault_injection_container, doc 'key': 'value'} resp = fault_injection_container.replace_item(item=doc['id'], body=new_doc) elif operation == DELETE: - container.create_item(body=doc) + container.upsert_item(body=doc) sleep(1) resp = fault_injection_container.delete_item(item=doc['id'], partition_key=doc['pk']) elif operation == PATCH: - container.create_item(body=doc) + container.upsert_item(body=doc) sleep(1) operations = [{"op": "incr", "path": "/company", "value": 3}] resp = fault_injection_container.patch_item(item=doc['id'], 
partition_key=doc['pk'], patch_operations=operations) @@ -134,7 +134,7 @@ def perform_write_operation(operation, container, fault_injection_container, doc resp = fault_injection_container.execute_item_batch(batch_operations, partition_key=doc['pk']) # this will need to be emulator only elif operation == DELETE_ALL_ITEMS_BY_PARTITION_KEY: - container.create_item(body=doc) + container.upsert_item(body=doc) resp = fault_injection_container.delete_all_items_by_partition_key(pk) if resp and expected_uri: validate_response_uri(resp, expected_uri) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py index 60e2603c1842..40147314e4ff 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py @@ -33,7 +33,7 @@ async def perform_write_operation(operation, container, fault_injection_containe elif operation == UPSERT: resp = await fault_injection_container.upsert_item(body=doc) elif operation == REPLACE: - await container.create_item(body=doc) + await container.upsert_item(body=doc) new_doc = {'id': doc_id, 'pk': pk, 'name': 'sample document' + str(uuid), @@ -41,11 +41,11 @@ async def perform_write_operation(operation, container, fault_injection_containe await asyncio.sleep(1) resp = await fault_injection_container.replace_item(item=doc['id'], body=new_doc) elif operation == DELETE: - await container.create_item(body=doc) + await container.upsert_item(body=doc) await asyncio.sleep(1) resp = await fault_injection_container.delete_item(item=doc['id'], partition_key=doc['pk']) elif operation == PATCH: - await container.create_item(body=doc) + await container.upsert_item(body=doc) await asyncio.sleep(1) operations = [{"op": "incr", "path": "/company", "value": 3}] resp = await fault_injection_container.patch_item(item=doc['id'], partition_key=doc['pk'], 
patch_operations=operations) @@ -59,7 +59,7 @@ async def perform_write_operation(operation, container, fault_injection_containe resp = await fault_injection_container.execute_item_batch(batch_operations, partition_key=doc['pk']) # this will need to be emulator only elif operation == DELETE_ALL_ITEMS_BY_PARTITION_KEY: - await container.create_item(body=doc) + await container.upsert_item(body=doc) resp = await fault_injection_container.delete_all_items_by_partition_key(pk) if resp and expected_uri: validate_response_uri(resp, expected_uri) From d8ed980f860260eb3711537c2e48e2f6b00e8364 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 18 Aug 2025 21:58:06 -0400 Subject: [PATCH 22/68] Update _base.py --- sdk/cosmos/azure-cosmos/azure/cosmos/_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py index d040d03c18c9..cd2050cf1dab 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py @@ -52,6 +52,7 @@ from ._routing.routing_range import PartitionKeyRangeWrapper # pylint: disable=protected-access +#cspell:ignore PPAF, ppaf _COMMON_OPTIONS = { 'initial_headers': 'initialHeaders', From fcd5c60c44e6763a29c54ccd12d13f635ac92ba7 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 19 Aug 2025 00:28:56 -0400 Subject: [PATCH 23/68] cspell, test fixes --- ...bal_partition_endpoint_manager_circuit_breaker.py | 2 ++ ...point_manager_per_partition_automatic_failover.py | 1 + .../cosmos/_service_unavailable_retry_policy.py | 9 ++++++++- .../azure/cosmos/_timeout_failover_retry_policy.py | 9 ++++----- ...manager_per_partition_automatic_failover_async.py | 1 + .../test_timeout_and_failover_retry_policy_async.py | 12 ++++++++---- 6 files changed, 24 insertions(+), 10 deletions(-) diff --git 
a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py index 94fc4eafb98e..ca58e6eb2e1b 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_circuit_breaker.py @@ -35,6 +35,8 @@ if TYPE_CHECKING: from azure.cosmos._cosmos_client_connection import CosmosClientConnection +#cspell:ignore ppcb + class _GlobalPartitionEndpointManagerForCircuitBreaker(_GlobalEndpointManager): """ This internal class implements the logic for partition endpoint management for diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 33a0bccae4d3..5514615a92a2 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -24,6 +24,7 @@ logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") # pylint: disable=name-too-long, protected-access +#cspell:ignore PPAF, ppaf, ppcb class PartitionLevelFailoverInfo: """ diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py index b07a1b910034..cb3246e6db7a 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -4,8 +4,11 @@ """Internal class for service unavailable retry policy implementation in the Azure Cosmos database service. 
""" +from azure.cosmos.documents import _OperationType from azure.cosmos._base import try_ppaf_failover_threshold +#cspell:ignore ppaf + class _ServiceUnavailableRetryPolicy(object): def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, *args): @@ -16,7 +19,11 @@ def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, self.connection_policy = connection_policy self.request = args[0] if args else None # If an account only has 1 region, then we still want to retry once on the same region - self._max_retry_attempt_count = max(2, (len(self.global_endpoint_manager.location_cache.read_regional_routing_contexts))) + self._max_retry_attempt_count = max(2, len(self.global_endpoint_manager.location_cache + .read_regional_routing_contexts)) + if _OperationType.IsWriteOperation(self.request.operation_type): + self._max_retry_attempt_count = max(2, len( + self.global_endpoint_manager.location_cache.write_regional_routing_contexts)) def ShouldRetry(self, _exception): """Returns true if the request should retry based on the passed-in exception. 
diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py index c7ee31026bc3..952685ef5e06 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py @@ -21,13 +21,12 @@ def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, # If an account only has 1 region, then we still want to retry once on the same region # We want this to be the default retry attempts as paging through a query means there are requests without # a request object - self._max_retry_attempt_count = len(self.global_endpoint_manager.location_cache - .read_regional_routing_contexts) + 1 + self._max_retry_attempt_count = max(2, len(self.global_endpoint_manager.location_cache + .read_regional_routing_contexts)) # If the request is a write operation, we only want to retry once if retry write is enabled if self.request and _OperationType.IsWriteOperation(self.request.operation_type): - self._max_retry_attempt_count = len( - self.global_endpoint_manager.location_cache.write_regional_routing_contexts - ) + 1 + self._max_retry_attempt_count = max(2, len( + self.global_endpoint_manager.location_cache.write_regional_routing_contexts)) self.retry_count = 0 self.connection_policy = connection_policy self.request = args[0] if args else None diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 4fac0d1159d4..a31caaac8238 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -24,6 +24,7 @@ 
logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") # pylint: disable=name-too-long, protected-access +#cspell:ignore PPAF, ppaf, ppcb class PartitionLevelFailoverInfo: """ diff --git a/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy_async.py b/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy_async.py index 15c41cac9410..bf73d6c48a12 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy_async.py @@ -38,6 +38,7 @@ async def setup(): def error_codes(): + return [503] return [408, 500, 502, 503] @@ -79,9 +80,10 @@ async def test_timeout_failover_retry_policy_for_read_failure_async(self, setup, created_document = await setup[COLLECTION].create_item(body=document_definition) self.original_execute_function = _retry_utility_async.ExecuteFunctionAsync + num_exceptions = max(2, len(setup[COLLECTION].client_connection._global_endpoint_manager.location_cache.read_regional_routing_contexts)) try: - # should retry once and then succeed - mf = self.MockExecuteFunction(self.original_execute_function, 2, error_code) + # should retry and then succeed + mf = self.MockExecuteFunction(self.original_execute_function, num_exceptions, error_code) _retry_utility_async.ExecuteFunctionAsync = mf await setup[COLLECTION].read_item(item=created_document['id'], partition_key=created_document['pk']) @@ -131,9 +133,11 @@ async def test_timeout_failover_retry_policy_for_write_failure_async(self, setup 'key': 'value'} self.original_execute_function = _retry_utility_async.ExecuteFunctionAsync + num_exceptions_503 = max(2, len(setup[COLLECTION].client_connection._global_endpoint_manager.location_cache.write_regional_routing_contexts)) try: - # timeouts should fail immediately for writes - mf = self.MockExecuteFunction(self.original_execute_function,0, error_code) + # timeouts should fail 
immediately for writes - except for 503s, which should retry on every preferred location + num_exceptions = num_exceptions_503 if error_code == 503 else 0 + mf = self.MockExecuteFunction(self.original_execute_function,num_exceptions, error_code) _retry_utility_async.ExecuteFunctionAsync = mf try: await setup[COLLECTION].create_item(body=document_definition) From 467a95d78e90956b01ac72d134529bd6a9fd2d09 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 19 Aug 2025 11:43:27 -0400 Subject: [PATCH 24/68] Update _service_unavailable_retry_policy.py --- .../azure/cosmos/_service_unavailable_retry_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py index cb3246e6db7a..e212e869f6ef 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -21,7 +21,7 @@ def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, # If an account only has 1 region, then we still want to retry once on the same region self._max_retry_attempt_count = max(2, len(self.global_endpoint_manager.location_cache .read_regional_routing_contexts)) - if _OperationType.IsWriteOperation(self.request.operation_type): + if self.request and _OperationType.IsWriteOperation(self.request.operation_type): self._max_retry_attempt_count = max(2, len( self.global_endpoint_manager.location_cache.write_regional_routing_contexts)) From b9aa01ccbc7e01fee34c27784a38de0920757f17 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 19 Aug 2025 12:34:18 -0400 Subject: [PATCH 25/68] mypy, pylint --- sdk/cosmos/azure-cosmos/azure/cosmos/_base.py | 2 -- ...int_manager_per_partition_automatic_failover.py | 14 ++++++++++++-- 
.../azure/cosmos/_partition_health_tracker.py | 2 +- .../azure-cosmos/azure/cosmos/_retry_utility.py | 4 ++-- .../azure/cosmos/_synchronized_request.py | 5 +++-- .../azure/cosmos/aio/_asynchronous_request.py | 3 ++- ...nager_per_partition_automatic_failover_async.py | 14 ++++++++++++-- 7 files changed, 32 insertions(+), 12 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py index cd2050cf1dab..a4483a7277a8 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py @@ -943,8 +943,6 @@ def try_ppaf_failover_threshold( global_endpoint_manager: "_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover", pk_range_wrapper: "PartitionKeyRangeWrapper", request: "RequestObject"): - """Check if the PPAF threshold is reached for the current partition range, and mark endpoint unavailable if so. - """ # If PPAF is enabled, we track consecutive failures for certain exceptions, and only fail over at a partition # level after the threshold is reached if request and global_endpoint_manager.is_per_partition_automatic_failover_applicable(request): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 5514615a92a2..f766805ac494 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -148,7 +148,12 @@ def compute_available_preferred_regions( def record_failure(self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: - """Records a failure for the given partition key range and request.""" + """Records a failure for the given partition key range and request. 
+ :param RequestObject request: The request object containing the routing context. + :param PartitionKeyRangeWrapper pk_range_wrapper: The wrapper containing the partition key range information + for the request. + :return: None + """ if self.is_per_partition_automatic_failover_applicable(request): if pk_range_wrapper is None: pk_range_wrapper = self.create_pk_range_wrapper(request) @@ -160,7 +165,12 @@ def record_failure(self, def record_success(self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: - """Records a failure for the given partition key range and request.""" + """Records a success for the given partition key range and request, effectively clearing the failure count. + :param RequestObject request: The request object containing the routing context. + :param PartitionKeyRangeWrapper pk_range_wrapper: The wrapper containing the partition key range information + for the request. + :return: None + """ if self.is_per_partition_automatic_failover_applicable(request): if pk_range_wrapper is None: pk_range_wrapper = self.create_pk_range_wrapper(request) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py index e7c44d01120f..f8bf79f956d8 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py @@ -314,4 +314,4 @@ def clear_pk_failures(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> None: del self.pk_range_wrapper_to_failure_count[pk_range_wrapper] def get_pk_failures(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> int: - return self.pk_range_wrapper_to_failure_count.get(pk_range_wrapper, 0) \ No newline at end of file + return self.pk_range_wrapper_to_failure_count.get(pk_range_wrapper, 0) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 
3033c8d337ed..0eadffda2cb4 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -294,8 +294,8 @@ def _handle_service_request_retries( raise exception def _handle_service_response_retries(request, client, response_retry_policy, exception, *args): - if request and (_has_read_retryable_headers(request.headers) or (args and is_write_retryable(args[0], client)) or - (args and client._global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0]))): + if request and (_has_read_retryable_headers(request.headers) or (args and (is_write_retryable(args[0], client) or + client._global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0])))): # we resolve the request endpoint to the next preferred region # once we are out of preferred regions we stop retrying retry_policy = response_retry_policy diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py index 8e70f443e052..38aea17474e1 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py @@ -25,15 +25,16 @@ import json import time +from typing import Any from urllib.parse import urlparse from azure.core.exceptions import DecodeError # type: ignore from azure.core import PipelineClient -from typing import Any from . 
import exceptions, http_constants, _retry_utility from .documents import ConnectionPolicy from ._request_object import RequestObject -from ._global_partition_endpoint_manager_per_partition_automatic_failover import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover +from ._global_partition_endpoint_manager_per_partition_automatic_failover import ( + _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover) def _is_readable_stream(obj): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 310998728ed6..6d8c944404b8 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -35,7 +35,8 @@ from . import _retry_utility_async from ..documents import ConnectionPolicy from .._request_object import RequestObject -from ._global_partition_endpoint_manager_per_partition_automatic_failover_async import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync +from ._global_partition_endpoint_manager_per_partition_automatic_failover_async import ( + _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync) from .._synchronized_request import _request_body_from_data, _replace_url_prefix diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index a31caaac8238..837665386de5 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -150,7 +150,12 @@ def compute_available_preferred_regions( async def record_failure(self, request: RequestObject, 
pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: - """Records a failure for the given partition key range and request.""" + """Records a failure for the given partition key range and request. + :param RequestObject request: The request object containing the routing context. + :param PartitionKeyRangeWrapper pk_range_wrapper: The wrapper containing the partition key range information + for the request. + :return: None + """ if self.is_per_partition_automatic_failover_applicable(request): if pk_range_wrapper is None: pk_range_wrapper = await self.create_pk_range_wrapper(request) @@ -162,7 +167,12 @@ async def record_failure(self, async def record_success(self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: - """Records a failure for the given partition key range and request.""" + """Records a success for the given partition key range and request, effectively clearing the failure count. + :param RequestObject request: The request object containing the routing context. + :param PartitionKeyRangeWrapper pk_range_wrapper: The wrapper containing the partition key range information + for the request. 
+ :return: None + """ if self.is_per_partition_automatic_failover_applicable(request): if pk_range_wrapper is None: pk_range_wrapper = await self.create_pk_range_wrapper(request) From 64f95e34ccb0a4078a2c7a4ff3ddf55e30c3591c Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 21 Aug 2025 01:00:23 -0400 Subject: [PATCH 26/68] 503 behavior change, use regional contexts --- sdk/cosmos/azure-cosmos/azure/cosmos/_base.py | 6 +- .../_endpoint_discovery_retry_policy.py | 8 ++- ...anager_per_partition_automatic_failover.py | 59 +++++++++++-------- .../azure/cosmos/_partition_health_tracker.py | 2 +- .../azure/cosmos/_retry_utility.py | 2 - .../_service_unavailable_retry_policy.py | 14 ++++- ..._per_partition_automatic_failover_async.py | 58 ++++++++++-------- .../azure/cosmos/aio/_retry_utility_async.py | 2 - .../test_per_partition_automatic_failover.py | 50 +++++++++++----- ..._per_partition_automatic_failover_async.py | 16 ++--- 10 files changed, 137 insertions(+), 80 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py index a4483a7277a8..3fe63759fc8a 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py @@ -952,4 +952,8 @@ def try_ppaf_failover_threshold( # If the PPAF threshold is reached, we reset the count and retry to the next region global_endpoint_manager.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) partition_level_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] - partition_level_info.unavailable_regional_endpoints.add(request.location_endpoint_to_route) + location = global_endpoint_manager.location_cache.get_location_from_endpoint( + str(request.location_endpoint_to_route)) + regional_context = (global_endpoint_manager.location_cache. 
+ account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) + partition_level_info.unavailable_regional_endpoints[location] = regional_context diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index f29daf770891..df4def458554 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -67,10 +67,14 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument self.global_endpoint_manager.refresh_needed = True # If per partition automatic failover is applicable, we mark the current endpoint as unavailable - # and resolve the service endpoint for the partition range - otherwise, continue with the default retry logic + # and resolve the service endpoint for the partition range - otherwise, continue the default retry logic if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] - partition_level_info.unavailable_regional_endpoints.add(self.request.location_endpoint_to_route) + location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( + str(self.request.location_endpoint_to_route)) + regional_context = (self.global_endpoint_manager.location_cache. 
+ account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) + partition_level_info.unavailable_regional_endpoints[location] = regional_context self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) return True diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index f766805ac494..4def9cd308b6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -7,14 +7,14 @@ import logging import threading -from typing import Dict, Set, TYPE_CHECKING, Optional +from typing import Dict, TYPE_CHECKING, Optional from azure.cosmos.http_constants import ResourceType from azure.cosmos._global_partition_endpoint_manager_circuit_breaker import \ _GlobalPartitionEndpointManagerForCircuitBreaker from azure.cosmos._partition_health_tracker import _PPAFPartitionThresholdsTracker from azure.cosmos.documents import _OperationType - +from azure.cosmos._location_cache import RegionalRoutingContext from azure.cosmos._request_object import RequestObject from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper @@ -32,28 +32,34 @@ class PartitionLevelFailoverInfo: Used to track the partition key range and the regions where it is available. 
""" def __init__(self): - self.unavailable_regional_endpoints = set() - self.current_regional_endpoint = None + self.unavailable_regional_endpoints: Dict[str, RegionalRoutingContext] = {} + self.current_region = None self._lock = threading.Lock() - def try_move_to_next_location(self, available_account_regional_endpoints: Set[str], request: RequestObject) -> bool: + def try_move_to_next_location( + self, + available_account_regional_endpoints: Dict[str, str], + endpoint_region: str, + request: RequestObject) -> bool: with self._lock: - failed_regional_endpoint = request.location_endpoint_to_route - if failed_regional_endpoint != self.current_regional_endpoint: - logger.info("Moving to next available regional endpoint: %s", self.current_regional_endpoint) - request.route_to_location(self.current_regional_endpoint) + if endpoint_region != self.current_region: + logger.info("PPAF - Moving to next available regional endpoint: %s", self.current_region) + # make the actual endpoint since the current_region is just West US + regional_endpoint = available_account_regional_endpoints[self.current_region] + request.route_to_location(regional_endpoint) return True for regional_endpoint in available_account_regional_endpoints: - if regional_endpoint == self.current_regional_endpoint: + if regional_endpoint == self.current_region: continue if regional_endpoint in self.unavailable_regional_endpoints: continue - self.current_regional_endpoint = regional_endpoint - logger.info("Moving to next available regional endpoint: %s", self.current_regional_endpoint) - request.route_to_location(self.current_regional_endpoint) + self.current_region = regional_endpoint + logger.info("PPAF - Moving to next available regional endpoint: %s", self.current_region) + regional_endpoint = available_account_regional_endpoints[self.current_region] + request.route_to_location(regional_endpoint) return True return False @@ -102,10 +108,12 @@ def resolve_service_endpoint_for_partition( 
logger.info("Resolving service endpoint for partition with per partition automatic failover enabled.") partition_failover_info = self.partition_range_to_failover_info[pk_range_wrapper] if request.location_endpoint_to_route is not None: - if request.location_endpoint_to_route in partition_failover_info.unavailable_regional_endpoints: + endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) + if endpoint_region in partition_failover_info.unavailable_regional_endpoints: # If the current region is unavailable, we try to move to the next available region if not partition_failover_info.try_move_to_next_location( self.compute_available_preferred_regions(request), + endpoint_region, request): logger.info("All available regions for partition are unavailable. Refreshing cache.") # If no other region is available, we invalidate the cache and start once again from our @@ -116,22 +124,26 @@ def resolve_service_endpoint_for_partition( pk_range_wrapper) else: # Update the current regional endpoint to whatever the request is routing to - partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route + endpoint_region = self.location_cache.get_location_from_endpoint( + request.location_endpoint_to_route) + partition_failover_info.current_region = endpoint_region else: partition_failover_info = PartitionLevelFailoverInfo() - partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route + endpoint_region = self.location_cache.get_location_from_endpoint( + request.location_endpoint_to_route) + partition_failover_info.current_region = endpoint_region self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) def compute_available_preferred_regions( self, request: RequestObject - ) -> Set[str]: + ) -> Dict[str, str]: """ Computes the available regional endpoints for the request 
based on customer-set preferred and excluded regions. :param RequestObject request: The request object containing the routing context. :return: A set of available regional endpoints. - :rtype: Set[str] + :rtype: Dict[str, str] """ if request.excluded_locations: excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations @@ -139,10 +151,10 @@ def compute_available_preferred_regions( excluded_locations = self.location_cache.connection_policy.ExcludedLocations preferred_locations = self.PreferredLocations available_regions = [item for item in preferred_locations if item not in excluded_locations] - available_regional_endpoints = { - self.location_cache.account_read_regional_routing_contexts_by_location[region].primary_endpoint - for region in available_regions - } + available_regional_endpoints = {} + for region, context in self.location_cache.account_read_regional_routing_contexts_by_location.items(): + if region in available_regions: + available_regional_endpoints[region] = context.primary_endpoint return available_regional_endpoints def record_failure(self, @@ -155,10 +167,11 @@ def record_failure(self, :return: None """ if self.is_per_partition_automatic_failover_applicable(request): + location = self.location_cache.get_location_from_endpoint(str(request.location_endpoint_to_route)) if pk_range_wrapper is None: pk_range_wrapper = self.create_pk_range_wrapper(request) if pk_range_wrapper: - self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper) + self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper, location) else: self.record_ppcb_failure(request, pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py index f8bf79f956d8..7358f605d67b 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py @@ -304,7 +304,7 @@ 
class _PPAFPartitionThresholdsTracker(object): def __init__(self) -> None: self.pk_range_wrapper_to_failure_count: Dict[PartitionKeyRangeWrapper, int] = {} - def add_failure(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> None: + def add_failure(self, pk_range_wrapper: PartitionKeyRangeWrapper, location: Any) -> None: if pk_range_wrapper not in self.pk_range_wrapper_to_failure_count: self.pk_range_wrapper_to_failure_count[pk_range_wrapper] = 0 self.pk_range_wrapper_to_failure_count[pk_range_wrapper] += 1 diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 0eadffda2cb4..df8b8bb04bb0 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -208,8 +208,6 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin retry_policy.container_rid = cached_container["_rid"] request.headers[retry_policy._intended_headers] = retry_policy.container_rid elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: - if args and global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0]): - global_endpoint_manager.record_failure(args[0], pk_range_wrapper) retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py index e212e869f6ef..245f72bf5489 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -5,7 +5,6 @@ Cosmos database service. 
""" from azure.cosmos.documents import _OperationType -from azure.cosmos._base import try_ppaf_failover_threshold #cspell:ignore ppaf @@ -42,7 +41,18 @@ def ShouldRetry(self, _exception): return False if self.request: - try_ppaf_failover_threshold(self.global_endpoint_manager, self.pk_range_wrapper, self.request) + # If per partition automatic failover is applicable, we mark the current endpoint as unavailable + # and resolve the service endpoint for the partition range - otherwise, continue the default retry logic + if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): + partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[ + self.pk_range_wrapper] + location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( + str(self.request.location_endpoint_to_route)) + regional_context = (self.global_endpoint_manager.location_cache. + account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) + partition_level_info.unavailable_regional_endpoints[location] = regional_context + self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) + return True location_endpoint = self.resolve_next_region_service_endpoint() self.request.route_to_location(location_endpoint) return True diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 837665386de5..fe28ab2c71b6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -13,7 +13,7 @@ from azure.cosmos.aio._global_partition_endpoint_manager_circuit_breaker_async import \ 
_GlobalPartitionEndpointManagerForCircuitBreakerAsync from azure.cosmos.documents import _OperationType - +from azure.cosmos._location_cache import RegionalRoutingContext from azure.cosmos._partition_health_tracker import _PPAFPartitionThresholdsTracker from azure.cosmos._request_object import RequestObject from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper @@ -32,28 +32,34 @@ class PartitionLevelFailoverInfo: Used to track the partition key range and the regions where it is available. """ def __init__(self): - self.unavailable_regional_endpoints = set() - self.current_regional_endpoint = None + self.unavailable_regional_endpoints: Dict[str, RegionalRoutingContext] = {} + self.current_region = None self._lock = threading.Lock() - def try_move_to_next_location(self, available_account_regional_endpoints: Set[str], request: RequestObject) -> bool: + def try_move_to_next_location( + self, + available_account_regional_endpoints: Dict[str, str], + endpoint_region: str, + request: RequestObject) -> bool: with self._lock: - failed_regional_endpoint = request.location_endpoint_to_route - if failed_regional_endpoint != self.current_regional_endpoint: - logger.info("Moving to next available regional endpoint: %s", self.current_regional_endpoint) - request.route_to_location(self.current_regional_endpoint) + if endpoint_region != self.current_region: + logger.info("PPAF - Moving to next available regional endpoint: %s", self.current_region) + # make the actual endpoint since the current_region is just West US + regional_endpoint = available_account_regional_endpoints[self.current_region] + request.route_to_location(regional_endpoint) return True for regional_endpoint in available_account_regional_endpoints: - if regional_endpoint == self.current_regional_endpoint: + if regional_endpoint == self.current_region: continue if regional_endpoint in self.unavailable_regional_endpoints: continue - self.current_regional_endpoint = regional_endpoint - 
logger.info("Moving to next available regional endpoint: %s", self.current_regional_endpoint) - request.route_to_location(self.current_regional_endpoint) + self.current_region = regional_endpoint + logger.info("PPAF - Moving to next available regional endpoint: %s", self.current_region) + regional_endpoint = available_account_regional_endpoints[self.current_region] + request.route_to_location(regional_endpoint) return True return False @@ -103,10 +109,12 @@ def resolve_service_endpoint_for_partition( logger.info("Resolving service endpoint for partition with per partition automatic failover enabled.") partition_failover_info = self.partition_range_to_failover_info[pk_range_wrapper] if request.location_endpoint_to_route is not None: - if request.location_endpoint_to_route in partition_failover_info.unavailable_regional_endpoints: + endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) + if endpoint_region in partition_failover_info.unavailable_regional_endpoints: # If the current region is unavailable, we try to move to the next available region if not partition_failover_info.try_move_to_next_location( self.compute_available_preferred_regions(request), + endpoint_region, request): logger.info("All available regions for partition are unavailable. 
Refreshing cache.") # If no other region is available, we invalidate the cache and start once again from our @@ -117,23 +125,26 @@ def resolve_service_endpoint_for_partition( pk_range_wrapper) else: # Update the current regional endpoint to whatever the request is routing to - partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route + endpoint_region = self.location_cache.get_location_from_endpoint( + request.location_endpoint_to_route) + partition_failover_info.current_region = endpoint_region else: partition_failover_info = PartitionLevelFailoverInfo() - partition_failover_info.current_regional_endpoint = request.location_endpoint_to_route + endpoint_region = self.location_cache.get_location_from_endpoint( + request.location_endpoint_to_route) + partition_failover_info.current_region = endpoint_region self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info - return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) def compute_available_preferred_regions( self, request: RequestObject - ) -> Set[str]: + ) -> Dict[str, str]: """ Computes the available regional endpoints for the request based on customer-set preferred and excluded regions. :param RequestObject request: The request object containing the routing context. :return: A set of available regional endpoints. 
- :rtype: Set[str] + :rtype: Dict[str, str] """ if request.excluded_locations: excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations @@ -141,10 +152,10 @@ def compute_available_preferred_regions( excluded_locations = self.location_cache.connection_policy.ExcludedLocations preferred_locations = self.PreferredLocations available_regions = [item for item in preferred_locations if item not in excluded_locations] - available_regional_endpoints = { - self.location_cache.account_read_regional_routing_contexts_by_location[region].primary_endpoint - for region in available_regions - } + available_regional_endpoints = {} + for region, context in self.location_cache.account_read_regional_routing_contexts_by_location.items(): + if region in available_regions: + available_regional_endpoints[region] = context.primary_endpoint return available_regional_endpoints async def record_failure(self, @@ -157,10 +168,11 @@ async def record_failure(self, :return: None """ if self.is_per_partition_automatic_failover_applicable(request): + location = self.location_cache.get_location_from_endpoint(str(request.location_endpoint_to_route)) if pk_range_wrapper is None: pk_range_wrapper = await self.create_pk_range_wrapper(request) if pk_range_wrapper: - self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper) + self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper, location) else: await self.record_ppcb_failure(request, pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 09c04ae9009a..6afc01c53f02 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -207,8 +207,6 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg request.headers[retry_policy._intended_headers] = retry_policy.container_rid elif 
e.status_code == StatusCodes.SERVICE_UNAVAILABLE: # if ppaf is applicable, we record the failure - if args and global_endpoint_manager.is_per_partition_automatic_failover_applicable(args[0]): - await global_endpoint_manager.record_failure(args[0], pk_range_wrapper) retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 743659e92d5b..8c363fb10f31 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -16,7 +16,7 @@ def create_failover_errors(): errors = [] - error_codes = [403] + error_codes = [403, 503] for error_code in error_codes: errors.append(CosmosHttpResponseError( status_code=error_code, @@ -26,7 +26,7 @@ def create_failover_errors(): def create_threshold_errors(): errors = [] - error_codes = [408, 500, 502, 503, 504] + error_codes = [408, 500, 502, 504] for error_code in error_codes: errors.append(CosmosHttpResponseError( status_code=error_code, @@ -43,12 +43,13 @@ class TestPerPartitionAutomaticFailover: TEST_DATABASE_ID = test_config.TestConfig.TEST_DATABASE_ID TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID - def setup_method_with_custom_transport(self, custom_transport, default_endpoint=host, **kwargs): + def setup_method_with_custom_transport(self, custom_transport, default_endpoint=host, read_first=False, **kwargs): + regions = [REGION_2, REGION_1] if read_first else [REGION_1, REGION_2] container_id = kwargs.pop("container_id", None) if not container_id: container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID client = CosmosClient(default_endpoint, self.master_key, consistency_level="Session", - preferred_locations=[REGION_1, 
REGION_2], + preferred_locations=regions, transport=custom_transport, **kwargs) db = client.get_database_client(self.TEST_DATABASE_ID) container = db.get_container_client(container_id) @@ -93,7 +94,7 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}) pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] - initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint + initial_region = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_region # Based on our configuration, we should have had one error followed by a success - marking only the previous endpoint as unavailable perform_write_operation( @@ -105,8 +106,8 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] # Verify that the partition is marked as unavailable, and that the current regional endpoint is not the same assert len(partition_info.unavailable_regional_endpoints) == 1 - assert initial_endpoint in partition_info.unavailable_regional_endpoints - assert initial_endpoint != partition_info.current_regional_endpoint # west us 3 != west us + assert initial_region in partition_info.unavailable_regional_endpoints + assert initial_region != partition_info.current_region # west us 3 != west us # Now we run another request to see how the cache gets updated perform_write_operation( @@ -119,8 +120,8 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): # Verify that the cache is empty, since the request going to the second regional endpoint failed # Once we reach the point of all available regions being marked as unavailable, the cache is cleared assert 
len(partition_info.unavailable_regional_endpoints) == 0 - assert initial_endpoint not in partition_info.unavailable_regional_endpoints - assert partition_info.current_regional_endpoint is None + assert initial_region not in partition_info.unavailable_regional_endpoints + assert partition_info.current_region is None @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_threshold_errors())) @@ -138,8 +139,7 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}) pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] - initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint - + initial_region = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_region is_503 = hasattr(error, 'status_code') and error.status_code == 503 # Since 503 errors are retried by default, we each request counts as two failures @@ -172,8 +172,8 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] # Verify that the partition is marked as unavailable, and that the current regional endpoint is not the same assert len(partition_info.unavailable_regional_endpoints) == 1 - assert initial_endpoint in partition_info.unavailable_regional_endpoints - assert initial_endpoint != partition_info.current_regional_endpoint # west us 3 != west us + assert initial_region in partition_info.unavailable_regional_endpoints + assert initial_region != partition_info.current_region # west us 3 != west us # Since we are failing every request, even though we retried to the next region, that retry should have failed as well # This means we should have one extra failure - verify that the value 
makes sense @@ -185,9 +185,27 @@ def test_ppaf_exclude_regions(self, write_operation, error): # TODO: finish this test return - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) - def test_ppaf_invalid_configs(self, write_operation, error): - # TODO: finish this test + def test_ppaf_session_unavailable_retry(self): + # For this test, the main requirement is to have 3 regions total in the account: A, B and C. + # Writes go to region A, and reads go to region C. Preferred locations are set to C, B, A in order. + # This test depends on the fact that the chosen region for the failover is region B and not region C. + # We will inject a 403.3 error on region A, marking it as unavailable with PPAF. We verify the retry goes + # to region B next. Next we inject a 404.1002 to a read in the same partition, which should retry to region B + # as well since A was marked as unavailable in the context of PPAF. + + # For this test, we have two regions in the account West US 3 (write) and West US (read). + # Writes go to West US 3, and reads go to region C - preferred locations are set to that order. + # This test depends on the fact that the chosen region for the failover is region B and not region C. + # We will inject a 403.3 error on region A, marking it as unavailable with PPAF. We verify the retry goes + # to region B next. Next we inject a 404.1002 to a read in the same partition, which should retry to region B + # as well since A was marked as unavailable in the context of PPAF. + + # Account config has 2 regions: West US 3 (A) and West US (B). 
+ error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, read_first=True) + container = setup['col'] + fault_injection_container = custom_setup['col'] + global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager return diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 99e433bc6af4..a7949038b625 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -86,7 +86,7 @@ async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation await fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}) pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] - initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint + initial_region = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_region # Based on our configuration, we should have had one error followed by a success - marking only the previous endpoint as unavailable await perform_write_operation( @@ -98,8 +98,8 @@ async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] # Verify that the partition is marked as unavailable, and that the current regional endpoint is not the same assert len(partition_info.unavailable_regional_endpoints) == 1 - assert initial_endpoint in partition_info.unavailable_regional_endpoints - assert initial_endpoint != partition_info.current_regional_endpoint # west us 
3 != west us + assert initial_region in partition_info.unavailable_regional_endpoints + assert initial_region != partition_info.current_region # west us 3 != west us # Now we run another request to see how the cache gets updated await perform_write_operation( @@ -112,8 +112,8 @@ async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation # Verify that the cache is empty, since the request going to the second regional endpoint failed # Once we reach the point of all available regions being marked as unavailable, the cache is cleared assert len(partition_info.unavailable_regional_endpoints) == 0 - assert initial_endpoint not in partition_info.unavailable_regional_endpoints - assert partition_info.current_regional_endpoint is None + assert initial_region not in partition_info.unavailable_regional_endpoints + assert partition_info.current_region is None @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_threshold_errors())) async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation, error): @@ -129,7 +129,7 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation await fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}) pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] - initial_endpoint = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_regional_endpoint + initial_region = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_region is_503 = hasattr(error, 'status_code') and error.status_code == 503 # Since 503 errors are retried by default, we each request counts as two failures @@ -163,8 +163,8 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] # Verify that the 
partition is marked as unavailable, and that the current regional endpoint is not the same assert len(partition_info.unavailable_regional_endpoints) == 1 - assert initial_endpoint in partition_info.unavailable_regional_endpoints - assert initial_endpoint != partition_info.current_regional_endpoint # west us 3 != west us + assert initial_region in partition_info.unavailable_regional_endpoints + assert initial_region != partition_info.current_region # west us 3 != west us # Since we are failing every request, even though we retried to the next region, that retry should have failed as well # This means we should have one extra failure - verify that the value makes sense From d05fc5e2c82ad4287ce4e61f874284b5afc0b7c7 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 21 Aug 2025 10:03:36 -0400 Subject: [PATCH 27/68] mypy, pylint, tests --- ...int_manager_per_partition_automatic_failover.py | 3 +-- .../azure/cosmos/_partition_health_tracker.py | 2 +- .../azure/cosmos/_synchronized_request.py | 14 +------------- .../azure/cosmos/aio/_asynchronous_request.py | 14 +------------- ...nager_per_partition_automatic_failover_async.py | 5 ++--- .../test_timeout_and_failover_retry_policy.py | 11 +++++++---- ...test_timeout_and_failover_retry_policy_async.py | 3 +-- 7 files changed, 14 insertions(+), 38 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 4def9cd308b6..aecbe2209c9d 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -167,11 +167,10 @@ def record_failure(self, :return: None """ if self.is_per_partition_automatic_failover_applicable(request): 
- location = self.location_cache.get_location_from_endpoint(str(request.location_endpoint_to_route)) if pk_range_wrapper is None: pk_range_wrapper = self.create_pk_range_wrapper(request) if pk_range_wrapper: - self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper, location) + self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper) else: self.record_ppcb_failure(request, pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py index 7358f605d67b..f8bf79f956d8 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py @@ -304,7 +304,7 @@ class _PPAFPartitionThresholdsTracker(object): def __init__(self) -> None: self.pk_range_wrapper_to_failure_count: Dict[PartitionKeyRangeWrapper, int] = {} - def add_failure(self, pk_range_wrapper: PartitionKeyRangeWrapper, location: Any) -> None: + def add_failure(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> None: if pk_range_wrapper not in self.pk_range_wrapper_to_failure_count: self.pk_range_wrapper_to_failure_count[pk_range_wrapper] = 0 self.pk_range_wrapper_to_failure_count[pk_range_wrapper] += 1 diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py index 38aea17474e1..e6109b5bd621 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py @@ -25,16 +25,10 @@ import json import time -from typing import Any from urllib.parse import urlparse from azure.core.exceptions import DecodeError # type: ignore -from azure.core import PipelineClient from . 
import exceptions, http_constants, _retry_utility -from .documents import ConnectionPolicy -from ._request_object import RequestObject -from ._global_partition_endpoint_manager_per_partition_automatic_failover import ( - _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover) def _is_readable_stream(obj): @@ -69,13 +63,7 @@ def _request_body_from_data(data): return None -def _Request( - global_endpoint_manager: _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover, - request_params: RequestObject, - connection_policy: ConnectionPolicy, - pipeline_client: PipelineClient, - request: Any, - **kwargs): # pylint: disable=too-many-statements +def _Request(global_endpoint_manager, request_params, connection_policy, pipeline_client, request, **kwargs): # pylint: disable=too-many-statements """Makes one http request using the requests module. :param _GlobalEndpointManager global_endpoint_manager: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 6d8c944404b8..1cd2a22039b4 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -24,29 +24,17 @@ import copy import json import time -from typing import Any from urllib.parse import urlparse -from azure.core import AsyncPipelineClient from azure.core.exceptions import DecodeError # type: ignore from .. import exceptions from .. import http_constants from . 
import _retry_utility_async -from ..documents import ConnectionPolicy -from .._request_object import RequestObject -from ._global_partition_endpoint_manager_per_partition_automatic_failover_async import ( - _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync) from .._synchronized_request import _request_body_from_data, _replace_url_prefix -async def _Request( - global_endpoint_manager: _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, - request_params: RequestObject, - connection_policy: ConnectionPolicy, - pipeline_client: AsyncPipelineClient, - request: Any, - **kwargs): # pylint: disable=too-many-statements +async def _Request(global_endpoint_manager, request_params, connection_policy, pipeline_client, request, **kwargs): # pylint: disable=too-many-statements """Makes one http request using the requests module. :param _GlobalEndpointManager global_endpoint_manager: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index fe28ab2c71b6..5b805b2e145d 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -7,7 +7,7 @@ import logging import threading -from typing import Dict, Set, TYPE_CHECKING, Optional +from typing import Dict, TYPE_CHECKING, Optional from azure.cosmos.http_constants import ResourceType from azure.cosmos.aio._global_partition_endpoint_manager_circuit_breaker_async import \ @@ -168,11 +168,10 @@ async def record_failure(self, :return: None """ if self.is_per_partition_automatic_failover_applicable(request): - location = self.location_cache.get_location_from_endpoint(str(request.location_endpoint_to_route)) if 
pk_range_wrapper is None: pk_range_wrapper = await self.create_pk_range_wrapper(request) if pk_range_wrapper: - self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper, location) + self.ppaf_thresholds_tracker.add_failure(pk_range_wrapper) else: await self.record_ppcb_failure(request, pk_range_wrapper) diff --git a/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy.py index 4c21ed121441..69cd625e5104 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy.py @@ -79,9 +79,10 @@ def test_timeout_failover_retry_policy_for_read_failure(self, setup, error_code) created_document = setup[COLLECTION].create_item(body=document_definition) self.original_execute_function = _retry_utility.ExecuteFunction + num_exceptions = max(2, len(setup[COLLECTION].client_connection._global_endpoint_manager.location_cache.read_regional_routing_contexts)) try: - # should retry once and then fail - mf = self.MockExecuteFunction(self.original_execute_function, 2, error_code) + # should retry and then fail + mf = self.MockExecuteFunction(self.original_execute_function, num_exceptions, error_code) _retry_utility.ExecuteFunction = mf setup[COLLECTION].read_item(item=created_document['id'], partition_key=created_document['pk']) @@ -99,9 +100,11 @@ def test_timeout_failover_retry_policy_for_write_failure(self, setup, error_code 'key': 'value'} self.original_execute_function = _retry_utility.ExecuteFunction + num_exceptions_503 = max(2, len(setup[COLLECTION].client_connection._global_endpoint_manager.location_cache.write_regional_routing_contexts)) try: - # timeouts should fail immediately for writes - mf = self.MockExecuteFunction(self.original_execute_function,0, error_code) + # timeouts should fail immediately for writes - except for 503s, which should retry on every preferred location + num_exceptions 
= num_exceptions_503 if error_code == 503 else 0 + mf = self.MockExecuteFunction(self.original_execute_function, num_exceptions, error_code) _retry_utility.ExecuteFunction = mf try: setup[COLLECTION].create_item(body=document_definition) diff --git a/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy_async.py b/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy_async.py index bf73d6c48a12..506fa826c29d 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_timeout_and_failover_retry_policy_async.py @@ -38,7 +38,6 @@ async def setup(): def error_codes(): - return [503] return [408, 500, 502, 503] @@ -137,7 +136,7 @@ async def test_timeout_failover_retry_policy_for_write_failure_async(self, setup try: # timeouts should fail immediately for writes - except for 503s, which should retry on every preferred location num_exceptions = num_exceptions_503 if error_code == 503 else 0 - mf = self.MockExecuteFunction(self.original_execute_function,num_exceptions, error_code) + mf = self.MockExecuteFunction(self.original_execute_function, num_exceptions, error_code) _retry_utility_async.ExecuteFunctionAsync = mf try: await setup[COLLECTION].create_item(body=document_definition) From 85b2007d4f1877216e7a11eacff25df1cf249686 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:01:10 -0400 Subject: [PATCH 28/68] special-casing 503s --- .../azure-cosmos/tests/test_circuit_breaker_emulator.py | 4 ++++ .../azure-cosmos/tests/test_circuit_breaker_emulator_async.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py index 0eb863e6ceac..a32600d239c6 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py +++ 
b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py @@ -121,6 +121,8 @@ def test_write_consecutive_failure_threshold_delete_all_items_by_pk_sm(self, set @pytest.mark.parametrize("error", create_errors()) def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm(self, setup_teardown, error): + if error.status_code == 503: + pytest.skip("ServiceUnavailableError will do a cross-region retry, so it has to be special cased.") error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, mm=True) fault_injection_container = custom_setup['col'] @@ -176,6 +178,8 @@ def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm(self, set @pytest.mark.parametrize("error", create_errors()) def test_write_failure_rate_threshold_delete_all_items_by_pk_mm(self, setup_teardown, error): + if error.status_code == 503: + pytest.skip("ServiceUnavailableError will do a cross-region retry, so it has to be special cased.") error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, mm=True) fault_injection_container = custom_setup['col'] diff --git a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py index 601aca23bfe5..504a496319aa 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py @@ -125,6 +125,8 @@ async def test_write_consecutive_failure_threshold_delete_all_items_by_pk_sm_asy @pytest.mark.parametrize("error", create_errors()) async def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm_async(self, setup_teardown, error): + if error.status_code == 503: + pytest.skip("ServiceUnavailableError 
will do a cross-region retry, so it has to be special cased.") error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, mm=True) fault_injection_container = custom_setup['col'] @@ -181,6 +183,8 @@ async def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm_asy @pytest.mark.parametrize("error", create_errors()) async def test_write_failure_rate_threshold_delete_all_items_by_pk_mm_async(self, setup_teardown, error): + if error.status_code == 503: + pytest.skip("ServiceUnavailableError will do a cross-region retry, so it has to be special cased.") error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, mm=True) fault_injection_container = custom_setup['col'] From f8fa70a5139900306ecd9fdc0d8f814da0d27278 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:57:59 -0400 Subject: [PATCH 29/68] small fix --- .../azure-cosmos/tests/test_circuit_breaker_emulator.py | 4 ++-- .../azure-cosmos/tests/test_circuit_breaker_emulator_async.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py index a32600d239c6..42ab6bcd722d 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py +++ b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py @@ -121,7 +121,7 @@ def test_write_consecutive_failure_threshold_delete_all_items_by_pk_sm(self, set @pytest.mark.parametrize("error", create_errors()) def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm(self, setup_teardown, error): - if error.status_code == 
503: + if hasattr(error, "status_code") and error.status_code == 503: pytest.skip("ServiceUnavailableError will do a cross-region retry, so it has to be special cased.") error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, mm=True) @@ -178,7 +178,7 @@ def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm(self, set @pytest.mark.parametrize("error", create_errors()) def test_write_failure_rate_threshold_delete_all_items_by_pk_mm(self, setup_teardown, error): - if error.status_code == 503: + if hasattr(error, "status_code") and error.status_code == 503: pytest.skip("ServiceUnavailableError will do a cross-region retry, so it has to be special cased.") error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, mm=True) diff --git a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py index 504a496319aa..46a211f269bb 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py @@ -125,7 +125,7 @@ async def test_write_consecutive_failure_threshold_delete_all_items_by_pk_sm_asy @pytest.mark.parametrize("error", create_errors()) async def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm_async(self, setup_teardown, error): - if error.status_code == 503: + if hasattr(error, "status_code") and error.status_code == 503: pytest.skip("ServiceUnavailableError will do a cross-region retry, so it has to be special cased.") error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = await 
self.setup_info(error_lambda, mm=True) @@ -183,7 +183,7 @@ async def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm_asy @pytest.mark.parametrize("error", create_errors()) async def test_write_failure_rate_threshold_delete_all_items_by_pk_mm_async(self, setup_teardown, error): - if error.status_code == 503: + if hasattr(error, "status_code") and error.status_code == 503: pytest.skip("ServiceUnavailableError will do a cross-region retry, so it has to be special cased.") error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, mm=True) From e5c5ac53f55aeb3e9c23a7f4d87aedb37a4e43d9 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 21 Aug 2025 13:46:02 -0400 Subject: [PATCH 30/68] exclude region tests --- .../azure/cosmos/cosmos_client.py | 4 +- ..._per_partition_automatic_failover_async.py | 47 ++++++++++++++----- .../test_per_partition_circuit_breaker_mm.py | 9 ++++ 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/cosmos_client.py b/sdk/cosmos/azure-cosmos/azure/cosmos/cosmos_client.py index af19d54cf671..09eaa082e91e 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/cosmos_client.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/cosmos_client.py @@ -95,7 +95,9 @@ def _build_connection_policy(kwargs: Dict[str, Any]) -> ConnectionPolicy: policy.EnableEndpointDiscovery = kwargs.pop('enable_endpoint_discovery', policy.EnableEndpointDiscovery) policy.PreferredLocations = kwargs.pop('preferred_locations', policy.PreferredLocations) # TODO: Consider storing callback method instead, such as 'Supplier' in JAVA SDK - policy.ExcludedLocations = kwargs.pop('excluded_locations', policy.ExcludedLocations) + excluded_locations = kwargs.pop('excluded_locations', policy.ExcludedLocations) + if 
excluded_locations: + policy.ExcludedLocations = excluded_locations policy.UseMultipleWriteLocations = kwargs.pop('multiple_write_locations', policy.UseMultipleWriteLocations) # SSL config diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index a7949038b625..c18e6ed9abb4 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -13,10 +13,11 @@ from azure.core.exceptions import ServiceResponseError from azure.cosmos.exceptions import CosmosHttpResponseError from azure.cosmos.aio import CosmosClient +from azure.cosmos._request_object import RequestObject from _fault_injection_transport import FaultInjectionTransport from _fault_injection_transport_async import FaultInjectionTransportAsync from test_per_partition_automatic_failover import create_failover_errors, create_threshold_errors -from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors +from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors, write_operations_and_boolean from test_per_partition_circuit_breaker_mm_async import perform_write_operation # cspell:disable @@ -31,15 +32,23 @@ class TestPerPartitionAutomaticFailoverAsync: TEST_DATABASE_ID = test_config.TestConfig.TEST_DATABASE_ID TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID - async def setup_method_with_custom_transport(self, custom_transport: Optional[AioHttpTransport], default_endpoint=host, **kwargs): + async def setup_method_with_custom_transport(self, custom_transport: Optional[AioHttpTransport], + default_endpoint=host, read_first=False, **kwargs): + regions = [REGION_2, REGION_1] if read_first else [REGION_1, REGION_2] container_id = 
kwargs.pop("container_id", None) + exclude_client_regions = kwargs.pop("exclude_client_regions", False) + excluded_regions = [] + if exclude_client_regions: + excluded_regions = [REGION_2] if not container_id: container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID client = CosmosClient(default_endpoint, self.master_key, consistency_level="Session", - preferred_locations=[REGION_1, REGION_2], + preferred_locations=regions, + excluded_locations=excluded_regions, transport=custom_transport, **kwargs) db = client.get_database_client(self.TEST_DATABASE_ID) container = db.get_container_client(container_id) + await client.__aenter__() return {"client": client, "db": db, "col": container} @staticmethod @@ -47,7 +56,7 @@ async def cleanup_method(initialized_objects: Dict[str, Any]): method_client: CosmosClient = initialized_objects["client"] await method_client.close() - async def setup_info(self, error, max_count=None, is_batch=False, **kwargs): + async def setup_info(self, error=None, max_count=None, is_batch=False, exclude_client_regions=False, **kwargs): custom_transport = FaultInjectionTransportAsync() # two documents targeted to same partition, one will always fail and the other will succeed doc_fail_id = str(uuid.uuid4()) @@ -59,8 +68,9 @@ async def setup_info(self, error, max_count=None, is_batch=False, **kwargs): success_response = FaultInjectionTransportAsync.MockHttpResponse(mock_request, 200, [{"statusCode": 200}],) else: success_response = FaultInjectionTransportAsync.MockHttpResponse(mock_request, 200) - custom_transport.add_fault(predicate=predicate, fault_factory=error, max_inner_count=max_count, - after_max_count=success_response) + if error: + custom_transport.add_fault(predicate=predicate, fault_factory=error, max_inner_count=max_count, + after_max_count=success_response) is_get_account_predicate = lambda r: FaultInjectionTransportAsync.predicate_is_database_account_call(r) # Set the database account response to have PPAF enabled 
ppaf_enabled_database_account = \ @@ -68,8 +78,10 @@ async def setup_info(self, error, max_count=None, is_batch=False, **kwargs): custom_transport.add_response_transformation( is_get_account_predicate, ppaf_enabled_database_account) - setup = await self.setup_method_with_custom_transport(None, default_endpoint=self.host, **kwargs) - custom_setup = await self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, **kwargs) + setup = await self.setup_method_with_custom_transport(None, default_endpoint=self.host, + exclude_client_regions=exclude_client_regions, **kwargs) + custom_setup = await self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, + exclude_client_regions=exclude_client_regions, **kwargs) return setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) @@ -171,11 +183,20 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] assert failure_count == 1 if is_503 else 3 - - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_threshold_errors())) - async def test_ppaf_exclude_regions_async(self, write_operation, error): - # TODO: finish this test - return + @pytest.mark.parametrize("write_operation, exclude_client_regions", write_operations_and_boolean()) + async def test_ppaf_exclude_regions_async(self, write_operation, exclude_client_regions): + # This test validates that the per-partition automatic failover logic does not apply to configs without enough regions. 
+ setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(exclude_client_regions=exclude_client_regions) + fault_injection_container = custom_setup['col'] + global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager + # Check that computing valid regions for PPAF only returns a single region + request_object = RequestObject(resource_type="docs", operation_type=write_operation, headers={}) + if exclude_client_regions is False: + request_object.excluded_locations = [REGION_2] + available_ppaf_regions = global_endpoint_manager.compute_available_preferred_regions(request_object) + assert len(available_ppaf_regions) == 1 + # Check that all requests are marked as non-PPAF available due to the fact that we only have one region + assert global_endpoint_manager.is_per_partition_automatic_failover_applicable(request_object) is False diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py index c741d0b48af2..1b07fc3ece18 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py @@ -60,6 +60,15 @@ def write_operations_and_errors(error_list=None): return params +def write_operations_and_boolean(): + write_operations = [CREATE, UPSERT, REPLACE, DELETE, PATCH, BATCH] + params = [] + for write_operation in write_operations: + for boolean in [True, False]: + params.append((write_operation, boolean)) + + return params + def operations(): write_operations = [CREATE, UPSERT, REPLACE, DELETE, PATCH, BATCH] read_operations = [READ, QUERY_PK, CHANGE_FEED_PK, CHANGE_FEED_EPK] From ccd9def68d23d679de63d66599e47192a7a29d4a Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 21 Aug 2025 23:28:26 -0400 Subject: [PATCH 31/68] session retry tests --- 
.../_endpoint_discovery_retry_policy.py | 4 +- ...anager_per_partition_automatic_failover.py | 11 +- .../azure/cosmos/_session_retry_policy.py | 14 +++ .../test_per_partition_automatic_failover.py | 114 +++++++++++++----- ..._per_partition_automatic_failover_async.py | 58 ++++++++- 5 files changed, 162 insertions(+), 39 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index df4def458554..ff94355ae7ba 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -72,9 +72,9 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( str(self.request.location_endpoint_to_route)) - regional_context = (self.global_endpoint_manager.location_cache. + location_endpoint = (self.global_endpoint_manager.location_cache. 
account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) - partition_level_info.unavailable_regional_endpoints[location] = regional_context + partition_level_info.unavailable_regional_endpoints[location] = location_endpoint self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) return True diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index aecbe2209c9d..b1516bfe3c5d 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -74,7 +74,15 @@ def __init__(self, client: "CosmosClientConnection"): self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() + def is_per_partition_automatic_failover_enabled(self) -> bool: + if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: + return False + return True + def is_per_partition_automatic_failover_applicable(self, request: RequestObject) -> bool: + if not self.is_per_partition_automatic_failover_enabled(): + return False + if not request: return False @@ -82,9 +90,6 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) or _OperationType.IsReadOnlyOperation(request.operation_type)): return False - if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: - return False - # if we have at most one region available in the account, we cannot do per partition automatic failover available_regions = self.compute_available_preferred_regions(request) if 
len(available_regions) <= 1: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py index f10366ac4c7f..6561589f71ca 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py @@ -105,6 +105,20 @@ def ShouldRetry(self, _exception): self.request.route_to_location_with_preferred_location_flag(self.session_token_retry_count - 1, False) self.request.should_clear_session_token_on_session_read_failure = True + # For PPAF, the retry should happen to whatever the relevant write region is for the affected partition. + if self.global_endpoint_manager.is_per_partition_automatic_failover_enabled(): + pk_failover_info = self.global_endpoint_manager.partition_range_to_failover_info.get(self.pk_range_wrapper) + location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( + str(self.request.location_endpoint_to_route)) + if location in pk_failover_info.unavailable_regional_endpoints: + # If the request endpoint is unavailable, we need to resolve the endpoint for the request using the + # partition-level failover info + location_endpoint = (self.global_endpoint_manager.location_cache. + account_read_regional_routing_contexts_by_location. 
+ get(pk_failover_info.current_region).primary_endpoint) + self.request.route_to_location(location_endpoint) + return True + # Resolve the endpoint for the request and pin the resolution to the resolved endpoint # This enables marking the endpoint unavailability on endpoint failover/unreachability self.location_endpoint = (self.global_endpoint_manager diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 8c363fb10f31..eeed945f388f 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -8,8 +8,9 @@ from azure.core.exceptions import ServiceResponseError from azure.cosmos import CosmosClient from azure.cosmos.exceptions import CosmosHttpResponseError +from azure.cosmos._request_object import RequestObject from _fault_injection_transport import FaultInjectionTransport -from test_per_partition_circuit_breaker_mm import (REGION_1, REGION_2, PK_VALUE, BATCH, +from test_per_partition_circuit_breaker_mm import (REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_boolean, write_operations_and_errors, perform_write_operation) # cspell:disable @@ -43,32 +44,50 @@ class TestPerPartitionAutomaticFailover: TEST_DATABASE_ID = test_config.TestConfig.TEST_DATABASE_ID TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID - def setup_method_with_custom_transport(self, custom_transport, default_endpoint=host, read_first=False, **kwargs): - regions = [REGION_2, REGION_1] if read_first else [REGION_1, REGION_2] + def setup_method_with_custom_transport(self, custom_transport, default_endpoint=host, **kwargs): + regions = [REGION_1, REGION_2] container_id = kwargs.pop("container_id", None) + exclude_client_regions = kwargs.pop("exclude_client_regions", False) + excluded_regions = [] + if exclude_client_regions: + excluded_regions = 
[REGION_2] if not container_id: container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID client = CosmosClient(default_endpoint, self.master_key, consistency_level="Session", preferred_locations=regions, + excluded_locations=excluded_regions, transport=custom_transport, **kwargs) db = client.get_database_client(self.TEST_DATABASE_ID) container = db.get_container_client(container_id) return {"client": client, "db": db, "col": container} - def setup_info(self, error, max_count=None, is_batch=False, **kwargs): + def setup_info(self, error=None, max_count=None, is_batch=False, exclude_client_regions=False, session_error=False, **kwargs): custom_transport = FaultInjectionTransport() # two documents targeted to same partition, one will always fail and the other will succeed doc_fail_id = str(uuid.uuid4()) doc_success_id = str(uuid.uuid4()) - predicate = lambda r: FaultInjectionTransport.predicate_req_for_document_with_id(r, doc_fail_id) + predicate = lambda r: (FaultInjectionTransport.predicate_req_for_document_with_id(r, doc_fail_id) + and FaultInjectionTransport.predicate_is_write_operation(r, "west")) # The MockRequest only gets used to create the MockHttpResponse mock_request = FaultInjectionTransport.MockHttpRequest(url=self.host) if is_batch: success_response = FaultInjectionTransport.MockHttpResponse(mock_request, 200, [{"statusCode": 200}],) else: success_response = FaultInjectionTransport.MockHttpResponse(mock_request, 200) - custom_transport.add_fault(predicate=predicate, fault_factory=error, max_inner_count=max_count, - after_max_count=success_response) + if error: + custom_transport.add_fault(predicate=predicate, fault_factory=error, max_inner_count=max_count, + after_max_count=success_response) + if session_error: + read_predicate = lambda r: (FaultInjectionTransport.predicate_is_operation_type(r, "Read") + and FaultInjectionTransport.predicate_req_for_document_with_id(r, doc_fail_id)) + read_error = CosmosHttpResponseError( + status_code=404, + message="Some 
injected error.", + sub_status=1002) + error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, read_error) + success_response = FaultInjectionTransport.MockHttpResponse(mock_request, 200) + custom_transport.add_fault(predicate=read_predicate, fault_factory=error_lambda, max_inner_count=max_count, + after_max_count=success_response) is_get_account_predicate = lambda r: FaultInjectionTransport.predicate_is_database_account_call(r) # Set the database account response to have PPAF enabled ppaf_enabled_database_account = \ @@ -76,8 +95,10 @@ def setup_info(self, error, max_count=None, is_batch=False, **kwargs): custom_transport.add_response_transformation( is_get_account_predicate, ppaf_enabled_database_account) - setup = self.setup_method_with_custom_transport(None, default_endpoint=self.host, **kwargs) - custom_setup = self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, **kwargs) + setup = self.setup_method_with_custom_transport(None, default_endpoint=self.host, + exclude_client_regions=exclude_client_regions, **kwargs) + custom_setup = self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, + exclude_client_regions=exclude_client_regions, **kwargs) return setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) @@ -180,35 +201,66 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] assert failure_count == 1 if is_503 else 3 + @pytest.mark.parametrize("write_operation, exclude_client_regions", write_operations_and_boolean()) + def test_ppaf_exclude_regions(self, write_operation, exclude_client_regions): + # This test validates that the per-partition automatic failover logic does not apply to configs without enough 
regions. + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(exclude_client_regions=exclude_client_regions) + fault_injection_container = custom_setup['col'] + global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager + # Check that computing valid regions for PPAF only returns a single region + request_object = RequestObject(resource_type="docs", operation_type=write_operation, headers={}) + if exclude_client_regions is False: + request_object.excluded_locations = [REGION_2] + available_ppaf_regions = global_endpoint_manager.compute_available_preferred_regions(request_object) + assert len(available_ppaf_regions) == 1 + # Check that all requests are marked as non-PPAF available due to the fact that we only have one region + assert global_endpoint_manager.is_per_partition_automatic_failover_applicable(request_object) is False + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) - def test_ppaf_exclude_regions(self, write_operation, error): - # TODO: finish this test - return - - def test_ppaf_session_unavailable_retry(self): - # For this test, the main requirement is to have 3 regions total in the account: A, B and C. - # Writes go to region A, and reads go to region C. Preferred locations are set to C, B, A in order. - # This test depends on the fact that the chosen region for the failover is region B and not region C. - # We will inject a 403.3 error on region A, marking it as unavailable with PPAF. We verify the retry goes - # to region B next. Next we inject a 404.1002 to a read in the same partition, which should retry to region B - # as well since A was marked as unavailable in the context of PPAF. - - # For this test, we have two regions in the account West US 3 (write) and West US (read). - # Writes go to West US 3, and reads go to region C - preferred locations are set to that order. 
- # This test depends on the fact that the chosen region for the failover is region B and not region C. - # We will inject a 403.3 error on region A, marking it as unavailable with PPAF. We verify the retry goes - # to region B next. Next we inject a 404.1002 to a read in the same partition, which should retry to region B - # as well since A was marked as unavailable in the context of PPAF. - - # Account config has 2 regions: West US 3 (A) and West US (B). + def test_ppaf_session_unavailable_retry(self, write_operation, error): + # Account config has 2 regions: West US 3 (A) and West US (B). This test validates that after marking the write + # region (A) as unavailable, the next request is retried to the read region (B) and succeeds. The next read request + # should see that the write region (A) is unavailable for the partition, and should retry to the read region (B) as well. error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) - setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, read_first=True) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, max_count=1, + is_batch=write_operation==BATCH, session_error=True) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager - return + # Create a document to populate the per-partition GEM partition range info cache + fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, + 'name': 'sample document', 'key': 'value'}) + pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] + initial_region = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_region + + # Verify the region that is being used for the read requests + read_response = 
fault_injection_container.read_item(doc_success_id, PK_VALUE) + uri = read_response.get_response_headers().get('Content-Location') + region = fault_injection_container.client_connection._global_endpoint_manager.location_cache.get_location_from_endpoint(uri) + assert region == REGION_1 # first preferred region + + # Based on our configuration, we should have had one error followed by a success - marking only the previous endpoint as unavailable + perform_write_operation( + write_operation, + container, + fault_injection_container, + doc_fail_id, + PK_VALUE) + partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + # Verify that the partition is marked as unavailable, and that the current regional endpoint is not the same + assert len(partition_info.unavailable_regional_endpoints) == 1 + assert initial_region in partition_info.unavailable_regional_endpoints + assert initial_region != partition_info.current_region # west us 3 != west us + + # Now we run a read request that runs into a 404.1002 error, which should retry to the read region + # We verify that the read request was going to the correct region by using the raw_response_hook + fault_injection_container.read_item(doc_fail_id, PK_VALUE, raw_response_hook=session_retry_hook) +def session_retry_hook(raw_response): + # This hook is used to verify the request routing that happens after the session retry logic + region_string = "-" + REGION_2.replace(' ', '').lower() + "." 
+ assert region_string in raw_response.http_request.url if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index c18e6ed9abb4..3baadf9e9879 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -16,7 +16,7 @@ from azure.cosmos._request_object import RequestObject from _fault_injection_transport import FaultInjectionTransport from _fault_injection_transport_async import FaultInjectionTransportAsync -from test_per_partition_automatic_failover import create_failover_errors, create_threshold_errors +from test_per_partition_automatic_failover import create_failover_errors, create_threshold_errors, session_retry_hook from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors, write_operations_and_boolean from test_per_partition_circuit_breaker_mm_async import perform_write_operation @@ -56,12 +56,13 @@ async def cleanup_method(initialized_objects: Dict[str, Any]): method_client: CosmosClient = initialized_objects["client"] await method_client.close() - async def setup_info(self, error=None, max_count=None, is_batch=False, exclude_client_regions=False, **kwargs): + async def setup_info(self, error=None, max_count=None, is_batch=False, exclude_client_regions=False, session_error=False, **kwargs): custom_transport = FaultInjectionTransportAsync() # two documents targeted to same partition, one will always fail and the other will succeed doc_fail_id = str(uuid.uuid4()) doc_success_id = str(uuid.uuid4()) - predicate = lambda r: FaultInjectionTransportAsync.predicate_req_for_document_with_id(r, doc_fail_id) + predicate = lambda r: (FaultInjectionTransportAsync.predicate_req_for_document_with_id(r, doc_fail_id) + and 
FaultInjectionTransportAsync.predicate_is_write_operation(r, "west")) # The MockRequest only gets used to create the MockHttpResponse mock_request = FaultInjectionTransport.MockHttpRequest(url=self.host) if is_batch: @@ -71,6 +72,17 @@ async def setup_info(self, error=None, max_count=None, is_batch=False, exclude_c if error: custom_transport.add_fault(predicate=predicate, fault_factory=error, max_inner_count=max_count, after_max_count=success_response) + if session_error: + read_predicate = lambda r: (FaultInjectionTransportAsync.predicate_is_operation_type(r, "Read") + and FaultInjectionTransportAsync.predicate_req_for_document_with_id(r, doc_fail_id)) + read_error = CosmosHttpResponseError( + status_code=404, + message="Some injected error.", + sub_status=1002) + error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, read_error)) + success_response = FaultInjectionTransportAsync.MockHttpResponse(mock_request, 200) + custom_transport.add_fault(predicate=read_predicate, fault_factory=error_lambda, max_inner_count=max_count, + after_max_count=success_response) is_get_account_predicate = lambda r: FaultInjectionTransportAsync.predicate_is_database_account_call(r) # Set the database account response to have PPAF enabled ppaf_enabled_database_account = \ @@ -198,6 +210,46 @@ async def test_ppaf_exclude_regions_async(self, write_operation, exclude_client_ # Check that all requests are marked as non-PPAF available due to the fact that we only have one region assert global_endpoint_manager.is_per_partition_automatic_failover_applicable(request_object) is False + @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) + async def test_ppaf_session_unavailable_retry_async(self, write_operation, error): + # Account config has 2 regions: West US 3 (A) and West US (B). 
This test validates that after marking the write + # region (A) as unavailable, the next request is retried to the read region (B) and succeeds. The next read request + # should see that the write region (A) is unavailable for the partition, and should retry to the read region (B) as well. + error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, max_count=1, + is_batch=write_operation==BATCH, session_error=True) + container = setup['col'] + fault_injection_container = custom_setup['col'] + global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager + + # Create a document to populate the per-partition GEM partition range info cache + await fault_injection_container.create_item(body={'id': doc_success_id, 'pk': PK_VALUE, + 'name': 'sample document', 'key': 'value'}) + pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] + initial_region = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_region + + # Verify the region that is being used for the read requests + read_response = await fault_injection_container.read_item(doc_success_id, PK_VALUE) + uri = read_response.get_response_headers().get('Content-Location') + region = fault_injection_container.client_connection._global_endpoint_manager.location_cache.get_location_from_endpoint(uri) + assert region == REGION_1 # first preferred region + + # Based on our configuration, we should have had one error followed by a success - marking only the previous endpoint as unavailable + await perform_write_operation( + write_operation, + container, + fault_injection_container, + doc_fail_id, + PK_VALUE) + partition_info = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] + # Verify that the partition is marked as unavailable, and that 
the current regional endpoint is not the same + assert len(partition_info.unavailable_regional_endpoints) == 1 + assert initial_region in partition_info.unavailable_regional_endpoints + assert initial_region != partition_info.current_region # west us 3 != west us + + # Now we run a read request that runs into a 404.1002 error, which should retry to the read region + # We verify that the read request was going to the correct region by using the raw_response_hook + fault_injection_container.read_item(doc_fail_id, PK_VALUE, raw_response_hook=session_retry_hook) if __name__ == '__main__': From 1dccc5d9d8129a0774e620077e54a82145787a53 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 22 Aug 2025 08:45:14 -0400 Subject: [PATCH 32/68] pylint, cspell --- .../azure/cosmos/_session_retry_policy.py | 8 ++------ ..._manager_per_partition_automatic_failover_async.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py index 6561589f71ca..5c40bf2881f9 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py @@ -22,7 +22,7 @@ """Internal class for session read/write unavailable retry policy implementation in the Azure Cosmos database service. 
""" - +# cspell:disable from azure.cosmos.documents import _OperationType class _SessionRetryPolicy(object): @@ -60,16 +60,12 @@ def ShouldRetry(self, _exception): :returns: a boolean stating whether the request should be retried :rtype: bool """ - if not self.request: + if not self.request or not self.endpoint_discovery_enable: return False self.session_token_retry_count += 1 # clear previous location-based routing directive self.request.clear_route_to_location() - if not self.endpoint_discovery_enable: - # if endpoint discovery is disabled, the request cannot be retried anywhere else - return False - if self.can_use_multiple_write_locations: if _OperationType.IsReadOnlyOperation(self.request.operation_type): locations = self.global_endpoint_manager.get_ordered_read_locations() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 5b805b2e145d..4937a10d7c76 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -75,7 +75,15 @@ def __init__(self, client: "CosmosClientConnection"): self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() + def is_per_partition_automatic_failover_enabled(self) -> bool: + if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: + return False + return True + def is_per_partition_automatic_failover_applicable(self, request: RequestObject) -> bool: + if not self.is_per_partition_automatic_failover_enabled(): + return False + if not request: return False @@ -83,9 +91,6 @@ def 
is_per_partition_automatic_failover_applicable(self, request: RequestObject) or _OperationType.IsReadOnlyOperation(request.operation_type)): return False - if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: - return False - # if we have at most one region available in the account, we cannot do per partition automatic failover available_regions = self.compute_available_preferred_regions(request) if len(available_regions) <= 1: From c2bb93a2d1794905944a347b9c57fba6f3cc2454 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 25 Aug 2025 11:05:48 -0500 Subject: [PATCH 33/68] change errors since 503 is now retried directly --- .../azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py index 399eb4ed86bb..8fe56c06b11b 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py @@ -80,7 +80,7 @@ def operations(): def create_errors(errors=None): errors = [] - error_codes = [408, 500, 502, 503] + error_codes = [408, 500, 502, 504] for error_code in error_codes: errors.append(CosmosHttpResponseError( status_code=error_code, From c3879d8905af50290bb5b058a202e4b30d56a9b7 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 26 Aug 2025 12:24:42 -0500 Subject: [PATCH 34/68] Update sdk/cosmos/azure-cosmos/README.md Co-authored-by: Abhijeet Mohanty --- sdk/cosmos/azure-cosmos/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/README.md b/sdk/cosmos/azure-cosmos/README.md index 11b327b6c369..45a147aaaed0 100644 --- a/sdk/cosmos/azure-cosmos/README.md +++ 
b/sdk/cosmos/azure-cosmos/README.md @@ -942,7 +942,7 @@ requests to another region: ### Per Partition Automatic Failover (Public Preview) Per partition automatic failover enables the SDK to automatically redirect write requests at the partition level to another region based on service-side signals. This feature is available -only for single write region accounts that have at least one read-only region. When per partition automatic failover is enabled, per partition circuit breaker and hedging is enabled by default, meaning +only for single write region accounts that have at least one read-only region. When per partition automatic failover is enabled, per partition circuit breaker and cross-region hedging is enabled by default, meaning all its configurable options also apply to per partition automatic failover. To enable this feature, follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover). ## Troubleshooting From 1d57bf276f5809942194fb930237b63d1179b826 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 26 Aug 2025 15:56:42 -0500 Subject: [PATCH 35/68] address comments update changelog, update docs, add typehints and documentation --- sdk/cosmos/azure-cosmos/CHANGELOG.md | 3 +- sdk/cosmos/azure-cosmos/azure/cosmos/_base.py | 20 ------- .../_endpoint_discovery_retry_policy.py | 4 +- ...anager_per_partition_automatic_failover.py | 55 +++++++++++++++--- .../cosmos/_service_response_retry_policy.py | 3 +- .../_service_unavailable_retry_policy.py | 26 +++++++-- .../azure/cosmos/_session_retry_policy.py | 21 +++---- .../cosmos/_timeout_failover_retry_policy.py | 3 +- ..._per_partition_automatic_failover_async.py | 56 ++++++++++++++++--- .../azure/cosmos/aio/_retry_utility_async.py | 2 - .../azure-cosmos/docs/ErrorCodesAndRetries.md | 24 ++++---- 11 files changed, 147 insertions(+), 70 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md 
b/sdk/cosmos/azure-cosmos/CHANGELOG.md index b7b00e071517..39f312c60cb3 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -4,6 +4,7 @@ #### Features Added * Added read_items API to provide an efficient method for retrieving multiple items in a single request. See [PR 42167](https://github.com/Azure/azure-sdk-for-python/pull/42167). +* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover). See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). #### Breaking Changes @@ -11,6 +12,7 @@ * Improved the resilience of Database Account Read metadata operation against short-lived network issues by increasing number of retries. See [PR 42525](https://github.com/Azure/azure-sdk-for-python/pull/42525). * Fixed bug where during health checks read regions were marked as unavailable for write operations. See [PR 42525](https://github.com/Azure/azure-sdk-for-python/pull/42525). * Fixed bug where `excluded_locations` was not being honored for some metadata calls. See [PR 42266](https://github.com/Azure/azure-sdk-for-python/pull/42266). +* Added cross-regional retries for 503 (Service Unavailable) errors. See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). #### Other Changes * Added session token false progress merge logic. See [42393](https://github.com/Azure/azure-sdk-for-python/pull/42393) @@ -19,7 +21,6 @@ #### Features Added * Added feed range support in `query_items`. See [PR 41722](https://github.com/Azure/azure-sdk-for-python/pull/41722). -* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover). See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). 
#### Bugs Fixed * Fixed session container session token logic. The SDK will now only send the relevant partition-local session tokens for read document requests and write requests when multi-region writes are enabled, as opposed to the entire compound session token for the container for every document request. See [PR 41678](https://github.com/Azure/azure-sdk-for-python/pull/41678). diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py index da5a5befbf91..1fdc40a46d68 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_base.py @@ -28,7 +28,6 @@ import uuid import re import binascii -import os from typing import Dict, Any, List, Mapping, Optional, Sequence, Union, Tuple, TYPE_CHECKING from urllib.parse import quote as urllib_quote @@ -940,22 +939,3 @@ def _build_properties_cache(properties: Dict[str, Any], container_link: str) -> "_self": properties.get("_self", None), "_rid": properties.get("_rid", None), "partitionKey": properties.get("partitionKey", None), "container_link": container_link } - -def try_ppaf_failover_threshold( - global_endpoint_manager: "_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover", - pk_range_wrapper: "PartitionKeyRangeWrapper", - request: "RequestObject"): - # If PPAF is enabled, we track consecutive failures for certain exceptions, and only fail over at a partition - # level after the threshold is reached - if request and global_endpoint_manager.is_per_partition_automatic_failover_applicable(request): - if (global_endpoint_manager.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) - >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, - Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): - # If the PPAF threshold is reached, we reset the count and retry to the next region - global_endpoint_manager.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) - partition_level_info = 
global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper] - location = global_endpoint_manager.location_cache.get_location_from_endpoint( - str(request.location_endpoint_to_route)) - regional_context = (global_endpoint_manager.location_cache. - account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) - partition_level_info.unavailable_regional_endpoints[location] = regional_context diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index ff94355ae7ba..14eed13d8dbc 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -72,9 +72,9 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( str(self.request.location_endpoint_to_route)) - location_endpoint = (self.global_endpoint_manager.location_cache. + regional_endpoint = (self.global_endpoint_manager.location_cache. 
account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) - partition_level_info.unavailable_regional_endpoints[location] = location_endpoint + partition_level_info.unavailable_regional_endpoints[location] = regional_endpoint self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) return True diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index b1516bfe3c5d..2dceacbdaabf 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -6,10 +6,12 @@ """ import logging import threading +import os from typing import Dict, TYPE_CHECKING, Optional from azure.cosmos.http_constants import ResourceType +from azure.cosmos._constants import _Constants as Constants from azure.cosmos._global_partition_endpoint_manager_circuit_breaker import \ _GlobalPartitionEndpointManagerForCircuitBreaker from azure.cosmos._partition_health_tracker import _PPAFPartitionThresholdsTracker @@ -43,8 +45,7 @@ def try_move_to_next_location( request: RequestObject) -> bool: with self._lock: if endpoint_region != self.current_region: - logger.info("PPAF - Moving to next available regional endpoint: %s", self.current_region) - # make the actual endpoint since the current_region is just West US + logger.warning("PPAF - Moving to next available regional endpoint %s", self.current_region) regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) return True @@ -95,22 +96,62 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) if len(available_regions) <= 1: return False - # if the 
request is not for a document or if the request is not executing a stored procedure, return False - if (request.resource_type != ResourceType.Document and - request.operation_type != _OperationType.ExecuteJavaScript): + # if the request is not a non-query plan document request + # or if the request is not executing a stored procedure, return False + if ((request.resource_type != ResourceType.Document and + request.operation_type != _OperationType.ExecuteJavaScript) or + request.operation_type == _OperationType.QueryPlan): return False return True + def try_ppaf_failover_threshold( + self, + pk_range_wrapper: "PartitionKeyRangeWrapper", + request: "RequestObject"): + """Verifies whether the per-partition failover threshold has been reached for consecutive errors. If so, + it marks the current region as unavailable for the given partition key range, and moves to the next available + region for the request. + + :param PartitionKeyRangeWrapper pk_range_wrapper: The wrapper containing the partition key range information + for the request. + :param RequestObject request: The request object containing the routing context. + :returns: None + """ + # If PPAF is enabled, we track consecutive failures for certain exceptions, and only fail over at a partition + # level after the threshold is reached + if request and self.is_per_partition_automatic_failover_applicable(request): + if (self.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) + >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, + Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): + # If the PPAF threshold is reached, we reset the count and retry to the next region + self.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) + partition_level_info = self.partition_range_to_failover_info[pk_range_wrapper] + location = self.location_cache.get_location_from_endpoint( + str(request.location_endpoint_to_route)) + regional_context = (self.location_cache. 
+ account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) + partition_level_info.unavailable_regional_endpoints[location] = regional_context + def resolve_service_endpoint_for_partition( self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] ) -> str: + """Resolves the endpoint to be used for the request. In a PPAF-enabled account, this method checks whether + the partition key range has any unavailable regions, and if so, it tries to move to the next available region. + If all regions are unavailable, it invalidates the cache and starts once again from the main write region in the + account configurations. + + :param PartitionKeyRangeWrapper pk_range_wrapper: The wrapper containing the partition key range information + for the request. + :param RequestObject request: The request object containing the routing context. + :returns: The regional endpoint to be used for the request. + :rtype: str + """ if self.is_per_partition_automatic_failover_applicable(request) and pk_range_wrapper: # If per partition automatic failover is applicable, we check partition unavailability if pk_range_wrapper in self.partition_range_to_failover_info: - logger.info("Resolving service endpoint for partition with per partition automatic failover enabled.") partition_failover_info = self.partition_range_to_failover_info[pk_range_wrapper] if request.location_endpoint_to_route is not None: endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) @@ -120,7 +161,7 @@ def resolve_service_endpoint_for_partition( self.compute_available_preferred_regions(request), endpoint_region, request): - logger.info("All available regions for partition are unavailable. Refreshing cache.") + logger.warning("All available regions for partition are unavailable. 
Refreshing cache.") # If no other region is available, we invalidate the cache and start once again from our # main write region in the account configurations self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_response_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_response_retry_policy.py index 31f6e800d5e1..b1e176299278 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_response_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_response_retry_policy.py @@ -10,7 +10,6 @@ import logging from azure.cosmos.documents import _OperationType -from azure.cosmos._base import try_ppaf_failover_threshold class ServiceResponseRetryPolicy(object): @@ -51,7 +50,7 @@ def ShouldRetry(self): if self.request: # We track consecutive failures for per partition automatic failover, and only fail over at a partition # level after the threshold is reached - try_ppaf_failover_threshold(self.global_endpoint_manager, self.pk_range_wrapper, self.request) + self.global_endpoint_manager.try_ppaf_failover_threshold(self.pk_range_wrapper, self.request) if not _OperationType.IsReadOnlyOperation(self.request.operation_type) and not self.request.retry_write: return False if self.request.retry_write and self.failover_retry_count + 1 >= self.max_write_retry_count: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py index 245f72bf5489..4aad1388f0a6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -1,16 +1,32 @@ # The MIT License (MIT) # Copyright (c) Microsoft Corporation. All rights reserved. -"""Internal class for service unavailable retry policy implementation in the Azure -Cosmos database service. 
+"""Internal class for service unavailable errors implementation in the Azure Cosmos database service. + +Service unavailable errors can occur when a request does not make it to the service, or when there is an issue with +the service. In either case, we know the request did not get processed successfully, so service unavailable errors are + retried in the next available preferred region. """ -from azure.cosmos.documents import _OperationType +from typing import Union +from azure.cosmos.documents import _OperationType, ConnectionPolicy +from azure.cosmos.exceptions import CosmosHttpResponseError +from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper +from azure.cosmos._global_partition_endpoint_manager_per_partition_automatic_failover import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover # pylint: disable=line-too-long +from azure.cosmos.aio._global_partition_endpoint_manager_per_partition_automatic_failover_async import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync # pylint: disable=line-too-long + +_GlobalEndpointManagerType = Union[_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, + _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover] #cspell:ignore ppaf class _ServiceUnavailableRetryPolicy(object): - def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, *args): + def __init__( + self, + connection_policy: ConnectionPolicy, + global_endpoint_manager: _GlobalEndpointManagerType, + pk_range_wrapper: PartitionKeyRangeWrapper, + *args): self.retry_after_in_milliseconds = 500 self.global_endpoint_manager = global_endpoint_manager self.pk_range_wrapper = pk_range_wrapper @@ -24,7 +40,7 @@ def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, self._max_retry_attempt_count = max(2, len( self.global_endpoint_manager.location_cache.write_regional_routing_contexts)) - def ShouldRetry(self, _exception): + def ShouldRetry(self, 
_exception: CosmosHttpResponseError): """Returns true if the request should retry based on the passed-in exception. :param exceptions.CosmosHttpResponseError _exception: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py index 5c40bf2881f9..1e530ad9e37e 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py @@ -104,16 +104,17 @@ def ShouldRetry(self, _exception): # For PPAF, the retry should happen to whatever the relevant write region is for the affected partition. if self.global_endpoint_manager.is_per_partition_automatic_failover_enabled(): pk_failover_info = self.global_endpoint_manager.partition_range_to_failover_info.get(self.pk_range_wrapper) - location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( - str(self.request.location_endpoint_to_route)) - if location in pk_failover_info.unavailable_regional_endpoints: - # If the request endpoint is unavailable, we need to resolve the endpoint for the request using the - # partition-level failover info - location_endpoint = (self.global_endpoint_manager.location_cache. - account_read_regional_routing_contexts_by_location. - get(pk_failover_info.current_region).primary_endpoint) - self.request.route_to_location(location_endpoint) - return True + if pk_failover_info is not None: + location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( + str(self.request.location_endpoint_to_route)) + if location in pk_failover_info.unavailable_regional_endpoints: + # If the request endpoint is unavailable, we need to resolve the endpoint for the request using the + # partition-level failover info + location_endpoint = (self.global_endpoint_manager.location_cache. + account_read_regional_routing_contexts_by_location. 
+ get(pk_failover_info.current_region).primary_endpoint) + self.request.route_to_location(location_endpoint) + return True # Resolve the endpoint for the request and pin the resolution to the resolved endpoint # This enables marking the endpoint unavailability on endpoint failover/unreachability diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py index 952685ef5e06..ba5047df0ccc 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py @@ -5,7 +5,6 @@ Cosmos database service. """ from azure.cosmos.documents import _OperationType -from azure.cosmos._base import try_ppaf_failover_threshold # cspell:ignore PPAF, ppaf @@ -38,7 +37,7 @@ def ShouldRetry(self, _exception): :returns: a boolean stating whether the request should be retried :rtype: bool """ - try_ppaf_failover_threshold(self.global_endpoint_manager, self.pk_range_wrapper, self.request) + self.global_endpoint_manager.try_ppaf_failover_threshold(self.pk_range_wrapper, self.request) # we retry only if the request is a read operation or if it is a write operation with retry enabled if self.request and not self.is_operation_retryable(): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 4937a10d7c76..e96d9e3152a7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -6,10 +6,12 @@ """ import logging import threading +import os from typing import Dict, TYPE_CHECKING, Optional from azure.cosmos.http_constants 
import ResourceType +from azure.cosmos._constants import _Constants as Constants from azure.cosmos.aio._global_partition_endpoint_manager_circuit_breaker_async import \ _GlobalPartitionEndpointManagerForCircuitBreakerAsync from azure.cosmos.documents import _OperationType @@ -43,7 +45,7 @@ def try_move_to_next_location( request: RequestObject) -> bool: with self._lock: if endpoint_region != self.current_region: - logger.info("PPAF - Moving to next available regional endpoint: %s", self.current_region) + logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) # make the actual endpoint since the current_region is just West US regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) @@ -57,7 +59,7 @@ def try_move_to_next_location( continue self.current_region = regional_endpoint - logger.info("PPAF - Moving to next available regional endpoint: %s", self.current_region) + logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) return True @@ -96,22 +98,62 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) if len(available_regions) <= 1: return False - # if the request is not for a document or if the request is not executing a stored procedure, return False - if (request.resource_type != ResourceType.Document and - request.operation_type != _OperationType.ExecuteJavaScript): + # if the request is not a non-query plan document request + # or if the request is not executing a stored procedure, return False + if ((request.resource_type != ResourceType.Document and + request.operation_type != _OperationType.ExecuteJavaScript) or + request.operation_type == _OperationType.QueryPlan): return False return True + def try_ppaf_failover_threshold( + self, + pk_range_wrapper: 
"PartitionKeyRangeWrapper", + request: "RequestObject"): + """Verifies whether the per-partition failover threshold has been reached for consecutive errors. If so, + it marks the current region as unavailable for the given partition key range, and moves to the next available + region for the request. + + :param PartitionKeyRangeWrapper pk_range_wrapper: The wrapper containing the partition key range information + for the request. + :param RequestObject request: The request object containing the routing context. + :returns: None + """ + # If PPAF is enabled, we track consecutive failures for certain exceptions, and only fail over at a partition + # level after the threshold is reached + if request and self.is_per_partition_automatic_failover_applicable(request): + if (self.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) + >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, + Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): + # If the PPAF threshold is reached, we reset the count and retry to the next region + self.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) + partition_level_info = self.partition_range_to_failover_info[pk_range_wrapper] + location = self.location_cache.get_location_from_endpoint( + str(request.location_endpoint_to_route)) + regional_context = (self.location_cache. + account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) + partition_level_info.unavailable_regional_endpoints[location] = regional_context + def resolve_service_endpoint_for_partition( self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] ) -> str: + """Resolves the endpoint to be used for the request. In a PPAF-enabled account, this method checks whether + the partition key range has any unavailable regions, and if so, it tries to move to the next available region. 
+ If all regions are unavailable, it invalidates the cache and starts once again from the main write region in the + account configurations. + + :param PartitionKeyRangeWrapper pk_range_wrapper: The wrapper containing the partition key range information + for the request. + :param RequestObject request: The request object containing the routing context. + :returns: The regional endpoint to be used for the request. + :rtype: str + """ if self.is_per_partition_automatic_failover_applicable(request) and pk_range_wrapper: # If per partition automatic failover is applicable, we check partition unavailability if pk_range_wrapper in self.partition_range_to_failover_info: - logger.info("Resolving service endpoint for partition with per partition automatic failover enabled.") partition_failover_info = self.partition_range_to_failover_info[pk_range_wrapper] if request.location_endpoint_to_route is not None: endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) @@ -121,7 +163,7 @@ def resolve_service_endpoint_for_partition( self.compute_available_preferred_regions(request), endpoint_region, request): - logger.info("All available regions for partition are unavailable. Refreshing cache.") + logger.warning("All available regions for partition are unavailable. 
Refreshing cache.") # If no other region is available, we invalidate the cache and start once again from our # main write region in the account configurations self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 6afc01c53f02..88caac506cb1 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -202,11 +202,9 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg if retry_policy.should_update_throughput_link(request.body, cached_container): new_body = retry_policy._update_throughput_link(request.body) request.body = new_body - retry_policy.container_rid = cached_container["_rid"] request.headers[retry_policy._intended_headers] = retry_policy.container_rid elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: - # if ppaf is applicable, we record the failure retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: diff --git a/sdk/cosmos/azure-cosmos/docs/ErrorCodesAndRetries.md b/sdk/cosmos/azure-cosmos/docs/ErrorCodesAndRetries.md index 5f9cfd3c03d6..bc1dd0bf79d4 100644 --- a/sdk/cosmos/azure-cosmos/docs/ErrorCodesAndRetries.md +++ b/sdk/cosmos/azure-cosmos/docs/ErrorCodesAndRetries.md @@ -2,17 +2,17 @@ The Cosmos DB Python SDK has several default policies that will deal with retrying certain errors and exceptions. More information on these can be found below. 
-| Status code | Cause of exception and retry behavior | -| :--- |:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 400 | For all operations:
  • This exception is encountered when the request is invalid, which could be for any of the following reasons:
    • Syntax error in query text
    • Malformed JSON document for a write request
    • Incorrectly formatted REST API request body etc.
  • The client does NOT retry the request when a Bad Request (400) exception is thrown by the server.
| -| 401 | For all operations:
  • This is an unauthorized exception due to invalid auth tokens being used for the request. The client does NOT retry requests when this exception is encountered.
| -| 403 |
  • For Substatus 3 (Write Forbidden) and Substatus 1008 (Database Account Not Found):
    • This exception occurs when a geo-replicated database account runs into writable/readable location changes (say, after a failover).
    • This exception can occur regardless of the Consistency level set for the account.
    • The client refreshes it's location endpoints and retries requests when the user has enabled endpoint discovery in their client (default behavior).
  • For all other cases:
    • The client does NOT retry requests when this exception is encountered.
    • | +| Status code | Cause of exception and retry behavior | +|:------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 400 | For all operations:
      • This exception is encountered when the request is invalid, which could be for any of the following reasons:
        • Syntax error in query text
        • Malformed JSON document for a write request
        • Incorrectly formatted REST API request body etc.
      • The client does NOT retry the request when a Bad Request (400) exception is thrown by the server.
      | +| 401 | For all operations:
      • This is an unauthorized exception due to invalid auth tokens being used for the request. The client does NOT retry requests when this exception is encountered.
      | +| 403 |
      • For Substatus 3 (Write Forbidden) and Substatus 1008 (Database Account Not Found):
        • This exception occurs when a geo-replicated database account runs into writable/readable location changes (say, after a failover).
        • This exception can occur regardless of the Consistency level set for the account.
        • The client refreshes its location endpoints and retries requests when the user has enabled endpoint discovery in their client (default behavior).
      • For all other cases:
        • The client does NOT retry requests when this exception is encountered.
        • | | 404/1002 |
          • For write operations:
            • If multiple write locations are enabled for the account, the SDK will fetch the write endpoints and retry once per each of these.
            • The client refreshes it's location endpoints and retries requests when the user has enabled endpoint discovery in their client (default behavior).
            • If the account does not have multiple write locations enabled, the SDK will retry only once in the account primary region.
          • For read operations:
            • If multiple write locations are enabled for the account, the SDK will fetch the read endpoints and retry once per each of these.
            • The client refreshes it's location endpoints and retries requests when the user has enabled endpoint discovery in their client (default behavior).
            • If the account does not have multiple write locations enabled, the SDK will retry only once in the account primary region.
            • | -| 408 |
              • For Write Operations:
                • Timeout exceptions can be encountered by both the client as well as the server. Server-side timeout exceptions are not retried for write operations as it is not possible to determine if the write was in fact successfully committed on the server. For a client-generated timeout exception, either the request was sent over the wire to the server by the client and the network request timeout exceeded while waiting for a response, or the request was not sent over the wire to the server which resulted in a client-generated timeout. The client does NOT retry for either.
              • For Query and Point Read Operations:
                • The SDK will retry on the next preferred region, if any is available.
              | -| 409 |
              • For Write Operations:
                • This exception occurs when an attempt is made by the application to Create/Insert an Item that already exists.
                • This exception can occur regardless of the Consistency level set for the account.
                • This exception can occur for write operations when an attempt is made to create an existing item or when a unique key constraint violation occurs.
                • The client does NOT retry on Conflict exceptions
              • For Query and Point Read Operations:
                • N/A as this exception is only encountered for Create/Insert operations.
              • | +| 408 |
                • For Write Operations:
                  • Timeout exceptions can be encountered by both the client as well as the server. Server-side timeout exceptions are not retried for write operations as it is not possible to determine if the write was in fact successfully committed on the server. For a client-generated timeout exception, either the request was sent over the wire to the server by the client and the network request timeout exceeded while waiting for a response, or the request was not sent over the wire to the server which resulted in a client-generated timeout. The client does NOT retry for either.
                • For Query and Point Read Operations:
                  • The SDK will retry on the next preferred region, if any is available.
                | +| 409 |
                • For Write Operations:
                  • This exception occurs when an attempt is made by the application to Create/Insert an Item that already exists.
                  • This exception can occur regardless of the Consistency level set for the account.
                  • This exception can occur for write operations when an attempt is made to create an existing item or when a unique key constraint violation occurs.
                  • The client does NOT retry on Conflict exceptions.
                • For Query and Point Read Operations:
                  • N/A as this exception is only encountered for Create/Insert operations.
                • | | 410/1002 |
                  • For all operations:
                    • This exception occurs when a partition is split (or merged in the future) and no longer exists, and can occur regardless of the Consistency level set for the account.
                    • The SDK will refresh its partition key range cache and trigger a single retry, fetching the new ranges from the gateway once it finds an empty cache.
                    • | -| 412 |
                      • For Write Operations:
                        • This exception is encountered when the etag that is sent to the server for validation prior to updating an Item, does not match the etag of the Item on the server.
                        • The client does NOT retry this operation locally or against any of the remote regions for the account as retries would not help alleviate the etag mismatch.
                        • The application would need to trigger a retry by first reading the Item, fetching the latest etag and issuing the Upsert/Replace operation.
                          • This operation can continue to fail with the same exception when multiple updates are executed concurrently for the same Item.
                          • An upper bound on the number of retries before handing off the Item to a dead letter queue should be implemented by the application.
                      • For Query and point read Operations:
                        • N/A as this exception is only encountered for Create/Insert/Replace/Upsert operations.
                      | -| 429 | For all Operations:
                      • By default, the client retries the request for a maximum of 9 times (or for a maximum of 30 seconds, whichever limit is reached first).
                      • The client can also be initialized with a custom retry policy, which overrides the two limits mentioned above.
                      • After all the retries are exhausted, the client bubbles up the exception to the application.
                      • **For a multi-region account**, the client does NOT retry the request against a remote region for the account.
                      • When the application receives a Request Rate too large exception (429), the application would need to instrument its own retry logic and dead letter queues.
                      | -| 449 |
                      • For Write Operations:
                        • This exception is encountered when a resource is concurrently updated on the server, which can happen due to concurrent writes, user triggered while conflicts are concurrently being resolved etc.
                        • Only one update can be executed at a time per item. The other concurrent requests will fail with a Concurrent Execution Exception (449).
                        • The client does NOT retry requests that failed with a 449.
                      • For Query and point read Operations:
                        • N/A as this exception is only encountered for Create/Insert/Replace/Upsert operations.
                      | -| 500 | For all Operations:
                      • The occurrence of an Invalid Exception (500) is extremely rare, and the client will retry a request that encounters this exception on the next preferred regions.
                      | -| 503 | When a Service Unavailable exception is encountered:
                      • The request will be retried by the SDK on the next preferred regions. | +| 412 |
                        • For Write Operations:
                          • This exception is encountered when the etag sent to the server for validation prior to updating an Item does not match the etag of the Item on the server.
                          • The client does NOT retry this operation locally or against any of the remote regions for the account as retries would not help alleviate the etag mismatch.
                          • The application would need to trigger a retry by first reading the Item, fetching the latest etag and issuing the Upsert/Replace operation.
                            • This operation can continue to fail with the same exception when multiple updates are executed concurrently for the same Item.
                            • An upper bound on the number of retries before handing off the Item to a dead letter queue should be implemented by the application.
                        • For Query and point read Operations:
                          • N/A as this exception is only encountered for Create/Insert/Replace/Upsert operations.
                        | +| 429 | For all Operations:
                        • By default, the client retries the request up to 9 times (or for up to 30 seconds, whichever limit is reached first).
                        • The client can also be initialized with a custom retry policy, which overrides the two limits mentioned above.
                        • After all the retries are exhausted, the client bubbles up the exception to the application.
                        • **For a multi-region account**, the client does NOT retry the request against a remote region for the account.
                        • When the application receives a Request Rate too large exception (429), the application would need to instrument its own retry logic and dead letter queues.
                        | +| 449 |
                        • For Write Operations:
                          • This exception is encountered when a resource is concurrently updated on the server, which can happen due to concurrent writes, user-triggered updates while conflicts are concurrently being resolved, etc.
                          • Only one update can be executed at a time per item. The other concurrent requests will fail with a Concurrent Execution Exception (449).
                          • The client does NOT retry requests that failed with a 449.
                        • For Query and point read Operations:
                          • N/A as this exception is only encountered for Create/Insert/Replace/Upsert operations.
                        | +| 500 |
                        • For Write Operations:
                          • The client does NOT retry write requests.
                        • For Read Operations:
                          • The request will be retried by the SDK on the next preferred regions.
                        | +| 503 | When a Service Unavailable exception is encountered, for all Operations:
                        • The request will be retried by the SDK on the next preferred regions. | From eec77e76b71bb41135f53967952a25fb628b1d0d Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:34:23 -0500 Subject: [PATCH 36/68] Update _service_unavailable_retry_policy.py --- .../cosmos/_service_unavailable_retry_policy.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py index 4aad1388f0a6..54615ca597c6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -7,25 +7,17 @@ the service. In either case, we know the request did not get processed successfully, so service unavailable errors are retried in the next available preferred region. 
""" -from typing import Union -from azure.cosmos.documents import _OperationType, ConnectionPolicy +from azure.cosmos.documents import _OperationType from azure.cosmos.exceptions import CosmosHttpResponseError -from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper -from azure.cosmos._global_partition_endpoint_manager_per_partition_automatic_failover import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover # pylint: disable=line-too-long -from azure.cosmos.aio._global_partition_endpoint_manager_per_partition_automatic_failover_async import _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync # pylint: disable=line-too-long - -_GlobalEndpointManagerType = Union[_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, - _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover] #cspell:ignore ppaf class _ServiceUnavailableRetryPolicy(object): - def __init__( self, - connection_policy: ConnectionPolicy, - global_endpoint_manager: _GlobalEndpointManagerType, - pk_range_wrapper: PartitionKeyRangeWrapper, + connection_policy, + global_endpoint_manager, + pk_range_wrapper, *args): self.retry_after_in_milliseconds = 500 self.global_endpoint_manager = global_endpoint_manager From 4c2bf3223b7fa105672bf6a7ff74d4350ca83ca0 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:49:49 -0500 Subject: [PATCH 37/68] small test updates for 503 behavior --- ..._endpoint_manager_per_partition_automatic_failover.py | 5 ++--- ...int_manager_per_partition_automatic_failover_async.py | 5 ++--- .../tests/test_per_partition_automatic_failover.py | 9 +++------ .../tests/test_per_partition_automatic_failover_async.py | 9 +++------ 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py 
b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 2dceacbdaabf..cd6f9e85a329 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -98,9 +98,8 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) # if the request is not a non-query plan document request # or if the request is not executing a stored procedure, return False - if ((request.resource_type != ResourceType.Document and - request.operation_type != _OperationType.ExecuteJavaScript) or - request.operation_type == _OperationType.QueryPlan): + if (request.resource_type != ResourceType.Document and + request.operation_type != _OperationType.ExecuteJavaScript): return False return True diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index e96d9e3152a7..40d54c64a981 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -100,9 +100,8 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) # if the request is not a non-query plan document request # or if the request is not executing a stored procedure, return False - if ((request.resource_type != ResourceType.Document and - request.operation_type != _OperationType.ExecuteJavaScript) or - request.operation_type == _OperationType.QueryPlan): + if (request.resource_type != ResourceType.Document and + request.operation_type != _OperationType.ExecuteJavaScript): 
return False return True diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index eeed945f388f..e0f855140bf8 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -162,10 +162,7 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] initial_region = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_region - is_503 = hasattr(error, 'status_code') and error.status_code == 503 - # Since 503 errors are retried by default, we each request counts as two failures - consecutive_failures = 3 if is_503 else 6 - + consecutive_failures = 6 for i in range(consecutive_failures): # We perform the write operation multiple times to check the consecutive failures logic with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: @@ -179,7 +176,7 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): pk_range_wrappers = list(global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count.keys()) assert len(pk_range_wrappers) == 1 failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] - assert failure_count == 6 + assert failure_count == consecutive_failures # Run some more requests to the same partition to trigger the failover logic for i in range(consecutive_failures): with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: @@ -199,7 +196,7 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): # Since we are failing every request, even though we retried to the next region, that retry should have failed as well # This means we should have one 
extra failure - verify that the value makes sense failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] - assert failure_count == 1 if is_503 else 3 + assert failure_count == 3 @pytest.mark.parametrize("write_operation, exclude_client_regions", write_operations_and_boolean()) def test_ppaf_exclude_regions(self, write_operation, exclude_client_regions): diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 3baadf9e9879..c5721a0af9e8 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -155,10 +155,7 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation pk_range_wrapper = list(global_endpoint_manager.partition_range_to_failover_info.keys())[0] initial_region = global_endpoint_manager.partition_range_to_failover_info[pk_range_wrapper].current_region - is_503 = hasattr(error, 'status_code') and error.status_code == 503 - # Since 503 errors are retried by default, we each request counts as two failures - consecutive_failures = 3 if is_503 else 6 - + consecutive_failures = 6 for i in range(consecutive_failures): # We perform the write operation multiple times to check the consecutive failures logic with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: @@ -173,7 +170,7 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation pk_range_wrappers = list(global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count.keys()) assert len(pk_range_wrappers) == 1 failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] - assert failure_count == 6 + assert failure_count == consecutive_failures # Run some more 
requests to the same partition to trigger the failover logic for i in range(consecutive_failures): with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: @@ -193,7 +190,7 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation # Since we are failing every request, even though we retried to the next region, that retry should have failed as well # This means we should have one extra failure - verify that the value makes sense failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] - assert failure_count == 1 if is_503 else 3 + assert failure_count == 3 @pytest.mark.parametrize("write_operation, exclude_client_regions", write_operations_and_boolean()) async def test_ppaf_exclude_regions_async(self, write_operation, exclude_client_regions): From 05654a9d38d738837c288502a1a415ae22252b4c Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 26 Aug 2025 21:19:15 -0500 Subject: [PATCH 38/68] further comments --- ...anager_per_partition_automatic_failover.py | 3 +- .../_service_unavailable_retry_policy.py | 8 ++--- .../cosmos/_timeout_failover_retry_policy.py | 8 ++--- .../test_per_partition_automatic_failover.py | 31 +++++++++++++------ ..._per_partition_automatic_failover_async.py | 24 +++++++++----- 5 files changed, 47 insertions(+), 27 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index cd6f9e85a329..6e06e3012c18 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -131,6 +131,7 @@ def try_ppaf_failover_threshold( regional_context = 
(self.location_cache. account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) partition_level_info.unavailable_regional_endpoints[location] = regional_context + print(3) def resolve_service_endpoint_for_partition( self, @@ -169,8 +170,6 @@ def resolve_service_endpoint_for_partition( pk_range_wrapper) else: # Update the current regional endpoint to whatever the request is routing to - endpoint_region = self.location_cache.get_location_from_endpoint( - request.location_endpoint_to_route) partition_failover_info.current_region = endpoint_region else: partition_failover_info = PartitionLevelFailoverInfo() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py index 54615ca597c6..8269eb86799d 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -26,11 +26,11 @@ def __init__( self.connection_policy = connection_policy self.request = args[0] if args else None # If an account only has 1 region, then we still want to retry once on the same region - self._max_retry_attempt_count = max(2, len(self.global_endpoint_manager.location_cache - .read_regional_routing_contexts)) + self._max_retry_attempt_count = len(self.global_endpoint_manager. + location_cache.read_regional_routing_contexts) + 1 if self.request and _OperationType.IsWriteOperation(self.request.operation_type): - self._max_retry_attempt_count = max(2, len( - self.global_endpoint_manager.location_cache.write_regional_routing_contexts)) + self._max_retry_attempt_count = len(self.global_endpoint_manager.location_cache. + write_regional_routing_contexts) + 1 def ShouldRetry(self, _exception: CosmosHttpResponseError): """Returns true if the request should retry based on the passed-in exception. 
diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py index ba5047df0ccc..d019b125c2c3 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py @@ -20,12 +20,12 @@ def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper, # If an account only has 1 region, then we still want to retry once on the same region # We want this to be the default retry attempts as paging through a query means there are requests without # a request object - self._max_retry_attempt_count = max(2, len(self.global_endpoint_manager.location_cache - .read_regional_routing_contexts)) + self._max_retry_attempt_count = len(self.global_endpoint_manager. + location_cache.read_regional_routing_contexts) + 1 # If the request is a write operation, we only want to retry once if retry write is enabled if self.request and _OperationType.IsWriteOperation(self.request.operation_type): - self._max_retry_attempt_count = max(2, len( - self.global_endpoint_manager.location_cache.write_regional_routing_contexts)) + self._max_retry_attempt_count = len(self.global_endpoint_manager.location_cache. 
+ write_regional_routing_contexts) + 1 self.retry_count = 0 self.connection_policy = connection_policy self.request = args[0] if args else None diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index e0f855140bf8..b3fede1c9f65 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -66,8 +66,8 @@ def setup_info(self, error=None, max_count=None, is_batch=False, exclude_client_ # two documents targeted to same partition, one will always fail and the other will succeed doc_fail_id = str(uuid.uuid4()) doc_success_id = str(uuid.uuid4()) - predicate = lambda r: (FaultInjectionTransport.predicate_req_for_document_with_id(r, doc_fail_id) - and FaultInjectionTransport.predicate_is_write_operation(r, "west")) + predicate = lambda r: (FaultInjectionTransport.predicate_req_for_document_with_id(r, doc_fail_id) and + FaultInjectionTransport.predicate_is_write_operation(r, "com")) # The MockRequest only gets used to create the MockHttpResponse mock_request = FaultInjectionTransport.MockHttpRequest(url=self.host) if is_batch: @@ -177,8 +177,19 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): assert len(pk_range_wrappers) == 1 failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] assert failure_count == consecutive_failures - # Run some more requests to the same partition to trigger the failover logic - for i in range(consecutive_failures): + + # Verify that a single success to the same partition resets the consecutive failures count + perform_write_operation(write_operation, + container, + fault_injection_container, + str(uuid.uuid4()), + PK_VALUE) + + failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count.get(pk_range_wrappers[0], 
0) + assert failure_count == 0 + + # Run enough failed requests to the partition to trigger the failover logic + for i in range(12): with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: perform_write_operation(write_operation, container, @@ -193,10 +204,9 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): assert initial_region in partition_info.unavailable_regional_endpoints assert initial_region != partition_info.current_region # west us 3 != west us - # Since we are failing every request, even though we retried to the next region, that retry should have failed as well - # This means we should have one extra failure - verify that the value makes sense + # 12 failures - 10 to trigger failover, 2 more to start counting again failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] - assert failure_count == 3 + assert failure_count == 2 @pytest.mark.parametrize("write_operation, exclude_client_regions", write_operations_and_boolean()) def test_ppaf_exclude_regions(self, write_operation, exclude_client_regions): @@ -255,9 +265,10 @@ def test_ppaf_session_unavailable_retry(self, write_operation, error): fault_injection_container.read_item(doc_fail_id, PK_VALUE, raw_response_hook=session_retry_hook) def session_retry_hook(raw_response): - # This hook is used to verify the request routing that happens after the session retry logic - region_string = "-" + REGION_2.replace(' ', '').lower() + "." - assert region_string in raw_response.http_request.url + if raw_response.http_request.headers.get('x-ms-thinclient-proxy-resource-type') != 'databaseaccount': + # This hook is used to verify the request routing that happens after the session retry logic + region_string = "-" + REGION_2.replace(' ', '').lower() + "." 
+ assert region_string in raw_response.http_request.url if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index c5721a0af9e8..ac0e97576ed2 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -61,8 +61,8 @@ async def setup_info(self, error=None, max_count=None, is_batch=False, exclude_c # two documents targeted to same partition, one will always fail and the other will succeed doc_fail_id = str(uuid.uuid4()) doc_success_id = str(uuid.uuid4()) - predicate = lambda r: (FaultInjectionTransportAsync.predicate_req_for_document_with_id(r, doc_fail_id) - and FaultInjectionTransportAsync.predicate_is_write_operation(r, "west")) + predicate = lambda r: (FaultInjectionTransportAsync.predicate_req_for_document_with_id(r, doc_fail_id) and + FaultInjectionTransportAsync.predicate_is_write_operation(r, "com")) # The MockRequest only gets used to create the MockHttpResponse mock_request = FaultInjectionTransport.MockHttpRequest(url=self.host) if is_batch: @@ -171,8 +171,19 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation assert len(pk_range_wrappers) == 1 failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] assert failure_count == consecutive_failures - # Run some more requests to the same partition to trigger the failover logic - for i in range(consecutive_failures): + + # Verify that a single success to the same partition resets the consecutive failures count + await perform_write_operation(write_operation, + container, + fault_injection_container, + str(uuid.uuid4()), + PK_VALUE) + + failure_count = 
global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count.get(pk_range_wrappers[0], 0) + assert failure_count == 0 + + # Run enough failed requests to the partition to trigger the failover logic + for i in range(12): with pytest.raises((CosmosHttpResponseError, ServiceResponseError)) as exc_info: await perform_write_operation(write_operation, container, @@ -187,10 +198,9 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation assert initial_region in partition_info.unavailable_regional_endpoints assert initial_region != partition_info.current_region # west us 3 != west us - # Since we are failing every request, even though we retried to the next region, that retry should have failed as well - # This means we should have one extra failure - verify that the value makes sense + # 12 failures - 10 to trigger failover, 2 more to start counting again failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] - assert failure_count == 3 + assert failure_count == 2 @pytest.mark.parametrize("write_operation, exclude_client_regions", write_operations_and_boolean()) async def test_ppaf_exclude_regions_async(self, write_operation, exclude_client_regions): From f982d218c49692c54bde866abe3b5f9fd0a5c7bc Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:26:38 -0500 Subject: [PATCH 39/68] Update test_per_partition_circuit_breaker_sm_mrr.py --- .../tests/test_per_partition_circuit_breaker_sm_mrr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py index 7c63e8e82897..d2f93cf019c0 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py +++ 
b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py @@ -71,10 +71,11 @@ def setup_info(self, error, **kwargs): return setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate def test_stat_reset(self): + status_code = 500 error_lambda = lambda r: FaultInjectionTransport.error_after_delay( 0, CosmosHttpResponseError( - status_code=503, + status_code=status_code, message="Some injected error.") ) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = \ @@ -103,7 +104,7 @@ def test_stat_reset(self): PK_VALUE, expected_uri) except CosmosHttpResponseError as e: - assert e.status_code == 503 + assert e.status_code == status_code validate_unhealthy_partitions(global_endpoint_manager, 0) validate_stats(global_endpoint_manager, 0, 2, 2, 0, 0, 0) sleep(25) From d9ca7a43181b44177354402a26d637345c713804 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 27 Aug 2025 15:30:32 -0500 Subject: [PATCH 40/68] test fixes --- .../tests/test_per_partition_circuit_breaker_mm.py | 5 +++-- .../tests/test_per_partition_circuit_breaker_mm_async.py | 5 +++-- .../tests/test_per_partition_circuit_breaker_sm_mrr_async.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py index 8fe56c06b11b..8ab0f5f2e9b2 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py @@ -405,10 +405,11 @@ def setup_info(self, error, **kwargs): return container, doc, expected_uri, uri_down, fault_injection_container, custom_transport, predicate def test_stat_reset(self): + status_code = 500 error_lambda = lambda r: FaultInjectionTransport.error_after_delay( 0, CosmosHttpResponseError( - status_code=503, + status_code=status_code, 
message="Some injected error.") ) container, doc, expected_uri, uri_down, fault_injection_container, custom_transport, predicate = \ @@ -435,7 +436,7 @@ def test_stat_reset(self): PK_VALUE, expected_uri) except CosmosHttpResponseError as e: - assert e.status_code == 503 + assert e.status_code == status_code validate_unhealthy_partitions(global_endpoint_manager, 0) validate_stats(global_endpoint_manager, 2, 2, 2, 2, 0, 0) sleep(25) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py index 40147314e4ff..095d3c6ff849 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py @@ -339,10 +339,11 @@ async def test_read_failure_rate_threshold_async(self, read_operation, error): await cleanup_method([custom_setup, setup]) async def test_stat_reset_async(self): + status_code = 500 error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay( 0, CosmosHttpResponseError( - status_code=503, + status_code=status_code, message="Some injected error.") )) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = \ @@ -371,7 +372,7 @@ async def test_stat_reset_async(self): PK_VALUE, expected_uri) except CosmosHttpResponseError as e: - assert e.status_code == 503 + assert e.status_code == status_code validate_unhealthy_partitions(global_endpoint_manager, 0) validate_stats(global_endpoint_manager, 2, 2, 2, 2, 0, 0) await asyncio.sleep(25) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py index 9779b9c68362..6a81aea15b88 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py +++ 
b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py @@ -135,10 +135,11 @@ async def test_write_failure_rate_threshold_async(self, write_operation, error): await cleanup_method([custom_setup, setup]) async def test_stat_reset_async(self): + status_code = 500 error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay( 0, CosmosHttpResponseError( - status_code=503, + status_code=status_code, message="Some injected error.") )) setup, doc, expected_uri, uri_down, custom_setup, custom_transport, predicate = \ @@ -167,7 +168,7 @@ async def test_stat_reset_async(self): PK_VALUE, expected_uri) except CosmosHttpResponseError as e: - assert e.status_code == 503 + assert e.status_code == status_code validate_unhealthy_partitions(global_endpoint_manager, 0) validate_stats(global_endpoint_manager, 0, 2, 2, 0, 0, 0) await asyncio.sleep(25) From f1dce5dc37d83dcae6654b92e74e2fd095171d9c Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 27 Aug 2025 18:08:31 -0500 Subject: [PATCH 41/68] Update test_excluded_locations.py --- sdk/cosmos/azure-cosmos/tests/test_excluded_locations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/cosmos/azure-cosmos/tests/test_excluded_locations.py b/sdk/cosmos/azure-cosmos/tests/test_excluded_locations.py index 7252593e41bf..e262cc8e8c4b 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_excluded_locations.py +++ b/sdk/cosmos/azure-cosmos/tests/test_excluded_locations.py @@ -438,6 +438,7 @@ def test_delete_item(self, test_data): MOCK_HANDLER.reset() # API call: delete_item + container.upsert_item(body) if request_excluded_locations is None: container.delete_item(item_id, PARTITION_KEY_VALUES) else: From 1582cf3c3ba15aebc8ca5bf7233cb5a45b4350b7 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:09:23 -0500 Subject: [PATCH 42/68] small improvement to region-finding --- 
...anager_per_partition_automatic_failover.py | 36 ++++++++++++------- ..._per_partition_automatic_failover_async.py | 36 ++++++++++++------- 2 files changed, 48 insertions(+), 24 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 6e06e3012c18..9eacc261b8d6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -34,7 +34,7 @@ class PartitionLevelFailoverInfo: Used to track the partition key range and the regions where it is available. """ def __init__(self): - self.unavailable_regional_endpoints: Dict[str, RegionalRoutingContext] = {} + self.unavailable_regional_endpoints: Dict[str, str] = {} self.current_region = None self._lock = threading.Lock() @@ -43,9 +43,16 @@ def try_move_to_next_location( available_account_regional_endpoints: Dict[str, str], endpoint_region: str, request: RequestObject) -> bool: + """ + Tries to move to the next available regional endpoint for the partition key range. + :param Dict[str, str] available_account_regional_endpoints: The available regional endpoints + :param str endpoint_region: The current regional endpoint + :param RequestObject request: The request object containing the routing context. + :return: True if the move was successful, False otherwise. 
+ :rtype: bool + """ with self._lock: if endpoint_region != self.current_region: - logger.warning("PPAF - Moving to next available regional endpoint %s", self.current_region) regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) return True @@ -58,7 +65,7 @@ def try_move_to_next_location( continue self.current_region = regional_endpoint - logger.info("PPAF - Moving to next available regional endpoint: %s", self.current_region) + logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) return True @@ -156,18 +163,23 @@ def resolve_service_endpoint_for_partition( if request.location_endpoint_to_route is not None: endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) if endpoint_region in partition_failover_info.unavailable_regional_endpoints: - # If the current region is unavailable, we try to move to the next available region - if not partition_failover_info.try_move_to_next_location( + available_account_regional_endpoints = self.compute_available_preferred_regions(request) + if endpoint_region != partition_failover_info.current_region: + # this request has not yet seen there's an available region being used for this partition + regional_endpoint = available_account_regional_endpoints[ + partition_failover_info.current_region] + request.route_to_location(regional_endpoint) + else: + # If the current region is unavailable, we try to move to the next available region + if not partition_failover_info.try_move_to_next_location( self.compute_available_preferred_regions(request), endpoint_region, request): - logger.warning("All available regions for partition are unavailable. 
Refreshing cache.") - # If no other region is available, we invalidate the cache and start once again from our - # main write region in the account configurations - self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() - request.clear_route_to_location() - return self._resolve_service_endpoint_for_partition_circuit_breaker(request, - pk_range_wrapper) + logger.warning("All available regions for partition are unavailable. Refreshing cache.") + # If no other region is available, we invalidate the cache and start once again + # from our main write region in the account configurations + self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() + request.clear_route_to_location() else: # Update the current regional endpoint to whatever the request is routing to partition_failover_info.current_region = endpoint_region diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 40d54c64a981..adb47bed6931 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -34,7 +34,7 @@ class PartitionLevelFailoverInfo: Used to track the partition key range and the regions where it is available. 
""" def __init__(self): - self.unavailable_regional_endpoints: Dict[str, RegionalRoutingContext] = {} + self.unavailable_regional_endpoints: Dict[str, str] = {} self.current_region = None self._lock = threading.Lock() @@ -43,10 +43,16 @@ def try_move_to_next_location( available_account_regional_endpoints: Dict[str, str], endpoint_region: str, request: RequestObject) -> bool: + """ + Tries to move to the next available regional endpoint for the partition key range. + :param Dict[str, str] available_account_regional_endpoints: The available regional endpoints + :param str endpoint_region: The current regional endpoint + :param RequestObject request: The request object containing the routing context. + :return: True if the move was successful, False otherwise. + :rtype: bool + """ with self._lock: if endpoint_region != self.current_region: - logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) - # make the actual endpoint since the current_region is just West US regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) return True @@ -76,6 +82,7 @@ def __init__(self, client: "CosmosClientConnection"): super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, self).__init__(client) self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() + self._lock = threading.Lock() def is_per_partition_automatic_failover_enabled(self) -> bool: if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: @@ -157,18 +164,23 @@ def resolve_service_endpoint_for_partition( if request.location_endpoint_to_route is not None: endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) if endpoint_region in partition_failover_info.unavailable_regional_endpoints: - # If the 
current region is unavailable, we try to move to the next available region - if not partition_failover_info.try_move_to_next_location( + available_account_regional_endpoints = self.compute_available_preferred_regions(request) + if endpoint_region != partition_failover_info.current_region: + # this request has not yet seen there's an available region being used for this partition + regional_endpoint = available_account_regional_endpoints[ + partition_failover_info.current_region] + request.route_to_location(regional_endpoint) + else: + # If the current region is unavailable, we try to move to the next available region + if not partition_failover_info.try_move_to_next_location( self.compute_available_preferred_regions(request), endpoint_region, request): - logger.warning("All available regions for partition are unavailable. Refreshing cache.") - # If no other region is available, we invalidate the cache and start once again from our - # main write region in the account configurations - self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() - request.clear_route_to_location() - return self._resolve_service_endpoint_for_partition_circuit_breaker(request, - pk_range_wrapper) + logger.warning("All available regions for partition are unavailable. 
Refreshing cache.") + # If no other region is available, we invalidate the cache and start once again + # from our main write region in the account configurations + self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() + request.clear_route_to_location() else: # Update the current regional endpoint to whatever the request is routing to endpoint_region = self.location_cache.get_location_from_endpoint( From 8f7ec0ce42166c64de9220503ad27e4b60daa3b7 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 29 Aug 2025 08:48:10 -0500 Subject: [PATCH 43/68] pylint --- ...tition_endpoint_manager_per_partition_automatic_failover.py | 3 +-- ..._endpoint_manager_per_partition_automatic_failover_async.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 9eacc261b8d6..84035dd3316f 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -16,7 +16,6 @@ _GlobalPartitionEndpointManagerForCircuitBreaker from azure.cosmos._partition_health_tracker import _PPAFPartitionThresholdsTracker from azure.cosmos.documents import _OperationType -from azure.cosmos._location_cache import RegionalRoutingContext from azure.cosmos._request_object import RequestObject from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper @@ -25,7 +24,7 @@ logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") -# pylint: disable=name-too-long, protected-access +# pylint: disable=name-too-long, protected-access, too-many-nested-blocks #cspell:ignore PPAF, 
ppaf, ppcb class PartitionLevelFailoverInfo: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index adb47bed6931..8d0680632d0a 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -15,7 +15,6 @@ from azure.cosmos.aio._global_partition_endpoint_manager_circuit_breaker_async import \ _GlobalPartitionEndpointManagerForCircuitBreakerAsync from azure.cosmos.documents import _OperationType -from azure.cosmos._location_cache import RegionalRoutingContext from azure.cosmos._partition_health_tracker import _PPAFPartitionThresholdsTracker from azure.cosmos._request_object import RequestObject from azure.cosmos._routing.routing_range import PartitionKeyRangeWrapper @@ -25,7 +24,7 @@ logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") -# pylint: disable=name-too-long, protected-access +# pylint: disable=name-too-long, protected-access, too-many-nested-blocks #cspell:ignore PPAF, ppaf, ppcb class PartitionLevelFailoverInfo: From effb6d1f99b3d7ee81336e273c732fec8287599d Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:33:14 -0500 Subject: [PATCH 44/68] Update _global_partition_endpoint_manager_per_partition_automatic_failover.py --- ...artition_endpoint_manager_per_partition_automatic_failover.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py 
index 84035dd3316f..8fab11b3a1e3 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -137,7 +137,6 @@ def try_ppaf_failover_threshold( regional_context = (self.location_cache. account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) partition_level_info.unavailable_regional_endpoints[location] = regional_context - print(3) def resolve_service_endpoint_for_partition( self, From 1e773f5abb0c4b996eaca45e17b4cbc21aee0933 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 29 Aug 2025 16:04:27 -0500 Subject: [PATCH 45/68] address comments, add threshold lock --- ...anager_per_partition_automatic_failover.py | 27 ++++++++++++------ ..._per_partition_automatic_failover_async.py | 28 ++++++++++++------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 8fab11b3a1e3..2a48ca9eb3b9 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -80,6 +80,7 @@ def __init__(self, client: "CosmosClientConnection"): super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover, self).__init__(client) self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() + self._threshold_lock = threading.Lock() def is_per_partition_automatic_failover_enabled(self) -> bool: if not self._database_account_cache or not 
self._database_account_cache._EnablePerPartitionFailoverBehavior: @@ -129,14 +130,21 @@ def try_ppaf_failover_threshold( if (self.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): - # If the PPAF threshold is reached, we reset the count and retry to the next region - self.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) - partition_level_info = self.partition_range_to_failover_info[pk_range_wrapper] - location = self.location_cache.get_location_from_endpoint( - str(request.location_endpoint_to_route)) - regional_context = (self.location_cache. - account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) - partition_level_info.unavailable_regional_endpoints[location] = regional_context + # If the PPAF threshold is reached, we reset the count and mark the endpoint unavailable + with self._threshold_lock: + logger.warning("PPAF - Failover threshold reached for partition key range: %s", pk_range_wrapper) + # Check for count again, since a previous request may have now reset the count + if (self.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) + >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, + Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): + self.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) + partition_level_info = self.partition_range_to_failover_info[pk_range_wrapper] + location = self.location_cache.get_location_from_endpoint( + str(request.location_endpoint_to_route)) + regional_context = (self.location_cache. + account_read_regional_routing_contexts_by_location. 
+ get(location).primary_endpoint) + partition_level_info.unavailable_regional_endpoints[location] = regional_context def resolve_service_endpoint_for_partition( self, @@ -173,7 +181,8 @@ def resolve_service_endpoint_for_partition( self.compute_available_preferred_regions(request), endpoint_region, request): - logger.warning("All available regions for partition are unavailable. Refreshing cache.") + logger.warning("All available regions for partition %s are unavailable." + " Refreshing cache.", pk_range_wrapper) # If no other region is available, we invalidate the cache and start once again # from our main write region in the account configurations self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 8d0680632d0a..f110146f21a2 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -81,7 +81,7 @@ def __init__(self, client: "CosmosClientConnection"): super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, self).__init__(client) self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() - self._lock = threading.Lock() + self._threshold_lock = threading.Lock() def is_per_partition_automatic_failover_enabled(self) -> bool: if not self._database_account_cache or not self._database_account_cache._EnablePerPartitionFailoverBehavior: @@ -131,14 +131,21 @@ def try_ppaf_failover_threshold( if (self.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) >= 
int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): - # If the PPAF threshold is reached, we reset the count and retry to the next region - self.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) - partition_level_info = self.partition_range_to_failover_info[pk_range_wrapper] - location = self.location_cache.get_location_from_endpoint( - str(request.location_endpoint_to_route)) - regional_context = (self.location_cache. - account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) - partition_level_info.unavailable_regional_endpoints[location] = regional_context + # If the PPAF threshold is reached, we reset the count and mark the endpoint unavailable + with self._threshold_lock: + logger.warning("PPAF - Failover threshold reached for partition key range: %s", pk_range_wrapper) + # Check for count again, since a previous request may have now reset the count + if (self.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) + >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, + Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): + self.ppaf_thresholds_tracker.clear_pk_failures(pk_range_wrapper) + partition_level_info = self.partition_range_to_failover_info[pk_range_wrapper] + location = self.location_cache.get_location_from_endpoint( + str(request.location_endpoint_to_route)) + regional_context = (self.location_cache. + account_read_regional_routing_contexts_by_location. + get(location).primary_endpoint) + partition_level_info.unavailable_regional_endpoints[location] = regional_context def resolve_service_endpoint_for_partition( self, @@ -175,7 +182,8 @@ def resolve_service_endpoint_for_partition( self.compute_available_preferred_regions(request), endpoint_region, request): - logger.warning("All available regions for partition are unavailable. Refreshing cache.") + logger.warning("All available regions for partition %s are unavailable." 
+ " Refreshing cache.", pk_range_wrapper) # If no other region is available, we invalidate the cache and start once again # from our main write region in the account configurations self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() From 24a44d9bff0ab5a2f2344c1489eb8ed84a1e37c7 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 29 Aug 2025 16:06:11 -0500 Subject: [PATCH 46/68] add more comments --- ...rtition_endpoint_manager_per_partition_automatic_failover.py | 2 ++ ...n_endpoint_manager_per_partition_automatic_failover_async.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 2a48ca9eb3b9..9441068e6f57 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -131,6 +131,8 @@ def try_ppaf_failover_threshold( >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): # If the PPAF threshold is reached, we reset the count and mark the endpoint unavailable + # Once we mark the endpoint unavailable, the PPAF endpoint manager will try to move to the next + # available region for the partition key range with self._threshold_lock: logger.warning("PPAF - Failover threshold reached for partition key range: %s", pk_range_wrapper) # Check for count again, since a previous request may have now reset the count diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py 
b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index f110146f21a2..cdbc2153bc82 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -132,6 +132,8 @@ def try_ppaf_failover_threshold( >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, Constants.TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT))): # If the PPAF threshold is reached, we reset the count and mark the endpoint unavailable + # Once we mark the endpoint unavailable, the PPAF endpoint manager will try to move to the next + # available region for the partition key range with self._threshold_lock: logger.warning("PPAF - Failover threshold reached for partition key range: %s", pk_range_wrapper) # Check for count again, since a previous request may have now reset the count From c77209247611776c3f3c8b43ac12b49cd290c25f Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 19 Sep 2025 16:25:58 -0400 Subject: [PATCH 47/68] edge cases --- .../azure/cosmos/_cosmos_http_logging_policy.py | 2 +- ...on_endpoint_manager_per_partition_automatic_failover.py | 7 ++++--- ...point_manager_per_partition_automatic_failover_async.py | 7 ++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_http_logging_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_http_logging_policy.py index d159128c9b13..6b56dfb73fc6 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_http_logging_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_http_logging_policy.py @@ -180,7 +180,7 @@ def _get_client_settings(global_endpoint_manager: Optional[_GlobalEndpointManage gem_client = global_endpoint_manager.client if gem_client and 
gem_client.connection_policy: connection_policy: ConnectionPolicy = gem_client.connection_policy - client_preferred_regions = connection_policy.PreferredLocations + client_preferred_regions = global_endpoint_manager.location_cache.effective_preferred_locations client_excluded_regions = connection_policy.ExcludedLocations if global_endpoint_manager.location_cache: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 9441068e6f57..af3315d90d5f 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -51,7 +51,7 @@ def try_move_to_next_location( :rtype: bool """ with self._lock: - if endpoint_region != self.current_region: + if endpoint_region != self.current_region and self.current_region is not None: regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) return True @@ -172,7 +172,8 @@ def resolve_service_endpoint_for_partition( endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) if endpoint_region in partition_failover_info.unavailable_regional_endpoints: available_account_regional_endpoints = self.compute_available_preferred_regions(request) - if endpoint_region != partition_failover_info.current_region: + if (partition_failover_info.current_region is not None and + endpoint_region != partition_failover_info.current_region): # this request has not yet seen there's an available region being used for this partition regional_endpoint = available_account_regional_endpoints[ partition_failover_info.current_region] @@ -214,7 +215,7 @@ def compute_available_preferred_regions( excluded_locations = 
request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations else: excluded_locations = self.location_cache.connection_policy.ExcludedLocations - preferred_locations = self.PreferredLocations + preferred_locations = self.location_cache.effective_preferred_locations available_regions = [item for item in preferred_locations if item not in excluded_locations] available_regional_endpoints = {} for region, context in self.location_cache.account_read_regional_routing_contexts_by_location.items(): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index cdbc2153bc82..d3b0edbd467f 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -51,7 +51,7 @@ def try_move_to_next_location( :rtype: bool """ with self._lock: - if endpoint_region != self.current_region: + if endpoint_region != self.current_region and self.current_region is not None: regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) return True @@ -173,7 +173,8 @@ def resolve_service_endpoint_for_partition( endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) if endpoint_region in partition_failover_info.unavailable_regional_endpoints: available_account_regional_endpoints = self.compute_available_preferred_regions(request) - if endpoint_region != partition_failover_info.current_region: + if (partition_failover_info.current_region is not None and + endpoint_region != partition_failover_info.current_region): # this request has not yet seen there's an available region being used for 
this partition regional_endpoint = available_account_regional_endpoints[ partition_failover_info.current_region] @@ -217,7 +218,7 @@ def compute_available_preferred_regions( excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations else: excluded_locations = self.location_cache.connection_policy.ExcludedLocations - preferred_locations = self.PreferredLocations + preferred_locations = self.location_cache.effective_preferred_locations available_regions = [item for item in preferred_locations if item not in excluded_locations] available_regional_endpoints = {} for region, context in self.location_cache.account_read_regional_routing_contexts_by_location.items(): From 3acda2406aeeb9643c08479e3b3c9c7359e77b9c Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 7 Oct 2025 14:40:39 -0400 Subject: [PATCH 48/68] changes from testing --- .../_endpoint_discovery_retry_policy.py | 12 +++++++---- ...anager_per_partition_automatic_failover.py | 16 ++++++++------- .../azure/cosmos/_retry_utility.py | 3 +++ .../azure/cosmos/_session_retry_policy.py | 11 +++++----- ..._per_partition_automatic_failover_async.py | 20 ++++++++++++------- .../azure/cosmos/aio/_retry_utility_async.py | 3 +++ 6 files changed, 42 insertions(+), 23 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index 14eed13d8dbc..cf589d6fa370 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -58,17 +58,17 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument return False if self.failover_retry_count >= self.Max_retry_attempt_count: + if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): + # only refresh the cache if PPAF is enabled 
once we're out of retries + self.global_endpoint_manager.refresh_needed = True return False self.failover_retry_count += 1 - # set the refresh_needed flag to ensure that endpoint list is - # refreshed with new writable and readable locations - self.global_endpoint_manager.refresh_needed = True - # If per partition automatic failover is applicable, we mark the current endpoint as unavailable # and resolve the service endpoint for the partition range - otherwise, continue the default retry logic if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): + #add log partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( str(self.request.location_endpoint_to_route)) @@ -78,6 +78,10 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) return True + # set the refresh_needed flag to ensure that endpoint list is + # refreshed with new writable and readable locations + self.global_endpoint_manager.refresh_needed = True + if self.request.location_endpoint_to_route: if _OperationType.IsReadOnlyOperation(self.request.operation_type): # Mark current read endpoint as unavailable diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index af3315d90d5f..b98ac44219d1 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -179,17 +179,19 @@ def resolve_service_endpoint_for_partition( partition_failover_info.current_region] 
request.route_to_location(regional_endpoint) else: - # If the current region is unavailable, we try to move to the next available region - if not partition_failover_info.try_move_to_next_location( - self.compute_available_preferred_regions(request), - endpoint_region, - request): - logger.warning("All available regions for partition %s are unavailable." - " Refreshing cache.", pk_range_wrapper) + if len(self.compute_available_preferred_regions(request)) == len(partition_failover_info.unavailable_regional_endpoints): # If no other region is available, we invalidate the cache and start once again # from our main write region in the account configurations + logger.warning("PPAF - All available regions for partition %s are unavailable." + " Refreshing cache.", pk_range_wrapper) self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() request.clear_route_to_location() + else: + # If the current region is unavailable, we try to move to the next available region + partition_failover_info.try_move_to_next_location( + self.compute_available_preferred_regions(request), + endpoint_region, + request) else: # Update the current regional endpoint to whatever the request is routing to partition_failover_info.current_region = endpoint_region diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index df8b8bb04bb0..8ecb0bd2bec7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -208,6 +208,9 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin retry_policy.container_rid = cached_container["_rid"] request.headers[retry_policy._intended_headers] = retry_policy.container_rid elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: + if args: + # record the failure for ppaf/circuit breaker tracking + global_endpoint_manager.record_failure(args[0], pk_range_wrapper) retry_policy 
= service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py index 1e530ad9e37e..e11cb4838047 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_session_retry_policy.py @@ -110,11 +110,12 @@ def ShouldRetry(self, _exception): if location in pk_failover_info.unavailable_regional_endpoints: # If the request endpoint is unavailable, we need to resolve the endpoint for the request using the # partition-level failover info - location_endpoint = (self.global_endpoint_manager.location_cache. - account_read_regional_routing_contexts_by_location. - get(pk_failover_info.current_region).primary_endpoint) - self.request.route_to_location(location_endpoint) - return True + if pk_failover_info.current_region is not None: + location_endpoint = (self.global_endpoint_manager.location_cache. + account_read_regional_routing_contexts_by_location. 
+ get(pk_failover_info.current_region).primary_endpoint) + self.request.route_to_location(location_endpoint) + return True # Resolve the endpoint for the request and pin the resolution to the resolved endpoint # This enables marking the endpoint unavailability on endpoint failover/unreachability diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index d3b0edbd467f..821f55d96d06 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -51,6 +51,7 @@ def try_move_to_next_location( :rtype: bool """ with self._lock: + print("got lock to move to next location") if endpoint_region != self.current_region and self.current_region is not None: regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) @@ -67,6 +68,8 @@ def try_move_to_next_location( logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) + print(f"routing to {regional_endpoint} from {endpoint_region}") + print(f"current unavailable: {str(self.unavailable_regional_endpoints)}") return True return False @@ -175,22 +178,25 @@ def resolve_service_endpoint_for_partition( available_account_regional_endpoints = self.compute_available_preferred_regions(request) if (partition_failover_info.current_region is not None and endpoint_region != partition_failover_info.current_region): + print("changed {} region to {} region (current)".format(endpoint_region, partition_failover_info.current_region)) # this 
request has not yet seen there's an available region being used for this partition regional_endpoint = available_account_regional_endpoints[ partition_failover_info.current_region] request.route_to_location(regional_endpoint) else: - # If the current region is unavailable, we try to move to the next available region - if not partition_failover_info.try_move_to_next_location( - self.compute_available_preferred_regions(request), - endpoint_region, - request): - logger.warning("All available regions for partition %s are unavailable." - " Refreshing cache.", pk_range_wrapper) + if len(self.compute_available_preferred_regions(request)) == len(partition_failover_info.unavailable_regional_endpoints): # If no other region is available, we invalidate the cache and start once again # from our main write region in the account configurations + logger.warning("All available regions for partition %s are unavailable." + " Refreshing cache.", pk_range_wrapper) self.partition_range_to_failover_info[pk_range_wrapper] = PartitionLevelFailoverInfo() request.clear_route_to_location() + else: + # If the current region is unavailable, we try to move to the next available region + partition_failover_info.try_move_to_next_location( + self.compute_available_preferred_regions(request), + endpoint_region, + request) else: # Update the current regional endpoint to whatever the request is routing to endpoint_region = self.location_cache.get_location_from_endpoint( diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 88caac506cb1..64d58e2bbb2a 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -205,6 +205,9 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg retry_policy.container_rid = cached_container["_rid"] request.headers[retry_policy._intended_headers] = 
retry_policy.container_rid elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: + if args: + # record the failure for circuit breaker tracking + await global_endpoint_manager.record_ppcb_failure(args[0], pk_range_wrapper) retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: From 9a6b17b5e5367683078f0c73d0998965c3c951fa Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 7 Oct 2025 16:36:14 -0400 Subject: [PATCH 49/68] pylint --- .../azure/cosmos/_endpoint_discovery_retry_policy.py | 2 ++ ...ition_endpoint_manager_per_partition_automatic_failover.py | 3 ++- ...endpoint_manager_per_partition_automatic_failover_async.py | 4 ++-- .../azure-cosmos/azure/cosmos/aio/_retry_utility_async.py | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index cf589d6fa370..c87373eb9560 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -23,6 +23,8 @@ Azure Cosmos database service. 
""" +# cspell:ignore PPAF + from azure.cosmos.documents import _OperationType class EndpointDiscoveryRetryPolicy(object): diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index b98ac44219d1..8502c9417fab 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -179,7 +179,8 @@ def resolve_service_endpoint_for_partition( partition_failover_info.current_region] request.route_to_location(regional_endpoint) else: - if len(self.compute_available_preferred_regions(request)) == len(partition_failover_info.unavailable_regional_endpoints): + if (len(self.compute_available_preferred_regions(request)) + == len(partition_failover_info.unavailable_regional_endpoints)): # If no other region is available, we invalidate the cache and start once again # from our main write region in the account configurations logger.warning("PPAF - All available regions for partition %s are unavailable." 
diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 821f55d96d06..fe0fcf2ad7b2 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -178,13 +178,13 @@ def resolve_service_endpoint_for_partition( available_account_regional_endpoints = self.compute_available_preferred_regions(request) if (partition_failover_info.current_region is not None and endpoint_region != partition_failover_info.current_region): - print("changed {} region to {} region (current)".format(endpoint_region, partition_failover_info.current_region)) # this request has not yet seen there's an available region being used for this partition regional_endpoint = available_account_regional_endpoints[ partition_failover_info.current_region] request.route_to_location(regional_endpoint) else: - if len(self.compute_available_preferred_regions(request)) == len(partition_failover_info.unavailable_regional_endpoints): + if (len(self.compute_available_preferred_regions(request)) == + len(partition_failover_info.unavailable_regional_endpoints)): # If no other region is available, we invalidate the cache and start once again # from our main write region in the account configurations logger.warning("All available regions for partition %s are unavailable." 
diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 64d58e2bbb2a..bdc971d6bd61 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -47,7 +47,7 @@ # pylint: disable=protected-access, disable=too-many-lines, disable=too-many-statements, disable=too-many-branches -# cspell:ignore ppaf +# cspell:ignore ppaf, ppcb # args [0] is the request object # args [1] is the connection policy From 8f75444a7f9b18841f1a89b5eb162f174cb86e91 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Tue, 7 Oct 2025 21:09:10 -0400 Subject: [PATCH 50/68] fixes pylint/mypy --- ...int_manager_per_partition_automatic_failover.py | 14 +++++++------- .../azure/cosmos/_partition_health_tracker.py | 2 +- ...nager_per_partition_automatic_failover_async.py | 14 +++++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 8502c9417fab..328f05bcd23e 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -8,7 +8,7 @@ import threading import os -from typing import Dict, TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional from azure.cosmos.http_constants import ResourceType from azure.cosmos._constants import _Constants as Constants @@ -32,14 +32,14 @@ class PartitionLevelFailoverInfo: Holds information about the partition level regional failover. Used to track the partition key range and the regions where it is available. 
""" - def __init__(self): - self.unavailable_regional_endpoints: Dict[str, str] = {} + def __init__(self) -> None: + self.unavailable_regional_endpoints: dict[str, str] = {} self.current_region = None self._lock = threading.Lock() def try_move_to_next_location( self, - available_account_regional_endpoints: Dict[str, str], + available_account_regional_endpoints: dict[str, str], endpoint_region: str, request: RequestObject) -> bool: """ @@ -76,9 +76,9 @@ class _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover(_GlobalPar This internal class implements the logic for partition endpoint management for geo-replicated database accounts. """ - def __init__(self, client: "CosmosClientConnection"): + def __init__(self, client: "CosmosClientConnection") -> None: super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover, self).__init__(client) - self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} + self.partition_range_to_failover_info: dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() self._threshold_lock = threading.Lock() @@ -207,7 +207,7 @@ def resolve_service_endpoint_for_partition( def compute_available_preferred_regions( self, request: RequestObject - ) -> Dict[str, str]: + ) -> dict[str, str]: """ Computes the available regional endpoints for the request based on customer-set preferred and excluded regions. :param RequestObject request: The request object containing the routing context. 
diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py index 0759aa79ca48..8218950a8dff 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py @@ -311,7 +311,7 @@ class _PPAFPartitionThresholdsTracker(object): """ def __init__(self) -> None: - self.pk_range_wrapper_to_failure_count: Dict[PartitionKeyRangeWrapper, int] = {} + self.pk_range_wrapper_to_failure_count: dict[PartitionKeyRangeWrapper, int] = {} def add_failure(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> None: if pk_range_wrapper not in self.pk_range_wrapper_to_failure_count: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index fe0fcf2ad7b2..4c76dda04748 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -8,7 +8,7 @@ import threading import os -from typing import Dict, TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional from azure.cosmos.http_constants import ResourceType from azure.cosmos._constants import _Constants as Constants @@ -32,14 +32,14 @@ class PartitionLevelFailoverInfo: Holds information about the partition level regional failover. Used to track the partition key range and the regions where it is available. 
""" - def __init__(self): - self.unavailable_regional_endpoints: Dict[str, str] = {} + def __init__(self) -> None: + self.unavailable_regional_endpoints: dict[str, str] = {} self.current_region = None self._lock = threading.Lock() def try_move_to_next_location( self, - available_account_regional_endpoints: Dict[str, str], + available_account_regional_endpoints: dict[str, str], endpoint_region: str, request: RequestObject) -> bool: """ @@ -80,9 +80,9 @@ class _GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync( This internal class implements the logic for partition endpoint management for geo-replicated database accounts. """ - def __init__(self, client: "CosmosClientConnection"): + def __init__(self, client: "CosmosClientConnection") -> None: super(_GlobalPartitionEndpointManagerForPerPartitionAutomaticFailoverAsync, self).__init__(client) - self.partition_range_to_failover_info: Dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} + self.partition_range_to_failover_info: dict[PartitionKeyRangeWrapper, PartitionLevelFailoverInfo] = {} self.ppaf_thresholds_tracker = _PPAFPartitionThresholdsTracker() self._threshold_lock = threading.Lock() @@ -213,7 +213,7 @@ def resolve_service_endpoint_for_partition( def compute_available_preferred_regions( self, request: RequestObject - ) -> Dict[str, str]: + ) -> dict[str, str]: """ Computes the available regional endpoints for the request based on customer-set preferred and excluded regions. :param RequestObject request: The request object containing the routing context. 
From 0ccd9bfa703654f54c4ba24a8e05f01653777047 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 8 Oct 2025 09:12:02 -0400 Subject: [PATCH 51/68] mypy complaining about assigning str to none --- ...rtition_endpoint_manager_per_partition_automatic_failover.py | 2 +- ...n_endpoint_manager_per_partition_automatic_failover_async.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 328f05bcd23e..991e5709eda7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -34,8 +34,8 @@ class PartitionLevelFailoverInfo: """ def __init__(self) -> None: self.unavailable_regional_endpoints: dict[str, str] = {} - self.current_region = None self._lock = threading.Lock() + self.current_region: Optional[str] = None def try_move_to_next_location( self, diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 4c76dda04748..035cb5193548 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -34,8 +34,8 @@ class PartitionLevelFailoverInfo: """ def __init__(self) -> None: self.unavailable_regional_endpoints: dict[str, str] = {} - self.current_region = None self._lock = threading.Lock() + self.current_region: Optional[str] = None 
def try_move_to_next_location( self, From f4e4d655b3acdf9872f36d50c933d93bdeb33242 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 8 Oct 2025 10:03:33 -0400 Subject: [PATCH 52/68] testing changes - will roll back later --- .../azure/cosmos/aio/_asynchronous_request.py | 28 +++++++++++ .../tests/workloads/r_w_q_workload.py | 48 +++++++++++++++++-- .../tests/workloads/workload_utils.py | 17 ++++++- 3 files changed, 87 insertions(+), 6 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 1cd2a22039b4..8981104688b1 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -24,6 +24,8 @@ import copy import json import time +from datetime import datetime, timezone +import logging from urllib.parse import urlparse from azure.core.exceptions import DecodeError # type: ignore @@ -72,6 +74,8 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p if kwargs['timeout'] <= 0: raise exceptions.CosmosClientTimeoutError() + route_start = time.perf_counter() + if request_params.endpoint_override: base_url = request_params.endpoint_override else: @@ -98,6 +102,12 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p and not connection_policy.DisableSSLVerification ) + route_end = time.perf_counter() + + route_duration = (route_end - route_start) * 1000 + + start = time.perf_counter() + if connection_policy.SSLConfiguration or "connection_cert" in kwargs: ca_certs = connection_policy.SSLConfiguration.SSLCaCerts cert_files = (connection_policy.SSLConfiguration.SSLCertFile, connection_policy.SSLConfiguration.SSLKeyFile) @@ -125,6 +135,24 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p **kwargs ) + end = time.perf_counter() + duration = 
(end - start) * 1000 + + logger = logging.getLogger("internal_requests") + response_time = datetime.now(timezone.utc) + print_string = f"Response time: {response_time.isoformat()} | " + print_string += f"Request URL: {request.url} | " + print_string += f"Resource type: {request.headers['x-ms-thinclient-proxy-resource-type']} | " + print_string += f"Operation type: {request.headers['x-ms-thinclient-proxy-operation-type']} | " + print_string += f"Status code: {response.http_response.status_code} | " + print_string += f"Sub-status code: {response.http_response.headers.get('x-ms-substatus', 'N/A')} | " + print_string += f"Routing duration: {route_duration} ms | " + print_string += f"Request/response duration: {duration} ms | " + print_string += f"Activity Id: {request.headers.get('x-ms-activity-id', 'N/A')} |" + print_string += f"Partition Id: {request.headers.get('x-ms-cosmos-internal-partition-id', 'N/A')} |" + print_string += f"Physical Id: {request.headers.get('x-ms-cosmos-physical-partition-id', 'N/A')} |" + logger.info(print_string) + response = response.http_response headers = copy.copy(response.headers) diff --git a/sdk/cosmos/azure-cosmos/tests/workloads/r_w_q_workload.py b/sdk/cosmos/azure-cosmos/tests/workloads/r_w_q_workload.py index 5e1db6425142..0d7730c6f86e 100644 --- a/sdk/cosmos/azure-cosmos/tests/workloads/r_w_q_workload.py +++ b/sdk/cosmos/azure-cosmos/tests/workloads/r_w_q_workload.py @@ -3,6 +3,9 @@ import sys from azure.cosmos import documents +from datetime import datetime, timezone +import time +from workload_utils import _get_upsert_item from workload_utils import * from workload_configs import * sys.path.append(r"/") @@ -10,7 +13,27 @@ from azure.cosmos.aio import CosmosClient as AsyncClient import asyncio +async def log_request_counts(counter): + while True: + await asyncio.sleep(300) # 5 minutes + count = counter["count"] + duration = counter["upsert_time"] + counter["read_time"] + print("Current UTC time:", datetime.now(timezone.utc)) + 
print(f"Executed {count} requests in the last 5 minutes") + print(f"Errors in the last 5 minutes: {counter['error_count']}") + print(f"Per-request latency: {duration / count if count > 0 else 0} ms") + print(f"Upsert latency: {counter['upsert_time'] / (count / 2) if count > 0 else 0} ms") + print(f"Read latency: {counter['read_time'] / (count / 2) if count > 0 else 0} ms") + print("-------------------------------") + counter["count"] = 0 # reset for next interval + counter["upsert_time"] = 0 + counter["read_time"] = 0 + counter["error_count"] = 0 + async def run_workload(client_id, client_logger): + counter = {"count": 0, "upsert_time": 0, "read_time": 0, "error_count": 0} + # Start background task + asyncio.create_task(log_request_counts(counter)) connectionPolicy = documents.ConnectionPolicy() connectionPolicy.UseMultipleWriteLocations = USE_MULTIPLE_WRITABLE_LOCATIONS async with AsyncClient(COSMOS_URI, COSMOS_CREDENTIAL, connection_policy=connectionPolicy, @@ -23,15 +46,32 @@ async def run_workload(client_id, client_logger): while True: try: - await upsert_item_concurrently(cont, REQUEST_EXCLUDED_LOCATIONS, CONCURRENT_REQUESTS) - await read_item_concurrently(cont, REQUEST_EXCLUDED_LOCATIONS, CONCURRENT_REQUESTS) - await query_items_concurrently(cont, REQUEST_EXCLUDED_LOCATIONS, CONCURRENT_QUERIES) + upsert_start = time.perf_counter() + up_item = _get_upsert_item() + await cont.upsert_item(up_item) + elapsed = time.perf_counter() - upsert_start + counter["count"] += 1 + counter["upsert_time"] += elapsed + + read_start = time.perf_counter() + item = get_existing_random_item() + await cont.read_item(item["id"], item[PARTITION_KEY]) + elapsed = time.perf_counter() - read_start + counter["count"] += 1 + counter["read_time"] += elapsed + + # await upsert_item_concurrently(cont, REQUEST_EXCLUDED_LOCATIONS, CONCURRENT_REQUESTS) + # await read_item_concurrently(cont, REQUEST_EXCLUDED_LOCATIONS, CONCURRENT_REQUESTS) + # await query_items_concurrently(cont, 
REQUEST_EXCLUDED_LOCATIONS, CONCURRENT_QUERIES) except Exception as e: + counter["error_count"] += 1 client_logger.info("Exception in application layer") - client_logger.error(e) if __name__ == "__main__": file_name = os.path.basename(__file__) prefix, logger = create_logger(file_name) + create_inner_logger() + utc_now = datetime.now(timezone.utc) + print("Current UTC time:", utc_now) asyncio.run(run_workload(prefix, logger)) diff --git a/sdk/cosmos/azure-cosmos/tests/workloads/workload_utils.py b/sdk/cosmos/azure-cosmos/tests/workloads/workload_utils.py index 6a0f95128e5d..fe3d69b3bfbe 100644 --- a/sdk/cosmos/azure-cosmos/tests/workloads/workload_utils.py +++ b/sdk/cosmos/azure-cosmos/tests/workloads/workload_utils.py @@ -3,6 +3,7 @@ import asyncio import os import random +import sys import uuid from datetime import datetime from logging.handlers import RotatingFileHandler @@ -160,15 +161,27 @@ def create_logger(file_name): handler = RotatingFileHandler( "log-" + get_user_agent(prefix) + '.log', maxBytes=1024 * 1024 * 10, # 10 mb - backupCount=2 + backupCount=5 ) logger.setLevel(LOG_LEVEL) # create filters for the logger handler to reduce the noise workload_logger_filter = WorkloadLoggerFilter() - handler.addFilter(workload_logger_filter) + # handler.addFilter(workload_logger_filter) logger.addHandler(handler) return prefix, logger +def create_inner_logger(file_name="internal_logger_tues"): + logger = logging.getLogger("internal_requests") + prefix = os.path.splitext(file_name)[0] + "-" + str(os.getpid()) + # Create a rotating file handler + handler = RotatingFileHandler( + "log-" + file_name + '.log', + maxBytes=1024 * 1024 * 10, # 10 mb + backupCount=5 + ) + logger.setLevel(LOG_LEVEL) + logger.addHandler(handler) + class WorkloadLoggerFilter(logging.Filter): def filter(self, record): From 8f87b13f76facb15f4d033f1f037f1ef1c2c8a04 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 9 Oct 2025 19:30:56 -0400 Subject: 
[PATCH 53/68] Update _endpoint_discovery_retry_policy.py --- .../cosmos/_endpoint_discovery_retry_policy.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index c87373eb9560..fd209603a620 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -60,17 +60,17 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument return False if self.failover_retry_count >= self.Max_retry_attempt_count: - if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): - # only refresh the cache if PPAF is enabled once we're out of retries - self.global_endpoint_manager.refresh_needed = True return False self.failover_retry_count += 1 + # set the refresh_needed flag to ensure that endpoint list is + # refreshed with new writable and readable locations + self.global_endpoint_manager.refresh_needed = True + # If per partition automatic failover is applicable, we mark the current endpoint as unavailable # and resolve the service endpoint for the partition range - otherwise, continue the default retry logic if self.global_endpoint_manager.is_per_partition_automatic_failover_applicable(self.request): - #add log partition_level_info = self.global_endpoint_manager.partition_range_to_failover_info[self.pk_range_wrapper] location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( str(self.request.location_endpoint_to_route)) @@ -80,10 +80,6 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) return True - # set the refresh_needed flag to ensure that endpoint list is - # refreshed with new writable and readable 
locations - self.global_endpoint_manager.refresh_needed = True - if self.request.location_endpoint_to_route: if _OperationType.IsReadOnlyOperation(self.request.operation_type): # Mark current read endpoint as unavailable @@ -99,8 +95,7 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument self.request.clear_route_to_location() # set location-based routing directive based on retry count - # simulating single master writes by ensuring usePreferredLocations - # is set to false + # simulating single master writes by ensuring usePreferredLocations is set to false # reasoning being that 403.3 is only expected for write region failover in single writer account # and we must rely on account locations as they are the source of truth self.request.route_to_location_with_preferred_location_flag(self.failover_retry_count, False) From 3e1f6bec8641995b094d1913ed0462b94842549a Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:53:54 -0700 Subject: [PATCH 54/68] Update _asynchronous_request.py --- .../azure-cosmos/azure/cosmos/aio/_asynchronous_request.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 8981104688b1..945c4615b49e 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -149,9 +149,10 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p print_string += f"Routing duration: {route_duration} ms | " print_string += f"Request/response duration: {duration} ms | " print_string += f"Activity Id: {request.headers.get('x-ms-activity-id', 'N/A')} |" - print_string += f"Partition Id: {request.headers.get('x-ms-cosmos-internal-partition-id', 'N/A')} |" - print_string += f"Physical Id: 
{request.headers.get('x-ms-cosmos-physical-partition-id', 'N/A')} |" + print_string += f"Partition Id: {response.http_response.headers.get('x-ms-cosmos-internal-partition-id', 'N/A')} |" + print_string += f"Physical Id: {response.http_response.headers.get('x-ms-cosmos-physical-partition-id', 'N/A')} |" logger.info(print_string) + print(print_string) response = response.http_response headers = copy.copy(response.headers) From 42817fcf4aa6c3b1631a4157397e2dd83bfb4766 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:07:07 -0700 Subject: [PATCH 55/68] add user agent feature flags --- .../azure-cosmos/azure/cosmos/_constants.py | 18 ++++++++++++++- ...anager_per_partition_automatic_failover.py | 2 ++ .../azure/cosmos/_retry_utility.py | 8 +++++++ .../azure-cosmos/azure/cosmos/_utils.py | 23 ++++++++++++++++++- ..._per_partition_automatic_failover_async.py | 2 ++ .../azure/cosmos/aio/_retry_utility_async.py | 8 +++++++ .../tests/_fault_injection_transport.py | 4 ++-- .../test_per_partition_automatic_failover.py | 13 +++++++++++ ..._per_partition_automatic_failover_async.py | 9 +++++++- .../test_per_partition_circuit_breaker_mm.py | 13 +++++++++++ ..._per_partition_circuit_breaker_mm_async.py | 14 ++++++++--- ...st_per_partition_circuit_breaker_sm_mrr.py | 10 +++++++- ..._partition_circuit_breaker_sm_mrr_async.py | 16 +++++++++---- 13 files changed, 127 insertions(+), 13 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py index 149c2d4daf99..a82994414996 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py @@ -22,7 +22,7 @@ """Class for defining internal constants in the Azure Cosmos database service. 
""" - +from enum import IntEnum from typing_extensions import Literal # cspell:ignore PPAF @@ -106,3 +106,19 @@ class Kwargs: """Whether to retry write operations if they fail. Used either at client level or request level.""" EXCLUDED_LOCATIONS: Literal["excludedLocations"] = "excludedLocations" + + class UserAgentFeatureFlags(IntEnum): + """ + User agent feature flags. + Each flag represents a bit in a number to encode what features are enabled. Therefore, the first feature flag + will be 1, the second 2, the third 4, etc. When constructing the user agent suffix, the feature flags will be + used to encode a unique number representing the features enabled. This number will be converted into a hex + string following the prefix "F" to save space in the user agent as it is limited and appended to the user agent + suffix. This number will then be used to determine what features are enabled by decoding the hex string back + to a number and checking what bits are set. + + Example: + If the user agent suffix has "F3", this means that flags 1 and 2 are enabled. 
+ """ + PER_PARTITION_AUTOMATIC_FAILOVER = 1 + PER_PARTITION_CIRCUIT_BREAKER = 2 diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 991e5709eda7..0e912fbace67 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -218,6 +218,8 @@ def compute_available_preferred_regions( excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations else: excluded_locations = self.location_cache.connection_policy.ExcludedLocations + if excluded_locations is None: + excluded_locations = [] preferred_locations = self.location_cache.effective_preferred_locations available_regions = [item for item in preferred_locations if item not in excluded_locations] available_regional_endpoints = {} diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index c61a59410671..d460fb0364d9 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -43,6 +43,7 @@ from .exceptions import CosmosHttpResponseError from .http_constants import HttpHeaders, StatusCodes, SubStatusCodes, ResourceType from ._cosmos_http_logging_policy import _log_diagnostics_error +from ._utils import get_user_agent_features # pylint: disable=protected-access, disable=too-many-lines, disable=too-many-statements, disable=too-many-branches @@ -114,6 +115,13 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin container_recreate_retry_policy = _container_recreate_retry_policy.ContainerRecreateRetryPolicy( client, client._container_properties_cache, None, 
*args) + user_agent_features = get_user_agent_features(global_endpoint_manager) + if len(user_agent_features) > 0: + user_agent = kwargs.pop("user_agent", client._user_agent) + user_agent = "{} {}".format(user_agent, user_agent_features) + kwargs.update({"user_agent": user_agent}) + kwargs.update({"user_agent_overwrite": True}) + while True: client_timeout = kwargs.get('timeout') start_time = time.time() diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py index 9144afca613d..8bb57ccd6562 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py @@ -27,10 +27,12 @@ import base64 import json import time +import os from typing import Any, Optional, Tuple - +from ._constants import _Constants from ._version import VERSION +# cspell:ignore ppcb def get_user_agent(suffix: Optional[str] = None) -> str: os_name = safe_user_agent_header(platform.platform()) @@ -146,3 +148,22 @@ def valid_key_value_exist( :rtype: bool """ return key in kwargs and kwargs[key] is not invalid_value + + +def get_user_agent_features(global_endpoint_manager: Any) -> str: + """Check the account and client configurations in order to add feature flags to the user agent. + + :param Any global_endpoint_manager: The global endpoint manager instance used to check against. + :return: The string representing the user agent features to include. 
+ :rtype: str + """ + feature_flag = 0 + if global_endpoint_manager._database_account_cache is not None: + if global_endpoint_manager._database_account_cache._EnablePerPartitionFailoverBehavior is True: + feature_flag += _Constants.UserAgentFeatureFlags.PER_PARTITION_AUTOMATIC_FAILOVER + ppcb_check = os.environ.get( + _Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, + _Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT).lower() + if ppcb_check == "true" or feature_flag > 0: + feature_flag += _Constants.UserAgentFeatureFlags.PER_PARTITION_CIRCUIT_BREAKER + return f"| F{feature_flag}" if feature_flag > 0 else "" \ No newline at end of file diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 035cb5193548..0e91a5e42069 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -224,6 +224,8 @@ def compute_available_preferred_regions( excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations else: excluded_locations = self.location_cache.connection_policy.ExcludedLocations + if excluded_locations is None: + excluded_locations = [] preferred_locations = self.location_cache.effective_preferred_locations available_regions = [item for item in preferred_locations if item not in excluded_locations] available_regional_endpoints = {} diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index 80dffa6aee72..f54cd61bfd0b 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ 
b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -44,6 +44,7 @@ from ..exceptions import CosmosHttpResponseError from ..http_constants import HttpHeaders, StatusCodes, SubStatusCodes from .._cosmos_http_logging_policy import _log_diagnostics_error +from .._utils import get_user_agent_features # pylint: disable=protected-access, disable=too-many-lines, disable=too-many-statements, disable=too-many-branches @@ -113,6 +114,13 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg container_recreate_retry_policy = ContainerRecreateRetryPolicy( client, client._container_properties_cache, None, *args) + user_agent_features = get_user_agent_features(global_endpoint_manager) + if len(user_agent_features) > 0: + user_agent = kwargs.pop("user_agent", client._user_agent) + user_agent = "{} {}".format(user_agent, user_agent_features) + kwargs.update({"user_agent": user_agent}) + kwargs.update({"user_agent_overwrite": True}) + while True: client_timeout = kwargs.get('timeout') start_time = time.time() diff --git a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py index e17cba91ee0a..f71a21003c98 100644 --- a/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py +++ b/sdk/cosmos/azure-cosmos/tests/_fault_injection_transport.py @@ -26,7 +26,7 @@ import logging import sys from time import sleep -from typing import Callable, Optional, Any, MutableMapping +from typing import Callable, Optional, Any, MutableMapping, Mapping, Tuple, Sequence from azure.core.pipeline.transport import HttpRequest, HttpResponse from azure.core.pipeline.transport._requests_basic import RequestsTransport, RequestsTransportResponse @@ -355,7 +355,7 @@ def __init__( self.files: Optional[Any] = files self.data: Optional[Any] = data self.multipart_mixed_info: Optional[ - Tuple[Sequence[Any], Sequence[Any], Optional[str], Dict[str, Any]]] = None + Tuple[Sequence[Any], Sequence[Any], 
Optional[str], dict[str, Any]]] = None class MockHttpResponse(RequestsTransportResponse): def __init__(self, request: HttpRequest, status_code: int, content: Optional[Any] = None): diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index b3fede1c9f65..8f34f14a3913 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -264,11 +264,24 @@ def test_ppaf_session_unavailable_retry(self, write_operation, error): # We verify that the read request was going to the correct region by using the raw_response_hook fault_injection_container.read_item(doc_fail_id, PK_VALUE, raw_response_hook=session_retry_hook) + def test_ppaf_user_agent_feature_flag(self): + # Simple test to verify the user agent suffix is being updated with the relevant feature flags + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info() + fault_injection_container = custom_setup['col'] + # Create a document to check the response headers + fault_injection_container.upsert_item(body={'id': doc_success_id, 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}, + raw_response_hook=ppaf_user_agent_hook) + def session_retry_hook(raw_response): if raw_response.http_request.headers.get('x-ms-thinclient-proxy-resource-type') != 'databaseaccount': # This hook is used to verify the request routing that happens after the session retry logic region_string = "-" + REGION_2.replace(' ', '').lower() + "." 
assert region_string in raw_response.http_request.url +def ppaf_user_agent_hook(raw_response): + # Used to verify the user agent feature flags + user_agent = raw_response.http_request.headers.get('user-agent') + assert user_agent.endswith('| F3') + if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index ac0e97576ed2..214662e20603 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -16,7 +16,7 @@ from azure.cosmos._request_object import RequestObject from _fault_injection_transport import FaultInjectionTransport from _fault_injection_transport_async import FaultInjectionTransportAsync -from test_per_partition_automatic_failover import create_failover_errors, create_threshold_errors, session_retry_hook +from test_per_partition_automatic_failover import create_failover_errors, create_threshold_errors, session_retry_hook, ppaf_user_agent_hook from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors, write_operations_and_boolean from test_per_partition_circuit_breaker_mm_async import perform_write_operation @@ -258,6 +258,13 @@ async def test_ppaf_session_unavailable_retry_async(self, write_operation, error # We verify that the read request was going to the correct region by using the raw_response_hook fault_injection_container.read_item(doc_fail_id, PK_VALUE, raw_response_hook=session_retry_hook) + async def test_ppaf_user_agent_feature_flag_async(self): + # Simple test to verify the user agent suffix is being updated with the relevant feature flags + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info() + fault_injection_container = custom_setup['col'] + 
# Create a document to check the response headers + await fault_injection_container.upsert_item(body={'id': doc_success_id, 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}, + raw_response_hook=ppaf_user_agent_hook) if __name__ == '__main__': unittest.main() diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py index 8ab0f5f2e9b2..8e4ab4386695 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py @@ -498,6 +498,14 @@ def test_service_request_error(self, read_operation, write_operation): # there shouldn't be region marked as unavailable assert len(global_endpoint_manager.location_cache.location_unavailability_info_by_endpoint) == 1 + def test_circuit_breaker_user_agent_feature_flag_mm(self): + # Simple test to verify the user agent suffix is being updated with the relevant feature flags + custom_setup = self.setup_method_with_custom_transport(None) + container = custom_setup['col'] + # Create a document to check the response headers + container.upsert_item(body={'id': str(uuid.uuid4()), 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}, + raw_response_hook=user_agent_hook) + # test cosmos client timeout if __name__ == '__main__': @@ -520,3 +528,8 @@ def validate_stats(global_endpoint_manager, assert health_info.write_failure_count == expected_write_failure_count assert health_info.read_success_count == expected_read_success_count assert health_info.write_success_count == expected_write_success_count + +def user_agent_hook(raw_response): + # Used to verify the user agent feature flags + user_agent = raw_response.http_request.headers.get('user-agent') + assert user_agent.endswith('| F2') \ No newline at end of file diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py 
b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py index f8d06687483a..90131646c17a 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm_async.py @@ -4,7 +4,7 @@ import os import unittest import uuid -from typing import Any +from typing import Any, Union import pytest from azure.core.pipeline.transport._aiohttp import AioHttpTransport @@ -18,7 +18,7 @@ from test_per_partition_circuit_breaker_mm import create_doc, read_operations_and_errors, \ write_operations_and_errors, operations, REGION_1, REGION_2, CHANGE_FEED, CHANGE_FEED_PK, CHANGE_FEED_EPK, READ, \ CREATE, READ_ALL_ITEMS, DELETE_ALL_ITEMS_BY_PARTITION_KEY, QUERY, QUERY_PK, BATCH, UPSERT, REPLACE, PATCH, DELETE, \ - PK_VALUE, validate_unhealthy_partitions, validate_response_uri + PK_VALUE, validate_unhealthy_partitions, validate_response_uri, user_agent_hook from test_per_partition_circuit_breaker_mm import validate_stats COLLECTION = "created_collection" @@ -111,7 +111,7 @@ class TestPerPartitionCircuitBreakerMMAsync: TEST_DATABASE_ID = test_config.TestConfig.TEST_DATABASE_ID TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID - async def setup_method_with_custom_transport(self, custom_transport: AioHttpTransport, default_endpoint=host, **kwargs): + async def setup_method_with_custom_transport(self, custom_transport: Union[AioHttpTransport, Any], default_endpoint=host, **kwargs): container_id = kwargs.pop("container_id", None) if not container_id: container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID @@ -481,5 +481,13 @@ async def concurrent_upsert(): _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = original_unavailable_time await cleanup_method([custom_setup, setup]) + async def test_circuit_breaker_user_agent_feature_flag_mm_async(self): + # Simple test to verify the user agent suffix is being updated with the relevant 
feature flags + custom_setup = await self.setup_method_with_custom_transport(None) + container = custom_setup['col'] + # Create a document to check the response headers + await container.upsert_item(body={'id': str(uuid.uuid4()), 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}, + raw_response_hook=user_agent_hook) + if __name__ == '__main__': unittest.main() diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py index d92c46c0f622..0ec3df11d270 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr.py @@ -15,7 +15,7 @@ from azure.cosmos.exceptions import CosmosHttpResponseError from _fault_injection_transport import FaultInjectionTransport from test_per_partition_circuit_breaker_mm import create_doc, write_operations_and_errors, operations, REGION_1, \ - REGION_2, PK_VALUE, perform_write_operation, perform_read_operation, CREATE, READ, validate_stats + REGION_2, PK_VALUE, perform_write_operation, perform_read_operation, CREATE, READ, validate_stats, user_agent_hook COLLECTION = "created_collection" @@ -235,6 +235,14 @@ def test_service_request_error(self, read_operation, write_operation): # there shouldn't be region marked as unavailable assert len(global_endpoint_manager.location_cache.location_unavailability_info_by_endpoint) == 1 + def test_circuit_breaker_user_agent_feature_flag_sm(self): + # Simple test to verify the user agent suffix is being updated with the relevant feature flags + custom_setup = self.setup_method_with_custom_transport(None) + container = custom_setup['col'] + # Create a document to check the response headers + container.upsert_item(body={'id': str(uuid.uuid4()), 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}, + raw_response_hook=user_agent_hook) + # test cosmos client timeout if __name__ == 
'__main__': diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py index ae5bc8198043..2d43fb492b8c 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_sm_mrr_async.py @@ -4,7 +4,7 @@ import os import unittest import uuid -from typing import Dict, Any +from typing import Any, Union import pytest from azure.core.pipeline.transport._aiohttp import AioHttpTransport @@ -17,7 +17,7 @@ from _fault_injection_transport_async import FaultInjectionTransportAsync from test_per_partition_circuit_breaker_mm_async import perform_write_operation, cleanup_method, perform_read_operation from test_per_partition_circuit_breaker_mm import create_doc, write_operations_and_errors, operations, REGION_1, \ - REGION_2, PK_VALUE, READ, validate_stats, CREATE + REGION_2, PK_VALUE, READ, validate_stats, CREATE, user_agent_hook from test_per_partition_circuit_breaker_sm_mrr import validate_unhealthy_partitions COLLECTION = "created_collection" @@ -31,7 +31,7 @@ class TestPerPartitionCircuitBreakerSmMrrAsync: TEST_DATABASE_ID = test_config.TestConfig.TEST_DATABASE_ID TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID - async def setup_method_with_custom_transport(self, custom_transport: AioHttpTransport, default_endpoint=host, **kwargs): + async def setup_method_with_custom_transport(self, custom_transport: Union[AioHttpTransport, Any], default_endpoint=host, **kwargs): container_id = kwargs.pop("container_id", None) if not container_id: container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID @@ -43,7 +43,7 @@ async def setup_method_with_custom_transport(self, custom_transport: AioHttpTran return {"client": client, "db": db, "col": container} @staticmethod - async def cleanup_method(initialized_objects: Dict[str, Any]): + 
async def cleanup_method(initialized_objects: dict[str, Any]): method_client: CosmosClient = initialized_objects["client"] await method_client.close() @@ -234,6 +234,14 @@ async def test_service_request_error_async(self, read_operation, write_operation assert len(global_endpoint_manager.location_cache.location_unavailability_info_by_endpoint) == 1 await cleanup_method([custom_setup, setup]) + async def test_circuit_breaker_user_agent_feature_flag_sm_async(self): + # Simple test to verify the user agent suffix is being updated with the relevant feature flags + custom_setup = await self.setup_method_with_custom_transport(None) + container = custom_setup['col'] + # Create a document to check the response headers + await container.upsert_item(body={'id': str(uuid.uuid4()), 'pk': PK_VALUE, 'name': 'sample document', 'key': 'value'}, + raw_response_hook=user_agent_hook) + # test cosmos client timeout if __name__ == '__main__': From 65f9e0126e9b64bd306dce1f4e925f9417256034 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 20 Oct 2025 18:01:05 -0400 Subject: [PATCH 56/68] Update test_per_partition_automatic_failover_async.py --- .../tests/test_per_partition_automatic_failover_async.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 214662e20603..25dbcf5b64af 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -20,7 +20,7 @@ from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors, write_operations_and_boolean from test_per_partition_circuit_breaker_mm_async import perform_write_operation -# cspell:disable +#cspell:ignore PPAF, ppaf # These tests assume that the 
configured live account has one main write region and one secondary read region. From e15e43d4b0fda695b866a2c07891d3931412b070 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 24 Oct 2025 16:22:55 -0400 Subject: [PATCH 57/68] move user agent logic --- .../azure/cosmos/aio/_asynchronous_request.py | 10 ++++++++++ .../azure/cosmos/aio/_retry_utility_async.py | 8 -------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 945c4615b49e..24c05cab8959 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -34,6 +34,7 @@ from .. import http_constants from . import _retry_utility_async from .._synchronized_request import _request_body_from_data, _replace_url_prefix +from .._utils import get_user_agent_features async def _Request(global_endpoint_manager, request_params, connection_policy, pipeline_client, request, **kwargs): # pylint: disable=too-many-statements @@ -90,6 +91,15 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p parse_result = urlparse(request.url) + # Add relevant enabled features to user agent for debugging + if request.headers['x-ms-thinclient-proxy-resource-type'] == 'docs': + user_agent_features = get_user_agent_features(global_endpoint_manager) + if len(user_agent_features) > 0: + user_agent = kwargs.pop("user_agent", global_endpoint_manager.client._user_agent) + user_agent = "{} {}".format(user_agent, user_agent_features) + kwargs.update({"user_agent": user_agent}) + kwargs.update({"user_agent_overwrite": True}) + # The requests library now expects header values to be strings only starting 2.11, # and will raise an error on validation if they are not, so casting all header values to strings. 
request.headers.update({header: str(value) for header, value in request.headers.items()}) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py index f54cd61bfd0b..80dffa6aee72 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py @@ -44,7 +44,6 @@ from ..exceptions import CosmosHttpResponseError from ..http_constants import HttpHeaders, StatusCodes, SubStatusCodes from .._cosmos_http_logging_policy import _log_diagnostics_error -from .._utils import get_user_agent_features # pylint: disable=protected-access, disable=too-many-lines, disable=too-many-statements, disable=too-many-branches @@ -114,13 +113,6 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg container_recreate_retry_policy = ContainerRecreateRetryPolicy( client, client._container_properties_cache, None, *args) - user_agent_features = get_user_agent_features(global_endpoint_manager) - if len(user_agent_features) > 0: - user_agent = kwargs.pop("user_agent", client._user_agent) - user_agent = "{} {}".format(user_agent, user_agent_features) - kwargs.update({"user_agent": user_agent}) - kwargs.update({"user_agent_overwrite": True}) - while True: client_timeout = kwargs.get('timeout') start_time = time.time() From 0d7e887ef2315aacdad1af1727ba9bc780e4d804 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:28:57 -0400 Subject: [PATCH 58/68] sync and async match, remove print statements --- .../azure/cosmos/_synchronized_request.py | 10 +++++++ .../azure-cosmos/azure/cosmos/_utils.py | 3 ++- .../azure/cosmos/aio/_asynchronous_request.py | 27 ------------------- ..._per_partition_automatic_failover_async.py | 5 ---- 4 files changed, 12 insertions(+), 33 deletions(-) diff --git 
a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py index e6109b5bd621..8f3e36728f7f 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py @@ -29,6 +29,7 @@ from azure.core.exceptions import DecodeError # type: ignore from . import exceptions, http_constants, _retry_utility +from ._utils import get_user_agent_features def _is_readable_stream(obj): @@ -115,6 +116,15 @@ def _Request(global_endpoint_manager, request_params, connection_policy, pipelin parse_result = urlparse(request.url) + # Add relevant enabled features to user agent for debugging + if request.headers['x-ms-thinclient-proxy-resource-type'] == 'docs': + user_agent_features = get_user_agent_features(global_endpoint_manager) + if len(user_agent_features) > 0: + user_agent = kwargs.pop("user_agent", global_endpoint_manager.client._user_agent) + user_agent = "{} {}".format(user_agent, user_agent_features) + kwargs.update({"user_agent": user_agent}) + kwargs.update({"user_agent_overwrite": True}) + # The requests library now expects header values to be strings only starting 2.11, # and will raise an error on validation if they are not, so casting all header values to strings. 
request.headers.update({header: str(value) for header, value in request.headers.items()}) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py index 8bb57ccd6562..aaf7f5b39b83 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py @@ -33,6 +33,7 @@ from ._version import VERSION # cspell:ignore ppcb +# pylint: disable=protected-access def get_user_agent(suffix: Optional[str] = None) -> str: os_name = safe_user_agent_header(platform.platform()) @@ -166,4 +167,4 @@ def get_user_agent_features(global_endpoint_manager: Any) -> str: _Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT).lower() if ppcb_check == "true" or feature_flag > 0: feature_flag += _Constants.UserAgentFeatureFlags.PER_PARTITION_CIRCUIT_BREAKER - return f"| F{feature_flag}" if feature_flag > 0 else "" \ No newline at end of file + return f"| F{feature_flag}" if feature_flag > 0 else "" diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 24c05cab8959..873d6d58685f 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -24,8 +24,6 @@ import copy import json import time -from datetime import datetime, timezone -import logging from urllib.parse import urlparse from azure.core.exceptions import DecodeError # type: ignore @@ -112,12 +110,6 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p and not connection_policy.DisableSSLVerification ) - route_end = time.perf_counter() - - route_duration = (route_end - route_start) * 1000 - - start = time.perf_counter() - if connection_policy.SSLConfiguration or "connection_cert" in kwargs: ca_certs = connection_policy.SSLConfiguration.SSLCaCerts cert_files = (connection_policy.SSLConfiguration.SSLCertFile, 
connection_policy.SSLConfiguration.SSLKeyFile) @@ -145,25 +137,6 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p **kwargs ) - end = time.perf_counter() - duration = (end - start) * 1000 - - logger = logging.getLogger("internal_requests") - response_time = datetime.now(timezone.utc) - print_string = f"Response time: {response_time.isoformat()} | " - print_string += f"Request URL: {request.url} | " - print_string += f"Resource type: {request.headers['x-ms-thinclient-proxy-resource-type']} | " - print_string += f"Operation type: {request.headers['x-ms-thinclient-proxy-operation-type']} | " - print_string += f"Status code: {response.http_response.status_code} | " - print_string += f"Sub-status code: {response.http_response.headers.get('x-ms-substatus', 'N/A')} | " - print_string += f"Routing duration: {route_duration} ms | " - print_string += f"Request/response duration: {duration} ms | " - print_string += f"Activity Id: {request.headers.get('x-ms-activity-id', 'N/A')} |" - print_string += f"Partition Id: {response.http_response.headers.get('x-ms-cosmos-internal-partition-id', 'N/A')} |" - print_string += f"Physical Id: {response.http_response.headers.get('x-ms-cosmos-physical-partition-id', 'N/A')} |" - logger.info(print_string) - print(print_string) - response = response.http_response headers = copy.copy(response.headers) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 0e91a5e42069..29205b4051b7 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -51,7 +51,6 @@ def try_move_to_next_location( :rtype: bool """ with self._lock: 
- print("got lock to move to next location") if endpoint_region != self.current_region and self.current_region is not None: regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) @@ -68,8 +67,6 @@ def try_move_to_next_location( logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) regional_endpoint = available_account_regional_endpoints[self.current_region] request.route_to_location(regional_endpoint) - print(f"routing to {regional_endpoint} from {endpoint_region}") - print(f"current unavailable: {str(self.unavailable_regional_endpoints)}") return True return False @@ -199,8 +196,6 @@ def resolve_service_endpoint_for_partition( request) else: # Update the current regional endpoint to whatever the request is routing to - endpoint_region = self.location_cache.get_location_from_endpoint( - request.location_endpoint_to_route) partition_failover_info.current_region = endpoint_region else: partition_failover_info = PartitionLevelFailoverInfo() From aa3b641279f049eca9b1402188ef7e5c060f204a Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 29 Oct 2025 11:39:18 -0400 Subject: [PATCH 59/68] leftover timer --- .../azure-cosmos/azure/cosmos/aio/_asynchronous_request.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 873d6d58685f..56ba13c24e1d 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -73,8 +73,6 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p if kwargs['timeout'] <= 0: raise exceptions.CosmosClientTimeoutError() - route_start = time.perf_counter() - if request_params.endpoint_override: base_url = request_params.endpoint_override else: 
From 799f6de06f1472eff69a8a7974f633648b3ba67b Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:44:27 -0400 Subject: [PATCH 60/68] Update _retry_utility.py --- sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 106709f0e7bf..af3fcd1edb11 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -43,7 +43,6 @@ from .exceptions import CosmosHttpResponseError from .http_constants import HttpHeaders, StatusCodes, SubStatusCodes, ResourceType from ._cosmos_http_logging_policy import _log_diagnostics_error -from ._utils import get_user_agent_features # pylint: disable=protected-access, disable=too-many-lines, disable=too-many-statements, disable=too-many-branches @@ -115,13 +114,6 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin container_recreate_retry_policy = _container_recreate_retry_policy.ContainerRecreateRetryPolicy( client, client._container_properties_cache, None, *args) - user_agent_features = get_user_agent_features(global_endpoint_manager) - if len(user_agent_features) > 0: - user_agent = kwargs.pop("user_agent", client._user_agent) - user_agent = "{} {}".format(user_agent, user_agent_features) - kwargs.update({"user_agent": user_agent}) - kwargs.update({"user_agent_overwrite": True}) - while True: client_timeout = kwargs.get('timeout') start_time = time.time() From 36249b451ea761ae4dbc77b079b355fae60fc4fe Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:46:42 -0400 Subject: [PATCH 61/68] use constants --- sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py | 2 +- .../azure-cosmos/azure/cosmos/aio/_asynchronous_request.py | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py index 8f3e36728f7f..4e37131e6161 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py @@ -117,7 +117,7 @@ def _Request(global_endpoint_manager, request_params, connection_policy, pipelin parse_result = urlparse(request.url) # Add relevant enabled features to user agent for debugging - if request.headers['x-ms-thinclient-proxy-resource-type'] == 'docs': + if request.headers[http_constants.HttpHeaders.ThinClientProxyResourceType] == http_constants.ResourceType.Document: user_agent_features = get_user_agent_features(global_endpoint_manager) if len(user_agent_features) > 0: user_agent = kwargs.pop("user_agent", global_endpoint_manager.client._user_agent) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index 56ba13c24e1d..7091c598d302 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -88,7 +88,7 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p parse_result = urlparse(request.url) # Add relevant enabled features to user agent for debugging - if request.headers['x-ms-thinclient-proxy-resource-type'] == 'docs': + if request.headers[http_constants.HttpHeaders.ThinClientProxyResourceType] == http_constants.ResourceType.Document: user_agent_features = get_user_agent_features(global_endpoint_manager) if len(user_agent_features) > 0: user_agent = kwargs.pop("user_agent", global_endpoint_manager.client._user_agent) From 0495c7b696684a487ed950e1f5f6151f9d8422de Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:13:00 
-0400 Subject: [PATCH 62/68] pylint --- sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py | 2 +- .../azure-cosmos/azure/cosmos/aio/_asynchronous_request.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py index aad9256a424f..55f9ac40a00c 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py @@ -79,7 +79,7 @@ def _Request(global_endpoint_manager, request_params, connection_policy, pipelin :rtype: tuple of (dict, dict) """ - # pylint: disable=protected-access + # pylint: disable=protected-access, too-many-branches connection_timeout = connection_policy.RequestTimeout connection_timeout = kwargs.pop("connection_timeout", connection_timeout) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py index f97dd99e8722..2d8c7e313a62 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py @@ -50,7 +50,7 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p :rtype: tuple of (dict, dict) """ - # pylint: disable=protected-access + # pylint: disable=protected-access, too-many-branches connection_timeout = connection_policy.RequestTimeout read_timeout = connection_policy.ReadTimeout From 8639093f8b4b374deba8ef36ed1b0d0f0c358c26 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Mon, 17 Nov 2025 10:18:18 -0500 Subject: [PATCH 63/68] Update CHANGELOG.md --- sdk/cosmos/azure-cosmos/CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index 612a2ec3ab16..d9b36f4dab5c 100644 --- 
a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -3,12 +3,14 @@ ### 4.14.3 (Unreleased) #### Features Added +* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover). See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). #### Breaking Changes #### Bugs Fixed #### Other Changes +* Added cross-regional retries for 503 (Service Unavailable) errors. See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). ### 4.14.2 (2025-11-14) @@ -56,7 +58,6 @@ This version and all future versions will require Python 3.9+. #### Features Added * Added read_items API to provide an efficient method for retrieving multiple items in a single request. See [PR 42167](https://github.com/Azure/azure-sdk-for-python/pull/42167). -* Added support for Per Partition Automatic Failover. To enable this feature, you must follow the guide [here](https://learn.microsoft.com/azure/cosmos-db/how-to-configure-per-partition-automatic-failover). See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). * Added ability to replace a container's indexing policy if a vector embedding policy was present. See [PR 42810](https://github.com/Azure/azure-sdk-for-python/pull/42810). #### Bugs Fixed @@ -64,7 +65,6 @@ This version and all future versions will require Python 3.9+. * Fixed bug where during health checks read regions were marked as unavailable for write operations. See [PR 42525](https://github.com/Azure/azure-sdk-for-python/pull/42525). * Fixed bug where containers named with spaces or special characters using session consistency would fall back to eventual consistency. See [PR 42608](https://github.com/Azure/azure-sdk-for-python/pull/42608) * Fixed bug where `excluded_locations` was not being honored for some metadata calls. 
See [PR 42266](https://github.com/Azure/azure-sdk-for-python/pull/42266). -* Added cross-regional retries for 503 (Service Unavailable) errors. See [PR 41588](https://github.com/Azure/azure-sdk-for-python/pull/41588). * Fixed bug where Hybrid Search queries using parameters were not working. See [PR 42787](https://github.com/Azure/azure-sdk-for-python/pull/42787) * Fixed partition scoping for per partition circuit breaker. See [PR 42751](https://github.com/Azure/azure-sdk-for-python/pull/42751) * Fixed bug where `partition_key` set to None was not properly handled for some operations. See [PR 42747](https://github.com/Azure/azure-sdk-for-python/pull/42747) From 5b3815f9182d23acdecf66886e7df0c27779340e Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 19 Nov 2025 12:04:48 -0500 Subject: [PATCH 64/68] react to comments --- .../azure-cosmos/azure/cosmos/_constants.py | 3 ++ .../_endpoint_discovery_retry_policy.py | 2 +- ...anager_per_partition_automatic_failover.py | 48 +++++------------- .../azure/cosmos/_retry_utility.py | 4 +- .../_service_unavailable_retry_policy.py | 2 +- .../azure-cosmos/azure/cosmos/_utils.py | 18 +++---- ..._per_partition_automatic_failover_async.py | 48 +++++------------- .../test_per_partition_automatic_failover.py | 48 ++++++++---------- ..._per_partition_automatic_failover_async.py | 49 ++++++++----------- .../test_per_partition_circuit_breaker_mm.py | 8 +-- 10 files changed, 88 insertions(+), 142 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py index a519f1110ee2..0a5e961f7aa9 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py @@ -116,6 +116,9 @@ class UserAgentFeatureFlags(IntEnum): suffix. This number will then be used to determine what features are enabled by decoding the hex string back to a number and checking what bits are set. 
+ Features being developed should align with the .NET SDK as a source of truth for feature flag assignments: + https://github.com/Azure/azure-cosmos-dotnet-v3/blob/master/Microsoft.Azure.Cosmos/src/Diagnostics/UserAgentFeatureFlags.cs + Example: If the user agent suffix has "F3", this means that flags 1 and 2. """ diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py index 83bd966dc3f9..3357c097c63a 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py @@ -75,7 +75,7 @@ def ShouldRetry(self, exception): # pylint: disable=unused-argument location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( str(self.request.location_endpoint_to_route)) regional_endpoint = (self.global_endpoint_manager.location_cache. - account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) + account_read_regional_routing_contexts_by_location.get(location)) partition_level_info.unavailable_regional_endpoints[location] = regional_endpoint self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) return True diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index 0e912fbace67..a4fb22b821ca 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: from azure.cosmos._cosmos_client_connection import CosmosClientConnection + from azure.cosmos._location_cache import RegionalRoutingContext logger = 
logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") @@ -33,18 +34,18 @@ class PartitionLevelFailoverInfo: Used to track the partition key range and the regions where it is available. """ def __init__(self) -> None: - self.unavailable_regional_endpoints: dict[str, str] = {} + self.unavailable_regional_endpoints: dict[str, "RegionalRoutingContext"] = {} self._lock = threading.Lock() self.current_region: Optional[str] = None def try_move_to_next_location( self, - available_account_regional_endpoints: dict[str, str], + available_account_regional_endpoints: dict[str, "RegionalRoutingContext"], endpoint_region: str, request: RequestObject) -> bool: """ Tries to move to the next available regional endpoint for the partition key range. - :param Dict[str, str] available_account_regional_endpoints: The available regional endpoints + :param Dict[str, RegionalRoutingContext] available_account_regional_endpoints: The available regional endpoints :param str endpoint_region: The current regional endpoint :param RequestObject request: The request object containing the routing context. :return: True if the move was successful, False otherwise. 
@@ -53,7 +54,7 @@ def try_move_to_next_location( with self._lock: if endpoint_region != self.current_region and self.current_region is not None: regional_endpoint = available_account_regional_endpoints[self.current_region] - request.route_to_location(regional_endpoint) + request.route_to_location(regional_endpoint.primary_endpoint) return True for regional_endpoint in available_account_regional_endpoints: @@ -66,7 +67,7 @@ def try_move_to_next_location( self.current_region = regional_endpoint logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) regional_endpoint = available_account_regional_endpoints[self.current_region] - request.route_to_location(regional_endpoint) + request.route_to_location(regional_endpoint.primary_endpoint) return True return False @@ -99,7 +100,7 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) return False # if we have at most one region available in the account, we cannot do per partition automatic failover - available_regions = self.compute_available_preferred_regions(request) + available_regions = self.location_cache.account_read_regional_routing_contexts_by_location if len(available_regions) <= 1: return False @@ -134,7 +135,6 @@ def try_ppaf_failover_threshold( # Once we mark the endpoint unavailable, the PPAF endpoint manager will try to move to the next # available region for the partition key range with self._threshold_lock: - logger.warning("PPAF - Failover threshold reached for partition key range: %s", pk_range_wrapper) # Check for count again, since a previous request may have now reset the count if (self.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, @@ -143,6 +143,8 @@ def try_ppaf_failover_threshold( partition_level_info = self.partition_range_to_failover_info[pk_range_wrapper] location = self.location_cache.get_location_from_endpoint( str(request.location_endpoint_to_route)) + 
logger.warning("PPAF - Failover threshold reached for partition key range: %s for region: %s", #pylint: disable=line-too-long + pk_range_wrapper, location) regional_context = (self.location_cache. account_read_regional_routing_contexts_by_location. get(location).primary_endpoint) @@ -171,15 +173,15 @@ def resolve_service_endpoint_for_partition( if request.location_endpoint_to_route is not None: endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) if endpoint_region in partition_failover_info.unavailable_regional_endpoints: - available_account_regional_endpoints = self.compute_available_preferred_regions(request) + available_account_regional_endpoints = self.location_cache.account_read_regional_routing_contexts_by_location #pylint: disable=line-too-long if (partition_failover_info.current_region is not None and endpoint_region != partition_failover_info.current_region): # this request has not yet seen there's an available region being used for this partition regional_endpoint = available_account_regional_endpoints[ - partition_failover_info.current_region] + partition_failover_info.current_region].primary_endpoint request.route_to_location(regional_endpoint) else: - if (len(self.compute_available_preferred_regions(request)) + if (len(self.location_cache.account_read_regional_routing_contexts_by_location) == len(partition_failover_info.unavailable_regional_endpoints)): # If no other region is available, we invalidate the cache and start once again # from our main write region in the account configurations @@ -190,7 +192,7 @@ def resolve_service_endpoint_for_partition( else: # If the current region is unavailable, we try to move to the next available region partition_failover_info.try_move_to_next_location( - self.compute_available_preferred_regions(request), + self.location_cache.account_read_regional_routing_contexts_by_location, endpoint_region, request) else: @@ -204,30 +206,6 @@ def 
resolve_service_endpoint_for_partition( self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) - def compute_available_preferred_regions( - self, - request: RequestObject - ) -> dict[str, str]: - """ - Computes the available regional endpoints for the request based on customer-set preferred and excluded regions. - :param RequestObject request: The request object containing the routing context. - :return: A set of available regional endpoints. - :rtype: Dict[str, str] - """ - if request.excluded_locations: - excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations - else: - excluded_locations = self.location_cache.connection_policy.ExcludedLocations - if excluded_locations is None: - excluded_locations = [] - preferred_locations = self.location_cache.effective_preferred_locations - available_regions = [item for item in preferred_locations if item not in excluded_locations] - available_regional_endpoints = {} - for region, context in self.location_cache.account_read_regional_routing_contexts_by_location.items(): - if region in available_regions: - available_regional_endpoints[region] = context.primary_endpoint - return available_regional_endpoints - def record_failure(self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 47fef50b8779..32efca023d11 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -209,8 +209,8 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs): # pylin request.headers[retry_policy._intended_headers] = retry_policy.container_rid elif e.status_code == StatusCodes.SERVICE_UNAVAILABLE: if args: - # record the 
failure for ppaf/circuit breaker tracking - global_endpoint_manager.record_failure(args[0], pk_range_wrapper) + # record the failure for circuit breaker tracking + global_endpoint_manager.record_ppcb_failure(args[0], pk_range_wrapper) retry_policy = service_unavailable_retry_policy elif e.status_code == StatusCodes.REQUEST_TIMEOUT or e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR: if args: diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py index 8269eb86799d..a210f9348f89 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_service_unavailable_retry_policy.py @@ -57,7 +57,7 @@ def ShouldRetry(self, _exception: CosmosHttpResponseError): location = self.global_endpoint_manager.location_cache.get_location_from_endpoint( str(self.request.location_endpoint_to_route)) regional_context = (self.global_endpoint_manager.location_cache. - account_read_regional_routing_contexts_by_location.get(location).primary_endpoint) + account_read_regional_routing_contexts_by_location.get(location)) partition_level_info.unavailable_regional_endpoints[location] = regional_context self.global_endpoint_manager.resolve_service_endpoint_for_partition(self.request, self.pk_range_wrapper) return True diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py index aaf7f5b39b83..1e5f5602b9e5 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py @@ -152,19 +152,19 @@ def valid_key_value_exist( def get_user_agent_features(global_endpoint_manager: Any) -> str: - """Check the account and client configurations in order to add feature flags to the user agent. - - :param Any global_endpoint_manager: The global endpoint manager instance used to check against. 
- :return: The string representing the user agent features to include. - :rtype: str + """ + Check the account and client configurations in order to add feature flags + to the user agent using bitmask logic and hex encoding (matching .NET/Java). """ feature_flag = 0 + # Bitwise OR for feature flags if global_endpoint_manager._database_account_cache is not None: if global_endpoint_manager._database_account_cache._EnablePerPartitionFailoverBehavior is True: - feature_flag += _Constants.UserAgentFeatureFlags.PER_PARTITION_AUTOMATIC_FAILOVER + feature_flag |= _Constants.UserAgentFeatureFlags.PER_PARTITION_AUTOMATIC_FAILOVER ppcb_check = os.environ.get( _Constants.CIRCUIT_BREAKER_ENABLED_CONFIG, - _Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT).lower() + _Constants.CIRCUIT_BREAKER_ENABLED_CONFIG_DEFAULT + ).lower() if ppcb_check == "true" or feature_flag > 0: - feature_flag += _Constants.UserAgentFeatureFlags.PER_PARTITION_CIRCUIT_BREAKER - return f"| F{feature_flag}" if feature_flag > 0 else "" + feature_flag |= _Constants.UserAgentFeatureFlags.PER_PARTITION_CIRCUIT_BREAKER + return f"| F{feature_flag:X}" if feature_flag > 0 else "" diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 29205b4051b7..8f94dbebd008 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: from azure.cosmos.aio._cosmos_client_connection_async import CosmosClientConnection + from azure.cosmos._location_cache import RegionalRoutingContext logger = logging.getLogger("azure.cosmos._GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover") @@ -33,18 +34,18 
@@ class PartitionLevelFailoverInfo: Used to track the partition key range and the regions where it is available. """ def __init__(self) -> None: - self.unavailable_regional_endpoints: dict[str, str] = {} + self.unavailable_regional_endpoints: dict[str, "RegionalRoutingContext"] = {} self._lock = threading.Lock() self.current_region: Optional[str] = None def try_move_to_next_location( self, - available_account_regional_endpoints: dict[str, str], + available_account_regional_endpoints: dict[str, "RegionalRoutingContext"], endpoint_region: str, request: RequestObject) -> bool: """ Tries to move to the next available regional endpoint for the partition key range. - :param Dict[str, str] available_account_regional_endpoints: The available regional endpoints + :param Dict[str, RegionalRoutingContext] available_account_regional_endpoints: The available regional endpoints :param str endpoint_region: The current regional endpoint :param RequestObject request: The request object containing the routing context. :return: True if the move was successful, False otherwise. 
@@ -53,7 +54,7 @@ def try_move_to_next_location( with self._lock: if endpoint_region != self.current_region and self.current_region is not None: regional_endpoint = available_account_regional_endpoints[self.current_region] - request.route_to_location(regional_endpoint) + request.route_to_location(regional_endpoint.primary_endpoint) return True for regional_endpoint in available_account_regional_endpoints: @@ -66,7 +67,7 @@ def try_move_to_next_location( self.current_region = regional_endpoint logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) regional_endpoint = available_account_regional_endpoints[self.current_region] - request.route_to_location(regional_endpoint) + request.route_to_location(regional_endpoint.primary_endpoint) return True return False @@ -100,7 +101,7 @@ def is_per_partition_automatic_failover_applicable(self, request: RequestObject) return False # if we have at most one region available in the account, we cannot do per partition automatic failover - available_regions = self.compute_available_preferred_regions(request) + available_regions = self.location_cache.account_read_regional_routing_contexts_by_location if len(available_regions) <= 1: return False @@ -135,7 +136,6 @@ def try_ppaf_failover_threshold( # Once we mark the endpoint unavailable, the PPAF endpoint manager will try to move to the next # available region for the partition key range with self._threshold_lock: - logger.warning("PPAF - Failover threshold reached for partition key range: %s", pk_range_wrapper) # Check for count again, since a previous request may have now reset the count if (self.ppaf_thresholds_tracker.get_pk_failures(pk_range_wrapper) >= int(os.environ.get(Constants.TIMEOUT_ERROR_THRESHOLD_PPAF, @@ -144,6 +144,8 @@ def try_ppaf_failover_threshold( partition_level_info = self.partition_range_to_failover_info[pk_range_wrapper] location = self.location_cache.get_location_from_endpoint( str(request.location_endpoint_to_route)) + 
logger.warning("PPAF - Failover threshold reached for partition key range: %s for region: %s", #pylint: disable=line-too-long + pk_range_wrapper, location) regional_context = (self.location_cache. account_read_regional_routing_contexts_by_location. get(location).primary_endpoint) @@ -172,15 +174,15 @@ def resolve_service_endpoint_for_partition( if request.location_endpoint_to_route is not None: endpoint_region = self.location_cache.get_location_from_endpoint(request.location_endpoint_to_route) if endpoint_region in partition_failover_info.unavailable_regional_endpoints: - available_account_regional_endpoints = self.compute_available_preferred_regions(request) + available_account_regional_endpoints = self.location_cache.account_read_regional_routing_contexts_by_location #pylint: disable=line-too-long if (partition_failover_info.current_region is not None and endpoint_region != partition_failover_info.current_region): # this request has not yet seen there's an available region being used for this partition regional_endpoint = available_account_regional_endpoints[ - partition_failover_info.current_region] + partition_failover_info.current_region].primary_endpoint request.route_to_location(regional_endpoint) else: - if (len(self.compute_available_preferred_regions(request)) == + if (len(self.location_cache.account_read_regional_routing_contexts_by_location) == len(partition_failover_info.unavailable_regional_endpoints)): # If no other region is available, we invalidate the cache and start once again # from our main write region in the account configurations @@ -191,7 +193,7 @@ def resolve_service_endpoint_for_partition( else: # If the current region is unavailable, we try to move to the next available region partition_failover_info.try_move_to_next_location( - self.compute_available_preferred_regions(request), + self.location_cache.account_read_regional_routing_contexts_by_location, endpoint_region, request) else: @@ -205,30 +207,6 @@ def 
resolve_service_endpoint_for_partition( self.partition_range_to_failover_info[pk_range_wrapper] = partition_failover_info return self._resolve_service_endpoint_for_partition_circuit_breaker(request, pk_range_wrapper) - def compute_available_preferred_regions( - self, - request: RequestObject - ) -> dict[str, str]: - """ - Computes the available regional endpoints for the request based on customer-set preferred and excluded regions. - :param RequestObject request: The request object containing the routing context. - :return: A set of available regional endpoints. - :rtype: Dict[str, str] - """ - if request.excluded_locations: - excluded_locations = request.excluded_locations + self.location_cache.connection_policy.ExcludedLocations - else: - excluded_locations = self.location_cache.connection_policy.ExcludedLocations - if excluded_locations is None: - excluded_locations = [] - preferred_locations = self.location_cache.effective_preferred_locations - available_regions = [item for item in preferred_locations if item not in excluded_locations] - available_regional_endpoints = {} - for region, context in self.location_cache.account_read_regional_routing_contexts_by_location.items(): - if region in available_regions: - available_regional_endpoints[region] = context.primary_endpoint - return available_regional_endpoints - async def record_failure(self, request: RequestObject, pk_range_wrapper: Optional[PartitionKeyRangeWrapper] = None) -> None: diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index 8f34f14a3913..b9a161829fc3 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -8,10 +8,9 @@ from azure.core.exceptions import ServiceResponseError from azure.cosmos import CosmosClient from azure.cosmos.exceptions import CosmosHttpResponseError -from 
azure.cosmos._request_object import RequestObject from _fault_injection_transport import FaultInjectionTransport -from test_per_partition_circuit_breaker_mm import (REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_boolean, - write_operations_and_errors, perform_write_operation) +from test_per_partition_circuit_breaker_mm import (REGION_1, REGION_2, PK_VALUE, BATCH, + write_operations_errors_and_boolean, perform_write_operation) # cspell:disable @@ -101,12 +100,15 @@ def setup_info(self, error=None, max_count=None, is_batch=False, exclude_client_ exclude_client_regions=exclude_client_regions, **kwargs) return setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) - def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): + @pytest.mark.parametrize("write_operation, error, exclude_regions", write_operations_errors_and_boolean(create_failover_errors())) + def test_ppaf_partition_info_cache_and_routing(self, write_operation, error, exclude_regions): # This test validates that the partition info cache is updated correctly upon failures, and that the # per-partition automatic failover logic routes requests to the next available regional endpoint on 403.3 errors. + # We also verify that this logic is unaffected by user excluded regions, since write-region routing is entirely + # taken care of on the service. 
error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) - setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, 1, write_operation == BATCH) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, 1, + write_operation == BATCH, exclude_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager @@ -145,13 +147,15 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error): assert partition_info.current_region is None - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_threshold_errors())) - def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): + @pytest.mark.parametrize("write_operation, error, exclude_regions", write_operations_errors_and_boolean(create_threshold_errors())) + def test_ppaf_partition_thresholds_and_routing(self, write_operation, error, exclude_regions): # This test validates the consecutive failures logic is properly handled for per-partition automatic failover, # and that the per-partition automatic failover logic routes requests to the next available regional endpoint - # after enough consecutive failures have occurred. + # after enough consecutive failures have occurred. We also verify that this logic is unaffected by user excluded + # regions, since write-region routing is entirely taken care of on the service. 
error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) - setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error=error_lambda, + exclude_client_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager @@ -208,29 +212,17 @@ def test_ppaf_partition_thresholds_and_routing(self, write_operation, error): failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] assert failure_count == 2 - @pytest.mark.parametrize("write_operation, exclude_client_regions", write_operations_and_boolean()) - def test_ppaf_exclude_regions(self, write_operation, exclude_client_regions): - # This test validates that the per-partition automatic failover logic does not apply to configs without enough regions. 
- setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(exclude_client_regions=exclude_client_regions) - fault_injection_container = custom_setup['col'] - global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager - # Check that computing valid regions for PPAF only returns a single region - request_object = RequestObject(resource_type="docs", operation_type=write_operation, headers={}) - if exclude_client_regions is False: - request_object.excluded_locations = [REGION_2] - available_ppaf_regions = global_endpoint_manager.compute_available_preferred_regions(request_object) - assert len(available_ppaf_regions) == 1 - # Check that all requests are marked as non-PPAF available due to the fact that we only have one region - assert global_endpoint_manager.is_per_partition_automatic_failover_applicable(request_object) is False - - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) - def test_ppaf_session_unavailable_retry(self, write_operation, error): + @pytest.mark.parametrize("write_operation, error, exclude_regions", write_operations_errors_and_boolean(create_failover_errors())) + def test_ppaf_session_unavailable_retry(self, write_operation, error, exclude_regions): # Account config has 2 regions: West US 3 (A) and West US (B). This test validates that after marking the write # region (A) as unavailable, the next request is retried to the read region (B) and succeeds. The next read request # should see that the write region (A) is unavailable for the partition, and should retry to the read region (B) as well. + # We also verify that this logic is unaffected by user excluded regions, since write-region routing is entirely + # taken care of on the service. 
error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, max_count=1, - is_batch=write_operation==BATCH, session_error=True) + is_batch=write_operation==BATCH, + session_error=True, exclude_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 25dbcf5b64af..7fa4982dd70a 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -13,11 +13,10 @@ from azure.core.exceptions import ServiceResponseError from azure.cosmos.exceptions import CosmosHttpResponseError from azure.cosmos.aio import CosmosClient -from azure.cosmos._request_object import RequestObject from _fault_injection_transport import FaultInjectionTransport from _fault_injection_transport_async import FaultInjectionTransportAsync from test_per_partition_automatic_failover import create_failover_errors, create_threshold_errors, session_retry_hook, ppaf_user_agent_hook -from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_and_errors, write_operations_and_boolean +from test_per_partition_circuit_breaker_mm import REGION_1, REGION_2, PK_VALUE, BATCH, write_operations_errors_and_boolean from test_per_partition_circuit_breaker_mm_async import perform_write_operation #cspell:ignore PPAF, ppaf @@ -96,12 +95,15 @@ async def setup_info(self, error=None, max_count=None, is_batch=False, exclude_c exclude_client_regions=exclude_client_regions, **kwargs) return setup, doc_fail_id, doc_success_id, custom_setup, 
custom_transport, predicate - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) - async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation, error): + @pytest.mark.parametrize("write_operation, error, exclude_regions", write_operations_errors_and_boolean(create_failover_errors())) + async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation, error, exclude_regions): # This test validates that the partition info cache is updated correctly upon failures, and that the - # per-partition automatic failover logic routes requests to the next available regional endpoint + # per-partition automatic failover logic routes requests to the next available regional endpoint. + # We also verify that this logic is unaffected by user excluded regions, since write-region routing is + # entirely taken care of on the service. error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) - setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, 1, write_operation == BATCH) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, 1, + write_operation == BATCH, exclude_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager @@ -139,12 +141,15 @@ async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation assert initial_region not in partition_info.unavailable_regional_endpoints assert partition_info.current_region is None - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_threshold_errors())) - async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation, error): + @pytest.mark.parametrize("write_operation, error, 
exclude_regions", write_operations_errors_and_boolean(create_threshold_errors())) + async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation, error, exclude_regions): # This test validates that the partition info cache is updated correctly upon failures, and that the - # per-partition automatic failover logic routes requests to the next available regional endpoint + # per-partition automatic failover logic routes requests to the next available regional endpoint. + # We also verify that this logic is unaffected by user excluded regions, since write-region routing is + # entirely taken care of on the service. error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) - setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda) + setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, + exclude_regions=exclude_regions,) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager @@ -202,29 +207,17 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation failure_count = global_endpoint_manager.ppaf_thresholds_tracker.pk_range_wrapper_to_failure_count[pk_range_wrappers[0]] assert failure_count == 2 - @pytest.mark.parametrize("write_operation, exclude_client_regions", write_operations_and_boolean()) - async def test_ppaf_exclude_regions_async(self, write_operation, exclude_client_regions): - # This test validates that the per-partition automatic failover logic does not apply to configs without enough regions. 
- setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(exclude_client_regions=exclude_client_regions) - fault_injection_container = custom_setup['col'] - global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager - # Check that computing valid regions for PPAF only returns a single region - request_object = RequestObject(resource_type="docs", operation_type=write_operation, headers={}) - if exclude_client_regions is False: - request_object.excluded_locations = [REGION_2] - available_ppaf_regions = global_endpoint_manager.compute_available_preferred_regions(request_object) - assert len(available_ppaf_regions) == 1 - # Check that all requests are marked as non-PPAF available due to the fact that we only have one region - assert global_endpoint_manager.is_per_partition_automatic_failover_applicable(request_object) is False - - @pytest.mark.parametrize("write_operation, error", write_operations_and_errors(create_failover_errors())) - async def test_ppaf_session_unavailable_retry_async(self, write_operation, error): + @pytest.mark.parametrize("write_operation, error, exclude_regions", write_operations_errors_and_boolean(create_failover_errors())) + async def test_ppaf_session_unavailable_retry_async(self, write_operation, error, exclude_regions): # Account config has 2 regions: West US 3 (A) and West US (B). This test validates that after marking the write # region (A) as unavailable, the next request is retried to the read region (B) and succeeds. The next read request # should see that the write region (A) is unavailable for the partition, and should retry to the read region (B) as well. + # We also verify that this logic is unaffected by user excluded regions, since write-region routing is + # entirely taken care of on the service. 
error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, max_count=1, - is_batch=write_operation==BATCH, session_error=True) + is_batch=write_operation==BATCH, + session_error=True, exclude_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py index 8e4ab4386695..700e2112621b 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py @@ -60,12 +60,14 @@ def write_operations_and_errors(error_list=None): return params -def write_operations_and_boolean(): +def write_operations_errors_and_boolean(error_list=None): write_operations = [CREATE, UPSERT, REPLACE, DELETE, PATCH, BATCH] + errors = error_list or create_errors() params = [] for write_operation in write_operations: - for boolean in [True, False]: - params.append((write_operation, boolean)) + for error in errors: + for boolean in [True, False]: + params.append((write_operation, error, boolean)) return params From e31d674eebf9937c40a9de92ea7b75ba0856f94e Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:35:05 -0500 Subject: [PATCH 65/68] Update _retry_utility.py --- sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py index 32efca023d11..b52a957c4be0 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py +++ 
b/sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py @@ -46,7 +46,7 @@ # pylint: disable=protected-access, disable=too-many-lines, disable=too-many-statements, disable=too-many-branches -# cspell:ignore PPAF,ppaf +# cspell:ignore PPAF,ppaf,ppcb # args [0] is the request object # args [1] is the connection policy From e55871cda07f3c694d4f3e7aa1e3d5e942ff780a Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:20:10 -0500 Subject: [PATCH 66/68] mypy pylint --- ...ndpoint_manager_per_partition_automatic_failover.py | 10 +++++----- sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py | 4 ++++ ...t_manager_per_partition_automatic_failover_async.py | 8 ++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py index a4fb22b821ca..0547cb41df32 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_global_partition_endpoint_manager_per_partition_automatic_failover.py @@ -45,7 +45,7 @@ def try_move_to_next_location( request: RequestObject) -> bool: """ Tries to move to the next available regional endpoint for the partition key range. - :param Dict[str, RegionalRoutingContext] available_account_regional_endpoints: The available regional endpoints + :param dict[str, RegionalRoutingContext] available_account_regional_endpoints: The available regional endpoints :param str endpoint_region: The current regional endpoint :param RequestObject request: The request object containing the routing context. :return: True if the move was successful, False otherwise. 
@@ -53,8 +53,8 @@ def try_move_to_next_location( """ with self._lock: if endpoint_region != self.current_region and self.current_region is not None: - regional_endpoint = available_account_regional_endpoints[self.current_region] - request.route_to_location(regional_endpoint.primary_endpoint) + regional_endpoint = available_account_regional_endpoints[self.current_region].primary_endpoint + request.route_to_location(regional_endpoint) return True for regional_endpoint in available_account_regional_endpoints: @@ -66,8 +66,8 @@ def try_move_to_next_location( self.current_region = regional_endpoint logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) - regional_endpoint = available_account_regional_endpoints[self.current_region] - request.route_to_location(regional_endpoint.primary_endpoint) + regional_endpoint = available_account_regional_endpoints[self.current_region].primary_endpoint + request.route_to_location(regional_endpoint) return True return False diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py index 1e5f5602b9e5..0587556f198e 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_utils.py @@ -155,6 +155,10 @@ def get_user_agent_features(global_endpoint_manager: Any) -> str: """ Check the account and client configurations in order to add feature flags to the user agent using bitmask logic and hex encoding (matching .NET/Java). + + :param Any global_endpoint_manager: The GlobalEndpointManager instance. + :return: A string representing the user agent feature flags. 
+ :rtype: str """ feature_flag = 0 # Bitwise OR for feature flags diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py index 8f94dbebd008..c96b46ca46b3 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/aio/_global_partition_endpoint_manager_per_partition_automatic_failover_async.py @@ -53,8 +53,8 @@ def try_move_to_next_location( """ with self._lock: if endpoint_region != self.current_region and self.current_region is not None: - regional_endpoint = available_account_regional_endpoints[self.current_region] - request.route_to_location(regional_endpoint.primary_endpoint) + regional_endpoint = available_account_regional_endpoints[self.current_region].primary_endpoint + request.route_to_location(regional_endpoint) return True for regional_endpoint in available_account_regional_endpoints: @@ -66,8 +66,8 @@ def try_move_to_next_location( self.current_region = regional_endpoint logger.warning("PPAF - Moving to next available regional endpoint: %s", self.current_region) - regional_endpoint = available_account_regional_endpoints[self.current_region] - request.route_to_location(regional_endpoint.primary_endpoint) + regional_endpoint = available_account_regional_endpoints[self.current_region].primary_endpoint + request.route_to_location(regional_endpoint) return True return False From 0463a3f79202903ed7f5ece033a832b7b51c2338 Mon Sep 17 00:00:00 2001 From: Simon Moreno <30335873+simorenoh@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:13:00 -0500 Subject: [PATCH 67/68] test fixes --- .../tests/test_per_partition_automatic_failover.py | 4 ++-- .../tests/test_per_partition_automatic_failover_async.py | 6 +++--- 2 files changed, 5 insertions(+), 5 
deletions(-) diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py index b9a161829fc3..437c25556e05 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover.py @@ -108,7 +108,7 @@ def test_ppaf_partition_info_cache_and_routing(self, write_operation, error, exc # taken care of on the service. error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, 1, - write_operation == BATCH, exclude_regions=exclude_regions) + write_operation == BATCH, exclude_client_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager @@ -222,7 +222,7 @@ def test_ppaf_session_unavailable_retry(self, write_operation, error, exclude_re error_lambda = lambda r: FaultInjectionTransport.error_after_delay(0, error) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = self.setup_info(error_lambda, max_count=1, is_batch=write_operation==BATCH, - session_error=True, exclude_regions=exclude_regions) + session_error=True, exclude_client_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py index 7fa4982dd70a..860727c18b0b 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py +++ b/sdk/cosmos/azure-cosmos/tests/test_per_partition_automatic_failover_async.py @@ -103,7 +103,7 @@ 
async def test_ppaf_partition_info_cache_and_routing_async(self, write_operation # entirely taken care of on the service. error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, 1, - write_operation == BATCH, exclude_regions=exclude_regions) + write_operation == BATCH, exclude_client_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager @@ -149,7 +149,7 @@ async def test_ppaf_partition_thresholds_and_routing_async(self, write_operation # entirely taken care of on the service. error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, - exclude_regions=exclude_regions,) + exclude_client_regions=exclude_regions,) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager @@ -217,7 +217,7 @@ async def test_ppaf_session_unavailable_retry_async(self, write_operation, error error_lambda = lambda r: asyncio.create_task(FaultInjectionTransportAsync.error_after_delay(0, error)) setup, doc_fail_id, doc_success_id, custom_setup, custom_transport, predicate = await self.setup_info(error_lambda, max_count=1, is_batch=write_operation==BATCH, - session_error=True, exclude_regions=exclude_regions) + session_error=True, exclude_client_regions=exclude_regions) container = setup['col'] fault_injection_container = custom_setup['col'] global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager From cdfdc0187c9bb706870050bac58aed7618f35ab9 Mon Sep 17 00:00:00 2001 From: Simon Moreno 
<30335873+simorenoh@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:49:05 -0500 Subject: [PATCH 68/68] add lock to failure additions --- .../azure/cosmos/_partition_health_tracker.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py index 8218950a8dff..50f4c79bceb4 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py @@ -312,11 +312,13 @@ class _PPAFPartitionThresholdsTracker(object): def __init__(self) -> None: self.pk_range_wrapper_to_failure_count: dict[PartitionKeyRangeWrapper, int] = {} + self._failure_lock = threading.Lock() def add_failure(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> None: - if pk_range_wrapper not in self.pk_range_wrapper_to_failure_count: - self.pk_range_wrapper_to_failure_count[pk_range_wrapper] = 0 - self.pk_range_wrapper_to_failure_count[pk_range_wrapper] += 1 + with self._failure_lock: + if pk_range_wrapper not in self.pk_range_wrapper_to_failure_count: + self.pk_range_wrapper_to_failure_count[pk_range_wrapper] = 0 + self.pk_range_wrapper_to_failure_count[pk_range_wrapper] += 1 def clear_pk_failures(self, pk_range_wrapper: PartitionKeyRangeWrapper) -> None: if pk_range_wrapper in self.pk_range_wrapper_to_failure_count: