diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index 088e16ec47b7..2ce5dce6ec67 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -8,6 +8,7 @@ #### Bugs Fixed * Fixed bug where sdk was encountering a timeout issue caused by infinite recursion during the 410 (Gone) error. See [PR 44770](https://github.com/Azure/azure-sdk-for-python/pull/44770) * Fixed crash in sync and async clients when `force_refresh_on_startup` was set to `None`, which could surface as `AttributeError: 'NoneType' object has no attribute '_WritableLocations'` during region discovery when `database_account` was `None`. See [PR 44987](https://github.com/Azure/azure-sdk-for-python/pull/44987) +* Fixed bug where unavailable regional endpoints were dropped from the routing list instead of being kept as fallback options. See [PR 45200](https://github.com/Azure/azure-sdk-for-python/pull/45200) #### Other Changes * Added tests for multi-language support for full text search. See [PR 44254](https://github.com/Azure/azure-sdk-for-python/pull/44254) diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py index 38d22f0505b8..ef498a27b82a 100644 --- a/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py +++ b/sdk/cosmos/azure-cosmos/azure/cosmos/_location_cache.py @@ -524,8 +524,11 @@ def get_preferred_regional_routing_contexts( else: regional_endpoints.append(regional_endpoint) - # If all preferred locations are unavailable, honor the preferred list by trying them anyway. - if not regional_endpoints and unavailable_endpoints: + # Always append unavailable endpoints to the end of the list so they can be + # used as a last resort. This ensures that when all healthy endpoints are filtered + # out (e.g., by excluded_locations), the SDK can still fall back to unavailable + # regional endpoints rather than the global endpoint. + if unavailable_endpoints: regional_endpoints.extend(unavailable_endpoints) # If there are no preferred locations or none of the preferred locations are in the account, diff --git a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py index 0a8f4c519cbb..8f44511cfdb9 100644 --- a/sdk/cosmos/azure-cosmos/tests/test_location_cache.py +++ b/sdk/cosmos/azure-cosmos/tests/test_location_cache.py @@ -472,5 +472,88 @@ def test_write_fallback_to_global_after_regional_retries_exhausted(self): final_endpoint = lc.resolve_service_endpoint(write_request) assert final_endpoint == location1_endpoint + def test_unavailable_endpoints_not_dropped_from_routing_list(self): + """ + Unavailable endpoints should be appended to the end of the routing list, + not dropped entirely. + + Scenario: + - Customer has preferred_locations = ["East US", "West US 2"] + - East US is marked unavailable for writes + - Customer makes a request with excluded_locations = ["West US 2"] + - Expected: East US should still be available as fallback (unavailable but in the list) + """ + # Setup: Two preferred locations, multi-write enabled + preferred_locations = [location1_name, location2_name] + lc = refresh_location_cache(preferred_locations, use_multiple_write_locations=True) + db_acc = create_database_account(enable_multiple_writable_locations=True) + lc.perform_on_database_account_read(db_acc) + + # Verify initial state: Both locations are in write_regional_routing_contexts + write_contexts = lc.get_write_regional_routing_contexts() + assert len(write_contexts) == 2 + assert write_contexts[0].get_primary() == location1_endpoint + assert write_contexts[1].get_primary() == location2_endpoint + + # Mark location1 (East US) as unavailable for writes + lc.mark_endpoint_unavailable_for_write(location1_endpoint, refresh_cache=True, context="test") + + # After marking unavailable, the routing list should still contain + # both endpoints - healthy ones first, unavailable ones at the end + write_contexts_after = lc.get_write_regional_routing_contexts() + assert len(write_contexts_after) == 2, \ + f"Expected 2 endpoints in routing list, got {len(write_contexts_after)}. " \ + "Unavailable endpoint was incorrectly dropped!" + # location2 (healthy) should be first + assert write_contexts_after[0].get_primary() == location2_endpoint + # location1 (unavailable) should be at the end as fallback + assert write_contexts_after[1].get_primary() == location1_endpoint + + # Now simulate the customer request with excluded_locations = ["location2"] + write_request = RequestObject(ResourceType.Document, _OperationType.Create, None) + write_request.excluded_locations = [location2_name] + + # Resolve endpoint - should get location1 (unavailable) as the only remaining option + # NOT the global default endpoint! + resolved_endpoint = lc.resolve_service_endpoint(write_request) + + # Should fall back to location1 (unavailable regional endpoint) + # NOT the global endpoint + assert resolved_endpoint == location1_endpoint, \ + f"Expected {location1_endpoint} but got {resolved_endpoint}. " \ + f"Bug: Unavailable endpoint was dropped and SDK fell back to global endpoint!" + + def test_unavailable_endpoints_ordering_in_routing_list(self): + """ + Test that healthy endpoints come before unavailable endpoints in the routing list. + This ensures the SDK tries healthy regions first, but has unavailable ones as fallback. + """ + # Setup: Three preferred locations + preferred_locations = [location1_name, location2_name, location3_name] + lc = refresh_location_cache(preferred_locations, use_multiple_write_locations=True) + db_acc = create_database_account(enable_multiple_writable_locations=True) + lc.perform_on_database_account_read(db_acc) + + # Mark location1 as unavailable + lc.mark_endpoint_unavailable_for_write(location1_endpoint, refresh_cache=True, context="test") + + # Check ordering: location2, location3 (healthy) should come before location1 (unavailable) + write_contexts = lc.get_write_regional_routing_contexts() + assert len(write_contexts) == 3 + assert write_contexts[0].get_primary() == location2_endpoint # First healthy + assert write_contexts[1].get_primary() == location3_endpoint # Second healthy + assert write_contexts[2].get_primary() == location1_endpoint # Unavailable at end + + # Mark location2 as unavailable too + lc.mark_endpoint_unavailable_for_write(location2_endpoint, refresh_cache=True, context="test") + + # Check ordering: location3 (healthy) should come before location1, location2 (unavailable) + write_contexts = lc.get_write_regional_routing_contexts() + assert len(write_contexts) == 3 + assert write_contexts[0].get_primary() == location3_endpoint # Only healthy + # Unavailable ones at end, in original preferred order + assert write_contexts[1].get_primary() == location1_endpoint + assert write_contexts[2].get_primary() == location2_endpoint + if __name__ == "__main__": unittest.main()