From 6dee412c25b5775598391e4a805516a5a8bd4d46 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 16 May 2026 10:23:37 -0500 Subject: [PATCH] Raise on any mid-pagination failure instead of silently truncating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_walk_pages` and `get_stats_data`'s pagination loops have, since PR #273, logged failures correctly but preserved a "best effort" contract of returning whatever pages had been collected when a follow-up page failed. The waterdata API exposes no resume cursor once a paginated walk is interrupted, so the partial DataFrame couldn't reliably be extended — silently returning it handed callers truncated data they had no way to know was truncated. Both loops now wrap any mid-pagination exception (429, 5xx, network error) in a ``RuntimeError`` carrying: - the number of pages successfully collected, - the upstream cause (as both the message text and ``__cause__`` for programmatic inspection), - a short menu of recovery actions (wait and retry, reduce request size, or obtain an API token). The shared helper ``_paginated_failure_message`` builds the user- facing string so both loops stay aligned. Behavior change: callers that previously consumed partial DataFrames on transient upstream blips will now see an exception they need to handle (typically: retry, possibly with a smaller ``limit`` or narrower query). Called out in NEWS. Tests: - Replaced the prior best-effort-preserves-partial assertions with raises-with-cause-chain assertions for all three failure modes (connection error, 5xx, 429), in both ``_walk_pages`` and ``get_stats_data`` variants. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 +- dataretrieval/waterdata/utils.py | 45 ++++++++++-- tests/waterdata_utils_test.py | 116 ++++++++++++++++++++----------- 3 files changed, 116 insertions(+), 47 deletions(-) diff --git a/NEWS.md b/NEWS.md index 18fb12b5..7761e29b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -**05/14/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. +**05/16/2026:** Fixed silent truncation in the paginated `waterdata` request loops (`_walk_pages` and `get_stats_data`). Mid-pagination failures (HTTP 429, 5xx, network error) were previously swallowed — pagination would quietly stop and the function would return whatever rows it had collected, leaving callers with truncated DataFrames they had no way to detect. The loops now status-check every page like the initial request and raise `RuntimeError` on any failure, with the upstream exception chained as `__cause__` and a short menu of recovery actions (wait and retry, reduce the request, or obtain an API token) in the message. **Behavior change**: callers that previously consumed partial DataFrames on transient upstream blips will now see an exception; retry the call (possibly with a smaller `limit` or narrower query). **05/07/2026:** Bumped the declared minimum Python version from **3.8** to **3.9** (`pyproject.toml`'s `requires-python` and the ruff target). This brings the manifest in line with what was already being tested — CI's matrix has long covered only 3.9, 3.13, and 3.14, the `waterdata` test module already skipped itself on Python < 3.10, and several modules already use 3.9-only stdlib (e.g. `zoneinfo`). Users on 3.8 will no longer be able to install the package; please upgrade. diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 91228357..9245bb92 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -422,6 +422,32 @@ def _raise_for_non_200(resp: requests.Response) -> None: raise RuntimeError(_error_body(resp)) +def _paginated_failure_message(pages_collected: int, cause: BaseException) -> str: + """User-facing message for a mid-pagination failure. + + The API exposes no resume cursor, so the caller's only recovery is + to retry the whole call — the message lists the practical knobs, + tailored to whether the failure was rate-limit (429) or something + else. + """ + cause_str = str(cause).removesuffix(".") + # Some ``requests`` exceptions (e.g. ``Timeout()`` with no args) + # stringify to empty; fall back to the class name so the wrapper + # message is always informative. + if not cause_str.strip(): + cause_str = type(cause).__name__ + if cause_str.startswith("429"): + action = "wait for the rate-limit window to reset and retry" + else: + action = "retry the request (possibly after a short backoff)" + return ( + f"Paginated request failed after collecting {pages_collected} " + f"page(s): {cause_str}. To recover: {action}, reduce the " + f"request size (e.g. fewer locations, a shorter time range, or " + f"a smaller ``limit``), or obtain an API token." + ) + + def _construct_api_requests( service: str, properties: list[str] | None = None, @@ -647,8 +673,17 @@ def _walk_pages( Raises ------ - Exception - If a request fails/returns a non-200 status code. + RuntimeError + On a non-200 initial response (bare message from ``_error_body``) + or any failure on a subsequent page (wrapped message built by + ``_paginated_failure_message`` with the original exception + chained as ``__cause__``). + requests.exceptions.RequestException + Network-level failures on the *initial* request (e.g. + ``ConnectionError``, ``Timeout``) propagate unmodified to + preserve their specific type for callers that branch on it. + Equivalent failures on *subsequent* pages are caught and + re-raised as ``RuntimeError`` per the rule above. """ logger.info("Requesting: %s", req.url) @@ -690,11 +725,10 @@ def _walk_pages( dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception as e: # noqa: BLE001 - logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. Data download interrupted.", curr_url ) - curr_url = None + raise RuntimeError(_paginated_failure_message(len(dfs), e)) from e # Concatenate all pages at once for efficiency return pd.concat(dfs, ignore_index=True), initial_response @@ -1154,14 +1188,13 @@ def get_stats_data( all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] except Exception as e: # noqa: BLE001 - logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s (next_token=%s). " "Data download interrupted.", url, next_token, ) - next_token = None + raise RuntimeError(_paginated_failure_message(len(all_dfs), e)) from e dfs = pd.concat(all_dfs, ignore_index=True) if len(all_dfs) > 1 else all_dfs[0] diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index b05587e2..78868c38 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -2,6 +2,7 @@ from unittest import mock import pandas as pd +import pytest import requests import dataretrieval.waterdata.utils as _utils_module @@ -106,12 +107,6 @@ def _resp_ok(features): return resp -def _error_log_messages(caplog): - """Pull ERROR-and-above message strings out of caplog. Shared by the - pagination-failure tests below.""" - return [r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR] - - def _walk_pages_with_failure(failure_resp_or_exc): """Run _walk_pages where page 1 succeeds and page 2 fails as given.""" resp1 = _resp_ok([{"id": "1", "properties": {"val": "a"}}]) @@ -131,23 +126,38 @@ def _walk_pages_with_failure(failure_resp_or_exc): return _walk_pages(geopd=False, req=mock_req, client=mock_client) -def test_walk_pages_logs_actual_exception_when_request_raises(caplog): - """Exception from client.request() must be logged with its actual message.""" - caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) +def test_walk_pages_raises_on_connection_error_mid_pagination(): + """A connection error mid-pagination must raise with the upstream cause + chained, and the wrapper message must include recovery guidance that + is NOT rate-limit-specific (no quota window involved).""" + with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + _walk_pages_with_failure(requests.ConnectionError("boom")) - df, _ = _walk_pages_with_failure(requests.ConnectionError("boom")) + msg = str(excinfo.value) + assert isinstance(excinfo.value.__cause__, requests.ConnectionError) + assert "boom" in msg + assert "retry the request" in msg + assert "rate-limit window" not in msg - # First page's data is preserved (best-effort behavior). - assert list(df["val"]) == ["a"] - # Logged error mentions the actual ConnectionError, not a stale page body. - messages = _error_log_messages(caplog) - assert any("boom" in m for m in messages), messages +def test_walk_pages_raises_with_class_name_when_cause_stringifies_empty(): + """Some ``requests`` exceptions (e.g. ``Timeout()`` with no args) + stringify to ``""``. The wrapper must still produce an informative + message — fall back to the exception class name.""" + with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + _walk_pages_with_failure(requests.Timeout()) -def test_walk_pages_surfaces_5xx_mid_pagination(caplog): - """A non-200 mid-pagination response must be logged, not silently swallowed.""" - caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) + msg = str(excinfo.value) + assert "Timeout" in msg, msg + # Sanity-check the malformed-empty placeholder didn't slip through. + assert "page(s): ." not in msg + assert "page(s): To recover" not in msg + +def test_walk_pages_raises_on_5xx_mid_pagination(): + """A 5xx mid-pagination must raise — partial data is no longer returned + because the API has no resume cursor, so silently truncating is the + wrong default.""" page2_503 = mock.MagicMock() page2_503.status_code = 503 page2_503.json.return_value = { @@ -156,11 +166,27 @@ def test_walk_pages_surfaces_5xx_mid_pagination(caplog): } page2_503.url = "https://example.com/page2" - df, _ = _walk_pages_with_failure(page2_503) + with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + _walk_pages_with_failure(page2_503) + + msg = str(excinfo.value) + assert "503" in msg or "ServiceUnavailable" in msg + assert "rate-limit window" not in msg # not rate-limited + + +def test_walk_pages_raises_on_mid_pagination_429(): + """A 429 mid-pagination must raise. Specific status code is preserved in + the chained cause so callers can branch on rate-limit vs other failures.""" + page2_429 = mock.MagicMock() + page2_429.status_code = 429 + page2_429.url = "https://example.com/page2" + + with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + _walk_pages_with_failure(page2_429) - assert list(df["val"]) == ["a"] - messages = _error_log_messages(caplog) - assert any("503" in m or "ServiceUnavailable" in m for m in messages), messages + msg = str(excinfo.value) + assert "429" in msg + assert "rate-limit window" in msg # 429-specific guidance present def _stats_initial_ok(): @@ -205,23 +231,20 @@ def _run_get_stats_data_with_failure(failure_resp_or_exc, monkeypatch): ) -def test_get_stats_data_logs_actual_exception_when_request_raises(caplog, monkeypatch): - """get_stats_data variant of the connection-error scenario.""" - caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) +def test_get_stats_data_raises_on_connection_error_mid_pagination(monkeypatch): + """get_stats_data variant of the connection-error-raises contract.""" + with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + _run_get_stats_data_with_failure( + requests.ConnectionError("stats-boom"), + monkeypatch, + ) - _run_get_stats_data_with_failure( - requests.ConnectionError("stats-boom"), - monkeypatch, - ) - - messages = _error_log_messages(caplog) - assert any("stats-boom" in m for m in messages), messages + assert isinstance(excinfo.value.__cause__, requests.ConnectionError) + assert "stats-boom" in str(excinfo.value) -def test_get_stats_data_surfaces_5xx_mid_pagination(caplog, monkeypatch): - """get_stats_data variant of the mid-pagination 5xx scenario.""" - caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) - +def test_get_stats_data_raises_on_5xx_mid_pagination(monkeypatch): + """get_stats_data variant of the 5xx-raises contract.""" page2_503 = mock.MagicMock() page2_503.status_code = 503 page2_503.json.return_value = { @@ -230,10 +253,22 @@ def test_get_stats_data_surfaces_5xx_mid_pagination(caplog, monkeypatch): } page2_503.url = "https://example.com/stats?service=foo&next_token=tok2" - _run_get_stats_data_with_failure(page2_503, monkeypatch) + with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + _run_get_stats_data_with_failure(page2_503, monkeypatch) + + assert "503" in str(excinfo.value) or "ServiceUnavailable" in str(excinfo.value) + + +def test_get_stats_data_raises_on_mid_pagination_429(monkeypatch): + """get_stats_data variant of the 429-raises contract.""" + page2_429 = mock.MagicMock() + page2_429.status_code = 429 + page2_429.url = "https://example.com/stats?service=foo&next_token=tok2" + + with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + _run_get_stats_data_with_failure(page2_429, monkeypatch) - messages = _error_log_messages(caplog) - assert any("503" in m or "ServiceUnavailable" in m for m in messages), messages + assert "429" in str(excinfo.value) def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): @@ -249,7 +284,8 @@ def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): "description": "upstream timeout", } - _run_get_stats_data_with_failure(page2_503, monkeypatch) + with pytest.raises(RuntimeError): + _run_get_stats_data_with_failure(page2_503, monkeypatch) warnings_ = [r.getMessage() for r in caplog.records if r.levelno == logging.WARNING] # The initial response from _stats_initial_ok carries next=tok2.