From 1295e91d8bf57039627c7cd74e096d7eaeffbf68 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 8 Aug 2025 17:19:37 -0500 Subject: [PATCH 01/56] start adding functions --- dataretrieval/waterdata.py | 46 ++++- dataretrieval/waterdata_helpers.py | 265 +++++++++++++++++++++++++++++ 2 files changed, 306 insertions(+), 5 deletions(-) create mode 100644 dataretrieval/waterdata_helpers.py diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index ceed581e..f28d5293 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -7,13 +7,14 @@ import json from io import StringIO -from typing import TYPE_CHECKING, Literal, get_args +from typing import TYPE_CHECKING, Literal, List, get_args import pandas as pd import requests from requests.models import PreparedRequest from dataretrieval.utils import BaseMetadata, to_str +import dataretrieval.waterdata_helpers if TYPE_CHECKING: from typing import Optional, Tuple, Union @@ -21,7 +22,9 @@ from pandas import DataFrame -_BASE_URL = "https://api.waterdata.usgs.gov/samples-data" +_BASE_URL = "https://api.waterdata.usgs.gov/" + +_SAMPLES_URL = _BASE_URL + "samples-data" _CODE_SERVICES = Literal[ "characteristicgroup", @@ -34,7 +37,6 @@ "states", ] - _SERVICES = Literal["activities", "locations", "organizations", "projects", "results"] _PROFILES = Literal[ @@ -72,6 +74,40 @@ ], } +def get_daily( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + properties: Optional[List[str]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + daily_id: Optional[Union[str, List[str]]] = None, + approval_status: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + qualifier: Optional[Union[str, List[str]]] = None, + value: Optional[Union[str, List[str]]] = None, + last_modified: Optional[str] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True + ) -> pd.DataFrame: + + service = "daily" + output_id = "daily_id" + + return_list = _get_ogc_data( + + ) + +def get_monitoring_locations(): + +def get_ts_meta(): + +def get_latest_continuous(): + +def get_field_measurements(): def get_codes(code_service: _CODE_SERVICES) -> DataFrame: """Return codes from a Samples code service. @@ -90,7 +126,7 @@ def get_codes(code_service: _CODE_SERVICES) -> DataFrame: f"Valid options are: {valid_code_services}." 
) - url = f"{_BASE_URL}/codeservice/{code_service}?mimeType=application%2Fjson" + url = f"{_SAMPLES_URL}/codeservice/{code_service}?mimeType=application%2Fjson" response = requests.get(url) @@ -305,7 +341,7 @@ def get_samples( if "boundingBox" in params: params["boundingBox"] = to_str(params["boundingBox"]) - url = f"{_BASE_URL}/{service}/{profile}" + url = f"{_SAMPLES_URL}/{service}/{profile}" req = PreparedRequest() req.prepare_url(url, params=params) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py new file mode 100644 index 00000000..6ea14232 --- /dev/null +++ b/dataretrieval/waterdata_helpers.py @@ -0,0 +1,265 @@ +import httpx +import os +import warnings +from typing import List, Dict, Any, Optional, Union +from datetime import datetime +import pytz +import pandas as pd + +BASE_API = "https://api.waterdata.usgs.gov/ogcapi/" +API_VERSION = "v0" + +# --- Caching for repeated calls --- +_cached_base_url = None +def _base_url(): + global _cached_base_url + if _cached_base_url is None: + _cached_base_url = f"{BASE_API}{API_VERSION}/" + return _cached_base_url + +def _setup_api(service: str): + return f"{_base_url()}collections/{service}/items" + +def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str): + service_id = service.replace("-", "_") + "_id" + ls.setdefault("id", ls.pop(service_id, ls.pop(id_name, None))) + return ls + +def _switch_properties_id(properties: Optional[List[str]], id_name: str, service: str): + if not properties: + return [] + service_id = service.replace("-", "_") + "_id" + last_letter = service[-1] + service_id_singular = "" + if last_letter == "s": + service_singular = service[:-1] + service_id_singular = service_singular.replace("-", "_") + "_id" + # Replace id fields with "id" + id_fields = [service_id, service_id_singular, id_name] + properties = ["id" if p in id_fields else p.replace("-", "_") for p in properties] + # Remove unwanted fields + return [p for p in properties if p not in ["geometry", service_id]] + +def _format_api_dates(datetime_list: Union[str, List[Union[str, datetime]]], date: bool = False): + def _iso8601(dt): + if isinstance(dt, str): + return dt + elif isinstance(dt, datetime): + if dt.tzinfo is None: + dt = pytz.UTC.localize(dt) + return dt.isoformat() + return str(dt) + + if isinstance(datetime_list, str): + if not datetime_list: + return None + if "P" in datetime_list or "/" in datetime_list: + return datetime_list + return datetime_list + if isinstance(datetime_list, list): + datetime_list = [None if not d else d for d in datetime_list] + if all(d is None for d in datetime_list): + return None + if len(datetime_list) == 1: + d = datetime_list[0] + if isinstance(d, str) and ("P" in d or "/" in d): + return d + return datetime.strptime(d, "%Y-%m-%d").strftime("%Y-%m-%d") if date else _iso8601(d) + elif len(datetime_list) == 2: + dates = [datetime.strptime(str(d), "%Y-%m-%d").strftime("%Y-%m-%d") if date and d else _iso8601(d) if d else "" for d in datetime_list] + return "/".join(dates).replace("NA", "..") + else: + raise ValueError("datetime should only include 1-2 values") + return None + +def _explode_post(ls: Dict[str, Any]): + return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} + +def _cql2_param(parameter: Dict[str, List[str]]): + property_name = next(iter(parameter)) + parameters = [str(x) for x in parameter[property_name]] + return {"property": property_name, "parameter": parameters} + +def _default_headers(): + headers = { + 
"Accept-Encoding": "compress, gzip", + "Accept": "application/json", + "User-Agent": "python-dataretrieval/1.0", + "lang": "en-US" + } + token = os.getenv("API_USGS_PAT", "") + if token: + headers["X-Api-Key"] = token + return headers + +def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): + assert req_type in ["queryables", "schema"] + url = f"{_base_url()}collections/{endpoint}/{req_type}" + resp = httpx.get(url, headers=_default_headers()) + resp.raise_for_status() + return resp.json() + +def _error_body(resp: httpx.Response): + if resp.status_code == 429: + return resp.json().get('error', {}).get('message') + elif resp.status_code == 403: + return "Query request denied. Possible reasons include query exceeding server limits." + return resp.text + +def _get_collection(): + url = f"{_base_url()}openapi?f=json" + resp = httpx.get(url, headers=_default_headers()) + resp.raise_for_status() + return resp.json() + +def _get_description(service: str): + tags = _get_collection().get("tags", []) + for tag in tags: + if tag.get("name") == service: + return tag.get("description") + return None + +def _get_params(service: str): + url = f"{_base_url()}collections/{service}/schema" + resp = httpx.get(url, headers=_default_headers()) + resp.raise_for_status() + properties = resp.json().get("properties", {}) + return {k: v.get("description") for k, v in properties.items()} + +def construct_api_requests( + service: str, + properties: Optional[List[str]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + skipGeometry: bool = False, + **kwargs +): + baseURL = _setup_api(service) + single_params = {"datetime", "last_modified", "begin", "end", "time"} + params = {k: v for k, v in kwargs.items() if k in single_params} + params["skipGeometry"] = skipGeometry + # Limit logic + params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 + if max_results is not None and limit is not None and limit > max_results: + raise ValueError("limit cannot be greater than max_result") + post_params = _explode_post({k: v for k, v in kwargs.items() if k not in single_params}) + POST = bool(post_params) + + time_periods = {"last_modified", "datetime", "time", "begin", "end"} + for i in time_periods: + if i in params: + dates = service == "daily" and i != "last_modified" + params[i] = _format_api_dates(params[i], date=dates) + kwargs[i] = _format_api_dates(kwargs[i], date=dates) + + if bbox: + params["bbox"] = ",".join(map(str, bbox)) + if properties: + params["properties"] = ",".join(_switch_properties_id(properties, "monitoring_location_id", service)) + + headers = _default_headers() + if POST: + headers["Content-Type"] = "application/query-cql-json" + resp = httpx.post(baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) + else: + resp = httpx.get(baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + if resp.status_code != 200: + raise Exception(_error_body(resp)) + return resp.json() + +def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: + if return_list.empty: + if not properties or all(pd.isna(properties)): + schema = _check_OGC_requests(endpoint=service, req_type="schema") + properties = list(schema.get("properties", {}).keys()) + return pd.DataFrame(columns=properties) + return return_list + +def _rejigger_cols(df: pd.DataFrame, properties: 
Optional[List[str]], output_id: str) -> pd.DataFrame: + if properties and not all(pd.isna(properties)): + if "id" not in properties: + if output_id in properties: + df = df.rename(columns={"id": output_id}) + else: + plural = output_id.replace("_id", "s_id") + if plural in properties: + df = df.rename(columns={"id": plural}) + return df.loc[:, [col for col in properties if col in df.columns]] + else: + return df.rename(columns={"id": output_id}) + +def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: + if "qualifier" in df.columns: + df["qualifier"] = df["qualifier"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x) + if "time" in df.columns and service == "daily": + df["time"] = pd.to_datetime(df["time"]).dt.date + for col in ["value", "contributing_drainage_area"]: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors="coerce") + return df + +def _next_req_url(resp: httpx.Response, req_url: str) -> Optional[str]: + body = resp.json() + if not body.get("numberReturned"): + return None + header_info = resp.headers + if os.getenv("API_USGS_PAT", ""): + print("Remaining requests this hour:", header_info.get("x-ratelimit-remaining", "")) + for link in body.get("links", []): + if link.get("rel") == "next": + return link.get("href") + return None + +def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: + body = resp.json() + if not body.get("numberReturned"): + return pd.DataFrame() + df = pd.DataFrame(body.get("features", [])) + for col in ["geometry", "AsGeoJSON(geometry)"]: + if col in df.columns: + df = df.drop(columns=[col]) + return df + +def _walk_pages(req_url: str, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: + print(f"Requesting:\n{req_url}") + client = client or httpx.Client() + if max_results is None or pd.isna(max_results): + dfs = [] + curr_url = req_url + failures = [] + while curr_url: + try: + resp = client.get(curr_url) + resp.raise_for_status() + df1 = _get_resp_data(resp) + dfs.append(df1) + curr_url = _next_req_url(resp, curr_url) + except Exception: + failures.append(curr_url) + curr_url = None + if failures: + print(f"There were {len(failures)} failed requests.") + return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame() + else: + resp = client.get(req_url) + resp.raise_for_status() + return _get_resp_data(resp) + +def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: + args = args.copy() # Don't mutate input + args["service"] = service + max_results = args.pop("max_results", None) + args = _switch_arg_id(args, id_name=output_id, service=service) + properties = args.get("properties") + args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) + convertType = args.pop("convertType", False) + req_url = construct_api_requests(**args) + return_list = _walk_pages(req_url, max_results) + return_list = _deal_with_empty(return_list, properties, service) + if convertType: + return_list = _cleanup_cols(return_list, service=service) + return_list = _rejigger_cols(return_list, properties, output_id) + # Metadata + return_list.attrs.update(request=req_url, queryTime=pd.Timestamp.now()) + return return_list \ No newline at end of file From c4b0b9ae1e02f15d2c5f1e3407ba390621e7ad90 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Wed, 27 Aug 2025 17:26:12 -0500 Subject: [PATCH 02/56] start adding documentation and going through functions --- dataretrieval/waterdata.py | 30 +++++++--- dataretrieval/waterdata_helpers.py | 93 
+++++++++++++++++++++++++----- 2 files changed, 100 insertions(+), 23 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index f28d5293..cb0ec592 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -14,7 +14,7 @@ from requests.models import PreparedRequest from dataretrieval.utils import BaseMetadata, to_str -import dataretrieval.waterdata_helpers +from dataretrieval import waterdata_helpers if TYPE_CHECKING: from typing import Optional, Tuple, Union @@ -97,17 +97,33 @@ def get_daily( service = "daily" output_id = "daily_id" - return_list = _get_ogc_data( + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers._get_ogc_data(args, output_id, service) + +# def get_monitoring_locations(): +# service = "monitoring-locations" +# output_id = "monitoring_location_id" - ) +# # Build argument dictionary, omitting None values +# args = { +# k: v for k, v in locals().items() +# if k not in {"service", "output_id"} and v is not None +# } +# args["convertType"] = False -def get_monitoring_locations(): +# return _get_ogc_data(args, output_id, service) -def get_ts_meta(): +# def get_ts_meta(): -def get_latest_continuous(): +# def get_latest_continuous(): -def get_field_measurements(): +# def get_field_measurements(): def get_codes(code_service: _CODE_SERVICES) -> DataFrame: """Return codes from a Samples code service. diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 6ea14232..49b4b3d1 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -12,20 +12,79 @@ # --- Caching for repeated calls --- _cached_base_url = None def _base_url(): + """ + Returns the base URL for the USGS Water Data OGC API. + + Uses a cached value to avoid repeated string formatting. If the cached value + is not set, it constructs the base URL using the BASE_API and API_VERSION constants. + + Returns: + str: The base URL for the API (e.g., "https://api.waterdata.usgs.gov/ogcapi/v0/"). + """ global _cached_base_url if _cached_base_url is None: _cached_base_url = f"{BASE_API}{API_VERSION}/" return _cached_base_url def _setup_api(service: str): + """ + Constructs and returns the API endpoint URL for a specified service. + + Args: + service (str): The name of the service to be used in the API endpoint. + + Returns: + str: The full URL for the API endpoint corresponding to the given service. + + Example: + >>> _setup_api("daily") + 'https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items' + """ return f"{_base_url()}collections/{service}/items" def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str): + """ + Switch argument id from its package-specific identifier to the standardized "id" key + that the API recognizes. + + Sets the "id" key in the provided dictionary `ls` + with the value from either the service name or the expected id column name. + If neither key exists, "id" will be set to None. + + Example: for service "time-series-metadata", the function will look for either "time_series_metadata_id" + or "time_series_id" and change the key to simply "id". + + Args: + ls (Dict[str, Any]): The dictionary containing identifier keys to be standardized. + id_name (str): The name of the specific identifier key to look for. + service (str): The service name. 
+
+    Returns:
+        Dict[str, Any]: The modified dictionary with the "id" key set appropriately.
+    """
     service_id = service.replace("-", "_") + "_id"
     ls.setdefault("id", ls.pop(service_id, ls.pop(id_name, None)))
     return ls
 
 def _switch_properties_id(properties: Optional[List[str]], id_name: str, service: str):
+    """
+    Switch properties id from its package-specific identifier to the standardized "id" key
+    that the API recognizes.
+
+    Replaces any package-specific identifier names in the provided `properties` list with the
+    standardized "id" name and removes fields that the API does not accept.
+
+    Example: for service "monitoring-locations", it will look for "monitoring_location_id" and change
+    it to "id".
+
+    Args:
+        ls (Dict[str, Any]): The dictionary containing identifier keys to be standardized.
+        id_name (str): The name of the specific identifier key to look for.
+        service (str): The service name.
+
+    Returns:
+        Dict[str, Any]: The modified dictionary with the "id" key set appropriately.
+    """
     if not properties:
         return []
     service_id = service.replace("-", "_") + "_id"
@@ -87,7 +146,7 @@ def _default_headers():
         "User-Agent": "python-dataretrieval/1.0",
         "lang": "en-US"
     }
-    token = os.getenv("API_USGS_PAT", "")
+    token = os.getenv("API_USGS_PAT")
     if token:
         headers["X-Api-Key"] = token
     return headers
@@ -112,20 +171,6 @@ def _get_collection():
     resp.raise_for_status()
     return resp.json()
 
-def _get_description(service: str):
-    tags = _get_collection().get("tags", [])
-    for tag in tags:
-        if tag.get("name") == service:
-            return tag.get("description")
-    return None
-
-def _get_params(service: str):
-    url = f"{_base_url()}collections/{service}/schema"
-    resp = httpx.get(url, headers=_default_headers())
-    resp.raise_for_status()
-    properties = resp.json().get("properties", {})
-    return {k: v.get("description") for k, v in properties.items()}
-
 def construct_api_requests(
     service: str,
     properties: Optional[List[str]] = None,
@@ -159,6 +204,7 @@ def construct_api_requests(
     headers = _default_headers()
+    print({**params, **{k: v for k, v in kwargs.items() if k not in single_params}})
     if POST:
         headers["Content-Type"] = "application/query-cql-json"
         resp = httpx.post(baseURL, headers=headers, json={"params": list(post_params.values())}, params=params)
     else:
         resp = httpx.get(baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}})
@@ -262,4 +308,19 @@ def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.Data
     return_list = _rejigger_cols(return_list, properties, output_id)
     # Metadata
     return_list.attrs.update(request=req_url, queryTime=pd.Timestamp.now())
-    return return_list
\ No newline at end of file
+    return return_list
+
+
+# def _get_description(service: str):
+#     tags = _get_collection().get("tags", [])
+#     for tag in tags:
+#         if tag.get("name") == service:
+#             return tag.get("description")
+#     return None
+
+# def _get_params(service: str):
+#     url = f"{_base_url()}collections/{service}/schema"
+#     resp = httpx.get(url, headers=_default_headers())
+#     resp.raise_for_status()
+#     properties = resp.json().get("properties", {})
+#     return {k: v.get("description") for k, v in properties.items()}
\ No newline at end of file

From c32ded583cc4890f9a8abef2b67411b0b4e94b9b Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Thu, 28 Aug 2025 17:06:24 -0500
Subject: [PATCH 03/56] adjust date function

---
 dataretrieval/waterdata_helpers.py | 81 ++++++++++++++++++------------
 1 file changed, 50 insertions(+), 31 deletions(-)

diff --git
a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 49b4b3d1..9cb6a45d 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -5,6 +5,9 @@ from datetime import datetime import pytz import pandas as pd +from datetime import datetime +from zoneinfo import ZoneInfo +import re BASE_API = "https://api.waterdata.usgs.gov/ogcapi/" API_VERSION = "v0" @@ -78,12 +81,12 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service it to "id". Args: - ls (Dict[str, Any]): The dictionary containing identifier keys to be standardized. + properties (List[str]): A list containing the properties or column names to be pulled from the service. id_name (str): The name of the specific identifier key to look for. service (str): The service name. Returns: - Dict[str, Any]: The modified dictionary with the "id" key set appropriately. + List[str]: The modified list with the "id" key set appropriately. """ if not properties: return [] @@ -99,37 +102,53 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service # Remove unwanted fields return [p for p in properties if p not in ["geometry", service_id]] -def _format_api_dates(datetime_list: Union[str, List[Union[str, datetime]]], date: bool = False): - def _iso8601(dt): - if isinstance(dt, str): +def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: + # Get timezone + local_timezone = ZoneInfo.local() + + # Return empty strings as None + if isinstance(datetime_input, str) and datetime_input.strip() == "": + return None + + # Convert single string to list for uniform processing + if isinstance(datetime_input, str): + datetime_input = [datetime_input] + + # Check for null or all NA and return None + if all(pd.isna(dt) or dt == "" for dt in datetime_input): + return None + # If the list is of length 1, first look for things like "P7D" or dates + # already formatted in ISO08601. Otherwise, try to coerce to datetime + if len(datetime_input) == 1: + dt = datetime_input[0] + if re.search(r"P", dt, re.IGNORECASE) or "/" in dt: return dt - elif isinstance(dt, datetime): - if dt.tzinfo is None: - dt = pytz.UTC.localize(dt) - return dt.isoformat() - return str(dt) - - if isinstance(datetime_list, str): - if not datetime_list: - return None - if "P" in datetime_list or "/" in datetime_list: - return datetime_list - return datetime_list - if isinstance(datetime_list, list): - datetime_list = [None if not d else d for d in datetime_list] - if all(d is None for d in datetime_list): - return None - if len(datetime_list) == 1: - d = datetime_list[0] - if isinstance(d, str) and ("P" in d or "/" in d): - return d - return datetime.strptime(d, "%Y-%m-%d").strftime("%Y-%m-%d") if date else _iso8601(d) - elif len(datetime_list) == 2: - dates = [datetime.strptime(str(d), "%Y-%m-%d").strftime("%Y-%m-%d") if date and d else _iso8601(d) if d else "" for d in datetime_list] - return "/".join(dates).replace("NA", "..") else: - raise ValueError("datetime should only include 1-2 values") - return None + try: + parsed_dt = pd.to_datetime(dt) + # If the service only accepts dates for this input, not datetimes (e.g. "daily"), + # return just the date, otherwise, return the datetime in UTC format. 
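+                    # For example (a hypothetical input, assuming a US Central local clock):
+                    # "2024-06-01 12:00:00" becomes "2024-06-01" when date=True,
+                    # or "2024-06-01T17:00:00Z" once converted to UTC.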
+ if date: + return parsed_dt.strftime("%Y-%m-%d") + else: + parsed_dt.strftime("%Y-%m-%dT%H:%M:%SZ") + parsed_dt.replace(tzinfo=local_timezone) + return parsed_dt.astimezone(pytz.UTC) + except Exception: + return None + + elif len(datetime_input) == 2: + try: + parsed_dates = [pd.to_datetime(dt) for dt in datetime_input] + if date: + formatted = "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) + else: + formatted = "/".join(dt.strftime("%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=local_timezone).astimezone(pytz.UTC) for dt in parsed_dates) + return formatted.replace("", "..") + except Exception: + return None + else: + raise ValueError("datetime_input should only include 1-2 values") def _explode_post(ls: Dict[str, Any]): return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} From 99e949cadb665b22706256302d1f9954e206d23f Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 29 Aug 2025 12:30:21 -0500 Subject: [PATCH 04/56] fix dates function --- dataretrieval/waterdata_helpers.py | 32 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 9cb6a45d..6793b09b 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -5,6 +5,7 @@ from datetime import datetime import pytz import pandas as pd +import numpy as np from datetime import datetime from zoneinfo import ZoneInfo import re @@ -104,19 +105,19 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: # Get timezone - local_timezone = ZoneInfo.local() - - # Return empty strings as None - if isinstance(datetime_input, str) and datetime_input.strip() == "": - return None - + local_timezone = datetime.now().astimezone().tzinfo + # Convert single string to list for uniform processing if isinstance(datetime_input, str): datetime_input = [datetime_input] - + # Check for null or all NA and return None - if all(pd.isna(dt) or dt == "" for dt in datetime_input): + if all(pd.isna(dt) or dt == "" or dt == None for dt in datetime_input): return None + + # Replace all blanks with "nan" + datetime_input = ["nan" if x == "" else x for x in datetime_input] + # If the list is of length 1, first look for things like "P7D" or dates # already formatted in ISO08601. Otherwise, try to coerce to datetime if len(datetime_input) == 1: @@ -125,26 +126,27 @@ def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) return dt else: try: - parsed_dt = pd.to_datetime(dt) + # Parse to naive datetime + parsed_dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") # If the service only accepts dates for this input, not datetimes (e.g. "daily"), # return just the date, otherwise, return the datetime in UTC format. 
if date: return parsed_dt.strftime("%Y-%m-%d") else: - parsed_dt.strftime("%Y-%m-%dT%H:%M:%SZ") - parsed_dt.replace(tzinfo=local_timezone) - return parsed_dt.astimezone(pytz.UTC) + dt_local = parsed_dt.replace(tzinfo=local_timezone) + # Convert to UTC and format as ISO 8601 with 'Z' + return dt_local.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") except Exception: return None elif len(datetime_input) == 2: try: - parsed_dates = [pd.to_datetime(dt) for dt in datetime_input] + parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input] if date: formatted = "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) else: - formatted = "/".join(dt.strftime("%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=local_timezone).astimezone(pytz.UTC) for dt in parsed_dates) - return formatted.replace("", "..") + formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_dates) + return formatted.replace("nan", "..") except Exception: return None else: From 1641e851b2229814051b5355d9fcf74444979f06 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 29 Aug 2025 14:52:19 -0500 Subject: [PATCH 05/56] keep working out issues with api calls --- dataretrieval/waterdata_helpers.py | 58 +++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 6793b09b..ce8413b0 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -66,10 +66,22 @@ def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str): Returns: Dict[str, Any]: The modified dictionary with the "id" key set appropriately. """ + service_id = service.replace("-", "_") + "_id" - ls.setdefault("id", ls.pop(service_id, ls.pop(id_name, None))) + + if "id" not in ls: + if service_id in ls: + ls["id"] = ls[service_id] + elif id_name in ls: + ls["id"] = ls[id_name] + + # Remove the original keys regardless of whether they were used + ls.pop(service_id, None) + ls.pop(id_name, None) + return ls + def _switch_properties_id(properties: Optional[List[str]], id_name: str, service: str): """ Switch properties id from its package-specific identifier to the standardized "id" key @@ -103,7 +115,7 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service # Remove unwanted fields return [p for p in properties if p not in ["geometry", service_id]] -def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: +def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: # Get timezone local_timezone = datetime.now().astimezone().tzinfo @@ -138,7 +150,8 @@ def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) return dt_local.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") except Exception: return None - + # If the list is of length 2, parse the dates and if necessary, combine them together into + # the date range format accepted by the API elif len(datetime_input) == 2: try: parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input] @@ -186,13 +199,7 @@ def _error_body(resp: httpx.Response): return "Query request denied. Possible reasons include query exceeding server limits." 
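     # Fall back to the raw response text for any other status code.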
return resp.text -def _get_collection(): - url = f"{_base_url()}openapi?f=json" - resp = httpx.get(url, headers=_default_headers()) - resp.raise_for_status() - return resp.json() - -def construct_api_requests( +def _construct_api_requests( service: str, properties: Optional[List[str]] = None, bbox: Optional[List[float]] = None, @@ -209,9 +216,18 @@ def construct_api_requests( params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") - post_params = _explode_post({k: v for k, v in kwargs.items() if k not in single_params}) + + # Create post calls for any input parameters that are not in the single_params list + # and have more than one element associated with the list or tuple. + post_params = _explode_post({ + k: v for k, v in kwargs.items() + if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 + }) + + # Indicate if function needs to perform POST conversion POST = bool(post_params) + # Convert dates to ISO08601 format time_periods = {"last_modified", "datetime", "time", "begin", "end"} for i in time_periods: if i in params: @@ -219,18 +235,21 @@ def construct_api_requests( params[i] = _format_api_dates(params[i], date=dates) kwargs[i] = _format_api_dates(kwargs[i], date=dates) + # String together bbox elements from a list to a comma-separated string, + # and string together properties if provided if bbox: params["bbox"] = ",".join(map(str, bbox)) if properties: - params["properties"] = ",".join(_switch_properties_id(properties, "monitoring_location_id", service)) + params["properties"] = ",".join(properties) headers = _default_headers() - print({**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + if POST: headers["Content-Type"] = "application/query-cql-json" resp = httpx.post(baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) else: resp = httpx.get(baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + print(resp.url) if resp.status_code != 200: raise Exception(_error_body(resp)) return resp.json() @@ -313,7 +332,7 @@ def _walk_pages(req_url: str, max_results: Optional[int], client: Optional[httpx resp.raise_for_status() return _get_resp_data(resp) -def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: +def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: args = args.copy() # Don't mutate input args["service"] = service max_results = args.pop("max_results", None) @@ -321,7 +340,8 @@ def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.Data properties = args.get("properties") args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) convertType = args.pop("convertType", False) - req_url = construct_api_requests(**args) + args = {k: v for k, v in args.items() if v is not None} + req_url = _construct_api_requests(**args) return_list = _walk_pages(req_url, max_results) return_list = _deal_with_empty(return_list, properties, service) if convertType: @@ -344,4 +364,10 @@ def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.Data # resp = httpx.get(url, headers=_default_headers()) # resp.raise_for_status() # properties = resp.json().get("properties", {}) -# return {k: v.get("description") for k, v in properties.items()} \ No newline at end of file +# 
return {k: v.get("description") for k, v in properties.items()} + +# def _get_collection(): +# url = f"{_base_url()}openapi?f=json" +# resp = httpx.get(url, headers=_default_headers()) +# resp.raise_for_status() +# return resp.json() \ No newline at end of file From 7bc6c6f884631e09792b1884968362fb6830b869 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 29 Aug 2025 15:05:39 -0500 Subject: [PATCH 06/56] add documentation --- dataretrieval/waterdata_helpers.py | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index ce8413b0..841e1ed8 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -116,6 +116,31 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service return [p for p in properties if p not in ["geometry", service_id]] def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: + """ + Formats date or datetime input(s) for use with an API, handling single values or ranges, and converting to ISO 8601 or date-only formats as needed. + Parameters + ---------- + datetime_input : Union[str, List[str]] + A single date/datetime string or a list of one or two date/datetime strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601, or relative periods (e.g., "P7D"). + date : bool, optional + If True, returns only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ"). + Returns + ------- + Union[str, None] + - If input is a single value, returns the formatted date/datetime string or None if parsing fails. + - If input is a list of two values, returns a date/datetime range string separated by "/" (e.g., "YYYY-MM-DD/YYYY-MM-DD" or "YYYY-MM-DDTHH:MM:SSZ/YYYY-MM-DDTHH:MM:SSZ"). + - Returns None if input is empty, all NA, or cannot be parsed. + Raises + ------ + ValueError + If `datetime_input` contains more than two values. + Notes + ----- + - Handles blank or NA values by returning None. + - Supports relative period strings (e.g., "P7D") and passes them through unchanged. + - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when `date` is False. + - For date ranges, replaces "nan" with ".." in the output. + """ # Get timezone local_timezone = datetime.now().astimezone().tzinfo @@ -174,6 +199,14 @@ def _cql2_param(parameter: Dict[str, List[str]]): return {"property": property_name, "parameter": parameters} def _default_headers(): + """ + Generate default HTTP headers for API requests. + + Returns: + dict: A dictionary containing default headers including 'Accept-Encoding', + 'Accept', 'User-Agent', and 'lang'. If the environment variable 'API_USGS_PAT' + is set, its value is included as the 'X-Api-Key' header. + """ headers = { "Accept-Encoding": "compress, gzip", "Accept": "application/json", @@ -186,6 +219,20 @@ def _default_headers(): return headers def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): + """ + Sends an HTTP GET request to the specified OGC endpoint and request type, returning the JSON response. + + Args: + endpoint (str): The OGC collection endpoint to query. Defaults to "daily". + req_type (str): The type of request to make. Must be either "queryables" or "schema". Defaults to "queryables". + + Returns: + dict: The JSON response from the OGC endpoint. + + Raises: + AssertionError: If req_type is not "queryables" or "schema". 
+ httpx.HTTPStatusError: If the HTTP request returns an unsuccessful status code. + """ assert req_type in ["queryables", "schema"] url = f"{_base_url()}collections/{endpoint}/{req_type}" resp = httpx.get(url, headers=_default_headers()) From 1b29d6aecf4c30fa399da0a341b56f09a20decc1 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 08:59:33 -0500 Subject: [PATCH 07/56] adjust how response is handled and edit walk pages, fix API limit print --- dataretrieval/waterdata_helpers.py | 73 +++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 841e1ed8..331194ab 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -240,6 +240,17 @@ def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): return resp.json() def _error_body(resp: httpx.Response): + """ + Extracts and returns an error message from an HTTP response object based on its status code. + + Args: + resp (httpx.Response): The HTTP response object to extract the error message from. + + Returns: + str: The extracted error message. For status code 429, returns the 'message' field from the JSON error object. + For status code 403, returns a predefined message indicating possible reasons for denial. + For other status codes, returns the raw response text. + """ if resp.status_code == 429: return resp.json().get('error', {}).get('message') elif resp.status_code == 403: @@ -255,11 +266,35 @@ def _construct_api_requests( skipGeometry: bool = False, **kwargs ): + """ + Constructs an HTTP request object for the specified water data API service. + Depending on the input parameters, the function determines whether to use a GET or POST request, + formats parameters appropriately, and sets required headers. + Args: + service (str): The name of the API service to query (e.g., "daily"). + properties (Optional[List[str]], optional): List of property names to include in the request. + bbox (Optional[List[float]], optional): Bounding box coordinates as a list of floats. + limit (Optional[int], optional): Maximum number of results to return per request. + max_results (Optional[int], optional): Maximum number of results allowed by the API. + skipGeometry (bool, optional): Whether to exclude geometry from the response. + **kwargs: Additional query parameters, including date/time filters and other API-specific options. + Returns: + httpx.Request: The constructed HTTP request object ready to be sent. + Raises: + ValueError: If `limit` is greater than `max_results`. + Notes: + - Date/time parameters are automatically formatted to ISO8601. + - If multiple values are provided for non-single parameters, a POST request is constructed. + - The function sets appropriate headers for GET and POST requests. + """ baseURL = _setup_api(service) + # Single parameters can only have one value single_params = {"datetime", "last_modified", "begin", "end", "time"} params = {k: v for k, v in kwargs.items() if k in single_params} + # Set skipGeometry parameter params["skipGeometry"] = skipGeometry - # Limit logic + # If limit is none and max_results is not none, then set limit to max results. Otherwise, + # if max_results is none, set it to 10000 (the API max). 
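+    # For example: limit=None with max_results=500 requests 500 rows per page,
+    # while limit=None and max_results=None requests the 10000-row API maximum.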
params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") @@ -293,13 +328,10 @@ def _construct_api_requests( if POST: headers["Content-Type"] = "application/query-cql-json" - resp = httpx.post(baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) + req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) else: - resp = httpx.get(baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) - print(resp.url) - if resp.status_code != 200: - raise Exception(_error_body(resp)) - return resp.json() + req = httpx.Request(method="GET", url=baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + return req def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: if return_list.empty: @@ -341,7 +373,9 @@ def _next_req_url(resp: httpx.Response, req_url: str) -> Optional[str]: print("Remaining requests this hour:", header_info.get("x-ratelimit-remaining", "")) for link in body.get("links", []): if link.get("rel") == "next": - return link.get("href") + next_url = link.get("href") + print(f"Next URL: {next_url}") + return next_url return None def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: @@ -354,17 +388,23 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: df = df.drop(columns=[col]) return df -def _walk_pages(req_url: str, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: - print(f"Requesting:\n{req_url}") +def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: + print(f"Requesting:\n{req.url}") + + # Get first response from client + # using GET or POST call client = client or httpx.Client() + resp = client.send(req) + if resp.status_code != 200: raise Exception(_error_body(resp)) + if max_results is None or pd.isna(max_results): dfs = [] - curr_url = req_url + curr_url = _next_req_url(resp, req.url) failures = [] while curr_url: try: - resp = client.get(curr_url) - resp.raise_for_status() + resp = client.get(curr_url, headers=_default_headers()) + if resp.status_code != 200: raise Exception(_error_body(resp)) df1 = _get_resp_data(resp) dfs.append(df1) curr_url = _next_req_url(resp, curr_url) @@ -375,7 +415,6 @@ def _walk_pages(req_url: str, max_results: Optional[int], client: Optional[httpx print(f"There were {len(failures)} failed requests.") return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame() else: - resp = client.get(req_url) resp.raise_for_status() return _get_resp_data(resp) @@ -388,14 +427,14 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) convertType = args.pop("convertType", False) args = {k: v for k, v in args.items() if v is not None} - req_url = _construct_api_requests(**args) - return_list = _walk_pages(req_url, max_results) + req = _construct_api_requests(**args) + return_list = _walk_pages(req, max_results) return_list = _deal_with_empty(return_list, properties, service) if convertType: return_list = _cleanup_cols(return_list, service=service) return_list = _rejigger_cols(return_list, properties, output_id) # 
Metadata - return_list.attrs.update(request=req_url, queryTime=pd.Timestamp.now()) + return_list.attrs.update(request=req.url, queryTime=pd.Timestamp.now()) return return_list From 3289982351dc95be62e4d194468f7cbbb9361e8b Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 09:09:25 -0500 Subject: [PATCH 08/56] add documentation --- dataretrieval/waterdata_helpers.py | 72 +++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 331194ab..17a80306 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -334,6 +334,21 @@ def _construct_api_requests( return req def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: + """ + Handles empty DataFrame results by returning a DataFrame with appropriate columns. + + If `return_list` is empty, determines the column names to use: + - If `properties` is not provided or contains only NaN values, retrieves the schema properties from the specified service. + - Otherwise, uses the provided `properties` list as column names. + + Args: + return_list (pd.DataFrame): The DataFrame to check for emptiness. + properties (Optional[List[str]]): List of property names to use as columns, or None. + service (str): The service endpoint to query for schema properties if needed. + + Returns: + pd.DataFrame: The original DataFrame if not empty, otherwise an empty DataFrame with the appropriate columns. + """ if return_list.empty: if not properties or all(pd.isna(properties)): schema = _check_OGC_requests(endpoint=service, req_type="schema") @@ -342,6 +357,23 @@ def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], return return_list def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: + """ + Rearranges and renames columns in a DataFrame based on provided properties and output identifier. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame whose columns are to be rearranged or renamed. + properties : Optional[List[str]] + A list of column names to possibly rename. If None or contains only NaN, the function will rename 'id' to output_id. + output_id : str + The name to which the 'id' column should be renamed if applicable. + + Returns + ------- + pd.DataFrame + The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. + """ if properties and not all(pd.isna(properties)): if "id" not in properties: if output_id in properties: @@ -355,6 +387,27 @@ def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: return df.rename(columns={"id": output_id}) def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: + """ + Cleans and standardizes columns in a pandas DataFrame for water data endpoints. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame containing water data. + service : str, optional + The type of water data service (default is "daily"). + + Returns + ------- + pd.DataFrame + The cleaned DataFrame with standardized columns. + + Notes + ----- + - If the 'qualifier' column exists, lists are joined into comma-separated strings. + - If the 'time' column exists and service is "daily", it is converted to date objects. + - The 'value' and 'contributing_drainage_area' columns are coerced to numeric types. 
+ """ if "qualifier" in df.columns: df["qualifier"] = df["qualifier"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x) if "time" in df.columns and service == "daily": @@ -364,7 +417,24 @@ def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: df[col] = pd.to_numeric(df[col], errors="coerce") return df -def _next_req_url(resp: httpx.Response, req_url: str) -> Optional[str]: +def _next_req_url(resp: httpx.Response) -> Optional[str]: + """ + Extracts the URL for the next page of results from an HTTP response from a water data endpoint. + + Parameters: + resp (httpx.Response): The HTTP response object containing JSON data and headers. + + Returns: + Optional[str]: The URL for the next page of results if available, otherwise None. + + Side Effects: + If the environment variable "API_USGS_PAT" is set, prints the remaining requests for the current hour. + Prints the next URL if found. + + Notes: + - Expects the response JSON to contain a "links" list with objects having "rel" and "href" keys. + - Checks for the "next" relation in the "links" to determine the next URL. + """ body = resp.json() if not body.get("numberReturned"): return None From 867d7283f550b11743e063f74c8b2cad43a39a80 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 13:30:10 -0500 Subject: [PATCH 09/56] add more documentation, correct waterdata module --- dataretrieval/waterdata.py | 4 +- dataretrieval/waterdata_helpers.py | 90 ++++++++++++++++++++++++++---- 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index cb0ec592..a2376952 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -104,7 +104,7 @@ def get_daily( } args["convertType"] = False - return waterdata_helpers._get_ogc_data(args, output_id, service) + return waterdata_helpers.get_ogc_data(args, output_id, service) # def get_monitoring_locations(): # service = "monitoring-locations" @@ -117,7 +117,7 @@ def get_daily( # } # args["convertType"] = False -# return _get_ogc_data(args, output_id, service) +# return waterdata_helpers.get_ogc_data(args, output_id, service) # def get_ts_meta(): diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 17a80306..d73ba3c7 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -404,12 +404,9 @@ def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: Notes ----- - - If the 'qualifier' column exists, lists are joined into comma-separated strings. - If the 'time' column exists and service is "daily", it is converted to date objects. - The 'value' and 'contributing_drainage_area' columns are coerced to numeric types. """ - if "qualifier" in df.columns: - df["qualifier"] = df["qualifier"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x) if "time" in df.columns and service == "daily": df["time"] = pd.to_datetime(df["time"]).dt.date for col in ["value", "contributing_drainage_area"]: @@ -449,16 +446,58 @@ def _next_req_url(resp: httpx.Response) -> Optional[str]: return None def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: + """ + Extracts and normalizes data from an httpx.Response object containing GeoJSON features. + + Parameters: + resp (httpx.Response): The HTTP response object expected to contain a JSON body with a "features" key. + + Returns: + pd.DataFrame: A pandas DataFrame containing the normalized feature properties. + Returns an empty DataFrame if no features are returned. 
+ + Notes: + - Drops columns "type", "geometry", and "AsGeoJSON(geometry)" if present. + - Flattens nested properties and removes the "properties_" prefix from column names. + """ body = resp.json() if not body.get("numberReturned"): return pd.DataFrame() - df = pd.DataFrame(body.get("features", [])) - for col in ["geometry", "AsGeoJSON(geometry)"]: - if col in df.columns: - df = df.drop(columns=[col]) + df = pd.json_normalize( + resp.json()["features"], + sep="_") + df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") + df.columns = [col.replace("properties_", "") for col in df.columns] return df def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: + """ + Iterates through paginated API responses and aggregates the results into a single DataFrame. + + Parameters + ---------- + req : httpx.Request + The initial HTTP request to send. + max_results : Optional[int] + The maximum number of results to retrieve. If None or NaN, retrieves all available pages. + client : Optional[httpx.Client], default None + An optional HTTP client to use for requests. If not provided, a new client is created. + + Returns + ------- + pd.DataFrame + A DataFrame containing the aggregated results from all pages. + + Raises + ------ + Exception + If a request fails or returns a non-200 status code. + + Notes + ----- + - If `max_results` is None or NaN, the function will continue to request subsequent pages until no more pages are available. + - Failed requests are tracked and reported, but do not halt the entire process unless the initial request fails. + """ print(f"Requesting:\n{req.url}") # Get first response from client @@ -469,7 +508,7 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional if max_results is None or pd.isna(max_results): dfs = [] - curr_url = _next_req_url(resp, req.url) + curr_url = _next_req_url(resp) failures = [] while curr_url: try: @@ -477,7 +516,7 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional if resp.status_code != 200: raise Exception(_error_body(resp)) df1 = _get_resp_data(resp) dfs.append(df1) - curr_url = _next_req_url(resp, curr_url) + curr_url = _next_req_url(resp) except Exception: failures.append(curr_url) curr_url = None @@ -489,21 +528,50 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional return _get_resp_data(resp) def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: - args = args.copy() # Don't mutate input + """ + Retrieves OGC (Open Geospatial Consortium) data from a specified water data endpoint and returns it as a pandas DataFrame. + + This function prepares request arguments, constructs API requests, handles pagination, processes the results, + and formats the output DataFrame according to the specified parameters. + + Args: + args (Dict[str, Any]): Dictionary of request arguments for the OGC service. + output_id (str): The name of the output identifier to use in the request. + service (str): The OGC service type (e.g., "wfs", "wms"). + + Returns: + pd.DataFrame: A DataFrame containing the retrieved and processed OGC data, with metadata attributes + including the request URL and query timestamp. + + Notes: + - The function does not mutate the input `args` dictionary. + - Handles optional arguments such as `max_results` and `convertType`. + - Applies column cleanup and reordering based on service and properties. 
+ - Metadata is attached to the DataFrame via the `.attrs` attribute. + """ + args = args.copy() + # Add service as an argument args["service"] = service + # Pull out a max results input if exists max_results = args.pop("max_results", None) + # Switch the input id to "id" if needed args = _switch_arg_id(args, id_name=output_id, service=service) properties = args.get("properties") + # Switch properties id to "id" if needed args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) convertType = args.pop("convertType", False) + # Create fresh dictionary of args without any None values args = {k: v for k, v in args.items() if v is not None} + # Build API request req = _construct_api_requests(**args) + # Run API request and iterate through pages if needed return_list = _walk_pages(req, max_results) + # Manage some aspects of the returned dataset return_list = _deal_with_empty(return_list, properties, service) if convertType: return_list = _cleanup_cols(return_list, service=service) return_list = _rejigger_cols(return_list, properties, output_id) - # Metadata + # Add metadata return_list.attrs.update(request=req.url, queryTime=pd.Timestamp.now()) return return_list From 44213b58378cb24796d28f4c3ff028be96253400 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 14:24:57 -0500 Subject: [PATCH 10/56] allow post and get calls in recursive walk pages, fix typo where first page not downloading, start to add more function outlines --- dataretrieval/waterdata.py | 59 +++++++++++++++++++++++------- dataretrieval/waterdata_helpers.py | 15 ++++++-- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index a2376952..10887af1 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -106,24 +106,57 @@ def get_daily( return waterdata_helpers.get_ogc_data(args, output_id, service) -# def get_monitoring_locations(): -# service = "monitoring-locations" -# output_id = "monitoring_location_id" +def get_monitoring_locations() -> pd.DataFrame: + service = "monitoring-locations" + output_id = "monitoring_location_id" -# # Build argument dictionary, omitting None values -# args = { -# k: v for k, v in locals().items() -# if k not in {"service", "output_id"} and v is not None -# } -# args["convertType"] = False + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers.get_ogc_data(args, output_id, service) + +def get_ts_meta() -> pd.DataFrame: + service = "time-series-metadata" + output_id = "time_series_id" + + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers.get_ogc_data(args, output_id, service) -# return waterdata_helpers.get_ogc_data(args, output_id, service) +def get_latest_continuous() -> pd.DataFrame: + service = "latest-continuous" + output_id = "latest_continuous_id" -# def get_ts_meta(): + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers.get_ogc_data(args, output_id, service) -# def get_latest_continuous(): +def get_field_measurements() -> pd.DataFrame: + service = "field-measurements" + 
output_id = "field_measurement_id" -# def get_field_measurements(): + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers.get_ogc_data(args, output_id, service) def get_codes(code_service: _CODE_SERVICES) -> DataFrame: """Return codes from a Samples code service. diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index d73ba3c7..632405f8 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -506,23 +506,30 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional resp = client.send(req) if resp.status_code != 200: raise Exception(_error_body(resp)) + # Grab some aspects of the original request: headers and the + # request type (GET or POST) + method = req.method.upper() + headers = req.headers + content = req.content if method == "POST" else None + if max_results is None or pd.isna(max_results): - dfs = [] + dfs = _get_resp_data(resp) curr_url = _next_req_url(resp) failures = [] while curr_url: try: - resp = client.get(curr_url, headers=_default_headers()) + resp = client.request(method, curr_url, headers=headers, content=content if method == "POST" else None) if resp.status_code != 200: raise Exception(_error_body(resp)) df1 = _get_resp_data(resp) - dfs.append(df1) + dfs = pd.concat([dfs, df1], ignore_index=True) + #dfs.append(df1) curr_url = _next_req_url(resp) except Exception: failures.append(curr_url) curr_url = None if failures: print(f"There were {len(failures)} failed requests.") - return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame() + return dfs else: resp.raise_for_status() return _get_resp_data(resp) From 4affa2f41048802e9f1fd72eaa535658ec136719 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 15:35:02 -0500 Subject: [PATCH 11/56] add in all possible arguments --- dataretrieval/waterdata.py | 121 +++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index 10887af1..a97830a0 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -93,7 +93,7 @@ def get_daily( max_results: Optional[int] = None, convertType: bool = True ) -> pd.DataFrame: - + service = "daily" output_id = "daily_id" @@ -106,7 +106,55 @@ def get_daily( return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_monitoring_locations() -> pd.DataFrame: +def get_monitoring_locations( + monitoring_location_id: Optional[List[str]] = None, + agency_code: Optional[List[str]] = None, + agency_name: Optional[List[str]] = None, + monitoring_location_number: Optional[List[str]] = None, + monitoring_location_name: Optional[List[str]] = None, + district_code: Optional[List[str]] = None, + country_code: Optional[List[str]] = None, + country_name: Optional[List[str]] = None, + state_code: Optional[List[str]] = None, + state_name: Optional[List[str]] = None, + county_code: Optional[List[str]] = None, + county_name: Optional[List[str]] = None, + minor_civil_division_code: Optional[List[str]] = None, + site_type_code: Optional[List[str]] = None, + site_type: Optional[List[str]] = None, + hydrologic_unit_code: Optional[List[str]] = None, + basin_code: Optional[List[str]] = None, + altitude: Optional[List[str]] = None, + altitude_accuracy: Optional[List[str]] = None, + altitude_method_code: Optional[List[str]] = 
None, + altitude_method_name: Optional[List[str]] = None, + vertical_datum: Optional[List[str]] = None, + vertical_datum_name: Optional[List[str]] = None, + horizontal_positional_accuracy_code: Optional[List[str]] = None, + horizontal_positional_accuracy: Optional[List[str]] = None, + horizontal_position_method_code: Optional[List[str]] = None, + horizontal_position_method_name: Optional[List[str]] = None, + original_horizontal_datum: Optional[List[str]] = None, + original_horizontal_datum_name: Optional[List[str]] = None, + drainage_area: Optional[List[str]] = None, + contributing_drainage_area: Optional[List[str]] = None, + time_zone_abbreviation: Optional[List[str]] = None, + uses_daylight_savings: Optional[List[str]] = None, + construction_date: Optional[List[str]] = None, + aquifer_code: Optional[List[str]] = None, + national_aquifer_code: Optional[List[str]] = None, + aquifer_type_code: Optional[List[str]] = None, + well_constructed_depth: Optional[List[str]] = None, + hole_constructed_depth: Optional[List[str]] = None, + depth_source_code: Optional[List[str]] = None, + properties: Optional[List[str]] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True + ) -> pd.DataFrame: service = "monitoring-locations" output_id = "monitoring_location_id" @@ -119,7 +167,32 @@ def get_monitoring_locations() -> pd.DataFrame: return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_ts_meta() -> pd.DataFrame: +def get_timeseries_metadata( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + parameter_name: Optional[Union[str, List[str]]] = None, + properties: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + last_modified: Optional[Union[str, List[str]]] = None, + begin: Optional[Union[str, List[str]]] = None, + end: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + computation_period_identifier: Optional[Union[str, List[str]]] = None, + computation_identifier: Optional[Union[str, List[str]]] = None, + thresholds: Optional[int] = None, + sublocation_identifier: Optional[Union[str, List[str]]] = None, + primary: Optional[Union[str, List[str]]] = None, + parent_time_series_id: Optional[Union[str, List[str]]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + web_description: Optional[Union[str, List[str]]] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True +) -> pd.DataFrame: + service = "time-series-metadata" output_id = "time_series_id" @@ -132,7 +205,25 @@ def get_ts_meta() -> pd.DataFrame: return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_latest_continuous() -> pd.DataFrame: +def get_latest_continuous( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + properties: Optional[Union[str, List[str]]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + latest_continuous_id: Optional[Union[str, List[str]]] = None, + approval_status: Optional[Union[str, List[str]]] = None, + unit_of_measure: 
Optional[Union[str, List[str]]] = None, + qualifier: Optional[Union[str, List[str]]] = None, + value: Optional[int] = None, + last_modified: Optional[Union[str, List[str]]] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True + ) -> pd.DataFrame: service = "latest-continuous" output_id = "latest_continuous_id" @@ -145,7 +236,27 @@ def get_latest_continuous() -> pd.DataFrame: return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_field_measurements() -> pd.DataFrame: +def get_field_measurements( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + observing_procedure_code: Optional[Union[str, List[str]]] = None, + properties: Optional[List[str]] = None, + field_visit_id: Optional[Union[str, List[str]]] = None, + approval_status: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + qualifier: Optional[Union[str, List[str]]] = None, + value: Optional[Union[str, List[str]]] = None, + last_modified: Optional[Union[str, List[str]]] = None, + observing_procedure: Optional[Union[str, List[str]]] = None, + vertical_datum: Optional[Union[str, List[str]]] = None, + measuring_agency: Optional[Union[str, List[str]]] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True + ) -> pd.DataFrame: service = "field-measurements" output_id = "field_measurement_id" From 21691d0b8657a48c05d369c6aecb1096723eaca5 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 16:57:14 -0500 Subject: [PATCH 12/56] trying to get cql2 query correct, will keep at it --- dataretrieval/waterdata_helpers.py | 187 ++++++++++++++++------------- 1 file changed, 101 insertions(+), 86 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 632405f8..ab6e4bfa 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -6,6 +6,7 @@ import pytz import pandas as pd import numpy as np +import json from datetime import datetime from zoneinfo import ZoneInfo import re @@ -193,10 +194,23 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) def _explode_post(ls: Dict[str, Any]): return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} -def _cql2_param(parameter: Dict[str, List[str]]): - property_name = next(iter(parameter)) - parameters = [str(x) for x in parameter[property_name]] - return {"property": property_name, "parameter": parameters} +def _cql2_param(args): + filters = [] + for key, values in args.items(): + filters.append({ + "op": "in", + "args": [ + {"property": key}, + values + ] + }) + + query = { + "op": "and", + "args": filters + } + + return json.dumps(query, indent=4) def _default_headers(): """ @@ -328,92 +342,12 @@ def _construct_api_requests( if POST: headers["Content-Type"] = "application/query-cql-json" - req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) + #req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) + req = 
httpx.Request(method="POST", url=baseURL, headers=headers, data=_cql2_param(post_params), params=params) else: req = httpx.Request(method="GET", url=baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) return req -def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: - """ - Handles empty DataFrame results by returning a DataFrame with appropriate columns. - - If `return_list` is empty, determines the column names to use: - - If `properties` is not provided or contains only NaN values, retrieves the schema properties from the specified service. - - Otherwise, uses the provided `properties` list as column names. - - Args: - return_list (pd.DataFrame): The DataFrame to check for emptiness. - properties (Optional[List[str]]): List of property names to use as columns, or None. - service (str): The service endpoint to query for schema properties if needed. - - Returns: - pd.DataFrame: The original DataFrame if not empty, otherwise an empty DataFrame with the appropriate columns. - """ - if return_list.empty: - if not properties or all(pd.isna(properties)): - schema = _check_OGC_requests(endpoint=service, req_type="schema") - properties = list(schema.get("properties", {}).keys()) - return pd.DataFrame(columns=properties) - return return_list - -def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: - """ - Rearranges and renames columns in a DataFrame based on provided properties and output identifier. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame whose columns are to be rearranged or renamed. - properties : Optional[List[str]] - A list of column names to possibly rename. If None or contains only NaN, the function will rename 'id' to output_id. - output_id : str - The name to which the 'id' column should be renamed if applicable. - - Returns - ------- - pd.DataFrame - The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. - """ - if properties and not all(pd.isna(properties)): - if "id" not in properties: - if output_id in properties: - df = df.rename(columns={"id": output_id}) - else: - plural = output_id.replace("_id", "s_id") - if plural in properties: - df = df.rename(columns={"id": plural}) - return df.loc[:, [col for col in properties if col in df.columns]] - else: - return df.rename(columns={"id": output_id}) - -def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: - """ - Cleans and standardizes columns in a pandas DataFrame for water data endpoints. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame containing water data. - service : str, optional - The type of water data service (default is "daily"). - - Returns - ------- - pd.DataFrame - The cleaned DataFrame with standardized columns. - - Notes - ----- - - If the 'time' column exists and service is "daily", it is converted to date objects. - - The 'value' and 'contributing_drainage_area' columns are coerced to numeric types. - """ - if "time" in df.columns and service == "daily": - df["time"] = pd.to_datetime(df["time"]).dt.date - for col in ["value", "contributing_drainage_area"]: - if col in df.columns: - df[col] = pd.to_numeric(df[col], errors="coerce") - return df - def _next_req_url(resp: httpx.Response) -> Optional[str]: """ Extracts the URL for the next page of results from an HTTP response from a water data endpoint. 
@@ -534,6 +468,87 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional resp.raise_for_status() return _get_resp_data(resp) +def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: + """ + Handles empty DataFrame results by returning a DataFrame with appropriate columns. + + If `return_list` is empty, determines the column names to use: + - If `properties` is not provided or contains only NaN values, retrieves the schema properties from the specified service. + - Otherwise, uses the provided `properties` list as column names. + + Args: + return_list (pd.DataFrame): The DataFrame to check for emptiness. + properties (Optional[List[str]]): List of property names to use as columns, or None. + service (str): The service endpoint to query for schema properties if needed. + + Returns: + pd.DataFrame: The original DataFrame if not empty, otherwise an empty DataFrame with the appropriate columns. + """ + if return_list.empty: + if not properties or all(pd.isna(properties)): + schema = _check_OGC_requests(endpoint=service, req_type="schema") + properties = list(schema.get("properties", {}).keys()) + return pd.DataFrame(columns=properties) + return return_list + +def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: + """ + Rearranges and renames columns in a DataFrame based on provided properties and output identifier. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame whose columns are to be rearranged or renamed. + properties : Optional[List[str]] + A list of column names to possibly rename. If None or contains only NaN, the function will rename 'id' to output_id. + output_id : str + The name to which the 'id' column should be renamed if applicable. + + Returns + ------- + pd.DataFrame + The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. + """ + if properties and not all(pd.isna(properties)): + if "id" not in properties: + if output_id in properties: + df = df.rename(columns={"id": output_id}) + else: + plural = output_id.replace("_id", "s_id") + if plural in properties: + df = df.rename(columns={"id": plural}) + return df.loc[:, [col for col in properties if col in df.columns]] + else: + return df.rename(columns={"id": output_id}) + +def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: + """ + Cleans and standardizes columns in a pandas DataFrame for water data endpoints. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame containing water data. + service : str, optional + The type of water data service (default is "daily"). + + Returns + ------- + pd.DataFrame + The cleaned DataFrame with standardized columns. + + Notes + ----- + - If the 'time' column exists and service is "daily", it is converted to date objects. + - The 'value' and 'contributing_drainage_area' columns are coerced to numeric types. + """ + if "time" in df.columns and service == "daily": + df["time"] = pd.to_datetime(df["time"]).dt.date + for col in ["value", "contributing_drainage_area"]: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors="coerce") + return df + def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: """ Retrieves OGC (Open Geospatial Consortium) data from a specified water data endpoint and returns it as a pandas DataFrame. 
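For reference, the CQL2 body that the reworked `_cql2_param` helper above builds can be previewed outside the request flow. The sketch below mirrors the helper's logic from this patch; the site and parameter values are only illustrative.

.. code::

    import json

    # Mirror of _cql2_param as defined in the patch above: each property
    # becomes an "in" clause, and the clauses are joined by a top-level "and".
    def cql2_filter(args):
        filters = [
            {"op": "in", "args": [{"property": key}, values]}
            for key, values in args.items()
        ]
        return json.dumps({"op": "and", "args": filters}, indent=4)

    # Illustrative multi-valued query: two sites, one parameter code.
    body = cql2_filter({
        "monitoring_location_id": ["USGS-05114000", "USGS-09423350"],
        "parameter_code": ["00060"],
    })
    # This string is what gets POSTed with the
    # "Content-Type: application/query-cql-json" header.
    print(body)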
From 4c2a3eef75a282b38b9febeb6bfd035bc9e67492 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Sep 2025 12:59:14 -0500 Subject: [PATCH 13/56] correct cql2 queries --- dataretrieval/waterdata_helpers.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index ab6e4bfa..166df130 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -191,9 +191,6 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) else: raise ValueError("datetime_input should only include 1-2 values") -def _explode_post(ls: Dict[str, Any]): - return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} - def _cql2_param(args): filters = [] for key, values in args.items(): @@ -313,12 +310,11 @@ def _construct_api_requests( if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") - # Create post calls for any input parameters that are not in the single_params list - # and have more than one element associated with the list or tuple. - post_params = _explode_post({ - k: v for k, v in kwargs.items() - if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 - }) + # Identify which parameters should be included in the POST content body + post_params = { + k: v for k, v in kwargs.items() + if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 + } # Indicate if function needs to perform POST conversion POST = bool(post_params) @@ -343,7 +339,7 @@ def _construct_api_requests( if POST: headers["Content-Type"] = "application/query-cql-json" #req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) - req = httpx.Request(method="POST", url=baseURL, headers=headers, data=_cql2_param(post_params), params=params) + req = httpx.Request(method="POST", url=baseURL, headers=headers, content=_cql2_param(post_params), params=params) else: req = httpx.Request(method="GET", url=baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) return req @@ -616,4 +612,7 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF # url = f"{_base_url()}openapi?f=json" # resp = httpx.get(url, headers=_default_headers()) # resp.raise_for_status() -# return resp.json() \ No newline at end of file +# return resp.json() + +# def _explode_post(ls: Dict[str, Any]): +# return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} \ No newline at end of file From 14f283025198aa4efd60214f07b8b02201f2e729 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Sep 2025 13:18:24 -0500 Subject: [PATCH 14/56] simplify syntax, remove unneeded dependencies --- dataretrieval/waterdata_helpers.py | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 166df130..164fdfc9 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -3,9 +3,7 @@ import warnings from typing import List, Dict, Any, Optional, Union from datetime import datetime -import pytz import pandas as pd -import numpy as np import json from datetime import datetime from zoneinfo import ZoneInfo @@ -301,7 +299,23 @@ def 
_construct_api_requests( baseURL = _setup_api(service) # Single parameters can only have one value single_params = {"datetime", "last_modified", "begin", "end", "time"} - params = {k: v for k, v in kwargs.items() if k in single_params} + # params = {k: v for k, v in kwargs.items() if k in single_params} + # # Set skipGeometry parameter + # params["skipGeometry"] = skipGeometry + # # If limit is none and max_results is not none, then set limit to max results. Otherwise, + # # if max_results is none, set it to 10000 (the API max). + # params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 + # if max_results is not None and limit is not None and limit > max_results: + # raise ValueError("limit cannot be greater than max_result") + + # Identify which parameters should be included in the POST content body + post_params = { + k: v for k, v in kwargs.items() + if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 + } + + # Everything else goes into the params dictionary for the URL + params = {k: v for k, v in kwargs.items() if k not in post_params} # Set skipGeometry parameter params["skipGeometry"] = skipGeometry # If limit is none and max_results is not none, then set limit to max results. Otherwise, @@ -309,12 +323,6 @@ def _construct_api_requests( params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") - - # Identify which parameters should be included in the POST content body - post_params = { - k: v for k, v in kwargs.items() - if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 - } # Indicate if function needs to perform POST conversion POST = bool(post_params) @@ -325,7 +333,7 @@ def _construct_api_requests( if i in params: dates = service == "daily" and i != "last_modified" params[i] = _format_api_dates(params[i], date=dates) - kwargs[i] = _format_api_dates(kwargs[i], date=dates) + #kwargs[i] = _format_api_dates(kwargs[i], date=dates) # String together bbox elements from a list to a comma-separated string, # and string together properties if provided @@ -338,10 +346,9 @@ def _construct_api_requests( if POST: headers["Content-Type"] = "application/query-cql-json" - #req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) req = httpx.Request(method="POST", url=baseURL, headers=headers, content=_cql2_param(post_params), params=params) else: - req = httpx.Request(method="GET", url=baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + req = httpx.Request(method="GET", url=baseURL, headers=headers, params=params) return req def _next_req_url(resp: httpx.Response) -> Optional[str]: From d25f854a77ee20dd13d572f9b4dc92c274268136 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Wed, 24 Sep 2025 15:26:36 -0500 Subject: [PATCH 15/56] start adding function documentation --- dataretrieval/waterdata.py | 598 ++++++++++++++++++++++++++++++++++++- 1 file changed, 596 insertions(+), 2 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index a97830a0..4ff056a3 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -93,7 +93,142 @@ def get_daily( max_results: Optional[int] = None, convertType: bool = True ) -> pd.DataFrame: + """Daily data provide one data value to represent water 
+    conditions for the day.
+    Throughout much of the history of the USGS, the primary water data available was
+    daily data collected manually at the monitoring location once each day. With
+    improved availability of computer storage and automated transmission of data, the
+    daily data published today are generally a statistical summary or metric of the
+    continuous data collected each day, such as the daily mean, minimum, or maximum
+    value. Daily data are automatically calculated from the continuous data of the same
+    parameter code and are described by parameter code and a statistic code. These data
+    have also been referred to as “daily values” or “DV”.
+
+    Parameters
+    ----------
+    monitoring_location_id : string or list of strings, optional
+        A unique identifier representing a single monitoring location. This
+        corresponds to the id field in the monitoring-locations endpoint.
+        Monitoring location IDs are created by combining the agency code of
+        the agency responsible for the monitoring location (e.g. USGS) with
+        the ID number of the monitoring location (e.g. 02238500), separated
+        by a hyphen (e.g. USGS-02238500).
+    parameter_code : string or list of strings, optional
+        Parameter codes are 5-digit codes used to identify the constituent
+        measured and the units of measure. A complete list of parameter
+        codes and associated groupings can be found at
+        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+    statistic_id : string or list of strings, optional
+        A code corresponding to the statistic an observation represents.
+        Example codes include 00001 (max), 00002 (min), and 00003 (mean).
+        A complete list of codes and their descriptions can be found at
+        https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html.
+    properties : string or list of strings, optional
+        A vector of requested columns to be returned from the query.
+        Available options are: geometry, id, time_series_id,
+        monitoring_location_id, parameter_code, statistic_id, time, value,
+        unit_of_measure, approval_status, qualifier, last_modified
+    time_series_id : string or list of strings, optional
+        A unique identifier representing a single time series. This
+        corresponds to the id field in the time-series-metadata endpoint.
+    daily_id : string or list of strings, optional
+        A universally unique identifier (UUID) representing a single
+        version of a record. It is not stable over time. Every time the
+        record is refreshed in our database (which may happen as part of
+        normal operations and does not imply any change to the data itself)
+        a new ID will be generated. To uniquely identify a single observation
+        over time, compare the time and time_series_id fields; each time series
+        will only have a single observation at a given time.
+    approval_status : string or list of strings, optional
+        Some of the data that you have obtained from this U.S. Geological
+        Survey database may not have received Director's approval. Any such
+        data values are qualified as provisional and are subject to revision.
+        Provisional data are released on the condition that neither the USGS
+        nor the United States Government may be held liable for any damages
+        resulting from its use. This field reflects the approval status of
+        each record, and is either "Approved", meaning processing review has
+        been completed and the data is approved for publication, or
+        "Provisional" and subject to revision. For more information about
+        provisional data, go to
+        https://waterdata.usgs.gov/provisional-data-statement/.
+    unit_of_measure : string or list of strings, optional
+        A human-readable description of the units of measurement associated
+        with an observation.
+    qualifier : string or list of strings, optional
+        This field indicates any qualifiers associated with an observation, for
+        instance if a sensor may have been impacted by ice or if values were
+        estimated.
+    value : string or list of strings, optional
+        The value of the observation. Values are transmitted as strings in
+        the JSON response format in order to preserve precision.
+    last_modified : string, optional
+        The last time a record was refreshed in our database. This may happen
+        due to regular operational processes and does not necessarily indicate
+        anything about the measurement has changed. You can query this field
+        using date-times or intervals, adhering to RFC 3339, or using ISO 8601
+        duration objects. Intervals may be bounded or half-bounded (double-dots
+        at start or end). Examples:
+        - A date-time: "2018-02-12T23:20:50Z"
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a last_modified that intersects the value of datetime are selected.
+    skipGeometry : boolean, optional
+        This option can be used to skip response geometries for each feature. The returning
+        object will be a data frame with no spatial information.
+    time : string, optional
+        The date an observation represents. You can query this field using date-times
+        or intervals, adhering to RFC 3339, or using ISO 8601 duration objects.
+        Intervals may be bounded or half-bounded (double-dots at start or end).
+        Examples:
+        - A date-time: "2018-02-12T23:20:50Z"
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a time that intersects the value of datetime are selected. If
+        a feature has multiple temporal properties, it is the decision of the server whether
+        only a single temporal property is used to determine the extent or all relevant temporal properties.
+    bbox : list of numbers, optional
+        Only features that have a geometry that intersects the bounding box are selected.
+        The bounding box is provided as four or six numbers, depending on whether the
+        coordinate reference system includes a vertical axis (height or depth). Coordinates
+        are assumed to be in crs 4326. The expected format is a list structured:
+        [xmin, ymin, xmax, ymax]. Another way to think of it is [Western-most longitude,
+        Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+    limit : numeric, optional
+        The optional limit parameter is used to control the subset of the selected features
+        that should be returned in each page. The maximum allowable limit is 10000. It may
+        be beneficial to set this number lower if your internet connection is spotty. The
+        default (None) will set the limit to the maximum allowable limit for the service.
+    max_results : numeric, optional
+        The optional maximum number of rows to return. The requested limit cannot be
+        greater than this value.
+    convertType : boolean, optional
+        If True, the function will convert the data to dates and qualifier to string vector
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        Formatted data returned from the API query.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # Get daily flow data from a single site
+        >>> # over a yearlong period
+        >>> df = dataretrieval.waterdata.get_daily(
+        ...     monitoring_location_id = "USGS-02238500",
+        ...     parameter_code = "00060",
+        ...     time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z"
+        ... )
+
+        >>> # Get approved daily data for multiple sites
+        >>> # starting in 2024
+        >>> df = dataretrieval.waterdata.get_daily(
+        ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
+        ...     approval_status = "Approved",
+        ...     time = "2024-01-01/.."
+        ... )
+    """
 
     service = "daily"
     output_id = "daily_id"
 
@@ -154,7 +289,232 @@ def get_monitoring_locations(
     limit: Optional[int] = None,
     max_results: Optional[int] = None,
     convertType: bool = True
-    ) -> pd.DataFrame:
+    ) -> pd.DataFrame:
+    """Location information is basic information about the monitoring location
+    including the name, identifier, agency responsible for data collection, and
+    the date the location was established. It also includes information about
+    the type of location, such as stream, lake, or groundwater, and geographic
+    information about the location, such as state, county, latitude and longitude,
+    and hydrologic unit code (HUC).
+
+    Parameters
+    ----------
+    monitoring_location_id : string or list of strings, optional
+        A unique identifier representing a single monitoring location. This
+        corresponds to the id field in the monitoring-locations endpoint.
+        Monitoring location IDs are created by combining the agency code of
+        the agency responsible for the monitoring location (e.g. USGS) with
+        the ID number of the monitoring location (e.g. 02238500), separated
+        by a hyphen (e.g. USGS-02238500).
+    agency_code : string or list of strings, optional
+        The agency that is reporting the data. Agency codes are fixed values
+        assigned by the National Water Information System (NWIS). A list of
+        agency codes is available at this link.
+    agency_name : string or list of strings, optional
+        The name of the agency that is reporting the data.
+    monitoring_location_number : string or list of strings, optional
+        Each monitoring location in the USGS data base has a unique 8- to
+        15-digit identification number. Monitoring location numbers are
+        assigned based on this logic.
+    monitoring_location_name : string or list of strings, optional
+        This is the official name of the monitoring location in the database.
+        For well information this can be a district-assigned local number.
+    district_code : string or list of strings, optional
+        The Water Science Centers (WSCs) across the United States use the FIPS
+        state code as the district code. In some cases, monitoring locations and
+        samples may be managed by a water science center that is adjacent to the
+        state in which the monitoring location actually resides. For example a
+        monitoring location may have a district code of 30 which translates to
+        Montana, but the state code could be 56 for Wyoming because that is where
+        the monitoring location actually is located.
+    country_code : string or list of strings, optional
+        The code for the country in which the monitoring location is located.
+    country_name : string or list of strings, optional
+        The name of the country in which the monitoring location is located.
+    state_code : string or list of strings, optional
+        State code. A two-digit ANSI code (formerly FIPS code) as defined by
+        the American National Standards Institute, to define States and
+        equivalents. A three-digit ANSI code is used to define counties and
+        county equivalents. A lookup table is available. The only countries with
+        political subdivisions other than the US are Mexico and Canada. The Mexican
+        states have US state codes ranging from 81-86 and Canadian provinces have
+        state codes ranging from 90-98.
+    state_name : string or list of strings, optional
+        The name of the state or state equivalent in which the monitoring location
+        is located.
+    county_code : string or list of strings, optional
+        The code for the county or county equivalent (parish, borough, etc.) in which
+        the monitoring location is located. A list of codes is available.
+    county_name : string or list of strings, optional
+        The name of the county or county equivalent (parish, borough, etc.) in which
+        the monitoring location is located. A list of codes is available.
+    minor_civil_division_code : string or list of strings, optional
+        Codes for primary governmental or administrative divisions of the county or
+        county equivalent in which the monitoring location is located.
+    site_type_code : string or list of strings, optional
+        A code describing the hydrologic setting of the monitoring location. A list of
+        codes is available.
+        Example: "US:15:001" (United States: Hawaii, Hawaii County)
+    site_type : string or list of strings, optional
+        A description of the hydrologic setting of the monitoring location. A list of
+        codes is available.
+    hydrologic_unit_code : string or list of strings, optional
+        The United States is divided and sub-divided into successively smaller
+        hydrologic units which are classified into four levels: regions,
+        sub-regions, accounting units, and cataloging units. The hydrologic units
+        are arranged within each other, from the smallest (cataloging units) to the
+        largest (regions). Each hydrologic unit is identified by a unique hydrologic
+        unit code (HUC) consisting of two to eight digits based on the four levels
+        of classification in the hydrologic unit system.
+    basin_code : string or list of strings, optional
+        The Basin Code or "drainage basin code" is a two-digit code that further
+        subdivides the 8-digit hydrologic-unit code. The drainage basin code is
+        defined by the USGS State Office where the monitoring location is located.
+    altitude : string or list of strings, optional
+        Altitude of the monitoring location referenced to the specified Vertical
+        Datum.
+    altitude_accuracy : string or list of strings, optional
+        Accuracy of the altitude, in feet. An accuracy of +/- 0.1 foot would be
+        entered as “.1”. Many altitudes are interpolated from the contours on
+        topographic maps; accuracies determined in this way are generally entered
+        as one-half of the contour interval.
+    altitude_method_code : string or list of strings, optional
+        Codes representing the method used to measure altitude. A list of codes is
+        available.
+    altitude_method_name : string or list of strings, optional
+        The name of the method used to measure altitude. A list of codes is
+        available.
+    vertical_datum : string or list of strings, optional
+        The datum used to determine altitude and vertical position at the
+        monitoring location. A list of codes is available.
+    vertical_datum_name : string or list of strings, optional
+        The datum used to determine altitude and vertical position at the
+        monitoring location. A list of codes is available.
+ horizontal_positional_accuracy_code : string or list of strings, optional + Indicates the accuracy of the latitude longitude values. A list of codes + is available. + horizontal_positional_accuracy : string or list of strings, optional + Indicates the accuracy of the latitude longitude values. A list of codes + is available. + horizontal_position_method_code : string or list of strings, optional + Indicates the method used to determine latitude longitude values. A + list of codes is available. + horizontal_position_method_name : string or list of strings, optional + Indicates the method used to determine latitude longitude values. A + list of codes is available. + original_horizontal_datum : string or list of strings, optional + Coordinates are published in EPSG:4326 / WGS84 / World Geodetic System + 1984. This field indicates the original datum used to determine + coordinates before they were converted. A list of codes is available. + original_horizontal_datum_name : string or list of strings, optional + Coordinates are published in EPSG:4326 / WGS84 / World Geodetic System + 1984. This field indicates the original datum used to determine coordinates + before they were converted. A list of codes is available. + drainage_area : string or list of strings, optional + The area enclosed by a topographic divide from which direct surface runoff + from precipitation normally drains by gravity into the stream above that + point. + contributing_drainage_area : string or list of strings, optional + The contributing drainage area of a lake, stream, wetland, or estuary + monitoring location, in square miles. This item should be present only if + the contributing area is different from the total drainage area. This + situation can occur when part of the drainage area consists of very porous + soil or depressions that either allow all runoff to enter the groundwater + or traps the water in ponds so that rainfall does not contribute to runoff. + A transbasin diversion can also affect the total drainage area. + time_zone_abbreviation : string or list of strings, optional + A short code describing the time zone used by a monitoring location. + uses_daylight_savings : string or list of strings, optional + A flag indicating whether or not a monitoring location uses daylight savings. + construction_date : string or list of strings, optional + Date the well was completed. + aquifer_code : string or list of strings, optional + Local aquifers in the USGS water resources data base are identified by a + geohydrologic unit code (a three-digit number related to the age of the + formation, followed by a 4 or 5 character abbreviation for the geologic unit + or aquifer name). Additional information is available at this link. + national_aquifer_code : string or list of strings, optional + National aquifers are the principal aquifers or aquifer systems in the United + States, defined as regionally extensive aquifers or aquifer systems that have + the potential to be used as a source of potable water. Not all groundwater + monitoring locations can be associated with a National Aquifer. Such + monitoring locations will not be retrieved using this search criteria. A list + of National aquifer codes and names is available. + aquifer_type_code : string or list of strings, optional + Groundwater occurs in aquifers under two different conditions. Where water + only partly fills an aquifer, the upper surface is free to rise and decline. + These aquifers are referred to as unconfined (or water-table) aquifers. 
+        Where water completely fills an aquifer that is overlain by a confining bed, the
+        aquifer is referred to as a confined (or artesian) aquifer. When a confined
+        aquifer is penetrated by a well, the water level in the well will rise above
+        the top of the aquifer (but not necessarily above land surface). Additional
+        information is available at this link.
+    well_constructed_depth : string or list of strings, optional
+        The depth of the finished well, in feet below land surface datum. Note: Not
+        all groundwater monitoring locations have information on Well Depth. Such
+        monitoring locations will not be retrieved using this search criteria.
+    hole_constructed_depth : string or list of strings, optional
+        The total depth to which the hole is drilled, in feet below land surface datum.
+        Note: Not all groundwater monitoring locations have information on Hole Depth.
+        Such monitoring locations will not be retrieved using this search criteria.
+    depth_source_code : string or list of strings, optional
+        A code indicating the source of water-level data. A list of codes is available.
+    properties : string or list of strings, optional
+        A vector of requested columns to be returned from the query. Available options
+        are: geometry, id, agency_code, agency_name, monitoring_location_number,
+        monitoring_location_name, district_code, country_code, country_name, state_code,
+        state_name, county_code, county_name, minor_civil_division_code, site_type_code,
+        site_type, hydrologic_unit_code, basin_code, altitude, altitude_accuracy,
+        altitude_method_code, altitude_method_name, vertical_datum, vertical_datum_name,
+        horizontal_positional_accuracy_code, horizontal_positional_accuracy,
+        horizontal_position_method_code, horizontal_position_method_name,
+        original_horizontal_datum, original_horizontal_datum_name, drainage_area,
+        contributing_drainage_area, time_zone_abbreviation, uses_daylight_savings,
+        construction_date, aquifer_code, national_aquifer_code, aquifer_type_code,
+        well_constructed_depth, hole_constructed_depth, depth_source_code.
+    bbox : list of numbers, optional
+        Only features that have a geometry that intersects the bounding box are selected.
+        The bounding box is provided as four or six numbers, depending on whether the
+        coordinate reference system includes a vertical axis (height or depth). Coordinates
+        are assumed to be in crs 4326. The expected format is a list structured:
+        [xmin, ymin, xmax, ymax]. Another way to think of it is [Western-most longitude,
+        Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+    limit : numeric, optional
+        The optional limit parameter is used to control the subset of the selected features
+        that should be returned in each page. The maximum allowable limit is 10000. It may
+        be beneficial to set this number lower if your internet connection is spotty. The
+        default (None) will set the limit to the maximum allowable limit for the service.
+    max_results : numeric, optional
+        The optional maximum number of rows to return. The requested limit cannot be
+        greater than this value.
+    skipGeometry : boolean, optional
+        This option can be used to skip response geometries for each feature. The returning
+        object will be a data frame with no spatial information.
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        Formatted data returned from the API query.
+
+    Examples
+    --------
+    ..
code:: + + >>> # Get monitoring locations within a bounding box + >>> # and leave out geometry + >>> df = dataretrieval.waterdata.get_monitoring_locations( + ... bbox=[-90.2,42.6,-88.7,43.2], + ... skipGeometry=True + ... ) + + >>> # Get monitoring location info for specific sites + >>> # and only specific properties + >>> df = dataretrieval.waterdata.get_monitoring_locations( + ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], + ... properties = ["monitoring_location_id", + ... "state_name", + ... "country_name"]) + """ service = "monitoring-locations" output_id = "monitoring_location_id" @@ -167,7 +527,7 @@ def get_monitoring_locations( return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_timeseries_metadata( +def get_time_series_metadata( monitoring_location_id: Optional[Union[str, List[str]]] = None, parameter_code: Optional[Union[str, List[str]]] = None, parameter_name: Optional[Union[str, List[str]]] = None, @@ -192,6 +552,110 @@ def get_timeseries_metadata( max_results: Optional[int] = None, convertType: bool = True ) -> pd.DataFrame: + """Daily data and continuous measurements are grouped into time series, + which represent a collection of observations of a single parameter, + potentially aggregated using a standard statistic, at a single monitoring + location. This endpoint provides metadata about those time series, + including their operational thresholds, units of measurement, and when + the earliest and most recent observations in a time series occurred. + + Parameters + ---------- + monitoring_location_id : string or list of strings, optional + A unique identifier representing a single monitoring location. This + corresponds to the id field in the monitoring-locations endpoint. + Monitoring location IDs are created by combining the agency code of + the agency responsible for the monitoring location (e.g. USGS) with + the ID number of the monitoring location (e.g. 02238500), separated + by a hyphen (e.g. USGS-02238500). + parameter_code : string or list of strings, optional + Parameter codes are 5-digit codes used to identify the constituent + measured and the units of measure. A complete list of parameter + codes and associated groupings can be found at + https://help.waterdata.usgs.gov/codes-and-parameters/parameters. + parameter_name : + properties : string or list of strings, optional + A vector of requested columns to be returned from the query. + Available options are: geometry, id, time_series_id, + monitoring_location_id, parameter_code, statistic_id, time, value, + unit_of_measure, approval_status, qualifier, last_modified + statistic_id : string or list of strings, optional + A code corresponding to the statistic an observation represents. + Example codes include 00001 (max), 00002 (min), and 00003 (mean). + A complete list of codes and their descriptions can be found at + https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. + last_modified : string, optional + The last time a record was refreshed in our database. This may happen + due to regular operational processes and does not necessarily indicate + anything about the measurement has changed. You can query this field + using date-times or intervals, adhering to RFC 3339, or using ISO 8601 + duration objects. Intervals may be bounded or half-bounded (double-dots + at start or end). Examples: + - A date-time: "2018-02-12T23:20:50Z" + - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z" + - Half-bounded intervals: "2018-02-12T00:00:00Z/.." 
or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a last_modified that intersects the value of datetime are selected.
+    begin :
+    end :
+    unit_of_measure : string or list of strings, optional
+        A human-readable description of the units of measurement associated
+        with an observation.
+    computation_period_identifier :
+    computation_identifier :
+    thresholds :
+    sublocation_identifier :
+    primary :
+    parent_time_series_id :
+    time_series_id : string or list of strings, optional
+        A unique identifier representing a single time series. This
+        corresponds to the id field in the time-series-metadata endpoint.
+    web_description :
+    skipGeometry : boolean, optional
+        This option can be used to skip response geometries for each feature. The returning
+        object will be a data frame with no spatial information.
+    bbox : list of numbers, optional
+        Only features that have a geometry that intersects the bounding box are selected.
+        The bounding box is provided as four or six numbers, depending on whether the
+        coordinate reference system includes a vertical axis (height or depth). Coordinates
+        are assumed to be in crs 4326. The expected format is a list structured:
+        [xmin, ymin, xmax, ymax]. Another way to think of it is [Western-most longitude,
+        Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+    limit : numeric, optional
+        The optional limit parameter is used to control the subset of the selected features
+        that should be returned in each page. The maximum allowable limit is 10000. It may
+        be beneficial to set this number lower if your internet connection is spotty. The
+        default (None) will set the limit to the maximum allowable limit for the service.
+    max_results : numeric, optional
+        The optional maximum number of rows to return. The requested limit cannot be
+        greater than this value.
+    convertType : boolean, optional
+        If True, the function will convert the data to dates and qualifier to string vector
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        Formatted data returned from the API query.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # Get time series metadata for a single site
+        >>> df = dataretrieval.waterdata.get_time_series_metadata(
+        ...     monitoring_location_id = "USGS-02238500",
+        ...     parameter_code = "00060"
+        ... )
+
+        >>> # Get time series metadata for multiple sites,
+        >>> # modified since the start of 2024
+        >>> df = dataretrieval.waterdata.get_time_series_metadata(
+        ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
+        ...     last_modified = "2024-01-01/.."
+        ... )
+    """
 
     service = "time-series-metadata"
     output_id = "time_series_id"
 
@@ -224,6 +688,136 @@ def get_latest_continuous(
     max_results: Optional[int] = None,
     convertType: bool = True
     ) -> pd.DataFrame:
+    """This endpoint provides the most recent observation for each time series
+    of continuous data. Continuous data are collected via automated sensors
+    installed at a monitoring location. They are collected at a high frequency
+    and often at a fixed 15-minute interval. Depending on the specific
+    monitoring location, the data may be transmitted automatically via
+    telemetry and be available on WDFN within minutes of collection, while
+    other times the delivery of data may be delayed if the monitoring location
+    does not have the capacity to automatically transmit data.
+    Continuous data are described by parameter name and parameter code. These
+    data might also be referred to as "instantaneous values" or "IV".
+
+    Parameters
+    ----------
+    monitoring_location_id : string or list of strings, optional
+        A unique identifier representing a single monitoring location. This
+        corresponds to the id field in the monitoring-locations endpoint.
+        Monitoring location IDs are created by combining the agency code of
+        the agency responsible for the monitoring location (e.g. USGS) with
+        the ID number of the monitoring location (e.g. 02238500), separated
+        by a hyphen (e.g. USGS-02238500).
+    parameter_code : string or list of strings, optional
+        Parameter codes are 5-digit codes used to identify the constituent
+        measured and the units of measure. A complete list of parameter
+        codes and associated groupings can be found at
+        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+    statistic_id : string or list of strings, optional
+        A code corresponding to the statistic an observation represents.
+        Example codes include 00001 (max), 00002 (min), and 00003 (mean).
+        A complete list of codes and their descriptions can be found at
+        https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html.
+    properties : string or list of strings, optional
+        A vector of requested columns to be returned from the query.
+        Available options are: geometry, id, time_series_id,
+        monitoring_location_id, parameter_code, statistic_id, time, value,
+        unit_of_measure, approval_status, qualifier, last_modified
+    time_series_id : string or list of strings, optional
+        A unique identifier representing a single time series. This
+        corresponds to the id field in the time-series-metadata endpoint.
+    latest_continuous_id : string or list of strings, optional
+        A universally unique identifier (UUID) representing a single
+        version of a record. It is not stable over time. Every time the
+        record is refreshed in our database (which may happen as part of
+        normal operations and does not imply any change to the data itself)
+        a new ID will be generated. To uniquely identify a single observation
+        over time, compare the time and time_series_id fields; each time series
+        will only have a single observation at a given time.
+    approval_status : string or list of strings, optional
+        Some of the data that you have obtained from this U.S. Geological
+        Survey database may not have received Director's approval. Any such
+        data values are qualified as provisional and are subject to revision.
+        Provisional data are released on the condition that neither the USGS
+        nor the United States Government may be held liable for any damages
+        resulting from its use. This field reflects the approval status of
+        each record, and is either "Approved", meaning processing review has
+        been completed and the data is approved for publication, or
+        "Provisional" and subject to revision. For more information about
+        provisional data, go to
+        https://waterdata.usgs.gov/provisional-data-statement/.
+    unit_of_measure : string or list of strings, optional
+        A human-readable description of the units of measurement associated
+        with an observation.
+    qualifier : string or list of strings, optional
+        This field indicates any qualifiers associated with an observation, for
+        instance if a sensor may have been impacted by ice or if values were
+        estimated.
+    value : string or list of strings, optional
+        The value of the observation.
+        Values are transmitted as strings in
+        the JSON response format in order to preserve precision.
+    last_modified : string, optional
+        The last time a record was refreshed in our database. This may happen
+        due to regular operational processes and does not necessarily indicate
+        anything about the measurement has changed. You can query this field
+        using date-times or intervals, adhering to RFC 3339, or using ISO 8601
+        duration objects. Intervals may be bounded or half-bounded (double-dots
+        at start or end). Examples:
+        - A date-time: "2018-02-12T23:20:50Z"
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a last_modified that intersects the value of datetime are selected.
+    skipGeometry : boolean, optional
+        This option can be used to skip response geometries for each feature. The returning
+        object will be a data frame with no spatial information.
+    time : string, optional
+        The date an observation represents. You can query this field using date-times
+        or intervals, adhering to RFC 3339, or using ISO 8601 duration objects.
+        Intervals may be bounded or half-bounded (double-dots at start or end).
+        Examples:
+        - A date-time: "2018-02-12T23:20:50Z"
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a time that intersects the value of datetime are selected. If
+        a feature has multiple temporal properties, it is the decision of the server whether
+        only a single temporal property is used to determine the extent or all relevant temporal properties.
+    bbox : list of numbers, optional
+        Only features that have a geometry that intersects the bounding box are selected.
+        The bounding box is provided as four or six numbers, depending on whether the
+        coordinate reference system includes a vertical axis (height or depth). Coordinates
+        are assumed to be in crs 4326. The expected format is a list structured:
+        [xmin, ymin, xmax, ymax]. Another way to think of it is [Western-most longitude,
+        Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+    limit : numeric, optional
+        The optional limit parameter is used to control the subset of the selected features
+        that should be returned in each page. The maximum allowable limit is 10000. It may
+        be beneficial to set this number lower if your internet connection is spotty. The
+        default (None) will set the limit to the maximum allowable limit for the service.
+    max_results : numeric, optional
+        The optional maximum number of rows to return. The requested limit cannot be
+        greater than this value.
+    convertType : boolean, optional
+        If True, the function will convert the data to dates and qualifier to string vector
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        Formatted data returned from the API query.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # Get the latest continuous values for a single site
+        >>> df = dataretrieval.waterdata.get_latest_continuous(
+        ...     monitoring_location_id = "USGS-02238500",
+        ...     parameter_code = "00060"
+        ... )
+
+        >>> # Get the latest approved continuous values
+        >>> # for multiple sites
+        >>> df = dataretrieval.waterdata.get_latest_continuous(
+        ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
+        ...     approval_status = "Approved"
+        ... )
+    """
 
     service = "latest-continuous"
     output_id = "latest_continuous_id"
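The docstrings added above describe the user-facing arguments; under the hood (patches 12-14), `_construct_api_requests` sends multi-valued filters in a CQL2 POST body and keeps everything else in the URL query string. The following is a minimal sketch of that routing rule with illustrative inputs; see waterdata_helpers.py for the authoritative version.

.. code::

    # Sketch of the GET-vs-POST routing used by _construct_api_requests.
    single_params = {"datetime", "last_modified", "begin", "end", "time"}

    def split_params(kwargs):
        # Multi-valued filters (lists/tuples with more than one element)
        # go to the CQL2 POST body; the rest stay in the URL query string.
        post_params = {
            k: v for k, v in kwargs.items()
            if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1
        }
        params = {k: v for k, v in kwargs.items() if k not in post_params}
        return post_params, params

    post_params, params = split_params({
        "monitoring_location_id": ["USGS-05114000", "USGS-09423350"],  # POST body
        "parameter_code": "00060",                                     # URL param
        "time": "2024-01-01/..",                                       # URL param
    })
    assert "monitoring_location_id" in post_params
    assert set(params) == {"parameter_code", "time"}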
) + + >>> # Get monitoring location info for specific sites + >>> # and only specific properties + >>> df = dataretrieval.waterdata.get_daily( + ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], + ... approval_status = "Approved", + ... time = "2024-01-01/.." + """ service = "latest-continuous" output_id = "latest_continuous_id" From 7fe486af62fac41344b709b35ddd2fc467df6a32 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Wed, 24 Sep 2025 20:10:10 -0500 Subject: [PATCH 16/56] add link urls --- dataretrieval/waterdata.py | 288 +++++++++++++++++++++++++++++++------ 1 file changed, 242 insertions(+), 46 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index 4ff056a3..3c2335a8 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -309,13 +309,14 @@ def get_monitoring_locations( agency_code : string or list of strings, optional The agency that is reporting the data. Agency codes are fixed values assigned by the National Water Information System (NWIS). A list of - agency codes is available at this link. + agency codes is available at + [this link](https://help.waterdata.usgs.gov/code/agency_cd_query?fmt=html). agency_name : string or list of strings, optional The name of the agency that is reporting the data. monitoring_location_number : string or list of strings, optional Each monitoring location in the USGS data base has a unique 8- to 15-digit identification number. Monitoring location numbers are - assigned based on this logic. + assigned based on [this logic](https://help.waterdata.usgs.gov/faq/sites/do-station-numbers-have-any-particular-meaning). monitoring_location_name : string or list of strings, optional This is the official name of the monitoring location in the database. For well information this can be a district-assigned local number. @@ -335,7 +336,8 @@ def get_monitoring_locations( State code. A two-digit ANSI code (formerly FIPS code) as defined by the American National Standards Institute, to define States and equivalents. A three-digit ANSI code is used to define counties and - county equivalents. A lookup table is available. The only countries with + county equivalents. A [lookup table](https://www.census.gov/library/reference/code-lists/ansi.html#states) + is available. The only countries with political subdivisions other than the US are Mexico and Canada. The Mexican states have US state codes ranging from 81-86 and Canadian provinces have state codes ranging from 90-98. @@ -344,20 +346,22 @@ def get_monitoring_locations( is located. county_code : string or list of strings, optional The code for the county or county equivalent (parish, borough, etc.) in which - the monitoring location is located. A list of codes is available. + the monitoring location is located. A [list of codes](https://help.waterdata.usgs.gov/code/county_query?fmt=html) + is available. county_name : string or list of strings, optional The name of the county or county equivalent (parish, borough, etc.) in which - the monitoring location is located. A list of codes is available. + the monitoring location is located. A [list of codes](https://help.waterdata.usgs.gov/code/county_query?fmt=html) + is available. minor_civil_division_code : string or list of strings, optional Codes for primary governmental or administrative divisions of the county or county equivalent in which the monitoring location is located. site_type_code : string or list of strings, optional - A code describing the hydrologic setting of the monitoring location. 
A list of
-        codes is available.
+        A code describing the hydrologic setting of the monitoring location. A [list of
+        codes](https://help.waterdata.usgs.gov/code/site_tp_query?fmt=html) is available.
         Example: "US:15:001" (United States: Hawaii, Hawaii County)
     site_type : string or list of strings, optional
-        A description of the hydrologic setting of the monitoring location. A list of
-        codes is available.
+        A description of the hydrologic setting of the monitoring location. A [list of
+        codes](https://help.waterdata.usgs.gov/code/site_tp_query?fmt=html) is available.
     hydrologic_unit_code : string or list of strings, optional
         The United States is divided and sub-divided into successively smaller
         hydrologic units which are classified into four levels: regions,
@@ -379,37 +383,44 @@
         topographic maps; accuracies determined in this way are generally
         entered as one-half of the contour interval.
     altitude_method_code : string or list of strings, optional
-        Codes representing the method used to measure altitude. A list of codes is
-        available.
+        Codes representing the method used to measure altitude. A [list of codes](https://help.waterdata.usgs.gov/code/alt_meth_cd_query?fmt=html)
+        is available.
     altitude_method_name : float, optional
-        The name of the the method used to measure altitude. A list of codes is
+        The name of the method used to measure altitude. A [list of codes](https://help.waterdata.usgs.gov/code/alt_meth_cd_query?fmt=html)
+        is available.
     vertical_datum : float, optional
         The datum used to determine altitude and vertical position at the
-        monitoring location. A list of codes is available.
+        monitoring location. A [list of codes](https://help.waterdata.usgs.gov/code/alt_datum_cd_query?fmt=html)
+        is available.
     vertical_datum_name : float, optional
         The datum used to determine altitude and vertical position at the
-        monitoring location. A list of codes is available.
+        monitoring location. A [list of codes](https://help.waterdata.usgs.gov/code/alt_datum_cd_query?fmt=html)
+        is available.
     horizontal_positional_accuracy_code : string or list of strings, optional
-        Indicates the accuracy of the latitude longitude values. A list of codes
+        Indicates the accuracy of the latitude longitude values. A [list of codes](https://help.waterdata.usgs.gov/code/coord_acy_cd_query?fmt=html)
         is available.
     horizontal_positional_accuracy : string or list of strings, optional
-        Indicates the accuracy of the latitude longitude values. A list of codes
+        Indicates the accuracy of the latitude longitude values. A [list of codes](https://help.waterdata.usgs.gov/code/coord_acy_cd_query?fmt=html)
         is available.
     horizontal_position_method_code : string or list of strings, optional
         Indicates the method used to determine latitude longitude values. A
-        list of codes is available.
+        [list of codes](https://help.waterdata.usgs.gov/code/coord_meth_cd_query?fmt=html)
+        is available.
     horizontal_position_method_name : string or list of strings, optional
         Indicates the method used to determine latitude longitude values. A
-        list of codes is available.
+        [list of codes](https://help.waterdata.usgs.gov/code/coord_meth_cd_query?fmt=html)
+        is available.
     original_horizontal_datum : string or list of strings, optional
         Coordinates are published in EPSG:4326 / WGS84 / World Geodetic System
         1984. This field indicates the original datum used to determine
-        coordinates before they were converted. A list of codes is available.
+        coordinates before they were converted. 
A [list of codes](https://help.waterdata.usgs.gov/code/coord_datum_cd_query?fmt=html) + is available. original_horizontal_datum_name : string or list of strings, optional Coordinates are published in EPSG:4326 / WGS84 / World Geodetic System 1984. This field indicates the original datum used to determine coordinates - before they were converted. A list of codes is available. + before they were converted. A [list of codes](https://help.waterdata.usgs.gov/code/coord_datum_cd_query?fmt=html) + is available. drainage_area : string or list of strings, optional The area enclosed by a topographic divide from which direct surface runoff from precipitation normally drains by gravity into the stream above that @@ -432,14 +443,15 @@ def get_monitoring_locations( Local aquifers in the USGS water resources data base are identified by a geohydrologic unit code (a three-digit number related to the age of the formation, followed by a 4 or 5 character abbreviation for the geologic unit - or aquifer name). Additional information is available at this link. + or aquifer name). Additional information is available [at this link](https://help.waterdata.usgs.gov/faq/groundwater/local-aquifer-description). national_aquifer_code : string or list of strings, optional National aquifers are the principal aquifers or aquifer systems in the United States, defined as regionally extensive aquifers or aquifer systems that have the potential to be used as a source of potable water. Not all groundwater monitoring locations can be associated with a National Aquifer. Such - monitoring locations will not be retrieved using this search criteria. A list - of National aquifer codes and names is available. + monitoring locations will not be retrieved using this search criteria. A [list + of National aquifer codes and names](https://help.waterdata.usgs.gov/code/nat_aqfr_query?fmt=html) + is available. aquifer_type_code : string or list of strings, optional Groundwater occurs in aquifers under two different conditions. Where water only partly fills an aquifer, the upper surface is free to rise and decline. @@ -448,7 +460,7 @@ def get_monitoring_locations( aquifer is referred to as a confined (or artesian) aquifer. When a confined aquifer is penetrated by a well, the water level in the well will rise above the top of the aquifer (but not necessarily above land surface). Additional - information is available at this link. + information is available [at this link](https://help.waterdata.usgs.gov/faq/groundwater/local-aquifer-description). well_constructed_depth : string or list of strings, optional The depth of the finished well, in feet below land surface datum. Note: Not all groundwater monitoring locations have information on Well Depth. Such @@ -458,7 +470,8 @@ def get_monitoring_locations( Note: Not all groundwater monitoring locations have information on Hole Depth. Such monitoring locations will not be retrieved using this search criteria. depth_source_code : string or list of strings, optional - A code indicating the source of water-level data. A list of codes is available. + A code indicating the source of water-level data. A [list of codes](https://help.waterdata.usgs.gov/code/water_level_src_cd_query?fmt=html) + is available. properties : string or list of strings, optional A vector of requested columns to be returned from the query. Available options are: geometry, id, agency_code, agency_name, monitoring_location_number, @@ -573,7 +586,8 @@ def get_time_series_metadata( measured and the units of measure. 
A complete list of parameter
         codes and associated groupings can be found at
         https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
-    parameter_name :
+    parameter_name : string or list of strings, optional
+        A human-understandable name corresponding to parameter_code.
     properties : string or list of strings, optional
         A vector of requested columns to be returned from the query.
         Available options are: geometry, id, time_series_id,
@@ -596,21 +610,67 @@
         - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
         - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
         Only features that have a last_modified that intersects the value of datetime are selected.
-    begin :
-    end :
+    begin : string or list of strings, optional
+        The datetime of the earliest observation in the time series. Together with end,
+        this field represents the period of record of a time series. Note that some time
+        series may have large gaps in their collection record. This field is currently
+        in the local time of the monitoring location. We intend to update this in version
+        v0 to use UTC with a time zone. You can query this field using date-times or
+        intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals
+        may be bounded or half-bounded (double-dots at start or end). Examples:
+
+        - A date-time: "2018-02-12T23:20:50Z"
+
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+
+        Only features that have a begin that intersects the value of datetime are selected.
+    end : string or list of strings, optional
+        The datetime of the most recent observation in the time series. Data returned by
+        this endpoint updates at most once per day, and potentially less frequently than
+        that, and as such there may be more recent observations within a time series
+        than the time series end value reflects. Together with begin, this field
+        represents the period of record of a time series. It is additionally used to
+        determine whether a time series is "active". We intend to update this in
+        version v0 to use UTC with a time zone. You can query this field using date-times
+        or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals
+        may be bounded or half-bounded (double-dots at start or end). Examples:
+
+        - A date-time: "2018-02-12T23:20:50Z"
+
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+
+        Only features that have an end that intersects the value of datetime are selected.
     unit_of_measure : string or list of strings, optional
         A human-readable description of the units of measurement associated
         with an observation.
-    computation_period_identifier :
-    computation_identifier :
-    thresholds :
-    sublocation_identifier :
-    primary :
-    parent_time_series_id :
+    computation_period_identifier : string or list of strings, optional
+        Indicates the period of data used for any statistical computations.
+    computation_identifier : string or list of strings, optional
+        Indicates whether the data from this time series represent a specific statistical
+        computation. 
+ thresholds : numeric or list of numbers, optional + Thresholds represent known numeric limits for a time series, for example the + historic maximum value for a parameter or a level below which a sensor is + non-operative. These thresholds are sometimes used to automatically determine if + an observation is erroneous due to sensor error, and therefore shouldn't be included + in the time series. + sublocation_identifier : string or list of strings, optional + primary : string or list of strings, optional + parent_time_series_id : string or list of strings, optional time_series_id : string or list of strings, optional A unique identifier representing a single time series. This corresponds to the id field in the time-series-metadata endpoint. - web_description : + web_description : string or list of strings, optional + A description of what this time series represents, as used by WDFN and other USGS + data dissemination products. skipGeometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. @@ -643,7 +703,7 @@ def get_time_series_metadata( >>> # Get daily flow data from a single site >>> # over a yearlong period - >>> df = dataretrieval.waterdata.get_daily( + >>> df = dataretrieval.waterdata.get_time_series_metadata( ... monitoring_location_id = "USGS-02238500", ... parameter_code = "00060", ... time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z" @@ -651,12 +711,10 @@ def get_time_series_metadata( >>> # Get monitoring location info for specific sites >>> # and only specific properties - >>> df = dataretrieval.waterdata.get_daily( + >>> df = dataretrieval.waterdata.get_time_series_metadata( ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], - ... approval_status = "Approved", ... time = "2024-01-01/.." """ - service = "time-series-metadata" output_id = "time_series_id" @@ -690,7 +748,14 @@ def get_latest_continuous( ) -> pd.DataFrame: """This endpoint provides the most recent observation for each time series of continuous data. Continuous data are collected via automated sensors - installed at a monitoring location. They are collected at a high frequencyand often at a fixed 15-minute interval. Depending on the specific monitoring location, the data may be transmitted automatically via telemetry and be available on WDFN within minutes of collection, while other times the delivery of data may be delayed if the monitoring location does not have the capacity to automatically transmit data. Continuous data are described by parameter name and parameter code. These data might also be referred to as "instantaneous values" or "IV" + installed at a monitoring location. They are collected at a high frequency + and often at a fixed 15-minute interval. Depending on the specific monitoring + location, the data may be transmitted automatically via telemetry and be + available on WDFN within minutes of collection, while other times the delivery + of data may be delayed if the monitoring location does not have the capacity to + automatically transmit data. Continuous data are described by parameter name + and parameter code. These data might also be referred to as "instantaneous + values" or "IV" Parameters ---------- @@ -805,18 +870,16 @@ def get_latest_continuous( >>> # Get daily flow data from a single site >>> # over a yearlong period - >>> df = dataretrieval.waterdata.get_daily( + >>> df = dataretrieval.waterdata.get_latest_continuous( ... 
monitoring_location_id = "USGS-02238500",
-        ...     parameter_code = "00060",
-        ...     time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z"
+        ...     parameter_code = "00060"
         ...     )
 
         >>> # Get the latest continuous values
        >>> # for a specific list of sites
         >>> df = dataretrieval.waterdata.get_latest_continuous(
-        ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
-        ...     approval_status = "Approved",
-        ...     time = "2024-01-01/.."
+        ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"]
+        ...     )
     """
     service = "latest-continuous"
     output_id = "latest_continuous_id"
@@ -851,6 +914,139 @@ def get_field_measurements(
     max_results: Optional[int] = None,
     convertType: bool = True
     ) -> pd.DataFrame:
+    """Field measurements are physically measured values collected during
+    a visit to the monitoring location. Field measurements consist of
+    measurements of gage height and discharge, and readings of groundwater
+    levels; they are primarily used as calibration readings for the automated
+    sensors collecting continuous data. They are collected at a low frequency,
+    and delivery of the data in WDFN may be delayed due to data processing
+    time.
+
+    Parameters
+    ----------
+    monitoring_location_id : string or list of strings, optional
+        A unique identifier representing a single monitoring location. This
+        corresponds to the id field in the monitoring-locations endpoint.
+        Monitoring location IDs are created by combining the agency code of
+        the agency responsible for the monitoring location (e.g. USGS) with
+        the ID number of the monitoring location (e.g. 02238500), separated
+        by a hyphen (e.g. USGS-02238500).
+    parameter_code : string or list of strings, optional
+        Parameter codes are 5-digit codes used to identify the constituent
+        measured and the units of measure. A complete list of parameter
+        codes and associated groupings can be found at
+        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+    observing_procedure_code : string or list of strings, optional
+        A short code corresponding to the observing procedure for the field
+        measurement.
+    properties : string or list of strings, optional
+        A vector of requested columns to be returned from the query.
+        Available options are: geometry, id, time_series_id,
+        monitoring_location_id, parameter_code, statistic_id, time, value,
+        unit_of_measure, approval_status, qualifier, last_modified
+    field_visit_id : string or list of strings, optional
+        A universally unique identifier (UUID) for the field visit.
+        Multiple measurements may be made during a single field visit.
+    approval_status : string or list of strings, optional
+        Some of the data that you have obtained from this U.S. Geological
+        Survey database may not have received Director's approval. Any such
+        data values are qualified as provisional and are subject to revision.
+        Provisional data are released on the condition that neither the USGS
+        nor the United States Government may be held liable for any damages
+        resulting from its use. This field reflects the approval status of
+        each record, and is either "Approved", meaning processing review has
+        been completed and the data is approved for publication, or
+        "Provisional" and subject to revision. For more information about
+        provisional data, go to
+        https://waterdata.usgs.gov/provisional-data-statement/.
+    unit_of_measure : string or list of strings, optional
+        A human-readable description of the units of measurement associated
+        with an observation. 
+    qualifier : string or list of strings, optional
+        This field indicates any qualifiers associated with an observation, for
+        instance if a sensor may have been impacted by ice or if values were
+        estimated.
+    value : string or list of strings, optional
+        The value of the observation. Values are transmitted as strings in
+        the JSON response format in order to preserve precision.
+    last_modified : string, optional
+        The last time a record was refreshed in our database. This may happen
+        due to regular operational processes and does not necessarily indicate
+        anything about the measurement has changed. You can query this field
+        using date-times or intervals, adhering to RFC 3339, or using ISO 8601
+        duration objects. Intervals may be bounded or half-bounded (double-dots
+        at start or end). Examples:
+        - A date-time: "2018-02-12T23:20:50Z"
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a last_modified that intersects the value of datetime are selected.
+    observing_procedure : string or list of strings, optional
+        Water measurement or water-quality observing procedure descriptions.
+    vertical_datum : string or list of strings, optional
+        The datum used to determine altitude and vertical position at the monitoring location.
+        A list of codes is available.
+    measuring_agency : string or list of strings, optional
+        The agency performing the measurement.
+    skipGeometry : boolean, optional
+        This option can be used to skip response geometries for each feature. The returning
+        object will be a data frame with no spatial information.
+    time : string, optional
+        The date an observation represents. You can query this field using date-times
+        or intervals, adhering to RFC 3339, or using ISO 8601 duration objects.
+        Intervals may be bounded or half-bounded (double-dots at start or end).
+        Examples:
+        - A date-time: "2018-02-12T23:20:50Z"
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a time that intersects the value of datetime are selected. If
+        a feature has multiple temporal properties, it is the decision of the server whether
+        only a single temporal property is used to determine the extent or all relevant temporal properties.
+    bbox : list of numbers, optional
+        Only features that have a geometry that intersects the bounding box are selected.
+        The bounding box is provided as four or six numbers, depending on whether the
+        coordinate reference system includes a vertical axis (height or depth). Coordinates
+        are assumed to be in crs 4326. The expected format is a list structured:
+        [xmin, ymin, xmax, ymax]. Another way to think of it is [Western-most longitude,
+        Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+    limit : numeric, optional
+        The optional limit parameter is used to control the subset of the selected features
+        that should be returned in each page. The maximum allowable limit is 10000. It may
+        be beneficial to set this number lower if your internet connection is spotty. The
+        default (None) will set the limit to the maximum allowable limit for the service. 
+    max_results : numeric, optional
+        The optional maximum number of rows to return. This value must be less than the
+        requested limit.
+    convertType : boolean, optional
+        If True, the function will convert the time fields to dates and the qualifier
+        field to strings.
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        Formatted data returned from the API query.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # Get water-level field measurements from a single site,
+        >>> # without geometry
+        >>> df = dataretrieval.waterdata.get_field_measurements(
+        ...     monitoring_location_id = "USGS-375907091432201",
+        ...     parameter_code = "72019",
+        ...     skipGeometry = True
+        ...     )
+
+        >>> # Get field measurements for specific sites and
+        >>> # parameters over the past 20 years
+        >>> df = dataretrieval.waterdata.get_field_measurements(
+        ...     monitoring_location_id = ["USGS-451605097071701",
+        ...                               "USGS-263819081585801"],
+        ...     parameter_code = ["62611", "72019"],
+        ...     time = "P20Y"
+        ...     )
+    """
     service = "field-measurements"
     output_id = "field_measurement_id"

From fad9ce0d0c063ce89fb64eca6e429138c09484d7 Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Wed, 24 Sep 2025 20:10:30 -0500
Subject: [PATCH 17/56] fix date formatting function

---
 dataretrieval/waterdata_helpers.py | 48 +++++++++++++-----------------
 1 file changed, 20 insertions(+), 28 deletions(-)

diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py
index 164fdfc9..c0714bd6 100644
--- a/dataretrieval/waterdata_helpers.py
+++ b/dataretrieval/waterdata_helpers.py
@@ -122,7 +122,7 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False)
     datetime_input : Union[str, List[str]]
         A single date/datetime string or a list of one or two date/datetime strings.
         Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601, or relative periods (e.g., "P7D").
     date : bool, optional
-        If True, returns only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ").
+        If True, uses only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ").
     Returns
     -------
     Union[str, None]
@@ -154,38 +154,31 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False)
     # Replace all blanks with "nan"
     datetime_input = ["nan" if x == "" else x for x in datetime_input]
 
-    # If the list is of length 1, first look for things like "P7D" or dates
-    # already formatted in ISO08601. Otherwise, try to coerce to datetime
-    if len(datetime_input) == 1:
-        dt = datetime_input[0]
-        if re.search(r"P", dt, re.IGNORECASE) or "/" in dt:
-            return dt
+    if len(datetime_input) <=2:
+        # If the list is of length 1, first look for things like "P7D" or dates
+        # already formatted in ISO08601. Otherwise, try to coerce to datetime
+        if len(datetime_input) == 1 and (re.search(r"P", datetime_input[0], re.IGNORECASE) or "/" in datetime_input[0]):
+            return datetime_input[0]
+        # Otherwise, use list comprehension to parse dates
         else:
             try:
                 # Parse to naive datetime
-                parsed_dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
-                # If the service only accepts dates for this input, not datetimes (e.g. "daily"),
-                # return just the date, otherwise, return the datetime in UTC format. 
- if date: - return parsed_dt.strftime("%Y-%m-%d") - else: - dt_local = parsed_dt.replace(tzinfo=local_timezone) - # Convert to UTC and format as ISO 8601 with 'Z' - return dt_local.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") + parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input] except Exception: - return None - # If the list is of length 2, parse the dates and if necessary, combine them together into - # the date range format accepted by the API - elif len(datetime_input) == 2: - try: - parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input] + # Parse to date only + try: + parsed_dates = [datetime.strptime(dt, "%Y-%m-%d") for dt in datetime_input] + except Exception: + return None + # If the service only accepts dates for this input, not datetimes (e.g. "daily"), + # return just the dates separated by a "/", otherwise, return the datetime in UTC + # format. if date: - formatted = "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) + return "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) else: - formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_dates) - return formatted.replace("nan", "..") - except Exception: - return None + parsed_locals = [dt.replace(tzinfo=local_timezone) for dt in parsed_dates] + formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_locals) + return formatted.replace("nan", "..") else: raise ValueError("datetime_input should only include 1-2 values") @@ -333,7 +326,6 @@ def _construct_api_requests( if i in params: dates = service == "daily" and i != "last_modified" params[i] = _format_api_dates(params[i], date=dates) - #kwargs[i] = _format_api_dates(kwargs[i], date=dates) # String together bbox elements from a list to a comma-separated string, # and string together properties if provided From a33d201bb99230e4606af1bd9142ba06def6ec60 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 13:02:53 -0500 Subject: [PATCH 18/56] make waterdata outputs geopandas if geometry included --- dataretrieval/waterdata_helpers.py | 37 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index c0714bd6..3a35d40d 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -1,10 +1,10 @@ import httpx import os -import warnings from typing import List, Dict, Any, Optional, Union from datetime import datetime import pandas as pd import json +import geopandas as gpd from datetime import datetime from zoneinfo import ZoneInfo import re @@ -243,7 +243,7 @@ def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): def _error_body(resp: httpx.Response): """ - Extracts and returns an error message from an HTTP response object based on its status code. + Provide more informative error messages based on the response status. Args: resp (httpx.Response): The HTTP response object to extract the error message from. @@ -270,8 +270,10 @@ def _construct_api_requests( ): """ Constructs an HTTP request object for the specified water data API service. - Depending on the input parameters, the function determines whether to use a GET or POST request, - formats parameters appropriately, and sets required headers. 
+ Depending on the input parameters (whether there's lists of multiple argument values), + the function determines whether to use a GET or POST request, formats parameters + appropriately, and sets required headers. + Args: service (str): The name of the API service to query (e.g., "daily"). properties (Optional[List[str]], optional): List of property names to include in the request. @@ -382,21 +384,25 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: resp (httpx.Response): The HTTP response object expected to contain a JSON body with a "features" key. Returns: - pd.DataFrame: A pandas DataFrame containing the normalized feature properties. - Returns an empty DataFrame if no features are returned. - - Notes: - - Drops columns "type", "geometry", and "AsGeoJSON(geometry)" if present. - - Flattens nested properties and removes the "properties_" prefix from column names. + gpd.GeoDataFrame or pd.DataFrame: A geopandas GeoDataFrame if geometry is included, or a + pandas DataFrame containing the feature properties and each row's service-specific id. + Returns an empty pandas DataFrame if no features are returned. """ body = resp.json() if not body.get("numberReturned"): return pd.DataFrame() - df = pd.json_normalize( - resp.json()["features"], - sep="_") - df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") - df.columns = [col.replace("properties_", "") for col in df.columns] + #df = pd.json_normalize( + # resp.json()["features"], + # sep="_") + #df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") + #df.columns = [col.replace("properties_", "") for col in df.columns] + + df = gpd.GeoDataFrame.from_features(body["features"]) + df["id"] = pd.json_normalize(body["features"])["id"].values + + if df["geometry"].isnull().all(): + df = pd.DataFrame(df.drop(columns="geometry")) + return df def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: @@ -451,7 +457,6 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional if resp.status_code != 200: raise Exception(_error_body(resp)) df1 = _get_resp_data(resp) dfs = pd.concat([dfs, df1], ignore_index=True) - #dfs.append(df1) curr_url = _next_req_url(resp) except Exception: failures.append(curr_url) From bd82c4900a5f44145691643e1c4a156dd0181f3e Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 13:25:33 -0500 Subject: [PATCH 19/56] make gpd an optional dependency and change returns accordingly --- dataretrieval/waterdata_helpers.py | 31 +++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 3a35d40d..0c84ea59 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -1,13 +1,21 @@ import httpx import os +import warnings from typing import List, Dict, Any, Optional, Union from datetime import datetime import pandas as pd import json -import geopandas as gpd from datetime import datetime from zoneinfo import ZoneInfo import re +try: + import geopandas as gpd + gpd = True +except ImportError: + warnings.warn("Geopandas is not installed. 
Data frames containing geometry will be returned as pandas DataFrames.", ImportWarning) + gpd = False + + BASE_API = "https://api.waterdata.usgs.gov/ogcapi/" API_VERSION = "v0" @@ -388,18 +396,27 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: pandas DataFrame containing the feature properties and each row's service-specific id. Returns an empty pandas DataFrame if no features are returned. """ + # Check if it's an empty response body = resp.json() if not body.get("numberReturned"): return pd.DataFrame() - #df = pd.json_normalize( - # resp.json()["features"], - # sep="_") - #df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") - #df.columns = [col.replace("properties_", "") for col in df.columns] + # If geopandas not installed, return a pandas dataframe + if not gpd: + df = pd.json_normalize( + body["features"], + sep="_") + df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") + df.columns = [col.replace("properties_", "") for col in df.columns] + return df + + # Organize json into geodataframe and make sure id column comes along. df = gpd.GeoDataFrame.from_features(body["features"]) df["id"] = pd.json_normalize(body["features"])["id"].values + df = df[["id"] + [col for col in df.columns if col != "id"]] + # If no geometry present, then return pandas dataframe. A geodataframe + # is not needed. if df["geometry"].isnull().all(): df = pd.DataFrame(df.drop(columns="geometry")) @@ -506,7 +523,7 @@ def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: Returns ------- - pd.DataFrame + pd.DataFrame or gpd.GeoDataFrame The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. """ if properties and not all(pd.isna(properties)): From 06b0e69e90fb9f179382e0b32c62bcb3d0c891cc Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 13:46:50 -0500 Subject: [PATCH 20/56] incorporate geopandas boolean into function arguments and ensure user knows when they will receive a pandas df --- dataretrieval/waterdata_helpers.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 0c84ea59..19937afc 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -10,10 +10,9 @@ import re try: import geopandas as gpd - gpd = True + geopd = True except ImportError: - warnings.warn("Geopandas is not installed. Data frames containing geometry will be returned as pandas DataFrames.", ImportWarning) - gpd = False + geopd = False @@ -384,7 +383,7 @@ def _next_req_url(resp: httpx.Response) -> Optional[str]: return next_url return None -def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: +def _get_resp_data(resp: httpx.Response, geopd: bool) -> pd.DataFrame: """ Extracts and normalizes data from an httpx.Response object containing GeoJSON features. 
@@ -402,7 +401,7 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: return pd.DataFrame() # If geopandas not installed, return a pandas dataframe - if not gpd: + if not geopd: df = pd.json_normalize( body["features"], sep="_") @@ -422,7 +421,7 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: return df -def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: +def _walk_pages(geopd: bool, req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: """ Iterates through paginated API responses and aggregates the results into a single DataFrame. @@ -452,6 +451,9 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional """ print(f"Requesting:\n{req.url}") + if not geopd: + print("Geopandas is not installed. Data frames containing geometry will be returned as pandas DataFrames.") + # Get first response from client # using GET or POST call client = client or httpx.Client() @@ -465,14 +467,14 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional content = req.content if method == "POST" else None if max_results is None or pd.isna(max_results): - dfs = _get_resp_data(resp) + dfs = _get_resp_data(resp, geopd=geopd) curr_url = _next_req_url(resp) failures = [] while curr_url: try: resp = client.request(method, curr_url, headers=headers, content=content if method == "POST" else None) if resp.status_code != 200: raise Exception(_error_body(resp)) - df1 = _get_resp_data(resp) + df1 = _get_resp_data(resp, geopd=geopd) dfs = pd.concat([dfs, df1], ignore_index=True) curr_url = _next_req_url(resp) except Exception: @@ -483,7 +485,7 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional return dfs else: resp.raise_for_status() - return _get_resp_data(resp) + return _get_resp_data(resp, geopd=geopd) def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: """ @@ -604,7 +606,7 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF # Build API request req = _construct_api_requests(**args) # Run API request and iterate through pages if needed - return_list = _walk_pages(req, max_results) + return_list = _walk_pages(geopd=geopd, req=req, max_results=max_results) # Manage some aspects of the returned dataset return_list = _deal_with_empty(return_list, properties, service) if convertType: From 253da79846cbf13beb43c51e71c6e1c0642b6fe8 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 14:07:26 -0500 Subject: [PATCH 21/56] clean up some documentation and comments --- dataretrieval/waterdata.py | 10 +++++----- dataretrieval/waterdata_helpers.py | 14 +++++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index 3c2335a8..04f36157 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -207,7 +207,7 @@ def get_daily( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. Examples @@ -506,7 +506,7 @@ def get_monitoring_locations( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. 
Examples @@ -694,7 +694,7 @@ def get_time_series_metadata( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. Examples @@ -861,7 +861,7 @@ def get_latest_continuous( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. Examples @@ -1023,7 +1023,7 @@ def get_field_measurements( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. Examples diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 19937afc..ef8e2359 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -512,7 +512,7 @@ def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: """ - Rearranges and renames columns in a DataFrame based on provided properties and output identifier. + Rearranges and renames columns in a DataFrame based on provided properties and service's output id. Parameters ---------- @@ -530,8 +530,14 @@ def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: """ if properties and not all(pd.isna(properties)): if "id" not in properties: + # If user refers to service-specific output id in properties, + # then rename the "id" column to the output_id (id column is + # automatically included). if output_id in properties: df = df.rename(columns={"id": output_id}) + # If output id is not in properties, but user requests the plural + # of the output_id (e.g. "monitoring_locations_id"), then rename + # "id" to plural. This is pretty niche. else: plural = output_id.replace("_id", "s_id") if plural in properties: @@ -581,8 +587,8 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF service (str): The OGC service type (e.g., "wfs", "wms"). Returns: - pd.DataFrame: A DataFrame containing the retrieved and processed OGC data, with metadata attributes - including the request URL and query timestamp. + pd.DataFrame or gpd.GeoDataFrame: A DataFrame containing the retrieved and processed OGC data, + with metadata attributes including the request URL and query timestamp. Notes: - The function does not mutate the input `args` dictionary. 
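[Editor's note: the public endpoint functions in waterdata.py all funnel into get_ogc_data() using the same argument-collection idiom visible in the diffs above. A minimal sketch of that pattern follows; the function name get_example and its trimmed parameter list are hypothetical, shown only to illustrate the idiom, not an actual endpoint.]

```python
from typing import Optional

from dataretrieval import waterdata_helpers

def get_example(
    monitoring_location_id: Optional[str] = None,
    parameter_code: Optional[str] = None,
    limit: Optional[int] = None,
    max_results: Optional[int] = None,
    convertType: bool = True,
):
    service = "daily"        # an OGC collection name, e.g. "daily"
    output_id = "daily_id"   # service-specific name given to the returned "id" column

    # Collect the caller's keyword arguments via locals(), dropping the two
    # internal names and anything left unset, so only real query parameters
    # are forwarded to the shared helper.
    args = {
        k: v for k, v in locals().items()
        if k not in {"service", "output_id"} and v is not None
    }
    return waterdata_helpers.get_ogc_data(args, output_id, service)
```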
@@ -637,5 +643,3 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF # resp.raise_for_status() # return resp.json() -# def _explode_post(ls: Dict[str, Any]): -# return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} \ No newline at end of file From f5cca0777e63a753724aa34c4df745b31fcc29ed Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 14:14:30 -0500 Subject: [PATCH 22/56] add optional dependency to pyproject.toml --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index a276f113..e55dc812 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,10 @@ nldi = [ 'geopandas>=0.10' ] +waterdata = [ + 'geopandas>=0.10', +] + [project.urls] homepage = "https://github.com/DOI-USGS/dataretrieval-python" documentation = "https://doi-usgs.github.io/dataretrieval-python/" From 5c546e7e3baca5f9713d45a83ddc97ccf1763c0e Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 16:58:06 -0500 Subject: [PATCH 23/56] set convertType to default or user specification --- dataretrieval/waterdata.py | 5 ----- dataretrieval/waterdata_helpers.py | 8 ++------ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index 04f36157..c2d70313 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -237,7 +237,6 @@ def get_daily( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) @@ -536,7 +535,6 @@ def get_monitoring_locations( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) @@ -723,7 +721,6 @@ def get_time_series_metadata( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) @@ -889,7 +886,6 @@ def get_latest_continuous( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) @@ -1055,7 +1051,6 @@ def get_field_measurements( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index ef8e2359..c535afdd 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -1,11 +1,9 @@ import httpx import os -import warnings from typing import List, Dict, Any, Optional, Union from datetime import datetime import pandas as pd import json -from datetime import datetime from zoneinfo import ZoneInfo import re try: @@ -158,9 +156,6 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) if all(pd.isna(dt) or dt == "" or dt == None for dt in datetime_input): return None - # Replace all blanks with "nan" - datetime_input = ["nan" if x == "" else x for x in datetime_input] - if len(datetime_input) <=2: # If the list is of length 1, first look for things like "P7D" or dates # already formatted in ISO08601. 
Otherwise, try to coerce to datetime @@ -185,7 +180,7 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) else: parsed_locals = [dt.replace(tzinfo=local_timezone) for dt in parsed_dates] formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_locals) - return formatted.replace("nan", "..") + return formatted else: raise ValueError("datetime_input should only include 1-2 values") @@ -407,6 +402,7 @@ def _get_resp_data(resp: httpx.Response, geopd: bool) -> pd.DataFrame: sep="_") df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") df.columns = [col.replace("properties_", "") for col in df.columns] + df.rename(columns={"geometry_coordinates": "geometry"}, inplace=True) return df # Organize json into geodataframe and make sure id column comes along. From e9221ac68831722b8cf15858090fcbbcbdcfdf52 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 16:58:27 -0500 Subject: [PATCH 24/56] start unit tests on new functions --- tests/waterdata_test.py | 90 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 50eefdc5..d0e7a49e 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -6,6 +6,11 @@ from dataretrieval.waterdata import ( _check_profiles, get_samples, + get_daily, + get_monitoring_locations, + get_latest_continuous, + get_field_measurements, + get_time_series_metadata, _SERVICES, _PROFILES ) @@ -105,3 +110,88 @@ def test_samples_organizations(): ) assert len(df) == 1 assert df.size == 3 + +def test_get_daily(): + df = get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + time="2025-01-01/.." + ) + assert "daily_id" in df.columns + assert "geometry" in df.columns + assert df.shape[1] == 12 + assert df.parameter_code.unique().tolist() == ["00060"] + assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"] + assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all() + assert df["value"].dtype == "float64" + +def test_get_daily_properties(): + df = get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + time="2025-01-01/..", + properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"] + ) + assert "daily_id" in df.columns + assert "geometry" in df.columns + assert df.shape[1] == 6 + assert (df["time"] >= datetime.date(2025, 1, 1)).all() + +def test_get_daily_no_geometry(): + df = get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + time="2025-01-01/..", + skipGeometry=True + ) + assert "geometry" not in df.columns + assert df.shape[1] == 11 + assert isinstance(df, DataFrame) + +def test_get_monitoring_locations(): + df = get_monitoring_locations( + state_name="Connecticut", + site_type_code="GW" + ) + assert df.site_type_code.unique().tolist() == ["GW"] + +def test_get_monitoring_locations_hucs(): + df = get_monitoring_locations( + hydrologic_unit_code=["010802050102", "010802050103"] + ) + assert set(df.hydrologic_unit_code.unique().tolist()) == {"010802050102", "010802050103"} + +def test_get_latest_continuous(): + df = get_latest_continuous( + monitoring_location_id=["USGS-05427718", "USGS-05427719"], + parameter_code=["00060", "00065"] + ) + assert df.shape[0] <= 4 + assert df.statistic_id.unique().tolist() == ["00011"] + try: + datetime.datetime.strptime(df['time'].iloc[0], "%Y-%m-%dT%H:%M:%S+00:00") + out=True + except: + 
out=False + assert out + +def test_get_field_measurements(): + df = get_field_measurements( + monitoring_location_id="USGS-05427718", + unit_of_measure="ft^3/s", + time="2025-01-01/2025-10-01", + skipGeometry=True + ) + assert "field_measurement_id" in df.columns + assert "geometry" not in df.columns + assert df.unit_of_measure.unique().tolist() == ["ft^3/s"] + +def test_get_time_series_metadata(): + df = get_time_series_metadata( + bbox=[-89.840355,42.853411,-88.818626,43.422598], + parameter_code=["00060", "00065", "72019"], + skipGeometry=True + ) + assert set(df['parameter_name'].unique().tolist()) == {"Gage height", "Water level, depth LSD", "Discharge"} + + From b1436db5e7ea5ac355b26eeaa7ec38fd7b73effe Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 26 Sep 2025 10:19:42 -0500 Subject: [PATCH 25/56] update README and add a NEWS markdown in which to place past updates --- NEWS.md | 7 +++++++ README.md | 25 ++++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 NEWS.md diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..a071d491 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,7 @@ +**10/01/2025:** `dataretrieval` is pleased to offer a new module, `waterdata`, which gives users access USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, instantaneous values, field measurements (modernized groundwater levels service), time series metadata, and discrete water quality data from the Samples database. Though there will be a period of overlap, the functions within `waterdata` will eventually replace the `nwis` module, which currently provides access to the legacy [NWIS Water Services](https://waterservices.usgs.gov/). More example workflows and functions coming soon. Check `help(waterdata)` for more information. + +**09/03/2024:** The groundwater levels service has switched endpoints, and `dataretrieval` was updated accordingly in [`v1.0.10`](https://github.com/DOI-USGS/dataretrieval-python/releases/tag/v1.0.10). Older versions using the discontinued endpoint will return 503 errors for `nwis.get_gwlevels` or the `service='gwlevels'` argument. Visit [Water Data For the Nation](https://waterdata.usgs.gov/blog/wdfn-waterservices-2024/) for more information. + +**03/01/2024:** USGS data availability and format have changed on Water Quality Portal (WQP). Since March 2024, data obtained from WQP legacy profiles will not include new USGS data or recent updates to existing data. All USGS data (up to and beyond March 2024) are available using the new WQP beta services. You can access the beta services by setting `legacy=False` in the functions in the `wqp` module. + +To view the status of changes in data availability and code functionality, visit: https://doi-usgs.github.io/dataRetrieval/articles/Status.html \ No newline at end of file diff --git a/README.md b/README.md index f8c14a36..74641211 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,19 @@ ![Conda Version](https://img.shields.io/conda/v/conda-forge/dataretrieval) ![Downloads](https://static.pepy.tech/badge/dataretrieval) -:warning: USGS data availability and format have changed on Water Quality Portal (WQP). Since March 2024, data obtained from WQP legacy profiles will not include new USGS data or recent updates to existing data. All USGS data (up to and beyond March 2024) are available using the new WQP beta services. You can access the beta services by setting `legacy=False` in the functions in the `wqp` module. 
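[Editor's note: the `legacy=False` switch mentioned in the entry above looks like the following in practice. This is a sketch only; it assumes the existing `wqp.get_results` interface, and the site ID is an arbitrary example.]

```python
from dataretrieval import wqp

# Request discrete water quality results from the WQP beta (WQX 3.0)
# services instead of the legacy profiles by passing legacy=False.
# wqp.get_results returns a (DataFrame, metadata) tuple.
df, md = wqp.get_results(siteid="USGS-04024315", legacy=False)
print(df.head())
```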
+## Latest Announcements -To view the status of changes in data availability and code functionality, visit: https://doi-usgs.github.io/dataRetrieval/articles/Status.html +:mega: **10/01/2025:** `dataretrieval` is pleased to offer a new, *in-development* module, `waterdata`, which gives users access USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, instantaneous values, field measurements (modernized groundwater levels service), time series metadata, and discrete water quality data from the Samples database. Though there will be a period of overlap, the functions within `waterdata` will eventually replace the `nwis` module, which currently provides access to the legacy [NWIS Water Services](https://waterservices.usgs.gov/). More example workflows and functions coming soon. Check `help(waterdata)` for more information. -:mega: **09/03/2024:** The groundwater levels service has switched endpoints, and `dataretrieval` was updated accordingly in [`v1.0.10`](https://github.com/DOI-USGS/dataretrieval-python/releases/tag/v1.0.10). Older versions using the discontinued endpoint will return 503 errors for `nwis.get_gwlevels` or the `service='gwlevels'` argument. Visit [Water Data For the Nation](https://waterdata.usgs.gov/blog/wdfn-waterservices-2024/) for more information. +**Important:** Users of the Water Data APIs are strongly encouraged to obtain an API key, which gives users higher rate limits and thus greater access to USGS data. [Register for an API key](https://api.waterdata.usgs.gov/signup/) and then place that API key in your python environment as an environment variable named "API_USGS_PAT". One option is to set the variable as follows: + +```python +import os +os.environ["API_USGS_PAT"] = "your_api_key_here" +``` +Note that you may need to restart your python session for the environment variable to be recognized. + +Check out the [NEWS](NEWS.md) file for all updates and announcements, or track updates to the package via the GitHub releases. ## What is dataretrieval? `dataretrieval` was created to simplify the process of loading hydrologic data into the Python environment. @@ -20,8 +28,7 @@ Environmental Protection Agency (EPA), U.S. Department of Agriculture (USDA), and USGS. Direct USGS data is obtained from a service called the National Water Information System (NWIS). -Note that the python version is not a direct port of the original: it attempts to reproduce the functionality of the R package, -though its organization and interface often differ. +Note that the python version is not a direct port of the original: it attempts to reproduce the functionality of the R package, though its organization and interface often differ. If there's a hydrologic or environmental data portal that you'd like dataretrieval to work with, raise it as an [issue](https://github.com/USGS-python/dataretrieval/issues). @@ -53,7 +60,7 @@ Water quality data are available from: - [Samples](https://waterdata.usgs.gov/download-samples/#dataProfile=site) - Discrete USGS water quality data only - [Water Quality Portal](https://www.waterqualitydata.us/) - Discrete water quality data from USGS and EPA. Older data are available in the legacy WQX version 2 format; all data are available in the beta WQX3.0 format. -To access the full functionality available from NWIS web services, nwis.get record appends any additional kwargs into the REST request. 
For example, this function call: +To access the full functionality available from NWIS web services, `nwis.get_record()` appends any additional kwargs into the REST request. For example, this function call: ```python nwis.get_record(sites='03339000', service='dv', start='2017-12-31', parameterCd='00060') ``` @@ -67,8 +74,6 @@ For example nwis.get_record(sites='05404147',service='iv', start='2021-01-01', end='2021-3-01', access='3') ``` -More services and documentation to come! - ## Quick start dataretrieval can be installed using pip: @@ -99,13 +104,11 @@ For more details, see the file [CONTRIBUTING.md](CONTRIBUTING.md). ## Need help? -The Water Mission Area of the USGS supports the development and maintenance of `dataretrieval`. Any questions can be directed to the Computational Tools team at -comptools@usgs.gov. +The Water Mission Area of the USGS supports the development and maintenance of `dataretrieval`. Any questions can be directed to the Computational Tools team at comptools@usgs.gov. Resources are available primarily for maintenance and responding to user questions. Priorities on the development of new features are determined by the `dataretrieval` development team. - ## Acknowledgments This material is partially based upon work supported by the National Science Foundation (NSF) under award 1931297. Any opinions, findings, conclusions, or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the NSF. From dc24658e0e8a292fe8296c62015871b5ad374563 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 26 Sep 2025 10:57:17 -0500 Subject: [PATCH 26/56] make a few small changes to names and documentation --- dataretrieval/waterdata_helpers.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index c535afdd..0b5ad14b 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -21,7 +21,7 @@ _cached_base_url = None def _base_url(): """ - Returns the base URL for the USGS Water Data OGC API. + Returns the base URL for the USGS Water Data APIs. Uses a cached value to avoid repeated string formatting. If the cached value is not set, it constructs the base URL using the BASE_API and API_VERSION constants. @@ -222,7 +222,7 @@ def _default_headers(): headers["X-Api-Key"] = token return headers -def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): +def _check_ogc_requests(endpoint: str = "daily", req_type: str = "queryables"): """ Sends an HTTP GET request to the specified OGC endpoint and request type, returning the JSON response. @@ -281,7 +281,7 @@ def _construct_api_requests( properties (Optional[List[str]], optional): List of property names to include in the request. bbox (Optional[List[float]], optional): Bounding box coordinates as a list of floats. limit (Optional[int], optional): Maximum number of results to return per request. - max_results (Optional[int], optional): Maximum number of results allowed by the API. + max_results (Optional[int], optional): Maximum number of rows to return. skipGeometry (bool, optional): Whether to exclude geometry from the response. **kwargs: Additional query parameters, including date/time filters and other API-specific options. 
Returns: @@ -296,14 +296,6 @@ def _construct_api_requests( baseURL = _setup_api(service) # Single parameters can only have one value single_params = {"datetime", "last_modified", "begin", "end", "time"} - # params = {k: v for k, v in kwargs.items() if k in single_params} - # # Set skipGeometry parameter - # params["skipGeometry"] = skipGeometry - # # If limit is none and max_results is not none, then set limit to max results. Otherwise, - # # if max_results is none, set it to 10000 (the API max). - # params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 - # if max_results is not None and limit is not None and limit > max_results: - # raise ValueError("limit cannot be greater than max_result") # Identify which parameters should be included in the POST content body post_params = { @@ -384,6 +376,7 @@ def _get_resp_data(resp: httpx.Response, geopd: bool) -> pd.DataFrame: Parameters: resp (httpx.Response): The HTTP response object expected to contain a JSON body with a "features" key. + geopd (bool): Indicates whether geopandas is installed and should be used to handle geometries. Returns: gpd.GeoDataFrame or pd.DataFrame: A geopandas GeoDataFrame if geometry is included, or a @@ -423,10 +416,12 @@ def _walk_pages(geopd: bool, req: httpx.Request, max_results: Optional[int], cli Parameters ---------- + geopd : bool + Indicates whether geopandas is installed and should be used for handling geometries. req : httpx.Request The initial HTTP request to send. max_results : Optional[int] - The maximum number of results to retrieve. If None or NaN, retrieves all available pages. + Maximum number of rows to return. If None or NaN, retrieves all available pages. client : Optional[httpx.Client], default None An optional HTTP client to use for requests. If not provided, a new client is created. @@ -501,12 +496,12 @@ def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], """ if return_list.empty: if not properties or all(pd.isna(properties)): - schema = _check_OGC_requests(endpoint=service, req_type="schema") + schema = _check_ogc_requests(endpoint=service, req_type="schema") properties = list(schema.get("properties", {}).keys()) return pd.DataFrame(columns=properties) return return_list -def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: +def _arrange_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: """ Rearranges and renames columns in a DataFrame based on provided properties and service's output id. 
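For readers tracing `_next_req_url` and `_walk_pages` together, here is a minimal sketch of the link-following pattern they implement; the function name and JSON handling below are illustrative only and are not part of the patch:

```python
import httpx

def fetch_all_features(url: str, headers: dict) -> list:
    """Collect features across pages by following OGC 'next' links."""
    features = []
    with httpx.Client() as client:
        while url:
            resp = client.get(url, headers=headers)
            resp.raise_for_status()
            body = resp.json()
            features.extend(body.get("features", []))
            # OGC API responses advertise pagination in a top-level "links"
            # array; the next page, if any, carries rel == "next".
            url = next(
                (link["href"] for link in body.get("links", [])
                 if link.get("rel") == "next"),
                None,
            )
    return features
```

Accumulating the pages before building a single table mirrors what `_walk_pages` does with `pd.concat`.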
@@ -613,7 +608,7 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF return_list = _deal_with_empty(return_list, properties, service) if convertType: return_list = _cleanup_cols(return_list, service=service) - return_list = _rejigger_cols(return_list, properties, output_id) + return_list = _arrange_cols(return_list, properties, output_id) # Add metadata return_list.attrs.update(request=req.url, queryTime=pd.Timestamp.now()) return return_list From 89b960ca822a7dec3a601ee25791216c346f7534 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 26 Sep 2025 11:14:20 -0500 Subject: [PATCH 27/56] define max_results when it is an input --- dataretrieval/waterdata_helpers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 0b5ad14b..70e9530c 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -310,6 +310,10 @@ def _construct_api_requests( # If limit is none and max_results is not none, then set limit to max results. Otherwise, # if max_results is none, set it to 10000 (the API max). params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 + # Add max results as a parameter if it is not None + if max_results is not None: + params["max_results"] = max_results + if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") From 1237777bcf0e036be065bd45e845ede92294a4c8 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 26 Sep 2025 13:27:42 -0500 Subject: [PATCH 28/56] comment out code that wasn't doing the correct thing with max_results --- dataretrieval/waterdata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index c2d70313..7c503b23 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -683,7 +683,7 @@ def get_time_series_metadata( The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The - default (NA) will set the limit to the maximum allowable limit for the service. + default (None) will set the limit to the maximum allowable limit for the service. max_results : numeric, optional The optional maximum number of rows to return. This value must be less than the requested limit. @@ -849,7 +849,7 @@ def get_latest_continuous( The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The - default (NA) will set the limit to the maximum allowable limit for the service. + default (None) will set the limit to the maximum allowable limit for the service. max_results : numeric, optional The optional maximum number of rows to return. This value must be less than the requested limit. @@ -1010,7 +1010,7 @@ def get_field_measurements( The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The - default (NA) will set the limit to the maximum allowable limit for the service. 
+        default (None) will set the limit to the maximum allowable limit for the service.
     max_results : numeric, optional
         The optional maximum number of rows to return. This value must be less than the
         requested limit.

From e84984ae35218483c6914901a5ba99cd14ebe79e Mon Sep 17 00:00:00 2001
From: thodson-usgs
Date: Mon, 29 Sep 2025 09:11:57 -0500
Subject: [PATCH 29/56] Revert waterdata to requests

---
 dataretrieval/waterdata_helpers.py | 117 +++++++++++++++++------------
 1 file changed, 67 insertions(+), 50 deletions(-)

diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py
index 70e9530c..6cfc233c 100644
--- a/dataretrieval/waterdata_helpers.py
+++ b/dataretrieval/waterdata_helpers.py
@@ -1,4 +1,4 @@
-import httpx
+import requests
 import os
 from typing import List, Dict, Any, Optional, Union
 from datetime import datetime
@@ -235,20 +235,20 @@ def _check_ogc_requests(endpoint: str = "daily", req_type: str = "queryables"):
 
     Raises:
         AssertionError: If req_type is not "queryables" or "schema".
-        httpx.HTTPStatusError: If the HTTP request returns an unsuccessful status code.
+        requests.HTTPError: If the HTTP request returns an unsuccessful status code.
     """
     assert req_type in ["queryables", "schema"]
     url = f"{_base_url()}collections/{endpoint}/{req_type}"
-    resp = httpx.get(url, headers=_default_headers())
+    resp = requests.get(url, headers=_default_headers())
     resp.raise_for_status()
     return resp.json()
 
-def _error_body(resp: httpx.Response):
+def _error_body(resp: requests.Response):
     """
     Provide more informative error messages based on the response status.
 
     Args:
-        resp (httpx.Response): The HTTP response object to extract the error message from.
+        resp (requests.Response): The HTTP response object to extract the error message from.
 
     Returns:
         str: The extracted error message. For status code 429, returns the 'message' field from the JSON error object.
@@ -281,7 +281,7 @@ def _construct_api_requests(
         skipGeometry (bool, optional): Whether to exclude geometry from the response.
         **kwargs: Additional query parameters, including date/time filters and other API-specific options.
     Returns:
-        httpx.Request: The constructed HTTP request object ready to be sent.
+        requests.PreparedRequest: The constructed HTTP request object ready to be sent.
     Raises:
         ValueError: If `limit` is greater than `max_results`.
     Notes:
@@ -338,17 +338,28 @@ def _construct_api_requests(
 
     if POST:
         headers["Content-Type"] = "application/query-cql-json"
-        req = httpx.Request(method="POST", url=baseURL, headers=headers, content=_cql2_param(post_params), params=params)
+        request = requests.Request(
+            method="POST",
+            url=baseURL,
+            headers=headers,
+            data=_cql2_param(post_params),
+            params=params,
+        )
     else:
-        req = httpx.Request(method="GET", url=baseURL, headers=headers, params=params)
-    return req
+        request = requests.Request(
+            method="GET",
+            url=baseURL,
+            headers=headers,
+            params=params,
+        )
+    return request.prepare()
 
-def _next_req_url(resp: httpx.Response) -> Optional[str]:
+def _next_req_url(resp: requests.Response) -> Optional[str]:
     """
     Extracts the URL for the next page of results from an HTTP response from a water data endpoint.
 
     Parameters:
-        resp (httpx.Response): The HTTP response object containing JSON data and headers.
+        resp (requests.Response): The HTTP response object containing JSON data and headers.
 
     Returns:
         Optional[str]: The URL for the next page of results if available, otherwise None.
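Because the revert swaps `httpx.Request` for the two-step prepare/send flow in `requests`, a small sketch of that flow may help review. The endpoint, query parameters, and filter body below are placeholders, with the body shape only loosely mirroring what `_cql2_param` emits:

```python
import json
import requests

# Placeholder endpoint and query parameters for illustration only.
url = "https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items"
params = {"limit": 10000, "skipGeometry": True}

# GET: filters travel in the query string.
get_req = requests.Request("GET", url, params=params).prepare()

# POST: a CQL2 JSON filter travels in the body, flagged by its content type.
body = {"property": "monitoring_location_id", "parameter": ["USGS-02238500"]}
post_req = requests.Request(
    "POST",
    url,
    headers={"Content-Type": "application/query-cql-json"},
    data=json.dumps(body),
    params=params,
).prepare()

with requests.Session() as session:
    resp = session.send(get_req)
    resp.raise_for_status()
```

Preparing the request up front is what lets the pager reuse the same method, headers, and body when it re-issues the call for later pages.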
@@ -374,12 +385,12 @@ def _next_req_url(resp: httpx.Response) -> Optional[str]: return next_url return None -def _get_resp_data(resp: httpx.Response, geopd: bool) -> pd.DataFrame: +def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame: """ - Extracts and normalizes data from an httpx.Response object containing GeoJSON features. + Extracts and normalizes data from an HTTP response containing GeoJSON features. Parameters: - resp (httpx.Response): The HTTP response object expected to contain a JSON body with a "features" key. + resp (requests.Response): The HTTP response object expected to contain a JSON body with a "features" key. geopd (bool): Indicates whether geopandas is installed and should be used to handle geometries. Returns: @@ -414,7 +425,7 @@ def _get_resp_data(resp: httpx.Response, geopd: bool) -> pd.DataFrame: return df -def _walk_pages(geopd: bool, req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: +def _walk_pages(geopd: bool, req: requests.PreparedRequest, max_results: Optional[int], client: Optional[requests.Session] = None) -> pd.DataFrame: """ Iterates through paginated API responses and aggregates the results into a single DataFrame. @@ -422,11 +433,11 @@ def _walk_pages(geopd: bool, req: httpx.Request, max_results: Optional[int], cli ---------- geopd : bool Indicates whether geopandas is installed and should be used for handling geometries. - req : httpx.Request + req : requests.PreparedRequest The initial HTTP request to send. max_results : Optional[int] Maximum number of rows to return. If None or NaN, retrieves all available pages. - client : Optional[httpx.Client], default None + client : Optional[requests.Session], default None An optional HTTP client to use for requests. If not provided, a new client is created. 
Returns @@ -451,36 +462,43 @@ def _walk_pages(geopd: bool, req: httpx.Request, max_results: Optional[int], cli # Get first response from client # using GET or POST call - client = client or httpx.Client() - resp = client.send(req) - if resp.status_code != 200: raise Exception(_error_body(resp)) - - # Grab some aspects of the original request: headers and the - # request type (GET or POST) - method = req.method.upper() - headers = req.headers - content = req.content if method == "POST" else None - - if max_results is None or pd.isna(max_results): - dfs = _get_resp_data(resp, geopd=geopd) - curr_url = _next_req_url(resp) - failures = [] - while curr_url: - try: - resp = client.request(method, curr_url, headers=headers, content=content if method == "POST" else None) - if resp.status_code != 200: raise Exception(_error_body(resp)) - df1 = _get_resp_data(resp, geopd=geopd) - dfs = pd.concat([dfs, df1], ignore_index=True) - curr_url = _next_req_url(resp) - except Exception: - failures.append(curr_url) - curr_url = None - if failures: - print(f"There were {len(failures)} failed requests.") - return dfs - else: - resp.raise_for_status() - return _get_resp_data(resp, geopd=geopd) + close_client = client is None + client = client or requests.Session() + try: + resp = client.send(req) + if resp.status_code != 200: + raise Exception(_error_body(resp)) + + # Grab some aspects of the original request: headers and the + # request type (GET or POST) + method = req.method.upper() + headers = dict(req.headers) + content = req.body if method == "POST" else None + + if max_results is None or pd.isna(max_results): + dfs = _get_resp_data(resp, geopd=geopd) + curr_url = _next_req_url(resp) + failures = [] + while curr_url: + try: + resp = client.request(method, curr_url, headers=headers, data=content if method == "POST" else None) + if resp.status_code != 200: + raise Exception(_error_body(resp)) + df1 = _get_resp_data(resp, geopd=geopd) + dfs = pd.concat([dfs, df1], ignore_index=True) + curr_url = _next_req_url(resp) + except Exception: + failures.append(curr_url) + curr_url = None + if failures: + print(f"There were {len(failures)} failed requests.") + return dfs + else: + resp.raise_for_status() + return _get_resp_data(resp, geopd=geopd) + finally: + if close_client: + client.close() def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: """ @@ -627,14 +645,13 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF # def _get_params(service: str): # url = f"{_base_url()}collections/{service}/schema" -# resp = httpx.get(url, headers=_default_headers()) +# resp = requests.get(url, headers=_default_headers()) # resp.raise_for_status() # properties = resp.json().get("properties", {}) # return {k: v.get("description") for k, v in properties.items()} # def _get_collection(): # url = f"{_base_url()}openapi?f=json" -# resp = httpx.get(url, headers=_default_headers()) +# resp = requests.get(url, headers=_default_headers()) # resp.raise_for_status() # return resp.json() - From 4c84fc09a7019a93adcb76884902db903b6898aa Mon Sep 17 00:00:00 2001 From: Timothy Hodson Date: Wed, 1 Oct 2025 21:43:02 -0500 Subject: [PATCH 30/56] Review waterdata module --- .github/workflows/python-package.yml | 4 +- dataretrieval/samples.py | 12 +- dataretrieval/waterdata/__init__.py | 43 + .../{waterdata.py => waterdata/api.py} | 1024 ++++++++--------- dataretrieval/waterdata/types.py | 56 + .../utils.py} | 568 +++++---- tests/nldi_test.py | 28 +- 
tests/waterdata_test.py | 46 +- tests/waterservices_test.py | 48 +- tests/wqp_test.py | 20 +- 10 files changed, 1042 insertions(+), 807 deletions(-) create mode 100644 dataretrieval/waterdata/__init__.py rename dataretrieval/{waterdata.py => waterdata/api.py} (63%) create mode 100644 dataretrieval/waterdata/types.py rename dataretrieval/{waterdata_helpers.py => waterdata/utils.py} (57%) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c7d487f2..4563b449 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -36,7 +36,5 @@ jobs: flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest and report coverage run: | - cd tests - coverage run -m pytest + coverage run -m pytest tests/ coverage report -m - cd .. diff --git a/dataretrieval/samples.py b/dataretrieval/samples.py index c55c1a84..a6df85b3 100644 --- a/dataretrieval/samples.py +++ b/dataretrieval/samples.py @@ -11,18 +11,17 @@ import pandas as pd import warnings -from dataretrieval.utils import BaseMetadata, to_str -from dataretrieval.waterdata import get_samples +from dataretrieval.utils import BaseMetadata if TYPE_CHECKING: from typing import Optional, Tuple, Union - from dataretrieval.waterdata import _SERVICES, _PROFILES + from dataretrieval.waterdata import SERVICES, PROFILES from pandas import DataFrame def get_usgs_samples( ssl_check: bool = True, - service: _SERVICES = "results", - profile: _PROFILES = "fullphyschem", + service: SERVICES = "results", + profile: PROFILES = "fullphyschem", activityMediaName: Optional[Union[str, list[str]]] = None, activityStartDateLower: Optional[str] = None, activityStartDateUpper: Optional[str] = None, @@ -212,7 +211,8 @@ def get_usgs_samples( DeprecationWarning, stacklevel=2, ) - + + from dataretrieval.waterdata import get_samples result = get_samples( ssl_check=ssl_check, service=service, diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py new file mode 100644 index 00000000..7d87f79c --- /dev/null +++ b/dataretrieval/waterdata/__init__.py @@ -0,0 +1,43 @@ +""" +Water Data API module for accessing USGS water data services. + +This module provides functions for downloading data from the Water Data APIs, +including the USGS Aquarius Samples database. + +See https://api.waterdata.usgs.gov/ for API reference. +""" + +from __future__ import annotations + +# Public API exports +from .api import ( + get_codes, + get_daily, + get_field_measurements, + get_latest_continuous, + get_monitoring_locations, + get_samples, + get_time_series_metadata, + _check_profiles, +) +from .types import ( + CODE_SERVICES, + SERVICES, + PROFILES, + PROFILE_LOOKUP, +) + +__all__ = [ + "get_codes", + "get_daily", + "get_field_measurements", + "get_latest_continuous", + "get_monitoring_locations", + "get_samples", + "get_time_series_metadata", + "_check_profiles", + "CODE_SERVICES", + "SERVICES", + "PROFILES", + "PROFILE_LOOKUP", +] diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata/api.py similarity index 63% rename from dataretrieval/waterdata.py rename to dataretrieval/waterdata/api.py index 7c503b23..d1ae55f0 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata/api.py @@ -1,107 +1,62 @@ -"""Functions for downloading data from the Water Data APIs, including the USGS Aquarius Samples database. +"""Functions for downloading data from the Water Data APIs, including the USGS +Aquarius Samples database. 
See https://api.waterdata.usgs.gov/ for API reference. """ -from __future__ import annotations - import json +import logging from io import StringIO -from typing import TYPE_CHECKING, Literal, List, get_args +from typing import Optional, List, Tuple, Union, get_args import pandas as pd import requests from requests.models import PreparedRequest from dataretrieval.utils import BaseMetadata, to_str -from dataretrieval import waterdata_helpers - -if TYPE_CHECKING: - from typing import Optional, Tuple, Union - - from pandas import DataFrame - - -_BASE_URL = "https://api.waterdata.usgs.gov/" - -_SAMPLES_URL = _BASE_URL + "samples-data" - -_CODE_SERVICES = Literal[ - "characteristicgroup", - "characteristics", - "counties", - "countries", - "observedproperty", - "samplemedia", - "sitetype", - "states", -] - -_SERVICES = Literal["activities", "locations", "organizations", "projects", "results"] - -_PROFILES = Literal[ - "actgroup", - "actmetric", - "basicbio", - "basicphyschem", - "count", - "fullbio", - "fullphyschem", - "labsampleprep", - "narrow", - "organization", - "project", - "projectmonitoringlocationweight", - "resultdetectionquantitationlimit", - "sampact", - "site", -] - -_PROFILE_LOOKUP = { - "activities": ["sampact", "actmetric", "actgroup", "count"], - "locations": ["site", "count"], - "organizations": ["organization", "count"], - "projects": ["project", "projectmonitoringlocationweight"], - "results": [ - "fullphyschem", - "basicphyschem", - "fullbio", - "basicbio", - "narrow", - "resultdetectionquantitationlimit", - "labsampleprep", - "count", - ], -} +from dataretrieval.waterdata.types import ( + CODE_SERVICES, + PROFILE_LOOKUP, + PROFILES, + SERVICES, +) +from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data + +# Set up logger for this module +logger = logging.getLogger(__name__) + def get_daily( - monitoring_location_id: Optional[Union[str, List[str]]] = None, - parameter_code: Optional[Union[str, List[str]]] = None, - statistic_id: Optional[Union[str, List[str]]] = None, - properties: Optional[List[str]] = None, - time_series_id: Optional[Union[str, List[str]]] = None, - daily_id: Optional[Union[str, List[str]]] = None, - approval_status: Optional[Union[str, List[str]]] = None, - unit_of_measure: Optional[Union[str, List[str]]] = None, - qualifier: Optional[Union[str, List[str]]] = None, - value: Optional[Union[str, List[str]]] = None, - last_modified: Optional[str] = None, - skipGeometry: Optional[bool] = None, - time: Optional[Union[str, List[str]]] = None, - bbox: Optional[List[float]] = None, - limit: Optional[int] = None, - max_results: Optional[int] = None, - convertType: bool = True - ) -> pd.DataFrame: - """Daily data provide one data value to represent water conditions for the day. - Throughout much of the history of the USGS, the primary water data available was - daily data collected manually at the monitoring location once each day. With - improved availability of computer storage and automated transmission of data, the - daily data published today are generally a statistical summary or metric of the - continuous data collected each day, such as the daily mean, minimum, or maximum - value. Daily data are automatically calculated from the continuous data of the same - parameter code and are described by parameter code and a statistic code. These data - have also been referred to as “daily values” or “DV”. 
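As a concrete illustration of the parameter-code/statistic-code pairing described above (a usage sketch, not part of the docstring): daily mean streamflow pairs parameter code 00060 (discharge, cubic feet per second) with statistic code 00003 (mean).

```python
from dataretrieval import waterdata

# Daily mean discharge: parameter code 00060 summarized by
# statistic code 00003 (mean); availability varies by site.
df = waterdata.get_daily(
    monitoring_location_id="USGS-02238500",
    parameter_code="00060",
    statistic_id="00003",
    time="2023-01-01/2023-12-31",
)
```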
+ monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + properties: Optional[List[str]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + daily_id: Optional[Union[str, List[str]]] = None, + approval_status: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + qualifier: Optional[Union[str, List[str]]] = None, + value: Optional[Union[str, List[str]]] = None, + last_modified: Optional[str] = None, + skip_geometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convert_type: bool = True, +) -> pd.DataFrame: + """Daily data provide one data value to represent water conditions for the + day. + + Throughout much of the history of the USGS, the primary water data available + was daily data collected manually at the monitoring location once each day. + With improved availability of computer storage and automated transmission of + data, the daily data published today are generally a statistical summary or + metric of the continuous data collected each day, such as the daily mean, + minimum, or maximum value. Daily data are automatically calculated from the + continuous data of the same parameter code and are described by parameter + code and a statistic code. These data have also been referred to as “daily + values” or “DV”. Parameters ---------- @@ -131,25 +86,17 @@ def get_daily( A unique identifier representing a single time series. This corresponds to the id field in the time-series-metadata endpoint. daily_id : string or list of strings, optional - A universally unique identifier (UUID) representing a single - version of a record. It is not stable over time. Every time the - record is refreshed in our database (which may happen as part of - normal operations and does not imply any change to the data itself) - a new ID will be generated. To uniquely identify a single observation - over time, compare the time and time_series_id fields; each time series - will only have a single observation at a given time. + A universally unique identifier (UUID) representing a single version of + a record. It is not stable over time. Every time the record is refreshed + in our database (which may happen as part of normal operations and does + not imply any change to the data itself) a new ID will be generated. To + uniquely identify a single observation over time, compare the time and + time_series_id fields; each time series will only have a single + observation at a given time. approval_status : string or list of strings, optional - Some of the data that you have obtained from this U.S. Geological - Survey database may not have received Director's approval. Any such - data values are qualified as provisional and are subject to revision. - Provisional data are released on the condition that neither the USGS - nor the United States Government may be held liable for any damages - resulting from its use. This field reflects the approval status of - each record, and is either "Approved", meaining processing review has - been completed and the data is approved for publication, or - "Provisional" and subject to revision. For more information about - provisional data, go to - https://waterdata.usgs.gov/provisional-data-statement/. + Some of the data that you have obtained from this U.S. 
Geological Survey + database may not have received Director's approval. Any such data values + are qualified as provisional and are subject to revision. unit_of_measure : string or list of strings, optional A human-readable description of the units of measurement associated with an observation. @@ -166,44 +113,55 @@ def get_daily( anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots - at start or end). Examples: + at start or end). + Examples: - A date-time: "2018-02-12T23:20:50Z" - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z" - - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z" - - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours - Only features that have a last_modified that intersects the value of datetime are selected. - skipGeometry : boolean, optional - This option can be used to skip response geometries for each feature. The returning - object will be a data frame with no spatial information. + - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or + "../2018-03-18T12:31:12Z" + - Duration objects: "P1M" for data from the past month or "PT36H" + for the last 36 hours + Only features that have a last_modified that intersects the value of + datetime are selected. + skip_geometry : boolean, optional + This option can be used to skip response geometries for each feature. + The returning object will be a data frame with no spatial information. time : string, optional - The date an observation represents. You can query this field using date-times - or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. - Intervals may be bounded or half-bounded (double-dots at start or end). + The date an observation represents. You can query this field using + date-times or intervals, adhering to RFC 3339, or using ISO 8601 + duration objects. Intervals may be bounded or half-bounded (double-dots + at start or end). Only features that have a time that intersects the + value of datetime are selected. If a feature has multiple temporal + properties, it is the decision of the server whether only a single + temporal property is used to determine the extent or all relevant + temporal properties. Examples: - A date-time: "2018-02-12T23:20:50Z" - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z" - - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z" - - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours - Only features that have a time that intersects the value of datetime are selected. If - a feature has multiple temporal properties, it is the decision of the server whether - only a single temporal property is used to determine the extent or all relevant temporal properties. + - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or + "../2018-03-18T12:31:12Z" + - Duration objects: "P1M" for data from the past month or "PT36H" + for the last 36 hours bbox : list of numbers, optional - Only features that have a geometry that intersects the bounding box are selected. - The bounding box is provided as four or six numbers, depending on whether the - coordinate reference system includes a vertical axis (height or depth). Coordinates - are assumed to be in crs 4326. The expected format is a numeric vector structured: - c(xmin,ymin,xmax,ymax). 
Another way to think of it is c(Western-most longitude,
-        Southern-most latitude, Eastern-most longitude, Northern-most longitude).
+        Only features that have a geometry that intersects the bounding box are
+        selected. The bounding box is provided as four or six numbers,
+        depending on whether the coordinate reference system includes a vertical
+        axis (height or depth). Coordinates are assumed to be in crs 4326. The
+        expected format is a numeric vector structured: c(xmin,ymin,xmax,ymax).
+        Another way to think of it is c(Western-most longitude, Southern-most
+        latitude, Eastern-most longitude, Northern-most longitude).
     limit : numeric, optional
-        The optional limit parameter is used to control the subset of the selected features
-        that should be returned in each page. The maximum allowable limit is 10000. It may
-        be beneficial to set this number lower if your internet connection is spotty. The
-        default (NA) will set the limit to the maximum allowable limit for the service.
+        The optional limit parameter is used to control the subset of the
+        selected features that should be returned in each page. The maximum
+        allowable limit is 10000. It may be beneficial to set this number lower
+        if your internet connection is spotty. The default (None) will set the
+        limit to the maximum allowable limit for the service.
     max_results : numeric, optional
-        The optional maximum number of rows to return. This value must be less than the
-        requested limit.
-    convertType : boolean, optional
-        If True, the function will convert the data to dates and qualifier to string vector
+        The optional maximum number of rows to return. This value must be less
+        than the requested limit.
+    convert_type : boolean, optional
+        If True, the function will convert the data to dates and qualifier to
+        string vector
 
     Returns
     -------
@@ -217,9 +175,9 @@ def get_daily(
     >>> # Get daily flow data from a single site
     >>> # over a yearlong period
     >>> df = dataretrieval.waterdata.get_daily(
-    ...     monitoring_location_id = "USGS-02238500",
-    ...     parameter_code = "00060",
-    ...     time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z"
+    ...     monitoring_location_id="USGS-02238500",
+    ...     parameter_code="00060",
+    ...     time="2021-01-01T00:00:00Z/2022-01-01T00:00:00Z",
     ... )
 
     >>> # Get monitoring location info for specific sites
     >>> # filtering by approval status and time
     >>> df = dataretrieval.waterdata.get_daily(
     ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
     ...     approval_status = "Approved",
     ...     time = "2024-01-01/.."
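A possible follow-on once a frame like the one above is in hand, assuming the documented `time` and `value` columns are present and `convert_type` was left at its default: resample the daily values to monthly means with pandas.

```python
import pandas as pd

# df as returned by get_daily above; "time" is the observation date and
# "value" the daily statistic, per the parameter descriptions.
df["time"] = pd.to_datetime(df["time"])
monthly_means = df.set_index("time")["value"].resample("MS").mean()
print(monthly_means.head())
```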
- """ + """ service = "daily" output_id = "daily_id" # Build argument dictionary, omitting None values - args = { - k: v for k, v in locals().items() + args = { + k: v + for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - return waterdata_helpers.get_ogc_data(args, output_id, service) + return get_ogc_data(args, output_id, service) + def get_monitoring_locations( - monitoring_location_id: Optional[List[str]] = None, - agency_code: Optional[List[str]] = None, - agency_name: Optional[List[str]] = None, - monitoring_location_number: Optional[List[str]] = None, - monitoring_location_name: Optional[List[str]] = None, - district_code: Optional[List[str]] = None, - country_code: Optional[List[str]] = None, - country_name: Optional[List[str]] = None, - state_code: Optional[List[str]] = None, - state_name: Optional[List[str]] = None, - county_code: Optional[List[str]] = None, - county_name: Optional[List[str]] = None, - minor_civil_division_code: Optional[List[str]] = None, - site_type_code: Optional[List[str]] = None, - site_type: Optional[List[str]] = None, - hydrologic_unit_code: Optional[List[str]] = None, - basin_code: Optional[List[str]] = None, - altitude: Optional[List[str]] = None, - altitude_accuracy: Optional[List[str]] = None, - altitude_method_code: Optional[List[str]] = None, - altitude_method_name: Optional[List[str]] = None, - vertical_datum: Optional[List[str]] = None, - vertical_datum_name: Optional[List[str]] = None, - horizontal_positional_accuracy_code: Optional[List[str]] = None, - horizontal_positional_accuracy: Optional[List[str]] = None, - horizontal_position_method_code: Optional[List[str]] = None, - horizontal_position_method_name: Optional[List[str]] = None, - original_horizontal_datum: Optional[List[str]] = None, - original_horizontal_datum_name: Optional[List[str]] = None, - drainage_area: Optional[List[str]] = None, - contributing_drainage_area: Optional[List[str]] = None, - time_zone_abbreviation: Optional[List[str]] = None, - uses_daylight_savings: Optional[List[str]] = None, - construction_date: Optional[List[str]] = None, - aquifer_code: Optional[List[str]] = None, - national_aquifer_code: Optional[List[str]] = None, - aquifer_type_code: Optional[List[str]] = None, - well_constructed_depth: Optional[List[str]] = None, - hole_constructed_depth: Optional[List[str]] = None, - depth_source_code: Optional[List[str]] = None, - properties: Optional[List[str]] = None, - skipGeometry: Optional[bool] = None, - time: Optional[Union[str, List[str]]] = None, - bbox: Optional[List[float]] = None, - limit: Optional[int] = None, - max_results: Optional[int] = None, - convertType: bool = True - ) -> pd.DataFrame: + monitoring_location_id: Optional[List[str]] = None, + agency_code: Optional[List[str]] = None, + agency_name: Optional[List[str]] = None, + monitoring_location_number: Optional[List[str]] = None, + monitoring_location_name: Optional[List[str]] = None, + district_code: Optional[List[str]] = None, + country_code: Optional[List[str]] = None, + country_name: Optional[List[str]] = None, + state_code: Optional[List[str]] = None, + state_name: Optional[List[str]] = None, + county_code: Optional[List[str]] = None, + county_name: Optional[List[str]] = None, + minor_civil_division_code: Optional[List[str]] = None, + site_type_code: Optional[List[str]] = None, + site_type: Optional[List[str]] = None, + hydrologic_unit_code: Optional[List[str]] = None, + basin_code: Optional[List[str]] = None, + altitude: Optional[List[str]] = None, + 
altitude_accuracy: Optional[List[str]] = None, + altitude_method_code: Optional[List[str]] = None, + altitude_method_name: Optional[List[str]] = None, + vertical_datum: Optional[List[str]] = None, + vertical_datum_name: Optional[List[str]] = None, + horizontal_positional_accuracy_code: Optional[List[str]] = None, + horizontal_positional_accuracy: Optional[List[str]] = None, + horizontal_position_method_code: Optional[List[str]] = None, + horizontal_position_method_name: Optional[List[str]] = None, + original_horizontal_datum: Optional[List[str]] = None, + original_horizontal_datum_name: Optional[List[str]] = None, + drainage_area: Optional[List[str]] = None, + contributing_drainage_area: Optional[List[str]] = None, + time_zone_abbreviation: Optional[List[str]] = None, + uses_daylight_savings: Optional[List[str]] = None, + construction_date: Optional[List[str]] = None, + aquifer_code: Optional[List[str]] = None, + national_aquifer_code: Optional[List[str]] = None, + aquifer_type_code: Optional[List[str]] = None, + well_constructed_depth: Optional[List[str]] = None, + hole_constructed_depth: Optional[List[str]] = None, + depth_source_code: Optional[List[str]] = None, + properties: Optional[List[str]] = None, + skip_geometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convert_type: bool = True, +) -> pd.DataFrame: """Location information is basic information about the monitoring location including the name, identifier, agency responsible for data collection, and the date the location was established. It also includes information about the type of location, such as stream, lake, or groundwater, and geographic - information about the location, such as state, county, latitude and longitude, - and hydrologic unit code (HUC). + information about the location, such as state, county, latitude and + longitude, and hydrologic unit code (HUC). Parameters ---------- @@ -364,23 +324,25 @@ def get_monitoring_locations( hydrologic_unit_code : string or list of strings, optional The United States is divided and sub-divided into successively smaller hydrologic units which are classified into four levels: regions, - sub-regions, accounting units, and cataloging units. The hydrologic units - are arranged within each other, from the smallest (cataloging units) to the - largest (regions). Each hydrologic unit is identified by a unique hydrologic - unit code (HUC) consisting of two to eight digits based on the four levels - of classification in the hydrologic unit system. + sub-regions, accounting units, and cataloging units. The hydrologic + units are arranged within each other, from the smallest (cataloging + units) to the largest (regions). Each hydrologic unit is identified by a + unique hydrologic unit code (HUC) consisting of two to eight digits + based on the four levels of classification in the hydrologic unit + system. basin_code : string or list of strings, optional The Basin Code or "drainage basin code" is a two-digit code that further subdivides the 8-digit hydrologic-unit code. The drainage basin code is - defined by the USGS State Office where the monitoring location is located. + defined by the USGS State Office where the monitoring location is + located. altitude : string or list of strings, optional Altitude of the monitoring location referenced to the specified Vertical Datum. 
altitude_accuracy : string or list of strings, optional Accuracy of the altitude, in feet. An accuracy of +/- 0.1 foot would be entered as “.1”. Many altitudes are interpolated from the contours on - topographic maps; accuracies determined in this way are generally entered - as one-half of the contour interval. + topographic maps; accuracies determined in this way are generally + entered as one-half of the contour interval. altitude_method_code : string or list of strings, optional Codes representing the method used to measure altitude. A [list of codes](https://help.waterdata.usgs.gov/code/alt_meth_cd_query?fmt=html) is available. @@ -426,12 +388,13 @@ def get_monitoring_locations( point. contributing_drainage_area : string or list of strings, optional The contributing drainage area of a lake, stream, wetland, or estuary - monitoring location, in square miles. This item should be present only if - the contributing area is different from the total drainage area. This - situation can occur when part of the drainage area consists of very porous - soil or depressions that either allow all runoff to enter the groundwater - or traps the water in ponds so that rainfall does not contribute to runoff. - A transbasin diversion can also affect the total drainage area. + monitoring location, in square miles. This item should be present only + if the contributing area is different from the total drainage area. This + situation can occur when part of the drainage area consists of very + porous soil or depressions that either allow all runoff to enter the + groundwater or traps the water in ponds so that rainfall does not + contribute to runoff. A transbasin diversion can also affect the total + drainage area. time_zone_abbreviation : string or list of strings, optional A short code describing the time zone used by a monitoring location. uses_daylight_savings : string or list of strings, optional @@ -441,8 +404,9 @@ def get_monitoring_locations( aquifer_code : string or list of strings, optional Local aquifers in the USGS water resources data base are identified by a geohydrologic unit code (a three-digit number related to the age of the - formation, followed by a 4 or 5 character abbreviation for the geologic unit - or aquifer name). Additional information is available [at this link](https://help.waterdata.usgs.gov/faq/groundwater/local-aquifer-description). + formation, followed by a 4 or 5 character abbreviation for the geologic + unit or aquifer name). Additional information is available + [at this link](https://help.waterdata.usgs.gov/faq/groundwater/local-aquifer-description). national_aquifer_code : string or list of strings, optional National aquifers are the principal aquifers or aquifer systems in the United States, defined as regionally extensive aquifers or aquifer systems that have @@ -472,36 +436,41 @@ def get_monitoring_locations( A code indicating the source of water-level data. A [list of codes](https://help.waterdata.usgs.gov/code/water_level_src_cd_query?fmt=html) is available. properties : string or list of strings, optional - A vector of requested columns to be returned from the query. 
Available options
-        are: geometry, id, agency_code, agency_name, monitoring_location_number,
-        monitoring_location_name, district_code, country_code, country_name, state_code,
-        state_name, county_code, county_name, minor_civil_division_code, site_type_code,
-        site_type, hydrologic_unit_code, basin_code, altitude, altitude_accuracy,
-        altitude_method_code, altitude_method_name, vertical_datum, vertical_datum_name,
-        horizontal_positional_accuracy_code, horizontal_positional_accuracy,
-        horizontal_position_method_code, horizontal_position_method_name,
-        original_horizontal_datum, original_horizontal_datum_name, drainage_area,
-        contributing_drainage_area, time_zone_abbreviation, uses_daylight_savings,
-        construction_date, aquifer_code, national_aquifer_code, aquifer_type_code,
-        well_constructed_depth, hole_constructed_depth, depth_source_code.
+        A vector of requested columns to be returned from the query. Available
+        options are: geometry, id, agency_code, agency_name,
+        monitoring_location_number, monitoring_location_name, district_code,
+        country_code, country_name, state_code, state_name, county_code,
+        county_name, minor_civil_division_code, site_type_code, site_type,
+        hydrologic_unit_code, basin_code, altitude, altitude_accuracy,
+        altitude_method_code, altitude_method_name, vertical_datum,
+        vertical_datum_name, horizontal_positional_accuracy_code,
+        horizontal_positional_accuracy, horizontal_position_method_code,
+        horizontal_position_method_name, original_horizontal_datum,
+        original_horizontal_datum_name, drainage_area,
+        contributing_drainage_area, time_zone_abbreviation,
+        uses_daylight_savings, construction_date, aquifer_code,
+        national_aquifer_code, aquifer_type_code, well_constructed_depth,
+        hole_constructed_depth, depth_source_code.
     bbox : list of numbers, optional
-        Only features that have a geometry that intersects the bounding box are selected.
-        The bounding box is provided as four or six numbers, depending on whether the
-        coordinate reference system includes a vertical axis (height or depth). Coordinates
-        are assumed to be in crs 4326. The expected format is a numeric vector structured:
-        c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
-        Southern-most latitude, Eastern-most longitude, Northern-most longitude).
+        Only features that have a geometry that intersects the bounding box are
+        selected. The bounding box is provided as four or six numbers,
+        depending on whether the coordinate reference system includes a vertical
+        axis (height or depth). Coordinates are assumed to be in crs 4326. The
+        expected format is a numeric vector structured: c(xmin,ymin,xmax,ymax).
+        Another way to think of it is c(Western-most longitude, Southern-most
+        latitude, Eastern-most longitude, Northern-most longitude).
     limit : numeric, optional
-        The optional limit parameter is used to control the subset of the selected features
-        that should be returned in each page. The maximum allowable limit is 10000. It may
-        be beneficial to set this number lower if your internet connection is spotty. The
-        default (NA) will set the limit to the maximum allowable limit for the service.
+        The optional limit parameter is used to control the subset of the
+        selected features that should be returned in each page. The maximum
+        allowable limit is 10000. It may be beneficial to set this number lower
+        if your internet connection is spotty. The default (None) will set the
+        limit to the maximum allowable limit for the service.
max_results : numeric, optional - The optional maximum number of rows to return. This value must be less than the - requested limit. - skipGeometry : boolean, optional - This option can be used to skip response geometries for each feature. The returning - object will be a data frame with no spatial information. + The optional maximum number of rows to return. This value must be less + than the requested limit. + skip_geometry : boolean, optional + This option can be used to skip response geometries for each feature. + The returning object will be a data frame with no spatial information. Returns ------- @@ -515,54 +484,54 @@ def get_monitoring_locations( >>> # Get monitoring locations within a bounding box >>> # and leave out geometry >>> df = dataretrieval.waterdata.get_monitoring_locations( - ... bbox=[-90.2,42.6,-88.7,43.2], - ... skipGeometry=True + ... bbox=[-90.2, 42.6, -88.7, 43.2], skip_geometry=True ... ) >>> # Get monitoring location info for specific sites >>> # and only specific properties >>> df = dataretrieval.waterdata.get_monitoring_locations( - ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], - ... properties = ["monitoring_location_id", - ... "state_name", - ... "country_name"]) - """ + ... monitoring_location_id=["USGS-05114000", "USGS-09423350"], + ... properties=["monitoring_location_id", "state_name", "country_name"], + ... ) + """ service = "monitoring-locations" output_id = "monitoring_location_id" # Build argument dictionary, omitting None values - args = { - k: v for k, v in locals().items() + args = { + k: v + for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - return waterdata_helpers.get_ogc_data(args, output_id, service) + return get_ogc_data(args, output_id, service) + def get_time_series_metadata( - monitoring_location_id: Optional[Union[str, List[str]]] = None, - parameter_code: Optional[Union[str, List[str]]] = None, - parameter_name: Optional[Union[str, List[str]]] = None, - properties: Optional[Union[str, List[str]]] = None, - statistic_id: Optional[Union[str, List[str]]] = None, - last_modified: Optional[Union[str, List[str]]] = None, - begin: Optional[Union[str, List[str]]] = None, - end: Optional[Union[str, List[str]]] = None, - unit_of_measure: Optional[Union[str, List[str]]] = None, - computation_period_identifier: Optional[Union[str, List[str]]] = None, - computation_identifier: Optional[Union[str, List[str]]] = None, - thresholds: Optional[int] = None, - sublocation_identifier: Optional[Union[str, List[str]]] = None, - primary: Optional[Union[str, List[str]]] = None, - parent_time_series_id: Optional[Union[str, List[str]]] = None, - time_series_id: Optional[Union[str, List[str]]] = None, - web_description: Optional[Union[str, List[str]]] = None, - skipGeometry: Optional[bool] = None, - time: Optional[Union[str, List[str]]] = None, - bbox: Optional[List[float]] = None, - limit: Optional[int] = None, - max_results: Optional[int] = None, - convertType: bool = True -) -> pd.DataFrame: + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + parameter_name: Optional[Union[str, List[str]]] = None, + properties: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + last_modified: Optional[Union[str, List[str]]] = None, + begin: Optional[Union[str, List[str]]] = None, + end: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + 
computation_period_identifier: Optional[Union[str, List[str]]] = None, + computation_identifier: Optional[Union[str, List[str]]] = None, + thresholds: Optional[int] = None, + sublocation_identifier: Optional[Union[str, List[str]]] = None, + primary: Optional[Union[str, List[str]]] = None, + parent_time_series_id: Optional[Union[str, List[str]]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + web_description: Optional[Union[str, List[str]]] = None, + skip_geometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convert_type: bool = True, +) -> Tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, which represent a collection of observations of a single parameter, potentially aggregated using a standard statistic, at a single monitoring @@ -602,30 +571,30 @@ def get_time_series_metadata( anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots - at start or end). Examples: + at start or end). Only features that have a last_modified that + intersects the value of datetime are selected. + Examples: - A date-time: "2018-02-12T23:20:50Z" - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z" - - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z" - - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours - Only features that have a last_modified that intersects the value of datetime are selected. + - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or + "../2018-03-18T12:31:12Z" + - Duration objects: "P1M" for data from the past month or "PT36H" + for the last 36 hours begin : string or list of strings, optional - The datetime of the earliest observation in the time series. Together with end, - this field represents the period of record of a time series. Note that some time - series may have large gaps in their collection record. This field is currently - in the local time of the monitoring location. We intend to update this in version - v0 to use UTC with a time zone. You can query this field using date-times or - intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals - may be bounded or half-bounded (double-dots at start or end). Examples: - + The datetime of the earliest observation in the time series. Together + with end, this field represents the period of record of a time series. + Note that some time series may have large gaps in their collection + record. This field is currently in the local time of the monitoring + location. We intend to update this in version v0 to use UTC with a time + zone. You can query this field using date-times or intervals, adhering + to RFC 3339, or using ISO 8601 duration objects. Intervals may be + bounded or half-bounded (double-dots at start or end). Only features + that have a begin that intersects the value of datetime are selected. + Examples: - A date-time: "2018-02-12T23:20:50Z" - - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z" - - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z" - - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours - - Only features that have a begin that intersects the value of datetime are selected. 
end : string or list of strings, optional
         The datetime of the most recent observation in the time series. Data
         returned by this endpoint updates at most once per day, and potentially less frequently than
         that, so this field may not reflect the most recent observation available
         from other endpoints. Note that this field is not equivalent to "today" and cannot be used to
         determine whether a time series is "active". We intend to update this in version
         v0 to use UTC with a time zone. You can query this field using date-times or
         intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals
-        may be bounded or half-bounded (double-dots at start or end). Examples:
-
+        may be bounded or half-bounded (double-dots at start or end). Only
+        features that have an end that intersects the value of datetime are
+        selected.
+        Examples:
         - A date-time: "2018-02-12T23:20:50Z"
-
         - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
-
-        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
-
-        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
-
-        Only features that have a end that intersects the value of datetime are selected.
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or
+          "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for
+          the last 36 hours
     unit_of_measure : string or list of strings, optional
         A human-readable description of the units of measurement associated
         with an observation.
     computation_period_identifier : string or list of strings, optional
         Indicates the period of data used for any statistical computations.
     computation_identifier : string or list of strings, optional
-        Indicates whether the data from this time series represent a specific statistical
-        computation.
+        Indicates whether the data from this time series represent a specific
+        statistical computation.
     thresholds : numeric or list of numbers, optional
-        Thresholds represent known numeric limits for a time series, for example the
-        historic maximum value for a parameter or a level below which a sensor is
-        non-operative. These thresholds are sometimes used to automatically determine if
-        an observation is erroneous due to sensor error, and therefore shouldn't be included
-        in the time series.
+        Thresholds represent known numeric limits for a time series, for example
+        the historic maximum value for a parameter or a level below which a
+        sensor is non-operative. These thresholds are sometimes used to
+        automatically determine if an observation is erroneous due to sensor
+        error, and therefore shouldn't be included in the time series.
     sublocation_identifier : string or list of strings, optional
     primary : string or list of strings, optional
     parent_time_series_id : string or list of strings, optional
     time_series_id : string or list of strings, optional
         A unique identifier representing a single time series. This corresponds
         to the id field in the time-series-metadata endpoint.
     web_description : string or list of strings, optional
-        A description of what this time series represents, as used by WDFN and other USGS
-        data dissemination products.
-    skipGeometry : boolean, optional
-        This option can be used to skip response geometries for each feature. The returning
-        object will be a data frame with no spatial information.
+        A description of what this time series represents, as used by WDFN and
+        other USGS data dissemination products.
+    skip_geometry : boolean, optional
+        This option can be used to skip response geometries for each feature.
+        The returning object will be a data frame with no spatial information.
bbox : list of numbers, optional
-        Only features that have a geometry that intersects the bounding box are selected.
-        The bounding box is provided as four or six numbers, depending on whether the
-        coordinate reference system includes a vertical axis (height or depth). Coordinates
-        are assumed to be in crs 4326. The expected format is a numeric vector structured:
-        c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
-        Southern-most latitude, Eastern-most longitude, Northern-most longitude).
+        Only features that have a geometry that intersects the bounding box are
+        selected. The bounding box is provided as four or six numbers,
+        depending on whether the coordinate reference system includes a vertical
+        axis (height or depth). Coordinates are assumed to be in crs 4326. The
+        expected format is a list structured: [xmin, ymin, xmax, ymax].
+        Another way to think of it is [Western-most longitude, Southern-most
+        latitude, Eastern-most longitude, Northern-most latitude].
    limit : numeric, optional
-        The optional limit parameter is used to control the subset of the selected features
-        that should be returned in each page. The maximum allowable limit is 10000. It may
-        be beneficial to set this number lower if your internet connection is spotty. The
-        default (None) will set the limit to the maximum allowable limit for the service.
+        The optional limit parameter is used to control the subset of the
+        selected features that should be returned in each page. The maximum
+        allowable limit is 10000. It may be beneficial to set this number lower
+        if your internet connection is spotty. The default (None) will set the
+        limit to the maximum allowable limit for the service.
    max_results : numeric, optional
-        The optional maximum number of rows to return. This value must be less than the
-        requested limit.
-    convertType : boolean, optional
-        If True, the function will convert the data to dates and qualifier to string vector
+        The optional maximum number of rows to return. This value must be less
+        than the requested limit.
+    convert_type : boolean, optional
+        If True, the function will convert dates to datetimes and the qualifier
+        column to strings.

    Returns
    -------
@@ -702,9 +673,9 @@
    >>> # Get metadata for daily flow time series at a single site
    >>> # over a yearlong period
    >>> df = dataretrieval.waterdata.get_time_series_metadata(
-    ...     monitoring_location_id = "USGS-02238500",
-    ...     parameter_code = "00060",
-    ...     time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z"
+    ...     monitoring_location_id="USGS-02238500",
+    ...     parameter_code="00060",
+    ...     time="2021-01-01T00:00:00Z/2022-01-01T00:00:00Z",
    ...     )

    >>> # Get time series metadata for specific sites
    >>> df = dataretrieval.waterdata.get_time_series_metadata(
    ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
    ...     time = "2024-01-01/.."
    ...     )
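
    >>> # A hypothetical extra example (values are illustrative): use an
    >>> # ISO 8601 duration to find recently modified series, skipping
    >>> # geometries so a plain DataFrame is returned
    >>> df = dataretrieval.waterdata.get_time_series_metadata(
    ...     parameter_code="00060",
    ...     last_modified="P1M",
    ...     skip_geometry=True,
    ...     )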
-    """
+    """
    service = "time-series-metadata"
    output_id = "time_series_id"

    # Build argument dictionary, omitting None values
-    args = {
-        k: v for k, v in locals().items()
+    args = {
+        k: v
+        for k, v in locals().items()
        if k not in {"service", "output_id"} and v is not None
    }

-    return waterdata_helpers.get_ogc_data(args, output_id, service)
+    return get_ogc_data(args, output_id, service)
+

def get_latest_continuous(
-    monitoring_location_id: Optional[Union[str, List[str]]] = None,
-    parameter_code: Optional[Union[str, List[str]]] = None,
-    statistic_id: Optional[Union[str, List[str]]] = None,
-    properties: Optional[Union[str, List[str]]] = None,
-    time_series_id: Optional[Union[str, List[str]]] = None,
-    latest_continuous_id: Optional[Union[str, List[str]]] = None,
-    approval_status: Optional[Union[str, List[str]]] = None,
-    unit_of_measure: Optional[Union[str, List[str]]] = None,
-    qualifier: Optional[Union[str, List[str]]] = None,
-    value: Optional[int] = None,
-    last_modified: Optional[Union[str, List[str]]] = None,
-    skipGeometry: Optional[bool] = None,
-    time: Optional[Union[str, List[str]]] = None,
-    bbox: Optional[List[float]] = None,
-    limit: Optional[int] = None,
-    max_results: Optional[int] = None,
-    convertType: bool = True
-    ) -> pd.DataFrame:
+    monitoring_location_id: Optional[Union[str, List[str]]] = None,
+    parameter_code: Optional[Union[str, List[str]]] = None,
+    statistic_id: Optional[Union[str, List[str]]] = None,
+    properties: Optional[Union[str, List[str]]] = None,
+    time_series_id: Optional[Union[str, List[str]]] = None,
+    latest_continuous_id: Optional[Union[str, List[str]]] = None,
+    approval_status: Optional[Union[str, List[str]]] = None,
+    unit_of_measure: Optional[Union[str, List[str]]] = None,
+    qualifier: Optional[Union[str, List[str]]] = None,
+    value: Optional[int] = None,
+    last_modified: Optional[Union[str, List[str]]] = None,
+    skip_geometry: Optional[bool] = None,
+    time: Optional[Union[str, List[str]]] = None,
+    bbox: Optional[List[float]] = None,
+    limit: Optional[int] = None,
+    max_results: Optional[int] = None,
+    convert_type: bool = True,
+) -> Tuple[pd.DataFrame, BaseMetadata]:
    """This endpoint provides the most recent observation for each time
    series of continuous data. Continuous data are collected via automated
    sensors installed at a monitoring location. They are collected at a high
    frequency
@@ -759,14 +732,14 @@
    monitoring_location_id : string or list of strings, optional
        A unique identifier representing a single monitoring location. This
        corresponds to the id field in the monitoring-locations endpoint.
-        Monitoring location IDs are created by combining the agency code of
-        the agency responsible for the monitoring location (e.g. USGS) with
-        the ID number of the monitoring location (e.g. 02238500), separated
-        by a hyphen (e.g. USGS-02238500).
+        Monitoring location IDs are created by combining the agency code of the
+        agency responsible for the monitoring location (e.g. USGS) with the ID
+        number of the monitoring location (e.g. 02238500), separated by a hyphen
+        (e.g. USGS-02238500).
    parameter_code : string or list of strings, optional
        Parameter codes are 5-digit codes used to identify the constituent
-        measured and the units of measure. A complete list of parameter
-        codes and associated groupings can be found at
+        measured and the units of measure. A complete list of parameter codes
+        and associated groupings can be found at
        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
statistic_id : string or list of strings, optional A code corresponding to the statistic an observation represents. @@ -774,33 +747,25 @@ def get_latest_continuous( A complete list of codes and their descriptions can be found at https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. properties : string or list of strings, optional - A vector of requested columns to be returned from the query. - Available options are: geometry, id, time_series_id, - monitoring_location_id, parameter_code, statistic_id, time, value, - unit_of_measure, approval_status, qualifier, last_modified + A vector of requested columns to be returned from the query. Available + options are: geometry, id, time_series_id, monitoring_location_id, + parameter_code, statistic_id, time, value, unit_of_measure, + approval_status, qualifier, last_modified time_series_id : string or list of strings, optional A unique identifier representing a single time series. This corresponds to the id field in the time-series-metadata endpoint. latest_continuous_id : string or list of strings, optional - A universally unique identifier (UUID) representing a single - version of a record. It is not stable over time. Every time the - record is refreshed in our database (which may happen as part of - normal operations and does not imply any change to the data itself) - a new ID will be generated. To uniquely identify a single observation - over time, compare the time and time_series_id fields; each time series - will only have a single observation at a given time. + A universally unique identifier (UUID) representing a single version of + a record. It is not stable over time. Every time the record is refreshed + in our database (which may happen as part of normal operations and does + not imply any change to the data itself) a new ID will be generated. To + uniquely identify a single observation over time, compare the time and + time_series_id fields; each time series will only have a single + observation at a given time. approval_status : string or list of strings, optional - Some of the data that you have obtained from this U.S. Geological - Survey database may not have received Director's approval. Any such - data values are qualified as provisional and are subject to revision. - Provisional data are released on the condition that neither the USGS - nor the United States Government may be held liable for any damages - resulting from its use. This field reflects the approval status of - each record, and is either "Approved", meaining processing review has - been completed and the data is approved for publication, or - "Provisional" and subject to revision. For more information about - provisional data, go to - https://waterdata.usgs.gov/provisional-data-statement/. + Some of the data that you have obtained from this U.S. Geological Survey + database may not have received Director's approval. Any such data values + are qualified as provisional and are subject to revision. unit_of_measure : string or list of strings, optional A human-readable description of the units of measurement associated with an observation. @@ -817,44 +782,54 @@ def get_latest_continuous( anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots - at start or end). Examples: + at start or end). Only features that have a last_modified that + intersects the value of datetime are selected. 
+        Examples:
        - A date-time: "2018-02-12T23:20:50Z"
        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
-        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
-        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
-        Only features that have a last_modified that intersects the value of datetime are selected.
-    skipGeometry : boolean, optional
-        This option can be used to skip response geometries for each feature. The returning
-        object will be a data frame with no spatial information.
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or
+          "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H"
+          for the last 36 hours
+    skip_geometry : boolean, optional
+        This option can be used to skip response geometries for each feature.
+        The returned object will be a data frame with no spatial information.
    time : string, optional
-        The date an observation represents. You can query this field using date-times
-        or intervals, adhering to RFC 3339, or using ISO 8601 duration objects.
-        Intervals may be bounded or half-bounded (double-dots at start or end).
+        The date an observation represents. You can query this field using
+        date-times or intervals, adhering to RFC 3339, or using ISO 8601
+        duration objects. Intervals may be bounded or half-bounded (double-dots
+        at start or end). Only features that have a time that intersects the
+        value of datetime are selected. If a feature has multiple temporal
+        properties, it is the decision of the server whether only a single
+        temporal property is used to determine the extent or all relevant
+        temporal properties.
        Examples:
        - A date-time: "2018-02-12T23:20:50Z"
        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
-        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
-        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
-        Only features that have a time that intersects the value of datetime are selected. If
-        a feature has multiple temporal properties, it is the decision of the server whether
-        only a single temporal property is used to determine the extent or all relevant temporal properties.
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or
+          "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H"
+          for the last 36 hours
    bbox : list of numbers, optional
-        Only features that have a geometry that intersects the bounding box are selected.
-        The bounding box is provided as four or six numbers, depending on whether the
-        coordinate reference system includes a vertical axis (height or depth). Coordinates
-        are assumed to be in crs 4326. The expected format is a numeric vector structured:
-        c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
-        Southern-most latitude, Eastern-most longitude, Northern-most longitude).
+        Only features that have a geometry that intersects the bounding box are
+        selected. The bounding box is provided as four or six numbers,
+        depending on whether the coordinate reference system includes a vertical
+        axis (height or depth). Coordinates are assumed to be in crs 4326. The
+        expected format is a list structured: [xmin, ymin, xmax, ymax].
+        Another way to think of it is [Western-most longitude, Southern-most
+        latitude, Eastern-most longitude, Northern-most latitude].
limit : numeric, optional
-        The optional limit parameter is used to control the subset of the selected features
-        that should be returned in each page. The maximum allowable limit is 10000. It may
-        be beneficial to set this number lower if your internet connection is spotty. The
-        default (None) will set the limit to the maximum allowable limit for the service.
+        The optional limit parameter is used to control the subset of the
+        selected features that should be returned in each page. The maximum
+        allowable limit is 10000. It may be beneficial to set this number lower
+        if your internet connection is spotty. The default (None) will set the
+        limit to the maximum allowable limit for the service.
    max_results : numeric, optional
-        The optional maximum number of rows to return. This value must be less than the
-        requested limit.
-    convertType : boolean, optional
-        If True, the function will convert the data to dates and qualifier to string vector
+        The optional maximum number of rows to return. This value must be less
+        than the requested limit.
+    convert_type : boolean, optional
+        If True, the function will convert dates to datetimes and the qualifier
+        column to strings.

    Returns
    -------
@@ -868,93 +843,85 @@
    >>> # Get the latest continuous flow value
    >>> # from a single site
    >>> df = dataretrieval.waterdata.get_latest_continuous(
-    ...     monitoring_location_id = "USGS-02238500",
-    ...     parameter_code = "00060"
+    ...     monitoring_location_id="USGS-02238500", parameter_code="00060"
    ...     )

    >>> # Get the latest continuous values
    >>> # for specific sites
    >>> df = dataretrieval.waterdata.get_latest_continuous(
-    ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"]
+    ...     monitoring_location_id=["USGS-05114000", "USGS-09423350"]
    ...     )
    """
    service = "latest-continuous"
    output_id = "latest_continuous_id"

    # Build argument dictionary, omitting None values
-    args = {
-        k: v for k, v in locals().items()
+    args = {
+        k: v
+        for k, v in locals().items()
        if k not in {"service", "output_id"} and v is not None
    }

-    return waterdata_helpers.get_ogc_data(args, output_id, service)
+    return get_ogc_data(args, output_id, service)
+

def get_field_measurements(
-    monitoring_location_id: Optional[Union[str, List[str]]] = None,
-    parameter_code: Optional[Union[str, List[str]]] = None,
-    observing_procedure_code: Optional[Union[str, List[str]]] = None,
-    properties: Optional[List[str]] = None,
-    field_visit_id: Optional[Union[str, List[str]]] = None,
-    approval_status: Optional[Union[str, List[str]]] = None,
-    unit_of_measure: Optional[Union[str, List[str]]] = None,
-    qualifier: Optional[Union[str, List[str]]] = None,
-    value: Optional[Union[str, List[str]]] = None,
-    last_modified: Optional[Union[str, List[str]]] = None,
-    observing_procedure: Optional[Union[str, List[str]]] = None,
-    vertical_datum: Optional[Union[str, List[str]]] = None,
-    measuring_agency: Optional[Union[str, List[str]]] = None,
-    skipGeometry: Optional[bool] = None,
-    time: Optional[Union[str, List[str]]] = None,
-    bbox: Optional[List[float]] = None,
-    limit: Optional[int] = None,
-    max_results: Optional[int] = None,
-    convertType: bool = True
-    ) -> pd.DataFrame:
-    """Field measurements are physically measured values collected during
-    a visit to the monitoring location. Field measurements consist of
-    measurements of gage height and discharge, and readings of groundwater
-    levels, and are primarily used as calibration readings for the automated
-    sensors collecting continuous data. They are collected at a low frequency,
-    and delivery of the data in WDFN may be delayed due to data processing
-    time.
+    monitoring_location_id: Optional[Union[str, List[str]]] = None,
+    parameter_code: Optional[Union[str, List[str]]] = None,
+    observing_procedure_code: Optional[Union[str, List[str]]] = None,
+    properties: Optional[List[str]] = None,
+    field_visit_id: Optional[Union[str, List[str]]] = None,
+    approval_status: Optional[Union[str, List[str]]] = None,
+    unit_of_measure: Optional[Union[str, List[str]]] = None,
+    qualifier: Optional[Union[str, List[str]]] = None,
+    value: Optional[Union[str, List[str]]] = None,
+    last_modified: Optional[Union[str, List[str]]] = None,
+    observing_procedure: Optional[Union[str, List[str]]] = None,
+    vertical_datum: Optional[Union[str, List[str]]] = None,
+    measuring_agency: Optional[Union[str, List[str]]] = None,
+    skip_geometry: Optional[bool] = None,
+    time: Optional[Union[str, List[str]]] = None,
+    bbox: Optional[List[float]] = None,
+    limit: Optional[int] = None,
+    max_results: Optional[int] = None,
+    convert_type: bool = True,
+) -> Tuple[pd.DataFrame, BaseMetadata]:
+    """Field measurements are physically measured values collected during a
+    visit to the monitoring location. Field measurements consist of gage height
+    and discharge measurements and groundwater level readings, and are
+    primarily used as calibration readings for the automated sensors collecting
+    continuous data. They are collected at a low frequency, and delivery of the
+    data in WDFN may be delayed due to data processing time.

    Parameters
    ----------
    monitoring_location_id : string or list of strings, optional
        A unique identifier representing a single monitoring location. This
        corresponds to the id field in the monitoring-locations endpoint.
-        Monitoring location IDs are created by combining the agency code of
-        the agency responsible for the monitoring location (e.g. USGS) with
-        the ID number of the monitoring location (e.g. 02238500), separated
-        by a hyphen (e.g. USGS-02238500).
+        Monitoring location IDs are created by combining the agency code of the
+        agency responsible for the monitoring location (e.g. USGS) with the ID
+        number of the monitoring location (e.g. 02238500), separated by a hyphen
+        (e.g. USGS-02238500).
    parameter_code : string or list of strings, optional
        Parameter codes are 5-digit codes used to identify the constituent
-        measured and the units of measure. A complete list of parameter
-        codes and associated groupings can be found at
+        measured and the units of measure. A complete list of parameter codes
+        and associated groupings can be found at
        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
    observing_procedure_code : string or list of strings, optional
        A short code corresponding to the observing procedure for the field
        measurement.
    properties : string or list of strings, optional
-        A vector of requested columns to be returned from the query.
-        Available options are: geometry, id, time_series_id,
-        monitoring_location_id, parameter_code, statistic_id, time, value,
-        unit_of_measure, approval_status, qualifier, last_modified
+        A vector of requested columns to be returned from the query. Available
+        options are: geometry, id, time_series_id, monitoring_location_id,
+        parameter_code, statistic_id, time, value, unit_of_measure,
+        approval_status, qualifier, last_modified
    field_visit_id : string or list of strings, optional
        A universally unique identifier (UUID) for the field visit. Multiple
        measurements may be made during a single field visit.
approval_status : string or list of strings, optional
-        Some of the data that you have obtained from this U.S. Geological
-        Survey database may not have received Director's approval. Any such
-        data values are qualified as provisional and are subject to revision.
-        Provisional data are released on the condition that neither the USGS
-        nor the United States Government may be held liable for any damages
-        resulting from its use. This field reflects the approval status of
-        each record, and is either "Approved", meaining processing review has
-        been completed and the data is approved for publication, or
-        "Provisional" and subject to revision. For more information about
-        provisional data, go to
-        https://waterdata.usgs.gov/provisional-data-statement/.
+        Some of the data that you have obtained from this U.S. Geological Survey
+        database may not have received Director's approval. Any such data values
+        are qualified as provisional and are subject to revision.
    unit_of_measure : string or list of strings, optional
        A human-readable description of the units of measurement associated
        with an observation.
@@ -971,12 +938,13 @@
        anything about the measurement has changed. You can query this field
        using date-times or intervals, adhering to RFC 3339, or using ISO 8601
        duration objects. Intervals may be bounded or half-bounded (double-dots
-        at start or end). Examples:
+        at start or end). Only features that have a last_modified that
+        intersects the value of datetime are selected.
+        Examples:
        - A date-time: "2018-02-12T23:20:50Z"
        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
-        Only features that have a last_modified that intersects the value of datetime are selected.
    observing_procedure : string or list of strings, optional
        Water measurement or water-quality observing procedure descriptions.
    vertical_datum : string or list of strings, optional
        The datum used as a reference for any vertical measurements, like groundwater levels.
        A list of codes is available.
    measuring_agency : string or list of strings, optional
        The agency performing the measurement.
-    skipGeometry : boolean, optional
+    skip_geometry : boolean, optional
        This option can be used to skip response geometries for each feature.
        The returned object will be a data frame with no spatial information.
    time : string, optional
        The date an observation represents. You can query this field using
        date-times or intervals, adhering to RFC 3339, or using ISO 8601
        duration objects. Intervals may be bounded or half-bounded (double-dots
        at start or end).
+        Only features that have a time that intersects the value of datetime are
+        selected. If a feature has multiple temporal properties, it is the
+        decision of the server whether only a single temporal property is used
+        to determine the extent or all relevant temporal properties.
        Examples:
        - A date-time: "2018-02-12T23:20:50Z"
        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or
+          "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H"
+          for the last 36 hours
    bbox : list of numbers, optional
-        Only features that have a geometry that intersects the bounding box are selected.
-        The bounding box is provided as four or six numbers, depending on whether the
-        coordinate reference system includes a vertical axis (height or depth). Coordinates
-        are assumed to be in crs 4326. The expected format is a numeric vector structured:
-        c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
-        Southern-most latitude, Eastern-most longitude, Northern-most longitude).
+        Only features that have a geometry that intersects the bounding box are
+        selected. The bounding box is provided as four or six numbers,
+        depending on whether the coordinate reference system includes a vertical
+        axis (height or depth). Coordinates are assumed to be in crs 4326. The
+        expected format is a list structured: [xmin, ymin, xmax, ymax].
+        Another way to think of it is [Western-most longitude, Southern-most
+        latitude, Eastern-most longitude, Northern-most latitude].
    limit : numeric, optional
-        The optional limit parameter is used to control the subset of the selected features
-        that should be returned in each page. The maximum allowable limit is 10000. It may
-        be beneficial to set this number lower if your internet connection is spotty. The
-        default (None) will set the limit to the maximum allowable limit for the service.
+        The optional limit parameter is used to control the subset of the
+        selected features that should be returned in each page. The maximum
+        allowable limit is 10000. It may be beneficial to set this number lower
+        if your internet connection is spotty. The default (None) will set the
+        limit to the maximum allowable limit for the service.
    max_results : numeric, optional
-        The optional maximum number of rows to return. This value must be less than the
-        requested limit.
-    convertType : boolean, optional
-        If True, the function will convert the data to dates and qualifier to string vector
+        The optional maximum number of rows to return. This value must be less
+        than the requested limit.
+    convert_type : boolean, optional
+        If True, the function will convert dates to datetimes and the qualifier
+        column to strings.

    Returns
    -------
@@ -1029,9 +1003,9 @@
    >>> # Get groundwater level field measurements
    >>> # from a single site
    >>> df = dataretrieval.waterdata.get_field_measurements(
-    ...     monitoring_location_id = "USGS-375907091432201",
-    ...     parameter_code = "72019",
-    ...     skipGeometry = True
+    ...     monitoring_location_id="USGS-375907091432201",
+    ...     parameter_code="72019",
+    ...     skip_geometry=True,
    ...     )

    >>> # Get monitoring location info for specific sites
@@ -1047,16 +1021,18 @@
    output_id = "field_measurement_id"

    # Build argument dictionary, omitting None values
-    args = {
-        k: v for k, v in locals().items()
+    args = {
+        k: v
+        for k, v in locals().items()
        if k not in {"service", "output_id"} and v is not None
    }

-    return waterdata_helpers.get_ogc_data(args, output_id, service)
-
-def get_codes(code_service: _CODE_SERVICES) -> DataFrame:
+    return get_ogc_data(args, output_id, service)
+
+
+def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame:
    """Return codes from a Samples code service.
-
+
    Parameters
    ----------
    code_service : string
@@ -1064,30 +1040,31 @@
        One of the available code services: "characteristicgroup", "states", "counties", "countries",
        "sitetype", "samplemedia", "characteristics",
        or "observedproperty"
    """
-    valid_code_services = get_args(_CODE_SERVICES)
+    valid_code_services = get_args(CODE_SERVICES)

    if code_service not in valid_code_services:
        raise ValueError(
            f"Invalid code service: '{code_service}'. "
            f"Valid options are: {valid_code_services}."
        )

-    url = f"{_SAMPLES_URL}/codeservice/{code_service}?mimeType=application%2Fjson"
-
+    url = f"{SAMPLES_URL}/codeservice/{code_service}?mimeType=application%2Fjson"
+
    response = requests.get(url)
-
+
    response.raise_for_status()

    data_dict = json.loads(response.text)
-    data_list = data_dict['data']
+    data_list = data_dict["data"]

    df = pd.DataFrame(data_list)

    return df

+
def get_samples(
    ssl_check: bool = True,
-    service: _SERVICES = "results",
-    profile: _PROFILES = "fullphyschem",
+    service: SERVICES = "results",
+    profile: PROFILES = "fullphyschem",
    activityMediaName: Optional[Union[str, list[str]]] = None,
    activityStartDateLower: Optional[str] = None,
    activityStartDateUpper: Optional[str] = None,
@@ -1110,7 +1087,7 @@
    pointLocationWithinMiles: Optional[float] = None,
    projectIdentifier: Optional[Union[str, list[str]]] = None,
    recordIdentifierUserSupplied: Optional[Union[str, list[str]]] = None,
-) -> Tuple[DataFrame, BaseMetadata]:
+) -> Tuple[pd.DataFrame, BaseMetadata]:
    """Search Samples database for USGS water quality data.
    This is a wrapper function for the Samples database API. All
    potential filters are provided as arguments to the function, but please do not
@@ -1177,7 +1154,7 @@
        A user supplied characteristic name describing one or more results.
    boundingBox: list of four floats, optional
        Filters on the associated monitoring location's point location
-        by checking if it is located within the specified geographic area. 
+        by checking if it is located within the specified geographic area.
        The logic is inclusive, i.e. it will include locations that overlap
        with the edge of the bounding box. Values are separated by commas,
        expressed in decimal degrees, NAD83, and longitudes west of Greenwich
@@ -1186,7 +1163,7 @@
        - Western-most longitude
        - Southern-most latitude
        - Eastern-most longitude
-        - Northern-most longitude 
+        - Northern-most latitude
        Example: [-92.8,44.2,-88.9,46.0]
    countryFips : string or list of strings, optional
        Example: "US" (United States)
@@ -1209,7 +1186,7 @@
    usgsPCode : string or list of strings, optional
        5-digit number used in the US Geological Survey computerized
        data system, National Water Information System (NWIS), to
-        uniquely identify a specific constituent. Check the 
+        uniquely identify a specific constituent. Check the
        `characteristic_lookup()` function in this module for all
        possible inputs.
        Example: "00060" (Discharge, cubic feet per second)
@@ -1239,7 +1216,7 @@
    recordIdentifierUserSupplied : string or list of strings, optional
        Internal AQS record identifier that returns 1 entry.
        Only available for the "results" service.
-    
+
    Returns
    -------
    df : ``pandas.DataFrame``
@@ -1253,8 +1230,8 @@

    >>> # Get PFAS results within a bounding box
    >>> df, md = dataretrieval.waterdata.get_samples(
-    ...     boundingBox=[-90.2,42.6,-88.7,43.2],
-    ...     characteristicGroup="Organics, PFAS"
+    ...     boundingBox=[-90.2, 42.6, -88.7, 43.2],
+    ...     characteristicGroup="Organics, PFAS",
    ...
) >>> # Get all activities for the Commonwealth of Virginia over a date range @@ -1263,34 +1240,38 @@ def get_samples( ... profile="sampact", ... activityStartDateLower="2023-10-01", ... activityStartDateUpper="2024-01-01", - ... stateFips="US:51") + ... stateFips="US:51", + ... ) >>> # Get all pH samples for two sites in Utah >>> df, md = dataretrieval.waterdata.get_samples( - ... monitoringLocationIdentifier=['USGS-393147111462301', 'USGS-393343111454101'], - ... usgsPCode='00400') + ... monitoringLocationIdentifier=[ + ... "USGS-393147111462301", + ... "USGS-393343111454101", + ... ], + ... usgsPCode="00400", + ... ) """ _check_profiles(service, profile) params = { - k: v for k, v in locals().items() - if k not in ["ssl_check", "service", "profile"] - and v is not None - } - + k: v + for k, v in locals().items() + if k not in ["ssl_check", "service", "profile"] and v is not None + } params.update({"mimeType": "text/csv"}) if "boundingBox" in params: params["boundingBox"] = to_str(params["boundingBox"]) - url = f"{_SAMPLES_URL}/{service}/{profile}" + url = f"{SAMPLES_URL}/{service}/{profile}" req = PreparedRequest() req.prepare_url(url, params=params) - print(f"Request: {req.url}") + logger.info("Request: %s", req.url) response = requests.get(url, params=params, verify=ssl_check) @@ -1300,9 +1281,10 @@ def get_samples( return df, BaseMetadata(response) + def _check_profiles( - service: _SERVICES, - profile: _PROFILES, + service: SERVICES, + profile: PROFILES, ) -> None: """Check whether a service profile is valid. @@ -1313,19 +1295,17 @@ def _check_profiles( profile : string One of the profile names from "results_profiles", "locations_profiles", "activities_profiles", - "projects_profiles" or "organizations_profiles". + "projects_profiles" or "organizations_profiles". """ - valid_services = get_args(_SERVICES) + valid_services = get_args(SERVICES) if service not in valid_services: raise ValueError( - f"Invalid service: '{service}'. " - f"Valid options are: {valid_services}." + f"Invalid service: '{service}'. Valid options are: {valid_services}." ) - valid_profiles = _PROFILE_LOOKUP[service] + valid_profiles = PROFILE_LOOKUP[service] if profile not in valid_profiles: raise ValueError( f"Invalid profile: '{profile}' for service '{service}'. " f"Valid options are: {valid_profiles}." 
) - diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py new file mode 100644 index 00000000..07e000c0 --- /dev/null +++ b/dataretrieval/waterdata/types.py @@ -0,0 +1,56 @@ +from typing import Literal + + +CODE_SERVICES = Literal[ + "characteristicgroup", + "characteristics", + "counties", + "countries", + "observedproperty", + "samplemedia", + "sitetype", + "states", +] + +SERVICES = Literal[ + "activities", + "locations", + "organizations", + "projects", + "results", +] + +PROFILES = Literal[ + "actgroup", + "actmetric", + "basicbio", + "basicphyschem", + "count", + "fullbio", + "fullphyschem", + "labsampleprep", + "narrow", + "organization", + "project", + "projectmonitoringlocationweight", + "resultdetectionquantitationlimit", + "sampact", + "site", +] + +PROFILE_LOOKUP = { + "activities": ["sampact", "actmetric", "actgroup", "count"], + "locations": ["site", "count"], + "organizations": ["organization", "count"], + "projects": ["project", "projectmonitoringlocationweight"], + "results": [ + "fullphyschem", + "basicphyschem", + "fullbio", + "basicbio", + "narrow", + "resultdetectionquantitationlimit", + "labsampleprep", + "count", + ], +} diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata/utils.py similarity index 57% rename from dataretrieval/waterdata_helpers.py rename to dataretrieval/waterdata/utils.py index 6cfc233c..10857503 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata/utils.py @@ -1,54 +1,30 @@ import requests import os -from typing import List, Dict, Any, Optional, Union +import logging +from typing import List, Dict, Any, Optional, Union, Tuple from datetime import datetime import pandas as pd import json from zoneinfo import ZoneInfo import re -try: - import geopandas as gpd - geopd = True -except ImportError: - geopd = False - - - -BASE_API = "https://api.waterdata.usgs.gov/ogcapi/" -API_VERSION = "v0" - -# --- Caching for repeated calls --- -_cached_base_url = None -def _base_url(): - """ - Returns the base URL for the USGS Water Data APIs. - Uses a cached value to avoid repeated string formatting. If the cached value - is not set, it constructs the base URL using the BASE_API and API_VERSION constants. +from dataretrieval.utils import BaseMetadata - Returns: - str: The base URL for the API (e.g., "https://api.waterdata.usgs.gov/ogcapi/v0/"). - """ - global _cached_base_url - if _cached_base_url is None: - _cached_base_url = f"{BASE_API}{API_VERSION}/" - return _cached_base_url +try: + import geopandas as gpd -def _setup_api(service: str): - """ - Constructs and returns the API endpoint URL for a specified service. + GEOPANDAS = True +except ImportError: + GEOPANDAS = False - Args: - service (str): The name of the service to be used in the API endpoint. +# Set up logger for this module +logger = logging.getLogger(__name__) - Returns: - str: The full URL for the API endpoint corresponding to the given service. +BASE_URL = "https://api.waterdata.usgs.gov" +OGC_API_VERSION = "v0" +OGC_API_URL = f"{BASE_URL}/ogcapi/{OGC_API_VERSION}" +SAMPLES_URL = f"{BASE_URL}/samples-data" - Example: - >>> _setup_api("daily") - 'https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items' - """ - return f"{_base_url()}collections/{service}/items" def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str): """ @@ -59,16 +35,25 @@ def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str): with the value from either the service name or the expected id column name. 
If neither key exists, "id" will be set to None.

-    Example: for service "time-series-metadata", the function will look for either "time_series_metadata_id"
-    or "time_series_id" and change the key to simply "id".
+    Parameters
+    ----------
+    ls : Dict[str, Any]
+        The dictionary containing identifier keys to be standardized.
+    id_name : str
+        The name of the specific identifier key to look for.
+    service : str
+        The service name.

-    Args:
-        ls (Dict[str, Any]): The dictionary containing identifier keys to be standardized.
-        id_name (str): The name of the specific identifier key to look for.
-        service (str): The service name.
+    Returns
+    -------
+    Dict[str, Any]
+        The modified dictionary with the "id" key set appropriately.

-    Returns:
-        Dict[str, Any]: The modified dictionary with the "id" key set appropriately.
+    Examples
+    --------
+    For service "time-series-metadata", the function will look for either
+    "time_series_metadata_id" or "time_series_id" and change the key to simply
+    "id".
    """

    service_id = service.replace("-", "_") + "_id"
@@ -88,22 +73,33 @@

def _switch_properties_id(properties: Optional[List[str]], id_name: str, service: str):
    """
-    Switch properties id from its package-specific identifier to the standardized "id" key
-    that the API recognizes.
+    Switch properties id from its package-specific identifier to the
+    standardized "id" key that the API recognizes.

-    Sets the "id" key in the provided dictionary `ls` with the value from either the service name
-    or the expected id column name. If neither key exists, "id" will be set to None.
-
-    Example: for service "monitoring-locations", it will look for "monitoring_location_id" and change
-    it to "id".
+    Replaces the package-specific identifier in the provided `properties` list
+    with the standardized "id" field name, and drops fields that the API does
+    not accept.

+    Parameters
+    ----------
+    properties : Optional[List[str]]
+        A list containing the properties or column names to be pulled from the
+        service, or None.
+    id_name : str
+        The name of the specific identifier key to look for.
+    service : str
+        The service name.

-    Args:
-        properties (List[str]): A list containing the properties or column names to be pulled from the service.
-        id_name (str): The name of the specific identifier key to look for.
-        service (str): The service name.
+    Returns
+    -------
+    List[str]
+        The modified list with the "id" key set appropriately.

-    Returns:
-        List[str]: The modified list with the "id" key set appropriately.
+    Examples
+    --------
+    For service "monitoring-locations", it will look for
+    "monitoring_location_id" and change it to "id".
    """
    if not properties:
        return []
@@ -119,197 +115,276 @@
    # Remove unwanted fields
    return [p for p in properties if p not in ["geometry", service_id]]

-def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]:
+
+def _format_api_dates(
+    datetime_input: Union[str, List[str]], date: bool = False
+) -> Union[str, None]:
    """
-    Formats date or datetime input(s) for use with an API, handling single values or ranges, and converting to ISO 8601 or date-only formats as needed.
+    Formats date or datetime input(s) for use with an API.
+
+    Handles single values or ranges, converting to ISO 8601 or date-only
+    formats as needed.
+
    Parameters
    ----------
    datetime_input : Union[str, List[str]]
-        A single date/datetime string or a list of one or two date/datetime strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601, or relative periods (e.g., "P7D").
+        A single date/datetime string or a list of one or two date/datetime
+        strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601, or relative
+        periods (e.g., "P7D").
    date : bool, optional
-        If True, uses only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ").
+        If True, uses only the date portion ("YYYY-MM-DD"). If False (default),
+        returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ").
+
    Returns
    -------
    Union[str, None]
-        - If input is a single value, returns the formatted date/datetime string or None if parsing fails.
-        - If input is a list of two values, returns a date/datetime range string separated by "/" (e.g., "YYYY-MM-DD/YYYY-MM-DD" or "YYYY-MM-DDTHH:MM:SSZ/YYYY-MM-DDTHH:MM:SSZ").
+        - If input is a single value, returns the formatted date/datetime string
+          or None if parsing fails.
+        - If input is a list of two values, returns a date/datetime range string
+          separated by "/" (e.g., "YYYY-MM-DD/YYYY-MM-DD" or
+          "YYYY-MM-DDTHH:MM:SSZ/YYYY-MM-DDTHH:MM:SSZ").
        - Returns None if input is empty, all NA, or cannot be parsed.
+
    Raises
    ------
    ValueError
        If `datetime_input` contains more than two values.
+
    Notes
    -----
    - Handles blank or NA values by returning None.
-    - Supports relative period strings (e.g., "P7D") and passes them through unchanged.
-    - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when `date` is False.
+    - Supports relative period strings (e.g., "P7D") and passes them through
+      unchanged.
+    - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when
+      `date` is False.
    - For date ranges, replaces "nan" with ".." in the output.
    """
    # Get timezone
    local_timezone = datetime.now().astimezone().tzinfo
-    
+
    # Convert single string to list for uniform processing
    if isinstance(datetime_input, str):
        datetime_input = [datetime_input]
-    
+
    # Check for null or all NA and return None
    if all(pd.isna(dt) or dt == "" or dt is None for dt in datetime_input):
        return None

-    if len(datetime_input) <=2:
+    if len(datetime_input) <= 2:
        # If the list is of length 1, first look for things like "P7D" or dates
        # already formatted in ISO 8601. Otherwise, try to coerce to datetime
-        if len(datetime_input) == 1 and re.search(r"P", datetime_input[0], re.IGNORECASE) or "/" in datetime_input[0]:
+        if len(datetime_input) == 1 and (
+            re.search(r"P", datetime_input[0], re.IGNORECASE)
+            or "/" in datetime_input[0]
+        ):
            return datetime_input[0]
        # Otherwise, use list comprehension to parse dates
        else:
            try:
                # Parse to naive datetime
-                parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input]
+                parsed_dates = [
+                    datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input
+                ]
            except Exception:
                # Parse to date only
                try:
-                    parsed_dates = [datetime.strptime(dt, "%Y-%m-%d") for dt in datetime_input]
+                    parsed_dates = [
+                        datetime.strptime(dt, "%Y-%m-%d") for dt in datetime_input
+                    ]
                except Exception:
                    return None
-            # If the service only accepts dates for this input, not datetimes (e.g. "daily"),
-            # return just the dates separated by a "/", otherwise, return the datetime in UTC
-            # format.
+            # If the service only accepts dates for this input, not
+            # datetimes (e.g.
"daily"), return just the dates separated by a + # "/", otherwise, return the datetime in UTC format. if date: return "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) else: - parsed_locals = [dt.replace(tzinfo=local_timezone) for dt in parsed_dates] - formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_locals) + parsed_locals = [ + dt.replace(tzinfo=local_timezone) for dt in parsed_dates + ] + formatted = "/".join( + dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") + for dt in parsed_locals + ) return formatted else: raise ValueError("datetime_input should only include 1-2 values") -def _cql2_param(args): + +def _cql2_param(args: Dict[str, Any]) -> str: + """ + Convert query parameters to CQL2 JSON format for POST requests. + + Parameters + ---------- + args : Dict[str, Any] + Dictionary of query parameters to convert to CQL2 format. + + Returns + ------- + str + JSON string representation of the CQL2 query. + """ filters = [] for key, values in args.items(): - filters.append({ - "op": "in", - "args": [ - {"property": key}, - values - ] - }) - - query = { - "op": "and", - "args": filters - } + filters.append({"op": "in", "args": [{"property": key}, values]}) + + query = {"op": "and", "args": filters} return json.dumps(query, indent=4) + def _default_headers(): """ Generate default HTTP headers for API requests. - Returns: - dict: A dictionary containing default headers including 'Accept-Encoding', - 'Accept', 'User-Agent', and 'lang'. If the environment variable 'API_USGS_PAT' - is set, its value is included as the 'X-Api-Key' header. + Returns + ------- + dict + A dictionary containing default headers including 'Accept-Encoding', + 'Accept', 'User-Agent', and 'lang'. If the environment variable + 'API_USGS_PAT' is set, its value is included as the 'X-Api-Key' header. """ headers = { "Accept-Encoding": "compress, gzip", "Accept": "application/json", "User-Agent": "python-dataretrieval/1.0", - "lang": "en-US" + "lang": "en-US", } token = os.getenv("API_USGS_PAT") if token: headers["X-Api-Key"] = token return headers + def _check_ogc_requests(endpoint: str = "daily", req_type: str = "queryables"): """ - Sends an HTTP GET request to the specified OGC endpoint and request type, returning the JSON response. + Sends an HTTP GET request to the specified OGC endpoint and request type, + returning the JSON response. - Args: - endpoint (str): The OGC collection endpoint to query. Defaults to "daily". - req_type (str): The type of request to make. Must be either "queryables" or "schema". Defaults to "queryables". + Parameters + ---------- + endpoint : str, optional + The OGC collection endpoint to query (default is "daily"). + req_type : str, optional + The type of request to make. Must be either "queryables" or "schema" + (default is "queryables"). - Returns: - dict: The JSON response from the OGC endpoint. + Returns + ------- + dict + The JSON response from the OGC endpoint. - Raises: - AssertionError: If req_type is not "queryables" or "schema". - requests.HTTPError: If the HTTP request returns an unsuccessful status code. + Raises + ------ + AssertionError + If req_type is not "queryables" or "schema". + requests.HTTPError + If the HTTP request returns an unsuccessful status code. 
""" assert req_type in ["queryables", "schema"] - url = f"{_base_url()}collections/{endpoint}/{req_type}" + url = f"{OGC_API_URL}/collections/{endpoint}/{req_type}" resp = requests.get(url, headers=_default_headers()) resp.raise_for_status() return resp.json() + def _error_body(resp: requests.Response): """ Provide more informative error messages based on the response status. - Args: - resp (requests.Response): The HTTP response object to extract the error message from. + Parameters + ---------- + resp : requests.Response + The HTTP response object to extract the error message from. - Returns: - str: The extracted error message. For status code 429, returns the 'message' field from the JSON error object. - For status code 403, returns a predefined message indicating possible reasons for denial. - For other status codes, returns the raw response text. + Returns + ------- + str + The extracted error message. For status code 429, returns the 'message' + field from the JSON error object. For status code 403, returns a + predefined message indicating possible reasons for denial. For other + status codes, returns the raw response text. """ if resp.status_code == 429: - return resp.json().get('error', {}).get('message') + return resp.json().get("error", {}).get("message") elif resp.status_code == 403: return "Query request denied. Possible reasons include query exceeding server limits." return resp.text + def _construct_api_requests( service: str, properties: Optional[List[str]] = None, bbox: Optional[List[float]] = None, limit: Optional[int] = None, max_results: Optional[int] = None, - skipGeometry: bool = False, - **kwargs + skip_geometry: bool = False, + **kwargs, ): """ Constructs an HTTP request object for the specified water data API service. - Depending on the input parameters (whether there's lists of multiple argument values), - the function determines whether to use a GET or POST request, formats parameters - appropriately, and sets required headers. - - Args: - service (str): The name of the API service to query (e.g., "daily"). - properties (Optional[List[str]], optional): List of property names to include in the request. - bbox (Optional[List[float]], optional): Bounding box coordinates as a list of floats. - limit (Optional[int], optional): Maximum number of results to return per request. - max_results (Optional[int], optional): Maximum number of rows to return. - skipGeometry (bool, optional): Whether to exclude geometry from the response. - **kwargs: Additional query parameters, including date/time filters and other API-specific options. - Returns: - requests.PreparedRequest: The constructed HTTP request object ready to be sent. - Raises: - ValueError: If `limit` is greater than `max_results`. - Notes: - - Date/time parameters are automatically formatted to ISO8601. - - If multiple values are provided for non-single parameters, a POST request is constructed. - - The function sets appropriate headers for GET and POST requests. + + Depending on the input parameters (whether there's lists of multiple + argument values), the function determines whether to use a GET or POST + request, formats parameters appropriately, and sets required headers. + + Parameters + ---------- + service : str + The name of the API service to query (e.g., "daily"). + properties : Optional[List[str]], optional + List of property names to include in the request. + bbox : Optional[List[float]], optional + Bounding box coordinates as a list of floats. 
+ limit : Optional[int], optional + Maximum number of results to return per request. + max_results : Optional[int], optional + Maximum number of rows to return. + skip_geometry : bool, optional + Whether to exclude geometry from the response (default is False). + **kwargs + Additional query parameters, including date/time filters and other + API-specific options. + + Returns + ------- + requests.PreparedRequest + The constructed HTTP request object ready to be sent. + + Raises + ------ + ValueError + If `limit` is greater than `max_results`. + + Notes + ----- + - Date/time parameters are automatically formatted to ISO8601. + - If multiple values are provided for non-single parameters, a POST request + is constructed. + - The function sets appropriate headers for GET and POST requests. """ - baseURL = _setup_api(service) + service_url = f"{OGC_API_URL}/collections/{service}/items" # Single parameters can only have one value single_params = {"datetime", "last_modified", "begin", "end", "time"} - + # Identify which parameters should be included in the POST content body post_params = { - k: v for k, v in kwargs.items() - if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 - } - + k: v + for k, v in kwargs.items() + if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 + } + # Everything else goes into the params dictionary for the URL params = {k: v for k, v in kwargs.items() if k not in post_params} - # Set skipGeometry parameter - params["skipGeometry"] = skipGeometry + # Set skipGeometry parameter (API expects camelCase) + params["skipGeometry"] = skip_geometry # If limit is none and max_results is not none, then set limit to max results. Otherwise, # if max_results is none, set it to 10000 (the API max). - params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 + params["limit"] = ( + max_results if limit is None and max_results is not None else limit or 10000 + ) # Add max results as a parameter if it is not None if max_results is not None: params["max_results"] = max_results @@ -340,7 +415,7 @@ def _construct_api_requests( headers["Content-Type"] = "application/query-cql-json" request = requests.Request( method="POST", - url=baseURL, + url=service_url, headers=headers, data=_cql2_param(post_params), params=params, @@ -348,67 +423,83 @@ def _construct_api_requests( else: request = requests.Request( method="GET", - url=baseURL, + url=service_url, headers=headers, params=params, ) return request.prepare() + def _next_req_url(resp: requests.Response) -> Optional[str]: """ - Extracts the URL for the next page of results from an HTTP response from a water data endpoint. - - Parameters: - resp (requests.Response): The HTTP response object containing JSON data and headers. + Extracts the URL for the next page of results from an HTTP response from a + water data endpoint. - Returns: - Optional[str]: The URL for the next page of results if available, otherwise None. + Parameters + ---------- + resp : requests.Response + The HTTP response object containing JSON data and headers. - Side Effects: - If the environment variable "API_USGS_PAT" is set, prints the remaining requests for the current hour. - Prints the next URL if found. + Returns + ------- + Optional[str] + The URL for the next page of results if available, otherwise None. - Notes: - - Expects the response JSON to contain a "links" list with objects having "rel" and "href" keys. - - Checks for the "next" relation in the "links" to determine the next URL. 
+ Notes + ----- + - If the environment variable "API_USGS_PAT" is set, logs the remaining + requests for the current hour. + - Logs the next URL if found at debug level. + - Expects the response JSON to contain a "links" list with objects having + "rel" and "href" keys. + - Checks for the "next" relation in the "links" to determine the next URL. """ body = resp.json() if not body.get("numberReturned"): return None header_info = resp.headers if os.getenv("API_USGS_PAT", ""): - print("Remaining requests this hour:", header_info.get("x-ratelimit-remaining", "")) + logger.info( + "Remaining requests this hour: %s", + header_info.get("x-ratelimit-remaining", ""), + ) for link in body.get("links", []): if link.get("rel") == "next": next_url = link.get("href") - print(f"Next URL: {next_url}") + logger.debug("Next URL: %s", next_url) return next_url return None + def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame: """ Extracts and normalizes data from an HTTP response containing GeoJSON features. - Parameters: - resp (requests.Response): The HTTP response object expected to contain a JSON body with a "features" key. - geopd (bool): Indicates whether geopandas is installed and should be used to handle geometries. + Parameters + ---------- + resp : requests.Response + The HTTP response object expected to contain a JSON body with a "features" key. + geopd : bool + Indicates whether geopandas is installed and should be used to handle geometries. - Returns: - gpd.GeoDataFrame or pd.DataFrame: A geopandas GeoDataFrame if geometry is included, or a - pandas DataFrame containing the feature properties and each row's service-specific id. + Returns + ------- + gpd.GeoDataFrame or pd.DataFrame + A geopandas GeoDataFrame if geometry is included, or a pandas DataFrame + containing the feature properties and each row's service-specific id. Returns an empty pandas DataFrame if no features are returned. """ # Check if it's an empty response body = resp.json() if not body.get("numberReturned"): return pd.DataFrame() - + # If geopandas not installed, return a pandas dataframe if not geopd: - df = pd.json_normalize( - body["features"], - sep="_") - df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") + df = pd.json_normalize(body["features"], sep="_") + df = df.drop( + columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore" + ) df.columns = [col.replace("properties_", "") for col in df.columns] df.rename(columns={"geometry_coordinates": "geometry"}, inplace=True) return df @@ -425,25 +516,36 @@ def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame: return df -def _walk_pages(geopd: bool, req: requests.PreparedRequest, max_results: Optional[int], client: Optional[requests.Session] = None) -> pd.DataFrame: + +def _walk_pages( + geopd: bool, + req: requests.PreparedRequest, + max_results: Optional[int], + client: Optional[requests.Session] = None, +) -> Tuple[pd.DataFrame, requests.Response]: """ Iterates through paginated API responses and aggregates the results into a single DataFrame. Parameters ---------- geopd : bool - Indicates whether geopandas is installed and should be used for handling geometries. + Indicates whether geopandas is installed and should be used for handling + geometries. req : requests.PreparedRequest The initial HTTP request to send. max_results : Optional[int] - Maximum number of rows to return. If None or NaN, retrieves all available pages. + Maximum number of rows to return. 
If None or NaN, retrieves all
+        available pages.
     client : Optional[requests.Session], default None
-        An optional HTTP client to use for requests. If not provided, a new client is created.
+        An optional HTTP client to use for requests. If not provided, a new
+        client is created.
 
     Returns
     -------
     pd.DataFrame
         A DataFrame containing the aggregated results from all pages.
+    requests.Response
+        The initial response object containing metadata about the first request.
 
     Raises
     ------
@@ -452,13 +554,18 @@ def _walk_pages(geopd: bool, req: requests.PreparedRequest, max_results: Optiona
 
     Notes
     -----
-    - If `max_results` is None or NaN, the function will continue to request subsequent pages until no more pages are available.
-    - Failed requests are tracked and reported, but do not halt the entire process unless the initial request fails.
+    - If `max_results` is None or NaN, the function will continue to request
+      subsequent pages until no more pages are available.
+    - Failed requests are tracked and reported, but do not halt the entire
+      process unless the initial request fails.
     """
-    print(f"Requesting:\n{req.url}")
+    logger.info("Requesting: %s", req.url)
 
     if not geopd:
-        print("Geopandas is not installed. Data frames containing geometry will be returned as pandas DataFrames.")
+        logger.warning(
+            "Geopandas is not installed. "
+            "Geometries will be flattened into pandas DataFrames."
+        )
 
     # Get first response from client
     # using GET or POST call
@@ -469,6 +576,9 @@ def _walk_pages(geopd: bool, req: requests.PreparedRequest, max_results: Optiona
     if resp.status_code != 200:
         raise Exception(_error_body(resp))
 
+    # Store the initial response for metadata
+    initial_response = resp
+
     # Grab some aspects of the original request: headers and the
     # request type (GET or POST)
     method = req.method.upper()
@@ -481,7 +591,12 @@ def _walk_pages(geopd: bool, req: requests.PreparedRequest, max_results: Optiona
     failures = []
     while curr_url:
         try:
-            resp = client.request(method, curr_url, headers=headers, data=content if method == "POST" else None)
+            resp = client.request(
+                method,
+                curr_url,
+                headers=headers,
+                data=content if method == "POST" else None,
+            )
             if resp.status_code != 200:
                 raise Exception(_error_body(resp))
             df1 = _get_resp_data(resp, geopd=geopd)
@@ -491,16 +606,19 @@ def _walk_pages(geopd: bool, req: requests.PreparedRequest, max_results: Optiona
             failures.append(curr_url)
             curr_url = None
     if failures:
-        print(f"There were {len(failures)} failed requests.")
-        return dfs
+        logger.warning("There were %d failed requests.", len(failures))
+        return dfs, initial_response
     else:
         resp.raise_for_status()
-        return _get_resp_data(resp, geopd=geopd)
+        return _get_resp_data(resp, geopd=geopd), initial_response
     finally:
         if close_client:
             client.close()
 
 
-def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame:
+def _deal_with_empty(
+    return_list: pd.DataFrame, properties: Optional[List[str]], service: str
+) -> pd.DataFrame:
     """
     Handles empty DataFrame results by returning a DataFrame with appropriate columns.
 
     - If `properties` is not provided or contains only NaN values, retrieves the schema properties from the specified service.
     - Otherwise, uses the provided `properties` list as column names.
 
-    Args:
-        return_list (pd.DataFrame): The DataFrame to check for emptiness.
-        properties (Optional[List[str]]): List of property names to use as columns, or None.
- service (str): The service endpoint to query for schema properties if needed. + Parameters + ---------- + return_list : pd.DataFrame + The DataFrame to check for emptiness. + properties : Optional[List[str]] + List of property names to use as columns, or None. + service : str + The service endpoint to query for schema properties if needed. - Returns: - pd.DataFrame: The original DataFrame if not empty, otherwise an empty DataFrame with the appropriate columns. + Returns + ------- + pd.DataFrame + The original DataFrame if not empty, otherwise an empty DataFrame with the appropriate columns. """ if return_list.empty: if not properties or all(pd.isna(properties)): @@ -523,7 +647,10 @@ def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], return pd.DataFrame(columns=properties) return return_list -def _arrange_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: + +def _arrange_cols( + df: pd.DataFrame, properties: Optional[List[str]], output_id: str +) -> pd.DataFrame: """ Rearranges and renames columns in a DataFrame based on provided properties and service's output id. @@ -544,7 +671,7 @@ def _arrange_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: if properties and not all(pd.isna(properties)): if "id" not in properties: # If user refers to service-specific output id in properties, - # then rename the "id" column to the output_id (id column is + # then rename the "id" column to the output_id (id column is # automatically included). if output_id in properties: df = df.rename(columns={"id": output_id}) @@ -559,6 +686,7 @@ def _arrange_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: else: return df.rename(columns={"id": output_id}) + def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: """ Cleans and standardizes columns in a pandas DataFrame for water data endpoints. @@ -587,27 +715,37 @@ def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: df[col] = pd.to_numeric(df[col], errors="coerce") return df -def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: + +def get_ogc_data( + args: Dict[str, Any], output_id: str, service: str +) -> Tuple[pd.DataFrame, BaseMetadata]: """ - Retrieves OGC (Open Geospatial Consortium) data from a specified water data endpoint and returns it as a pandas DataFrame. + Retrieves OGC (Open Geospatial Consortium) data from a specified water data endpoint and returns it as a pandas DataFrame with metadata. This function prepares request arguments, constructs API requests, handles pagination, processes the results, and formats the output DataFrame according to the specified parameters. - Args: - args (Dict[str, Any]): Dictionary of request arguments for the OGC service. - output_id (str): The name of the output identifier to use in the request. - service (str): The OGC service type (e.g., "wfs", "wms"). + Parameters + ---------- + args : Dict[str, Any] + Dictionary of request arguments for the OGC service. + output_id : str + The name of the output identifier to use in the request. + service : str + The OGC service type (e.g., "wfs", "wms"). - Returns: - pd.DataFrame or gpd.GeoDataFrame: A DataFrame containing the retrieved and processed OGC data, - with metadata attributes including the request URL and query timestamp. + Returns + ------- + pd.DataFrame or gpd.GeoDataFrame + A DataFrame containing the retrieved and processed OGC data. 
+ BaseMetadata + A metadata object containing request information including URL and query time. - Notes: - - The function does not mutate the input `args` dictionary. - - Handles optional arguments such as `max_results` and `convertType`. - - Applies column cleanup and reordering based on service and properties. - - Metadata is attached to the DataFrame via the `.attrs` attribute. + Notes + ----- + - The function does not mutate the input `args` dictionary. + - Handles optional arguments such as `max_results` and `convert_type`. + - Applies column cleanup and reordering based on service and properties. """ args = args.copy() # Add service as an argument @@ -618,22 +756,26 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF args = _switch_arg_id(args, id_name=output_id, service=service) properties = args.get("properties") # Switch properties id to "id" if needed - args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) - convertType = args.pop("convertType", False) + args["properties"] = _switch_properties_id( + properties, id_name=output_id, service=service + ) + convert_type = args.pop("convert_type", False) # Create fresh dictionary of args without any None values args = {k: v for k, v in args.items() if v is not None} # Build API request req = _construct_api_requests(**args) # Run API request and iterate through pages if needed - return_list = _walk_pages(geopd=geopd, req=req, max_results=max_results) + return_list, response = _walk_pages( + geopd=GEOPANDAS, req=req, max_results=max_results + ) # Manage some aspects of the returned dataset return_list = _deal_with_empty(return_list, properties, service) - if convertType: + if convert_type: return_list = _cleanup_cols(return_list, service=service) return_list = _arrange_cols(return_list, properties, output_id) - # Add metadata - return_list.attrs.update(request=req.url, queryTime=pd.Timestamp.now()) - return return_list + # Create metadata object from response + metadata = BaseMetadata(response) + return return_list, metadata # def _get_description(service: str): diff --git a/tests/nldi_test.py b/tests/nldi_test.py index c4d6675f..9993a899 100644 --- a/tests/nldi_test.py +++ b/tests/nldi_test.py @@ -47,7 +47,7 @@ def test_get_basin(requests_mock): f"{NLDI_API_BASE_URL}/WQP/USGS-054279485/basin" f"?simplified=true&splitCatchment=false" ) - response_file_path = "data/nldi_get_basin.json" + response_file_path = "tests/data/nldi_get_basin.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -62,7 +62,7 @@ def test_get_flowlines(requests_mock): f"{NLDI_API_BASE_URL}/WQP/USGS-054279485/navigation/UM/flowlines" f"?distance=5&trimStart=false" ) - response_file_path = "data/nldi_get_flowlines.json" + response_file_path = "tests/data/nldi_get_flowlines.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -78,7 +78,7 @@ def test_get_flowlines_by_comid(requests_mock): request_url = ( f"{NLDI_API_BASE_URL}/comid/13294314/navigation/UM/flowlines?distance=50" ) - response_file_path = "data/nldi_get_flowlines_by_comid.json" + response_file_path = "tests/data/nldi_get_flowlines_by_comid.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -94,7 +94,7 @@ def test_features_by_feature_source_with_navigation(requests_mock): request_url = ( f"{NLDI_API_BASE_URL}/WQP/USGS-054279485/navigation/UM/nwissite?distance=50" ) - 
response_file_path = "data/nldi_get_features_by_feature_source_with_nav_mode.json" + response_file_path = "tests/data/nldi_get_features_by_feature_source_with_nav_mode.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -115,7 +115,7 @@ def test_features_by_feature_source_without_navigation(requests_mock): """ request_url = f"{NLDI_API_BASE_URL}/WQP/USGS-054279485" response_file_path = ( - "data/nldi_get_features_by_feature_source_without_nav_mode.json" + "tests/data/nldi_get_features_by_feature_source_without_nav_mode.json" ) mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -128,7 +128,7 @@ def test_features_by_feature_source_without_navigation(requests_mock): def test_get_features_by_comid(requests_mock): """Tests NLDI get features query using comid as the origin""" request_url = f"{NLDI_API_BASE_URL}/comid/13294314/navigation/UM/WQP?distance=5" - response_file_path = "data/nldi_get_features_by_comid.json" + response_file_path = "tests/data/nldi_get_features_by_comid.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -144,7 +144,7 @@ def test_get_features_by_lat_long(requests_mock): request_url = ( f"{NLDI_API_BASE_URL}/comid/position?coords=POINT%28-89.509%2043.087%29" ) - response_file_path = "data/nldi_get_features_by_lat_long.json" + response_file_path = "tests/data/nldi_get_features_by_lat_long.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -156,7 +156,7 @@ def test_get_features_by_lat_long(requests_mock): def test_search_for_basin(requests_mock): """Tests NLDI search query for basin""" request_url = f"{NLDI_API_BASE_URL}/WQP/USGS-054279485/basin" - response_file_path = "data/nldi_get_basin.json" + response_file_path = "tests/data/nldi_get_basin.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -172,7 +172,7 @@ def test_search_for_basin(requests_mock): def test_search_for_flowlines(requests_mock): """Tests NLDI search query for flowlines""" request_url = f"{NLDI_API_BASE_URL}/WQP/USGS-054279485/navigation/UM/flowlines" - response_file_path = "data/nldi_get_flowlines.json" + response_file_path = "tests/data/nldi_get_flowlines.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -191,7 +191,7 @@ def test_search_for_flowlines(requests_mock): def test_search_for_flowlines_by_comid(requests_mock): """Tests NLDI search query for flowlines by comid""" request_url = f"{NLDI_API_BASE_URL}/comid/13294314/navigation/UM/flowlines" - response_file_path = "data/nldi_get_flowlines_by_comid.json" + response_file_path = "tests/data/nldi_get_flowlines_by_comid.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -207,7 +207,7 @@ def test_search_for_features_by_feature_source_with_navigation(requests_mock): request_url = ( f"{NLDI_API_BASE_URL}/WQP/USGS-054279485/navigation/UM/nwissite?distance=50" ) - response_file_path = "data/nldi_get_features_by_feature_source_with_nav_mode.json" + response_file_path = "tests/data/nldi_get_features_by_feature_source_with_nav_mode.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -228,7 +228,7 @@ def test_search_for_features_by_feature_source_without_navigation(requests_mock) """Tests NLDI search 
query for features by feature source""" request_url = f"{NLDI_API_BASE_URL}/WQP/USGS-054279485" response_file_path = ( - "data/nldi_get_features_by_feature_source_without_nav_mode.json" + "tests/data/nldi_get_features_by_feature_source_without_nav_mode.json" ) mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -245,7 +245,7 @@ def test_search_for_features_by_feature_source_without_navigation(requests_mock) def test_search_for_features_by_comid(requests_mock): """Tests NLDI search query for features by comid""" request_url = f"{NLDI_API_BASE_URL}/comid/13294314/navigation/UM/WQP?distance=5" - response_file_path = "data/nldi_get_features_by_comid.json" + response_file_path = "tests/data/nldi_get_features_by_comid.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) @@ -267,7 +267,7 @@ def test_search_for_features_by_lat_long(requests_mock): request_url = ( f"{NLDI_API_BASE_URL}/comid/position?coords=POINT%28-89.509%2043.087%29" ) - response_file_path = "data/nldi_get_features_by_lat_long.json" + response_file_path = "tests/data/nldi_get_features_by_lat_long.json" mock_request_data_sources(requests_mock) mock_request(requests_mock, request_url, response_file_path) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index d0e7a49e..0f46e231 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -11,8 +11,8 @@ get_latest_continuous, get_field_measurements, get_time_series_metadata, - _SERVICES, - _PROFILES + SERVICES, + PROFILES, ) def mock_request(requests_mock, request_url, file_path): @@ -29,7 +29,7 @@ def test_mock_get_samples(requests_mock): "activityMediaName=Water&activityStartDateLower=2020-01-01" "&activityStartDateUpper=2024-12-31&monitoringLocationIdentifier=USGS-05406500&mimeType=text%2Fcsv" ) - response_file_path = "data/samples_results.txt" + response_file_path = "tests/data/samples_results.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_samples( service="results", @@ -112,7 +112,7 @@ def test_samples_organizations(): assert df.size == 3 def test_get_daily(): - df = get_daily( + df, metadata = get_daily( monitoring_location_id="USGS-05427718", parameter_code="00060", time="2025-01-01/.." 
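The hunks above and below move every call over to the new two-value return. A minimal sketch of the pattern these updated tests exercise, assuming live access to the Water Data API and that the returned metadata object exposes `url` and `query_time`, exactly as the assertions below check:

```python
# Hedged sketch (illustrative, not part of the patch): unpack the
# (DataFrame, metadata) pair that get_daily now returns and touch the
# metadata attributes the updated tests assert on.
from dataretrieval.waterdata import get_daily

df, metadata = get_daily(
    monitoring_location_id="USGS-05427718",
    parameter_code="00060",
    time="2025-01-01/..",  # open-ended ISO 8601 interval
)
assert hasattr(metadata, "url")
assert hasattr(metadata, "query_time")
```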
@@ -123,10 +123,12 @@ def test_get_daily(): assert df.parameter_code.unique().tolist() == ["00060"] assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"] assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all() + assert hasattr(metadata, 'url') + assert hasattr(metadata, 'query_time') assert df["value"].dtype == "float64" def test_get_daily_properties(): - df = get_daily( + df, metadata = get_daily( monitoring_location_id="USGS-05427718", parameter_code="00060", time="2025-01-01/..", @@ -135,39 +137,49 @@ def test_get_daily_properties(): assert "daily_id" in df.columns assert "geometry" in df.columns assert df.shape[1] == 6 - assert (df["time"] >= datetime.date(2025, 1, 1)).all() + assert df.parameter_code.unique().tolist() == ["00060"] + assert hasattr(metadata, 'url') + assert hasattr(metadata, 'query_time') def test_get_daily_no_geometry(): - df = get_daily( + df, metadata = get_daily( monitoring_location_id="USGS-05427718", parameter_code="00060", time="2025-01-01/..", - skipGeometry=True + skip_geometry=True ) assert "geometry" not in df.columns assert df.shape[1] == 11 assert isinstance(df, DataFrame) + assert hasattr(metadata, 'url') + assert hasattr(metadata, 'query_time') def test_get_monitoring_locations(): - df = get_monitoring_locations( + df, metadata = get_monitoring_locations( state_name="Connecticut", site_type_code="GW" ) assert df.site_type_code.unique().tolist() == ["GW"] + assert hasattr(metadata, 'url') + assert hasattr(metadata, 'query_time') def test_get_monitoring_locations_hucs(): - df = get_monitoring_locations( + df, metadata = get_monitoring_locations( hydrologic_unit_code=["010802050102", "010802050103"] ) assert set(df.hydrologic_unit_code.unique().tolist()) == {"010802050102", "010802050103"} + assert hasattr(metadata, 'url') + assert hasattr(metadata, 'query_time') def test_get_latest_continuous(): - df = get_latest_continuous( + df, metadata = get_latest_continuous( monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"] ) assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] + assert hasattr(metadata, 'url') + assert hasattr(metadata, 'query_time') try: datetime.datetime.strptime(df['time'].iloc[0], "%Y-%m-%dT%H:%M:%S+00:00") out=True @@ -176,22 +188,26 @@ def test_get_latest_continuous(): assert out def test_get_field_measurements(): - df = get_field_measurements( + df, metadata = get_field_measurements( monitoring_location_id="USGS-05427718", unit_of_measure="ft^3/s", time="2025-01-01/2025-10-01", - skipGeometry=True + skip_geometry=True ) assert "field_measurement_id" in df.columns assert "geometry" not in df.columns assert df.unit_of_measure.unique().tolist() == ["ft^3/s"] + assert hasattr(metadata, 'url') + assert hasattr(metadata, 'query_time') def test_get_time_series_metadata(): - df = get_time_series_metadata( + df, metadata = get_time_series_metadata( bbox=[-89.840355,42.853411,-88.818626,43.422598], parameter_code=["00060", "00065", "72019"], - skipGeometry=True + skip_geometry=True ) assert set(df['parameter_name'].unique().tolist()) == {"Gage height", "Water level, depth LSD", "Discharge"} + assert hasattr(metadata, 'url') + assert hasattr(metadata, 'query_time') diff --git a/tests/waterservices_test.py b/tests/waterservices_test.py index 19cc30fb..449650aa 100755 --- a/tests/waterservices_test.py +++ b/tests/waterservices_test.py @@ -93,7 +93,7 @@ def test_get_dv(requests_mock): "https://waterservices.usgs.gov/nwis/dv?format={}" 
"&startDT=2020-02-14&endDT=2020-02-15&sites={}".format(format, site) ) - response_file_path = "data/waterservices_dv.txt" + response_file_path = "tests/data/waterservices_dv.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_dv( sites=["01491000", "01645000"], start="2020-02-14", end="2020-02-15" @@ -115,7 +115,7 @@ def test_get_dv_site_value_types(requests_mock, site_input_type_list): "https://waterservices.usgs.gov/nwis/dv?format={}" "&startDT=2020-02-14&endDT=2020-02-15&sites={}".format(_format, site) ) - response_file_path = "data/waterservices_dv.txt" + response_file_path = "tests/data/waterservices_dv.txt" mock_request(requests_mock, request_url, response_file_path) if site_input_type_list: sites = [site] @@ -136,7 +136,7 @@ def test_get_iv(requests_mock): "https://waterservices.usgs.gov/nwis/iv?format={}" "&startDT=2019-02-14&endDT=2020-02-15&sites={}".format(format, site) ) - response_file_path = "data/waterservices_iv.txt" + response_file_path = "tests/data/waterservices_iv.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_iv( sites=["01491000", "01645000"], start="2019-02-14", end="2020-02-15" @@ -158,7 +158,7 @@ def test_get_iv_site_value_types(requests_mock, site_input_type_list): "https://waterservices.usgs.gov/nwis/iv?format={}" "&startDT=2019-02-14&endDT=2020-02-15&sites={}".format(_format, site) ) - response_file_path = "data/waterservices_iv.txt" + response_file_path = "tests/data/waterservices_iv.txt" mock_request(requests_mock, request_url, response_file_path) if site_input_type_list: sites = [site] @@ -183,7 +183,7 @@ def test_get_info(requests_mock): request_url = "https://waterservices.usgs.gov/nwis/site?sites={}¶meterCd={}&siteOutput=Expanded&format={}".format( site, parameter_cd, format ) - response_file_path = "data/waterservices_site.txt" + response_file_path = "tests/data/waterservices_site.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_info(sites=["01491000", "01645000"], parameterCd="00618") if not isinstance(df, DataFrame): @@ -210,7 +210,7 @@ def test_get_gwlevels(requests_mock): "https://nwis.waterdata.usgs.gov/nwis/gwlevels?format={}&begin_date=1851-01-01" "&site_no={}".format(format, site) ) - response_file_path = "data/waterdata_gwlevels.txt" + response_file_path = "tests/data/waterdata_gwlevels.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_gwlevels(sites=site) if not isinstance(df, DataFrame): @@ -229,7 +229,7 @@ def test_get_gwlevels_site_value_types(requests_mock, site_input_type_list): "https://nwis.waterdata.usgs.gov/nwis/gwlevels?format={}&begin_date=1851-01-01" "&site_no={}".format(_format, site) ) - response_file_path = "data/waterdata_gwlevels.txt" + response_file_path = "tests/data/waterdata_gwlevels.txt" mock_request(requests_mock, request_url, response_file_path) if site_input_type_list: sites = [site] @@ -249,7 +249,7 @@ def test_get_discharge_peaks(requests_mock): "https://nwis.waterdata.usgs.gov/nwis/peaks?format={}&site_no={}" "&begin_date=2000-02-14&end_date=2020-02-15".format(format, site) ) - response_file_path = "data/waterservices_peaks.txt" + response_file_path = "tests/data/waterservices_peaks.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_discharge_peaks(sites=[site], start="2000-02-14", end="2020-02-15") if not isinstance(df, DataFrame): @@ -269,7 +269,7 @@ def test_get_discharge_peaks_sites_value_types(requests_mock, site_input_type_li 
"https://nwis.waterdata.usgs.gov/nwis/peaks?format={}&site_no={}" "&begin_date=2000-02-14&end_date=2020-02-15".format(_format, site) ) - response_file_path = "data/waterservices_peaks.txt" + response_file_path = "tests/data/waterservices_peaks.txt" mock_request(requests_mock, request_url, response_file_path) if site_input_type_list: sites = [site] @@ -292,7 +292,7 @@ def test_get_discharge_measurements(requests_mock): "https://nwis.waterdata.usgs.gov/nwis/measurements?site_no={}" "&begin_date=2000-02-14&end_date=2020-02-15&format={}".format(site, format) ) - response_file_path = "data/waterdata_measurements.txt" + response_file_path = "tests/data/waterdata_measurements.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_discharge_measurements( sites=[site], start="2000-02-14", end="2020-02-15" @@ -315,7 +315,7 @@ def test_get_discharge_measurements_sites_value_types( "https://nwis.waterdata.usgs.gov/nwis/measurements?site_no={}" "&begin_date=2000-02-14&end_date=2020-02-15&format={}".format(site, format) ) - response_file_path = "data/waterdata_measurements.txt" + response_file_path = "tests/data/waterdata_measurements.txt" mock_request(requests_mock, request_url, response_file_path) if site_input_type_list: sites = [site] @@ -334,7 +334,7 @@ def test_get_pmcodes(requests_mock): DataFrame""" format = "rdb" request_url = "https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?fmt=rdb&parm_nm_cd=%2500618%25" - response_file_path = "data/waterdata_pmcodes.txt" + response_file_path = "tests/data/waterdata_pmcodes.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_pmcodes(parameterCd="00618") if not isinstance(df, DataFrame): @@ -352,7 +352,7 @@ def test_get_pmcodes_parameterCd_value_types( parameterCd = "00618" request_url = "https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?fmt={}&parm_nm_cd=%25{}%25" request_url = request_url.format(_format, parameterCd) - response_file_path = "data/waterdata_pmcodes.txt" + response_file_path = "tests/data/waterdata_pmcodes.txt" mock_request(requests_mock, request_url, response_file_path) if parameterCd_input_type_list: parameterCd = [parameterCd] @@ -372,7 +372,7 @@ def test_get_water_use_national(requests_mock): "https://nwis.waterdata.usgs.gov/nwis/water_use?rdb_compression=value&format={}&wu_year=ALL" "&wu_category=ALL&wu_county=ALL".format(format) ) - response_file_path = "data/water_use_national.txt" + response_file_path = "tests/data/water_use_national.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_water_use() if not isinstance(df, DataFrame): @@ -390,7 +390,7 @@ def test_get_water_use_national_year_value_types(requests_mock, year_input_type_ "https://nwis.waterdata.usgs.gov/nwis/water_use?rdb_compression=value&format={}&wu_year=ALL" "&wu_category=ALL&wu_county=ALL".format(_format) ) - response_file_path = "data/water_use_national.txt" + response_file_path = "tests/data/water_use_national.txt" mock_request(requests_mock, request_url, response_file_path) if year_input_type_list: years = [year] @@ -412,7 +412,7 @@ def test_get_water_use_national_county_value_types( "https://nwis.waterdata.usgs.gov/nwis/water_use?rdb_compression=value&format={}&wu_year=ALL" "&wu_category=ALL&wu_county=ALL".format(_format) ) - response_file_path = "data/water_use_national.txt" + response_file_path = "tests/data/water_use_national.txt" mock_request(requests_mock, request_url, response_file_path) if county_input_type_list: counties = [county] @@ -435,7 +435,7 @@ def 
test_get_water_use_national_county_value_types( "https://nwis.waterdata.usgs.gov/nwis/water_use?rdb_compression=value&format={}&wu_year=ALL" "&wu_category=ALL&wu_county=ALL".format(_format) ) - response_file_path = "data/water_use_national.txt" + response_file_path = "tests/data/water_use_national.txt" mock_request(requests_mock, request_url, response_file_path) if category_input_type_list: categories = [category] @@ -455,7 +455,7 @@ def test_get_water_use_allegheny(requests_mock): "https://nwis.waterdata.usgs.gov/PA/nwis/water_use?rdb_compression=value&format=rdb&wu_year=ALL" "&wu_category=ALL&wu_county=003&wu_area=county" ) - response_file_path = "data/water_use_allegheny.txt" + response_file_path = "tests/data/water_use_allegheny.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_water_use(state="PA", counties="003") if not isinstance(df, DataFrame): @@ -481,7 +481,7 @@ def test_get_ratings(requests_mock): request_url = "https://nwis.waterdata.usgs.gov/nwisweb/get_ratings/?site_no={}&file_type=base".format( site ) - response_file_path = "data/waterservices_ratings.txt" + response_file_path = "tests/data/waterservices_ratings.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_ratings(site_no=site) if not isinstance(df, DataFrame): @@ -501,7 +501,7 @@ def test_what_sites(requests_mock): "https://waterservices.usgs.gov/nwis/site?bBox=-83.0%2C36.5%2C-81.0%2C38.5" "¶meterCd={}&hasDataTypeCd=dv&format={}".format(parameter_cd, format) ) - response_file_path = "data/nwis_sites.txt" + response_file_path = "tests/data/nwis_sites.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_sites( @@ -534,7 +534,7 @@ def test_get_stats(requests_mock): request_url = "https://waterservices.usgs.gov/nwis/stat?sites=01491000%2C01645000&format={}".format( format ) - response_file_path = "data/waterservices_stats.txt" + response_file_path = "tests/data/waterservices_stats.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_stats(sites=["01491000", "01645000"]) @@ -552,7 +552,7 @@ def test_get_stats_site_value_types(requests_mock, site_input_type_list): request_url = "https://waterservices.usgs.gov/nwis/stat?sites={}&format={}".format( site, _format ) - response_file_path = "data/waterservices_stats.txt" + response_file_path = "tests/data/waterservices_stats.txt" mock_request(requests_mock, request_url, response_file_path) if site_input_type_list: sites = [site] @@ -579,7 +579,7 @@ def assert_metadata(requests_mock, request_url, md, site, parameter_cd, format): site_request_url = ( "https://waterservices.usgs.gov/nwis/site?sites={}&format=rdb".format(site) ) - with open("data/waterservices_site.txt") as text: + with open("tests/data/waterservices_site.txt") as text: requests_mock.get(site_request_url, text=text.read()) site_info, _ = md.site_info if not isinstance(site_info, DataFrame): @@ -591,7 +591,7 @@ def assert_metadata(requests_mock, request_url, md, site, parameter_cd, format): pcode_request_url = "https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?fmt=rdb&parm_nm_cd=%25{}%25".format( param ) - with open("data/waterdata_pmcodes.txt") as text: + with open("tests/data/waterdata_pmcodes.txt") as text: requests_mock.get(pcode_request_url, text=text.read()) variable_info, _ = md.variable_info assert type(variable_info) is DataFrame diff --git a/tests/wqp_test.py b/tests/wqp_test.py index acf48c36..f36558bc 100755 --- a/tests/wqp_test.py +++ b/tests/wqp_test.py @@ -24,7 +24,7 @@ def 
test_get_results(requests_mock): "&characteristicName=Specific+conductance&startDateLo=05-01-2011&startDateHi=09-30-2011" "&mimeType=csv" ) - response_file_path = "data/wqp_results.txt" + response_file_path = "tests/data/wqp_results.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_results( siteid="WIDNR_WQX-10032762", @@ -48,7 +48,7 @@ def test_get_results_WQX3(requests_mock): "&mimeType=csv" "&dataProfile=fullPhysChem" ) - response_file_path = "data/wqp3_results.txt" + response_file_path = "tests/data/wqp3_results.txt" mock_request(requests_mock, request_url, response_file_path) df, md = get_results( legacy=False, @@ -71,7 +71,7 @@ def test_what_sites(requests_mock): "https://www.waterqualitydata.us/data/Station/Search?statecode=US%3A34&characteristicName=Chloride" "&mimeType=csv" ) - response_file_path = "data/wqp_sites.txt" + response_file_path = "tests/data/wqp_sites.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_sites(statecode="US:34", characteristicName="Chloride") assert type(df) is DataFrame @@ -88,7 +88,7 @@ def test_what_organizations(requests_mock): "https://www.waterqualitydata.us/data/Organization/Search?statecode=US%3A34&characteristicName=Chloride" "&mimeType=csv" ) - response_file_path = "data/wqp_organizations.txt" + response_file_path = "tests/data/wqp_organizations.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_organizations(statecode="US:34", characteristicName="Chloride") assert type(df) is DataFrame @@ -105,7 +105,7 @@ def test_what_projects(requests_mock): "https://www.waterqualitydata.us/data/Project/Search?statecode=US%3A34&characteristicName=Chloride" "&mimeType=csv" ) - response_file_path = "data/wqp_projects.txt" + response_file_path = "tests/data/wqp_projects.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_projects(statecode="US:34", characteristicName="Chloride") assert type(df) is DataFrame @@ -122,7 +122,7 @@ def test_what_activities(requests_mock): "https://www.waterqualitydata.us/data/Activity/Search?statecode=US%3A34&characteristicName=Chloride" "&mimeType=csv" ) - response_file_path = "data/wqp_activities.txt" + response_file_path = "tests/data/wqp_activities.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_activities(statecode="US:34", characteristicName="Chloride") assert type(df) is DataFrame @@ -139,7 +139,7 @@ def test_what_detection_limits(requests_mock): "https://www.waterqualitydata.us/data/ResultDetectionQuantitationLimit/Search?statecode=US%3A34&characteristicName=Chloride" "&mimeType=csv" ) - response_file_path = "data/wqp_detection_limits.txt" + response_file_path = "tests/data/wqp_detection_limits.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_detection_limits(statecode="US:34", characteristicName="Chloride") assert type(df) is DataFrame @@ -156,7 +156,7 @@ def test_what_habitat_metrics(requests_mock): "https://www.waterqualitydata.us/data/BiologicalMetric/Search?statecode=US%3A34&characteristicName=Chloride" "&mimeType=csv" ) - response_file_path = "data/wqp_habitat_metrics.txt" + response_file_path = "tests/data/wqp_habitat_metrics.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_habitat_metrics(statecode="US:34", characteristicName="Chloride") assert type(df) is DataFrame @@ -173,7 +173,7 @@ def test_what_project_weights(requests_mock): 
"https://www.waterqualitydata.us/data/ProjectMonitoringLocationWeighting/Search?statecode=US%3A34&characteristicName=Chloride" "&mimeType=csv" ) - response_file_path = "data/wqp_project_weights.txt" + response_file_path = "tests/data/wqp_project_weights.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_project_weights(statecode="US:34", characteristicName="Chloride") assert type(df) is DataFrame @@ -190,7 +190,7 @@ def test_what_activity_metrics(requests_mock): "https://www.waterqualitydata.us/data/ActivityMetric/Search?statecode=US%3A34&characteristicName=Chloride" "&mimeType=csv" ) - response_file_path = "data/wqp_activity_metrics.txt" + response_file_path = "tests/data/wqp_activity_metrics.txt" mock_request(requests_mock, request_url, response_file_path) df, md = what_activity_metrics(statecode="US:34", characteristicName="Chloride") assert type(df) is DataFrame From f4693b6b0a88ee593bc147f51a18e1326ef2c03d Mon Sep 17 00:00:00 2001 From: nodohs Date: Wed, 1 Oct 2025 22:46:43 -0500 Subject: [PATCH 31/56] Update README.md --- README.md | 247 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 171 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index 74641211..f7a8664d 100644 --- a/README.md +++ b/README.md @@ -6,124 +6,219 @@ ## Latest Announcements -:mega: **10/01/2025:** `dataretrieval` is pleased to offer a new, *in-development* module, `waterdata`, which gives users access USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, instantaneous values, field measurements (modernized groundwater levels service), time series metadata, and discrete water quality data from the Samples database. Though there will be a period of overlap, the functions within `waterdata` will eventually replace the `nwis` module, which currently provides access to the legacy [NWIS Water Services](https://waterservices.usgs.gov/). More example workflows and functions coming soon. Check `help(waterdata)` for more information. - -**Important:** Users of the Water Data APIs are strongly encouraged to obtain an API key, which gives users higher rate limits and thus greater access to USGS data. [Register for an API key](https://api.waterdata.usgs.gov/signup/) and then place that API key in your python environment as an environment variable named "API_USGS_PAT". One option is to set the variable as follows: +:mega: **10/01/2025:** `dataretrieval` now features the new `waterdata` module, +which provides access to USGS's modernized [Water Data +APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include +daily values, instantaneous values, field measurements, time series metadata, +and discrete water quality data from the Samples database. This new module will +eventually replace the `nwis` module, which provides access to the legacy [NWIS +Water Services](https://waterservices.usgs.gov/). + +**Important:** Users of the Water Data APIs are strongly encouraged to obtain an +API key for higher rate limits and greater access to USGS data. [Register for +an API key](https://api.waterdata.usgs.gov/signup/) and set it as an +environment variable: ```python import os os.environ["API_USGS_PAT"] = "your_api_key_here" ``` -Note that you may need to restart your python session for the environment variable to be recognized. -Check out the [NEWS](NEWS.md) file for all updates and announcements, or track updates to the package via the GitHub releases. 
+Check out the [NEWS](NEWS.md) file for all updates and announcements. ## What is dataretrieval? -`dataretrieval` was created to simplify the process of loading hydrologic data into the Python environment. -Like the original R version [`dataRetrieval`](https://github.com/DOI-USGS/dataRetrieval), -it is designed to retrieve the major data types of U.S. Geological Survey (USGS) hydrology -data that are available on the Web, as well as data from the Water -Quality Portal (WQP), which currently houses water quality data from the -Environmental Protection Agency (EPA), U.S. Department of Agriculture -(USDA), and USGS. Direct USGS data is obtained from a service called the -National Water Information System (NWIS). -Note that the python version is not a direct port of the original: it attempts to reproduce the functionality of the R package, though its organization and interface often differ. +`dataretrieval` simplifies the process of loading hydrologic data into Python. +Like the original R version +[`dataRetrieval`](https://github.com/DOI-USGS/dataRetrieval), it retrieves major +U.S. Geological Survey (USGS) hydrology data types available on the Web, as well +as data from the Water Quality Portal (WQP) and Network Linked Data Index +(NLDI). -If there's a hydrologic or environmental data portal that you'd like dataretrieval to -work with, raise it as an [issue](https://github.com/USGS-python/dataretrieval/issues). +## Usage Examples -Here's an example using `dataretrieval` to retrieve data from the National Water Information System (NWIS). +### Water Data API (Recommended - Modern USGS Data) -```python -# first import the functions for downloading data from NWIS -import dataretrieval.nwis as nwis +The `waterdata` module provides access to modern USGS Water Data APIs: -# specify the USGS site code for which we want data. -site = '03339000' +```python +import dataretrieval.waterdata as waterdata + +# Get daily streamflow data (returns DataFrame and metadata) +df, metadata = waterdata.get_daily( + monitoring_location_id='USGS-01646500', + parameter_code='00060', # Discharge + time='2024-10-01/2024-10-02' +) + +print(f"Retrieved {len(df)} records") +print(f"Site: {df['monitoring_location_id'].iloc[0]}") +print(f"Mean discharge: {df['value'].mean():.2f} {df['unit_of_measure'].iloc[0]}") +``` -# get instantaneous values (iv) -df = nwis.get_record(sites=site, service='iv', start='2017-12-31', end='2018-01-01') +```python +# Get monitoring location information +locations, metadata = waterdata.get_monitoring_locations( + state_name='Maryland', + site_type_code='ST' # Stream sites +) -# get basic info about the site -df2 = nwis.get_record(sites=site, service='site') +print(f"Found {len(locations)} stream monitoring locations in Maryland") ``` -Services available from NWIS include: -- instantaneous values (iv) -- daily values (dv) -- statistics (stat) -- site info (site) -- discharge peaks (peaks) -- discharge measurements (measurements) - -Water quality data are available from: -- [Samples](https://waterdata.usgs.gov/download-samples/#dataProfile=site) - Discrete USGS water quality data only -- [Water Quality Portal](https://www.waterqualitydata.us/) - Discrete water quality data from USGS and EPA. Older data are available in the legacy WQX version 2 format; all data are available in the beta WQX3.0 format. - -To access the full functionality available from NWIS web services, `nwis.get_record()` appends any additional kwargs into the REST request. 
For example, this function call: + +### NWIS Legacy Services (Deprecated but still functional) + +The `nwis` module accesses legacy NWIS Water Services: + ```python -nwis.get_record(sites='03339000', service='dv', start='2017-12-31', parameterCd='00060') +import dataretrieval.nwis as nwis + +# Get site information +info, metadata = nwis.get_info(sites='01646500') + +print(f"Site name: {info['station_nm'].iloc[0]}") + +# Get daily values +dv, metadata = nwis.get_dv( + sites='01646500', + start='2024-10-01', + end='2024-10-02', + parameterCd='00060', +) + +print(f"Retrieved {len(dv)} daily values") ``` -...will download daily data with the parameter code 00060 (discharge). -## Accessing the "Internal" NWIS -If you're connected to the USGS network, dataretrieval call pull from the internal (non-public) NWIS interface. -Most dataretrieval functions pass kwargs directly to NWIS's REST API, which provides simple access to internal data; simply specify "access='3'". -For example +### Water Quality Portal (WQP) + +Access water quality data from multiple agencies: + ```python -nwis.get_record(sites='05404147',service='iv', start='2021-01-01', end='2021-3-01', access='3') +import dataretrieval.wqp as wqp + +# Find water quality monitoring sites +sites = wqp.what_sites( + statecode='US:55', # Wisconsin + siteType='Stream' +) + +print(f"Found {len(sites)} stream monitoring sites in Wisconsin") + +# Get water quality results +results = wqp.get_results( + siteid='USGS-05427718', + characteristicName='Temperature, water' +) + +print(f"Retrieved {len(results)} temperature measurements") ``` -## Quick start +### Network Linked Data Index (NLDI) -dataretrieval can be installed using pip: - - $ python3 -m pip install -U dataretrieval +Discover and navigate hydrologic networks: -or conda: +```python +import dataretrieval.nldi as nldi - $ conda install -c conda-forge dataretrieval +# Get watershed basin for a stream reach +basin = nldi.get_basin( + feature_source='comid', + feature_id='13293474' # NHD reach identifier +) -More examples of use are include in [`demos`](https://github.com/USGS-python/dataretrieval/tree/main/demos). 
+print(f"Basin contains {len(basin)} feature(s)") -## Issue tracker +# Find upstream flowlines +flowlines = nldi.get_flowlines( + feature_source='comid', + feature_id='13293474', + navigation_mode='UT', # Upstream tributaries + distance=50 # km +) -Please report any bugs and enhancement ideas using the dataretrieval issue -tracker: +print(f"Found {len(flowlines)} upstream tributaries within 50km") +``` - https://github.com/USGS-python/dataretrieval/issues +## Available Data Services + +### Modern USGS Water Data APIs (Recommended) +- **Daily values**: Daily statistical summaries (mean, min, max) +- **Instantaneous values**: High-frequency continuous data +- **Field measurements**: Discrete measurements from field visits +- **Monitoring locations**: Site information and metadata +- **Time series metadata**: Information about available data parameters + +### Legacy NWIS Services (Deprecated) +- **Daily values (dv)**: Legacy daily statistical data +- **Instantaneous values (iv)**: Legacy continuous data +- **Site info (site)**: Basic site information +- **Statistics (stat)**: Statistical summaries +- **Discharge peaks (peaks)**: Annual peak discharge events +- **Discharge measurements (measurements)**: Direct flow measurements + +### Water Quality Portal +- **Results**: Water quality analytical results from USGS, EPA, and other agencies +- **Sites**: Monitoring location information +- **Organizations**: Data provider information +- **Projects**: Sampling project details + +### Network Linked Data Index (NLDI) +- **Basin delineation**: Watershed boundaries for any point +- **Flow navigation**: Upstream/downstream network traversal +- **Feature discovery**: Find monitoring sites, dams, and other features +- **Hydrologic connectivity**: Link data across the stream network + +## Installation + +Install dataretrieval using pip: + +```bash +pip install dataretrieval +``` -Feel free to also ask questions on the tracker. +Or using conda: +```bash +conda install -c conda-forge dataretrieval +``` -## Contributing +## More Examples -Any help in testing, development, documentation and other tasks is welcome. -For more details, see the file [CONTRIBUTING.md](CONTRIBUTING.md). +Explore additional examples in the +[`demos`](https://github.com/USGS-python/dataretrieval/tree/main/demos) +directory, including Jupyter notebooks demonstrating advanced usage patterns. +## Getting Help -## Need help? +- **Issue tracker**: Report bugs and request features at https://github.com/USGS-python/dataretrieval/issues +- **Documentation**: Full API documentation available in the source code docstrings -The Water Mission Area of the USGS supports the development and maintenance of `dataretrieval`. Any questions can be directed to the Computational Tools team at comptools@usgs.gov. +## Contributing -Resources are available primarily for maintenance and responding to user questions. -Priorities on the development of new features are determined by the `dataretrieval` development team. +Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for +development guidelines. ## Acknowledgments -This material is partially based upon work supported by the National Science Foundation (NSF) under award 1931297. -Any opinions, findings, conclusions, or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the NSF. + +This material is partially based upon work supported by the National Science +Foundation (NSF) under award 1931297. 
Any opinions, findings, conclusions, or +recommendations expressed in this material are those of the authors and do not +necessarily reflect the views of the NSF. ## Disclaimer -This software is preliminary or provisional and is subject to revision. -It is being provided to meet the need for timely best science. -The software has not received final approval by the U.S. Geological Survey (USGS). -No warranty, expressed or implied, is made by the USGS or the U.S. Government as to the functionality of the software and related material nor shall the fact of release constitute any such warranty. -The software is provided on the condition that neither the USGS nor the U.S. Government shall be held liable for any damages resulting from the authorized or unauthorized use of the software. +This software is preliminary or provisional and is subject to revision. It is +being provided to meet the need for timely best science. The software has not +received final approval by the U.S. Geological Survey (USGS). No warranty, +expressed or implied, is made by the USGS or the U.S. Government as to the +functionality of the software and related material nor shall the fact of release +constitute any such warranty. The software is provided on the condition that +neither the USGS nor the U.S. Government shall be held liable for any damages +resulting from the authorized or unauthorized use of the software. ## Citation -Hodson, T.O., Hariharan, J.A., Black, S., and Horsburgh, J.S., 2023, dataretrieval (Python): a Python package for discovering -and retrieving water data available from U.S. federal hydrologic web services: -U.S. Geological Survey software release, -https://doi.org/10.5066/P94I5TX3. +Hodson, T.O., Hariharan, J.A., Black, S., and Horsburgh, J.S., 2023, +dataretrieval (Python): a Python package for discovering and retrieving water +data available from U.S. federal hydrologic web services: U.S. Geological Survey +software release, https://doi.org/10.5066/P94I5TX3. From 0d066728d7b42213d160ad7aac354b9ad91602ea Mon Sep 17 00:00:00 2001 From: nodohs Date: Tue, 21 Oct 2025 23:35:39 -0500 Subject: [PATCH 32/56] Add deprecation warning for nwis --- dataretrieval/nwis.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 1189b790..e4615d10 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -2,13 +2,6 @@ .. _National Water Information System (NWIS): https://waterdata.usgs.gov/nwis - -.. todo:: - - * Create a test to check whether functions pull multiple sites - * Work on multi-index capabilities. - * Check that all timezones are handled properly for each service. - """ import re @@ -19,7 +12,7 @@ import pandas as pd import requests -from dataretrieval.utils import BaseMetadata, format_datetime, to_str +from dataretrieval.utils import BaseMetadata, format_datetime from .utils import query @@ -28,6 +21,14 @@ except ImportError: gpd = None +# Issue deprecation warning upon import +warnings.warn( + "The 'nwis' services are deprecated and being decommissioned. 
" + "Please use the 'waterdata' module to access the new services.", + DeprecationWarning, + stacklevel=2 +) + WATERDATA_BASE_URL = "https://nwis.waterdata.usgs.gov/" WATERDATA_URL = WATERDATA_BASE_URL + "nwis/" WATERSERVICE_URL = "https://waterservices.usgs.gov/nwis/" From 96a4356422d2e6b09892f6ae31ffe27747a582c3 Mon Sep 17 00:00:00 2001 From: Elise Hinman <121896266+ehinman@users.noreply.github.com> Date: Fri, 21 Nov 2025 15:14:19 -0600 Subject: [PATCH 33/56] Update dataretrieval/waterdata/api.py --- dataretrieval/waterdata/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index d1ae55f0..42ec8f53 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -44,7 +44,7 @@ def get_daily( limit: Optional[int] = None, max_results: Optional[int] = None, convert_type: bool = True, -) -> pd.DataFrame: +) -> Tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. From 7f7f184a96d97bb39b4bfb11c5302d342bb79d5c Mon Sep 17 00:00:00 2001 From: Elise Hinman <121896266+ehinman@users.noreply.github.com> Date: Fri, 21 Nov 2025 15:16:30 -0600 Subject: [PATCH 34/56] Update dataretrieval/waterdata/api.py --- dataretrieval/waterdata/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 42ec8f53..f8c0d939 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -248,7 +248,7 @@ def get_monitoring_locations( limit: Optional[int] = None, max_results: Optional[int] = None, convert_type: bool = True, -) -> pd.DataFrame: +) -> Tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location including the name, identifier, agency responsible for data collection, and the date the location was established. It also includes information about From c14e00b97eb0858fc0d2578c7505dc2eb0b92b19 Mon Sep 17 00:00:00 2001 From: Elise Hinman <121896266+ehinman@users.noreply.github.com> Date: Fri, 21 Nov 2025 15:17:02 -0600 Subject: [PATCH 35/56] Update dataretrieval/waterdata/api.py --- dataretrieval/waterdata/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index f8c0d939..f3bbf9e4 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -672,7 +672,7 @@ def get_time_series_metadata( >>> # Get daily flow data from a single site >>> # over a yearlong period - >>> df = dataretrieval.waterdata.get_time_series_metadata( + >>> df, metadata = dataretrieval.waterdata.get_time_series_metadata( ... monitoring_location_id="USGS-02238500", ... parameter_code="00060", ... 
time="2021-01-01T00:00:00Z/2022-01-01T00:00:00Z", From dcc7a1ab5e9c226ce39bc16d0feb11fdaee4a12a Mon Sep 17 00:00:00 2001 From: Elise Hinman <121896266+ehinman@users.noreply.github.com> Date: Fri, 21 Nov 2025 15:20:18 -0600 Subject: [PATCH 36/56] Apply suggestions from code review --- dataretrieval/waterdata/api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index f3bbf9e4..ad40d132 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -174,7 +174,7 @@ def get_daily( >>> # Get daily flow data from a single site >>> # over a yearlong period - >>> df = dataretrieval.waterdata.get_daily( + >>> df, metadata = dataretrieval.waterdata.get_daily( ... monitoring_location_id="USGS-02238500", ... parameter_code="00060", ... time="2021-01-01T00:00:00Z/2022-01-01T00:00:00Z", @@ -680,7 +680,7 @@ def get_time_series_metadata( >>> # Get monitoring location info for specific sites >>> # and only specific properties - >>> df = dataretrieval.waterdata.get_time_series_metadata( + >>> df, metadata = dataretrieval.waterdata.get_time_series_metadata( ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], ... time = "2024-01-01/.." """ @@ -715,7 +715,7 @@ def get_latest_continuous( limit: Optional[int] = None, max_results: Optional[int] = None, convert_type: bool = True, -) -> pd.DataFrame: +) -> Tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series of continuous data. Continuous data are collected via automated sensors installed at a monitoring location. They are collected at a high frequency @@ -885,7 +885,7 @@ def get_field_measurements( limit: Optional[int] = None, max_results: Optional[int] = None, convert_type: bool = True, -) -> pd.DataFrame: +) -> Tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a visit to the monitoring location. 
Field measurements consist of measurements
     of gage height and discharge, and readings of groundwater levels, and are

From 4482751b72087142e0d0019f0753760008e92ae4 Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Fri, 21 Nov 2025 16:18:48 -0600
Subject: [PATCH 37/56] add back in documentation and make formatting changes

---
 dataretrieval/waterdata/__init__.py |  6 +++---
 dataretrieval/waterdata/api.py      | 32 +++++++++++++++++++++++++----
 dataretrieval/waterdata/types.py    |  1 -
 dataretrieval/waterdata/utils.py    | 11 +++++-----
 4 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py
index 7d87f79c..f2d508ad 100644
--- a/dataretrieval/waterdata/__init__.py
+++ b/dataretrieval/waterdata/__init__.py
@@ -11,6 +11,7 @@

 # Public API exports
 from .api import (
+    _check_profiles,
     get_codes,
     get_daily,
     get_field_measurements,
@@ -18,13 +19,12 @@
     get_monitoring_locations,
     get_samples,
     get_time_series_metadata,
-    _check_profiles,
 )
 from .types import (
     CODE_SERVICES,
-    SERVICES,
-    PROFILES,
     PROFILE_LOOKUP,
+    PROFILES,
+    SERVICES,
 )

 __all__ = [
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index ad40d132..2112dcc3 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -7,7 +7,7 @@
 import json
 import logging
 from io import StringIO
-from typing import Optional, List, Tuple, Union, get_args
+from typing import List, Optional, Tuple, Union, get_args

 import pandas as pd
 import requests
@@ -96,7 +96,15 @@ def get_daily(
     approval_status : string or list of strings, optional
         Some of the data that you have obtained from this U.S. Geological Survey
         database may not have received Director's approval. Any such data values
-        are qualified as provisional and are subject to revision.
+        are qualified as provisional and are subject to revision. Provisional
+        data are released on the condition that neither the USGS nor the United
+        States Government may be held liable for any damages resulting from its
+        use. This field reflects the approval status of each record, and is either
+        "Approved", meaning processing review has been completed and the data is
+        approved for publication, or "Provisional" and subject to revision. For
+        more information about provisional data, go to
+        [https://waterdata.usgs.gov/provisional-data-statement/]
+        (https://waterdata.usgs.gov/provisional-data-statement/).
     unit_of_measure : string or list of strings, optional
         A human-readable description of the units of measurement associated
         with an observation.
@@ -765,7 +773,15 @@ def get_latest_continuous(
     approval_status : string or list of strings, optional
         Some of the data that you have obtained from this U.S. Geological Survey
         database may not have received Director's approval. Any such data values
-        are qualified as provisional and are subject to revision.
+        are qualified as provisional and are subject to revision. Provisional
+        data are released on the condition that neither the USGS nor the United
+        States Government may be held liable for any damages resulting from its
+        use. This field reflects the approval status of each record, and is either
+        "Approved", meaning processing review has been completed and the data is
+        approved for publication, or "Provisional" and subject to revision. For
+        more information about provisional data, go to
+        [https://waterdata.usgs.gov/provisional-data-statement/]
+        (https://waterdata.usgs.gov/provisional-data-statement/).
unit_of_measure : string or list of strings, optional
        A human-readable description of the units of measurement associated
        with an observation.
@@ -921,7 +937,15 @@
     approval_status : string or list of strings, optional
         Some of the data that you have obtained from this U.S. Geological Survey
         database may not have received Director's approval. Any such data values
-        are qualified as provisional and are subject to revision.
+        are qualified as provisional and are subject to revision. Provisional
+        data are released on the condition that neither the USGS nor the United
+        States Government may be held liable for any damages resulting from its
+        use. This field reflects the approval status of each record, and is either
+        "Approved", meaning processing review has been completed and the data is
+        approved for publication, or "Provisional" and subject to revision. For
+        more information about provisional data, go to
+        [https://waterdata.usgs.gov/provisional-data-statement/]
+        (https://waterdata.usgs.gov/provisional-data-statement/).
     unit_of_measure : string or list of strings, optional
         A human-readable description of the units of measurement associated
         with an observation.
diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py
index 07e000c0..65e73394 100644
--- a/dataretrieval/waterdata/types.py
+++ b/dataretrieval/waterdata/types.py
@@ -1,6 +1,5 @@
 from typing import Literal
-

 CODE_SERVICES = Literal[
     "characteristicgroup",
     "characteristics",
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 10857503..4f3b60ff 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -1,12 +1,13 @@
-import requests
-import os
+import json
 import logging
-from typing import List, Dict, Any, Optional, Union, Tuple
+import os
+import re
 from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import pandas as pd
-import json
+import requests
 from zoneinfo import ZoneInfo
-import re

 from dataretrieval.utils import BaseMetadata

From 37063b9c2bdae1c8074380b6558712baec360e8c Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Fri, 21 Nov 2025 16:34:31 -0600
Subject: [PATCH 38/56] add metadata to api.py and testing

---
 dataretrieval/waterdata/api.py | 30 ++++++++++++++------
 tests/waterdata_test.py        | 48 +++++++++++++++++-----------------
 2 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index 2112dcc3..663bb92e 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -175,6 +175,8 @@ def get_daily(
     -------
     df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame``
         Formatted data returned from the API query.
+    md: :obj:`dataretrieval.utils.Metadata`
+        A custom metadata object

     Examples
     --------
@@ -182,7 +184,7 @@

     >>> # Get daily flow data from a single site
     >>> # over a yearlong period
-    >>> df, metadata = dataretrieval.waterdata.get_daily(
+    >>> df, md = dataretrieval.waterdata.get_daily(
     ...     monitoring_location_id="USGS-02238500",
     ...     parameter_code="00060",
     ...     time="2021-01-01T00:00:00Z/2022-01-01T00:00:00Z",
@@ -190,7 +192,7 @@

     >>> # Get monitoring location info for specific sites
     >>> # and only specific properties
-    >>> df = dataretrieval.waterdata.get_daily(
+    >>> df, md = dataretrieval.waterdata.get_daily(
     ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
     ...     approval_status = "Approved",
     ...     time = "2024-01-01/.."
@@ -484,6 +486,8 @@ def get_monitoring_locations( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. + md: :obj:`dataretrieval.utils.Metadata` + A custom metadata object Examples -------- @@ -491,13 +495,13 @@ def get_monitoring_locations( >>> # Get monitoring locations within a bounding box >>> # and leave out geometry - >>> df = dataretrieval.waterdata.get_monitoring_locations( + >>> df, md = dataretrieval.waterdata.get_monitoring_locations( ... bbox=[-90.2, 42.6, -88.7, 43.2], skip_geometry=True ... ) >>> # Get monitoring location info for specific sites >>> # and only specific properties - >>> df = dataretrieval.waterdata.get_monitoring_locations( + >>> df, md = dataretrieval.waterdata.get_monitoring_locations( ... monitoring_location_id=["USGS-05114000", "USGS-09423350"], ... properties=["monitoring_location_id", "state_name", "country_name"], ... ) @@ -673,6 +677,8 @@ def get_time_series_metadata( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. + md: :obj:`dataretrieval.utils.Metadata` + A custom metadata object Examples -------- @@ -680,7 +686,7 @@ def get_time_series_metadata( >>> # Get daily flow data from a single site >>> # over a yearlong period - >>> df, metadata = dataretrieval.waterdata.get_time_series_metadata( + >>> df, md = dataretrieval.waterdata.get_time_series_metadata( ... monitoring_location_id="USGS-02238500", ... parameter_code="00060", ... time="2021-01-01T00:00:00Z/2022-01-01T00:00:00Z", @@ -688,7 +694,7 @@ def get_time_series_metadata( >>> # Get monitoring location info for specific sites >>> # and only specific properties - >>> df, metadata = dataretrieval.waterdata.get_time_series_metadata( + >>> df, md = dataretrieval.waterdata.get_time_series_metadata( ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], ... time = "2024-01-01/.." """ @@ -851,6 +857,8 @@ def get_latest_continuous( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. + md: :obj:`dataretrieval.utils.Metadata` + A custom metadata object Examples -------- @@ -858,13 +866,13 @@ def get_latest_continuous( >>> # Get daily flow data from a single site >>> # over a yearlong period - >>> df = dataretrieval.waterdata.get_latest_continuous( + >>> df, md = dataretrieval.waterdata.get_latest_continuous( ... monitoring_location_id="USGS-02238500", parameter_code="00060" ... ) >>> # Get monitoring location info for specific sites >>> # and only specific properties - >>> df = dataretrieval.waterdata.get_daily( + >>> df, md = dataretrieval.waterdata.get_daily( ... monitoring_location_id=["USGS-05114000", "USGS-09423350"] ... ) """ @@ -1019,6 +1027,8 @@ def get_field_measurements( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. + md: :obj:`dataretrieval.utils.Metadata` + A custom metadata object Examples -------- @@ -1026,7 +1036,7 @@ def get_field_measurements( >>> # Get daily flow data from a single site >>> # over a yearlong period - >>> df = dataretrieval.waterdata.get_field_measurements( + >>> df, md = dataretrieval.waterdata.get_field_measurements( ... monitoring_location_id="USGS-375907091432201", ... parameter_code="72019", ... 
skip_geometry=True, @@ -1034,7 +1044,7 @@ def get_field_measurements( >>> # Get monitoring location info for specific sites >>> # and only specific properties - >>> df = dataretrieval.waterdata.get_field_measurements( + >>> df, md = dataretrieval.waterdata.get_field_measurements( ... monitoring_location_id = ["USGS-451605097071701", "USGS-263819081585801"], ... parameter_code = ["62611", "72019"], diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 0f46e231..e928d0e6 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -112,7 +112,7 @@ def test_samples_organizations(): assert df.size == 3 def test_get_daily(): - df, metadata = get_daily( + df, md = get_daily( monitoring_location_id="USGS-05427718", parameter_code="00060", time="2025-01-01/.." @@ -123,12 +123,12 @@ def test_get_daily(): assert df.parameter_code.unique().tolist() == ["00060"] assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"] assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all() - assert hasattr(metadata, 'url') - assert hasattr(metadata, 'query_time') + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') assert df["value"].dtype == "float64" def test_get_daily_properties(): - df, metadata = get_daily( + df, md = get_daily( monitoring_location_id="USGS-05427718", parameter_code="00060", time="2025-01-01/..", @@ -138,11 +138,11 @@ def test_get_daily_properties(): assert "geometry" in df.columns assert df.shape[1] == 6 assert df.parameter_code.unique().tolist() == ["00060"] - assert hasattr(metadata, 'url') - assert hasattr(metadata, 'query_time') + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') def test_get_daily_no_geometry(): - df, metadata = get_daily( + df, md = get_daily( monitoring_location_id="USGS-05427718", parameter_code="00060", time="2025-01-01/..", @@ -151,35 +151,35 @@ def test_get_daily_no_geometry(): assert "geometry" not in df.columns assert df.shape[1] == 11 assert isinstance(df, DataFrame) - assert hasattr(metadata, 'url') - assert hasattr(metadata, 'query_time') + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') def test_get_monitoring_locations(): - df, metadata = get_monitoring_locations( + df, md = get_monitoring_locations( state_name="Connecticut", site_type_code="GW" ) assert df.site_type_code.unique().tolist() == ["GW"] - assert hasattr(metadata, 'url') - assert hasattr(metadata, 'query_time') + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') def test_get_monitoring_locations_hucs(): - df, metadata = get_monitoring_locations( + df, md = get_monitoring_locations( hydrologic_unit_code=["010802050102", "010802050103"] ) assert set(df.hydrologic_unit_code.unique().tolist()) == {"010802050102", "010802050103"} - assert hasattr(metadata, 'url') - assert hasattr(metadata, 'query_time') + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') def test_get_latest_continuous(): - df, metadata = get_latest_continuous( + df, md = get_latest_continuous( monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"] ) assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] - assert hasattr(metadata, 'url') - assert hasattr(metadata, 'query_time') + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') try: datetime.datetime.strptime(df['time'].iloc[0], "%Y-%m-%dT%H:%M:%S+00:00") out=True @@ -188,7 +188,7 @@ def test_get_latest_continuous(): assert out def test_get_field_measurements(): - df, metadata = get_field_measurements( + df, 
md = get_field_measurements( monitoring_location_id="USGS-05427718", unit_of_measure="ft^3/s", time="2025-01-01/2025-10-01", @@ -197,17 +197,17 @@ def test_get_field_measurements(): assert "field_measurement_id" in df.columns assert "geometry" not in df.columns assert df.unit_of_measure.unique().tolist() == ["ft^3/s"] - assert hasattr(metadata, 'url') - assert hasattr(metadata, 'query_time') + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') def test_get_time_series_metadata(): - df, metadata = get_time_series_metadata( + df, md = get_time_series_metadata( bbox=[-89.840355,42.853411,-88.818626,43.422598], parameter_code=["00060", "00065", "72019"], skip_geometry=True ) assert set(df['parameter_name'].unique().tolist()) == {"Gage height", "Water level, depth LSD", "Discharge"} - assert hasattr(metadata, 'url') - assert hasattr(metadata, 'query_time') + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') From 8bb2de88ade36ae9d3af55050aabb4546e62d95c Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 21 Nov 2025 16:56:17 -0600 Subject: [PATCH 39/56] small changes to remove unnecessary imports and add more documentation --- tests/waterdata_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index e928d0e6..f971a206 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -11,8 +11,6 @@ get_latest_continuous, get_field_measurements, get_time_series_metadata, - SERVICES, - PROFILES, ) def mock_request(requests_mock, request_url, file_path): From 2f6af7d21fe353a849b6c78ac40b287aec007054 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 21 Nov 2025 17:04:32 -0600 Subject: [PATCH 40/56] remove some redundant testing, make next url be an info log, not debug --- tests/waterdata_test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index f971a206..f3bdd493 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -136,8 +136,6 @@ def test_get_daily_properties(): assert "geometry" in df.columns assert df.shape[1] == 6 assert df.parameter_code.unique().tolist() == ["00060"] - assert hasattr(md, 'url') - assert hasattr(md, 'query_time') def test_get_daily_no_geometry(): df, md = get_daily( @@ -149,8 +147,6 @@ def test_get_daily_no_geometry(): assert "geometry" not in df.columns assert df.shape[1] == 11 assert isinstance(df, DataFrame) - assert hasattr(md, 'url') - assert hasattr(md, 'query_time') def test_get_monitoring_locations(): df, md = get_monitoring_locations( @@ -166,8 +162,6 @@ def test_get_monitoring_locations_hucs(): hydrologic_unit_code=["010802050102", "010802050103"] ) assert set(df.hydrologic_unit_code.unique().tolist()) == {"010802050102", "010802050103"} - assert hasattr(md, 'url') - assert hasattr(md, 'query_time') def test_get_latest_continuous(): df, md = get_latest_continuous( From f0bef3e385262ec3f7f335143b55d8cb748192f1 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 24 Nov 2025 08:55:26 -0600 Subject: [PATCH 41/56] same as previous commit message, was behind on what I was committing --- dataretrieval/waterdata/api.py | 10 ++++++++++ dataretrieval/waterdata/utils.py | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 663bb92e..46883754 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -134,6 +134,8 @@ def get_daily( skip_geometry : boolean, optional This option can be used to skip 
response geometries for each feature. The returning object will be a data frame with no spatial information. + Note that the USGS Water Data APIs use camelCase "skipGeometry" in + CQL2 queries. time : string, optional The date an observation represents. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 @@ -481,6 +483,8 @@ def get_monitoring_locations( skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. + Note that the USGS Water Data APIs use camelCase "skipGeometry" in + CQL2 queries. Returns ------- @@ -652,6 +656,8 @@ def get_time_series_metadata( skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. + Note that the USGS Water Data APIs use camelCase "skipGeometry" in + CQL2 queries. bbox : list of numbers, optional Only features that have a geometry that intersects the bounding box are selected. The bounding box is provided as four or six numbers, @@ -816,6 +822,8 @@ def get_latest_continuous( skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. + Note that the USGS Water Data APIs use camelCase "skipGeometry" in + CQL2 queries. time : string, optional The date an observation represents. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 @@ -987,6 +995,8 @@ def get_field_measurements( skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. + Note that the USGS Water Data APIs use camelCase "skipGeometry" in + CQL2 queries. time : string, optional The date an observation represents. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 4f3b60ff..0f43fd12 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -450,7 +450,7 @@ def _next_req_url(resp: requests.Response) -> Optional[str]: ----- - If the environment variable "API_USGS_PAT" is set, logs the remaining requests for the current hour. - - Logs the next URL if found at debug level. + - Logs the next URL if found at info level. - Expects the response JSON to contain a "links" list with objects having "rel" and "href" keys. - Checks for the "next" relation in the "links" to determine the next URL. 
@@ -467,7 +467,7 @@ def _next_req_url(resp: requests.Response) -> Optional[str]: for link in body.get("links", []): if link.get("rel") == "next": next_url = link.get("href") - logger.debug("Next URL: %s", next_url) + logger.info("Next URL: %s", next_url) return next_url return None From 8605deaf9e2a48bae8b043cc60a0b6f68e0915b1 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 24 Nov 2025 10:41:14 -0600 Subject: [PATCH 42/56] convert failures counter to a stop that shows URL that failed --- dataretrieval/waterdata/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 0f43fd12..0c58193a 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -589,7 +589,6 @@ def _walk_pages( if max_results is None or pd.isna(max_results): dfs = _get_resp_data(resp, geopd=geopd) curr_url = _next_req_url(resp) - failures = [] while curr_url: try: resp = client.request( @@ -604,10 +603,8 @@ def _walk_pages( dfs = pd.concat([dfs, df1], ignore_index=True) curr_url = _next_req_url(resp) except Exception: - failures.append(curr_url) + logger.info("Request failed for URL: %s. Stopping pagination and data download.", curr_url) curr_url = None - if failures: - logger.warning("There were %d failed requests.", len(failures)) return dfs, initial_response else: resp.raise_for_status() From bd3f6ad2dc7ead7fcf8c4d627882090a669751e3 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 24 Nov 2025 12:33:14 -0600 Subject: [PATCH 43/56] remove max_requests as this is confusing and should be better vetted and documented before adding --- dataretrieval/waterdata/api.py | 20 -------- dataretrieval/waterdata/utils.py | 81 ++++++++++---------------------- 2 files changed, 26 insertions(+), 75 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 46883754..125b9487 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -42,7 +42,6 @@ def get_daily( time: Optional[Union[str, List[str]]] = None, bbox: Optional[List[float]] = None, limit: Optional[int] = None, - max_results: Optional[int] = None, convert_type: bool = True, ) -> Tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the @@ -166,9 +165,6 @@ def get_daily( allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The default (NA) will set the limit to the maximum allowable limit for the service. - max_results : numeric, optional - The optional maximum number of rows to return. This value must be less - than the requested limit. convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector @@ -258,7 +254,6 @@ def get_monitoring_locations( time: Optional[Union[str, List[str]]] = None, bbox: Optional[List[float]] = None, limit: Optional[int] = None, - max_results: Optional[int] = None, convert_type: bool = True, ) -> Tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location @@ -477,9 +472,6 @@ def get_monitoring_locations( allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The default (NA) will set the limit to the maximum allowable limit for the service. - max_results : numeric, optional - The optional maximum number of rows to return. This value must be less - than the requested limit. 
skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. @@ -545,7 +537,6 @@ def get_time_series_metadata( time: Optional[Union[str, List[str]]] = None, bbox: Optional[List[float]] = None, limit: Optional[int] = None, - max_results: Optional[int] = None, convert_type: bool = True, ) -> Tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, @@ -672,9 +663,6 @@ def get_time_series_metadata( allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. - max_results : numeric, optional - The optional maximum number of rows to return. This value must be less - than the requested limit. convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector @@ -733,7 +721,6 @@ def get_latest_continuous( time: Optional[Union[str, List[str]]] = None, bbox: Optional[List[float]] = None, limit: Optional[int] = None, - max_results: Optional[int] = None, convert_type: bool = True, ) -> Tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series @@ -854,9 +841,6 @@ def get_latest_continuous( allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. - max_results : numeric, optional - The optional maximum number of rows to return. This value must be less - than the requested limit. convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector @@ -915,7 +899,6 @@ def get_field_measurements( time: Optional[Union[str, List[str]]] = None, bbox: Optional[List[float]] = None, limit: Optional[int] = None, - max_results: Optional[int] = None, convert_type: bool = True, ) -> Tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a @@ -1026,9 +1009,6 @@ def get_field_measurements( allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. - max_results : numeric, optional - The optional maximum number of rows to return. This value must be less - than the requested limit. convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 0c58193a..a85124f8 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -320,7 +320,6 @@ def _construct_api_requests( properties: Optional[List[str]] = None, bbox: Optional[List[float]] = None, limit: Optional[int] = None, - max_results: Optional[int] = None, skip_geometry: bool = False, **kwargs, ): @@ -341,8 +340,6 @@ def _construct_api_requests( Bounding box coordinates as a list of floats. limit : Optional[int], optional Maximum number of results to return per request. - max_results : Optional[int], optional - Maximum number of rows to return. skip_geometry : bool, optional Whether to exclude geometry from the response (default is False). 
**kwargs @@ -354,11 +351,6 @@ def _construct_api_requests( requests.PreparedRequest The constructed HTTP request object ready to be sent. - Raises - ------ - ValueError - If `limit` is greater than `max_results`. - Notes ----- - Date/time parameters are automatically formatted to ISO8601. @@ -367,6 +359,7 @@ def _construct_api_requests( - The function sets appropriate headers for GET and POST requests. """ service_url = f"{OGC_API_URL}/collections/{service}/items" + # Single parameters can only have one value single_params = {"datetime", "last_modified", "begin", "end", "time"} @@ -381,17 +374,12 @@ def _construct_api_requests( params = {k: v for k, v in kwargs.items() if k not in post_params} # Set skipGeometry parameter (API expects camelCase) params["skipGeometry"] = skip_geometry - # If limit is none and max_results is not none, then set limit to max results. Otherwise, - # if max_results is none, set it to 10000 (the API max). + + # If limit is none or greater than 10000, then set limit to max results. Otherwise, + # use the limit params["limit"] = ( - max_results if limit is None and max_results is not None else limit or 10000 - ) - # Add max results as a parameter if it is not None - if max_results is not None: - params["max_results"] = max_results - - if max_results is not None and limit is not None and limit > max_results: - raise ValueError("limit cannot be greater than max_result") + 10000 if limit is None or limit > 10000 else limit + ) # Indicate if function needs to perform POST conversion POST = bool(post_params) @@ -521,7 +509,6 @@ def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame: def _walk_pages( geopd: bool, req: requests.PreparedRequest, - max_results: Optional[int], client: Optional[requests.Session] = None, ) -> Tuple[pd.DataFrame, requests.Response]: """ @@ -534,9 +521,6 @@ def _walk_pages( geometries. req : requests.PreparedRequest The initial HTTP request to send. - max_results : Optional[int] - Maximum number of rows to return. If None or NaN, retrieves all - available pages. client : Optional[requests.Session], default None An optional HTTP client to use for requests. If not provided, a new client is created. @@ -552,13 +536,6 @@ def _walk_pages( ------ Exception If a request fails or returns a non-200 status code. - - Notes - ----- - - If `max_results` is None or NaN, the function will continue to request - subsequent pages until no more pages are available. - - Failed requests are tracked and reported, but do not halt the entire - process unless the initial request fails. """ logger.info("Requesting: %s", req.url) @@ -586,29 +563,25 @@ def _walk_pages( headers = dict(req.headers) content = req.body if method == "POST" else None - if max_results is None or pd.isna(max_results): - dfs = _get_resp_data(resp, geopd=geopd) - curr_url = _next_req_url(resp) - while curr_url: - try: - resp = client.request( - method, - curr_url, - headers=headers, - data=content if method == "POST" else None, + dfs = _get_resp_data(resp, geopd=geopd) + curr_url = _next_req_url(resp) + while curr_url: + try: + resp = client.request( + method, + curr_url, + headers=headers, + data=content if method == "POST" else None, ) - if resp.status_code != 200: - raise Exception(_error_body(resp)) - df1 = _get_resp_data(resp, geopd=geopd) - dfs = pd.concat([dfs, df1], ignore_index=True) - curr_url = _next_req_url(resp) - except Exception: - logger.info("Request failed for URL: %s. 
Stopping pagination and data download.", curr_url) - curr_url = None - return dfs, initial_response - else: - resp.raise_for_status() - return _get_resp_data(resp, geopd=geopd), initial_response + if resp.status_code != 200: + raise Exception(_error_body(resp)) + df1 = _get_resp_data(resp, geopd=geopd) + dfs = pd.concat([dfs, df1], ignore_index=True) + curr_url = _next_req_url(resp) + except Exception: + logger.info("Request failed for URL: %s. Stopping pagination and data download.", curr_url) + curr_url = None + return dfs, initial_response finally: if close_client: client.close() @@ -742,14 +715,12 @@ def get_ogc_data( Notes ----- - The function does not mutate the input `args` dictionary. - - Handles optional arguments such as `max_results` and `convert_type`. + - Handles optional arguments such as `convert_type`. - Applies column cleanup and reordering based on service and properties. """ args = args.copy() # Add service as an argument args["service"] = service - # Pull out a max results input if exists - max_results = args.pop("max_results", None) # Switch the input id to "id" if needed args = _switch_arg_id(args, id_name=output_id, service=service) properties = args.get("properties") @@ -764,7 +735,7 @@ def get_ogc_data( req = _construct_api_requests(**args) # Run API request and iterate through pages if needed return_list, response = _walk_pages( - geopd=GEOPANDAS, req=req, max_results=max_results + geopd=GEOPANDAS, req=req ) # Manage some aspects of the returned dataset return_list = _deal_with_empty(return_list, properties, service) From 6a326ce7c8a8af52445396e84937f74483ed3bdd Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 24 Nov 2025 13:09:06 -0600 Subject: [PATCH 44/56] add new latest-daily service --- dataretrieval/waterdata/__init__.py | 2 + dataretrieval/waterdata/api.py | 179 +++++++++++++++++++++++++++- tests/waterdata_test.py | 12 ++ 3 files changed, 192 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index f2d508ad..7f68bfd6 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -16,6 +16,7 @@ get_daily, get_field_measurements, get_latest_continuous, + get_latest_daily, get_monitoring_locations, get_samples, get_time_series_metadata, @@ -32,6 +33,7 @@ "get_daily", "get_field_measurements", "get_latest_continuous", + "get_latest_daily", "get_monitoring_locations", "get_samples", "get_time_series_metadata", diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 125b9487..3b8cd433 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -864,7 +864,7 @@ def get_latest_continuous( >>> # Get monitoring location info for specific sites >>> # and only specific properties - >>> df, md = dataretrieval.waterdata.get_daily( + >>> df, md = dataretrieval.waterdata.get_latest_continuous( ... monitoring_location_id=["USGS-05114000", "USGS-09423350"] ... 
) """ @@ -881,6 +881,183 @@ def get_latest_continuous( return get_ogc_data(args, output_id, service) +def get_latest_daily( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + properties: Optional[Union[str, List[str]]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + latest_daily_id: Optional[Union[str, List[str]]] = None, + approval_status: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + qualifier: Optional[Union[str, List[str]]] = None, + value: Optional[int] = None, + last_modified: Optional[Union[str, List[str]]] = None, + skip_geometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + convert_type: bool = True, +) -> Tuple[pd.DataFrame, BaseMetadata]: + """Daily data provide one data value to represent water conditions for the + day. + + Throughout much of the history of the USGS, the primary water data available + was daily data collected manually at the monitoring location once each day. + With improved availability of computer storage and automated transmission of + data, the daily data published today are generally a statistical summary or + metric of the continuous data collected each day, such as the daily mean, + minimum, or maximum value. Daily data are automatically calculated from the + continuous data of the same parameter code and are described by parameter + code and a statistic code. These data have also been referred to as “daily + values” or “DV”. + + Parameters + ---------- + monitoring_location_id : string or list of strings, optional + A unique identifier representing a single monitoring location. This + corresponds to the id field in the monitoring-locations endpoint. + Monitoring location IDs are created by combining the agency code of the + agency responsible for the monitoring location (e.g. USGS) with the ID + number of the monitoring location (e.g. 02238500), separated by a hyphen + (e.g. USGS-02238500). + parameter_code : string or list of strings, optional + Parameter codes are 5-digit codes used to identify the constituent + measured and the units of measure. A complete list of parameter codes + and associated groupings can be found at + https://help.waterdata.usgs.gov/codes-and-parameters/parameters. + statistic_id : string or list of strings, optional + A code corresponding to the statistic an observation represents. + Example codes include 00001 (max), 00002 (min), and 00003 (mean). + A complete list of codes and their descriptions can be found at + https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. + properties : string or list of strings, optional + A vector of requested columns to be returned from the query. Available + options are: geometry, id, time_series_id, monitoring_location_id, + parameter_code, statistic_id, time, value, unit_of_measure, + approval_status, qualifier, last_modified + time_series_id : string or list of strings, optional + A unique identifier representing a single time series. This + corresponds to the id field in the time-series-metadata endpoint. + latest_daily_id : string or list of strings, optional + A universally unique identifier (UUID) representing a single version of + a record. It is not stable over time. 
Every time the record is refreshed
        in our database (which may happen as part of normal operations and does
        not imply any change to the data itself) a new ID will be generated. To
        uniquely identify a single observation over time, compare the time and
        time_series_id fields; each time series will only have a single
        observation at a given time.
    approval_status : string or list of strings, optional
        Some of the data that you have obtained from this U.S. Geological Survey
        database may not have received Director's approval. Any such data values
        are qualified as provisional and are subject to revision. Provisional
        data are released on the condition that neither the USGS nor the United
        States Government may be held liable for any damages resulting from its
        use. This field reflects the approval status of each record, and is either
        "Approved", meaning processing review has been completed and the data is
        approved for publication, or "Provisional" and subject to revision. For
        more information about provisional data, go to
        [https://waterdata.usgs.gov/provisional-data-statement/]
        (https://waterdata.usgs.gov/provisional-data-statement/).
    unit_of_measure : string or list of strings, optional
        A human-readable description of the units of measurement associated
        with an observation.
    qualifier : string or list of strings, optional
        This field indicates any qualifiers associated with an observation, for
        instance if a sensor may have been impacted by ice or if values were
        estimated.
    value : string or list of strings, optional
        The value of the observation. Values are transmitted as strings in
        the JSON response format in order to preserve precision.
    last_modified : string, optional
        The last time a record was refreshed in our database. This may happen
        due to regular operational processes and does not necessarily indicate
        anything about the measurement has changed. You can query this field
        using date-times or intervals, adhering to RFC 3339, or using ISO 8601
        duration objects. Intervals may be bounded or half-bounded (double-dots
        at start or end). Only features that have a last_modified that
        intersects the value of datetime are selected.
        Examples:
        - A date-time: "2018-02-12T23:20:50Z"
        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or
          "../2018-03-18T12:31:12Z"
        - Duration objects: "P1M" for data from the past month or "PT36H"
          for the last 36 hours
    skip_geometry : boolean, optional
        This option can be used to skip response geometries for each feature.
        The returning object will be a data frame with no spatial information.
        Note that the USGS Water Data APIs use camelCase "skipGeometry" in
        CQL2 queries.
    time : string, optional
        The date an observation represents. You can query this field using
        date-times or intervals, adhering to RFC 3339, or using ISO 8601
        duration objects. Intervals may be bounded or half-bounded (double-dots
        at start or end). Only features that have a time that intersects the
        value of datetime are selected. If a feature has multiple temporal
        properties, it is the decision of the server whether only a single
        temporal property is used to determine the extent or all relevant
        temporal properties.
        Examples:
        - A date-time: "2018-02-12T23:20:50Z"
        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." 
or
          "../2018-03-18T12:31:12Z"
        - Duration objects: "P1M" for data from the past month or "PT36H"
          for the last 36 hours
    bbox : list of numbers, optional
        Only features that have a geometry that intersects the bounding box are
        selected. The bounding box is provided as four or six numbers,
        depending on whether the coordinate reference system includes a vertical
        axis (height or depth). Coordinates are assumed to be in crs 4326. The
        expected format is a list of numbers structured:
        [xmin, ymin, xmax, ymax]. Another way to think of it is
        [Western-most longitude, Southern-most latitude, Eastern-most
        longitude, Northern-most latitude].
    limit : numeric, optional
        The optional limit parameter is used to control the subset of the
        selected features that should be returned in each page. The maximum
        allowable limit is 10000. It may be beneficial to set this number lower
        if your internet connection is spotty. The default (None) will set the
        limit to the maximum allowable limit for the service.
    convert_type : boolean, optional
        If True, the function will convert the data to dates and qualifier to
        string vector

    Returns
    -------
    df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame``
        Formatted data returned from the API query.
    md: :obj:`dataretrieval.utils.Metadata`
        A custom metadata object

    Examples
    --------
    .. code::

        >>> # Get daily flow data from a single site
        >>> # over a yearlong period
        >>> df, md = dataretrieval.waterdata.get_latest_daily(
        ...     monitoring_location_id="USGS-02238500", parameter_code="00060"
        ... )

        >>> # Get monitoring location info for specific sites
        >>> # and only specific properties
        >>> df, md = dataretrieval.waterdata.get_latest_daily(
        ...     monitoring_location_id=["USGS-05114000", "USGS-09423350"]
        ... 
) + """ + service = "latest-daily" + output_id = "latest_daily_id" + + # Build argument dictionary, omitting None values + args = { + k: v + for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + + return get_ogc_data(args, output_id, service) + def get_field_measurements( monitoring_location_id: Optional[Union[str, List[str]]] = None, parameter_code: Optional[Union[str, List[str]]] = None, diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index f3bdd493..90254f46 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -9,6 +9,7 @@ get_daily, get_monitoring_locations, get_latest_continuous, + get_latest_daily, get_field_measurements, get_time_series_metadata, ) @@ -168,6 +169,7 @@ def test_get_latest_continuous(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"] ) + assert "latest_continuous_id" in df.columns assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] assert hasattr(md, 'url') @@ -179,6 +181,16 @@ def test_get_latest_continuous(): out=False assert out +def test_get_latest_daily(): + df, md = get_latest_daily( + monitoring_location_id=["USGS-05427718", "USGS-05427719"], + parameter_code=["00060", "00065"] + ) + assert "latest_daily_id" in df.columns + assert df.shape[1] == 12 + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') + def test_get_field_measurements(): df, md = get_field_measurements( monitoring_location_id="USGS-05427718", From ada9a41f9bd56010fb3d49254d101ecd5b90197a Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 24 Nov 2025 13:37:07 -0600 Subject: [PATCH 45/56] correct example documentation and add info about logging --- README.md | 19 +++++++++++++++++- dataretrieval/waterdata/api.py | 36 ++++++++++++++-------------------- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index f7a8664d..198edd90 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ ## Latest Announcements -:mega: **10/01/2025:** `dataretrieval` now features the new `waterdata` module, +:mega: **11/24/2025:** `dataretrieval` now features the new `waterdata` module, which provides access to USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, instantaneous values, field measurements, time series metadata, @@ -65,6 +65,23 @@ locations, metadata = waterdata.get_monitoring_locations( print(f"Found {len(locations)} stream monitoring locations in Maryland") ``` +This new module implements +[logging](https://docs.python.org/3/howto/logging.html#logging-basic-tutorial) +in which users can view the URL requests sent to the USGS Water Data APIs +and the number of requests they have remaining each hour. These messages can +be helpful for troubleshooting and support. To enable logging in your python +console or notebook: + +```python +import logging +logging.basicConfig(level=logging.INFO) +``` +To log messages to a file, you can specify a filename in the +`basicConfig` call: + +```python +logging.basicConfig(filename='waterdata.log', level=logging.INFO) +``` ### NWIS Legacy Services (Deprecated but still functional) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 3b8cd433..7e17f254 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -188,8 +188,7 @@ def get_daily( ... time="2021-01-01T00:00:00Z/2022-01-01T00:00:00Z", ... 
) - >>> # Get monitoring location info for specific sites - >>> # and only specific properties + >>> # Get approved daily flow data from multiple sites >>> df, md = dataretrieval.waterdata.get_daily( ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], ... approval_status = "Approved", @@ -678,19 +677,18 @@ def get_time_series_metadata( -------- .. code:: - >>> # Get daily flow data from a single site + >>> # Get timeseries metadata information from a single site >>> # over a yearlong period >>> df, md = dataretrieval.waterdata.get_time_series_metadata( - ... monitoring_location_id="USGS-02238500", - ... parameter_code="00060", - ... time="2021-01-01T00:00:00Z/2022-01-01T00:00:00Z", + ... monitoring_location_id="USGS-02238500" ... ) - >>> # Get monitoring location info for specific sites - >>> # and only specific properties + >>> # Get timeseries metadata information from multiple sites + >>> # that begin after January 1, 1990. >>> df, md = dataretrieval.waterdata.get_time_series_metadata( ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], - ... time = "2024-01-01/.." + ... begin = "1990-01-01/.." + ... ) """ service = "time-series-metadata" output_id = "time_series_id" @@ -856,14 +854,12 @@ def get_latest_continuous( -------- .. code:: - >>> # Get daily flow data from a single site - >>> # over a yearlong period + >>> # Get latest flow data from a single site >>> df, md = dataretrieval.waterdata.get_latest_continuous( ... monitoring_location_id="USGS-02238500", parameter_code="00060" ... ) - >>> # Get monitoring location info for specific sites - >>> # and only specific properties + >>> # Get latest continuous measurements for multiple sites >>> df, md = dataretrieval.waterdata.get_latest_continuous( ... monitoring_location_id=["USGS-05114000", "USGS-09423350"] ... ) @@ -1034,14 +1030,12 @@ def get_latest_daily( -------- .. code:: - >>> # Get daily flow data from a single site - >>> # over a yearlong period + >>> # Get most recent daily flow data from a single site >>> df, md = dataretrieval.waterdata.get_latest_daily( ... monitoring_location_id="USGS-02238500", parameter_code="00060" ... ) - >>> # Get monitoring location info for specific sites - >>> # and only specific properties + >>> # Get most recent daily measurements for two sites >>> df, md = dataretrieval.waterdata.get_latest_daily( ... monitoring_location_id=["USGS-05114000", "USGS-09423350"] ... ) @@ -1201,16 +1195,16 @@ def get_field_measurements( -------- .. code:: - >>> # Get daily flow data from a single site - >>> # over a yearlong period + >>> # Get field measurements from a single groundwater site + >>> # and parameter code, and do not return geometry >>> df, md = dataretrieval.waterdata.get_field_measurements( ... monitoring_location_id="USGS-375907091432201", ... parameter_code="72019", ... skip_geometry=True, ... ) - >>> # Get monitoring location info for specific sites - >>> # and only specific properties + >>> # Get field measurements from multiple sites and + >>> # parameter codes from the last 20 years >>> df, md = dataretrieval.waterdata.get_field_measurements( ... 
monitoring_location_id = ["USGS-451605097071701", "USGS-263819081585801"], From 4f734846c1717aa506ad6e137afb199abd39e307 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 24 Nov 2025 13:55:00 -0600 Subject: [PATCH 46/56] correct date, add nldi as module to init.py --- NEWS.md | 2 +- dataretrieval/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index a071d491..2efdc76c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -**10/01/2025:** `dataretrieval` is pleased to offer a new module, `waterdata`, which gives users access USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, instantaneous values, field measurements (modernized groundwater levels service), time series metadata, and discrete water quality data from the Samples database. Though there will be a period of overlap, the functions within `waterdata` will eventually replace the `nwis` module, which currently provides access to the legacy [NWIS Water Services](https://waterservices.usgs.gov/). More example workflows and functions coming soon. Check `help(waterdata)` for more information. +**11/24/2025:** `dataretrieval` is pleased to offer a new module, `waterdata`, which gives users access USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, instantaneous values, field measurements (modernized groundwater levels service), time series metadata, and discrete water quality data from the Samples database. Though there will be a period of overlap, the functions within `waterdata` will eventually replace the `nwis` module, which currently provides access to the legacy [NWIS Water Services](https://waterservices.usgs.gov/). More example workflows and functions coming soon. Check `help(waterdata)` for more information. **09/03/2024:** The groundwater levels service has switched endpoints, and `dataretrieval` was updated accordingly in [`v1.0.10`](https://github.com/DOI-USGS/dataretrieval-python/releases/tag/v1.0.10). Older versions using the discontinued endpoint will return 503 errors for `nwis.get_gwlevels` or the `service='gwlevels'` argument. Visit [Water Data For the Nation](https://waterdata.usgs.gov/blog/wdfn-waterservices-2024/) for more information. diff --git a/dataretrieval/__init__.py b/dataretrieval/__init__.py index 07374f47..501e16bf 100644 --- a/dataretrieval/__init__.py +++ b/dataretrieval/__init__.py @@ -1,6 +1,7 @@ from importlib.metadata import PackageNotFoundError, version from dataretrieval.nadp import * +from dataretrieval.nldi import * from dataretrieval.nwis import * from dataretrieval.samples import * from dataretrieval.streamstats import * From da71b907da2901fb531fbd426f30d7e1880c10e3 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Tue, 25 Nov 2025 10:17:28 -0600 Subject: [PATCH 47/56] make error messages louder, clearer --- dataretrieval/waterdata/utils.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index a85124f8..9ce030ad 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -1,5 +1,6 @@ import json import logging +import warnings import os import re from datetime import datetime @@ -309,10 +310,11 @@ def _error_body(resp: requests.Response): status codes, returns the raw response text. 
""" if resp.status_code == 429: - return resp.json().get("error", {}).get("message") + return "429: Too many requests made. Please obtain an API token or try again later." elif resp.status_code == 403: - return "Query request denied. Possible reasons include query exceeding server limits." - return resp.text + return "403: Query request denied. Possible reasons include query exceeding server limits." + return f"{resp.status_code}: {resp.json().get('code', 'Unknown type')}. \ + {resp.json().get('description', "No description provided")}." def _construct_api_requests( @@ -574,12 +576,15 @@ def _walk_pages( data=content if method == "POST" else None, ) if resp.status_code != 200: - raise Exception(_error_body(resp)) + error_text = _error_body(resp) + raise Exception(error_text) df1 = _get_resp_data(resp, geopd=geopd) dfs = pd.concat([dfs, df1], ignore_index=True) curr_url = _next_req_url(resp) except Exception: - logger.info("Request failed for URL: %s. Stopping pagination and data download.", curr_url) + warnings.warn(f"{error_text}. Data request incomplete.") + logger.error("Request incomplete. %s", error_text) + logger.warning("Request failed for URL: %s. Data download interrupted.", curr_url) curr_url = None return dfs, initial_response finally: From e614d83d5b5375839afcfa56badce9c6454524f3 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Tue, 25 Nov 2025 10:17:59 -0600 Subject: [PATCH 48/56] re-arrange README a little --- README.md | 61 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 198edd90..5d8ee798 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,29 @@ U.S. Geological Survey (USGS) hydrology data types available on the Web, as well as data from the Water Quality Portal (WQP) and Network Linked Data Index (NLDI). +## Installation + +Install dataretrieval using pip: + +```bash +pip install dataretrieval +``` + +Or using conda: + +```bash +conda install -c conda-forge dataretrieval +``` + ## Usage Examples ### Water Data API (Recommended - Modern USGS Data) -The `waterdata` module provides access to modern USGS Water Data APIs: +The `waterdata` module provides access to modern USGS Water Data APIs. + +The example below retrieves daily streamflow data for a specific monitoring +location for water year 2025, where a "/" between two dates in the "time" +input argument indicates a desired date range: ```python import dataretrieval.waterdata as waterdata @@ -48,13 +66,27 @@ import dataretrieval.waterdata as waterdata df, metadata = waterdata.get_daily( monitoring_location_id='USGS-01646500', parameter_code='00060', # Discharge - time='2024-10-01/2024-10-02' + time='2024-10-01/2025-09-30' ) print(f"Retrieved {len(df)} records") print(f"Site: {df['monitoring_location_id'].iloc[0]}") print(f"Mean discharge: {df['value'].mean():.2f} {df['unit_of_measure'].iloc[0]}") ``` +Fetch daily discharge data for multiple sites from a start date to present +using the following code: + +```python +df, metadata = waterdata.get_daily( + monitoring_location_id=["USGS-13018750","USGS-13013650"], + parameter_code='00060', + time='2024-10-01/..' 
+) + +print(f"Retrieved {len(df)} records") +``` +The following example downloads location information for all monitoring +locations that are categorized as stream sites in the state of Maryland: ```python # Get monitoring location information @@ -65,7 +97,11 @@ locations, metadata = waterdata.get_monitoring_locations( print(f"Found {len(locations)} stream monitoring locations in Maryland") ``` -This new module implements +Visit the +[API Reference](https://doi-usgs.github.io/dataretrieval-python/reference/waterdata.html) +for more information and examples on available services and input parameters. + +**NEW:** This new module implements [logging](https://docs.python.org/3/howto/logging.html#logging-basic-tutorial) in which users can view the URL requests sent to the USGS Water Data APIs and the number of requests they have remaining each hour. These messages can @@ -160,10 +196,13 @@ print(f"Found {len(flowlines)} upstream tributaries within 50km") ### Modern USGS Water Data APIs (Recommended) - **Daily values**: Daily statistical summaries (mean, min, max) -- **Instantaneous values**: High-frequency continuous data - **Field measurements**: Discrete measurements from field visits - **Monitoring locations**: Site information and metadata - **Time series metadata**: Information about available data parameters +- **Latest daily values**: Most recent daily statistical summary data +- **Latest instantaneous values**: Most recent high-frequency continuous data +- **Samples data**: Discrete USGS water quality data +- **Instantaneous values (:alarm_clock: COMING SOON)**: High-frequency continuous data ### Legacy NWIS Services (Deprecated) - **Daily values (dv)**: Legacy daily statistical data @@ -185,20 +224,6 @@ print(f"Found {len(flowlines)} upstream tributaries within 50km") - **Feature discovery**: Find monitoring sites, dams, and other features - **Hydrologic connectivity**: Link data across the stream network -## Installation - -Install dataretrieval using pip: - -```bash -pip install dataretrieval -``` - -Or using conda: - -```bash -conda install -c conda-forge dataretrieval -``` - ## More Examples Explore additional examples in the From 535b30ff4a0cc0434aecd6d305c9d44cb13cd2e0 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Tue, 25 Nov 2025 10:28:21 -0600 Subject: [PATCH 49/56] try to fix ubuntu flake8 error --- dataretrieval/waterdata/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 9ce030ad..af6a9f6b 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -169,7 +169,7 @@ def _format_api_dates( datetime_input = [datetime_input] # Check for null or all NA and return None - if all(pd.isna(dt) or dt == "" or dt == None for dt in datetime_input): + if all(pd.isna(dt) or dt == "" or dt is None for dt in datetime_input): return None if len(datetime_input) <= 2: @@ -313,8 +313,7 @@ def _error_body(resp: requests.Response): return "429: Too many requests made. Please obtain an API token or try again later." elif resp.status_code == 403: return "403: Query request denied. Possible reasons include query exceeding server limits." - return f"{resp.status_code}: {resp.json().get('code', 'Unknown type')}. \ - {resp.json().get('description', "No description provided")}." + return f"{resp.status_code}: {resp.json().get('code', 'Unknown type')}. {resp.json().get('description', "No description provided")}." 
def _construct_api_requests( From 1c5657363855f8abcade8da237df5af1170a4de3 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Tue, 25 Nov 2025 10:28:43 -0600 Subject: [PATCH 50/56] Adjust readme styling --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5d8ee798..2686b448 100644 --- a/README.md +++ b/README.md @@ -202,7 +202,7 @@ print(f"Found {len(flowlines)} upstream tributaries within 50km") - **Latest daily values**: Most recent daily statistical summary data - **Latest instantaneous values**: Most recent high-frequency continuous data - **Samples data**: Discrete USGS water quality data -- **Instantaneous values (:alarm_clock: COMING SOON)**: High-frequency continuous data +- **Instantaneous values** (:alarm_clock: *COMING SOON*): High-frequency continuous data ### Legacy NWIS Services (Deprecated) - **Daily values (dv)**: Legacy daily statistical data From 9e260969adf359bacf5ca71054a19ef917b8ea60 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Tue, 25 Nov 2025 10:47:05 -0600 Subject: [PATCH 51/56] will this appease flake8 --- README.md | 2 +- dataretrieval/waterdata/utils.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2686b448..3821c478 100644 --- a/README.md +++ b/README.md @@ -202,7 +202,7 @@ print(f"Found {len(flowlines)} upstream tributaries within 50km") - **Latest daily values**: Most recent daily statistical summary data - **Latest instantaneous values**: Most recent high-frequency continuous data - **Samples data**: Discrete USGS water quality data -- **Instantaneous values** (:alarm_clock: *COMING SOON*): High-frequency continuous data +- **Instantaneous values** (*COMING SOON*): High-frequency continuous data ### Legacy NWIS Services (Deprecated) - **Daily values (dv)**: Legacy daily statistical data diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index af6a9f6b..06154856 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -309,11 +309,16 @@ def _error_body(resp: requests.Response): predefined message indicating possible reasons for denial. For other status codes, returns the raw response text. """ - if resp.status_code == 429: + status = resp.status_code + if status == 429: return "429: Too many requests made. Please obtain an API token or try again later." - elif resp.status_code == 403: + elif status == 403: return "403: Query request denied. Possible reasons include query exceeding server limits." - return f"{resp.status_code}: {resp.json().get('code', 'Unknown type')}. {resp.json().get('description', "No description provided")}." + j_txt = resp.json() + return ( + f"{status}: {j_txt.get('code', 'Unknown type')}. " + f"{j_txt.get('description', 'No description provided')}." 
From c4a0591c12e81fbc472edd8b3a58eba1c2999962 Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Tue, 25 Nov 2025 11:29:08 -0600
Subject: [PATCH 52/56] move versioning to above imports

---
 dataretrieval/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dataretrieval/__init__.py b/dataretrieval/__init__.py
index 501e16bf..5c35a3f2 100644
--- a/dataretrieval/__init__.py
+++ b/dataretrieval/__init__.py
@@ -1,5 +1,10 @@
 from importlib.metadata import PackageNotFoundError, version

+try:
+    __version__ = version("dataretrieval")
+except PackageNotFoundError:
+    __version__ = "version-unknown"
+
 from dataretrieval.nadp import *
 from dataretrieval.nldi import *
 from dataretrieval.nwis import *
@@ -9,8 +14,3 @@
 from dataretrieval.waterdata import *
 from dataretrieval.waterwatch import *
 from dataretrieval.wqp import *
-
-try:
-    __version__ = version("dataretrieval")
-except PackageNotFoundError:
-    __version__ = "version-unknown"

From ed8fa23b9605211467c433112da5d3db4a7fc90b Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Tue, 25 Nov 2025 12:00:54 -0600
Subject: [PATCH 53/56] add actual version to user agent

---
 dataretrieval/waterdata/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 06154856..68ae9e13 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -11,6 +11,7 @@
 from zoneinfo import ZoneInfo

 from dataretrieval.utils import BaseMetadata
+from dataretrieval import __version__

 try:
     import geopandas as gpd
@@ -251,7 +252,7 @@ def _default_headers():
     headers = {
         "Accept-Encoding": "compress, gzip",
         "Accept": "application/json",
-        "User-Agent": "python-dataretrieval/1.0",
+        "User-Agent": f"python-dataretrieval/{__version__}",
         "lang": "en-US",
     }
     token = os.getenv("API_USGS_PAT")

From cb2976e71b2fb212c2ca02831c7eaad9e31c2ff6 Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Tue, 25 Nov 2025 12:05:14 -0600
Subject: [PATCH 54/56] update waterdata test to skip on python 3.9 and older

---
 tests/waterdata_test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 90254f46..816bc112 100755
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -1,8 +1,11 @@
 import datetime
-
+import sys
 import pytest
 from pandas import DataFrame

+if sys.version_info < (3, 10):
+    pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True)
+
 from dataretrieval.waterdata import (
     _check_profiles,
     get_samples,

From 7927f1fdf6d4298b578f2ff03f5f0d7cae71a30b Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Tue, 25 Nov 2025 12:15:10 -0600
Subject: [PATCH 55/56] try new import to avoid errors

---
 tests/nadp_test.py  | 2 +-
 tests/utils_test.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/nadp_test.py b/tests/nadp_test.py
index 123e9e04..5d71b516 100644
--- a/tests/nadp_test.py
+++ b/tests/nadp_test.py
@@ -2,7 +2,7 @@

 import os

-import dataretrieval.nadp as nadp
+from dataretrieval import nadp


 class TestMDNmap:
diff --git a/tests/utils_test.py b/tests/utils_test.py
index a99f91e7..711e5886 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -4,8 +4,10 @@

 import pytest

-import dataretrieval.nwis as nwis
-from dataretrieval import utils
+from dataretrieval import (
+    utils,
+    nwis
+)


 class Test_query:
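
Patches 52 and 53 work as a pair: hoisting the `__version__` assignment above the submodule imports in `__init__.py` means the name is already bound when `dataretrieval.waterdata.utils` imports it, so `_default_headers()` can report the real installed version instead of the hard-coded `1.0`. A small sketch of the same lookup and the header it feeds (printed output will vary by installation):

```python
from importlib.metadata import PackageNotFoundError, version

# Same resolution dataretrieval/__init__.py performs, including the
# fallback for source checkouts without distribution metadata.
try:
    pkg_version = version("dataretrieval")
except PackageNotFoundError:
    pkg_version = "version-unknown"

# _default_headers() embeds the result in every API request:
headers = {
    "Accept-Encoding": "compress, gzip",
    "Accept": "application/json",
    "User-Agent": f"python-dataretrieval/{pkg_version}",
    "lang": "en-US",
}
print(headers["User-Agent"])
```
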
From c5754471ce1f5cca3a93dfc291e5a7cc002b572f Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Tue, 25 Nov 2025 12:25:43 -0600
Subject: [PATCH 56/56] remove ubuntu 3.8 from github actions

---
 .github/workflows/python-package.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 4563b449..fc467976 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -17,6 +17,10 @@ jobs:
       matrix:
         os: [ubuntu-latest, windows-latest]
         python-version: [3.8, 3.9, '3.10', 3.11, 3.12]
+      exclude:
+        - os: ubuntu-latest
+          python-version: 3.8
+
     steps:
     - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
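
Patch 56's matrix `exclude` and patch 54's module-level skip are complementary: the CI exclude removes the ubuntu/3.8 job entirely, while the in-module guard keeps the rest of the suite collectible on any older interpreter that still runs the tests. The same pattern generalizes to other version-gated modules; a hypothetical standalone test file (the file name and test body are placeholders):

```python
# tests/example_version_gated_test.py -- illustrative only; mirrors the
# guard patch 54 adds to tests/waterdata_test.py.
import sys

import pytest

# allow_module_level=True lets pytest.skip() abort collection of the whole
# file rather than raising an error inside a test function.
if sys.version_info < (3, 10):
    pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True)

from dataretrieval.waterdata import get_samples  # noqa: E402


def test_get_samples_is_callable():
    assert callable(get_samples)
```
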