diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index ad268194..eb26304e 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -29,6 +29,7 @@ SAMPLES_URL, _check_profiles, _default_headers, + _drop_hash_columns, _get_args, get_ogc_data, get_stats_data, @@ -57,6 +58,7 @@ def get_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -189,6 +191,19 @@ def get_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -257,6 +272,7 @@ def get_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Continuous data provide instantanous water conditions. @@ -384,6 +400,19 @@ def get_continuous( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -477,6 +506,7 @@ def get_monitoring_locations( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location including the name, identifier, agency responsible for data collection, and @@ -692,6 +722,19 @@ def get_monitoring_locations( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -755,6 +798,7 @@ def get_time_series_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, which represent a collection of observations of a single parameter, @@ -915,6 +959,19 @@ def get_time_series_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1012,6 +1069,7 @@ def get_combined_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get combined monitoring-location and time-series metadata. @@ -1112,6 +1170,19 @@ def get_combined_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1200,6 +1271,7 @@ def get_latest_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series of continuous data. Continuous data are collected via automated sensors @@ -1329,6 +1401,19 @@ def get_latest_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1395,6 +1480,7 @@ def get_latest_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -1526,6 +1612,19 @@ def get_latest_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1593,6 +1692,7 @@ def get_field_measurements( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a visit to the monitoring location. Field measurements consist of measurements @@ -1714,6 +1814,19 @@ def get_field_measurements( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1777,6 +1890,7 @@ def get_field_measurements_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get field-measurement metadata: one row per (location, parameter) series. @@ -1832,6 +1946,19 @@ def get_field_measurements_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1898,6 +2025,7 @@ def get_peaks( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get the annual peak streamflow / stage record for a monitoring location. @@ -1956,6 +2084,19 @@ def get_peaks( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -2133,6 +2274,7 @@ def get_samples( pointLocationWithinMiles: float | None = None, projectIdentifier: str | Iterable[str] | None = None, recordIdentifierUserSupplied: str | Iterable[str] | None = None, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Search Samples database for USGS water quality data. This is a wrapper function for the Samples database API. All potential @@ -2263,6 +2405,14 @@ def get_samples( recordIdentifierUserSupplied : string or iterable of strings, optional Internal AQS record identifier that returns 1 entry. Only available for the "results" service. + include_hash_ids : boolean, optional + If False (default), the per-activity UUID + (``Activity_ActivityIdentifier``) and per-result UUID + (``Result_MeasureIdentifier``) are dropped from the returned + DataFrame. Stable identifiers (``Org_Identifier``, + ``Location_Identifier``, ``Project_Identifier``, + ``USGSpcode``, …) are kept. Set to True to restore the legacy + behavior of including every column. Returns ------- @@ -2312,7 +2462,7 @@ def get_samples( _check_profiles(service, profile) # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"ssl_check", "profile"}) + params = _get_args(locals(), exclude={"ssl_check", "profile", "include_hash_ids"}) params.update({"mimeType": "text/csv"}) @@ -2333,6 +2483,7 @@ def get_samples( df = pd.read_csv(StringIO(response.text), delimiter=",") df = _attach_datetime_columns(df) + df = _drop_hash_columns(df, include_hash_ids) return df, BaseMetadata(response) @@ -2423,6 +2574,7 @@ def get_stats_por( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get day-of-year and month-of-year water data statistics from the USGS Water Data API. @@ -2501,6 +2653,13 @@ def get_stats_por( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. + include_hash_ids : boolean, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. Stable identifiers + (``monitoring_location_id``, ``parameter_code``, the time keys) + are kept. Set to True to restore the legacy behavior of + including every column. Examples -------- @@ -2525,10 +2684,13 @@ def get_stats_por( ... ) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles"}) + params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"}) return get_stats_data( - args=params, service="observationNormals", expand_percentiles=expand_percentiles + args=params, + service="observationNormals", + expand_percentiles=expand_percentiles, + include_hash_ids=include_hash_ids, ) @@ -2547,6 +2709,7 @@ def get_stats_date_range( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get monthly and annual water data statistics from the USGS Water Data API. This service (called the "observationIntervals" endpoint on api.waterdata.usgs.gov) @@ -2629,6 +2792,13 @@ def get_stats_date_range( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. + include_hash_ids : boolean, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. Stable identifiers + (``monitoring_location_id``, ``parameter_code``, the time keys) + are kept. Set to True to restore the legacy behavior of + including every column. Examples -------- @@ -2654,12 +2824,13 @@ def get_stats_date_range( ... ) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles"}) + params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"}) return get_stats_data( args=params, service="observationIntervals", expand_percentiles=expand_percentiles, + include_hash_ids=include_hash_ids, ) @@ -2695,6 +2866,7 @@ def get_channel( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Channel measurements taken as part of streamflow field measurements. @@ -2808,6 +2980,19 @@ def get_channel( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 91228357..e020d85c 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -157,6 +157,134 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # parameters and require POST with CQL2 JSON instead. _CQL2_REQUIRED_SERVICES = frozenset({"monitoring-locations"}) +# Column names whose values are server-generated hashes (UUIDs or hex +# digests). These are unstable across record refreshes — joining or +# diffing on them produces spurious churn — and they bloat the payload +# of large queries. Dropped by default; opt in with +# ``include_hash_ids=True``. Includes both: +# - The per-record version UUIDs that are aliased to a service's +# ``output_id`` (``daily_id``, ``continuous_id``, …). These get +# mapped to/from ``"id"`` on the wire; both names are listed so the +# filter works on either side of ``_switch_properties_id``. +# - Secondary hash columns embedded in record payloads +# (``time_series_id``, ``field_visit_id``, ``parent_time_series_id``, +# ``field_measurements_series_id``). +# ``monitoring_location_id`` (AGENCY-ID format, e.g. ``USGS-01646500``) +# and other code columns (``parameter_code``, ``statistic_id``, …) are +# intentionally absent — they're stable, human-meaningful identifiers. +_HASH_ID_COLUMNS = frozenset( + { + "daily_id", + "continuous_id", + "latest_continuous_id", + "latest_daily_id", + "field_measurement_id", + "field_series_id", + "peak_id", + "channel_measurements_id", + "combined_meta_id", + "time_series_id", + "parent_time_series_id", + "field_visit_id", + "field_measurements_series_id", + # ``get_stats_*`` (statistics service) output — per-computation + # UUID; ``parent_time_series_id`` is already listed above. + "computation_id", + # ``get_samples`` (Samples database CSV) — per-activity and + # per-result UUIDs. The Samples service uses CamelCase column + # names rather than snake_case, but the drop logic only needs + # exact name matches so they share this set. + "Activity_ActivityIdentifier", + "Result_MeasureIdentifier", + } +) + +# Cache of per-service queryables column lists, populated on first call +# from each service when computing the default ``properties=`` for +# ``include_hash_ids=False``. Keyed by service name; value is the full +# list of property names the server exposes for that collection. +_queryables_cache: dict[str, list[str]] = {} +# Cache of the derived non-hash property whitelist, keyed by +# ``(service, output_id)``. Both inputs determine the result, and +# both are stable per call site — re-deriving on every OGC request +# would do ~30–100 frozenset lookups per call for no reason. +_default_props_cache: dict[tuple[str, str], list[str]] = {} + + +def _service_queryables(service: str) -> list[str]: + """Return the cached queryables property list for ``service``. + + One HTTP GET per service per process; the list is reused for every + subsequent call. Raises ``requests.HTTPError`` on a non-200 — the + caller's ``include_hash_ids=False`` request can't be satisfied + without it, so failing loudly is preferable to silently dropping + the server-side trim. + """ + cached = _queryables_cache.get(service) + if cached is not None: + return cached + body = _check_ogc_requests(endpoint=service, req_type="queryables") + props = list(body.get("properties", {}).keys()) + _queryables_cache[service] = props + return props + + +def _default_non_hash_properties(service: str, output_id: str) -> list[str]: + """Build the ``properties=`` whitelist sent to the server when the + caller didn't supply one and ``include_hash_ids=False``. + + The whitelist is the service's queryables minus :data:`_HASH_ID_COLUMNS`, + minus ``"geometry"`` (the OGC server returns geometry via the feature + envelope, not as a property — some collections reject it as a + property name), and minus the wire-format ``"id"`` column when the + service's ``output_id`` is itself a hash column (e.g. ``daily_id``). + For ``monitoring-locations``, ``id`` becomes the AGENCY-ID + ``monitoring_location_id``, so it's kept. + """ + key = (service, output_id) + cached = _default_props_cache.get(key) + if cached is not None: + return cached + drop_wire_id = output_id in _HASH_ID_COLUMNS + props = [ + p + for p in _service_queryables(service) + if p not in _HASH_ID_COLUMNS + and p != "geometry" + and not (drop_wire_id and p == "id") + ] + _default_props_cache[key] = props + return props + + +def _properties_unspecified(properties) -> bool: + """True when the caller didn't pin a ``properties`` list. + + A ``None``, empty list, or list-of-only-NaN counts as unspecified. + Centralizes the predicate so the (subtly different) ``not properties`` + vs ``properties is None`` variants across call sites stay aligned. + """ + return not properties or all(pd.isna(properties)) + + +def _drop_hash_columns( + df: pd.DataFrame, + include_hash_ids: bool, + keep: set[str] | None = None, +) -> pd.DataFrame: + """Drop hash-valued ID columns from ``df`` when not opting in. + + When ``include_hash_ids`` is True, returns ``df`` unchanged. Otherwise + drops every column whose name is in :data:`_HASH_ID_COLUMNS`, except + those the caller listed in ``keep`` (e.g. names appearing in an + explicit user ``properties=`` request — explicit beats default). + A no-op when no hash columns are present. + """ + if include_hash_ids: + return df + drop = (set(df.columns) & _HASH_ID_COLUMNS) - (keep or set()) + return df.drop(columns=drop) if drop else df + def _parse_datetime(value: str) -> datetime | None: """Parse a single datetime string against the supported formats. @@ -738,7 +866,10 @@ def _deal_with_empty( def _arrange_cols( - df: pd.DataFrame, properties: list[str] | None, output_id: str + df: pd.DataFrame, + properties: list[str] | None, + output_id: str, + include_hash_ids: bool = False, ) -> pd.DataFrame: """ Rearranges and renames columns in a DataFrame based on provided @@ -753,6 +884,13 @@ def _arrange_cols( only NaN, the function renames 'id' to output_id. output_id : str The name to which the 'id' column should be renamed if applicable. + include_hash_ids : bool, optional + If False (default), hash-valued ID columns (see + :data:`_HASH_ID_COLUMNS`) are dropped from the result unless the + caller explicitly named them in ``properties``. If True, the + legacy behavior is preserved: hash columns are kept and the + per-record output_id columns are moved to the end of the + DataFrame when ``properties`` is unspecified. Returns ------- @@ -764,7 +902,9 @@ def _arrange_cols( # Rename id column to output_id df = df.rename(columns={"id": output_id}) - if properties and not all(pd.isna(properties)): + user_specified = not _properties_unspecified(properties) + + if user_specified: # Don't alias the caller's list — we mutate below. local_properties = list(properties) if "geometry" in df.columns and "geometry" not in local_properties: @@ -775,22 +915,32 @@ def _arrange_cols( local_properties[local_properties.index("id")] = output_id df = df.loc[:, [col for col in local_properties if col in df.columns]] - # Move meaningless-to-user, extra id columns to the end - # of the dataframe, if they exist - extra_id_col = set(df.columns).intersection( - { - "latest_continuous_id", - "latest_daily_id", - "daily_id", - "continuous_id", - "field_measurement_id", - } - ) + # Client-side safety net for the server-side trim done in + # ``get_ogc_data``: no-op on the happy path (server already omitted + # hash columns), drops them here when the queryables fetch failed + # and we fell back to a full payload. An explicit caller + # ``properties`` list — including ``"id"``, which resolved to + # ``output_id`` above — wins over the default. + keep: set[str] = set() + if user_specified: + keep = set(properties) + if "id" in keep: + keep.add(output_id) + df = _drop_hash_columns(df, include_hash_ids, keep=keep) + + # Legacy ordering: when ``include_hash_ids=True`` and ``properties`` + # is unspecified, move the per-record version IDs to the end so they + # don't crowd the front. With ``include_hash_ids=False`` those + # columns are gone above, so this branch is a no-op. + extra_id_col = set(df.columns) & { + "latest_continuous_id", + "latest_daily_id", + "daily_id", + "continuous_id", + "field_measurement_id", + } - # If the arbitrary id column is returned (either due to properties - # being none or NaN), then move it to the end of the dataframe, but - # if part of properties, keep in requested order - if extra_id_col and (properties is None or all(pd.isna(properties))): + if extra_id_col and _properties_unspecified(properties): id_col_order = [col for col in df.columns if col not in extra_id_col] + list( extra_id_col ) @@ -907,17 +1057,39 @@ def get_ogc_data( # Capture `properties` before the id-switch so post-processing sees # the user-facing names, not the wire-format ones. properties = args.get("properties") - args["properties"] = _switch_properties_id( - properties, id_name=output_id, service=service - ) convert_type = args.pop("convert_type", False) + include_hash_ids = args.pop("include_hash_ids", False) + + # When the caller didn't pin ``properties`` and isn't opting into + # hash IDs, try a server-side whitelist of the non-hash columns so + # the server skips serializing UUID/hex fields. On any queryables + # failure, fall through to the full payload — ``_arrange_cols`` + # post-processes the drop as a safety net. + use_server_trim = not include_hash_ids and _properties_unspecified(properties) + if use_server_trim: + try: + args["properties"] = _default_non_hash_properties(service, output_id) + except (requests.HTTPError, requests.RequestException, ValueError) as exc: + logger.warning( + "Could not fetch queryables for %s (%s); " + "falling back to client-side hash-ID drop.", + service, + exc, + ) + use_server_trim = False + if not use_server_trim: + args["properties"] = _switch_properties_id( + properties, id_name=output_id, service=service + ) args = {k: v for k, v in args.items() if v is not None} return_list, response = _fetch_once(args) return_list = _deal_with_empty(return_list, properties, service) if convert_type: return_list = _type_cols(return_list) - return_list = _arrange_cols(return_list, properties, output_id) + return_list = _arrange_cols( + return_list, properties, output_id, include_hash_ids=include_hash_ids + ) return_list = _sort_rows(return_list) return return_list, BaseMetadata(response) @@ -1073,6 +1245,7 @@ def get_stats_data( service: str, expand_percentiles: bool, client: requests.Session | None = None, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Retrieves statistical data from a specified endpoint and returns it @@ -1094,6 +1267,13 @@ def get_stats_data( each percentile gets its own row in the returned dataframe. If True and user requests a computation_type other than percentiles, a percentile column is still returned. + include_hash_ids : bool, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. These IDs are not + stable across record refreshes; ``computation_id`` is used as a + join key internally during percentile expansion and only + removed after that step completes. Returns ------- @@ -1170,6 +1350,10 @@ def get_stats_data( if expand_percentiles: dfs = _expand_percentiles(dfs) + # Drop hash IDs after ``_expand_percentiles`` — it merges on + # ``computation_id`` while exploding the percentile lists. + dfs = _drop_hash_columns(dfs, include_hash_ids) + return dfs, BaseMetadata(initial_response) finally: if close_client: diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 18e78594..da045f60 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -82,8 +82,14 @@ def test_mock_get_samples(requests_mock): monitoringLocationIdentifier="USGS-05406500", ) assert type(df) is DataFrame - # 181 source columns + 6 derived DateTime columns - assert df.shape == (67, 187) + # 181 source columns + 6 derived DateTime columns − 2 hash IDs + # (Activity_ActivityIdentifier, Result_MeasureIdentifier) dropped by default. + assert df.shape == (67, 185) + assert "Activity_ActivityIdentifier" not in df.columns + assert "Result_MeasureIdentifier" not in df.columns + # Stable identifiers are preserved. + assert "Location_Identifier" in df.columns + assert "Org_Identifier" in df.columns assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} @@ -91,6 +97,29 @@ def test_mock_get_samples(requests_mock): assert df["Activity_StartDateTime"].notna().any() +def test_mock_get_samples_include_hash_ids(requests_mock): + """``include_hash_ids=True`` restores the legacy column set.""" + request_url = ( + "https://api.waterdata.usgs.gov/samples-data/results/fullphyschem?" + "activityMediaName=Water&activityStartDateLower=2020-01-01" + "&activityStartDateUpper=2024-12-31&monitoringLocationIdentifier=USGS-05406500&mimeType=text%2Fcsv" + ) + response_file_path = "tests/data/samples_results.txt" + mock_request(requests_mock, request_url, response_file_path) + df, _md = get_samples( + service="results", + profile="fullphyschem", + activityMediaName="Water", + activityStartDateLower="2020-01-01", + activityStartDateUpper="2024-12-31", + monitoringLocationIdentifier="USGS-05406500", + include_hash_ids=True, + ) + assert df.shape == (67, 187) + assert "Activity_ActivityIdentifier" in df.columns + assert "Result_MeasureIdentifier" in df.columns + + def test_mock_get_samples_summary(requests_mock): """Tests USGS Samples summary query""" request_url = ( @@ -216,10 +245,11 @@ def test_samples_results(): activityStartDateLower="2024-10-01", activityStartDateUpper="2025-04-24", ) - assert all( - col in df.columns - for col in ["Location_Identifier", "Activity_ActivityIdentifier"] - ) + # Stable identifiers are kept; hash IDs (Activity_ActivityIdentifier, + # Result_MeasureIdentifier) are dropped by default. + assert "Location_Identifier" in df.columns + assert "Activity_ActivityIdentifier" not in df.columns + assert "Result_MeasureIdentifier" not in df.columns assert len(df) > 0 @@ -231,7 +261,10 @@ def test_samples_activity(): monitoringLocationIdentifier="USGS-06719505", ) assert len(df) > 0 - assert len(df.columns) == 97 + # 97 → 96 cols after dropping Activity_ActivityIdentifier + # (Result_MeasureIdentifier is not in the ``activities`` profile). + assert len(df.columns) == 96 + assert "Activity_ActivityIdentifier" not in df.columns assert "Location_HUCTwelveDigitCode" in df.columns @@ -277,10 +310,14 @@ def test_get_daily(): parameter_code="00060", time="2025-01-01/..", ) - assert "daily_id" in df.columns + # Default: hash-valued ID columns (daily_id, time_series_id) are + # dropped. Stable identifiers (monitoring_location_id, + # parameter_code, statistic_id, time) are preserved. + assert "daily_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "geometry" in df.columns - assert df.columns[-1] == "daily_id" - assert df.shape[1] == 12 + assert df.shape[1] == 10 assert df.parameter_code.unique().tolist() == ["00060"] assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"] assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all() @@ -290,6 +327,22 @@ def test_get_daily(): assert df["value"].dtype == "float64" +def test_get_daily_include_hash_ids(): + """``include_hash_ids=True`` restores the legacy behavior: the + per-record UUID (``daily_id``) and secondary hashes + (``time_series_id``) are present.""" + df, _ = get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + time="2025-01-01/..", + include_hash_ids=True, + ) + assert "daily_id" in df.columns + assert "time_series_id" in df.columns + assert df.columns[-1] == "daily_id" + assert df.shape[1] == 12 + + def test_get_daily_properties(): df, _ = get_daily( monitoring_location_id="USGS-05427718", @@ -335,7 +388,8 @@ def test_get_daily_no_geometry(): skip_geometry=True, ) assert "geometry" not in df.columns - assert df.shape[1] == 11 + # 10 default cols minus geometry, with hash IDs dropped by default. + assert df.shape[1] == 9 assert isinstance(df, DataFrame) @@ -351,7 +405,11 @@ def test_get_continuous(): df["time"].dtype.name.startswith("datetime64[") and "UTC" in df["time"].dtype.name ) - assert "continuous_id" in df.columns + # Default: continuous_id (UUID) and time_series_id (hex hash) are + # dropped. Set ``include_hash_ids=True`` to keep them. + assert "continuous_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns def test_get_monitoring_locations(): @@ -376,7 +434,10 @@ def test_get_latest_continuous(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"], ) - assert df.columns[-1] == "latest_continuous_id" + # Default: latest_continuous_id (UUID) and time_series_id are dropped. + assert "latest_continuous_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] assert hasattr(md, "url") @@ -391,8 +452,11 @@ def test_get_latest_daily(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"], ) - assert "latest_daily_id" in df.columns - assert df.shape[1] == 12 + # Default: latest_daily_id (UUID) and time_series_id are dropped. + assert "latest_daily_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns + assert df.shape[1] == 10 assert hasattr(md, "url") assert hasattr(md, "query_time") @@ -420,7 +484,12 @@ def test_get_field_measurements(): time="2025-01-01/2025-10-01", skip_geometry=True, ) - assert "field_measurement_id" in df.columns + # Default: field_measurement_id (UUID), field_measurements_series_id + # (UUID), and field_visit_id (UUID) are dropped. + assert "field_measurement_id" not in df.columns + assert "field_measurements_series_id" not in df.columns + assert "field_visit_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "geometry" not in df.columns assert df.unit_of_measure.unique().tolist() == ["ft^3/s"] assert hasattr(md, "url") @@ -478,7 +547,9 @@ def test_get_field_measurements_metadata(): df, md = get_field_measurements_metadata( monitoring_location_id="USGS-02238500", skip_geometry=True ) - assert "field_series_id" in df.columns + # Default: field_series_id (UUID) is dropped. + assert "field_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "begin" in df.columns assert "end" in df.columns assert (df["monitoring_location_id"] == "USGS-02238500").all() @@ -506,7 +577,10 @@ def test_get_field_measurements_metadata_multi_site(): def test_get_peaks(): df, md = get_peaks(monitoring_location_id="USGS-02238500", skip_geometry=True) - assert "peak_id" in df.columns + # Default: peak_id (UUID) and time_series_id are dropped. + assert "peak_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "value" in df.columns assert "water_year" in df.columns assert (df["monitoring_location_id"] == "USGS-02238500").all() @@ -577,13 +651,31 @@ def test_get_stats_por_expanded_false(): computation_type=["minimum", "percentile"], ) assert df.shape[0] == 4 - assert df.shape[1] == 20 # if geopandas installed, 21 columns if not + # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols. + assert df.shape[1] == 18 + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns assert "percentile" not in df.columns assert "percentiles" in df.columns assert type(df["percentiles"][2]) is list assert df.loc[~df["percentiles"].isna(), "value"].isnull().all() +def test_get_stats_por_include_hash_ids(): + """``include_hash_ids=True`` preserves the per-computation UUID + and the upstream time-series hex hash that ``get_stats_*`` used + to return unconditionally.""" + df, _ = get_stats_por( + monitoring_location_id="USGS-12451000", + parameter_code="00060", + start_date="01-01", + end_date="01-01", + include_hash_ids=True, + ) + assert "computation_id" in df.columns + assert "parent_time_series_id" in df.columns + + def test_get_stats_date_range(): df, _ = get_stats_date_range( monitoring_location_id="USGS-12451000", @@ -594,7 +686,10 @@ def test_get_stats_date_range(): ) assert df.shape[0] == 3 - assert df.shape[1] == 20 # if geopandas installed, 21 columns if not + # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols. + assert df.shape[1] == 18 + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns assert "interval_type" in df.columns assert "percentile" in df.columns assert df["interval_type"].isin(["month", "calendar_year", "water_year"]).all() @@ -604,8 +699,12 @@ def test_get_channel(): df, _ = get_channel(monitoring_location_id="USGS-02238500") assert df.shape[0] > 470 - assert df.shape[1] == 27 # if geopandas installed, 21 columns if not - assert "channel_measurements_id" in df.columns + # Default: channel_measurements_id (UUID) and field_visit_id (UUID) + # are dropped. 27 → 25 cols. + assert df.shape[1] == 25 # if geopandas installed, fewer if not + assert "channel_measurements_id" not in df.columns + assert "field_visit_id" not in df.columns + assert "monitoring_location_id" in df.columns class TestCheckMonitoringLocationId: diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index b05587e2..6e8e5777 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -256,6 +256,88 @@ def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): assert any("tok2" in m for m in warnings_), warnings_ +def test_get_stats_data_drops_hash_ids_by_default(monkeypatch): + """``get_stats_data`` drops ``computation_id`` and + ``parent_time_series_id`` from the result by default — the + ``include_hash_ids=False`` counterpart for the stats path.""" + from dataretrieval.waterdata.utils import get_stats_data + + monkeypatch.setattr( + _utils_module, + "_handle_stats_nesting", + mock.MagicMock( + return_value=pd.DataFrame( + { + "monitoring_location_id": ["USGS-1"], + "parameter_code": ["00060"], + "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], + "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + "value": [1.0], + } + ) + ), + ) + + page1 = mock.MagicMock() + page1.status_code = 200 + page1.json.return_value = {"next": None, "features": []} + page1.elapsed = __import__("datetime").timedelta(milliseconds=1) + page1.headers = {} + page1.url = "https://example/stats" + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = page1 + + df, _ = get_stats_data( + args={"monitoring_location_id": "USGS-1"}, + service="observationNormals", + expand_percentiles=False, + client=client, + ) + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns + assert "parameter_code" in df.columns + assert "value" in df.columns + + +def test_get_stats_data_keeps_hash_ids_when_opted_in(monkeypatch): + """``include_hash_ids=True`` preserves the legacy stats columns.""" + from dataretrieval.waterdata.utils import get_stats_data + + monkeypatch.setattr( + _utils_module, + "_handle_stats_nesting", + mock.MagicMock( + return_value=pd.DataFrame( + { + "monitoring_location_id": ["USGS-1"], + "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], + "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + } + ) + ), + ) + + page1 = mock.MagicMock() + page1.status_code = 200 + page1.json.return_value = {"next": None, "features": []} + page1.elapsed = __import__("datetime").timedelta(milliseconds=1) + page1.headers = {} + page1.url = "https://example/stats" + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = page1 + + df, _ = get_stats_data( + args={"monitoring_location_id": "USGS-1"}, + service="observationNormals", + expand_percentiles=False, + client=client, + include_hash_ids=True, + ) + assert "computation_id" in df.columns + assert "parent_time_series_id" in df.columns + + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the @@ -330,6 +412,81 @@ def test_arrange_cols_keeps_geometry_when_present(): assert "geometry" in result.columns +def test_arrange_cols_drops_hash_ids_by_default(): + """Default ``include_hash_ids=False`` drops the per-record UUID + (renamed to ``daily_id``) and secondary hash columns + (``time_series_id``), keeping stable identifiers.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols(df, properties=None, output_id="daily_id") + assert "daily_id" not in result.columns + assert "time_series_id" not in result.columns + assert "monitoring_location_id" in result.columns + assert "value" in result.columns + + +def test_arrange_cols_include_hash_ids_keeps_them(): + """``include_hash_ids=True`` preserves the legacy behavior — hash + columns are kept and the per-record UUID lands at the end of the + column order.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols( + df, properties=None, output_id="daily_id", include_hash_ids=True + ) + assert "daily_id" in result.columns + assert "time_series_id" in result.columns + # Legacy ordering: ``daily_id`` moves to the end. + assert result.columns[-1] == "daily_id" + + +def test_arrange_cols_explicit_properties_keep_hash_ids(): + """A user who lists a hash column in ``properties`` gets it back even + with the default ``include_hash_ids=False`` — explicit beats default.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols( + df, + properties=["daily_id", "time_series_id", "value"], + output_id="daily_id", + ) + assert "daily_id" in result.columns + assert "time_series_id" in result.columns + + +def test_arrange_cols_non_hash_output_id_kept(): + """``monitoring_location_id`` (the output_id for monitoring-locations) + is NOT a hash — the AGENCY-ID format is stable and human-meaningful — + so it must stay even under the default.""" + df = pd.DataFrame( + { + "id": ["USGS-01"], + "agency_code": ["USGS"], + } + ) + result = _arrange_cols(df, properties=None, output_id="monitoring_location_id") + assert "monitoring_location_id" in result.columns + assert result.loc[0, "monitoring_location_id"] == "USGS-01" + + # --- _format_api_dates -------------------------------------------------------