diff --git a/README.md b/README.md index 465fb5f2..aa38ba39 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,12 @@ ## Latest Announcements -:mega: **12/04/2025:** `dataretrieval` now features the new `waterdata` module, +:mega: **01/16/2025:** `dataretrieval` now features the `waterdata` module, which provides access to USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, **instantaneous values**, field measurements, time series metadata, -and discrete water quality data from the Samples database. This new module will -eventually replace the `nwis` module, which provides access to the legacy [NWIS -Water Services](https://waterservices.usgs.gov/). +and discrete water quality data from the [Samples database](https://waterdata.usgs.gov/download-samples/#dataProfile=site). This new module replaces the `nwis` module, which provides access to the legacy [NWIS +Water Services](https://waterservices.usgs.gov/). Take a look at the new [`waterdata` module demo notebook](demos/WaterData_demo.ipynb), which walks through an extended example using a majority of the available `waterdata` functions. Check out the [NEWS](NEWS.md) file for all updates and announcements. diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index 30659580..662bb898 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -11,7 +11,6 @@ # Public API exports from .api import ( - _check_profiles, get_codes, get_continuous, get_daily, @@ -41,7 +40,6 @@ "get_reference_table", "get_samples", "get_time_series_metadata", - "_check_profiles", "CODE_SERVICES", "SERVICES", "PROFILES", diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 59cc5a17..1e5c48f3 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -17,7 +17,6 @@ from dataretrieval.waterdata.types import ( CODE_SERVICES, METADATA_COLLECTIONS, - PROFILE_LOOKUP, PROFILES, SERVICES, ) @@ -25,7 +24,8 @@ SAMPLES_URL, get_ogc_data, _construct_api_requests, - _walk_pages + _walk_pages, + _check_profiles ) # Set up logger for this module @@ -691,9 +691,13 @@ def get_time_series_metadata( parameter_name: Optional[Union[str, List[str]]] = None, properties: Optional[Union[str, List[str]]] = None, statistic_id: Optional[Union[str, List[str]]] = None, + hydrologic_unit_code: Optional[Union[str, List[str]]] = None, + state_name: Optional[Union[str, List[str]]] = None, last_modified: Optional[Union[str, List[str]]] = None, begin: Optional[Union[str, List[str]]] = None, end: Optional[Union[str, List[str]]] = None, + begin_utc: Optional[Union[str, List[str]]] = None, + end_utc: Optional[Union[str, List[str]]] = None, unit_of_measure: Optional[Union[str, List[str]]] = None, computation_period_identifier: Optional[Union[str, List[str]]] = None, computation_identifier: Optional[Union[str, List[str]]] = None, @@ -742,6 +746,17 @@ def get_time_series_metadata( Example codes include 00001 (max), 00002 (min), and 00003 (mean). A complete list of codes and their descriptions can be found at https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. + hydrologic_unit_code : string or list of strings, optional + The United States is divided and sub-divided into successively smaller + hydrologic units which are classified into four levels: regions, + sub-regions, accounting units, and cataloging units. 
The hydrologic + units are arranged within each other, from the smallest (cataloging units) + to the largest (regions). Each hydrologic unit is identified by a unique + hydrologic unit code (HUC) consisting of two to eight digits based on the + four levels of classification in the hydrologic unit system. + state_name : string or list of strings, optional + The name of the state or state equivalent in which the monitoring location + is located. last_modified : string, optional The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate @@ -760,6 +775,14 @@ def get_time_series_metadata( for the last 36 hours begin : string or list of strings, optional + This field contains the same information as "begin_utc", but in the + local time of the monitoring location. It is retained for backwards + compatibility, but will be removed in V1 of these APIs. + end : string or list of strings, optional + This field contains the same information as "end_utc", but in the + local time of the monitoring location. It is retained for backwards + compatibility, but will be removed in V1 of these APIs. + begin_utc : string or list of strings, optional The datetime of the earliest observation in the time series. Together with end, this field represents the period of record of a time series. Note that some time series may have large gaps in their collection @@ -776,7 +799,7 @@ def get_time_series_metadata( * Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z" * Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours - end : string or list of strings, optional + end_utc : string or list of strings, optional The datetime of the most recent observation in the time series. Data returned by this endpoint updates at most once per day, and potentially less frequently than that, and as such there may be more recent observations within a time series @@ -1703,31 +1726,3 @@ def get_samples( return df, BaseMetadata(response) - -def _check_profiles( - service: SERVICES, - profile: PROFILES, -) -> None: - """Check whether a service profile is valid. - - Parameters - ---------- - service : string - One of the service names from the "services" list. - profile : string - One of the profile names from "results_profiles", - "locations_profiles", "activities_profiles", - "projects_profiles" or "organizations_profiles". - """ - valid_services = get_args(SERVICES) - if service not in valid_services: - raise ValueError( - f"Invalid service: '{service}'. Valid options are: {valid_services}." - ) - - valid_profiles = PROFILE_LOOKUP[service] - if profile not in valid_profiles: - raise ValueError( - f"Invalid profile: '{profile}' for service '{service}'. " - f"Valid options are: {valid_profiles}." 
-        )
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 5c6ae106..39e0a357 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -4,7 +4,7 @@
 import os
 import re
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union, get_args
 
 import pandas as pd
 import requests
@@ -13,6 +13,12 @@
 from dataretrieval.utils import BaseMetadata
 from dataretrieval import __version__
 
+from dataretrieval.waterdata.types import (
+    PROFILE_LOOKUP,
+    PROFILES,
+    SERVICES,
+)
+
 try:
     import geopandas as gpd
 
@@ -498,6 +504,8 @@ def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame:
         )
         df.columns = [col.replace("properties_", "") for col in df.columns]
         df.rename(columns={"geometry_coordinates": "geometry"}, inplace=True)
+        # Drop duplicate column names, keeping the first occurrence (collisions
+        # can occur, e.g., after the "properties_" prefix is stripped above)
+        df = df.loc[:, ~df.columns.duplicated()]
         return df
 
     # Organize json into geodataframe and make sure id column comes along.
@@ -824,3 +831,31 @@ def get_ogc_data(
 
     return return_list, metadata
 
+def _check_profiles(
+    service: SERVICES,
+    profile: PROFILES,
+) -> None:
+    """Check whether a service profile is valid.
+
+    Parameters
+    ----------
+    service : string
+        One of the service names from the "services" list.
+    profile : string
+        One of the profile names from "results_profiles",
+        "locations_profiles", "activities_profiles",
+        "projects_profiles" or "organizations_profiles".
+    """
+    valid_services = get_args(SERVICES)
+    if service not in valid_services:
+        raise ValueError(
+            f"Invalid service: '{service}'. Valid options are: {valid_services}."
+        )
+
+    valid_profiles = PROFILE_LOOKUP[service]
+    if profile not in valid_profiles:
+        raise ValueError(
+            f"Invalid profile: '{profile}' for service '{service}'. "
+            f"Valid options are: {valid_profiles}."
+        )
+
diff --git a/demos/WaterData_demo.ipynb b/demos/WaterData_demo.ipynb
new file mode 100644
index 00000000..b7d116b6
--- /dev/null
+++ b/demos/WaterData_demo.ipynb
@@ -0,0 +1,621 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7d0ca866",
+   "metadata": {},
+   "source": [
+    "# Using the `waterdata` module to pull data from the USGS Water Data APIs\n",
+    "The `waterdata` module replaces the `nwis` module for accessing USGS water data. It leverages the [Water Data APIs](https://api.waterdata.usgs.gov/) to download metadata, daily values, and instantaneous values.\n",
+    "\n",
+    "Although the exact timeline of this transition has not been finalized, we recommend switching to the new functions as soon as possible to avoid unexpected interruptions in your workflow.\n",
+    "\n",
+    "As always, please report any issues you encounter on our [Issues](https://github.com/DOI-USGS/dataretrieval-python/issues) page. If you have questions or need help, please reach out to us at comptools@usgs.gov."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fcccb6e8",
+   "metadata": {},
+   "source": [
+    "## Prerequisite: Get your Water Data API key\n",
+    "We highly suggest signing up for your own API key [here](https://api.waterdata.usgs.gov/signup/) to afford yourself higher rate limits and more reliable access to the data. If you opt not to register for an API key, the number of requests you can make to the Water Data APIs is considerably lower, and if you share an IP address across users or workflows, you may hit those limits even faster. Luckily, registering for an API key is free and easy.\n",
+    "\n",
+    "Once you've copied your API key and saved it in a safe place, you can set it as an environment variable in your Python script for the current session:\n",
+    "\n",
+    "```python\n",
+    "import os\n",
+    "os.environ['API_USGS_PAT'] = 'your_api_key_here'\n",
+    "```\n",
+    "Note that the environment variable name is `API_USGS_PAT`, which stands for \"API USGS Personal Access Token\".\n",
+    "\n",
+    "If you'd like a more permanent, repository-specific solution, you can use the `python-dotenv` package to read your API key from a `.env` file in your repository root directory, like this:\n",
+    "\n",
+    "```python\n",
+    "!pip install python-dotenv  # only run this line once to install the package in your environment\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()  # this will load the environment variables from the .env file\n",
+    "```\n",
+    "Make sure your `.env` file contains the following line:\n",
+    "```\n",
+    "API_USGS_PAT=your_api_key_here\n",
+    "```\n",
+    "Also, do not commit your `.env` file to version control, as it contains sensitive information. You can add it to your `.gitignore` file to prevent accidental commits."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4a2b3f0f",
+   "metadata": {},
+   "source": [
+    "## Lay of the Land\n",
+    "Now that your API key is configured, it's time to take a 10,000-ft view of the functions in the `waterdata` module.\n",
+    "\n",
+    "### Metadata endpoints\n",
+    "These functions retrieve metadata tables that can be used to refine your data requests.\n",
+    "\n",
+    "- `get_reference_table()` - Not sure which parameter code you're looking for, or which hydrologic unit your study area is in? This function will help you find the right input values for the data endpoints to retrieve the information you want.\n",
+    "- `get_codes()` - Similar to `get_reference_table()`, this function retrieves dataframes containing available input values that correspond to the Samples water quality database.\n",
+    "\n",
+    "### Data endpoints\n",
+    "- `get_daily()` - Daily values for monitoring locations, parameters, statistic codes, and more.\n",
+    "- `get_continuous()` - Instantaneous values for monitoring locations, parameters, statistic codes, and more.\n",
+    "- `get_monitoring_locations()` - Monitoring location information such as name, monitoring location ID, latitude, longitude, HUC code, site type, and more.\n",
+    "- `get_time_series_metadata()` - Timeseries metadata across monitoring locations, parameter codes, statistic codes, and more. Can be used to answer the question: what types of data are collected at my site(s) of interest, and over what time period are/were they collected?\n",
+    "- `get_latest_continuous()` - Latest instantaneous values for requested monitoring locations, parameter codes, statistic codes, and more.\n",
+    "- `get_latest_daily()` - Latest daily values for requested monitoring locations, parameter codes, statistic codes, and more.\n",
+    "- `get_field_measurements()` - Physically measured (a.k.a. discrete) values of gage height, discharge, groundwater levels, and more for requested monitoring locations.\n",
+    "- `get_samples()` - Discrete water quality sample results for monitoring locations, observed properties, and more."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "19b5aebf",
+   "metadata": {},
+   "source": [
+    "### A few key tips\n",
+    "- You'll notice that each of the data functions has many unique inputs you can specify. **DO NOT** specify too many! Specify *just enough* inputs to return what you need, and do not provide redundant geographic or parameter information, as this may slow down your query and lead to errors.\n",
+    "- Each function returns a tuple containing a dataframe and a metadata object. If you have `geopandas` installed in your environment, the dataframe will be a `GeoDataFrame` with a geometry included. If you do not have `geopandas`, the dataframe will be a `pandas` dataframe with the geometry contained in a coordinates column. The metadata object contains information about your query, like the query URL.\n",
+    "- If you do not want the `geometry` column returned, use the argument `skip_geometry=True`.\n",
+    "- All of these functions (except `get_samples()`) have a `limit` argument, which sets the number of rows returned with each \"page\" of data. The Water Data APIs use paging to chunk up large responses and send data most efficiently to the requester. The `waterdata` functions collect the rows of data from each page and combine them into one final dataframe at the end. The default and maximum limit per page is 50,000 rows. In other words, if you request 100,000 rows of data from the database, it will return all the data in 2 pages, and each page counts as a \"request\" using your API key. If you were to change the argument to `limit=10000`, then each page returned would contain 10,000 rows, and it would take 10 requests/pages to return the total 100,000 rows. In general, there is no need to adjust the `limit` argument. However, if you are working with slow internet speeds, lowering the `limit` may reduce the chance of failures due to limited bandwidth.\n",
+    "- You can find some other helpful tips in the [Water Data API documentation](https://api.waterdata.usgs.gov/docs/ogcapi/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "68591b52",
+   "metadata": {},
+   "source": [
+    "## Examples\n",
+    "Let's get into some examples using the functions listed above. First, we need to load the `waterdata` module and a few other packages and functions to go through the examples. To run the entirety of this notebook, you will need to install the `dataretrieval`, `matplotlib`, and `geopandas` packages. `matplotlib` is needed to create the plots, and `geopandas` is needed to create the interactive maps.\n",
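+    "\n",
+    "Each example call below follows the same basic pattern, so here is a minimal sketch of a single request up front. The site ID is a hypothetical placeholder, and the `.url` attribute is shown as an example of the kind of query information the metadata object records:\n",
+    "\n",
+    "```python\n",
+    "from dataretrieval import waterdata\n",
+    "\n",
+    "# Every data function returns a (dataframe, metadata) tuple\n",
+    "df, meta = waterdata.get_daily(\n",
+    "    monitoring_location_id='USGS-00000000',  # hypothetical site ID\n",
+    "    parameter_code='00060',  # discharge, cubic feet per second\n",
+    "    time='2024-01-01/..',  # from 2024-01-01 onward\n",
+    "    limit=10000,  # smaller pages can help on slow connections\n",
+    ")\n",
+    "print(df.head())\n",
+    "print(meta.url)  # the query URL recorded by the metadata object\n",
+    "```"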
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd626a14", + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages to run notebook\n", + "!pip install dataretrieval\n", + "!pip install matplotlib\n", + "!pip install geopandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ca9bb6a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import matplotlib.ticker as mtick\n", + "from IPython.display import display\n", + "from datetime import datetime, timedelta\n", + "from datetime import date\n", + "from dateutil.relativedelta import relativedelta\n", + "from dataretrieval import waterdata" + ] + }, + { + "cell_type": "markdown", + "id": "406762ab", + "metadata": {}, + "source": [ + "#### Reference tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1035ebbb", + "metadata": {}, + "outputs": [], + "source": [ + "pcodes,metadata = waterdata.get_reference_table(\"parameter-codes\")\n", + "display(pcodes.head())" + ] + }, + { + "cell_type": "markdown", + "id": "1e0eab77", + "metadata": {}, + "source": [ + "Let's say we want to find all parameter codes relating to streamflow discharge. We can use some string matching to find applicable codes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "665ccb23", + "metadata": {}, + "outputs": [], + "source": [ + "streamflow_pcodes = pcodes[pcodes['parameter_name'].str.contains('streamflow|discharge', case=False, na=False)]\n", + "display(streamflow_pcodes[['parameter_code_id', 'parameter_name']])" + ] + }, + { + "cell_type": "markdown", + "id": "d9487ee4", + "metadata": {}, + "source": [ + "Interesting that there are so many different streamflow-related parameter codes! Going on experience, let's use the most common one, `00060`, which is \"Discharge, cubic feet per second\".\n", + "\n", + "### Timeseries metadata\n", + "\n", + "Now that we know which parameter code we want to use, let's find all the stream monitoring locations that have recent discharge data and at least 10 years of daily values in the state of Nebraska. We will use the `waterdata.get_time_series_metadata()` function to suss out which sites fit the bill. This function will return a row for each *timeseries* that matches our inputs. It doesn't contain the daily discharge values themselves, just information *about* that timeseries." + ] + }, + { + "cell_type": "markdown", + "id": "70ee1da9", + "metadata": {}, + "source": [ + "First, let's get our expected date range in order. Note that the `waterdata` functions are capable of taking in bounded and unbounded date and datetime ranges. In this case, we want the start date of the discharge timeseries to be no more recent than 10 years ago, and we want the end date of the timeseries to be from at most a week ago. We can use the notation `{date}/..` to mean that we want all timeseries that end a week ago or more recently. Similarly, we can use the notation `../{date}` to mean we want all timeseries that started at least 10 years ago (and thus likely have at least 10 years of data)." 
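+    "\n",
+    "For reference, the date and datetime arguments accept several interval forms; these examples mirror the endpoint documentation:\n",
+    "\n",
+    "```python\n",
+    "# Illustrative interval strings accepted by the time-related arguments\n",
+    "bounded = '2021-01-01/2022-01-01'  # closed interval\n",
+    "ends_by = '../2015-12-31'          # half-bounded: everything up to a date\n",
+    "starts_from = '2018-02-12/..'      # half-bounded: everything from a date onward\n",
+    "past_month = 'P1M'                 # duration: data from the past month\n",
+    "past_36_hours = 'PT36H'            # duration: data from the last 36 hours\n",
+    "```"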
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57e2c93a", + "metadata": {}, + "outputs": [], + "source": [ + "ten_years_ago =(date.today() - relativedelta(years=10)).strftime(\"%Y-%m-%d\")\n", + "one_week_ago = (datetime.now() - timedelta(days=7)).date().strftime(\"%Y-%m-%d\")" + ] + }, + { + "cell_type": "markdown", + "id": "261b5a32", + "metadata": {}, + "source": [ + "We will also use the `skip_geometry` argument in our timeseries metadata call. By default, most `waterdata` functions return a geometry column containing the monitoring location's coordinates. This is a really cool feature that we will use later, but for this particular data pull, we don't need it. Setting `skip_geometry=True` makes the returned dataframe smaller and more efficient." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a901f5fa", + "metadata": {}, + "outputs": [], + "source": [ + "NE_discharge,_ = waterdata.get_time_series_metadata(\n", + " state_name=\"Nebraska\",\n", + " parameter_code='00060',\n", + " begin=f\"../{ten_years_ago}\",\n", + " end=f\"{one_week_ago}/..\",\n", + " skip_geometry=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8809a98d", + "metadata": {}, + "outputs": [], + "source": [ + "display(NE_discharge.sort_values(\"monitoring_location_id\").head())\n", + "print(f\"There are {len(NE_discharge['monitoring_location_id'].unique())} sites with recent discharge data available in the state of Nebraska\")" + ] + }, + { + "cell_type": "markdown", + "id": "8f464470", + "metadata": {}, + "source": [ + "In the dataframe above, we are looking at 5 timeseries returned, ordered by monitoring location. You can also see that the first two rows show two different kinds of discharge for the same monitoring location: a mean daily discharge timeseries (with [statistic id](https://api.waterdata.usgs.gov/docs/ogcapi/) 00003, which represents \"mean\") and an instantaneous discharge timeseries (with statistic id 00011, which represents \"points\" or \"instantaneous\" values). Look closely and you may also notice that the `parent_timeseries_id` column for daily mean discharge matches the `time_series_id` for the instantaneous discharge. This is because once instantaneous measurements began at the site, they were used to calculate the daily mean." + ] + }, + { + "cell_type": "markdown", + "id": "650ac2e6", + "metadata": {}, + "source": [ + "### Monitoring locations\n", + "Now that we know which sites have recent discharge data, let's find stream sites and plot them on a map. We will use the `waterdata.get_monitoring_locations()` function to grab more metadata about these sites.\n", + "\n", + "We can feed the unique monitoring location IDs from `NE_discharge` into the `get_monitoring_locations()` function to get the metadata for just those sites. However, there is a limit to the number of IDs that can be passed in one call to the API. Further down in this notebook, you'll see an example where we successfully feed all ~100 IDs in one call to the API. However, for demonstration purposes, we will split the list of monitoring location IDs into a few chunks of 50 sent to the API and stitch the resulting dataframes together. A loose rule of thumb is to keep the number of IDs below 200, but this exact number will depend on the typical length of each monitoring location ID (i.e. if your monitoring location IDs are > 13 characters long: \"USGS-XXXXXXXX\"+, you will need to feed in less than 200 at a time)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c3eeac3", + "metadata": {}, + "outputs": [], + "source": [ + "chunk_size=50\n", + "site_list = NE_discharge['monitoring_location_id'].unique().tolist()\n", + "chunks = [site_list[i:i + chunk_size] for i in range(0, len(site_list), chunk_size)]\n", + "NE_locations = pd.DataFrame()\n", + "for site_group in chunks:\n", + " try:\n", + " chunk_data,_ = waterdata.get_monitoring_locations(\n", + " monitoring_location_id=site_group,\n", + " site_type_code=\"ST\"\n", + " )\n", + " if not chunk_data.empty:\n", + " NE_locations = pd.concat([NE_locations, chunk_data])\n", + " except Exception as e:\n", + " print(f\"Chunk failed: {e}\")\n", + "\n", + "display(NE_locations[[\"monitoring_location_id\", \"monitoring_location_name\", \"hydrologic_unit_code\"]].head())" + ] + }, + { + "cell_type": "markdown", + "id": "21a0f28f", + "metadata": {}, + "source": [ + "That took a little bit of work to loop through the site chunks and bind the data back together. Admittedly, there may be times where chunking and iterating might be the most efficient workflow. But in this particular case, we have a less onerous option available: `get_monitoring_locations()` has a `state_name` argument. It will likely be faster to pull all stream sites for Nebraska and then filter down to the sites present in the timeseries dataframe: no iteration needed. Let's try this too." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce4df5fb", + "metadata": {}, + "outputs": [], + "source": [ + "NE_locations,_ = waterdata.get_monitoring_locations(\n", + " state_name=\"Nebraska\",\n", + " site_type_code=\"ST\"\n", + " )\n", + "\n", + "NE_locations_discharge = NE_locations.loc[NE_locations['monitoring_location_id'].isin(NE_discharge['monitoring_location_id'].unique().tolist())]\n", + "display(NE_locations_discharge[[\"monitoring_location_id\", \"monitoring_location_name\", \"hydrologic_unit_code\"]].head())" + ] + }, + { + "cell_type": "markdown", + "id": "f0fe5c4e", + "metadata": {}, + "source": [ + "If you have `geopandas` installed, the function will return a `GeoDataFrame` with a `geometry` column containing the monitoring locations' coordinates. You can use `gpd.explore()` to view your geometry coordinates on an interactive map. We will demo this functionality below (Hover over the site points to see all the columns returned from `waterdata.get_monitoring_locations()`). If you don't have `geopandas` installed, `dataretrieval` will return a regular `pandas` DataFrame with coordinate columns instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "659b19a5", + "metadata": {}, + "outputs": [], + "source": [ + "NE_locations_discharge.set_crs(crs=\"WGS84\").explore()" + ] + }, + { + "cell_type": "markdown", + "id": "3c7fd0df", + "metadata": {}, + "source": [ + "### Latest daily and instantaneous values\n", + "Now that we know which sites in Nebraska have recent discharge data, and we know where they are located, we can start downloading some actual flow values. Let's start with some of the most \"lightweight\" functions, `waterdata.get_latest_daily()` and `waterdata.get_latest_continuous()`, which will return only the latest value for each monitoring location requested. \n", + "\n", + "Recall from above, we are working with ~100 sites with discharge data. Conveniently, the `waterdata` functions are *usually* pretty good at handling requests of up to ~200 monitoring locations. 
However, if you have more than 200, you may be better off chopping up your list of sites into a few lists that you loop over. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a8cf6b", + "metadata": {}, + "outputs": [], + "source": [ + "latest_dv,_ = waterdata.get_latest_daily(\n", + " monitoring_location_id=NE_locations_discharge['monitoring_location_id'].tolist(),\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\"\n", + ")\n", + "display(latest_dv.head())" + ] + }, + { + "cell_type": "markdown", + "id": "f5a38bde", + "metadata": {}, + "source": [ + "Note that because these measurements are less than a week old, most of them are still tagged as \"Provisional\" in the `approval_status` column. Some may also be missing values in the `value` column. You can often check the `qualifier` column for clues as to why a value is missing, or additional information specific to that measurement. Let's map out the monitoring locations again and color the points based on the latest daily value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f97f172f", + "metadata": {}, + "outputs": [], + "source": [ + "latest_dv['date'] = latest_dv['time'].astype(str)\n", + "latest_dv[['geometry', 'monitoring_location_id', 'date', 'value', 'unit_of_measure']].set_crs(crs=\"WGS84\").explore(column='value', tiles='CartoDB dark matter', cmap='YlOrRd', scheme=None, legend=True)" + ] + }, + { + "cell_type": "markdown", + "id": "aa6fa717", + "metadata": {}, + "source": [ + "Let's do the same routine with `waterdata.get_latest_continuous()`, but note that we do not need to specify the `statistic_id`: all instantaneous values have the statistical code \"00011\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83c7f5d4", + "metadata": {}, + "outputs": [], + "source": [ + "latest_instantaneous,_ = waterdata.get_latest_continuous(\n", + " monitoring_location_id=NE_locations_discharge['monitoring_location_id'].tolist(),\n", + " parameter_code=\"00060\"\n", + ")\n", + "\n", + "latest_instantaneous['datetime'] = latest_instantaneous['time'].astype(str)\n", + "latest_instantaneous[['geometry', 'monitoring_location_id', 'datetime', 'value', 'unit_of_measure']].set_crs(crs=\"WGS84\").explore(column='value', cmap='YlOrRd', scheme=None, legend=True)" + ] + }, + { + "cell_type": "markdown", + "id": "f179dfd0", + "metadata": {}, + "source": [ + "### Daily and continuous values datasets\n", + "While the \"latest\" functions might be helpful for \"realtime\" or \"current\" dashboards or reports, many users desire to work with a complete timeseries of daily summary (min, max, mean) or instantaneous values for their analyses. For these workflows, `waterdata.get_daily()` and `waterdata.get_continuous()` are helpful.\n", + "\n", + "Using our current example, let's say that you want to compare daily and instantaneous discharge values for monitoring locations along the Missouri River in Nebraska." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf50e007", + "metadata": {}, + "outputs": [], + "source": [ + "missouri_river_sites = NE_locations_discharge.loc[NE_locations_discharge['monitoring_location_name'].str.contains(\"Missouri\")]\n", + "display(missouri_river_sites[[\n", + " 'county_name',\n", + " 'site_type',\n", + " 'monitoring_location_id',\n", + " 'monitoring_location_name',\n", + " 'drainage_area',\n", + " 'altitude'\n", + " ]])" + ] + }, + { + "cell_type": "markdown", + "id": "c5c5881e", + "metadata": {}, + "source": [ + "Currently, users may only request 3 years or less of continuous data in one pull. For this example, let's pull the last 1 year of daily mean values and instantaneous values for these Missouri River sites. We'll skip pulling geometry in the `waterdata.get_daily()` function; the `waterdata.get_continuous()` function does not return geometry at all to economize the size of the dataset returned." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d812f4e", + "metadata": {}, + "outputs": [], + "source": [ + "one_year_ago = (date.today() - relativedelta(years=1)).strftime(\"%Y-%m-%d\")\n", + "missouri_site_ids = missouri_river_sites['monitoring_location_id'].tolist()\n", + "missouri_site_names = missouri_river_sites['monitoring_location_name'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bbda23d", + "metadata": {}, + "outputs": [], + "source": [ + "daily_values,_ = waterdata.get_daily(\n", + " monitoring_location_id=missouri_site_ids,\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\", # mean daily value\n", + " time=f\"{one_year_ago}/..\",\n", + " skip_geometry=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "795b9eca", + "metadata": {}, + "outputs": [], + "source": [ + "instantaneous_values,_ = waterdata.get_continuous(\n", + " monitoring_location_id=missouri_site_ids,\n", + " parameter_code=\"00060\",\n", + " time=f\"{one_year_ago}T00:00:00Z/..\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c1663311", + "metadata": {}, + "source": [ + "With these two datasets, let's plot daily and instantaneous discharge values for the four Missouri River sites using `matplotlib`. We will plot each site on a different subplot, with instantaneous values represented by a blue line and daily mean values represented by black points." 
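+    "\n",
+    "(Optionally, before plotting, you can day-average the instantaneous record and compare it against the published daily means. This is only a rough check, sketched below using the `time` and `value` columns: the average here is computed over UTC days, while USGS daily values are typically computed over the location's local day, so small differences are expected.)\n",
+    "\n",
+    "```python\n",
+    "site = missouri_site_ids[0]\n",
+    "inst = instantaneous_values.loc[instantaneous_values['monitoring_location_id'] == site].copy()\n",
+    "inst['time'] = pd.to_datetime(inst['time'], utc=True)\n",
+    "# Day-average the instantaneous values as an approximation of the daily mean\n",
+    "approx_daily = inst.set_index('time')['value'].resample('D').mean()\n",
+    "print(approx_daily.tail())\n",
+    "```"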
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebc2c70d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, figsize=(14, 8), dpi=150, sharex=False, sharey=True)\n", + "axes = axes.ravel()\n", + "\n", + "# Y-axis formatter (with thousands separators)\n", + "tick_fmt = mtick.StrMethodFormatter('{x:,.0f}')\n", + "\n", + "for ax, site, site_name in zip(axes, missouri_site_ids, missouri_site_names):\n", + " # Filter per site & sort by time\n", + " inst = instantaneous_values.loc[instantaneous_values['monitoring_location_id'] == site, [\"time\", \"value\"]].sort_values(\"time\")\n", + " daily = daily_values.loc[daily_values['monitoring_location_id'] == site, [\"time\", \"value\"]].sort_values(\"time\")\n", + "\n", + " # Instantaneous (line)\n", + " ax.plot(\n", + " inst[\"time\"], inst[\"value\"],\n", + " color=\"#1f77b4\", lw=1.0, label=\"Instantaneous\", zorder=1\n", + " )\n", + "\n", + " # Daily mean (black dots)\n", + " ax.scatter(\n", + " daily[\"time\"], daily[\"value\"],\n", + " c=\"black\", s=2, label=\"Daily mean\", zorder=2\n", + " )\n", + "\n", + " # Axes styling\n", + " ax.set_title(f\"{site}\\n{site_name}\", fontsize=10)\n", + " ax.grid(True, which=\"both\", alpha=0.25)\n", + " ax.yaxis.set_major_formatter(tick_fmt)\n", + "\n", + " # Time ticks\n", + " ax.xaxis.set_major_locator(mdates.MonthLocator())\n", + " ax.xaxis.set_major_formatter(mdates.DateFormatter(\"%b %Y\"))\n", + " ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=mdates.MO))\n", + "\n", + "# Common axis labels (left y on both left subplots; x labels on bottom row)\n", + "axes[0].set_ylabel(\"Discharge (cubic feet per second)\")\n", + "axes[2].set_ylabel(\"Discharge (cubic feet per second)\")\n", + "axes[2].set_xlabel(\"\")\n", + "axes[3].set_xlabel(\"\")\n", + "\n", + "handles, labels = axes[-1].get_legend_handles_labels()\n", + "fig.legend(handles, labels, loc=\"lower center\", ncol=2, frameon=False)\n", + "fig.suptitle(f\"Missouri River sites - Daily Mean vs Instantaneous Discharge\")\n", + "fig.autofmt_xdate()\n" + ] + }, + { + "cell_type": "markdown", + "id": "d04a8f8a", + "metadata": {}, + "source": [ + "### Field values\n", + "Finally, let's see if there are any discharge field measurements for these sites. These are manually recorded measurements (by a human), often used during calibration checks. We will use `waterdata.get_field_measurements()` to check. More commonly, a user would head to this function to gather groundwater level data, which are categorized as field measurements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56bf6166", + "metadata": {}, + "outputs": [], + "source": [ + "field_measurements,_ = waterdata.get_field_measurements(\n", + " monitoring_location_id=missouri_site_ids,\n", + " parameter_code=\"00060\",\n", + " time=f\"{one_year_ago}T00:00:00Z/..\"\n", + ")\n", + "display(field_measurements.head())" + ] + }, + { + "cell_type": "markdown", + "id": "e621e45a", + "metadata": {}, + "source": [ + "Hey! We have some! Let's add these to our plots from above. We'll loop through each monitoring location plot and add in field measurements as red points." 
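+    "\n",
+    "If you also want to quantify how closely each field measurement tracks the continuous record, one option is to pair every field measurement with the nearest-in-time instantaneous value. A minimal sketch using `pandas.merge_asof`, assuming both dataframes carry the `time` and `value` columns used above:\n",
+    "\n",
+    "```python\n",
+    "fm = field_measurements[['monitoring_location_id', 'time', 'value']].copy()\n",
+    "iv = instantaneous_values[['monitoring_location_id', 'time', 'value']].copy()\n",
+    "fm['time'] = pd.to_datetime(fm['time'], utc=True)\n",
+    "iv['time'] = pd.to_datetime(iv['time'], utc=True)\n",
+    "# Match each field measurement to the closest instantaneous value within 1 hour\n",
+    "paired = pd.merge_asof(\n",
+    "    fm.sort_values('time'), iv.sort_values('time'),\n",
+    "    on='time', by='monitoring_location_id',\n",
+    "    direction='nearest', tolerance=pd.Timedelta('1h'),\n",
+    "    suffixes=('_field', '_instantaneous'),\n",
+    ")\n",
+    "print(paired[['monitoring_location_id', 'time', 'value_field', 'value_instantaneous']].head())\n",
+    "```"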
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42f22d69", + "metadata": {}, + "outputs": [], + "source": [ + "for ax, site in zip(axes, missouri_site_ids):\n", + " field = field_measurements.loc[\n", + " field_measurements['monitoring_location_id'] == site, [\"time\", \"value\"]\n", + " ].sort_values(\"time\")\n", + "\n", + " ax.scatter(\n", + " field[\"time\"], field[\"value\"],\n", + " c=\"red\", s=4, label=\"Field\", zorder=3\n", + " )\n", + "\n", + "# Remove any existing figure-level legends\n", + "for leg in fig.legends:\n", + " leg.remove()\n", + "handles, labels = axes[-1].get_legend_handles_labels()\n", + "fig.legend(handles, labels, loc=\"lower center\", ncol=3, frameon=False)\n", + "\n", + "# Redraw the figure\n", + "fig.canvas.draw_idle()\n", + "fig\n" + ] + }, + { + "cell_type": "markdown", + "id": "60a1b100", + "metadata": {}, + "source": [ + "## Additional Resources\n", + "The USGS Water Data APIs belong to the Water Data for the Nation (WDFN) group of applications and tools. These products exist under the broader National Water Information System (NWIS) program. Check out the links below for more information on the USGS Water Data APIs and other ways to download or view USGS water data:\n", + "* [Water Data APIs Home](https://api.waterdata.usgs.gov/)\n", + "* [Get an API Key](https://api.waterdata.usgs.gov/signup/)\n", + "* [Water Data API OGC Endpoint Catalog](https://api.waterdata.usgs.gov/ogcapi/v0/collections?f=html)\n", + "* [Water Data Download Form](https://api.waterdata.usgs.gov/download)\n", + "* [Water Data for the Nation Home](https://waterdata.usgs.gov/)\n", + "* [Water Data for the Nation Feedback Form](https://waterdata.usgs.gov/questions-comments/?referrerUrl=https://api.waterdata.usgs.gov)\n", + "* [R dataRetrieval package](https://github.com/DOI-USGS/dataretrieval)\n", + "* [WDFN Blog Post on NWISWeb Decommissioning Timeline](https://waterdata.usgs.gov/blog/nwisweb-decommission-campaign2/)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "waterdata-demo", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/examples/WaterData_demo.nblink b/docs/source/examples/WaterData_demo.nblink new file mode 100644 index 00000000..7c485f1a --- /dev/null +++ b/docs/source/examples/WaterData_demo.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../demos/WaterData_demo.ipynb" +} \ No newline at end of file diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 044360fa..edd43beb 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -4,6 +4,17 @@ Examples ======== +Introduction to the ``waterdata`` module of ``dataretrieval`` +------------------------------------------------------------- +The ``waterdata`` module will replace the ``nwis`` module as the primary +set of data download functions for USGS water data. This Jupyter notebook +covers a basic introduction to module functions and usage. + +.. 
toctree:: + :maxdepth: 1 + + WaterData_demo + Simple uses of the ``dataretrieval`` package -------------------------------------------- diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index abdd823b..d8191c87 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -6,8 +6,8 @@ if sys.version_info < (3, 10): pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) +from dataretrieval.waterdata.utils import _check_profiles from dataretrieval.waterdata import ( - _check_profiles, get_samples, get_daily, get_continuous,