diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 1aae4b6aef..b630994682 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -37,7 +37,6 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -504,20 +503,6 @@ def valid_time_selection(timerange: str) -> None: _check_timerange_values(_format_years(date), timerange_list) -def differing_timeranges( - timeranges: set[str], - required_vars: list[Facets], -) -> None: - """Log error if required variables have differing timeranges.""" - if len(timeranges) > 1: - msg = ( - f"Differing timeranges with values {timeranges} " - f"found for required variables {required_vars}. " - "Set `timerange` to a common value." - ) - raise ValueError(msg) - - def _check_literal( settings: dict, *, diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 38f48fc663..d54d35fba5 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -52,8 +52,7 @@ from . import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _derive_needed, - _get_input_datasets, + _get_required_datasets, _representative_datasets, ) @@ -251,7 +250,7 @@ def _get_default_settings(dataset: Dataset) -> PreprocessorSettings: settings = {} - if _derive_needed(dataset): + if dataset._derivation_necessary(): # noqa: SLF001 (will be replaced soon) settings["derive"] = { "short_name": facets["short_name"], "standard_name": facets["standard_name"], @@ -622,21 +621,26 @@ def _allow_skipping(dataset: Dataset) -> bool: ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: - """Set the 'version' facet based on derivation input datasets.""" - versions = set() - for in_dataset in input_datasets: - in_dataset.set_version() - if version := in_dataset.facets.get("version"): - if isinstance(version, list): - versions.update(version) - else: - versions.add(version) - if versions: - version = versions.pop() if len(versions) == 1 else sorted(versions) - dataset.set_facet("version", version) - for supplementary_ds in dataset.supplementaries: - supplementary_ds.set_version() +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: + """Automatically correct the wrong ensemble for CMIP5 fx variables.""" + if ( + dataset.facets.get("project") == "CMIP5" + and dataset.facets.get("mip") == "fx" + and dataset.facets.get("ensemble") != "r0i0p0" + and not dataset.files + ): + original_ensemble = dataset["ensemble"] + copy = dataset.copy() + copy.facets["ensemble"] = "r0i0p0" + if copy.files: + dataset.facets["ensemble"] = "r0i0p0" + logger.info( + "Corrected wrong 'ensemble' from '%s' to '%s' for %s", + original_ensemble, + dataset["ensemble"], + dataset.summary(shorten=True), + ) + dataset.find_files() def _get_preprocessor_products( @@ -662,28 +666,29 @@ def _get_preprocessor_products( settings = _get_default_settings(dataset) _apply_preprocessor_profile(settings, profile) _update_multi_dataset_settings(dataset.facets, settings) + _fix_cmip5_fx_ensemble(dataset) _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) - input_datasets = _get_input_datasets(dataset) - missing = _check_input_files(input_datasets) + required_datasets = _get_required_datasets(dataset) + missing = _check_input_files(required_datasets) if missing: if _allow_skipping(dataset): 
logger.info("Skipping: %s", missing) else: missing_vars.update(missing) continue - _set_version(dataset, input_datasets) + dataset.set_version() USED_DATASETS.append(dataset) - _schedule_for_download(input_datasets) - _log_input_files(input_datasets) + _schedule_for_download(required_datasets) + _log_input_files(required_datasets) logger.info("Found input files for %s", dataset.summary(shorten=True)) filename = _get_preprocessor_filename(dataset) product = PreprocessorFile( filename=filename, attributes=dataset.facets, settings=settings, - datasets=input_datasets, + datasets=required_datasets, ) products.add(product) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 454947915a..e992c767f8 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -13,7 +13,6 @@ from esmvalcore.exceptions import RecipeError from esmvalcore.io.esgf.facets import FACETS from esmvalcore.io.local import _replace_years_with_timerange -from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( PREPROCESSOR_SUPPLEMENTARIES, @@ -189,28 +188,6 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: - """Automatically correct the wrong ensemble for CMIP5 fx variables.""" - if ( - dataset.facets.get("project") == "CMIP5" - and dataset.facets.get("mip") == "fx" - and dataset.facets.get("ensemble") != "r0i0p0" - and not dataset.files - ): - original_ensemble = dataset["ensemble"] - copy = dataset.copy() - copy.facets["ensemble"] = "r0i0p0" - if copy.files: - dataset.facets["ensemble"] = "r0i0p0" - logger.info( - "Corrected wrong 'ensemble' from '%s' to '%s' for %s", - original_ensemble, - dataset["ensemble"], - dataset.summary(shorten=True), - ) - dataset.find_files() - - def _get_supplementary_short_names( facets: Facets, step: str, @@ -431,9 +408,7 @@ def datasets_from_recipe( return datasets -def _dataset_from_files( # noqa: C901 - dataset: Dataset, -) -> list[Dataset]: +def _dataset_from_files(dataset: Dataset) -> list[Dataset]: """Replace facet values of '*' based on available files.""" result: list[Dataset] = [] errors: list[str] = [] @@ -444,53 +419,32 @@ def _dataset_from_files( # noqa: C901 dataset.summary(shorten=True), ) - representative_datasets = _representative_datasets(dataset) - - # For derived variables, representative_datasets might contain more than - # one element - all_datasets: list[list[tuple[dict, Dataset]]] = [] - for representative_dataset in representative_datasets: - all_datasets.append([]) - for expanded_ds in representative_dataset.from_files(): - updated_facets = {} - unexpanded_globs = {} - for key, value in dataset.facets.items(): - if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: - unexpanded_globs[key] = value - - if unexpanded_globs: - msg = _report_unexpanded_globs( - dataset, - expanded_ds, - unexpanded_globs, - ) - errors.append(msg) - continue + for expanded_ds in dataset.from_files(): + updated_facets = {} + unexpanded_globs = {} + for key, value in dataset.facets.items(): + if _isglob(value): + if key in expanded_ds.facets and not _isglob( + expanded_ds[key], + ): + updated_facets[key] = expanded_ds.facets[key] + else: + unexpanded_globs[key] = value + + if unexpanded_globs: + msg = _report_unexpanded_globs( + dataset, + expanded_ds, + 
unexpanded_globs, + ) + errors.append(msg) + continue - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries + new_ds = dataset.copy() + new_ds.facets.update(updated_facets) + new_ds.supplementaries = expanded_ds.supplementaries - all_datasets[-1].append((updated_facets, new_ds)) - - # If globs have been expanded, only consider those datasets that contain - # all necessary input variables if derivation is necessary - for updated_facets, new_ds in all_datasets[0]: - other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] - if all(updated_facets in facets for facets in other_facets): - result.append(new_ds) - else: - logger.debug( - "Not all necessary input variables to derive '%s' are " - "available for dataset %s", - dataset["short_name"], - updated_facets, - ) + result.append(new_ds) if errors: raise RecipeError("\n".join(errors)) @@ -535,66 +489,33 @@ def _report_unexpanded_globs( return msg -def _derive_needed(dataset: Dataset) -> bool: - """Check if dataset needs to be derived from other datasets.""" - if not dataset.facets.get("derive"): - return False - if dataset.facets.get("force_derivation"): - return True - if _isglob(dataset.facets.get("timerange", "")): - # Our file finding routines are not able to handle globs. - dataset = dataset.copy() - dataset.facets.pop("timerange") - - copy = dataset.copy() - copy.supplementaries = [] - return not copy.files - - -def _get_input_datasets(dataset: Dataset) -> list[Dataset]: - """Determine the input datasets needed for deriving `dataset`.""" - facets = dataset.facets - if not _derive_needed(dataset): - _fix_cmip5_fx_ensemble(dataset) - return [dataset] +def _get_required_datasets(dataset: Dataset) -> list[Dataset]: + """Determine the datasets required for deriving `dataset`.""" + if not dataset._derivation_necessary(): # noqa: SLF001 + return dataset.required_datasets - # Configure input datasets needed to derive variable - datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) # type: ignore - # idea: add option to specify facets in list of dicts that is value of - # 'derive' in the recipe and use that instead of get_required? - for input_facets in required_vars: - input_dataset = dataset.copy() - keep = {"alias", "recipe_dataset_index", *dataset.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep - } - input_dataset.facets.update(input_facets) - input_dataset.augment_facets() - _fix_cmip5_fx_ensemble(input_dataset) - if input_facets.get("optional") and not input_dataset.files: + # Skip optional datasets if no data is available + required_datasets: list[Dataset] = [] + for required_dataset in dataset.required_datasets: + if ( + required_dataset.facets.get("optional") + and not required_dataset.files + ): logger.info( "Skipping: no data found for %s which is marked as 'optional'", - input_dataset, + required_dataset, ) else: - datasets.append(input_dataset) + required_datasets.append(required_dataset) - # Check timeranges of available input data. 
- timeranges: set[str] = set() - for input_dataset in datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) # type: ignore - check.differing_timeranges(timeranges, required_vars) - - return datasets + return required_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: """Find representative datasets for all input variables.""" copy = dataset.copy() copy.supplementaries = [] - representative_datasets = _get_input_datasets(copy) + representative_datasets = _get_required_datasets(copy) for representative_dataset in representative_datasets: representative_dataset.supplementaries = dataset.supplementaries return representative_datasets diff --git a/esmvalcore/config/_validated_config.py b/esmvalcore/config/_validated_config.py index 65d42cc5e7..7a5af93224 100644 --- a/esmvalcore/config/_validated_config.py +++ b/esmvalcore/config/_validated_config.py @@ -60,7 +60,7 @@ class ValidatedConfig(MutableMapping): """ # validate values on the way in - def __init__(self, *args, **kwargs): + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__() self._mapping: dict[str, Any] = {} self.update(*args, **kwargs) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 6be9687a15..9e03c3ddbf 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -29,6 +29,7 @@ from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.io.local import _dates_to_timerange from esmvalcore.preprocessor import _get_preprocessor_filename, preprocess +from esmvalcore.preprocessor._derive import get_required if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Sequence @@ -99,7 +100,7 @@ class Dataset: Attributes ---------- - supplementaries : list[Dataset] + supplementaries: list[Dataset] List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. @@ -129,6 +130,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[DataElement] | None = None self._used_data_sources: Sequence[DataSource] = [] + self._required_datasets: list[Dataset] | None = None for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -185,87 +187,209 @@ def _derivation_necessary(self) -> bool: # are found ds_copy = self.copy() ds_copy.supplementaries = [] + + # Avoid potential errors from missing data during timerange glob + # expansion + if _isglob(ds_copy.facets.get("timerange", "")): + ds_copy.facets.pop("timerange", None) + return not ds_copy.files + def _get_required_datasets(self) -> list[Dataset]: + """Get required datasets for derivation.""" + required_datasets: list[Dataset] = [] + required_vars_facets = get_required( + self.facets["short_name"], # type: ignore + self.facets["project"], # type: ignore + ) + + for required_facets in required_vars_facets: + required_dataset = self._copy(derive=False, force_derivation=False) + keep = {"alias", "recipe_dataset_index", *self.minimal_facets} + required_dataset.facets = { + k: v for k, v in required_dataset.facets.items() if k in keep + } + required_dataset.facets.update(required_facets) + required_dataset.augment_facets() + required_datasets.append(required_dataset) + + return required_datasets + + @property + def required_datasets(self) -> list[Dataset]: + """Get required datasets. + + For non-derived variables (i.e., those with facet ``derive=False``), + this will simply return the dataset itself in a list. 
+ + For derived variables (i.e., those with facet ``derive=True``), this + will return the datasets required for derivation if derivation is + necessary, and the dataset itself if derivation is not necessary. + Derivation is necessary if the facet ``force_derivation=True`` is set + or no files for the dataset itself are available. + + See also :func:`esmvalcore.preprocessor.derive` for an example usage. + + """ + if self._required_datasets is not None: + return self._required_datasets + + if not self._derivation_necessary(): + self._required_datasets = [self] + else: + self._required_datasets = self._get_required_datasets() + + return self._required_datasets + + @staticmethod def _file_to_dataset( - self, + dataset: Dataset, file: DataElement, ) -> Dataset: """Create a dataset from a file with a `facets` attribute.""" facets = dict(file.facets) - if "version" not in self.facets: + if "version" not in dataset.facets: # Remove version facet if no specific version requested facets.pop("version", None) updated_facets = { f: v for f, v in facets.items() - if f in self.facets - and _isglob(self.facets[f]) - and _ismatch(v, self.facets[f]) + if f in dataset.facets + and _isglob(dataset.facets[f]) + and _ismatch(v, dataset.facets[f]) } - dataset = self.copy() - dataset.facets.update(updated_facets) + new_dataset = dataset.copy() + new_dataset.facets.update(updated_facets) # If possible, remove unexpanded facets that can be automatically # populated. - unexpanded = {f for f, v in dataset.facets.items() if _isglob(v)} + unexpanded = {f for f, v in new_dataset.facets.items() if _isglob(v)} required_for_augment = {"project", "mip", "short_name", "dataset"} if unexpanded and not unexpanded & required_for_augment: - copy = dataset.copy() + copy = new_dataset.copy() copy.supplementaries = [] for facet in unexpanded: copy.facets.pop(facet) copy.augment_facets() for facet in unexpanded: if facet in copy.facets: - dataset.facets.pop(facet) + new_dataset.facets.pop(facet) - return dataset + return new_dataset - def _get_available_datasets(self) -> Iterator[Dataset]: + @staticmethod + def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: C901 + """Yield datasets based on the available files. + + This function requires that dataset.facets['mip'] is not a glob + pattern. + + Does take variable derivation into account, i.e., datasets available + through variable derivation are returned. 
+
+        """
+        datasets_found = False
+
+        # If no forced derivation is requested, search for datasets based on
+        # files from dataset
+        if not dataset._is_force_derived():
+            for available_ds in Dataset._get_available_datasets(dataset):
+                datasets_found = True
+                yield available_ds
+
+        # For variables that cannot be derived, we are done here
+        if not dataset._is_derived():
+            return
+
+        # If forced derivation is requested or no datasets based on files from
+        # dataset have been found, search for datasets based on files from
+        # required datasets
+        if dataset._is_force_derived() or not datasets_found:
+            all_datasets: list[list[tuple[dict, Dataset]]] = []
+            for required_dataset in dataset.required_datasets:
+                all_datasets.append([])
+                for expanded_ds in Dataset._get_available_datasets(
+                    required_dataset,
+                ):
+                    updated_facets = {}
+                    for key, value in dataset.facets.items():
+                        if _isglob(value):
+                            if key in expanded_ds.facets and not _isglob(
+                                expanded_ds[key],
+                            ):
+                                updated_facets[key] = expanded_ds.facets[key]
+                    new_ds = dataset.copy()
+                    new_ds.facets.update(updated_facets)
+                    new_ds.supplementaries = dataset.supplementaries
+
+                    all_datasets[-1].append((updated_facets, new_ds))
+
+            # Only consider those datasets that contain all required variables
+            # with identical facets (e.g., skip those with different
+            # timeranges)
+            for updated_facets, new_ds in all_datasets[0]:
+                other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]]
+                if all(updated_facets in facets for facets in other_facets):
+                    yield new_ds
+                else:
+                    logger.debug(
+                        "Not all variables required to derive '%s' are "
+                        "available for %s with facets %s",
+                        dataset["short_name"],
+                        new_ds.summary(shorten=True),
+                        updated_facets,
+                    )
+
+    @staticmethod
+    def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]:
         """Yield datasets based on the available files.
 
-        This function requires that self.facets['mip'] is not a glob
+        This function requires that dataset.facets['mip'] is not a glob
         pattern.
+
+        Does not take variable derivation into account, i.e., datasets
+        potentially available through variable derivation are ignored. To
+        consider derived variables properly, use the function
+        :func:`_get_all_available_datasets`.
+
         """
-        dataset_template = self.copy()
+        dataset_template = dataset.copy()
         dataset_template.supplementaries = []
         seen = set()
         partially_defined = []
         expanded = False
         for file in dataset_template.files:
-            dataset = self._file_to_dataset(file)
-            # Do not use the timerange facet from the file because there may be multiple
-            # files per dataset.
-            dataset.facets.pop("timerange", None)
+            new_dataset = Dataset._file_to_dataset(dataset, file)
+            # Do not use the timerange facet from the file because there may be
+            # multiple files per dataset.
+            new_dataset.facets.pop("timerange", None)
             # Restore the original timerange facet if it was specified.
-            if "timerange" in self.facets:
-                dataset.facets["timerange"] = self.facets["timerange"]
+            if "timerange" in dataset.facets:
+                new_dataset.facets["timerange"] = dataset.facets["timerange"]
 
             # Filter out identical datasets
             facetset = frozenset(
                 (f, frozenset(v) if isinstance(v, list) else v)
-                for f, v in dataset.facets.items()
+                for f, v in new_dataset.facets.items()
             )
             if facetset not in seen:
                 seen.add(facetset)
                 if any(
                     _isglob(v)
-                    for f, v in dataset.facets.items()
+                    for f, v in new_dataset.facets.items()
                     if f != "timerange"
                 ):
-                    partially_defined.append((dataset, file))
+                    partially_defined.append((new_dataset, file))
                 else:
-                    dataset._update_timerange()  # noqa: SLF001
-                    dataset._supplementaries_from_files()  # noqa: SLF001
+                    new_dataset._update_timerange()  # noqa: SLF001
                     expanded = True
-                    yield dataset
+                    yield new_dataset
 
         # Only yield datasets with globs if there is no better alternative
-        for dataset, file in partially_defined:
+        for new_dataset, file in partially_defined:
             msg = (
-                f"{dataset} with unexpanded wildcards, created from file "
+                f"{new_dataset} with unexpanded wildcards, created from file "
                 f"{file} with facets {file.facets}. Please check why "
-                "the missing facets are not available for the file."
+                "the missing facets are not available for the file. "
                 "This will depend on the data source they come from, e.g. can "
@@ -280,7 +404,7 @@ def _get_available_datasets(self) -> Iterator[Dataset]:
                 "because it still contains wildcards.",
                 msg,
             )
-            yield dataset
+            yield new_dataset
 
     def from_files(self) -> Iterator[Dataset]:
         """Create datasets based on the available files.
@@ -304,6 +428,10 @@ def from_files(self) -> Iterator[Dataset]:
-        Supplementary datasets will in inherit the facet values from the main
+        Supplementary datasets will inherit the facet values from the main
         dataset for those facets listed in :obj:`INHERITED_FACETS`.
 
+        This also works for :ref:`derived variables `. The
+        datasets required for derivation can be accessed via
+        :attr:`Dataset.required_datasets`.
+
         Examples
         --------
         See :doc:`/notebooks/discovering-data` notebook for example use cases.
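+
+        A minimal sketch (the facet values are illustrative; the result
+        depends on the data available locally or on ESGF, hence the doctest
+        is skipped):
+
+        >>> from esmvalcore.dataset import Dataset
+        >>> tas = Dataset(
+        ...     short_name="tas",
+        ...     mip="Amon",
+        ...     project="CMIP6",
+        ...     exp="historical",
+        ...     dataset="*",
+        ...     institute="*",
+        ...     ensemble="r1i1p1f1",
+        ...     grid="gn",
+        ... )
+        >>> datasets = list(tas.from_files())  # doctest: +SKIP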
@@ -330,7 +458,10 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) - for dataset in dataset_template._get_available_datasets(): # noqa: SLF001 + for dataset in self._get_all_available_datasets( + dataset_template, + ): + dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield dataset @@ -605,15 +736,29 @@ def minimal_facets(self) -> Facets: """Return a dictionary with the persistent facets.""" return {k: v for k, v in self.facets.items() if k in self._persist} + @staticmethod + def _get_version(dataset: Dataset) -> str | list[str]: + """Get available version(s) of dataset.""" + versions: set[str] = set() + for file in dataset.files: + if "version" in file.facets: + versions.add(str(file.facets["version"])) + return versions.pop() if len(versions) == 1 else sorted(versions) + def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for file in self.files: - if "version" in file.facets: - versions.add(file.facets["version"]) # type: ignore + for required_dataset in self.required_datasets: + version = self._get_version(required_dataset) + if version: + if isinstance(version, list): + versions.update(version) + else: + versions.add(version) version = versions.pop() if len(versions) == 1 else sorted(versions) if version: self.set_facet("version", version) + for supplementary_ds in self.supplementaries: supplementary_ds.set_version() @@ -972,8 +1117,9 @@ def _update_timerange(self) -> None: dataset = self.copy() dataset.facets.pop("timerange") dataset.supplementaries = [] - check.data_availability(dataset) - if all("timerange" in f.facets for f in dataset.files): + if dataset.files and all( + "timerange" in f.facets for f in dataset.files + ): # "timerange" can only be reliably computed when all DataElements # provide it. intervals = [ diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index 3a25d1f9a4..3817b86bc7 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -13,6 +13,8 @@ from esmvalcore.preprocessor._units import convert_units if TYPE_CHECKING: + from collections.abc import Sequence + from cf_units import Unit from iris.cube import Cube @@ -77,7 +79,7 @@ def get_required(short_name: str, project: str) -> list[Facets]: def derive( - cubes: CubeList, + cubes: Sequence[Cube], short_name: str, long_name: str, units: str | Unit, @@ -88,8 +90,7 @@ def derive( Parameters ---------- cubes: - Includes all the needed variables for derivation defined in - :func:`get_required`. + Includes all the needed variables for derivation. short_name: short_name long_name: @@ -103,6 +104,38 @@ def derive( ------- iris.cube.Cube The new derived variable. + + Examples + -------- + Required variables for derivation can be obtained via + :attr:`esmvalcore.dataset.Dataset.required_datasets`. + + For example, to derive the longwave cloud radiative effect (LWCRE) for the + model CESM2, you can use: + + >>> from esmvalcore.dataset import Dataset + >>> from esmvalcore.preprocessor import derive + >>> dataset = Dataset( + ... project="CMIP6", + ... dataset="CESM2", + ... exp="historical", + ... ensemble="r1i1p1f1", + ... grid="gn", + ... timerange="2000/2014", + ... short_name="lwcre", + ... mip="Amon", + ... derive=True, + ... ) + >>> cubes = [d.load() for d in dataset.required_datasets] + >>> cube = derive( + ... cubes, + ... short_name="lwcre", + ... 
long_name="TOA Longwave Cloud Radiative Effect", + ... units="W m-2", + ... ) + >>> print(cube.var_name) + lwcre # doctest: +SKIP + """ if short_name == cubes[0].var_name: return cubes[0] diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index d6c9001ef2..581e8ca249 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, "outputs": [], @@ -89,7 +89,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 778 datasets, showing the first 10:\n" + "Found 727 datasets, showing the first 10:\n" ] }, { @@ -168,20 +168,20 @@ " 'grid': 'gn',\n", " 'institute': 'AWI'},\n", " Dataset:\n", - " {'dataset': 'BCC-CSM2-MR',\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", - " 'institute': 'BCC'},\n", + " 'institute': 'AWI'},\n", " Dataset:\n", " {'dataset': 'BCC-CSM2-MR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", - " 'ensemble': 'r2i1p1f1',\n", + " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", " 'institute': 'BCC'}]" @@ -253,7 +253,7 @@ { "data": { "text/plain": [ - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf-data1.llnl.gov', 'esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf-data04.diasjp.net', 'esgf.nci.org.au', 'esgf3.dkrz.de']]" + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]" ] }, "execution_count": 6, @@ -282,7 +282,7 @@ { "data": { "text/plain": [ - "LocalFile('~/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" + "LocalFile('/home/manuel/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" ] }, "execution_count": 7, @@ -312,6 +312,235 @@ "source": [ "download(dataset.files, CFG[\"download_dir\"])" ] + }, + { + "cell_type": "markdown", + "id": "d3006d90", + "metadata": {}, + "source": [ + "`Dataset.from_files` can also handle derived variables properly:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b75314e3", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_template = Dataset(\n", + " short_name=\"lwcre\",\n", + " mip=\"Amon\",\n", + " project=\"CMIP6\",\n", + " exp=\"historical\",\n", + " dataset=\"*\",\n", + " institute=\"*\",\n", + " ensemble=\"r1i1p1f1\",\n", + " grid=\"gn\",\n", + " derive=True,\n", + " force_derivation=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b87c247f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 36 datasets, showing the first 10:\n" + ] + }, + { + "data": { + "text/plain": [ + "[Dataset:\n", + " {'dataset': 'TaiESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 
'grid': 'gn',\n", + " 'institute': 'AS-RCEC'},\n", + " Dataset:\n", + " {'dataset': 'AWI-CM-1-1-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-1-LR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'BCC-CSM2-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'BCC-ESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'CAMS-CSM1-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAMS'},\n", + " Dataset:\n", + " {'dataset': 'CAS-ESM2-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'FGOALS-g3',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'IITM-ESM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CCCR-IITM'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets = list(dataset_template.from_files())\n", + "print(f\"Found {len(datasets)} datasets, showing the first 10:\")\n", + "datasets[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f00a886f", + "metadata": {}, + "source": [ + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets may be empty. In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c5edfa65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = datasets[0]\n", + "dataset.files" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "97cdf12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rlut\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlut/gn/v20200623/rlut_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]\n", + "rlutcs\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlutcs/gn/v20200623/rlutcs_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de']]\n" + ] + } + ], + "source": [ + "for d in dataset.input_datasets:\n", + " print(d[\"short_name\"])\n", + " print(d.files)" + ] } ], "metadata": { diff --git a/tests/conftest.py b/tests/conftest.py index 46cabf58f9..3c19e4c4df 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib import warnings from copy import deepcopy from functools import lru_cache @@ -7,6 +8,7 @@ import numpy as np import pytest +import yaml from cf_units import Unit from iris.coords import ( AncillaryVariable, @@ -17,6 +19,7 @@ ) from iris.cube import Cube +import esmvalcore from esmvalcore.config import CFG, Config if TYPE_CHECKING: @@ -55,6 +58,33 @@ def ignore_existing_user_config( monkeypatch.setattr(CFG, "_mapping", cfg_default._mapping) +@lru_cache +def _load_default_data_sources() -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load default data sources for local users.""" + cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { + "projects": {}, + } + for file in ( + "data-local.yml", + "data-local-esmvaltool.yml", + "data-native-cesm.yml", + "data-native-emac.yml", + "data-native-icon.yml", + "data-native-ipslcm.yml", + ): + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / file, + ) as config_file: + content = config_file.read_text(encoding="utf-8") + cfg["projects"].update(yaml.safe_load(content)["projects"]) + return cfg + + @pytest.fixture def session( tmp_path: Path, @@ -63,7 +93,15 @@ def session( ) -> Session: """Session object with default settings.""" monkeypatch.setitem(CFG, "output_dir", tmp_path / "esmvaltool_output") - return CFG.start_session("recipe_test") + session = CFG.start_session("recipe_test") + projects = _load_default_data_sources()["projects"] + for project in projects: + print(project) + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(tmp_path) + session["projects"][project]["data"] = data_sources + return session @pytest.fixture diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py 
index a9cf809a92..54d0d0bc90 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -272,27 +272,6 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(caplog): - timeranges = set() - timeranges.add("1950/1951") - timeranges.add("1950/1952") - required_variables = [ - {"short_name": "rsdscs", "timerange": "1950/1951"}, - {"short_name": "rsuscs", "timerange": "1950/1952"}, - ] - with pytest.raises(ValueError) as exc: - check.differing_timeranges(timeranges, required_variables) - expected_log = ( - f"Differing timeranges with values {timeranges} " - "found for required variables " - "[{'short_name': 'rsdscs', 'timerange': '1950/1951'}, " - "{'short_name': 'rsuscs', 'timerange': '1950/1952'}]. " - "Set `timerange` to a common value." - ) - - assert expected_log in str(exc.value) - - def test_data_availability_nonexistent(tmp_path): var = { "dataset": "ABC", diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 6edc59b168..0a8fbc4f79 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -22,7 +22,7 @@ import esmvalcore.io.esgf import esmvalcore.io.local from esmvalcore._recipe.recipe import ( - _get_input_datasets, + _get_required_datasets, _representative_datasets, read_recipe_file, ) @@ -182,7 +182,7 @@ def get_required(short_name, _): ] monkeypatch.setattr( - esmvalcore._recipe.to_datasets, + esmvalcore.dataset, "get_required", get_required, ) @@ -1707,7 +1707,7 @@ def test_alias_generation(tmp_path, patched_datafinder, session): # noqa: C901, assert dataset["alias"] == "CORDEX_ICHEC-EC-EARTH" else: assert dataset["alias"] == "CORDEX_MIROC-MIROC5" - elif dataset["version"] == 1: + elif dataset["version"] == "1": assert dataset["alias"] == "OBS_1" else: assert dataset["alias"] == "OBS_2" @@ -2599,9 +2599,7 @@ def test_representative_dataset_derived_var( expected_facets: Facets = { # Already present in variable "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": force_derivation, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2611,6 +2609,9 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } if force_derivation: expected_datasets = [ @@ -2665,9 +2666,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsdscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2680,6 +2679,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } rsdscs = Dataset(**rsdscs_facets) rsdscs.session = session @@ -2689,9 +2691,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsuscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2704,11 +2704,14 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by 
_add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session - alb_derive_input = _get_input_datasets(alb) + alb_derive_input = _get_required_datasets(alb) assert alb_derive_input == [rsdscs, rsuscs] diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index 6ed350c34d..fd19bc21a3 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -865,28 +865,6 @@ def test_get_default_settings(mocker): } -def test_set_version(mocker): - dataset = Dataset(short_name="tas") - supplementary = Dataset(short_name="areacella") - dataset.supplementaries = [supplementary] - - input_dataset = Dataset(short_name="tas") - file1 = mocker.Mock() - file1.facets = {"version": "v1"} - file2 = mocker.Mock() - file2.facets = {"version": "v2"} - input_dataset.files = [file1, file2] - - file3 = mocker.Mock() - file3.facets = {"version": "v3"} - supplementary.files = [file3] - - _recipe._set_version(dataset, [input_dataset]) - print(dataset) - assert dataset.facets["version"] == ["v1", "v2"] - assert dataset.supplementaries[0].facets["version"] == "v3" - - def test_extract_preprocessor_order(): profile = { "custom_order": True, @@ -956,3 +934,23 @@ def test_special_name_to_dataset_invalid_special_name_type(): ) with pytest.raises(RecipeError, match=msg): _recipe._special_name_to_dataset(facets, "reference_dataset") + + +def test_fix_cmip5_fx_ensemble(monkeypatch): + def find_files(self): + if self.facets["ensemble"] == "r0i0p0": + self._files = ["file1.nc"] + + monkeypatch.setattr(Dataset, "find_files", find_files) + + dataset = Dataset( + dataset="dataset1", + short_name="orog", + mip="fx", + project="CMIP5", + ensemble="r1i1p1", + ) + + _recipe._fix_cmip5_fx_ensemble(dataset) + + assert dataset["ensemble"] == "r0i0p0" diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 6e081c8fc3..443ec9b80a 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import textwrap from pathlib import Path from typing import TYPE_CHECKING @@ -284,7 +285,7 @@ def test_merge_supplementaries_missing_short_name_fails(session): Dataset.from_recipe(recipe_txt, session) -def test_get_input_datasets_derive(session): +def test_get_required_datasets_derive(session): dataset = Dataset( dataset="ERA5", project="native6", @@ -299,7 +300,7 @@ def test_get_input_datasets_derive(session): type="reanaly", version="v1", ) - rlds, rlns = to_datasets._get_input_datasets(dataset) + rlds, rlns = to_datasets._get_required_datasets(dataset) assert rlds["short_name"] == "rlds" assert rlds["long_name"] == "Surface Downwelling Longwave Radiation" assert rlds["frequency"] == "1hr" @@ -308,6 +309,57 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" +def test_get_required_datasets_optional(caplog, tmp_path, session): + facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "SImon", + "short_name": "siextent", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", + "derive": True, + } + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + sic_file = LocalFile( + input_dir / "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + sic_file.touch() + + dataset = Dataset(**facets) + dataset.files = [] + dataset.session = session + + with 
caplog.at_level(logging.INFO): + datasets = to_datasets._get_required_datasets(dataset) + + expected = Dataset( + dataset="SAT", + project="OBS6", + mip="SImon", + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + optional="true", + original_short_name="siconca", + standard_name="sea_ice_area_fraction", + tier=2, + timerange="1980/2000", + type="sat", + units="%", + ) + expected.session = session + + assert datasets == [expected] + + logger_infos = [r.message for r in caplog.records if r.levelname == "INFO"] + assert "which is marked as 'optional'" in logger_infos[-1] + + def test_max_years(session): recipe_txt = textwrap.dedent(""" diagnostics: @@ -355,26 +407,6 @@ def from_files(_): to_datasets._dataset_from_files(dataset) -def test_fix_cmip5_fx_ensemble(monkeypatch): - def find_files(self): - if self.facets["ensemble"] == "r0i0p0": - self._files = ["file1.nc"] - - monkeypatch.setattr(Dataset, "find_files", find_files) - - dataset = Dataset( - dataset="dataset1", - short_name="orog", - mip="fx", - project="CMIP5", - ensemble="r1i1p1", - ) - - to_datasets._fix_cmip5_fx_ensemble(dataset) - - assert dataset["ensemble"] == "r0i0p0" - - def test_get_supplementary_short_names(monkeypatch): def _update_cmor_facets(facets): facets["modeling_realm"] = "atmos" diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index a4fc0b527d..e8cd1ca67a 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,15 +1,13 @@ from __future__ import annotations -import importlib.resources +import logging import textwrap from collections import defaultdict -from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING import pyesgf import pytest -import yaml import esmvalcore.dataset import esmvalcore.io.esgf @@ -24,45 +22,6 @@ from esmvalcore.typing import Facets -@lru_cache -def _load_default_data_sources() -> dict[ - str, - dict[str, dict[str, dict[str, dict[str, str]]]], -]: - """Load default data sources for local users.""" - cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { - "projects": {}, - } - for file in ( - "data-local.yml", - "data-local-esmvaltool.yml", - "data-native-cesm.yml", - "data-native-emac.yml", - "data-native-icon.yml", - "data-native-ipslcm.yml", - ): - with importlib.resources.as_file( - importlib.resources.files(esmvalcore.config) - / "configurations" - / file, - ) as config_file: - content = config_file.read_text(encoding="utf-8") - cfg["projects"].update(yaml.safe_load(content)["projects"]) - return cfg - - -@pytest.fixture -def session(tmp_path: Path, session: Session) -> Session: - """Session fixture with default local data sources.""" - projects = _load_default_data_sources()["projects"] - for project in projects: - data_sources = projects[project]["data"] - for data_source in data_sources.values(): - data_source["rootpath"] = str(tmp_path) - session["projects"][project]["data"] = data_sources - return session - - def test_repr(): ds = Dataset(short_name="tas", dataset="dataset1") @@ -1231,6 +1190,718 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session): assert datasets == [expected] +OBS6_SAT_FACETS: Facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + +def test_from_files_no_files_glob(session): + dataset = Dataset(**{**OBS6_SAT_FACETS, "type": "*"}, short_name="tas") + datasets = 
list(dataset.from_files()) + assert datasets == [dataset] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_files_glob(timerange, session): + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="lwcre", + derive=True, + ) + datasets = list(dataset.from_files()) + assert datasets == [dataset] + + +@pytest.fixture +def lwcre_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + +@pytest.fixture +def lwcre_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + +@pytest.fixture +def rlut_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlut_file_future(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_2100-2101.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlut_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlutcs_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlutcs = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlutcs_1980-2000.nc", + ) + rlutcs.touch() + return rlutcs + + +@pytest.fixture +def pr_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + pr = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_pr_1980-2000.nc", + ) + pr.touch() + return pr + + +def test_from_files_with_derived_no_derivation(lwcre_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_required_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ) + expected_required_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_required_dataset.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == 
[expected_required_dataset] + assert required_datasets[0].files == [lwcre_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_derivation_glob( + timerange, + lwcre_file, + lwcre_file_ground, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + ), + Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True), + ] + for expected_ds in expected_datasets: + expected_ds.add_supplementary(short_name="pr", type="sat") + expected_ds.session = session + + assert datasets == expected_datasets + assert datasets[0].files == [lwcre_file_ground] + assert datasets[0].supplementaries[0].files == [pr_file] + assert datasets[1].files == [lwcre_file] + assert datasets[1].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + for dataset, expected in zip( + datasets, + expected_required_datasets, + strict=True, + ): + assert dataset.required_datasets == [expected] + assert datasets[0].required_datasets[0].files == [lwcre_file_ground] + assert datasets[1].required_datasets[0].files == [lwcre_file] + + +def test_from_files_with_derived(rlut_file, rlutcs_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [] + + expected_required_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in 
expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+def test_from_files_with_derived_unavailable_years(
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` with derived variable and unavailable years."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == []
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_glob(
+    timerange,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+    caplog,
+):
+    """Test `from_files` with derived variable, supplementary, and globs."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    with caplog.at_level(logging.DEBUG):
+        datasets = list(dataset.from_files())
+
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+    log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"]
+    msg = "Not all variables required to derive 'lwcre' are available"
+    for log_debug in log_debugs:
+        if msg in log_debug:
+            break
+    else:
+        pytest.fail(f"No debug message '{msg}'")
+
+
+def test_from_files_with_derived_no_force_derivation(
+    lwcre_file,
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` with derived variable and no forced derivation."""
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
+
+    expected_required_dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        frequency="mon",
+        long_name="TOA Longwave Cloud Radiative Effect",
+        modeling_realm=["atmos"],
+        original_short_name="lwcre",
+        standard_name="",
+        units="W m-2",
+    )
+    expected_required_dataset.supplementaries = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    expected_required_dataset.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == [expected_required_dataset]
+    assert required_datasets[0].files == [lwcre_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_no_force_derivation_glob(  # noqa: PLR0913
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs and no forced derivation."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+        ),
+        Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True),
+    ]
+    for expected_ds in expected_datasets:
+        expected_ds.add_supplementary(short_name="pr", type="sat")
+        expected_ds.session = session
+
+    assert datasets == expected_datasets
+    assert datasets[0].files == [lwcre_file_ground]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+    assert datasets[1].files == [lwcre_file]
+    assert datasets[1].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.supplementaries = [
+            Dataset(
+                **OBS6_SAT_FACETS,
+                short_name="pr",
+                derive=False,
+                frequency="mon",
+                long_name="Precipitation",
+                modeling_realm=["atmos"],
+                original_short_name="pr",
+                standard_name="precipitation_flux",
+                units="kg m-2 s-1",
+            ),
+        ]
+        expected_ds.session = session
+
+    for dataset, expected in zip(
+        datasets,
+        expected_required_datasets,
+        strict=True,
+    ):
+        assert dataset.required_datasets == [expected]
+    assert datasets[0].required_datasets[0].files == [lwcre_file_ground]
+    assert datasets[1].required_datasets[0].files == [lwcre_file]
+
+
+def test_from_files_with_derived_force_derivation(
+    lwcre_file,
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` with derived variable and forced derivation."""
+    dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_force_derivation_glob(  # noqa: PLR0913
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+    caplog,
+):
+    """Test `from_files` with globs and forced derivation."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    with caplog.at_level(logging.DEBUG):
+        datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+ standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] + + log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] + msg = "Not all variables required to derive 'lwcre' are available" + for log_debug in log_debugs: + if msg in log_debug: + break + else: + pytest.fail(f"No debug message '{msg}'") + + def test_match(): dataset1 = Dataset( short_name="areacella", @@ -1614,7 +2285,7 @@ def test_find_files_outdated_local(mocker, dataset): assert dataset.files == esgf_files -def test_set_version(): +def test_set_version_non_derived_var(): dataset = Dataset(short_name="tas") dataset.add_supplementary(short_name="areacella") file_v1 = esmvalcore.io.local.LocalFile("/path/to/v1/tas.nc") @@ -1630,6 +2301,47 @@ def test_set_version(): assert dataset.supplementaries[0].facets["version"] == "v3" +def test_set_version_derived_var(monkeypatch, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="areacella") + dataset.files = [] + areacella_file = esmvalcore.local.LocalFile("/path/to/areacella.nc") + areacella_file.facets["version"] = "v4" + dataset.supplementaries[0].files = [areacella_file] + + def _get_required_datasets(): + rlut_file = esmvalcore.local.LocalFile("/path/to/rlut.nc") + rlut_file.facets["version"] = "v1" + rlut_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + ) + rlut_dataset.files = [rlut_file] + rlutcs_file_1 = esmvalcore.local.LocalFile("/path/to/rlutcs_1.nc") + rlutcs_file_2 = esmvalcore.local.LocalFile("/path/to/rlutcs_2.nc") + rlutcs_file_1.facets["version"] = "v2" + rlutcs_file_2.facets["version"] = "v3" + rlutcs_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + ) + rlutcs_dataset.files = [rlutcs_file_1, rlutcs_file_2] + return [rlut_dataset, rlutcs_dataset] + + monkeypatch.setattr( + dataset, + "_get_required_datasets", + _get_required_datasets, + ) + + dataset.set_version() + + assert dataset.facets["version"] == ["v1", "v2", "v3"] + assert dataset.supplementaries[0].facets["version"] == "v4" + + @pytest.mark.parametrize("timerange", ["*", "185001/*", "*/185112"]) def test_update_timerange_from_esgf(mocker, timerange): esgf_files = [ @@ -1701,9 +2413,8 @@ def test_update_timerange_no_files(session, search_data): } dataset = Dataset(**variable) dataset.files = [] - msg = r"Missing data for Dataset: tas, Amon, CMIP6, HadGEM3-GC31-LL.*" - with pytest.raises(InputFilesNotFound, match=msg): - dataset._update_timerange() + dataset._update_timerange() + assert "timerange" not in dataset.facets def test_update_timerange_typeerror(): @@ -2142,16 +2853,6 @@ def test_get_extra_facets_native6(): } -OBS6_SAT_FACETS: Facets = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "sat", - "timerange": "1980/2000", -} - - def test_is_derived_no_derivation(): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") assert dataset._is_derived() is False @@ -2204,6 +2905,15 @@ def test_derivation_necessary_no_force_derivation_no_files( assert dataset._derivation_necessary() is True +def test_derivation_necessary_no_force_derivation_no_files_glob(session): + dataset = Dataset( + **{**OBS6_SAT_FACETS, 
"timerange": "*"}, + short_name="lwcre", + derive=True, + ) + assert dataset._derivation_necessary() is True + + def test_derivation_necessary_no_force_derivation(tmp_path, session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.session = session @@ -2277,3 +2987,67 @@ def test_add_derived_supplementary_to_derived(): force_derivation=True, ) assert dataset.supplementaries[0] == expected_supplementary + + +def test_required_datasets_derivation(session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + + expected_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_dataset in expected_datasets: + expected_dataset.session = dataset.session + + assert dataset.required_datasets == expected_datasets + + +def test_required_datasets_no_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + dataset.add_supplementary(short_name="pr") + + assert dataset.required_datasets == [dataset] + + +def test_required_datasets_no_force_derivation(tmp_path, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre_file.touch() + + assert dataset.required_datasets == [dataset] + + +def test_required_datasets_no_derivation_available(session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True) + + msg = r"Cannot derive variable 'tas': no derivation script available" + with pytest.raises(NotImplementedError, match=msg): + dataset.required_datasets # noqa: B018