diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 1aae4b6aef..b630994682 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -37,7 +37,6 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -504,20 +503,6 @@ def valid_time_selection(timerange: str) -> None: _check_timerange_values(_format_years(date), timerange_list) -def differing_timeranges( - timeranges: set[str], - required_vars: list[Facets], -) -> None: - """Log error if required variables have differing timeranges.""" - if len(timeranges) > 1: - msg = ( - f"Differing timeranges with values {timeranges} " - f"found for required variables {required_vars}. " - "Set `timerange` to a common value." - ) - raise ValueError(msg) - - def _check_literal( settings: dict, *, diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 38f48fc663..d54d35fba5 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -52,8 +52,7 @@ from . import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _derive_needed, - _get_input_datasets, + _get_required_datasets, _representative_datasets, ) @@ -251,7 +250,7 @@ def _get_default_settings(dataset: Dataset) -> PreprocessorSettings: settings = {} - if _derive_needed(dataset): + if dataset._derivation_necessary(): # noqa: SLF001 (will be replaced soon) settings["derive"] = { "short_name": facets["short_name"], "standard_name": facets["standard_name"], @@ -622,21 +621,26 @@ def _allow_skipping(dataset: Dataset) -> bool: ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: - """Set the 'version' facet based on derivation input datasets.""" - versions = set() - for in_dataset in input_datasets: - in_dataset.set_version() - if version := in_dataset.facets.get("version"): - if isinstance(version, list): - versions.update(version) - else: - versions.add(version) - if versions: - version = versions.pop() if len(versions) == 1 else sorted(versions) - dataset.set_facet("version", version) - for supplementary_ds in dataset.supplementaries: - supplementary_ds.set_version() +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: + """Automatically correct the wrong ensemble for CMIP5 fx variables.""" + if ( + dataset.facets.get("project") == "CMIP5" + and dataset.facets.get("mip") == "fx" + and dataset.facets.get("ensemble") != "r0i0p0" + and not dataset.files + ): + original_ensemble = dataset["ensemble"] + copy = dataset.copy() + copy.facets["ensemble"] = "r0i0p0" + if copy.files: + dataset.facets["ensemble"] = "r0i0p0" + logger.info( + "Corrected wrong 'ensemble' from '%s' to '%s' for %s", + original_ensemble, + dataset["ensemble"], + dataset.summary(shorten=True), + ) + dataset.find_files() def _get_preprocessor_products( @@ -662,28 +666,29 @@ def _get_preprocessor_products( settings = _get_default_settings(dataset) _apply_preprocessor_profile(settings, profile) _update_multi_dataset_settings(dataset.facets, settings) + _fix_cmip5_fx_ensemble(dataset) _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) - input_datasets = _get_input_datasets(dataset) - missing = _check_input_files(input_datasets) + required_datasets = _get_required_datasets(dataset) + missing = _check_input_files(required_datasets) if missing: if _allow_skipping(dataset): 
logger.info("Skipping: %s", missing) else: missing_vars.update(missing) continue - _set_version(dataset, input_datasets) + dataset.set_version() USED_DATASETS.append(dataset) - _schedule_for_download(input_datasets) - _log_input_files(input_datasets) + _schedule_for_download(required_datasets) + _log_input_files(required_datasets) logger.info("Found input files for %s", dataset.summary(shorten=True)) filename = _get_preprocessor_filename(dataset) product = PreprocessorFile( filename=filename, attributes=dataset.facets, settings=settings, - datasets=input_datasets, + datasets=required_datasets, ) products.add(product) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 454947915a..e992c767f8 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -13,7 +13,6 @@ from esmvalcore.exceptions import RecipeError from esmvalcore.io.esgf.facets import FACETS from esmvalcore.io.local import _replace_years_with_timerange -from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( PREPROCESSOR_SUPPLEMENTARIES, @@ -189,28 +188,6 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: - """Automatically correct the wrong ensemble for CMIP5 fx variables.""" - if ( - dataset.facets.get("project") == "CMIP5" - and dataset.facets.get("mip") == "fx" - and dataset.facets.get("ensemble") != "r0i0p0" - and not dataset.files - ): - original_ensemble = dataset["ensemble"] - copy = dataset.copy() - copy.facets["ensemble"] = "r0i0p0" - if copy.files: - dataset.facets["ensemble"] = "r0i0p0" - logger.info( - "Corrected wrong 'ensemble' from '%s' to '%s' for %s", - original_ensemble, - dataset["ensemble"], - dataset.summary(shorten=True), - ) - dataset.find_files() - - def _get_supplementary_short_names( facets: Facets, step: str, @@ -431,9 +408,7 @@ def datasets_from_recipe( return datasets -def _dataset_from_files( # noqa: C901 - dataset: Dataset, -) -> list[Dataset]: +def _dataset_from_files(dataset: Dataset) -> list[Dataset]: """Replace facet values of '*' based on available files.""" result: list[Dataset] = [] errors: list[str] = [] @@ -444,53 +419,32 @@ def _dataset_from_files( # noqa: C901 dataset.summary(shorten=True), ) - representative_datasets = _representative_datasets(dataset) - - # For derived variables, representative_datasets might contain more than - # one element - all_datasets: list[list[tuple[dict, Dataset]]] = [] - for representative_dataset in representative_datasets: - all_datasets.append([]) - for expanded_ds in representative_dataset.from_files(): - updated_facets = {} - unexpanded_globs = {} - for key, value in dataset.facets.items(): - if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: - unexpanded_globs[key] = value - - if unexpanded_globs: - msg = _report_unexpanded_globs( - dataset, - expanded_ds, - unexpanded_globs, - ) - errors.append(msg) - continue + for expanded_ds in dataset.from_files(): + updated_facets = {} + unexpanded_globs = {} + for key, value in dataset.facets.items(): + if _isglob(value): + if key in expanded_ds.facets and not _isglob( + expanded_ds[key], + ): + updated_facets[key] = expanded_ds.facets[key] + else: + unexpanded_globs[key] = value + + if unexpanded_globs: + msg = _report_unexpanded_globs( + dataset, + expanded_ds, + 
unexpanded_globs, + ) + errors.append(msg) + continue - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries + new_ds = dataset.copy() + new_ds.facets.update(updated_facets) + new_ds.supplementaries = expanded_ds.supplementaries - all_datasets[-1].append((updated_facets, new_ds)) - - # If globs have been expanded, only consider those datasets that contain - # all necessary input variables if derivation is necessary - for updated_facets, new_ds in all_datasets[0]: - other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] - if all(updated_facets in facets for facets in other_facets): - result.append(new_ds) - else: - logger.debug( - "Not all necessary input variables to derive '%s' are " - "available for dataset %s", - dataset["short_name"], - updated_facets, - ) + result.append(new_ds) if errors: raise RecipeError("\n".join(errors)) @@ -535,66 +489,33 @@ def _report_unexpanded_globs( return msg -def _derive_needed(dataset: Dataset) -> bool: - """Check if dataset needs to be derived from other datasets.""" - if not dataset.facets.get("derive"): - return False - if dataset.facets.get("force_derivation"): - return True - if _isglob(dataset.facets.get("timerange", "")): - # Our file finding routines are not able to handle globs. - dataset = dataset.copy() - dataset.facets.pop("timerange") - - copy = dataset.copy() - copy.supplementaries = [] - return not copy.files - - -def _get_input_datasets(dataset: Dataset) -> list[Dataset]: - """Determine the input datasets needed for deriving `dataset`.""" - facets = dataset.facets - if not _derive_needed(dataset): - _fix_cmip5_fx_ensemble(dataset) - return [dataset] +def _get_required_datasets(dataset: Dataset) -> list[Dataset]: + """Determine the datasets required for deriving `dataset`.""" + if not dataset._derivation_necessary(): # noqa: SLF001 + return dataset.required_datasets - # Configure input datasets needed to derive variable - datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) # type: ignore - # idea: add option to specify facets in list of dicts that is value of - # 'derive' in the recipe and use that instead of get_required? - for input_facets in required_vars: - input_dataset = dataset.copy() - keep = {"alias", "recipe_dataset_index", *dataset.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep - } - input_dataset.facets.update(input_facets) - input_dataset.augment_facets() - _fix_cmip5_fx_ensemble(input_dataset) - if input_facets.get("optional") and not input_dataset.files: + # Skip optional datasets if no data is available + required_datasets: list[Dataset] = [] + for required_dataset in dataset.required_datasets: + if ( + required_dataset.facets.get("optional") + and not required_dataset.files + ): logger.info( "Skipping: no data found for %s which is marked as 'optional'", - input_dataset, + required_dataset, ) else: - datasets.append(input_dataset) + required_datasets.append(required_dataset) - # Check timeranges of available input data. 
- timeranges: set[str] = set() - for input_dataset in datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) # type: ignore - check.differing_timeranges(timeranges, required_vars) - - return datasets + return required_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: """Find representative datasets for all input variables.""" copy = dataset.copy() copy.supplementaries = [] - representative_datasets = _get_input_datasets(copy) + representative_datasets = _get_required_datasets(copy) for representative_dataset in representative_datasets: representative_dataset.supplementaries = dataset.supplementaries return representative_datasets diff --git a/esmvalcore/config/_validated_config.py b/esmvalcore/config/_validated_config.py index 65d42cc5e7..7a5af93224 100644 --- a/esmvalcore/config/_validated_config.py +++ b/esmvalcore/config/_validated_config.py @@ -60,7 +60,7 @@ class ValidatedConfig(MutableMapping): """ # validate values on the way in - def __init__(self, *args, **kwargs): + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__() self._mapping: dict[str, Any] = {} self.update(*args, **kwargs) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 6be9687a15..9e03c3ddbf 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -29,6 +29,7 @@ from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.io.local import _dates_to_timerange from esmvalcore.preprocessor import _get_preprocessor_filename, preprocess +from esmvalcore.preprocessor._derive import get_required if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Sequence @@ -99,7 +100,7 @@ class Dataset: Attributes ---------- - supplementaries : list[Dataset] + supplementaries: list[Dataset] List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. @@ -129,6 +130,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[DataElement] | None = None self._used_data_sources: Sequence[DataSource] = [] + self._required_datasets: list[Dataset] | None = None for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -185,87 +187,209 @@ def _derivation_necessary(self) -> bool: # are found ds_copy = self.copy() ds_copy.supplementaries = [] + + # Avoid potential errors from missing data during timerange glob + # expansion + if _isglob(ds_copy.facets.get("timerange", "")): + ds_copy.facets.pop("timerange", None) + return not ds_copy.files + def _get_required_datasets(self) -> list[Dataset]: + """Get required datasets for derivation.""" + required_datasets: list[Dataset] = [] + required_vars_facets = get_required( + self.facets["short_name"], # type: ignore + self.facets["project"], # type: ignore + ) + + for required_facets in required_vars_facets: + required_dataset = self._copy(derive=False, force_derivation=False) + keep = {"alias", "recipe_dataset_index", *self.minimal_facets} + required_dataset.facets = { + k: v for k, v in required_dataset.facets.items() if k in keep + } + required_dataset.facets.update(required_facets) + required_dataset.augment_facets() + required_datasets.append(required_dataset) + + return required_datasets + + @property + def required_datasets(self) -> list[Dataset]: + """Get required datasets. + + For non-derived variables (i.e., those with facet ``derive=False``), + this will simply return the dataset itself in a list. 
+ + For derived variables (i.e., those with facet ``derive=True``), this + will return the datasets required for derivation if derivation is + necessary, and the dataset itself if derivation is not necessary. + Derivation is necessary if the facet ``force_derivation=True`` is set + or no files for the dataset itself are available. + + See also :func:`esmvalcore.preprocessor.derive` for an example usage. + + """ + if self._required_datasets is not None: + return self._required_datasets + + if not self._derivation_necessary(): + self._required_datasets = [self] + else: + self._required_datasets = self._get_required_datasets() + + return self._required_datasets + + @staticmethod def _file_to_dataset( - self, + dataset: Dataset, file: DataElement, ) -> Dataset: """Create a dataset from a file with a `facets` attribute.""" facets = dict(file.facets) - if "version" not in self.facets: + if "version" not in dataset.facets: # Remove version facet if no specific version requested facets.pop("version", None) updated_facets = { f: v for f, v in facets.items() - if f in self.facets - and _isglob(self.facets[f]) - and _ismatch(v, self.facets[f]) + if f in dataset.facets + and _isglob(dataset.facets[f]) + and _ismatch(v, dataset.facets[f]) } - dataset = self.copy() - dataset.facets.update(updated_facets) + new_dataset = dataset.copy() + new_dataset.facets.update(updated_facets) # If possible, remove unexpanded facets that can be automatically # populated. - unexpanded = {f for f, v in dataset.facets.items() if _isglob(v)} + unexpanded = {f for f, v in new_dataset.facets.items() if _isglob(v)} required_for_augment = {"project", "mip", "short_name", "dataset"} if unexpanded and not unexpanded & required_for_augment: - copy = dataset.copy() + copy = new_dataset.copy() copy.supplementaries = [] for facet in unexpanded: copy.facets.pop(facet) copy.augment_facets() for facet in unexpanded: if facet in copy.facets: - dataset.facets.pop(facet) + new_dataset.facets.pop(facet) - return dataset + return new_dataset - def _get_available_datasets(self) -> Iterator[Dataset]: + @staticmethod + def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: C901 + """Yield datasets based on the available files. + + This function requires that dataset.facets['mip'] is not a glob + pattern. + + Does take variable derivation into account, i.e., datasets available + through variable derivation are returned. 
+
+        """
+        datasets_found = False
+
+        # If no forced derivation is requested, search for datasets based on
+        # files from dataset
+        if not dataset._is_force_derived():
+            for available_ds in Dataset._get_available_datasets(dataset):
+                datasets_found = True
+                yield available_ds
+
+        # For variables that cannot be derived, we are done here
+        if not dataset._is_derived():
+            return
+
+        # If forced derivation is requested or no datasets based on files from
+        # dataset have been found, search for datasets based on files from
+        # required datasets
+        if dataset._is_force_derived() or not datasets_found:
+            all_datasets: list[list[tuple[dict, Dataset]]] = []
+            for required_dataset in dataset.required_datasets:
+                all_datasets.append([])
+                for expanded_ds in Dataset._get_available_datasets(
+                    required_dataset,
+                ):
+                    updated_facets = {}
+                    for key, value in dataset.facets.items():
+                        if _isglob(value):
+                            if key in expanded_ds.facets and not _isglob(
+                                expanded_ds[key],
+                            ):
+                                updated_facets[key] = expanded_ds.facets[key]
+                    new_ds = dataset.copy()
+                    new_ds.facets.update(updated_facets)
+                    new_ds.supplementaries = dataset.supplementaries
+
+                    all_datasets[-1].append((updated_facets, new_ds))
+
+            # Only consider those datasets that contain all required variables
+            # with identical facets (e.g., skip those with different
+            # timeranges)
+            for updated_facets, new_ds in all_datasets[0]:
+                other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]]
+                if all(updated_facets in facets for facets in other_facets):
+                    yield new_ds
+                else:
+                    logger.debug(
+                        "Not all variables required to derive '%s' are "
+                        "available for %s with facets %s",
+                        dataset["short_name"],
+                        new_ds.summary(shorten=True),
+                        updated_facets,
+                    )
+
+    @staticmethod
+    def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]:
         """Yield datasets based on the available files.
 
-        This function requires that self.facets['mip'] is not a glob
+        This function requires that dataset.facets['mip'] is not a glob
         pattern.
+
+        Does not take variable derivation into account, i.e., datasets
+        potentially available through variable derivation are ignored. To
+        consider derived variables properly, use the function
+        :func:`_get_all_available_datasets`.
+
         """
-        dataset_template = self.copy()
+        dataset_template = dataset.copy()
         dataset_template.supplementaries = []
         seen = set()
         partially_defined = []
         expanded = False
         for file in dataset_template.files:
-            dataset = self._file_to_dataset(file)
-            # Do not use the timerange facet from the file because there may be multiple
-            # files per dataset.
-            dataset.facets.pop("timerange", None)
+            new_dataset = Dataset._file_to_dataset(dataset, file)
+            # Do not use the timerange facet from the file because there may be
+            # multiple files per dataset.
+            new_dataset.facets.pop("timerange", None)
             # Restore the original timerange facet if it was specified.
-            if "timerange" in self.facets:
-                dataset.facets["timerange"] = self.facets["timerange"]
+            if "timerange" in dataset.facets:
+                new_dataset.facets["timerange"] = dataset.facets["timerange"]
 
             # Filter out identical datasets
             facetset = frozenset(
                 (f, frozenset(v) if isinstance(v, list) else v)
-                for f, v in dataset.facets.items()
+                for f, v in new_dataset.facets.items()
             )
             if facetset not in seen:
                 seen.add(facetset)
                 if any(
                     _isglob(v)
-                    for f, v in dataset.facets.items()
+                    for f, v in new_dataset.facets.items()
                     if f != "timerange"
                 ):
-                    partially_defined.append((dataset, file))
+                    partially_defined.append((new_dataset, file))
                 else:
-                    dataset._update_timerange()  # noqa: SLF001
-                    dataset._supplementaries_from_files()  # noqa: SLF001
+                    new_dataset._update_timerange()  # noqa: SLF001
                     expanded = True
-                    yield dataset
+                    yield new_dataset
 
         # Only yield datasets with globs if there is no better alternative
-        for dataset, file in partially_defined:
+        for new_dataset, file in partially_defined:
             msg = (
-                f"{dataset} with unexpanded wildcards, created from file "
+                f"{new_dataset} with unexpanded wildcards, created from file "
                 f"{file} with facets {file.facets}. Please check why "
-                "the missing facets are not available for the file."
+                "the missing facets are not available for the file. "
                 "This will depend on the data source they come from, e.g. can "
@@ -280,7 +404,7 @@ def _get_available_datasets(self) -> Iterator[Dataset]:
                 "because it still contains wildcards.",
                 msg,
             )
-            yield dataset
+            yield new_dataset
 
     def from_files(self) -> Iterator[Dataset]:
         """Create datasets based on the available files.
@@ -304,6 +428,10 @@ def from_files(self) -> Iterator[Dataset]:
-        Supplementary datasets will in inherit the facet values from the main
+        Supplementary datasets will inherit the facet values from the main
         dataset for those facets listed in :obj:`INHERITED_FACETS`.
 
+        This also works for :ref:`derived variables `. The
+        datasets required for derivation can be accessed via
+        :attr:`Dataset.required_datasets`.
+
         Examples
         --------
         See :doc:`/notebooks/discovering-data` notebook for example use cases.
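+
+        A minimal sketch (the facet values are illustrative; the result
+        depends on the data available locally or on ESGF, hence the doctest
+        is skipped):
+
+        >>> from esmvalcore.dataset import Dataset
+        >>> tas = Dataset(
+        ...     short_name="tas",
+        ...     mip="Amon",
+        ...     project="CMIP6",
+        ...     exp="historical",
+        ...     dataset="*",
+        ...     institute="*",
+        ...     ensemble="r1i1p1f1",
+        ...     grid="gn",
+        ... )
+        >>> datasets = list(tas.from_files())  # doctest: +SKIP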
@@ -330,7 +458,10 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) - for dataset in dataset_template._get_available_datasets(): # noqa: SLF001 + for dataset in self._get_all_available_datasets( + dataset_template, + ): + dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield dataset @@ -605,15 +736,29 @@ def minimal_facets(self) -> Facets: """Return a dictionary with the persistent facets.""" return {k: v for k, v in self.facets.items() if k in self._persist} + @staticmethod + def _get_version(dataset: Dataset) -> str | list[str]: + """Get available version(s) of dataset.""" + versions: set[str] = set() + for file in dataset.files: + if "version" in file.facets: + versions.add(str(file.facets["version"])) + return versions.pop() if len(versions) == 1 else sorted(versions) + def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for file in self.files: - if "version" in file.facets: - versions.add(file.facets["version"]) # type: ignore + for required_dataset in self.required_datasets: + version = self._get_version(required_dataset) + if version: + if isinstance(version, list): + versions.update(version) + else: + versions.add(version) version = versions.pop() if len(versions) == 1 else sorted(versions) if version: self.set_facet("version", version) + for supplementary_ds in self.supplementaries: supplementary_ds.set_version() @@ -972,8 +1117,9 @@ def _update_timerange(self) -> None: dataset = self.copy() dataset.facets.pop("timerange") dataset.supplementaries = [] - check.data_availability(dataset) - if all("timerange" in f.facets for f in dataset.files): + if dataset.files and all( + "timerange" in f.facets for f in dataset.files + ): # "timerange" can only be reliably computed when all DataElements # provide it. intervals = [ diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index 3a25d1f9a4..3817b86bc7 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -13,6 +13,8 @@ from esmvalcore.preprocessor._units import convert_units if TYPE_CHECKING: + from collections.abc import Sequence + from cf_units import Unit from iris.cube import Cube @@ -77,7 +79,7 @@ def get_required(short_name: str, project: str) -> list[Facets]: def derive( - cubes: CubeList, + cubes: Sequence[Cube], short_name: str, long_name: str, units: str | Unit, @@ -88,8 +90,7 @@ def derive( Parameters ---------- cubes: - Includes all the needed variables for derivation defined in - :func:`get_required`. + Includes all the needed variables for derivation. short_name: short_name long_name: @@ -103,6 +104,38 @@ def derive( ------- iris.cube.Cube The new derived variable. + + Examples + -------- + Required variables for derivation can be obtained via + :attr:`esmvalcore.dataset.Dataset.required_datasets`. + + For example, to derive the longwave cloud radiative effect (LWCRE) for the + model CESM2, you can use: + + >>> from esmvalcore.dataset import Dataset + >>> from esmvalcore.preprocessor import derive + >>> dataset = Dataset( + ... project="CMIP6", + ... dataset="CESM2", + ... exp="historical", + ... ensemble="r1i1p1f1", + ... grid="gn", + ... timerange="2000/2014", + ... short_name="lwcre", + ... mip="Amon", + ... derive=True, + ... ) + >>> cubes = [d.load() for d in dataset.required_datasets] + >>> cube = derive( + ... cubes, + ... short_name="lwcre", + ... 
long_name="TOA Longwave Cloud Radiative Effect", + ... units="W m-2", + ... ) + >>> print(cube.var_name) + lwcre # doctest: +SKIP + """ if short_name == cubes[0].var_name: return cubes[0] diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index d6c9001ef2..581e8ca249 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, "outputs": [], @@ -89,7 +89,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 778 datasets, showing the first 10:\n" + "Found 727 datasets, showing the first 10:\n" ] }, { @@ -168,20 +168,20 @@ " 'grid': 'gn',\n", " 'institute': 'AWI'},\n", " Dataset:\n", - " {'dataset': 'BCC-CSM2-MR',\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", - " 'institute': 'BCC'},\n", + " 'institute': 'AWI'},\n", " Dataset:\n", " {'dataset': 'BCC-CSM2-MR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", - " 'ensemble': 'r2i1p1f1',\n", + " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", " 'institute': 'BCC'}]" @@ -253,7 +253,7 @@ { "data": { "text/plain": [ - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf-data1.llnl.gov', 'esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf-data04.diasjp.net', 'esgf.nci.org.au', 'esgf3.dkrz.de']]" + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]" ] }, "execution_count": 6, @@ -282,7 +282,7 @@ { "data": { "text/plain": [ - "LocalFile('~/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" + "LocalFile('/home/manuel/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" ] }, "execution_count": 7, @@ -312,6 +312,235 @@ "source": [ "download(dataset.files, CFG[\"download_dir\"])" ] + }, + { + "cell_type": "markdown", + "id": "d3006d90", + "metadata": {}, + "source": [ + "`Dataset.from_files` can also handle derived variables properly:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b75314e3", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_template = Dataset(\n", + " short_name=\"lwcre\",\n", + " mip=\"Amon\",\n", + " project=\"CMIP6\",\n", + " exp=\"historical\",\n", + " dataset=\"*\",\n", + " institute=\"*\",\n", + " ensemble=\"r1i1p1f1\",\n", + " grid=\"gn\",\n", + " derive=True,\n", + " force_derivation=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b87c247f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 36 datasets, showing the first 10:\n" + ] + }, + { + "data": { + "text/plain": [ + "[Dataset:\n", + " {'dataset': 'TaiESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 
'grid': 'gn',\n", + " 'institute': 'AS-RCEC'},\n", + " Dataset:\n", + " {'dataset': 'AWI-CM-1-1-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-1-LR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'BCC-CSM2-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'BCC-ESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'CAMS-CSM1-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAMS'},\n", + " Dataset:\n", + " {'dataset': 'CAS-ESM2-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'FGOALS-g3',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'IITM-ESM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CCCR-IITM'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets = list(dataset_template.from_files())\n", + "print(f\"Found {len(datasets)} datasets, showing the first 10:\")\n", + "datasets[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f00a886f", + "metadata": {}, + "source": [ + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets may be empty. In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c5edfa65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = datasets[0]\n", + "dataset.files" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "97cdf12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rlut\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlut/gn/v20200623/rlut_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]\n", + "rlutcs\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlutcs/gn/v20200623/rlutcs_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de']]\n" + ] + } + ], + "source": [ + "for d in dataset.input_datasets:\n", + " print(d[\"short_name\"])\n", + " print(d.files)" + ] } ], "metadata": { diff --git a/tests/conftest.py b/tests/conftest.py index 46cabf58f9..3c19e4c4df 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib import warnings from copy import deepcopy from functools import lru_cache @@ -7,6 +8,7 @@ import numpy as np import pytest +import yaml from cf_units import Unit from iris.coords import ( AncillaryVariable, @@ -17,6 +19,7 @@ ) from iris.cube import Cube +import esmvalcore from esmvalcore.config import CFG, Config if TYPE_CHECKING: @@ -55,6 +58,33 @@ def ignore_existing_user_config( monkeypatch.setattr(CFG, "_mapping", cfg_default._mapping) +@lru_cache +def _load_default_data_sources() -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load default data sources for local users.""" + cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { + "projects": {}, + } + for file in ( + "data-local.yml", + "data-local-esmvaltool.yml", + "data-native-cesm.yml", + "data-native-emac.yml", + "data-native-icon.yml", + "data-native-ipslcm.yml", + ): + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / file, + ) as config_file: + content = config_file.read_text(encoding="utf-8") + cfg["projects"].update(yaml.safe_load(content)["projects"]) + return cfg + + @pytest.fixture def session( tmp_path: Path, @@ -63,7 +93,15 @@ def session( ) -> Session: """Session object with default settings.""" monkeypatch.setitem(CFG, "output_dir", tmp_path / "esmvaltool_output") - return CFG.start_session("recipe_test") + session = CFG.start_session("recipe_test") + projects = _load_default_data_sources()["projects"] + for project in projects: + print(project) + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(tmp_path) + session["projects"][project]["data"] = data_sources + return session @pytest.fixture diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py 
index a9cf809a92..54d0d0bc90 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -272,27 +272,6 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(caplog): - timeranges = set() - timeranges.add("1950/1951") - timeranges.add("1950/1952") - required_variables = [ - {"short_name": "rsdscs", "timerange": "1950/1951"}, - {"short_name": "rsuscs", "timerange": "1950/1952"}, - ] - with pytest.raises(ValueError) as exc: - check.differing_timeranges(timeranges, required_variables) - expected_log = ( - f"Differing timeranges with values {timeranges} " - "found for required variables " - "[{'short_name': 'rsdscs', 'timerange': '1950/1951'}, " - "{'short_name': 'rsuscs', 'timerange': '1950/1952'}]. " - "Set `timerange` to a common value." - ) - - assert expected_log in str(exc.value) - - def test_data_availability_nonexistent(tmp_path): var = { "dataset": "ABC", diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 6edc59b168..0a8fbc4f79 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -22,7 +22,7 @@ import esmvalcore.io.esgf import esmvalcore.io.local from esmvalcore._recipe.recipe import ( - _get_input_datasets, + _get_required_datasets, _representative_datasets, read_recipe_file, ) @@ -182,7 +182,7 @@ def get_required(short_name, _): ] monkeypatch.setattr( - esmvalcore._recipe.to_datasets, + esmvalcore.dataset, "get_required", get_required, ) @@ -1707,7 +1707,7 @@ def test_alias_generation(tmp_path, patched_datafinder, session): # noqa: C901, assert dataset["alias"] == "CORDEX_ICHEC-EC-EARTH" else: assert dataset["alias"] == "CORDEX_MIROC-MIROC5" - elif dataset["version"] == 1: + elif dataset["version"] == "1": assert dataset["alias"] == "OBS_1" else: assert dataset["alias"] == "OBS_2" @@ -2599,9 +2599,7 @@ def test_representative_dataset_derived_var( expected_facets: Facets = { # Already present in variable "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": force_derivation, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2611,6 +2609,9 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } if force_derivation: expected_datasets = [ @@ -2665,9 +2666,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsdscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2680,6 +2679,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } rsdscs = Dataset(**rsdscs_facets) rsdscs.session = session @@ -2689,9 +2691,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsuscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2704,11 +2704,14 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by 
_add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session - alb_derive_input = _get_input_datasets(alb) + alb_derive_input = _get_required_datasets(alb) assert alb_derive_input == [rsdscs, rsuscs] diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index 6ed350c34d..fd19bc21a3 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -865,28 +865,6 @@ def test_get_default_settings(mocker): } -def test_set_version(mocker): - dataset = Dataset(short_name="tas") - supplementary = Dataset(short_name="areacella") - dataset.supplementaries = [supplementary] - - input_dataset = Dataset(short_name="tas") - file1 = mocker.Mock() - file1.facets = {"version": "v1"} - file2 = mocker.Mock() - file2.facets = {"version": "v2"} - input_dataset.files = [file1, file2] - - file3 = mocker.Mock() - file3.facets = {"version": "v3"} - supplementary.files = [file3] - - _recipe._set_version(dataset, [input_dataset]) - print(dataset) - assert dataset.facets["version"] == ["v1", "v2"] - assert dataset.supplementaries[0].facets["version"] == "v3" - - def test_extract_preprocessor_order(): profile = { "custom_order": True, @@ -956,3 +934,23 @@ def test_special_name_to_dataset_invalid_special_name_type(): ) with pytest.raises(RecipeError, match=msg): _recipe._special_name_to_dataset(facets, "reference_dataset") + + +def test_fix_cmip5_fx_ensemble(monkeypatch): + def find_files(self): + if self.facets["ensemble"] == "r0i0p0": + self._files = ["file1.nc"] + + monkeypatch.setattr(Dataset, "find_files", find_files) + + dataset = Dataset( + dataset="dataset1", + short_name="orog", + mip="fx", + project="CMIP5", + ensemble="r1i1p1", + ) + + _recipe._fix_cmip5_fx_ensemble(dataset) + + assert dataset["ensemble"] == "r0i0p0" diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 6e081c8fc3..443ec9b80a 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import textwrap from pathlib import Path from typing import TYPE_CHECKING @@ -284,7 +285,7 @@ def test_merge_supplementaries_missing_short_name_fails(session): Dataset.from_recipe(recipe_txt, session) -def test_get_input_datasets_derive(session): +def test_get_required_datasets_derive(session): dataset = Dataset( dataset="ERA5", project="native6", @@ -299,7 +300,7 @@ def test_get_input_datasets_derive(session): type="reanaly", version="v1", ) - rlds, rlns = to_datasets._get_input_datasets(dataset) + rlds, rlns = to_datasets._get_required_datasets(dataset) assert rlds["short_name"] == "rlds" assert rlds["long_name"] == "Surface Downwelling Longwave Radiation" assert rlds["frequency"] == "1hr" @@ -308,6 +309,57 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" +def test_get_required_datasets_optional(caplog, tmp_path, session): + facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "SImon", + "short_name": "siextent", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", + "derive": True, + } + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + sic_file = LocalFile( + input_dir / "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + sic_file.touch() + + dataset = Dataset(**facets) + dataset.files = [] + dataset.session = session + + with 
caplog.at_level(logging.INFO): + datasets = to_datasets._get_required_datasets(dataset) + + expected = Dataset( + dataset="SAT", + project="OBS6", + mip="SImon", + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + optional="true", + original_short_name="siconca", + standard_name="sea_ice_area_fraction", + tier=2, + timerange="1980/2000", + type="sat", + units="%", + ) + expected.session = session + + assert datasets == [expected] + + logger_infos = [r.message for r in caplog.records if r.levelname == "INFO"] + assert "which is marked as 'optional'" in logger_infos[-1] + + def test_max_years(session): recipe_txt = textwrap.dedent(""" diagnostics: @@ -355,26 +407,6 @@ def from_files(_): to_datasets._dataset_from_files(dataset) -def test_fix_cmip5_fx_ensemble(monkeypatch): - def find_files(self): - if self.facets["ensemble"] == "r0i0p0": - self._files = ["file1.nc"] - - monkeypatch.setattr(Dataset, "find_files", find_files) - - dataset = Dataset( - dataset="dataset1", - short_name="orog", - mip="fx", - project="CMIP5", - ensemble="r1i1p1", - ) - - to_datasets._fix_cmip5_fx_ensemble(dataset) - - assert dataset["ensemble"] == "r0i0p0" - - def test_get_supplementary_short_names(monkeypatch): def _update_cmor_facets(facets): facets["modeling_realm"] = "atmos" diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index a4fc0b527d..e8cd1ca67a 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,15 +1,13 @@ from __future__ import annotations -import importlib.resources +import logging import textwrap from collections import defaultdict -from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING import pyesgf import pytest -import yaml import esmvalcore.dataset import esmvalcore.io.esgf @@ -24,45 +22,6 @@ from esmvalcore.typing import Facets -@lru_cache -def _load_default_data_sources() -> dict[ - str, - dict[str, dict[str, dict[str, dict[str, str]]]], -]: - """Load default data sources for local users.""" - cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { - "projects": {}, - } - for file in ( - "data-local.yml", - "data-local-esmvaltool.yml", - "data-native-cesm.yml", - "data-native-emac.yml", - "data-native-icon.yml", - "data-native-ipslcm.yml", - ): - with importlib.resources.as_file( - importlib.resources.files(esmvalcore.config) - / "configurations" - / file, - ) as config_file: - content = config_file.read_text(encoding="utf-8") - cfg["projects"].update(yaml.safe_load(content)["projects"]) - return cfg - - -@pytest.fixture -def session(tmp_path: Path, session: Session) -> Session: - """Session fixture with default local data sources.""" - projects = _load_default_data_sources()["projects"] - for project in projects: - data_sources = projects[project]["data"] - for data_source in data_sources.values(): - data_source["rootpath"] = str(tmp_path) - session["projects"][project]["data"] = data_sources - return session - - def test_repr(): ds = Dataset(short_name="tas", dataset="dataset1") @@ -1231,6 +1190,718 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session): assert datasets == [expected] +OBS6_SAT_FACETS: Facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + +def test_from_files_no_files_glob(session): + dataset = Dataset(**{**OBS6_SAT_FACETS, "type": "*"}, short_name="tas") + datasets = 
list(dataset.from_files()) + assert datasets == [dataset] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_files_glob(timerange, session): + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="lwcre", + derive=True, + ) + datasets = list(dataset.from_files()) + assert datasets == [dataset] + + +@pytest.fixture +def lwcre_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + +@pytest.fixture +def lwcre_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + +@pytest.fixture +def rlut_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlut_file_future(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_2100-2101.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlut_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlutcs_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlutcs = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlutcs_1980-2000.nc", + ) + rlutcs.touch() + return rlutcs + + +@pytest.fixture +def pr_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + pr = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_pr_1980-2000.nc", + ) + pr.touch() + return pr + + +def test_from_files_with_derived_no_derivation(lwcre_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_required_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ) + expected_required_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_required_dataset.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == 
[expected_required_dataset] + assert required_datasets[0].files == [lwcre_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_derivation_glob( + timerange, + lwcre_file, + lwcre_file_ground, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + ), + Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True), + ] + for expected_ds in expected_datasets: + expected_ds.add_supplementary(short_name="pr", type="sat") + expected_ds.session = session + + assert datasets == expected_datasets + assert datasets[0].files == [lwcre_file_ground] + assert datasets[0].supplementaries[0].files == [pr_file] + assert datasets[1].files == [lwcre_file] + assert datasets[1].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + for dataset, expected in zip( + datasets, + expected_required_datasets, + strict=True, + ): + assert dataset.required_datasets == [expected] + assert datasets[0].required_datasets[0].files == [lwcre_file_ground] + assert datasets[1].required_datasets[0].files == [lwcre_file] + + +def test_from_files_with_derived(rlut_file, rlutcs_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [] + + expected_required_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in 
expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+def test_from_files_with_derived_unavailable_years(
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` with derived variable and unavailable years."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == []
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_glob(
+    timerange,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+    caplog,
+):
+    """Test `from_files` with derived variable, supplementary, and globs."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    with caplog.at_level(logging.DEBUG):
+        datasets = list(dataset.from_files())
+
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+    log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"]
+    msg = "Not all variables required to derive 'lwcre' are available"
+    for log_debug in log_debugs:
+        if msg in log_debug:
+            break
+    else:
+        pytest.fail(f"No debug message '{msg}'")
+
+
+def test_from_files_with_derived_no_force_derivation(
+    lwcre_file,
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` with derived variable and no forced derivation."""
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
+
+    expected_required_dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        frequency="mon",
+        long_name="TOA Longwave Cloud Radiative Effect",
+        modeling_realm=["atmos"],
+        original_short_name="lwcre",
+        standard_name="",
+        units="W m-2",
+    )
+    expected_required_dataset.supplementaries = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    expected_required_dataset.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == [expected_required_dataset]
+    assert required_datasets[0].files == [lwcre_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_no_force_derivation_glob(  # noqa: PLR0913
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs and no forced derivation."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+        ),
+        Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True),
+    ]
+    for expected_ds in expected_datasets:
+        expected_ds.add_supplementary(short_name="pr", type="sat")
+        expected_ds.session = session
+
+    assert datasets == expected_datasets
+    assert datasets[0].files == [lwcre_file_ground]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+    assert datasets[1].files == [lwcre_file]
+    assert datasets[1].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.supplementaries = [
+            Dataset(
+                **OBS6_SAT_FACETS,
+                short_name="pr",
+                derive=False,
+                frequency="mon",
+                long_name="Precipitation",
+                modeling_realm=["atmos"],
+                original_short_name="pr",
+                standard_name="precipitation_flux",
+                units="kg m-2 s-1",
+            ),
+        ]
+        expected_ds.session = session
+
+    for dataset, expected in zip(
+        datasets,
+        expected_required_datasets,
+        strict=True,
+    ):
+        assert dataset.required_datasets == [expected]
+    assert datasets[0].required_datasets[0].files == [lwcre_file_ground]
+    assert datasets[1].required_datasets[0].files == [lwcre_file]
+
+
+def test_from_files_with_derived_force_derivation(
+    lwcre_file,
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` with derived variable and forced derivation."""
+    dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_force_derivation_glob(  # noqa: PLR0913
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+    caplog,
+):
+    """Test `from_files` with globs and forced derivation."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    with caplog.at_level(logging.DEBUG):
+        datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+ standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] + + log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] + msg = "Not all variables required to derive 'lwcre' are available" + for log_debug in log_debugs: + if msg in log_debug: + break + else: + pytest.fail(f"No debug message '{msg}'") + + def test_match(): dataset1 = Dataset( short_name="areacella", @@ -1614,7 +2285,7 @@ def test_find_files_outdated_local(mocker, dataset): assert dataset.files == esgf_files -def test_set_version(): +def test_set_version_non_derived_var(): dataset = Dataset(short_name="tas") dataset.add_supplementary(short_name="areacella") file_v1 = esmvalcore.io.local.LocalFile("/path/to/v1/tas.nc") @@ -1630,6 +2301,47 @@ def test_set_version(): assert dataset.supplementaries[0].facets["version"] == "v3" +def test_set_version_derived_var(monkeypatch, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="areacella") + dataset.files = [] + areacella_file = esmvalcore.local.LocalFile("/path/to/areacella.nc") + areacella_file.facets["version"] = "v4" + dataset.supplementaries[0].files = [areacella_file] + + def _get_required_datasets(): + rlut_file = esmvalcore.local.LocalFile("/path/to/rlut.nc") + rlut_file.facets["version"] = "v1" + rlut_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + ) + rlut_dataset.files = [rlut_file] + rlutcs_file_1 = esmvalcore.local.LocalFile("/path/to/rlutcs_1.nc") + rlutcs_file_2 = esmvalcore.local.LocalFile("/path/to/rlutcs_2.nc") + rlutcs_file_1.facets["version"] = "v2" + rlutcs_file_2.facets["version"] = "v3" + rlutcs_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + ) + rlutcs_dataset.files = [rlutcs_file_1, rlutcs_file_2] + return [rlut_dataset, rlutcs_dataset] + + monkeypatch.setattr( + dataset, + "_get_required_datasets", + _get_required_datasets, + ) + + dataset.set_version() + + assert dataset.facets["version"] == ["v1", "v2", "v3"] + assert dataset.supplementaries[0].facets["version"] == "v4" + + @pytest.mark.parametrize("timerange", ["*", "185001/*", "*/185112"]) def test_update_timerange_from_esgf(mocker, timerange): esgf_files = [ @@ -1701,9 +2413,8 @@ def test_update_timerange_no_files(session, search_data): } dataset = Dataset(**variable) dataset.files = [] - msg = r"Missing data for Dataset: tas, Amon, CMIP6, HadGEM3-GC31-LL.*" - with pytest.raises(InputFilesNotFound, match=msg): - dataset._update_timerange() + dataset._update_timerange() + assert "timerange" not in dataset.facets def test_update_timerange_typeerror(): @@ -2142,16 +2853,6 @@ def test_get_extra_facets_native6(): } -OBS6_SAT_FACETS: Facets = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "sat", - "timerange": "1980/2000", -} - - def test_is_derived_no_derivation(): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") assert dataset._is_derived() is False @@ -2204,6 +2905,15 @@ def test_derivation_necessary_no_force_derivation_no_files( assert dataset._derivation_necessary() is True +def test_derivation_necessary_no_force_derivation_no_files_glob(session): + dataset = Dataset( + **{**OBS6_SAT_FACETS, 
"timerange": "*"}, + short_name="lwcre", + derive=True, + ) + assert dataset._derivation_necessary() is True + + def test_derivation_necessary_no_force_derivation(tmp_path, session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.session = session @@ -2277,3 +2987,67 @@ def test_add_derived_supplementary_to_derived(): force_derivation=True, ) assert dataset.supplementaries[0] == expected_supplementary + + +def test_required_datasets_derivation(session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + + expected_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_dataset in expected_datasets: + expected_dataset.session = dataset.session + + assert dataset.required_datasets == expected_datasets + + +def test_required_datasets_no_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + dataset.add_supplementary(short_name="pr") + + assert dataset.required_datasets == [dataset] + + +def test_required_datasets_no_force_derivation(tmp_path, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre_file.touch() + + assert dataset.required_datasets == [dataset] + + +def test_required_datasets_no_derivation_available(session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True) + + msg = r"Cannot derive variable 'tas': no derivation script available" + with pytest.raises(NotImplementedError, match=msg): + dataset.required_datasets # noqa: B018