From 22d1490cf1368065d09a14eed1f8974aa6e9ddb2 Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Mon, 20 Jan 2025 17:20:39 +0100 Subject: [PATCH 01/17] new plugin module --- xrlint/plugins/xcube/__init__.py | 3 ++- xrlint/plugins/xcube/plugin.py | 8 ++++++++ xrlint/plugins/xcube/rules/__init__.py | 8 -------- xrlint/plugins/xcube/rules/any_spatial_data_var.py | 2 +- xrlint/plugins/xcube/rules/cube_dims_order.py | 2 +- xrlint/plugins/xcube/rules/data_var_colors.py | 2 +- xrlint/plugins/xcube/rules/grid_mapping_naming.py | 2 +- xrlint/plugins/xcube/rules/increasing_time.py | 2 +- xrlint/plugins/xcube/rules/lat_lon_naming.py | 2 +- xrlint/plugins/xcube/rules/single_grid_mapping.py | 2 +- xrlint/plugins/xcube/rules/time_naming.py | 2 +- 11 files changed, 18 insertions(+), 17 deletions(-) create mode 100644 xrlint/plugins/xcube/plugin.py diff --git a/xrlint/plugins/xcube/__init__.py b/xrlint/plugins/xcube/__init__.py index ee60293..31b23c4 100644 --- a/xrlint/plugins/xcube/__init__.py +++ b/xrlint/plugins/xcube/__init__.py @@ -4,9 +4,10 @@ def export_plugin() -> Plugin: - from .rules import plugin + from .plugin import plugin import_submodules("xrlint.plugins.xcube.rules") + # import_submodules("xrlint.plugins.xcube.processors") plugin.configs["recommended"] = Config.from_value( { diff --git a/xrlint/plugins/xcube/plugin.py b/xrlint/plugins/xcube/plugin.py new file mode 100644 index 0000000..1de8247 --- /dev/null +++ b/xrlint/plugins/xcube/plugin.py @@ -0,0 +1,8 @@ +from xrlint.plugin import new_plugin +from xrlint.version import version + +plugin = new_plugin( + name="xcube", + version=version, + ref="xrlint.plugins.xcube:export_plugin", +) diff --git a/xrlint/plugins/xcube/rules/__init__.py b/xrlint/plugins/xcube/rules/__init__.py index 1de8247..e69de29 100644 --- a/xrlint/plugins/xcube/rules/__init__.py +++ b/xrlint/plugins/xcube/rules/__init__.py @@ -1,8 +0,0 @@ -from xrlint.plugin import new_plugin -from xrlint.version import version - -plugin = new_plugin( - name="xcube", - version=version, - ref="xrlint.plugins.xcube:export_plugin", -) diff --git a/xrlint/plugins/xcube/rules/any_spatial_data_var.py b/xrlint/plugins/xcube/rules/any_spatial_data_var.py index 697d2df..26f9375 100644 --- a/xrlint/plugins/xcube/rules/any_spatial_data_var.py +++ b/xrlint/plugins/xcube/rules/any_spatial_data_var.py @@ -1,5 +1,5 @@ from xrlint.node import DatasetNode -from xrlint.plugins.xcube.rules import plugin +from xrlint.plugins.xcube.plugin import plugin from xrlint.plugins.xcube.util import is_spatial_var from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/xcube/rules/cube_dims_order.py b/xrlint/plugins/xcube/rules/cube_dims_order.py index 103ed42..aee767e 100644 --- a/xrlint/plugins/xcube/rules/cube_dims_order.py +++ b/xrlint/plugins/xcube/rules/cube_dims_order.py @@ -1,6 +1,6 @@ from xrlint.node import DataArrayNode from xrlint.plugins.xcube.constants import LAT_NAME, LON_NAME, X_NAME, Y_NAME, TIME_NAME -from xrlint.plugins.xcube.rules import plugin +from xrlint.plugins.xcube.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/xcube/rules/data_var_colors.py b/xrlint/plugins/xcube/rules/data_var_colors.py index 39225d2..48d8fc1 100644 --- a/xrlint/plugins/xcube/rules/data_var_colors.py +++ b/xrlint/plugins/xcube/rules/data_var_colors.py @@ -1,5 +1,5 @@ from xrlint.node import DataArrayNode -from xrlint.plugins.xcube.rules import plugin +from xrlint.plugins.xcube.plugin import plugin from xrlint.plugins.xcube.util import is_spatial_var from 
xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/xcube/rules/grid_mapping_naming.py b/xrlint/plugins/xcube/rules/grid_mapping_naming.py index dc079a3..cca1f8d 100644 --- a/xrlint/plugins/xcube/rules/grid_mapping_naming.py +++ b/xrlint/plugins/xcube/rules/grid_mapping_naming.py @@ -1,6 +1,6 @@ from xrlint.node import DatasetNode from xrlint.plugins.xcube.constants import GM_NAMES, GM_NAMES_TEXT -from xrlint.plugins.xcube.rules import plugin +from xrlint.plugins.xcube.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/xcube/rules/increasing_time.py b/xrlint/plugins/xcube/rules/increasing_time.py index 331fb05..ee38997 100644 --- a/xrlint/plugins/xcube/rules/increasing_time.py +++ b/xrlint/plugins/xcube/rules/increasing_time.py @@ -1,7 +1,7 @@ import numpy as np from xrlint.node import DataArrayNode -from xrlint.plugins.xcube.rules import plugin +from xrlint.plugins.xcube.plugin import plugin from xrlint.rule import RuleContext, RuleExit, RuleOp from xrlint.util.formatting import format_count, format_seq diff --git a/xrlint/plugins/xcube/rules/lat_lon_naming.py b/xrlint/plugins/xcube/rules/lat_lon_naming.py index 9ae2c98..e3d4a14 100644 --- a/xrlint/plugins/xcube/rules/lat_lon_naming.py +++ b/xrlint/plugins/xcube/rules/lat_lon_naming.py @@ -1,6 +1,6 @@ from xrlint.node import DatasetNode from xrlint.plugins.xcube.constants import LAT_NAME, LON_NAME -from xrlint.plugins.xcube.rules import plugin +from xrlint.plugins.xcube.plugin import plugin from xrlint.rule import RuleContext, RuleOp INVALID_LAT_NAMES = {"ltd", "latitude"} diff --git a/xrlint/plugins/xcube/rules/single_grid_mapping.py b/xrlint/plugins/xcube/rules/single_grid_mapping.py index 0e75e5c..cabb7c8 100644 --- a/xrlint/plugins/xcube/rules/single_grid_mapping.py +++ b/xrlint/plugins/xcube/rules/single_grid_mapping.py @@ -1,6 +1,6 @@ from xrlint.node import DatasetNode from xrlint.plugins.xcube.constants import GM_NAMES_TEXT, LAT_NAME, LON_NAME -from xrlint.plugins.xcube.rules import plugin +from xrlint.plugins.xcube.plugin import plugin from xrlint.plugins.xcube.util import is_spatial_var from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/xcube/rules/time_naming.py b/xrlint/plugins/xcube/rules/time_naming.py index 5c2db10..08d3f5e 100644 --- a/xrlint/plugins/xcube/rules/time_naming.py +++ b/xrlint/plugins/xcube/rules/time_naming.py @@ -5,7 +5,7 @@ from xrlint.node import DatasetNode from xrlint.plugins.xcube.constants import TIME_NAME -from xrlint.plugins.xcube.rules import plugin +from xrlint.plugins.xcube.plugin import plugin from xrlint.rule import RuleOp, RuleContext From 19ae644563f8a9f6bdf16983d24db1733c0449b6 Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Mon, 20 Jan 2025 17:21:34 +0100 Subject: [PATCH 02/17] added xcube processor --- xrlint/plugins/xcube/processors/__init__.py | 0 xrlint/plugins/xcube/processors/mldataset.py | 29 ++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 xrlint/plugins/xcube/processors/__init__.py create mode 100644 xrlint/plugins/xcube/processors/mldataset.py diff --git a/xrlint/plugins/xcube/processors/__init__.py b/xrlint/plugins/xcube/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py new file mode 100644 index 0000000..603b720 --- /dev/null +++ b/xrlint/plugins/xcube/processors/mldataset.py @@ -0,0 +1,29 @@ +from typing import Any + +import xarray as xr + +from 
xrlint.plugins.xcube.plugin import plugin +from xrlint.processor import ProcessorOp +from xrlint.result import Message + + +@plugin.define_processor("multi-level-dataset") +class MultiLevelDatasetProcessor(ProcessorOp): + """This processor should be used with `files: ["**/*.levels"]`.""" + + def preprocess( + self, file_path: str, opener_options: dict[str, Any] + ) -> list[tuple[xr.Dataset, str]]: + engine = opener_options.pop("engine", "zarr") + new_file_path = file_path + "/" + "0.zarr" + return [ + ( + xr.open_dataset(new_file_path, engine=engine, **opener_options), + new_file_path, + ) + ] + + def postprocess( + self, messages: list[list[Message]], file_path: str + ) -> list[Message]: + return messages[0] From 1b2c78c497c6524bd415e384022e6ffc3c5754a6 Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Mon, 20 Jan 2025 17:24:48 +0100 Subject: [PATCH 03/17] new plugin module --- xrlint/plugins/core/__init__.py | 2 +- xrlint/plugins/core/plugin.py | 9 +++++++++ xrlint/plugins/core/rules/__init__.py | 9 --------- xrlint/plugins/core/rules/coords_for_dims.py | 2 +- xrlint/plugins/core/rules/dataset_title_attr.py | 2 +- xrlint/plugins/core/rules/flags.py | 2 +- xrlint/plugins/core/rules/grid_mappings.py | 2 +- xrlint/plugins/core/rules/lat_lon_coordinate.py | 2 +- xrlint/plugins/core/rules/no_empty_attrs.py | 2 +- xrlint/plugins/core/rules/no_empty_chunks.py | 2 +- xrlint/plugins/core/rules/time_coordinate.py | 2 +- xrlint/plugins/core/rules/var_units_attr.py | 2 +- 12 files changed, 19 insertions(+), 19 deletions(-) create mode 100644 xrlint/plugins/core/plugin.py diff --git a/xrlint/plugins/core/__init__.py b/xrlint/plugins/core/__init__.py index 2457614..1dc64ed 100644 --- a/xrlint/plugins/core/__init__.py +++ b/xrlint/plugins/core/__init__.py @@ -4,7 +4,7 @@ def export_plugin() -> Plugin: - from .rules import plugin + from .plugin import plugin import_submodules("xrlint.plugins.core.rules") diff --git a/xrlint/plugins/core/plugin.py b/xrlint/plugins/core/plugin.py new file mode 100644 index 0000000..ab64c5d --- /dev/null +++ b/xrlint/plugins/core/plugin.py @@ -0,0 +1,9 @@ +from xrlint.constants import CORE_PLUGIN_NAME +from xrlint.plugin import new_plugin +from xrlint.version import version + +plugin = new_plugin( + name=CORE_PLUGIN_NAME, + version=version, + ref="xrlint.plugins.core:export_plugin", +) diff --git a/xrlint/plugins/core/rules/__init__.py b/xrlint/plugins/core/rules/__init__.py index ab64c5d..e69de29 100644 --- a/xrlint/plugins/core/rules/__init__.py +++ b/xrlint/plugins/core/rules/__init__.py @@ -1,9 +0,0 @@ -from xrlint.constants import CORE_PLUGIN_NAME -from xrlint.plugin import new_plugin -from xrlint.version import version - -plugin = new_plugin( - name=CORE_PLUGIN_NAME, - version=version, - ref="xrlint.plugins.core:export_plugin", -) diff --git a/xrlint/plugins/core/rules/coords_for_dims.py b/xrlint/plugins/core/rules/coords_for_dims.py index 8a4a056..b6c5f44 100644 --- a/xrlint/plugins/core/rules/coords_for_dims.py +++ b/xrlint/plugins/core/rules/coords_for_dims.py @@ -1,5 +1,5 @@ from xrlint.node import DatasetNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp from xrlint.util.formatting import format_item diff --git a/xrlint/plugins/core/rules/dataset_title_attr.py b/xrlint/plugins/core/rules/dataset_title_attr.py index cd27ddc..4abb5e2 100644 --- a/xrlint/plugins/core/rules/dataset_title_attr.py +++ b/xrlint/plugins/core/rules/dataset_title_attr.py @@ -1,5 +1,5 @@ from 
xrlint.node import DatasetNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/core/rules/flags.py b/xrlint/plugins/core/rules/flags.py index 0138680..7d88213 100644 --- a/xrlint/plugins/core/rules/flags.py +++ b/xrlint/plugins/core/rules/flags.py @@ -3,7 +3,7 @@ import numpy as np from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleOp, RuleContext diff --git a/xrlint/plugins/core/rules/grid_mappings.py b/xrlint/plugins/core/rules/grid_mappings.py index 0f85c76..2050c3e 100644 --- a/xrlint/plugins/core/rules/grid_mappings.py +++ b/xrlint/plugins/core/rules/grid_mappings.py @@ -1,5 +1,5 @@ from xrlint.node import DatasetNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/core/rules/lat_lon_coordinate.py b/xrlint/plugins/core/rules/lat_lon_coordinate.py index bdcb6e1..903cf15 100644 --- a/xrlint/plugins/core/rules/lat_lon_coordinate.py +++ b/xrlint/plugins/core/rules/lat_lon_coordinate.py @@ -3,7 +3,7 @@ import xarray as xr from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext from xrlint.rule import RuleOp diff --git a/xrlint/plugins/core/rules/no_empty_attrs.py b/xrlint/plugins/core/rules/no_empty_attrs.py index 22c3ad3..7405c8e 100644 --- a/xrlint/plugins/core/rules/no_empty_attrs.py +++ b/xrlint/plugins/core/rules/no_empty_attrs.py @@ -1,5 +1,5 @@ from xrlint.node import AttrsNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.result import Suggestion from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/core/rules/no_empty_chunks.py b/xrlint/plugins/core/rules/no_empty_chunks.py index a4b7186..0e842f9 100644 --- a/xrlint/plugins/core/rules/no_empty_chunks.py +++ b/xrlint/plugins/core/rules/no_empty_chunks.py @@ -1,5 +1,5 @@ from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleExit, RuleOp diff --git a/xrlint/plugins/core/rules/time_coordinate.py b/xrlint/plugins/core/rules/time_coordinate.py index 6842ca9..c2156c3 100644 --- a/xrlint/plugins/core/rules/time_coordinate.py +++ b/xrlint/plugins/core/rules/time_coordinate.py @@ -1,5 +1,5 @@ from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/core/rules/var_units_attr.py b/xrlint/plugins/core/rules/var_units_attr.py index cec977f..81d2ba7 100644 --- a/xrlint/plugins/core/rules/var_units_attr.py +++ b/xrlint/plugins/core/rules/var_units_attr.py @@ -1,5 +1,5 @@ from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp From 5db6ee5ffdb6791d4679bcea868d6a95b1f0685a Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Mon, 20 Jan 2025 18:37:57 +0100 Subject: [PATCH 04/17] experimenting with ml-dataset config --- xrlint/plugins/xcube/__init__.py | 12 +++++++- xrlint/plugins/xcube/processors/mldataset.py | 30 ++++++++++++++------ 2 files changed, 
33 insertions(+), 9 deletions(-) diff --git a/xrlint/plugins/xcube/__init__.py b/xrlint/plugins/xcube/__init__.py index 31b23c4..762eb86 100644 --- a/xrlint/plugins/xcube/__init__.py +++ b/xrlint/plugins/xcube/__init__.py @@ -7,11 +7,12 @@ def export_plugin() -> Plugin: from .plugin import plugin import_submodules("xrlint.plugins.xcube.rules") - # import_submodules("xrlint.plugins.xcube.processors") + import_submodules("xrlint.plugins.xcube.processors") plugin.configs["recommended"] = Config.from_value( { "name": "xcube-recommended", + "ignores": ["**/*.levels"], "plugins": { "xcube": plugin, }, @@ -30,11 +31,20 @@ def export_plugin() -> Plugin: plugin.configs["all"] = Config.from_value( { "name": "xcube-all", + "ignores": ["**/*.levels"], "plugins": { "xcube": plugin, }, "rules": {f"xcube/{rule_id}": "error" for rule_id in plugin.rules.keys()}, } ) + plugin.configs["multi-level-datasets"] = Config.from_value( + { + "name": "xcube-multi-level-datasets", + "files": ["**/*.levels"], + "ignores": ["**/*.zarr", "**/*.nc"], + "processor": "xcube/multi-level-dataset", + } + ) return plugin diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py index 603b720..ecaaa2b 100644 --- a/xrlint/plugins/xcube/processors/mldataset.py +++ b/xrlint/plugins/xcube/processors/mldataset.py @@ -1,3 +1,4 @@ +import itertools from typing import Any import xarray as xr @@ -15,15 +16,28 @@ def preprocess( self, file_path: str, opener_options: dict[str, Any] ) -> list[tuple[xr.Dataset, str]]: engine = opener_options.pop("engine", "zarr") - new_file_path = file_path + "/" + "0.zarr" - return [ - ( - xr.open_dataset(new_file_path, engine=engine, **opener_options), - new_file_path, - ) - ] + + level_datasets = [] + for level in itertools.count(): + level_path = f"{file_path}/{level}.zarr" + try: + level_dataset = xr.open_dataset( + level_path, engine=engine, **opener_options + ) + except FileNotFoundError: + break + level_datasets.append((level_dataset, level_path)) + + for level, (level_dataset, level_path) in enumerate(level_datasets): + level_dataset.attrs["_LEVEL_INFO"] = { + "index": level, + "count": len(level_datasets), + "datasets": [ds for ds, _ in level_datasets], + } + + return level_datasets def postprocess( self, messages: list[list[Message]], file_path: str ) -> list[Message]: - return messages[0] + return list(itertools.chain(*messages)) From 2eca8d41203f043003bf67a16eccb0e358f0b773 Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Mon, 20 Jan 2025 18:51:55 +0100 Subject: [PATCH 05/17] Node paths now contain the split index if dataset has been opened by a processor --- CHANGES.md | 6 +++++- tests/_linter/test_rulectx.py | 5 +++-- tests/test_linter.py | 4 ++-- xrlint/_linter/apply.py | 10 +++++++++- xrlint/_linter/rulectx.py | 9 +++++++-- xrlint/_linter/verify.py | 12 ++++++++---- 6 files changed, 34 insertions(+), 12 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 59c884d..f8cc5bc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,7 +2,11 @@ ## Version 0.4.0 (in development) - +- Internal changes: + - Inbuilt plugin now get their `plugin` instance from + `xrlint.plugins..plugin` module. + - Node paths now contain the split index if dataset + has been opened by a processor. 
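[Editor's note] For readers unfamiliar with the xcube `*.levels` layout handled in PATCH 04 above: a `.levels` entry is a directory containing one Zarr dataset per resolution level, named `0.zarr`, `1.zarr`, and so on. A minimal sketch of the level-probing loop, mirroring `MultiLevelDatasetProcessor.preprocess()` (the `FileNotFoundError` stop condition comes from the patch; the helper name and the hardcoded engine are illustrative):

    import itertools

    import xarray as xr

    def open_level_datasets(levels_path: str) -> list[tuple[xr.Dataset, str]]:
        # Probe <levels_path>/0.zarr, 1.zarr, ... until a level is missing.
        level_datasets = []
        for level in itertools.count():
            level_path = f"{levels_path}/{level}.zarr"
            try:
                dataset = xr.open_dataset(level_path, engine="zarr")
            except FileNotFoundError:
                break
            level_datasets.append((dataset, level_path))
        return level_datasets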
## Version 0.3.0 (from 2025-01-20) diff --git a/tests/_linter/test_rulectx.py b/tests/_linter/test_rulectx.py index a74600b..d7ce5fc 100644 --- a/tests/_linter/test_rulectx.py +++ b/tests/_linter/test_rulectx.py @@ -12,14 +12,15 @@ class RuleContextImplTest(TestCase): def test_defaults(self): config = Config() dataset = xr.Dataset() - context = RuleContextImpl(config, dataset, "./ds.zarr") + context = RuleContextImpl(config, dataset, "./ds.zarr", None) self.assertIs(config, context.config) self.assertIs(dataset, context.dataset) self.assertEqual({}, context.settings) self.assertEqual("./ds.zarr", context.file_path) + self.assertEqual(None, context.file_index) def test_report(self): - context = RuleContextImpl(Config(), xr.Dataset(), "./ds.zarr") + context = RuleContextImpl(Config(), xr.Dataset(), "./ds.zarr", None) with context.use_state(rule_id="no-xxx"): context.report( "What the heck do you mean?", diff --git a/tests/test_linter.py b/tests/test_linter.py index 1dfd196..f3b5a29 100644 --- a/tests/test_linter.py +++ b/tests/test_linter.py @@ -277,13 +277,13 @@ def test_processor(self): [ Message( message="Dataset does not have data variables", - node_path="dataset", + node_path="dataset[0]", rule_id="test/dataset-without-data-vars", severity=1, ), Message( message="Dataset does not have data variables", - node_path="dataset", + node_path="dataset[1]", rule_id="test/dataset-without-data-vars", severity=1, ), diff --git a/xrlint/_linter/apply.py b/xrlint/_linter/apply.py index 7c0993c..0080eb8 100644 --- a/xrlint/_linter/apply.py +++ b/xrlint/_linter/apply.py @@ -30,7 +30,15 @@ def apply_rule( _visit_dataset_node( rule_op, context, - DatasetNode(parent=None, path="dataset", dataset=context.dataset), + DatasetNode( + parent=None, + path=( + "dataset" + if context.file_index is None + else f"dataset[{context.file_index}]" + ), + dataset=context.dataset, + ), ) except RuleExit: # This is ok, the rule requested it. 
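[Editor's note] The `file_index` threaded through in PATCH 05 makes messages from processor-produced datasets distinguishable in a report. A tiny sketch of the resulting root-node path, copying the expression from `apply_rule()` above (the helper function itself is illustrative):

    def dataset_node_path(file_index: int | None) -> str:
        # Plain "dataset" for a directly opened dataset,
        # "dataset[<i>]" for the i-th dataset yielded by a processor.
        return "dataset" if file_index is None else f"dataset[{file_index}]"

    assert dataset_node_path(None) == "dataset"
    assert dataset_node_path(1) == "dataset[1]"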
diff --git a/xrlint/_linter/rulectx.py b/xrlint/_linter/rulectx.py index d86e8e2..db3252f 100644 --- a/xrlint/_linter/rulectx.py +++ b/xrlint/_linter/rulectx.py @@ -16,13 +16,16 @@ def __init__( config: Config, dataset: xr.Dataset, file_path: str, + file_index: int | None, ): assert config is not None assert dataset is not None assert file_path is not None + assert file_index is None or isinstance(file_index, int) self._config = config self._dataset = dataset self._file_path = file_path + self._file_index = file_index self.messages: list[Message] = [] self.rule_id: str | None = None self.severity: Literal[1, 2] = SEVERITY_ERROR @@ -34,7 +37,6 @@ def config(self) -> Config: @property def settings(self) -> dict[str, Any]: - assert self._config is not None return self._config.settings or {} @property @@ -43,9 +45,12 @@ def dataset(self) -> xr.Dataset: @property def file_path(self) -> str: - assert self._file_path is not None return self._file_path + @property + def file_index(self) -> int | None: + return self._file_index + def report( self, message: str, diff --git a/xrlint/_linter/verify.py b/xrlint/_linter/verify.py index fa95148..328bb9c 100644 --- a/xrlint/_linter/verify.py +++ b/xrlint/_linter/verify.py @@ -18,7 +18,7 @@ def verify_dataset(config: Config, dataset: Any, file_path: str | None): assert isinstance(file_path, (str, type(None))) if isinstance(dataset, xr.Dataset): file_path = file_path or _get_file_path_for_dataset(dataset) - messages = _verify_dataset(config, dataset, file_path) + messages = _verify_dataset(config, dataset, file_path, None) else: file_path = file_path or _get_file_path_for_source(dataset) messages = _open_and_verify_dataset(config, dataset, file_path) @@ -29,12 +29,13 @@ def _verify_dataset( config: Config, dataset: xr.Dataset, file_path: str, + file_index: int | None, ) -> list[Message]: assert isinstance(config, Config) assert isinstance(dataset, xr.Dataset) assert isinstance(file_path, str) - context = RuleContextImpl(config, dataset, file_path) + context = RuleContextImpl(config, dataset, file_path, file_index) if not config.rules: context.report("No rules configured or applicable.", fatal=True) @@ -61,7 +62,10 @@ def _open_and_verify_dataset( except (OSError, ValueError, TypeError) as e: return [Message(message=str(e), fatal=True, severity=2)] return processor_op.postprocess( - [_verify_dataset(config, ds, path) for ds, path in ds_path_list], + [ + _verify_dataset(config, ds, path, i) + for i, (ds, path) in enumerate(ds_path_list) + ], file_path, ) else: @@ -70,7 +74,7 @@ def _open_and_verify_dataset( except (OSError, ValueError, TypeError) as e: return [Message(message=str(e), fatal=True, severity=2)] with dataset: - return _verify_dataset(config, dataset, file_path) + return _verify_dataset(config, dataset, file_path, None) def _open_dataset( From a81298660d11ba64b2d733b153fab0d1f01063e0 Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Wed, 22 Jan 2025 12:52:28 +0100 Subject: [PATCH 06/17] experimenting with ml-dataset config --- tests/plugins/xcube/test_plugin.py | 4 +- xrlint/plugins/xcube/__init__.py | 66 +++++++++++--------- xrlint/plugins/xcube/processors/mldataset.py | 13 ++++ 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/tests/plugins/xcube/test_plugin.py b/tests/plugins/xcube/test_plugin.py index e0f4654..240d074 100644 --- a/tests/plugins/xcube/test_plugin.py +++ b/tests/plugins/xcube/test_plugin.py @@ -32,9 +32,9 @@ def test_configs_complete(self): all_rule_names = set(f"xcube/{k}" for k in plugin.rules.keys()) 
self.assertEqual( all_rule_names, - set(plugin.configs["all"].rules.keys()), + set(plugin.configs["all"].configs[-1].rules.keys()), ) self.assertEqual( all_rule_names, - set(plugin.configs["recommended"].rules.keys()), + set(plugin.configs["recommended"].configs[-1].rules.keys()), ) diff --git a/xrlint/plugins/xcube/__init__.py b/xrlint/plugins/xcube/__init__.py index 762eb86..ee30ee4 100644 --- a/xrlint/plugins/xcube/__init__.py +++ b/xrlint/plugins/xcube/__init__.py @@ -1,4 +1,4 @@ -from xrlint.config import Config +from xrlint.config import Config, ConfigList from xrlint.plugin import Plugin from xrlint.util.importutil import import_submodules @@ -9,42 +9,50 @@ def export_plugin() -> Plugin: import_submodules("xrlint.plugins.xcube.rules") import_submodules("xrlint.plugins.xcube.processors") - plugin.configs["recommended"] = Config.from_value( + common_configs = [ { - "name": "xcube-recommended", - "ignores": ["**/*.levels"], "plugins": { "xcube": plugin, }, - "rules": { - "xcube/any-spatial-data-var": "error", - "xcube/cube-dims-order": "error", - "xcube/data-var-colors": "warn", - "xcube/grid-mapping-naming": "warn", - "xcube/increasing-time": "error", - "xcube/lat-lon-naming": "error", - "xcube/single-grid-mapping": "error", - "xcube/time-naming": "error", - }, - } - ) - plugin.configs["all"] = Config.from_value( + }, { - "name": "xcube-all", - "ignores": ["**/*.levels"], - "plugins": { - "xcube": plugin, - }, - "rules": {f"xcube/{rule_id}": "error" for rule_id in plugin.rules.keys()}, - } - ) - plugin.configs["multi-level-datasets"] = Config.from_value( + # Add *.levels to globally included list of file types + "files": ["**/*.levels"], + }, { - "name": "xcube-multi-level-datasets", + # Specify a processor for *.levels files "files": ["**/*.levels"], - "ignores": ["**/*.zarr", "**/*.nc"], "processor": "xcube/multi-level-dataset", - } + }, + ] + + plugin.configs["recommended"] = ConfigList.from_value( + [ + *common_configs, + { + "rules": { + "xcube/any-spatial-data-var": "error", + "xcube/cube-dims-order": "error", + "xcube/data-var-colors": "warn", + "xcube/grid-mapping-naming": "warn", + "xcube/increasing-time": "error", + "xcube/lat-lon-naming": "error", + "xcube/single-grid-mapping": "error", + "xcube/time-naming": "error", + }, + }, + ] + ) + + plugin.configs["all"] = ConfigList.from_value( + [ + *common_configs, + { + "rules": { + f"xcube/{rule_id}": "error" for rule_id in plugin.rules.keys() + }, + }, + ] ) return plugin diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py index ecaaa2b..edcb918 100644 --- a/xrlint/plugins/xcube/processors/mldataset.py +++ b/xrlint/plugins/xcube/processors/mldataset.py @@ -1,6 +1,8 @@ import itertools +import json from typing import Any +import fsspec import xarray as xr from xrlint.plugins.xcube.plugin import plugin @@ -17,6 +19,15 @@ def preprocess( ) -> list[tuple[xr.Dataset, str]]: engine = opener_options.pop("engine", "zarr") + levels_meta_path = f"{file_path}/.zlevels" + try: + with fsspec.open(levels_meta_path, mode="wt") as stream: + levels_meta_content = json.load(stream.read()) + except FileNotFoundError: + levels_meta_path = None + levels_meta_content = None + pass + level_datasets = [] for level in itertools.count(): level_path = f"{file_path}/{level}.zarr" @@ -30,6 +41,8 @@ def preprocess( for level, (level_dataset, level_path) in enumerate(level_datasets): level_dataset.attrs["_LEVEL_INFO"] = { + "meta_path": levels_meta_path, + "meta_content": levels_meta_content, "index": 
level, "count": len(level_datasets), "datasets": [ds for ds, _ in level_datasets], From b879fa7d7d7612d85982f251b205a4648e3b732d Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Wed, 22 Jan 2025 13:04:12 +0100 Subject: [PATCH 07/17] plugin configs are always lists --- examples/virtual_plugin_config.py | 14 ++++++++------ tests/cli/configs/recommended.py | 2 +- tests/cli/test_config.py | 18 +++++++++--------- tests/test_config.py | 13 +++++++++++-- tests/test_plugin.py | 14 ++++++++------ xrlint/config.py | 24 ++++++++++++++---------- xrlint/plugin.py | 15 ++++++++++++--- 7 files changed, 63 insertions(+), 37 deletions(-) diff --git a/examples/virtual_plugin_config.py b/examples/virtual_plugin_config.py index 858d878..2fe5546 100644 --- a/examples/virtual_plugin_config.py +++ b/examples/virtual_plugin_config.py @@ -37,12 +37,14 @@ def export_configs(): # Add more rules here... }, "configs": { - "recommended": { - "rules": { - "hello/good-title": "warn", - # Configure more rules here... - }, - }, + "recommended": [ + { + "rules": { + "hello/good-title": "warn", + # Configure more rules here... + }, + } + ], # Add more configurations here... }, }, diff --git a/tests/cli/configs/recommended.py b/tests/cli/configs/recommended.py index 23bc763..6d1b3f8 100644 --- a/tests/cli/configs/recommended.py +++ b/tests/cli/configs/recommended.py @@ -11,7 +11,7 @@ def export_configs(): } }, core.configs["recommended"], - xcube.configs["recommended"], + *xcube.configs["recommended"].configs, { "rules": { "dataset-title-attr": "error", diff --git a/tests/cli/test_config.py b/tests/cli/test_config.py index ce70c6b..bd50f1d 100644 --- a/tests/cli/test_config.py +++ b/tests/cli/test_config.py @@ -117,13 +117,13 @@ def test_read_config_yaml_with_format_error(self): read_config_list(config_path) def test_read_config_yaml_with_type_error(self): - with text_file("config.yaml", "prime: 97") as config_path: + with text_file("config.yaml", "97") as config_path: with pytest.raises( ConfigError, match=( - "'config.yaml: configuration list must be of" - " type ConfigList|list\\[Config|dict|str\\]," - " but got dict'" + r"config\.yaml\: config_list must be of" + r" type ConfigList \| list\[Config \| dict \| str\]," + r" but got int" ), ): read_config_list(config_path) @@ -171,10 +171,10 @@ def test_read_config_py_with_invalid_config_list(self): with pytest.raises( ConfigError, match=( - ".py: return value of export_configs\\(\\):" - " configuration list must be of type" - " ConfigList|list\\[Config|dict|str\\]," - " but got int" + r"\.py: return value of export_configs\(\):" + r" config_list must be of type" + r" ConfigList \| list\[Config\ | dict \| str\]," + r" but got int" ), ): read_config_list(config_path) @@ -198,7 +198,7 @@ def test_read_config_yaml(self): def assert_ok(self, config_list: ConfigList): self.assertIsInstance(config_list, ConfigList) - self.assertEqual(4, len(config_list.configs)) + self.assertEqual(7, len(config_list.configs)) config = config_list.compute_config("test.zarr") self.assertIsInstance(config, Config) self.assertEqual("", config.name) diff --git a/tests/test_config.py b/tests/test_config.py index dcdfd5b..6ac9575 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -140,16 +140,25 @@ def test_from_value_ok(self): self.assertIsInstance(config_list, ConfigList) self.assertEqual([Config()], config_list.configs) + config_list = ConfigList.from_value({}) + self.assertIsInstance(config_list, ConfigList) + self.assertEqual([Config()], config_list.configs) + + config = 
Config.from_value({}) + config_list = ConfigList.from_value(config) + self.assertIsInstance(config_list, ConfigList) + self.assertIs(config, config_list.configs[0]) + # noinspection PyMethodMayBeStatic def test_from_value_fail(self): with pytest.raises( TypeError, match=( r"config_list must be of type" - r" ConfigList \| list\[Config \| dict\], but got dict" + r" ConfigList \| list\[Config \| dict \| str\], but got int" ), ): - ConfigList.from_value({}) + ConfigList.from_value(264) def test_compute_config(self): config_list = ConfigList([Config()]) diff --git a/tests/test_plugin.py b/tests/test_plugin.py index 3c3bafb..e909050 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -38,12 +38,14 @@ class MyRule2(RuleOp): "r2": MyRule2, }, "configs": { - "recommended": { - "rules": { - "hello/r1": "warn", - "hello/r2": "error", - }, - }, + "recommended": [ + { + "rules": { + "hello/r1": "warn", + "hello/r2": "error", + }, + } + ], }, } ) diff --git a/xrlint/config.py b/xrlint/config.py index 6d26d72..75d66b4 100644 --- a/xrlint/config.py +++ b/xrlint/config.py @@ -359,12 +359,14 @@ def from_value(cls, value: Any, value_name: str | None = None) -> "ConfigList": Args: value: A `ConfigList` object or `list` of items which can be converted into `Config` objects including configuration - names of tyype `str`. The latter are resolved against + names of type `str`. The latter are resolved against the plugin configurations seen so far in the list. value_name: A value's name. Returns: A `ConfigList` object. """ + if isinstance(value, (Config, dict)): + return ConfigList(configs=[Config.from_value(value)]) return super().from_value(value, value_name=value_name) @classmethod @@ -375,12 +377,13 @@ def _from_sequence(cls, value: Sequence, value_name: str) -> "ConfigList": if isinstance(item, str): if CORE_PLUGIN_NAME not in plugins: plugins.update({CORE_PLUGIN_NAME: get_core_plugin()}) - config = cls._get_named_config(item, plugins) + new_configs = cls._get_named_config_list(item, plugins) else: - config = Config.from_value(item) - configs.append(config) - plugins.update(config.plugins if config.plugins else {}) - return ConfigList(configs) + new_configs = [Config.from_value(item)] + for config in new_configs: + configs.append(config) + plugins.update(config.plugins if config.plugins else {}) + return ConfigList(configs=configs) @classmethod def value_name(cls) -> str: @@ -388,10 +391,12 @@ def value_name(cls) -> str: @classmethod def value_type_name(cls) -> str: - return "ConfigList | list[Config | dict]" + return "ConfigList | list[Config | dict | str]" @classmethod - def _get_named_config(cls, config_spec: str, plugins: dict[str, "Plugin"]): + def _get_named_config_list( + cls, config_spec: str, plugins: dict[str, "Plugin"] + ) -> list[Config]: plugin_name, config_name = ( config_spec.split("/", maxsplit=1) if "/" in config_spec @@ -400,5 +405,4 @@ def _get_named_config(cls, config_spec: str, plugins: dict[str, "Plugin"]): plugin: Plugin | None = plugins.get(plugin_name) if plugin is None or not plugin.configs or config_name not in plugin.configs: raise ValueError(f"configuration {config_spec!r} not found") - config_value = plugin.configs[config_name] - return config_value + return ConfigList.from_value(plugin.configs[config_name]).configs diff --git a/xrlint/plugin.py b/xrlint/plugin.py index 1900d79..12bc0a7 100644 --- a/xrlint/plugin.py +++ b/xrlint/plugin.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field from typing import Any, Callable, Literal, Type -from xrlint.config import 
Config +from xrlint.config import Config, ConfigList from xrlint.processor import Processor, ProcessorOp, define_processor from xrlint.rule import Rule, RuleOp, define_rule from xrlint.util.constructible import MappingConstructible @@ -50,8 +50,8 @@ class Plugin(MappingConstructible, JsonSerializable): """A dictionary containing named processors. """ - configs: dict[str, Config] = field(default_factory=dict) - """A dictionary containing named configurations.""" + configs: dict[str, list[Config]] = field(default_factory=dict) + """A dictionary containing named configuration lists.""" def define_rule( self, @@ -99,6 +99,15 @@ def define_processor( registry=self.processors, ) + def define_config( + self, + name: str, + config_value: list[Config | dict[str, Any]] | Config | dict[str, Any], + ) -> list[Config]: + config_list = ConfigList.from_value(config_value) + self.configs[name] = config_list.configs + return config_list.configs + @classmethod def _from_str(cls, value: str, value_name: str) -> "Plugin": plugin, plugin_ref = import_value( From e6f36a260c994c70bbab293009cdea55f512cbdb Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Wed, 22 Jan 2025 16:38:59 +0100 Subject: [PATCH 08/17] plugin configs are always lists; included xcube processor in named config --- CHANGES.md | 17 ++++++++++ docs/config.md | 2 +- docs/todo.md | 4 --- examples/plugin_config.py | 35 ++++++++++---------- examples/virtual_plugin_config.py | 2 +- tests/cli/configs/recommended.py | 6 ++-- tests/cli/test_config.py | 14 ++++---- tests/cli/test_main.py | 1 - tests/plugins/core/test_plugin.py | 4 +-- tests/plugins/xcube/test_plugin.py | 4 +-- tests/test_config.py | 8 ++--- tests/test_examples.py | 4 +-- xrlint/cli/config.py | 2 +- xrlint/config.py | 8 ++--- xrlint/plugin.py | 20 ++++++++--- xrlint/plugins/core/__init__.py | 12 ++++--- xrlint/plugins/xcube/__init__.py | 11 +++--- xrlint/plugins/xcube/processors/mldataset.py | 5 ++- xrlint/util/importutil.py | 2 +- 19 files changed, 92 insertions(+), 69 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f8cc5bc..f166403 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,23 @@ ## Version 0.4.0 (in development) +- Now supporting xcube multi-level datasets `*.levels`: + - Added xcube plugin processor `"xcube/multi-level-dataset"` that is used + inside the predefined xcube configurations "all" and "recommended". + +- Changed type of `Plugin.configs` from `dict[str, Config]` to + `dict[str, list[Config]]`. +- Introduced method `Plugin.define_config` which defines a named plugin + configuration. It takes a name and a configuration object or list of + configuration objects. +- Changed they way how configuration is defined and exported from + Python configuration files: + - Renamed function that exports configuration from `export_configs` + into `export_config`. + - The returned value should be a list of values that can be + converted into configuration objects: mixed `Config` instances, + dictionary, or a name that refers to a named configuration of a plugin. + - Internal changes: - Inbuilt plugin now get their `plugin` instance from `xrlint.plugins..plugin` module. 
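[Editor's note] A hedged usage sketch of the `Plugin.define_config()` method added in PATCH 07 and described in the changelog above. Per the `ConfigList.from_value()` change in the same patch, a single config-like dict is normalized to a one-element list; the plugin and rule names here are purely illustrative:

    from xrlint.plugin import new_plugin

    plugin = new_plugin(name="demo", version="0.0.0")
    configs = plugin.define_config(
        "recommended",
        {"rules": {"demo/some-rule": "warn"}},  # single dict -> [Config]
    )
    assert plugin.configs["recommended"] is configs
    assert len(configs) == 1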
diff --git a/docs/config.md b/docs/config.md index c3819fb..3061703 100644 --- a/docs/config.md +++ b/docs/config.md @@ -43,7 +43,7 @@ Same using JSON: And as Python script: ```python -def export_configs(): +def export_config(): return [ {"files": ["**/*.zarr", "**/*.nc"]}, { diff --git a/docs/todo.md b/docs/todo.md index b6e1afa..8931dfd 100644 --- a/docs/todo.md +++ b/docs/todo.md @@ -14,10 +14,6 @@ ## Desired - project logo -- support validating xcube 'levels' format. Options: - - implement xarray backend so we can open them using `xr.open_dataset` - with `opener_options: {"engine": "xc-levels"}`. - - implement a `xrlint.processor.Processor` for that purpose. - add some more tests so we reach 99% coverage - support rule op args/kwargs schema validation - Support `RuleTest.expected`, it is currently unused diff --git a/examples/plugin_config.py b/examples/plugin_config.py index b08139e..e98e8b1 100644 --- a/examples/plugin_config.py +++ b/examples/plugin_config.py @@ -3,27 +3,11 @@ using the `Plugin` class and its `define_rule()` decorator method. """ -from xrlint.config import Config from xrlint.node import DatasetNode from xrlint.plugin import new_plugin from xrlint.rule import RuleContext, RuleOp -plugin = new_plugin( - name="hello-plugin", - version="1.0.0", - configs={ - # "configs" entries must be `Config` objects! - "recommended": Config.from_value( - { - "rules": { - "hello/good-title": "warn", - # Configure more rules here... - }, - } - ), - # Add more configurations here... - }, -) +plugin = new_plugin(name="hello-plugin", version="1.0.0") @plugin.define_rule("good-title") @@ -42,7 +26,22 @@ def dataset(self, ctx: RuleContext, node: DatasetNode): # Define more rules here... -def export_configs(): +plugin.define_config( + "recommended", + [ + { + "rules": { + "hello/good-title": "warn", + # Configure more rules here... + }, + } + ], +) + +# Add more configurations here... + + +def export_config(): return [ # Use "hello" plugin { diff --git a/examples/virtual_plugin_config.py b/examples/virtual_plugin_config.py index 2fe5546..da9a3a8 100644 --- a/examples/virtual_plugin_config.py +++ b/examples/virtual_plugin_config.py @@ -22,7 +22,7 @@ def dataset(self, ctx: RuleContext, node: DatasetNode): # Define more rules here... 
-def export_configs(): +def export_config(): return [ # Define and use "hello" plugin { diff --git a/tests/cli/configs/recommended.py b/tests/cli/configs/recommended.py index 6d1b3f8..f998c12 100644 --- a/tests/cli/configs/recommended.py +++ b/tests/cli/configs/recommended.py @@ -1,4 +1,4 @@ -def export_configs(): +def export_config(): import xrlint.plugins.core import xrlint.plugins.xcube @@ -10,8 +10,8 @@ def export_configs(): "xcube": xcube, } }, - core.configs["recommended"], - *xcube.configs["recommended"].configs, + *core.configs["recommended"], + *xcube.configs["recommended"], { "rules": { "dataset-title-attr": "error", diff --git a/tests/cli/test_config.py b/tests/cli/test_config.py index bd50f1d..6bd49a5 100644 --- a/tests/cli/test_config.py +++ b/tests/cli/test_config.py @@ -34,7 +34,7 @@ """ py_text = """ -def export_configs(): +def export_config(): return [ { "name": "py-test", @@ -141,14 +141,14 @@ def test_read_config_py_no_export(self): with pytest.raises( ConfigError, match=( - "config_1002.py: attribute 'export_configs'" + "config_1002.py: attribute 'export_config'" " not found in module 'config_1002'" ), ): read_config_list(config_path) def test_read_config_py_with_value_error(self): - py_code = "def export_configs():\n raise ValueError('value is useless!')\n" + py_code = "def export_config():\n raise ValueError('value is useless!')\n" with text_file(self.new_config_py(), py_code) as config_path: with pytest.raises( ValueError, @@ -157,7 +157,7 @@ def test_read_config_py_with_value_error(self): read_config_list(config_path) def test_read_config_py_with_os_error(self): - py_code = "def export_configs():\n raise OSError('where is my hat?')\n" + py_code = "def export_config():\n raise OSError('where is my hat?')\n" with text_file(self.new_config_py(), py_code) as config_path: with pytest.raises( ConfigError, @@ -166,12 +166,12 @@ def test_read_config_py_with_os_error(self): read_config_list(config_path) def test_read_config_py_with_invalid_config_list(self): - py_code = "def export_configs():\n return 42\n" + py_code = "def export_config():\n return 42\n" with text_file(self.new_config_py(), py_code) as config_path: with pytest.raises( ConfigError, match=( - r"\.py: return value of export_configs\(\):" + r"\.py: return value of export_config\(\):" r" config_list must be of type" r" ConfigList \| list\[Config\ | dict \| str\]," r" but got int" @@ -201,7 +201,7 @@ def assert_ok(self, config_list: ConfigList): self.assertEqual(7, len(config_list.configs)) config = config_list.compute_config("test.zarr") self.assertIsInstance(config, Config) - self.assertEqual("", config.name) + self.assertEqual(None, config.name) self.assertIsInstance(config.plugins, dict) self.assertEqual({"xcube"}, set(config.plugins.keys())) self.assertIsInstance(config.rules, dict) diff --git a/tests/cli/test_main.py b/tests/cli/test_main.py index bf0fb53..5aab920 100644 --- a/tests/cli/test_main.py +++ b/tests/cli/test_main.py @@ -191,7 +191,6 @@ def test_print_config_option(self): self.assertEqual( ( "{\n" - ' "name": "",\n' ' "plugins": {\n' ' "__core__": "xrlint.plugins.core:export_plugin"\n' " },\n" diff --git a/tests/plugins/core/test_plugin.py b/tests/plugins/core/test_plugin.py index 0c7baed..057ef4d 100644 --- a/tests/plugins/core/test_plugin.py +++ b/tests/plugins/core/test_plugin.py @@ -34,9 +34,9 @@ def test_configs_complete(self): all_rule_names = set(plugin.rules.keys()) self.assertEqual( all_rule_names, - set(plugin.configs["all"].rules.keys()), + set(plugin.configs["all"][-1].rules.keys()), 
) self.assertEqual( all_rule_names, - set(plugin.configs["recommended"].rules.keys()), + set(plugin.configs["recommended"][-1].rules.keys()), ) diff --git a/tests/plugins/xcube/test_plugin.py b/tests/plugins/xcube/test_plugin.py index 240d074..4f4deaa 100644 --- a/tests/plugins/xcube/test_plugin.py +++ b/tests/plugins/xcube/test_plugin.py @@ -32,9 +32,9 @@ def test_configs_complete(self): all_rule_names = set(f"xcube/{k}" for k in plugin.rules.keys()) self.assertEqual( all_rule_names, - set(plugin.configs["all"].configs[-1].rules.keys()), + set(plugin.configs["all"][-1].rules.keys()), ) self.assertEqual( all_rule_names, - set(plugin.configs["recommended"].configs[-1].rules.keys()), + set(plugin.configs["recommended"][-1].rules.keys()), ) diff --git a/tests/test_config.py b/tests/test_config.py index 6ac9575..37b678a 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -163,9 +163,7 @@ def test_from_value_fail(self): def test_compute_config(self): config_list = ConfigList([Config()]) file_path = "s3://wq-services/datacubes/chl-2.zarr" - self.assertEqual( - Config(name=""), config_list.compute_config(file_path) - ) + self.assertEqual(Config(), config_list.compute_config(file_path)) config_list = ConfigList( [ @@ -176,14 +174,14 @@ def test_compute_config(self): ) file_path = "s3://wq-services/datacubes/chl-2.zarr" self.assertEqual( - Config(name="", settings={"a": 1, "b": 2}), + Config(settings={"a": 1, "b": 2}), config_list.compute_config(file_path), ) # global ignores file_path = "s3://wq-services/datacubes/chl-2.txt" self.assertEqual( - Config(name="", settings={"a": 2, "b": 1}), + Config(settings={"a": 2, "b": 1}), config_list.compute_config(file_path), ) diff --git a/tests/test_examples.py b/tests/test_examples.py index bf1650c..feb684e 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -9,7 +9,7 @@ class ExamplesTest(TestCase): def test_plugin_config(self): config_list, _ = import_value( - "examples.plugin_config", "export_configs", factory=ConfigList.from_value + "examples.plugin_config", "export_config", factory=ConfigList.from_value ) self.assertIsInstance(config_list, ConfigList) self.assertEqual(3, len(config_list.configs)) @@ -17,7 +17,7 @@ def test_plugin_config(self): def test_virtual_plugin_config(self): config_list, _ = import_value( "examples.virtual_plugin_config", - "export_configs", + "export_config", factory=ConfigList.from_value, ) self.assertIsInstance(config_list, ConfigList) diff --git a/xrlint/cli/config.py b/xrlint/cli/config.py index 25c2aaf..f9cf0b5 100644 --- a/xrlint/cli/config.py +++ b/xrlint/cli/config.py @@ -87,7 +87,7 @@ def _read_config_python(config_path: str) -> Any: try: return import_value( module_name, - "export_configs", + "export_config", factory=ConfigList.from_value, )[0] except ValueImportError as e: diff --git a/xrlint/config.py b/xrlint/config.py index 75d66b4..0a45318 100644 --- a/xrlint/config.py +++ b/xrlint/config.py @@ -45,10 +45,11 @@ def get_core_config( CORE_PLUGIN_NAME: core_plugin, }, ) - if config_name: - return config.merge(core_plugin.configs[config_name]) - else: + if config_name is None: return config + config_list = core_plugin.configs[config_name] + assert len(config_list) == 1 + return config.merge(config_list[0]) def split_config_spec(config_spec: str) -> tuple[str, str]: @@ -341,7 +342,6 @@ def compute_config(self, file_path: str) -> Config | None: # Note, computed configurations do not have "files" and "ignores" return Config( - name="", linter_options=config.linter_options, 
opener_options=config.opener_options, processor=config.processor, diff --git a/xrlint/plugin.py b/xrlint/plugin.py index 12bc0a7..a7cab18 100644 --- a/xrlint/plugin.py +++ b/xrlint/plugin.py @@ -102,11 +102,23 @@ def define_processor( def define_config( self, name: str, - config_value: list[Config | dict[str, Any]] | Config | dict[str, Any], + value: list[Config | dict[str, Any]] | Config | dict[str, Any], ) -> list[Config]: - config_list = ConfigList.from_value(config_value) - self.configs[name] = config_list.configs - return config_list.configs + """Define a named configuration. + + Args: + name: The name of the configuration. + value: The configuration-like object or list. + A configuration-like object is either a + [Config][xrlint.config.Config] or a `dict` that + represents a configuration. + + Returns: + A list of `Config` objects. + """ + configs = ConfigList.from_value(value).configs + self.configs[name] = configs + return configs @classmethod def _from_str(cls, value: str, value_name: str) -> "Plugin": diff --git a/xrlint/plugins/core/__init__.py b/xrlint/plugins/core/__init__.py index 1dc64ed..5a69302 100644 --- a/xrlint/plugins/core/__init__.py +++ b/xrlint/plugins/core/__init__.py @@ -1,4 +1,3 @@ -from xrlint.config import Config from xrlint.plugin import Plugin from xrlint.util.importutil import import_submodules @@ -8,7 +7,8 @@ def export_plugin() -> Plugin: import_submodules("xrlint.plugins.core.rules") - plugin.configs["recommended"] = Config.from_value( + plugin.define_config( + "recommended", { "name": "recommended", "rules": { @@ -23,13 +23,15 @@ def export_plugin() -> Plugin: "time-coordinate": "error", "var-units-attr": "warn", }, - } + }, ) - plugin.configs["all"] = Config.from_value( + + plugin.define_config( + "all", { "name": "all", "rules": {rule_id: "error" for rule_id in plugin.rules.keys()}, - } + }, ) return plugin diff --git a/xrlint/plugins/xcube/__init__.py b/xrlint/plugins/xcube/__init__.py index ee30ee4..dcfc4d9 100644 --- a/xrlint/plugins/xcube/__init__.py +++ b/xrlint/plugins/xcube/__init__.py @@ -1,4 +1,3 @@ -from xrlint.config import Config, ConfigList from xrlint.plugin import Plugin from xrlint.util.importutil import import_submodules @@ -26,7 +25,8 @@ def export_plugin() -> Plugin: }, ] - plugin.configs["recommended"] = ConfigList.from_value( + plugin.define_config( + "recommended", [ *common_configs, { @@ -41,10 +41,11 @@ def export_plugin() -> Plugin: "xcube/time-naming": "error", }, }, - ] + ], ) - plugin.configs["all"] = ConfigList.from_value( + plugin.define_config( + "all", [ *common_configs, { @@ -52,7 +53,7 @@ def export_plugin() -> Plugin: f"xcube/{rule_id}": "error" for rule_id in plugin.rules.keys() }, }, - ] + ], ) return plugin diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py index edcb918..3943101 100644 --- a/xrlint/plugins/xcube/processors/mldataset.py +++ b/xrlint/plugins/xcube/processors/mldataset.py @@ -21,12 +21,11 @@ def preprocess( levels_meta_path = f"{file_path}/.zlevels" try: - with fsspec.open(levels_meta_path, mode="wt") as stream: - levels_meta_content = json.load(stream.read()) + with fsspec.open(levels_meta_path, mode="rt", encoding="utf-8") as stream: + levels_meta_content = json.load(stream) except FileNotFoundError: levels_meta_path = None levels_meta_content = None - pass level_datasets = [] for level in itertools.count(): diff --git a/xrlint/util/importutil.py b/xrlint/util/importutil.py index 7430ab0..48ccd39 100644 --- a/xrlint/util/importutil.py +++ 
b/xrlint/util/importutil.py @@ -57,7 +57,7 @@ If it is not given, the module itself will be the exported value. attr_ref: Attribute reference. Should be given in the case where `module_ref` does not contain an attribute reference. - Example values are "export_plugin", "export_configs". + Example values are "export_plugin", "export_config". constant: If `True` the value is expected to be a constant. If `False`, the default, the referenced attribute can be a no-arg callable that yields the actual exported value. From bf734837e9b18c883f4359c54769d5bb6158864c Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Thu, 23 Jan 2025 07:48:23 +0100 Subject: [PATCH 09/17] `JsonSerializable` now recognizes `dataclass` instances; no longer traverse dirs that have config --- CHANGES.md | 23 +++++---- tests/test_config.py | 8 +-- tests/test_operation.py | 11 ++--- tests/test_rule.py | 12 ++--- tests/util/test_codec.py | 56 ++------------------- xrlint/cli/engine.py | 97 +++++++++++++++++-------------------- xrlint/config.py | 15 +++--- xrlint/operation.py | 7 --- xrlint/util/serializable.py | 16 ++++-- 9 files changed, 94 insertions(+), 151 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f166403..624d274 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,25 +5,30 @@ - Now supporting xcube multi-level datasets `*.levels`: - Added xcube plugin processor `"xcube/multi-level-dataset"` that is used inside the predefined xcube configurations "all" and "recommended". - -- Changed type of `Plugin.configs` from `dict[str, Config]` to - `dict[str, list[Config]]`. +- Directories that are recognized by file patterns associated with a non-empty + configuration object are no longer recursively + traversed. - Introduced method `Plugin.define_config` which defines a named plugin configuration. It takes a name and a configuration object or list of configuration objects. -- Changed they way how configuration is defined and exported from +- Changed the way configuration is defined and exported from + Python configuration files: - Renamed function that exports configuration from `export_configs` into `export_config`. - The returned value should be a list of values that can be converted into configuration objects: mixed `Config` instances, dictionary, or a name that refers to a named configuration of a plugin. +- Node path names now contain the dataset index if a file path + has been opened by a processor that produced multiple + datasets to validate. + +- Other changes: + - Changed type of `Plugin.configs` from `dict[str, Config]` to + `dict[str, list[Config]]`. - - Inbuilt plugin now get their `plugin` instance from + - Inbuilt plugin rules now import their `plugin` instance from `xrlint.plugins.<name>.plugin` module. - - Node paths now contain the split index if dataset - has been opened by a processor. + - `JsonSerializable` now recognizes `dataclass` instances and no longer + serializes property values that are also default values.
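[Editor's note] The serialization change in PATCH 09 means dataclass fields holding their default value are omitted from JSON output, which is why the expected dictionaries in the test diffs below shrink considerably. A minimal sketch of the value-vs-default check (class and function names are illustrative; fields using `default_factory` compare against `MISSING` and are therefore kept, as in the patch):

    from dataclasses import dataclass, fields
    from typing import Any

    @dataclass
    class MetaSketch:
        name: str = ""
        version: str = "0.0.0"
        description: str | None = None

    def to_json_sketch(obj: Any) -> dict[str, Any]:
        # Keep only fields whose current value differs from the declared
        # default, mirroring JsonSerializable._object_to_json() in PATCH 09.
        return {
            f.name: getattr(obj, f.name)
            for f in fields(obj)
            if getattr(obj, f.name) != f.default
        }

    assert to_json_sketch(MetaSketch(name="r1")) == {"name": "r1"}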
## Version 0.3.0 (from 2025-01-20) diff --git a/tests/test_config.py b/tests/test_config.py index 37b678a..8fab8c3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -185,7 +185,7 @@ def test_compute_config(self): config_list.compute_config(file_path), ) - def test_get_global_filter(self): + def test_split_global_filter(self): config_list = ConfigList( [ Config(files=["**/*.hdf"]), # global file @@ -196,16 +196,18 @@ def test_get_global_filter(self): ] ) - file_filter = config_list.get_global_filter() + new_config_list, file_filter = config_list.split_global_filter() self.assertEqual( FileFilter.from_patterns(["**/*.hdf"], ["**/chl-?.txt"]), file_filter, ) + self.assertEqual(3, len(new_config_list.configs)) - file_filter = config_list.get_global_filter( + new_config_list, file_filter = config_list.split_global_filter( default=FileFilter.from_patterns(["**/*.h5"], None) ) self.assertEqual( FileFilter.from_patterns(["**/*.h5", "**/*.hdf"], ["**/chl-?.txt"]), file_filter, ) + self.assertEqual(3, len(new_config_list.configs)) diff --git a/tests/test_operation.py b/tests/test_operation.py index f0abc79..43c5fc5 100644 --- a/tests/test_operation.py +++ b/tests/test_operation.py @@ -146,14 +146,9 @@ class MyThingOp3(ThingOp): ) self.assertEqual( { - "meta": { - "name": "t3", - "version": "0.0.0", - "description": "What a thing.", - }, - "op_class": ".MyThingOp3'>", + "meta": {"description": "What a thing.", "name": "t3"}, + "op_class": ".MyThingOp3'>", }, rule.to_json(), ) diff --git a/tests/test_rule.py b/tests/test_rule.py index fc20fee..e794f6c 100644 --- a/tests/test_rule.py +++ b/tests/test_rule.py @@ -64,9 +64,8 @@ class MyRule3(RuleOp): rule = Rule(meta=RuleMeta(name="r3"), op_class=MyRule3) self.assertEqual( { - "meta": {"name": "r3", "version": "0.0.0", "type": "problem"}, - "op_class": ".MyRule3'>", + "meta": {"name": "r3"}, + "op_class": ".MyRule3'>", }, rule.to_json(), ) @@ -95,12 +94,7 @@ def test_from_value(self): def test_to_json(self): rule_meta = RuleMeta(name="r1", version="0.1.2", description="Nice one.") self.assertEqual( - { - "name": "r1", - "version": "0.1.2", - "type": "problem", - "description": "Nice one.", - }, + {"description": "Nice one.", "name": "r1", "version": "0.1.2"}, rule_meta.to_json(), ) diff --git a/tests/util/test_codec.py b/tests/util/test_codec.py index a0cdec1..6761723 100644 --- a/tests/util/test_codec.py +++ b/tests/util/test_codec.py @@ -124,15 +124,7 @@ def test_assumptions(self): class JsonSerializableTest(TestCase): def test_simple_ok(self): self.assertEqual( - { - "a": None, - "b": False, - "c": 0, - "d": 0.0, - "e": "abc", - "f": "", - "g": 0, - }, + {}, SimpleTypesContainer().to_json(), ) self.assertEqual( @@ -162,58 +154,18 @@ def test_complex_ok(self): ) self.assertEqual( { - "p": { - "a": None, - "b": False, - "c": 0, - "d": 0.0, - "e": "abc", - "f": "", - "g": 0, - }, + "p": {}, "q": {"p": True, "q": False}, - "r": { - "u": { - "a": None, - "b": False, - "c": 0, - "d": 0.0, - "e": "abc", - "f": "", - "g": 0, - }, - "v": { - "a": None, - "b": False, - "c": 0, - "d": 0.0, - "e": "abc", - "f": "", - "g": 0, - }, - }, + "r": {"u": {}, "v": {}}, "s": [1, 2, 3], "t": [ + {"c": 5, "d": 6.7}, { - "a": None, - "b": False, - "c": 5, - "d": 6.7, - "e": "abc", - "f": "", - "g": 0, - }, - { - "a": None, - "b": False, "c": 8, "d": 9.1, - "e": "abc", "f": "", - "g": 0, }, ], - "u": None, }, container.to_json(), ) diff --git a/xrlint/cli/engine.py b/xrlint/cli/engine.py index 71d44bd..f8eccf9 100644 --- a/xrlint/cli/engine.py +++ 
b/xrlint/cli/engine.py @@ -21,7 +21,7 @@ from xrlint.formatters import export_formatters from xrlint.linter import Linter from xrlint.plugin import Plugin -from xrlint.result import Message, Result, ResultStats +from xrlint.result import Result, ResultStats from xrlint.util.filefilter import FileFilter DEFAULT_GLOBAL_FILTER = FileFilter.from_patterns( @@ -147,25 +147,53 @@ def verify_datasets(self, files: Iterable[str]) -> Iterator[Result]: Returns: Iterator of reports. """ - global_filter = self.config_list.get_global_filter( + linter = Linter() + for file_path, config in self.get_files(files): + yield linter.verify_dataset(file_path, config=config) + + def get_files(self, file_paths: Iterable[str]) -> Iterator[tuple[str, Config]]: + """Provide an iterator for the list of files or directories. + + Directories in `files` that are not filtered out will be + recursively traversed. + + Args: + file_paths: Iterable of files or directory. + + Returns: + An iterator of filtered files or directories. + """ + config_list, global_filter = self.config_list.split_global_filter( default=DEFAULT_GLOBAL_FILTER ) - linter = Linter() - for file_path, is_dir in get_files(files, global_filter): - config = self.get_config_for_file(file_path) + + def compute_config(p: str): + return config_list.compute_config(p) if global_filter.accept(p) else None + + for file_path in file_paths: + _fs, root = fsspec.url_to_fs(file_path) + fs: fsspec.AbstractFileSystem = _fs + + config = compute_config(file_path) if config is not None: - yield linter.verify_dataset(file_path, config=config) - else: - yield Result.new( - config=config, - file_path=file_path, - messages=[ - Message( - message="No configuration matches this file.", - severity=2, - ) - ], - ) + yield file_path, config + continue + + if fs.isdir(root): + for path, dirs, files in fs.walk(root, topdown=True): + + for d in list(dirs): + d_path = f"{path}/{d}" + c = compute_config(d_path) + if c is not None: + dirs.remove(d) + yield d_path, c + + for f in files: + f_path = f"{path}/{f}" + c = compute_config(f_path) + if c is not None: + yield f_path, c def format_results(self, results: Iterable[Result]) -> str: """Format the given results. @@ -220,38 +248,3 @@ def init_config_file(cls) -> None: with open(file_path, "w") as f: f.write(INIT_CONFIG_YAML) click.echo(f"Configuration template written to {file_path}") - - -def get_files( - file_paths: Iterable[str], global_filter: FileFilter -) -> Iterator[tuple[str, bool | None]]: - """Provide an iterator for the list of files or directories. - - Directories in `files` that are not filtered out will be - recursively traversed. - - Args: - file_paths: Iterable of files or directory. - global_filter: A file filter that includes files that - covered by global file patterns and not excluded - by global ignore patterns. - - Returns: - An iterator of filtered files or directories. 
- """ - for file_path in file_paths: - _fs, root = fsspec.url_to_fs(file_path) - fs: fsspec.AbstractFileSystem = _fs - _dir = fs.isdir(root) - if global_filter.accept(file_path): - yield file_path, _dir - elif _dir: - for path, dirs, files in fs.walk(root): - for d in dirs: - d_path = f"{path}/{d}" - if global_filter.accept(d_path): - yield d_path, True - for f in files: - f_path = f"{path}/{f}" - if global_filter.accept(f_path): - yield f_path, False diff --git a/xrlint/config.py b/xrlint/config.py index 0a45318..ee294d4 100644 --- a/xrlint/config.py +++ b/xrlint/config.py @@ -6,7 +6,7 @@ from xrlint.util.constructible import MappingConstructible, ValueConstructible from xrlint.util.filefilter import FileFilter from xrlint.util.merge import merge_arrays, merge_dicts, merge_set_lists, merge_values -from xrlint.util.serializable import JsonSerializable, JsonValue +from xrlint.util.serializable import JsonSerializable if TYPE_CHECKING: # pragma: no cover # make IDEs and flake8 happy @@ -155,6 +155,7 @@ def empty(self) -> bool: return not ( self.linter_options or self.opener_options + or self.processor or self.plugins or self.rules or self.settings @@ -293,9 +294,6 @@ def value_name(cls) -> str: def value_type_name(cls) -> str: return "Config | dict | None" - def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: - return {k: v for k, v in super().to_dict().items() if v is not None} - @dataclass(frozen=True) class ConfigList(ValueConstructible, JsonSerializable): @@ -309,16 +307,21 @@ class ConfigList(ValueConstructible, JsonSerializable): configs: list[Config] = field(default_factory=list) """The list of configuration objects.""" - def get_global_filter(self, default: FileFilter | None = None) -> FileFilter: + def split_global_filter( + self, default: FileFilter | None = None + ) -> tuple["ConfigList", FileFilter]: """Get a global file filter for this configuration list.""" global_filter = FileFilter( default.files if default else (), default.ignores if default else (), ) + configs = [] for c in self.configs: if c.empty and not c.file_filter.empty: global_filter = global_filter.merge(c.file_filter) - return global_filter + else: + configs.append(c) + return ConfigList(configs=configs), global_filter def compute_config(self, file_path: str) -> Config | None: """Compute the configuration object for the given file path. diff --git a/xrlint/operation.py b/xrlint/operation.py index 9f1c35f..87332bd 100644 --- a/xrlint/operation.py +++ b/xrlint/operation.py @@ -46,13 +46,6 @@ class OperationMeta(MappingConstructible["OpMetadata"], JsonSerializable): Must have the form ":", if given. """ - def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: - return { - k: v - for k, v in super().to_dict(value_name=value_name).items() - if v is not None - } - class Operation(MappingConstructible["Operation"], JsonSerializable): """A mixin class that is used by operation classes. 
diff --git a/xrlint/util/serializable.py b/xrlint/util/serializable.py index b8b7415..51e305e 100644 --- a/xrlint/util/serializable.py +++ b/xrlint/util/serializable.py @@ -1,3 +1,4 @@ +from dataclasses import is_dataclass, fields from typing import Any, Final, Mapping, Sequence, TypeAlias from xrlint.util.formatting import format_message_type_of @@ -61,11 +62,16 @@ def _value_to_json(cls, value: Any, value_name: str) -> JsonValue: @classmethod def _object_to_json(cls, value: Any, value_name: str) -> dict[str, JsonValue]: - return { - k: cls._value_to_json(v, f"{value_name}.{k}") - for k, v in vars(value).items() - if cls._is_non_protected_property_name(k) - } + if is_dataclass(value): + _d = {f.name: (f, getattr(value, f.name)) for f in fields(value)} + d = {k: v for k, (f, v) in _d.items() if v != f.default} + else: + d = { + k: v + for k, v in vars(value).items() + if cls._is_non_protected_property_name(k) + } + return {k: cls._value_to_json(v, f"{value_name}.{k}") for k, v in d.items()} @classmethod def _mapping_to_json( From 80fe7195609403272d671ed57da302959ac9f20b Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Thu, 23 Jan 2025 08:37:38 +0100 Subject: [PATCH 10/17] missing util tests --- .../{test_codec.py => test_constructible.py} | 64 ----- tests/util/test_serializable.py | 237 ++++++++++++++++++ 2 files changed, 237 insertions(+), 64 deletions(-) rename tests/util/{test_codec.py => test_constructible.py} (87%) create mode 100644 tests/util/test_serializable.py diff --git a/tests/util/test_codec.py b/tests/util/test_constructible.py similarity index 87% rename from tests/util/test_codec.py rename to tests/util/test_constructible.py index 6761723..f5dac61 100644 --- a/tests/util/test_codec.py +++ b/tests/util/test_constructible.py @@ -121,70 +121,6 @@ def test_assumptions(self): # noinspection PyMethodMayBeStatic -class JsonSerializableTest(TestCase): - def test_simple_ok(self): - self.assertEqual( - {}, - SimpleTypesContainer().to_json(), - ) - self.assertEqual( - { - "a": "?", - "b": True, - "c": 12, - "d": 34.56, - "e": "uvw", - "f": "", - "g": "off", - }, - SimpleTypesContainer( - a="?", b=True, c=12, d=34.56, e="uvw", f=bool, g="off" - ).to_json(), - ) - - def test_complex_ok(self): - container = ComplexTypesContainer( - q=dict(p=True, q=False), - r=dict(u=SimpleTypesContainer(), v=SimpleTypesContainer()), - s=[1, 2, 3], - t=[ - SimpleTypesContainer(c=5, d=6.7), - SimpleTypesContainer(c=8, d=9.1, f=SimpleTypesContainer), - ], - ) - self.assertEqual( - { - "p": {}, - "q": {"p": True, "q": False}, - "r": {"u": {}, "v": {}}, - "s": [1, 2, 3], - "t": [ - {"c": 5, "d": 6.7}, - { - "c": 8, - "d": 9.1, - "f": "", - }, - ], - }, - container.to_json(), - ) - - def test_fail(self): - @dataclass() - class Problematic(JsonSerializable): - data: Any - - with pytest.raises( - TypeError, - match=( - "problematic.data must be of type" - " None|bool|int|float|str|dict|list|tuple, but got object" - ), - ): - Problematic(data=object()).to_json(value_name="problematic") - - class ValueConstructibleTest(TestCase): def test_useless_ok(self): container = UselessContainer() diff --git a/tests/util/test_serializable.py b/tests/util/test_serializable.py new file mode 100644 index 0000000..4206af9 --- /dev/null +++ b/tests/util/test_serializable.py @@ -0,0 +1,237 @@ +from dataclasses import dataclass, field +from typing import Any, Literal + +from unittest import TestCase + +import pytest + +from xrlint.util.serializable import JsonSerializable + + +class PlainSimpleTypesContainer(JsonSerializable): 
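+    # Being a plain class rather than a dataclass, this container is
+    # serialized via vars(self): to_json() emits every non-protected
+    # attribute, defaults included. The dataclass containers further
+    # below only emit fields that differ from their declared defaults.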
+ def __init__( + self, + a: Any = None, + b: bool = False, + c: int = 0, + d: float = 0.0, + e: str = "abc", + f: type = int, + g: Literal[0, 1, True, False, "on", "off"] = 0, + ): + self.a = a + self.b = b + self.c = c + self.d = d + self.e = e + self.f = f + self.g = g + + +class PlainComplexTypesContainer(JsonSerializable): + def __init__( + self, + p: PlainSimpleTypesContainer = PlainSimpleTypesContainer(), + q: dict[str, bool] = None, + r: dict[str, PlainSimpleTypesContainer] = None, + s: list[int] = None, + t: list[PlainSimpleTypesContainer] = None, + u: int | float | None = None, + ): + self.p = p + self.q = q or {} + self.r = r or {} + self.s = s or [] + self.t = t or [] + self.u = u + + +@dataclass() +class DataclassSimpleTypesContainer(JsonSerializable): + a: Any = None + b: bool = False + c: int = 0 + d: float = 0.0 + e: str = "abc" + f: type = int + g: Literal[0, 1, True, False, "on", "off"] = 0 + + +@dataclass() +class DataclassComplexTypesContainer(JsonSerializable): + p: DataclassSimpleTypesContainer = field( + default_factory=DataclassSimpleTypesContainer + ) + q: dict[str, bool] = field(default_factory=dict) + r: dict[str, DataclassSimpleTypesContainer] = field(default_factory=dict) + s: list[int] = field(default_factory=list) + t: list[DataclassSimpleTypesContainer] = field(default_factory=list) + u: int | float | None = None + + +# noinspection PyMethodMayBeStatic +class JsonSerializableTest(TestCase): + def test_plain_simple_ok(self): + self.assertEqual( + { + "a": None, + "b": False, + "c": 0, + "d": 0.0, + "e": "abc", + "f": "", + "g": 0, + }, + PlainSimpleTypesContainer().to_json(), + ) + self.assertEqual( + { + "a": "?", + "b": True, + "c": 12, + "d": 34.56, + "e": "uvw", + "f": "", + "g": "off", + }, + PlainSimpleTypesContainer( + a="?", b=True, c=12, d=34.56, e="uvw", f=bool, g="off" + ).to_json(), + ) + + def test_plain_complex_ok(self): + container = PlainComplexTypesContainer( + q=dict(p=True, q=False), + r=dict(u=PlainSimpleTypesContainer(), v=PlainSimpleTypesContainer()), + s=[1, 2, 3], + t=[ + PlainSimpleTypesContainer(c=5, d=6.7), + PlainSimpleTypesContainer(c=8, d=9.1, f=PlainSimpleTypesContainer), + ], + ) + self.assertEqual( + { + "p": { + "a": None, + "b": False, + "c": 0, + "d": 0.0, + "e": "abc", + "f": "", + "g": 0, + }, + "q": {"p": True, "q": False}, + "r": { + "u": { + "a": None, + "b": False, + "c": 0, + "d": 0.0, + "e": "abc", + "f": "", + "g": 0, + }, + "v": { + "a": None, + "b": False, + "c": 0, + "d": 0.0, + "e": "abc", + "f": "", + "g": 0, + }, + }, + "s": [1, 2, 3], + "t": [ + { + "a": None, + "b": False, + "c": 5, + "d": 6.7, + "e": "abc", + "f": "", + "g": 0, + }, + { + "a": None, + "b": False, + "c": 8, + "d": 9.1, + "e": "abc", + "f": "", + "g": 0, + }, + ], + "u": None, + }, + container.to_json(), + ) + + def test_dataclass_simple_ok(self): + self.assertEqual( + {}, + DataclassSimpleTypesContainer().to_json(), + ) + self.assertEqual( + { + "a": "?", + "b": True, + "c": 12, + "d": 34.56, + "e": "uvw", + "f": "", + "g": "off", + }, + DataclassSimpleTypesContainer( + a="?", b=True, c=12, d=34.56, e="uvw", f=bool, g="off" + ).to_json(), + ) + + def test_dataclass_complex_ok(self): + container = DataclassComplexTypesContainer( + q=dict(p=True, q=False), + r=dict( + u=DataclassSimpleTypesContainer(), v=DataclassSimpleTypesContainer() + ), + s=[1, 2, 3], + t=[ + DataclassSimpleTypesContainer(c=5, d=6.7), + DataclassSimpleTypesContainer( + c=8, d=9.1, f=DataclassSimpleTypesContainer + ), + ], + ) + self.assertEqual( + { + "p": {}, + "q": 
{"p": True, "q": False}, + "r": {"u": {}, "v": {}}, + "s": [1, 2, 3], + "t": [ + {"c": 5, "d": 6.7}, + { + "c": 8, + "d": 9.1, + "f": ( + "" + ), + }, + ], + }, + container.to_json(), + ) + + def test_fail(self): + @dataclass() + class Problematic(JsonSerializable): + data: Any + + with pytest.raises( + TypeError, + match=( + "problematic.data must be of type" + " None|bool|int|float|str|dict|list|tuple, but got object" + ), + ): + Problematic(data=object()).to_json(value_name="problematic") From e4a48d702d41aade218e9b918b12f26fdad38200 Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Thu, 23 Jan 2025 14:07:06 +0100 Subject: [PATCH 11/17] testing MultiLevelDatasetProcessor --- environment.yml | 1 + pyproject.toml | 1 + tests/plugins/xcube/helpers.py | 63 ++++++++++++++ tests/plugins/xcube/processors/__init__.py | 0 .../xcube/processors/test_mldataset.py | 87 +++++++++++++++++++ 5 files changed, 152 insertions(+) create mode 100644 tests/plugins/xcube/helpers.py create mode 100644 tests/plugins/xcube/processors/__init__.py create mode 100644 tests/plugins/xcube/processors/test_mldataset.py diff --git a/environment.yml b/environment.yml index 5b3fd1b..9d45b3f 100644 --- a/environment.yml +++ b/environment.yml @@ -20,6 +20,7 @@ dependencies: - requests-mock - ruff # Testing Datasets + - dask - pandas - netcdf4 - numpy diff --git a/pyproject.toml b/pyproject.toml index 8981b1d..5097449 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ dev = [ "ruff", "twine", # Dataset testing + "dask", "netcdf4", "numpy", "pandas", diff --git a/tests/plugins/xcube/helpers.py b/tests/plugins/xcube/helpers.py new file mode 100644 index 0000000..77bbfb6 --- /dev/null +++ b/tests/plugins/xcube/helpers.py @@ -0,0 +1,63 @@ +import numpy as np +import xarray as xr + + +def make_cube(nx: int, ny: int, nt: int) -> xr.Dataset: + """Make an in-memory dataset that should pass all xcube rules. + + Args: + nx: length of the lon-dimension + ny: length of the lat-dimension + nt: length of the time-dimension + + Returns: + an in-memory dataset with one 3-d data variable "chl" + with dimensions "time", "lat", "lon". 
+ """ + dx = 180.0 / nx + dy = 90.0 / ny + return xr.Dataset( + coords=dict( + lon=xr.DataArray( + np.linspace(-180 + dx, 180 - dx, nx), + dims="lon", + attrs=dict( + long_name="longitude", + standard_name="longitude", + units="degrees_east", + ), + ), + lat=xr.DataArray( + np.linspace(-90 + dy, 90 - dy, ny), + dims="lat", + attrs=dict( + long_name="latitude", + standard_name="latitude", + units="degrees_north", + ), + ), + time=xr.DataArray( + range(nt), + dims="time", + attrs=dict( + long_name="time", + standard_name="time", + units="days since 2024-06-10:12:00:00 utc", + calendar="gregorian", + ), + ), + ), + data_vars=dict( + chl=xr.DataArray( + np.zeros((nt, ny, nx)), + dims=["time", "lat", "lon"], + attrs=dict( + long_name="chlorophyll concentration", + standard_name="chlorophyll_concentration", + units="mg/m^3", + _FillValue=0, + ), + ).chunk(time=1, lat=min(ny, 90), lon=min(nx, 90)), + ), + attrs=dict(), + ) diff --git a/tests/plugins/xcube/processors/__init__.py b/tests/plugins/xcube/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/plugins/xcube/processors/test_mldataset.py b/tests/plugins/xcube/processors/test_mldataset.py new file mode 100644 index 0000000..757d7e6 --- /dev/null +++ b/tests/plugins/xcube/processors/test_mldataset.py @@ -0,0 +1,87 @@ +import json +from unittest import TestCase + +import fsspec + +from tests.plugins.xcube.helpers import make_cube +from xrlint.plugins.xcube.processors.mldataset import MultiLevelDatasetProcessor +from xrlint.result import Message + + +class MultiLevelDatasetProcessorTest(TestCase): + @classmethod + def setUpClass(cls): + cls.levels_dir = "memory://xrlint-test.levels" + cls.fs, _ = fsspec.core.url_to_fs(cls.levels_dir) + cls.fs.mkdir(cls.levels_dir) + + cls.nx = nx = 720 + cls.ny = ny = nx // 2 + cls.nt = 3 + cls.nl = 4 + + for level in range(cls.nl): + dataset = make_cube(nx, ny, cls.nt) + dataset.to_zarr(f"{cls.levels_dir}/{level}.zarr", write_empty_chunks=False) + nx //= 2 + ny //= 2 + + cls.meta_path = f"{cls.levels_dir}/.zlevels" + cls.meta_content = { + "version": "1.0", + "num_levels": cls.nl, + "use_saved_levels": False, + "agg_methods": { + "chl": "mean", + }, + } + + def test_preprocess(self): + with self.fs.open(self.meta_path, mode="wt") as stream: + json.dump(self.meta_content, stream, indent=2) + + processor = MultiLevelDatasetProcessor() + datasets = processor.preprocess(self.levels_dir, {}) + self.assertIsInstance(datasets, list) + self.assertEqual(self.nl, len(datasets)) + for i, (dataset, file_path) in enumerate(datasets): + self.assertEqual(f"{self.levels_dir}/{i}.zarr", file_path) + self.assertIn("_LEVEL_INFO", dataset.attrs) + level_info = dataset.attrs.get("_LEVEL_INFO") + self.assertIsInstance(level_info, dict) + self.assertEqual(i, level_info.get("index")) + self.assertEqual(self.nl, level_info.get("count")) + self.assertEqual(self.meta_path, level_info.get("meta_path")) + self.assertEqual(self.meta_content, level_info.get("meta_content")) + self.assertIsInstance(level_info.get("datasets"), list) + + self.fs.delete(self.meta_path) + + def test_preprocess_no_meta(self): + processor = MultiLevelDatasetProcessor() + datasets = processor.preprocess(self.levels_dir, {}) + self.assertIsInstance(datasets, list) + self.assertEqual(self.nl, len(datasets)) + for i, (dataset, file_path) in enumerate(datasets): + self.assertEqual(f"{self.levels_dir}/{i}.zarr", file_path) + self.assertIn("_LEVEL_INFO", dataset.attrs) + level_info = dataset.attrs.get("_LEVEL_INFO") + 
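+            # without a ".zlevels" file, the processor still attaches its
+            # "_LEVEL_INFO" bookkeeping, but the meta entries remain None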
self.assertIsInstance(level_info, dict) + self.assertEqual(i, level_info.get("index")) + self.assertEqual(self.nl, level_info.get("count")) + self.assertEqual(None, level_info.get("meta_path")) + self.assertEqual(None, level_info.get("meta_content")) + self.assertIsInstance(level_info.get("datasets"), list) + + def test_postprocess(self): + processor = MultiLevelDatasetProcessor() + ml0 = [Message("m00"), Message("m01")] + ml1 = [Message("10"), Message("m11"), Message("m12")] + messages = processor.postprocess( + [ + ml0, + ml1, + ], + self.levels_dir, + ) + self.assertEqual([*ml0, *ml1], messages) From 299686ab46896af3fe9d1556e08447a7a7205aad Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Thu, 23 Jan 2025 20:39:48 +0100 Subject: [PATCH 12/17] new multi-level dataset rules --- CHANGES.md | 3 + .../xcube/processors/test_mldataset.py | 37 ++--- tests/plugins/xcube/test_plugin.py | 2 + xrlint/plugins/xcube/__init__.py | 7 +- xrlint/plugins/xcube/constants.py | 20 ++- xrlint/plugins/xcube/processors/mldataset.py | 132 ++++++++++++++---- xrlint/plugins/xcube/rules/ml_dataset_meta.py | 56 ++++++++ .../xcube/rules/ml_dataset_resolution.py | 57 ++++++++ xrlint/plugins/xcube/util.py | 67 ++++++++- 9 files changed, 325 insertions(+), 56 deletions(-) create mode 100644 xrlint/plugins/xcube/rules/ml_dataset_meta.py create mode 100644 xrlint/plugins/xcube/rules/ml_dataset_resolution.py diff --git a/CHANGES.md b/CHANGES.md index 624d274..a3b9b84 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,9 @@ ## Version 0.4.0 (in development) +- New xcube multi-level dataset rules: + - `ml-dataset-meta`: verifies that a meta info file exists and is consistent + - `ml-dataset-resolution`: verifies that the levels have expected resolutions - Now supporting xcube multi-level datasets `*.levels`: - Added xcube plugin processor `"xcube/multi-level-dataset"` that is used inside the predefined xcube configurations "all" and "recommended". 
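For orientation, the `*.levels` layout consumed by the new `xcube/multi-level-dataset` processor might look like the following sketch (the name `chl.levels` and the `chl` variable are illustrative):

```
chl.levels/
    .zlevels    # optional JSON meta-info, e.g.
                # {"version": "1.0", "num_levels": 4,
                #  "use_saved_levels": false, "agg_methods": {"chl": "mean"}}
    0.zarr      # level 0, full spatial resolution; alternatively, a text
                # file "0.link" holding the path of a dataset stored elsewhere
    1.zarr      # level 1, half the spatial resolution of level 0
    2.zarr      # level 2, a quarter of the spatial resolution of level 0
    3.zarr      # level 3, an eighth of the spatial resolution of level 0
```

The processor opens every level as a separate dataset and attaches level bookkeeping to each of them, so that the rules below can cross-check the levels against one another and against the `.zlevels` meta-info.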
diff --git a/tests/plugins/xcube/processors/test_mldataset.py b/tests/plugins/xcube/processors/test_mldataset.py index 757d7e6..aee7bf4 100644 --- a/tests/plugins/xcube/processors/test_mldataset.py +++ b/tests/plugins/xcube/processors/test_mldataset.py @@ -4,7 +4,12 @@ import fsspec from tests.plugins.xcube.helpers import make_cube -from xrlint.plugins.xcube.processors.mldataset import MultiLevelDatasetProcessor +from xrlint.plugins.xcube.constants import ML_INFO_ATTR +from xrlint.plugins.xcube.processors.mldataset import ( + MultiLevelDatasetProcessor, + LevelInfo, + MultiLevelDatasetMeta, +) from xrlint.result import Message @@ -46,14 +51,13 @@ def test_preprocess(self): self.assertEqual(self.nl, len(datasets)) for i, (dataset, file_path) in enumerate(datasets): self.assertEqual(f"{self.levels_dir}/{i}.zarr", file_path) - self.assertIn("_LEVEL_INFO", dataset.attrs) - level_info = dataset.attrs.get("_LEVEL_INFO") - self.assertIsInstance(level_info, dict) - self.assertEqual(i, level_info.get("index")) - self.assertEqual(self.nl, level_info.get("count")) - self.assertEqual(self.meta_path, level_info.get("meta_path")) - self.assertEqual(self.meta_content, level_info.get("meta_content")) - self.assertIsInstance(level_info.get("datasets"), list) + level_info = dataset.attrs.get(ML_INFO_ATTR) + self.assertIsInstance(level_info, LevelInfo) + self.assertEqual(i, level_info.level) + self.assertEqual(self.nl, level_info.num_levels) + self.assertIsInstance(level_info.meta, MultiLevelDatasetMeta) + self.assertIsInstance(level_info.datasets, list) + self.assertEqual(self.nl, len(level_info.datasets)) self.fs.delete(self.meta_path) @@ -64,14 +68,13 @@ def test_preprocess_no_meta(self): self.assertEqual(self.nl, len(datasets)) for i, (dataset, file_path) in enumerate(datasets): self.assertEqual(f"{self.levels_dir}/{i}.zarr", file_path) - self.assertIn("_LEVEL_INFO", dataset.attrs) - level_info = dataset.attrs.get("_LEVEL_INFO") - self.assertIsInstance(level_info, dict) - self.assertEqual(i, level_info.get("index")) - self.assertEqual(self.nl, level_info.get("count")) - self.assertEqual(None, level_info.get("meta_path")) - self.assertEqual(None, level_info.get("meta_content")) - self.assertIsInstance(level_info.get("datasets"), list) + level_info = dataset.attrs.get(ML_INFO_ATTR) + self.assertIsInstance(level_info, LevelInfo) + self.assertEqual(i, level_info.level) + self.assertEqual(self.nl, level_info.num_levels) + self.assertEqual(None, level_info.meta) + self.assertIsInstance(level_info.datasets, list) + self.assertEqual(self.nl, len(level_info.datasets)) def test_postprocess(self): processor = MultiLevelDatasetProcessor() diff --git a/tests/plugins/xcube/test_plugin.py b/tests/plugins/xcube/test_plugin.py index 4f4deaa..a7cd0c6 100644 --- a/tests/plugins/xcube/test_plugin.py +++ b/tests/plugins/xcube/test_plugin.py @@ -14,6 +14,8 @@ def test_rules_complete(self): "grid-mapping-naming", "increasing-time", "lat-lon-naming", + "ml-dataset-meta", + "ml-dataset-resolution", "single-grid-mapping", "time-naming", }, diff --git a/xrlint/plugins/xcube/__init__.py b/xrlint/plugins/xcube/__init__.py index dcfc4d9..3a1a917 100644 --- a/xrlint/plugins/xcube/__init__.py +++ b/xrlint/plugins/xcube/__init__.py @@ -1,4 +1,5 @@ from xrlint.plugin import Plugin +from xrlint.plugins.xcube.constants import ML_FILE_PATTERN from xrlint.util.importutil import import_submodules @@ -16,11 +17,11 @@ def export_plugin() -> Plugin: }, { # Add *.levels to globally included list of file types - "files": ["**/*.levels"], + 
"files": [ML_FILE_PATTERN], }, { # Specify a processor for *.levels files - "files": ["**/*.levels"], + "files": [ML_FILE_PATTERN], "processor": "xcube/multi-level-dataset", }, ] @@ -37,6 +38,8 @@ def export_plugin() -> Plugin: "xcube/grid-mapping-naming": "warn", "xcube/increasing-time": "error", "xcube/lat-lon-naming": "error", + "xcube/ml-dataset-meta": "warn", + "xcube/ml-dataset-resolution": "error", "xcube/single-grid-mapping": "error", "xcube/time-naming": "error", }, diff --git a/xrlint/plugins/xcube/constants.py b/xrlint/plugins/xcube/constants.py index d3727d7..b10b532 100644 --- a/xrlint/plugins/xcube/constants.py +++ b/xrlint/plugins/xcube/constants.py @@ -1,8 +1,14 @@ -LON_NAME = "lon" -LAT_NAME = "lat" -X_NAME = "x" -Y_NAME = "y" -TIME_NAME = "time" +from typing import Final -GM_NAMES = "spatial_ref", "crs" -GM_NAMES_TEXT = " or ".join(repr(gm_name) for gm_name in GM_NAMES) +LON_NAME: Final = "lon" +LAT_NAME: Final = "lat" +X_NAME: Final = "x" +Y_NAME: Final = "y" +TIME_NAME: Final = "time" + +GM_NAMES: Final = "spatial_ref", "crs" +GM_NAMES_TEXT: Final = " or ".join(repr(gm_name) for gm_name in GM_NAMES) + +ML_FILE_PATTERN: Final = "**/*.levels" +ML_META_FILENAME: Final = ".zlevels" +ML_INFO_ATTR: Final = "_LEVEL_INFO" diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py index 3943101..3e4c3ee 100644 --- a/xrlint/plugins/xcube/processors/mldataset.py +++ b/xrlint/plugins/xcube/processors/mldataset.py @@ -1,51 +1,78 @@ import itertools import json +import re from typing import Any import fsspec import xarray as xr +from xrlint.plugins.xcube.constants import ( + ML_FILE_PATTERN, + ML_META_FILENAME, +) from xrlint.plugins.xcube.plugin import plugin +from xrlint.plugins.xcube.util import ( + set_dataset_level_info, + LevelInfo, + MultiLevelDatasetMeta, +) from xrlint.processor import ProcessorOp from xrlint.result import Message +level_pattern = re.compile(r"^(\d+)(?:\.zarr)?$") + @plugin.define_processor("multi-level-dataset") class MultiLevelDatasetProcessor(ProcessorOp): - """This processor should be used with `files: ["**/*.levels"]`.""" + f"""This processor should be used with `files: [{ML_FILE_PATTERN}"]`.""" def preprocess( self, file_path: str, opener_options: dict[str, Any] ) -> list[tuple[xr.Dataset, str]]: - engine = opener_options.pop("engine", "zarr") + fs, fs_path = get_filesystem(file_path, opener_options) + + file_names = [ + # extracting the filename could be done more robustly + f.replace(fs_path, "").strip("/") + for f in fs.listdir(fs_path, detail=False) + ] + + meta = None + if ML_META_FILENAME in file_names: + try: + with fs.open(f"{fs_path}/{ML_META_FILENAME}") as stream: + meta = parse_multi_level_dataset_meta(stream) + except FileNotFoundError: + pass - levels_meta_path = f"{file_path}/.zlevels" - try: - with fsspec.open(levels_meta_path, mode="rt", encoding="utf-8") as stream: - levels_meta_content = json.load(stream) - except FileNotFoundError: - levels_meta_path = None - levels_meta_content = None - - level_datasets = [] - for level in itertools.count(): - level_path = f"{file_path}/{level}.zarr" + level_0_path = None + if "0.link" in file_names: try: - level_dataset = xr.open_dataset( - level_path, engine=engine, **opener_options - ) + with fs.open(f"{fs_path}/0.link") as stream: + level_0_path = stream.read() except FileNotFoundError: - break + pass + + level_names, num_levels = parse_levels(file_names, level_0_path) + + engine = opener_options.pop("engine", "zarr") + + level_datasets: 
list[xr.Dataset | None] = [] + for level, level_name in level_names.items(): + level_path = f"{file_path}/{level_name}" + level_dataset = xr.open_dataset(level_path, engine=engine, **opener_options) level_datasets.append((level_dataset, level_path)) - for level, (level_dataset, level_path) in enumerate(level_datasets): - level_dataset.attrs["_LEVEL_INFO"] = { - "meta_path": levels_meta_path, - "meta_content": levels_meta_content, - "index": level, - "count": len(level_datasets), - "datasets": [ds for ds, _ in level_datasets], - } + for level, (level_dataset, _) in enumerate(level_datasets): + set_dataset_level_info( + level_dataset, + LevelInfo( + level=level, + num_levels=num_levels, + meta=meta, + datasets=level_datasets, + ), + ) return level_datasets @@ -53,3 +80,58 @@ def postprocess( self, messages: list[list[Message]], file_path: str ) -> list[Message]: return list(itertools.chain(*messages)) + + +def get_filesystem(file_path: str, opener_options: dict[str, Any]): + storage_options = ( + opener_options.get( + "storage_options", + opener_options.get("backend_kwargs", {}).get("storage_options"), + ) + or {} + ) + _fs, fs_path = fsspec.core.url_to_fs(file_path, **storage_options) + fs: fsspec.AbstractFileSystem = _fs + fs_path: str = fs_path.replace("\\", "/") + return fs, fs_path + + +def parse_levels( + file_names: list[str], level_0_path: str | None +) -> tuple[dict[int, str], int]: + level_names: dict[int, str] = {0: level_0_path} if level_0_path else {} + num_levels = 0 + for file_name in file_names: + m = level_pattern.match(file_name) + if m is not None: + level = int(m.group(1)) + level_names[level] = file_name + num_levels = max(num_levels, level + 1) + if not level_names: + raise ValueError("empty multi-level dataset") + num_levels = max(level_names.keys()) + 1 + for level in range(num_levels): + if level not in level_names: + raise ValueError( + f"missing dataset for level {level} in multi-level dataset" + ) + return level_names, num_levels + + +def parse_multi_level_dataset_meta(stream: Any) -> MultiLevelDatasetMeta: + meta_content = json.load(stream) + + msg_prefix = f"illegal xcube {ML_META_FILENAME!r} file" + + if not isinstance(meta_content, dict): + raise ValueError(f"{msg_prefix}, JSON object expected") + + version = meta_content.get("version") + if not isinstance(version, str) or not version.startswith("1."): + raise ValueError(f"{msg_prefix}, missing or invalid 'version'") + + num_levels = meta_content.get("num_levels") + if not isinstance(num_levels, int) or num_levels < 1: + raise ValueError(f"{msg_prefix}, missing or invalid 'num_levels'") + + return MultiLevelDatasetMeta(**meta_content) diff --git a/xrlint/plugins/xcube/rules/ml_dataset_meta.py b/xrlint/plugins/xcube/rules/ml_dataset_meta.py new file mode 100644 index 0000000..99d0aeb --- /dev/null +++ b/xrlint/plugins/xcube/rules/ml_dataset_meta.py @@ -0,0 +1,56 @@ +from xrlint.node import DatasetNode +from xrlint.plugins.xcube.constants import ML_META_FILENAME +from xrlint.plugins.xcube.plugin import plugin +from xrlint.plugins.xcube.util import get_dataset_level_info +from xrlint.rule import RuleContext, RuleOp +from xrlint.util.formatting import format_item + + +@plugin.define_rule( + "ml-dataset-meta", + version="1.0.0", + type="suggestion", + description=( + f"Multi-level datasets should provide {ML_META_FILENAME!r} meta information file" + f" and if so, it should be consistent." 
+ ), + docs_url=( + "https://xcube.readthedocs.io/en/latest/cubespec.html#encoding-of-colors" + ), +) +class MultiLevelDatasetMeta(RuleOp): + def dataset(self, ctx: RuleContext, node: DatasetNode): + level_info = get_dataset_level_info(node.dataset) + if level_info is None: + # ok, this rule applies only to level datasets opened + # by the xcube multi-level processor + return + + level = level_info.level + if level > 0: + # ok, this rule only applies to level 0 + return + + meta = level_info.meta + if meta is None: + ctx.report( + f"Missing {ML_META_FILENAME!r} file," + f" therefore dataset cannot be extended." + ) + return + + actual_count: int = level_info.num_levels + meta_count = meta.num_levels + if meta_count != actual_count: + ctx.report( + f"Expected {format_item(meta_count, 'level')}, but found {actual_count}" + ) + + actual_count: int = level_info.num_levels + meta_count = meta.use_saved_levels + if meta_count != actual_count: + ctx.report( + f"Expected {format_item(meta_count, 'level')}, but found {actual_count}" + ) + + # TODO: verify variables listed in meta diff --git a/xrlint/plugins/xcube/rules/ml_dataset_resolution.py b/xrlint/plugins/xcube/rules/ml_dataset_resolution.py new file mode 100644 index 0000000..ce00558 --- /dev/null +++ b/xrlint/plugins/xcube/rules/ml_dataset_resolution.py @@ -0,0 +1,57 @@ +import math + +from xrlint.node import DatasetNode +from xrlint.plugins.xcube.plugin import plugin +from xrlint.plugins.xcube.util import get_spatial_size, get_dataset_level_info +from xrlint.rule import RuleContext, RuleOp + + +@plugin.define_rule( + "ml-dataset-resolution", + version="1.0.0", + type="problem", + description=( + "Multi-level dataset levels should provide spatial resolutions" + " decreasing by powers of two." + ), + docs_url=( + "https://xcube.readthedocs.io/en/latest/cubespec.html#encoding-of-colors" + ), +) +class MultiLevelDatasetMeta(RuleOp): + def dataset(self, ctx: RuleContext, node: DatasetNode): + level_info = get_dataset_level_info(node.dataset) + if level_info is None: + # ok, this rule applies only to level datasets opened + # by the xcube multi-level processor + return + + level = level_info.level + if level == 0: + # ok, this rule only applies to levels > 0 + return + + datasets = level_info.datasets + level_0_dataset, _ = datasets[0] + l0_size = get_spatial_size(level_0_dataset) + if l0_size is None: + # weird - maybe no data vars?
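+            # (get_spatial_size() returns None if no data variable has
+            # spatial dimensions, in which case there is nothing to check)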
+ return + + (x_name, level_0_width), (y_name, level_0_height) = l0_size + level_width = node.dataset.sizes.get(x_name) + level_height = node.dataset.sizes.get(y_name) + expected_level_width = math.ceil(level_0_width >> level) + expected_level_height = math.ceil(level_0_height >> level) + + if level_width != expected_level_width: + ctx.report( + f"Expected size of dimension {x_name!r} in level {level}" + f" to be {expected_level_width}, but was {level_width}" + ) + + if level_height != expected_level_height: + ctx.report( + f"Expected size of dimension {y_name!r} in level {level}" + f" to be {expected_level_height}, but was {level_height}" + ) diff --git a/xrlint/plugins/xcube/util.py b/xrlint/plugins/xcube/util.py index 09a4e20..1cb2002 100644 --- a/xrlint/plugins/xcube/util.py +++ b/xrlint/plugins/xcube/util.py @@ -1,11 +1,68 @@ +from collections.abc import Hashable +from dataclasses import dataclass + import xarray as xr -from .constants import LAT_NAME, LON_NAME, X_NAME, Y_NAME +from .constants import LAT_NAME, LON_NAME, X_NAME, Y_NAME, ML_INFO_ATTR + + +@dataclass(frozen=True, kw_only=True) +class MultiLevelDatasetMeta: + version: str + num_levels: int + use_saved_levels: bool | None = None + agg_methods: dict[str, str] | None = None + + +@dataclass(frozen=True, kw_only=True) +class LevelInfo: + level: int + num_levels: int + datasets: list[tuple[xr.Dataset, str]] + meta: MultiLevelDatasetMeta | None = None + + +def get_dataset_level_info(dataset: xr.Dataset) -> LevelInfo | None: + return dataset.attrs.get(ML_INFO_ATTR) + + +def set_dataset_level_info(dataset: xr.Dataset, level_info: LevelInfo): + dataset.attrs[ML_INFO_ATTR] = level_info def is_spatial_var(var: xr.DataArray) -> bool: - """Return 'True' if `var` looks like a spatial variable.""" - dims = var.dims - return (X_NAME in dims and Y_NAME in dims) or ( - LON_NAME in dims and LAT_NAME in dims + """Return 'True' if `var` looks like a spatial 2+d variable.""" + if var.ndim < 2: + return False + y_name, x_name = var.dims[-2:] + return (x_name == X_NAME and y_name == Y_NAME) or ( + x_name == LON_NAME and y_name == LAT_NAME ) + + +def get_spatial_size( + dataset: xr.Dataset, +) -> tuple[tuple[Hashable, int], tuple[Hashable, int]] | None: + """Return (x_size, y_size) for given dataset.""" + for k, v in dataset.data_vars.items(): + if is_spatial_var(v): + y_name, x_name = v.dims[-2:] + x_size = dataset.sizes[x_name] + y_size = dataset.sizes[y_name] + if x_size and y_size: + return (x_name, x_size), (y_name, y_size) + return None + + +def get_spatial_size_for_dims( + dataset: xr.Dataset, x_name: str, y_name: str +) -> tuple[int, int]: + """Return (x_size, y_size) for given dataset and dim names.""" + x_size: int = 0 + y_size: int = 0 + for k, v in dataset.sizes.items(): + if k == x_name: + x_size = v + elif k == y_name: + y_size = v + return x_size, y_size From 24c0b8d3a3bc9c869e73498c357fe384f5d37455 Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Fri, 24 Jan 2025 12:35:27 +0100 Subject: [PATCH 13/17] testing MultiLevelDatasetProcessor --- tests/plugins/xcube/helpers.py | 9 ++ .../xcube/processors/test_mldataset.py | 141 ++++++++++++------ xrlint/cli/engine.py | 1 - xrlint/plugins/xcube/processors/mldataset.py | 37 +---- xrlint/plugins/xcube/rules/ml_dataset_meta.py | 52 +++++-- .../xcube/rules/ml_dataset_resolution.py | 6 +- xrlint/plugins/xcube/util.py | 32 +++- 7 files changed, 184 insertions(+), 94 deletions(-) diff --git a/tests/plugins/xcube/helpers.py b/tests/plugins/xcube/helpers.py index 77bbfb6..f397500 100644 --- 
a/tests/plugins/xcube/helpers.py +++ b/tests/plugins/xcube/helpers.py @@ -1,7 +1,16 @@ +import math + import numpy as np import xarray as xr +def make_cube_levels(nx: int, ny: int, nt: int, nl: int) -> list[xr.Dataset]: + return [ + make_cube(math.ceil(nx >> level), math.ceil(ny >> level), nt) + for level in range(nl) + ] + + def make_cube(nx: int, ny: int, nt: int) -> xr.Dataset: """Make an in-memory dataset that should pass all xcube rules. diff --git a/tests/plugins/xcube/processors/test_mldataset.py b/tests/plugins/xcube/processors/test_mldataset.py index aee7bf4..8889f6c 100644 --- a/tests/plugins/xcube/processors/test_mldataset.py +++ b/tests/plugins/xcube/processors/test_mldataset.py @@ -1,9 +1,11 @@ import json +from typing import Any from unittest import TestCase import fsspec +import pytest -from tests.plugins.xcube.helpers import make_cube +from tests.plugins.xcube.helpers import make_cube_levels from xrlint.plugins.xcube.constants import ML_INFO_ATTR from xrlint.plugins.xcube.processors.mldataset import ( MultiLevelDatasetProcessor, @@ -14,67 +16,116 @@ class MultiLevelDatasetProcessorTest(TestCase): + levels_name = "xrlint-test" + levels_dir = f"memory://{levels_name}.levels" + + x_size = nx = 720 + y_size = ny = 360 + time_size = 3 + num_levels = 4 + + meta_path = f"{levels_dir}/.zlevels" + meta_content = { + "version": "1.0", + "num_levels": num_levels, + "use_saved_levels": False, + # tile_size is optional + "agg_methods": { + "chl": "mean", + }, + } + @classmethod def setUpClass(cls): - cls.levels_dir = "memory://xrlint-test.levels" - cls.fs, _ = fsspec.core.url_to_fs(cls.levels_dir) - cls.fs.mkdir(cls.levels_dir) - - cls.nx = nx = 720 - cls.ny = ny = nx // 2 - cls.nt = 3 - cls.nl = 4 - - for level in range(cls.nl): - dataset = make_cube(nx, ny, cls.nt) - dataset.to_zarr(f"{cls.levels_dir}/{level}.zarr", write_empty_chunks=False) - nx //= 2 - ny //= 2 - - cls.meta_path = f"{cls.levels_dir}/.zlevels" - cls.meta_content = { - "version": "1.0", - "num_levels": cls.nl, - "use_saved_levels": False, - "agg_methods": { - "chl": "mean", - }, - } + fs, _ = fsspec.core.url_to_fs(cls.levels_dir) + cls.fs: fsspec.AbstractFileSystem = fs + + def setUp(self): + self._delete_files() + self.fs.mkdir(self.levels_dir) + + level_datasets = make_cube_levels( + self.x_size, self.y_size, self.time_size, self.num_levels + ) + for level, dataset in enumerate(level_datasets): + dataset.to_zarr(f"{self.levels_dir}/{level}.zarr", write_empty_chunks=False) - def test_preprocess(self): with self.fs.open(self.meta_path, mode="wt") as stream: json.dump(self.meta_content, stream, indent=2) - processor = MultiLevelDatasetProcessor() - datasets = processor.preprocess(self.levels_dir, {}) + def tearDown(self): + self._delete_files() + + def _delete_files(self): + if self.fs.exists(self.levels_dir): + self.fs.delete(self.levels_dir, recursive=True) + + def assert_levels_ok( + self, datasets: Any, expect_meta: bool = False, expect_link: bool = False + ): self.assertIsInstance(datasets, list) - self.assertEqual(self.nl, len(datasets)) + self.assertEqual(self.num_levels, len(datasets)) for i, (dataset, file_path) in enumerate(datasets): - self.assertEqual(f"{self.levels_dir}/{i}.zarr", file_path) + if expect_link and i == 0: + self.assertEqual(f"memory://{self.levels_name}.zarr", file_path) + else: + self.assertEqual( + f"memory://{self.levels_name}.levels/{i}.zarr", file_path + ) level_info = dataset.attrs.get(ML_INFO_ATTR) self.assertIsInstance(level_info, LevelInfo) self.assertEqual(i, level_info.level) - 
self.assertEqual(self.nl, level_info.num_levels) - self.assertIsInstance(level_info.meta, MultiLevelDatasetMeta) + self.assertEqual(self.num_levels, level_info.num_levels) self.assertIsInstance(level_info.datasets, list) - self.assertEqual(self.nl, len(level_info.datasets)) + self.assertEqual(self.num_levels, len(level_info.datasets)) + meta = level_info.meta + if expect_meta: + self.assertIsInstance(meta, MultiLevelDatasetMeta) + self.assertEqual("1.0", meta.version) + self.assertEqual(self.num_levels, meta.num_levels) + self.assertEqual(False, meta.use_saved_levels) + self.assertEqual(None, meta.tile_size) + self.assertEqual({"chl": "mean"}, meta.agg_methods) + else: + self.assertIsNone(meta) - self.fs.delete(self.meta_path) + def test_preprocess(self): + processor = MultiLevelDatasetProcessor() + datasets = processor.preprocess(self.levels_dir, {}) + self.assert_levels_ok(datasets, expect_meta=True) def test_preprocess_no_meta(self): + self.fs.delete(self.meta_path) processor = MultiLevelDatasetProcessor() datasets = processor.preprocess(self.levels_dir, {}) - self.assertIsInstance(datasets, list) - self.assertEqual(self.nl, len(datasets)) - for i, (dataset, file_path) in enumerate(datasets): - self.assertEqual(f"{self.levels_dir}/{i}.zarr", file_path) - level_info = dataset.attrs.get(ML_INFO_ATTR) - self.assertIsInstance(level_info, LevelInfo) - self.assertEqual(i, level_info.level) - self.assertEqual(self.nl, level_info.num_levels) - self.assertEqual(None, level_info.meta) - self.assertIsInstance(level_info.datasets, list) - self.assertEqual(self.nl, len(level_info.datasets)) + self.assert_levels_ok(datasets, expect_meta=False) + + def test_preprocess_with_link(self): + self.fs.copy( + f"{self.levels_dir}/0.zarr", + f"memory://{self.levels_name}.zarr", + recursive=True, + ) + self.fs.delete(f"{self.levels_dir}/0.zarr", recursive=True) + self.fs.write_text(f"{self.levels_dir}/0.link", f"../{self.levels_name}.zarr") + processor = MultiLevelDatasetProcessor() + datasets = processor.preprocess(self.levels_dir, {}) + self.assert_levels_ok(datasets, expect_meta=True, expect_link=True) + + def test_preprocess_fail_empty(self): + for i in range(self.num_levels): + self.fs.delete(f"{self.levels_dir}/{i}.zarr", recursive=True) + processor = MultiLevelDatasetProcessor() + with pytest.raises(ValueError, match="empty multi-level dataset"): + processor.preprocess(self.levels_dir, {}) + + def test_preprocess_fail_missing(self): + self.fs.delete(f"{self.levels_dir}/1.zarr", recursive=True) + processor = MultiLevelDatasetProcessor() + with pytest.raises( + ValueError, match="missing dataset for level 1 in multi-level dataset" + ): + processor.preprocess(self.levels_dir, {}) def test_postprocess(self): processor = MultiLevelDatasetProcessor() diff --git a/xrlint/cli/engine.py b/xrlint/cli/engine.py index f8eccf9..b40d99c 100644 --- a/xrlint/cli/engine.py +++ b/xrlint/cli/engine.py @@ -181,7 +181,6 @@ def compute_config(p: str): if fs.isdir(root): for path, dirs, files in fs.walk(root, topdown=True): - for d in list(dirs): d_path = f"{path}/{d}" c = compute_config(d_path) diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py index 3e4c3ee..67580a2 100644 --- a/xrlint/plugins/xcube/processors/mldataset.py +++ b/xrlint/plugins/xcube/processors/mldataset.py @@ -15,6 +15,7 @@ set_dataset_level_info, LevelInfo, MultiLevelDatasetMeta, + norm_path, ) from xrlint.processor import ProcessorOp from xrlint.result import Message @@ -37,21 +38,16 @@ def preprocess( 
for f in fs.listdir(fs_path, detail=False) ] + # check for optional ".zlevels" that provides meta-info meta = None if ML_META_FILENAME in file_names: - try: - with fs.open(f"{fs_path}/{ML_META_FILENAME}") as stream: - meta = parse_multi_level_dataset_meta(stream) - except FileNotFoundError: - pass + with fs.open(f"{fs_path}/{ML_META_FILENAME}") as stream: + meta = MultiLevelDatasetMeta.from_value(json.load(stream)) + # check for optional ".0.link" that locates level 0 somewhere else level_0_path = None if "0.link" in file_names: - try: - with fs.open(f"{fs_path}/0.link") as stream: - level_0_path = stream.read() - except FileNotFoundError: - pass + level_0_path = fs.read_text(f"{fs_path}/0.link") level_names, num_levels = parse_levels(file_names, level_0_path) @@ -59,7 +55,7 @@ def preprocess( level_datasets: list[xr.Dataset | None] = [] for level, level_name in level_names.items(): - level_path = f"{file_path}/{level_name}" + level_path = norm_path(f"{file_path}/{level_name}") level_dataset = xr.open_dataset(level_path, engine=engine, **opener_options) level_datasets.append((level_dataset, level_path)) @@ -116,22 +112,3 @@ def parse_levels( f"missing dataset for level {level} in multi-level dataset" ) return level_names, num_levels - - -def parse_multi_level_dataset_meta(stream: Any) -> MultiLevelDatasetMeta: - meta_content = json.load(stream) - - msg_prefix = f"illegal xcube {ML_META_FILENAME!r} file" - - if not isinstance(meta_content, dict): - raise ValueError(f"{msg_prefix}, JSON object expected") - - version = meta_content.get("version") - if not isinstance(version, str) or not version.startswith("1."): - raise ValueError(f"{msg_prefix}, missing or invalid 'version'") - - num_levels = meta_content.get("num_levels") - if not isinstance(num_levels, int) or num_levels < 1: - raise ValueError(f"{msg_prefix}, missing or invalid 'num_levels'") - - return MultiLevelDatasetMeta(**meta_content) diff --git a/xrlint/plugins/xcube/rules/ml_dataset_meta.py b/xrlint/plugins/xcube/rules/ml_dataset_meta.py index 99d0aeb..a77c1c1 100644 --- a/xrlint/plugins/xcube/rules/ml_dataset_meta.py +++ b/xrlint/plugins/xcube/rules/ml_dataset_meta.py @@ -1,7 +1,7 @@ from xrlint.node import DatasetNode from xrlint.plugins.xcube.constants import ML_META_FILENAME from xrlint.plugins.xcube.plugin import plugin -from xrlint.plugins.xcube.util import get_dataset_level_info +from xrlint.plugins.xcube.util import get_dataset_level_info, is_spatial_var from xrlint.rule import RuleContext, RuleOp from xrlint.util.formatting import format_item @@ -11,11 +11,11 @@ version="1.0.0", type="suggestion", description=( - f"Multi-level datasets should provide {ML_META_FILENAME!r} meta information file" - f" and if so, it should be consistent." + f"Multi-level datasets should provide {ML_META_FILENAME!r}" + f" meta information file and if so, it should be consistent." ), docs_url=( - "https://xcube.readthedocs.io/en/latest/cubespec.html#encoding-of-colors" + "https://xcube.readthedocs.io/en/latest/mldatasets.html#the-xcube-levels-format" ), ) class MultiLevelDatasetMeta(RuleOp): @@ -34,23 +34,47 @@ def dataset(self, ctx: RuleContext, node: DatasetNode): meta = level_info.meta if meta is None: ctx.report( - f"Missing {ML_META_FILENAME!r} file," + f"Missing {ML_META_FILENAME!r} meta-info file," f" therefore dataset cannot be extended." 
) return - actual_count: int = level_info.num_levels - meta_count = meta.num_levels - if meta_count != actual_count: + if not meta.version.startswith("1."): + ctx.report(f"Unsupported {ML_META_FILENAME!r} meta-info version") + + if meta.num_levels < 0: + ctx.report( + f"Invalid 'num_levels' in {ML_META_FILENAME!r} meta-info:" + f" {meta.num_levels}" + ) + elif meta.num_levels != level_info.num_levels: + ctx.report( + f"Expected {format_item(meta.num_levels, 'level')}," + f" but found {level_info.num_levels}" + ) + + if meta.use_saved_levels is None: ctx.report( - f"Expected {format_item(meta_count, 'level')}, but found {actual_count}" + f"Missing value for 'use_saved_levels'" + f" in {ML_META_FILENAME!r} meta-info." ) - actual_count: int = level_info.num_levels - meta_count = meta.use_saved_levels - if meta_count != actual_count: + if not meta.agg_methods: ctx.report( - f"Expected {format_item(meta_count, 'level')}, but found {actual_count}" + f"Missing value for 'agg_methods' in {ML_META_FILENAME!r} meta-info." ) + else: + for var_name, var in node.dataset.data_vars.items(): + if is_spatial_var(var) and not meta.agg_methods.get(var_name): + ctx.report( + f"Missing value for variable {var_name!r}" + f" in 'agg_methods' of {ML_META_FILENAME!r} meta-info." + ) + for var_name in meta.agg_methods.keys(): + if var_name not in node.dataset: + ctx.report( + f"Variable {var_name!r} not found in dataset, but specified" + f" in 'agg_methods' of {ML_META_FILENAME!r} meta-info." + ) - # TODO: verify variables listed in meta + # Later: check meta.tile_size as well... diff --git a/xrlint/plugins/xcube/rules/ml_dataset_resolution.py b/xrlint/plugins/xcube/rules/ml_dataset_resolution.py index ce00558..b58c886 100644 --- a/xrlint/plugins/xcube/rules/ml_dataset_resolution.py +++ b/xrlint/plugins/xcube/rules/ml_dataset_resolution.py @@ -14,9 +14,7 @@ "Multi-level dataset levels should provide spatial resolutions" " decreasing by powers of two." ), - docs_url=( - "https://xcube.readthedocs.io/en/latest/cubespec.html#encoding-of-colors" - ), + docs_url="https://xcube.readthedocs.io/en/latest/mldatasets.html#definition", ) class MultiLevelDatasetMeta(RuleOp): def dataset(self, ctx: RuleContext, node: DatasetNode): @@ -55,3 +53,5 @@ def dataset(self, ctx: RuleContext, node: DatasetNode): f"Expected size of dimension {y_name!r} in level {level}" f" to be {expected_level_height}, but was {level_height}" ) + + # Here: check spatial coordinates... diff --git a/xrlint/plugins/xcube/util.py b/xrlint/plugins/xcube/util.py index 1cb2002..373c75e 100644 --- a/xrlint/plugins/xcube/util.py +++ b/xrlint/plugins/xcube/util.py @@ -3,23 +3,43 @@ import xarray as xr +from xrlint.util.constructible import MappingConstructible from .constants import LAT_NAME, LON_NAME, X_NAME, Y_NAME, ML_INFO_ATTR @dataclass(frozen=True, kw_only=True) -class MultiLevelDatasetMeta: +class MultiLevelDatasetMeta(MappingConstructible): + """The contents of a xcube `.zlevels` meta-info file.""" + version: str num_levels: int + tile_size: tuple[int, int] | None = None use_saved_levels: bool | None = None agg_methods: dict[str, str] | None = None @dataclass(frozen=True, kw_only=True) class LevelInfo: + """Level info added to each level dataset to allow for validation + of multi-level datasets. 
+ + Note, this needs to be kept in sync with the multi-level dataset processor, + which constructs and attaches these objects. + """ + level: int + """Current level index.""" + num_levels: int + """Actual number of levels found.""" + datasets: list[tuple[xr.Dataset, str]] + """A list of `num_levels` pairs, each comprising + a level dataset and its file path. + """ + meta: MultiLevelDatasetMeta | None = None + """Content of a `.zlevels` file, if the file was found.""" def get_dataset_level_info(dataset: xr.Dataset) -> LevelInfo | None: @@ -66,3 +86,13 @@ def get_spatial_size_for_dims( elif k == y_name: y_size = v return x_size, y_size + + +def norm_path(level_path: str) -> str: + parts = level_path.replace("\\", "/").split("/") + level_path = "/".join( + p + for i, p in enumerate(parts) + if p not in (".", "..") and (i == len(parts) - 1 or parts[i + 1] != "..") + ) + return level_path From f44c132bfaa6b8514ee1c1aa50beb1c4a9a96dbd Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Fri, 24 Jan 2025 15:48:38 +0100 Subject: [PATCH 14/17] added final tests --- CHANGES.md | 3 +- docs/rule-ref.md | 21 +++ mkruleref.py | 24 +++- tests/plugins/xcube/helpers.py | 121 +++++++++++------- .../xcube/processors/test_mldataset.py | 10 +- .../xcube/rules/test_ml_dataset_meta.py | 87 +++++++++++++ .../xcube/rules/test_ml_dataset_time.py | 39 ++++++ .../plugins/xcube/rules/test_ml_dataset_xy.py | 57 +++++++++ tests/plugins/xcube/test_plugin.py | 3 +- xrlint/plugins/xcube/__init__.py | 5 +- xrlint/plugins/xcube/processors/mldataset.py | 18 +-- xrlint/plugins/xcube/rules/ml_dataset_meta.py | 4 +- xrlint/plugins/xcube/rules/ml_dataset_time.py | 39 ++++++ ...dataset_resolution.py => ml_dataset_xy.py} | 6 +- xrlint/plugins/xcube/util.py | 20 ++- 15 files changed, 375 insertions(+), 82 deletions(-) create mode 100644 tests/plugins/xcube/rules/test_ml_dataset_meta.py create mode 100644 tests/plugins/xcube/rules/test_ml_dataset_time.py create mode 100644 tests/plugins/xcube/rules/test_ml_dataset_xy.py create mode 100644 xrlint/plugins/xcube/rules/ml_dataset_time.py rename xrlint/plugins/xcube/rules/{ml_dataset_resolution.py => ml_dataset_xy.py} (94%) diff --git a/CHANGES.md b/CHANGES.md index a3b9b84..43c0291 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,7 +4,8 @@ - New xcube multi-level dataset rules: - `ml-dataset-meta`: verifies that a meta info file exists and is consistent - - `ml-dataset-resolution`: verifies that the levels have expected resolutions + - `ml-dataset-xy`: verifies that the levels have the expected spatial resolutions + - `ml-dataset-time`: verifies that the levels have the expected time dimension, if any - Now supporting xcube multi-level datasets `*.levels`: - Added xcube plugin processor `"xcube/multi-level-dataset"` that is used inside the predefined xcube configurations "all" and "recommended". diff --git a/docs/rule-ref.md b/docs/rule-ref.md index d9aaf0b..bb4219d 100644 --- a/docs/rule-ref.md +++ b/docs/rule-ref.md @@ -114,6 +114,27 @@ Latitude and longitude coordinates and dimensions should be called 'lat' and 'lo Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt: +### :material-lightbulb: `ml-dataset-meta` + +Multi-level datasets should provide '.zlevels' meta information file and if so, it should be consistent. +[:material-information-variant:](https://xcube.readthedocs.io/en/latest/mldatasets.html#the-xcube-levels-format) + +Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt: + +### :material-bug: `ml-dataset-time` + +The `time` dimension of multi-level datasets should use a chunk size of 1.
This allows for faster image tile generation for visualisation. +[:material-information-variant:](https://xcube.readthedocs.io/en/latest/mldatasets.html#definition) + +Contained in: `all`-:material-lightning-bolt: `recommended`-:material-alert: + +### :material-bug: `ml-dataset-xy` + +Multi-level dataset levels should provide spatial resolutions decreasing by powers of two. +[:material-information-variant:](https://xcube.readthedocs.io/en/latest/mldatasets.html#definition) + +Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt: + ### :material-bug: `single-grid-mapping` A single grid mapping shall be used for all spatial data variables of a datacube. diff --git a/mkruleref.py b/mkruleref.py index ebd1a31..6e2d396 100644 --- a/mkruleref.py +++ b/mkruleref.py @@ -1,4 +1,5 @@ from xrlint.plugin import Plugin +from xrlint.rule import RuleConfig # for icons, see # https://squidfunk.github.io/mkdocs-material/reference/icons-emojis/ @@ -39,7 +40,7 @@ def write_rule_ref_page(): def write_plugin_rules(stream, plugin: Plugin): - configs = plugin.configs + config_rules = get_plugin_rule_configs(plugin) for rule_id in sorted(plugin.rules.keys()): rule_meta = plugin.rules[rule_id].meta stream.write( @@ -51,9 +52,8 @@ def write_plugin_rules(stream, plugin: Plugin): stream.write("\n\n") # List the predefined configurations that contain the rule stream.write("Contained in: ") - for config_id in sorted(configs.keys()): - config = configs[config_id] - rule_configs = config.rules or {} + for config_id in sorted(config_rules.keys()): + rule_configs = config_rules[config_id] rule_config = rule_configs.get(rule_id) or rule_configs.get( f"{plugin.meta.name}/{rule_id}" ) @@ -62,5 +62,21 @@ def write_plugin_rules(stream, plugin: Plugin): stream.write("\n\n") +def get_plugin_rule_configs(plugin): + configs = plugin.configs + config_rules: dict[str, dict[str, RuleConfig]] = {} + for config_name, config_list in configs.items(): + # note, here we assume most plugins configure their rules + # in one dedicated config object only. However, this is not + # the general case as file patterns may be used to make the + # rules configurations specific. + rule_configs = {} + for config in config_list: + if config.rules: + rule_configs.update(config.rules) + config_rules[config_name] = rule_configs + return config_rules + + if __name__ == "__main__": write_rule_ref_page() diff --git a/tests/plugins/xcube/helpers.py b/tests/plugins/xcube/helpers.py index f397500..181f0e2 100644 --- a/tests/plugins/xcube/helpers.py +++ b/tests/plugins/xcube/helpers.py @@ -3,70 +3,95 @@ import numpy as np import xarray as xr +from xrlint.plugins.xcube.util import LevelsMeta, attach_dataset_level_infos -def make_cube_levels(nx: int, ny: int, nt: int, nl: int) -> list[xr.Dataset]: - return [ + +def make_cube_levels( + nl: int, + nx: int, + ny: int, + nt: int | None = None, + meta: LevelsMeta | None = None, + force_infos: bool = False, +) -> list[xr.Dataset]: + levels = [ make_cube(math.ceil(nx >> level), math.ceil(ny >> level), nt) for level in range(nl) ] + if meta is not None or force_infos: + attach_dataset_level_infos( + [(ds, f"{i}.zarr") for i, ds in enumerate(levels)], meta=meta + ) + return levels -def make_cube(nx: int, ny: int, nt: int) -> xr.Dataset: +def make_cube(nx: int, ny: int, nt: int | None = None) -> xr.Dataset: """Make an in-memory dataset that should pass all xcube rules. 
     Args:
         nx: length of the lon-dimension
         ny: length of the lat-dimension
-        nt: length of the time-dimension
+        nt: length of the time-dimension, optional

     Returns:
         an in-memory dataset with one 3-d data variable "chl"
-        with dimensions "time", "lat", "lon".
+        with dimensions ["time",] "lat", "lon".
     """
+    x_attrs = dict(
+        long_name="longitude",
+        standard_name="longitude",
+        units="degrees_east",
+    )
+    y_attrs = dict(
+        long_name="latitude",
+        standard_name="latitude",
+        units="degrees_north",
+    )
+
     dx = 180.0 / nx
     dy = 90.0 / ny
-    return xr.Dataset(
-        coords=dict(
-            lon=xr.DataArray(
-                np.linspace(-180 + dx, 180 - dx, nx),
-                dims="lon",
-                attrs=dict(
-                    long_name="longitude",
-                    standard_name="longitude",
-                    units="degrees_east",
-                ),
-            ),
-            lat=xr.DataArray(
-                np.linspace(-90 + dy, 90 - dy, ny),
-                dims="lat",
-                attrs=dict(
-                    long_name="latitude",
-                    standard_name="latitude",
-                    units="degrees_north",
-                ),
+    x_data = np.linspace(-180 + dx, 180 - dx, nx)
+    y_data = np.linspace(-90 + dy, 90 - dy, ny)
+
+    chl_attrs = dict(
+        long_name="chlorophyll concentration",
+        standard_name="chlorophyll_concentration",
+        units="mg/m^3",
+        _FillValue=0,
+    )
+    chl_chunks = dict(lat=min(ny, 90), lon=min(nx, 90))
+
+    ds_attrs = dict(title="Chlorophyll")
+
+    coords = dict(
+        lon=xr.DataArray(x_data, dims="lon", attrs=x_attrs),
+        lat=xr.DataArray(y_data, dims="lat", attrs=y_attrs),
+    )
+
+    if nt is None:
+        return xr.Dataset(
+            data_vars=dict(
+                chl=xr.DataArray(
+                    np.zeros((ny, nx)), dims=["lat", "lon"], attrs=chl_attrs
+                ).chunk(**chl_chunks),
             ),
-            time=xr.DataArray(
-                range(nt),
-                dims="time",
-                attrs=dict(
-                    long_name="time",
-                    standard_name="time",
-                    units="days since 2024-06-10:12:00:00 utc",
-                    calendar="gregorian",
-                ),
+            coords=coords,
+            attrs=ds_attrs,
+        )
+    else:
+        time_attrs = dict(
+            long_name="time",
+            standard_name="time",
+            units="days since 2024-06-10:12:00:00 utc",
+            calendar="gregorian",
+        )
+        coords.update(time=xr.DataArray(range(nt), dims="time", attrs=time_attrs))
+        return xr.Dataset(
+            data_vars=dict(
+                chl=xr.DataArray(
+                    np.zeros((nt, ny, nx)), dims=["time", "lat", "lon"], attrs=chl_attrs
+                ).chunk(time=1, **chl_chunks),
             ),
-        ),
-        data_vars=dict(
-            chl=xr.DataArray(
-                np.zeros((nt, ny, nx)),
-                dims=["time", "lat", "lon"],
-                attrs=dict(
-                    long_name="chlorophyll concentration",
-                    standard_name="chlorophyll_concentration",
-                    units="mg/m^3",
-                    _FillValue=0,
-                ),
-            ).chunk(time=1, lat=min(ny, 90), lon=min(nx, 90)),
-        ),
-        attrs=dict(),
-    )
+            coords=coords,
+            attrs=ds_attrs,
+        )
diff --git a/tests/plugins/xcube/processors/test_mldataset.py b/tests/plugins/xcube/processors/test_mldataset.py
index 8889f6c..f0511a6 100644
--- a/tests/plugins/xcube/processors/test_mldataset.py
+++ b/tests/plugins/xcube/processors/test_mldataset.py
@@ -7,10 +7,10 @@
 from tests.plugins.xcube.helpers import make_cube_levels
 from xrlint.plugins.xcube.constants import ML_INFO_ATTR
-from xrlint.plugins.xcube.processors.mldataset import (
-    MultiLevelDatasetProcessor,
+from xrlint.plugins.xcube.processors.mldataset import MultiLevelDatasetProcessor
+from xrlint.plugins.xcube.util import (
     LevelInfo,
-    MultiLevelDatasetMeta,
+    LevelsMeta,
 )
 from xrlint.result import Message

@@ -45,7 +45,7 @@ def setUp(self):
         self.fs.mkdir(self.levels_dir)

         level_datasets = make_cube_levels(
-            self.x_size, self.y_size, self.time_size, self.num_levels
+            self.num_levels, self.x_size, self.y_size, self.time_size
         )
         for level, dataset in enumerate(level_datasets):
             dataset.to_zarr(f"{self.levels_dir}/{level}.zarr", write_empty_chunks=False)
@@ -80,7 +80,7 @@ def assert_levels_ok(
         self.assertEqual(self.num_levels, len(level_info.datasets))
         meta = level_info.meta
         if expect_meta:
-            self.assertIsInstance(meta, MultiLevelDatasetMeta)
+            self.assertIsInstance(meta, LevelsMeta)
             self.assertEqual("1.0", meta.version)
             self.assertEqual(self.num_levels, meta.num_levels)
             self.assertEqual(False, meta.use_saved_levels)
diff --git a/tests/plugins/xcube/rules/test_ml_dataset_meta.py b/tests/plugins/xcube/rules/test_ml_dataset_meta.py
new file mode 100644
index 0000000..5ad27a0
--- /dev/null
+++ b/tests/plugins/xcube/rules/test_ml_dataset_meta.py
@@ -0,0 +1,87 @@
+import xarray as xr
+from tests.plugins.xcube.helpers import make_cube_levels
+from xrlint.plugins.xcube.rules.ml_dataset_meta import MLDatasetMeta
+from xrlint.plugins.xcube.util import (
+    LevelsMeta,
+    set_dataset_level_info,
+    LevelInfo,
+    get_dataset_level_info,
+)
+from xrlint.testing import RuleTest, RuleTester
+
+
+def _replace_meta(dataset: xr.Dataset, meta: LevelsMeta) -> xr.Dataset:
+    dataset = dataset.copy()
+    old_level_info = get_dataset_level_info(dataset)
+    new_level_info = LevelInfo(
+        level=old_level_info.level,
+        num_levels=old_level_info.num_levels,
+        datasets=old_level_info.datasets,
+        meta=meta,
+    )
+    set_dataset_level_info(dataset, new_level_info)
+    return dataset
+
+
+levels_with_meta = make_cube_levels(
+    4,
+    720,
+    360,
+    meta=LevelsMeta(
+        version="1.0",
+        num_levels=4,
+        use_saved_levels=True,
+        agg_methods={"chl": "mean"},
+    ),
+)
+
+valid_dataset_0 = levels_with_meta[0]
+valid_dataset_1 = levels_with_meta[1]
+valid_dataset_2 = levels_with_meta[2]
+valid_dataset_3 = xr.Dataset()
+
+levels_wo_meta = make_cube_levels(4, 720, 360, force_infos=True)
+invalid_dataset_0 = levels_wo_meta[0]
+invalid_dataset_1 = _replace_meta(
+    levels_wo_meta[0].copy(),
+    meta=LevelsMeta(
+        version="2.0",  # error: != "1.x"
+        num_levels=0,  # error: < 1
+        # error: missing use_saved_levels=False
+        # error: missing agg_methods={"chl": "mean"}
+    ),
+)
+invalid_dataset_2 = _replace_meta(
+    levels_wo_meta[0],
+    meta=LevelsMeta(
+        version="1.0",  # ok
+        num_levels=3,  # error: != level_info.num_levels
+    ),
+)
+
+invalid_dataset_3 = _replace_meta(
+    levels_wo_meta[0],
+    meta=LevelsMeta(
+        version="1.0",  # ok
+        num_levels=4,  # ok
+        use_saved_levels=False,  # ok
+        agg_methods={"tsm": "median"},  # error: where is "chl"?
+    ),
+)
+
+MLDatasetMetaTest = RuleTester.define_test(
+    "ml-dataset-meta",
+    MLDatasetMeta,
+    valid=[
+        RuleTest(dataset=valid_dataset_0),
+        RuleTest(dataset=valid_dataset_1),
+        RuleTest(dataset=valid_dataset_2),
+        RuleTest(dataset=valid_dataset_3),
+    ],
+    invalid=[
+        RuleTest(dataset=invalid_dataset_0),
+        RuleTest(dataset=invalid_dataset_1),
+        RuleTest(dataset=invalid_dataset_2),
+        RuleTest(dataset=invalid_dataset_3),
+    ],
+)
diff --git a/tests/plugins/xcube/rules/test_ml_dataset_time.py b/tests/plugins/xcube/rules/test_ml_dataset_time.py
new file mode 100644
index 0000000..8d3d6f4
--- /dev/null
+++ b/tests/plugins/xcube/rules/test_ml_dataset_time.py
@@ -0,0 +1,39 @@
+import xarray as xr
+from tests.plugins.xcube.helpers import make_cube_levels
+from xrlint.plugins.xcube.rules.ml_dataset_time import MLDatasetTime
+from xrlint.plugins.xcube.util import (
+    LevelsMeta,
+)
+from xrlint.testing import RuleTest, RuleTester
+
+
+meta = LevelsMeta(
+    version="1.0",
+    num_levels=4,
+    use_saved_levels=True,
+    agg_methods={"chl": "mean"},
+)
+levels_with_time = make_cube_levels(4, 720, 360, nt=6, meta=meta)
+levels_wo_time = make_cube_levels(4, 720, 360, meta=meta)
+
+valid_dataset_0 = levels_with_time[0]
+valid_dataset_1 = levels_with_time[1]
+valid_dataset_2 = levels_wo_time[0]
+valid_dataset_3 = xr.Dataset()
+
+invalid_dataset_0 = levels_with_time[0].copy()
+invalid_dataset_0["chl"] = invalid_dataset_0["chl"].chunk(time=3)
+
+MLDatasetTimeTest = RuleTester.define_test(
+    "ml-dataset-time",
+    MLDatasetTime,
+    valid=[
+        RuleTest(dataset=valid_dataset_0),
+        RuleTest(dataset=valid_dataset_1),
+        RuleTest(dataset=valid_dataset_2),
+        RuleTest(dataset=valid_dataset_3),
+    ],
+    invalid=[
+        RuleTest(dataset=invalid_dataset_0),
+    ],
+)
diff --git a/tests/plugins/xcube/rules/test_ml_dataset_xy.py b/tests/plugins/xcube/rules/test_ml_dataset_xy.py
new file mode 100644
index 0000000..6faa58a
--- /dev/null
+++ b/tests/plugins/xcube/rules/test_ml_dataset_xy.py
@@ -0,0 +1,57 @@
+import xarray as xr
+from tests.plugins.xcube.helpers import make_cube_levels
+from xrlint.plugins.xcube.rules.ml_dataset_xy import MLDatasetXY
+from xrlint.plugins.xcube.util import (
+    LevelsMeta,
+    get_dataset_level_info,
+)
+from xrlint.testing import RuleTest, RuleTester
+
+
+meta = LevelsMeta(
+    version="1.0",
+    num_levels=4,
+    use_saved_levels=True,
+    agg_methods={"chl": "mean"},
+)
+levels = make_cube_levels(4, 720, 360, nt=3, meta=meta)
+
+valid_dataset_0 = levels[0]
+valid_dataset_1 = levels[1]
+valid_dataset_2 = levels[2]
+valid_dataset_3 = levels[3]
+valid_dataset_4 = xr.Dataset()
+
+levels = make_cube_levels(4, 720, 360, meta=meta)
+valid_dataset_5 = levels[2]
+level_info = get_dataset_level_info(valid_dataset_5)
+for ds, _ in level_info.datasets:
+    # remove spatial vars
+    del ds["chl"]
+
+levels = make_cube_levels(4, 720, 360, meta=meta)
+invalid_dataset_0 = levels[2].copy()
+# simulate resolution mismatch by exchanging 2 levels
+level_info = get_dataset_level_info(invalid_dataset_0)
+level_datasets = level_info.datasets
+level_0_dataset = level_datasets[0]
+level_1_dataset = level_datasets[1]
+level_info.datasets[0] = level_1_dataset
+level_info.datasets[1] = level_0_dataset
+
+
+MLDatasetXYTest = RuleTester.define_test(
+    "ml-dataset-xy",
+    MLDatasetXY,
+    valid=[
+        RuleTest(dataset=valid_dataset_0),
+        RuleTest(dataset=valid_dataset_1),
+        RuleTest(dataset=valid_dataset_2),
+        RuleTest(dataset=valid_dataset_3),
+        RuleTest(dataset=valid_dataset_4),
+        RuleTest(dataset=valid_dataset_5),
+    ],
+    invalid=[
+        RuleTest(dataset=invalid_dataset_0),
+    ],
+)
diff --git a/tests/plugins/xcube/test_plugin.py b/tests/plugins/xcube/test_plugin.py
index a7cd0c6..e469d74 100644
--- a/tests/plugins/xcube/test_plugin.py
+++ b/tests/plugins/xcube/test_plugin.py
@@ -15,7 +15,8 @@ def test_rules_complete(self):
                 "increasing-time",
                 "lat-lon-naming",
                 "ml-dataset-meta",
-                "ml-dataset-resolution",
+                "ml-dataset-time",
+                "ml-dataset-xy",
                 "single-grid-mapping",
                 "time-naming",
             },
diff --git a/xrlint/plugins/xcube/__init__.py b/xrlint/plugins/xcube/__init__.py
index 3a1a917..0c008a9 100644
--- a/xrlint/plugins/xcube/__init__.py
+++ b/xrlint/plugins/xcube/__init__.py
@@ -38,8 +38,9 @@ def export_plugin() -> Plugin:
             "xcube/grid-mapping-naming": "warn",
             "xcube/increasing-time": "error",
             "xcube/lat-lon-naming": "error",
-            "xcube/ml-dataset-meta": "warn",
-            "xcube/ml-dataset-resolution": "error",
+            "xcube/ml-dataset-meta": "error",
+            "xcube/ml-dataset-time": "warn",
+            "xcube/ml-dataset-xy": "error",
             "xcube/single-grid-mapping": "error",
             "xcube/time-naming": "error",
         },
diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py
index 67580a2..fc44eaf 100644
--- a/xrlint/plugins/xcube/processors/mldataset.py
+++ b/xrlint/plugins/xcube/processors/mldataset.py
@@ -12,10 +12,9 @@
 )
 from xrlint.plugins.xcube.plugin import plugin
 from xrlint.plugins.xcube.util import (
-    set_dataset_level_info,
-    LevelInfo,
-    MultiLevelDatasetMeta,
+    LevelsMeta,
     norm_path,
+    attach_dataset_level_infos,
 )
 from xrlint.processor import ProcessorOp
 from xrlint.result import Message
@@ -42,7 +41,7 @@ def preprocess(
         meta = None
         if ML_META_FILENAME in file_names:
             with fs.open(f"{fs_path}/{ML_META_FILENAME}") as stream:
-                meta = MultiLevelDatasetMeta.from_value(json.load(stream))
+                meta = LevelsMeta.from_value(json.load(stream))

         # check for optional ".0.link" that locates level 0 somewhere else
         level_0_path = None
@@ -59,16 +58,7 @@
             level_dataset = xr.open_dataset(level_path, engine=engine, **opener_options)
             level_datasets.append((level_dataset, level_path))

-        for level, (level_dataset, _) in enumerate(level_datasets):
-            set_dataset_level_info(
-                level_dataset,
-                LevelInfo(
-                    level=level,
-                    num_levels=num_levels,
-                    meta=meta,
-                    datasets=level_datasets,
-                ),
-            )
+        attach_dataset_level_infos(level_datasets, meta=meta)

         return level_datasets

diff --git a/xrlint/plugins/xcube/rules/ml_dataset_meta.py b/xrlint/plugins/xcube/rules/ml_dataset_meta.py
index a77c1c1..7d19ab3 100644
--- a/xrlint/plugins/xcube/rules/ml_dataset_meta.py
+++ b/xrlint/plugins/xcube/rules/ml_dataset_meta.py
@@ -18,7 +18,7 @@
         "https://xcube.readthedocs.io/en/latest/mldatasets.html#the-xcube-levels-format"
     ),
 )
-class MultiLevelDatasetMeta(RuleOp):
+class MLDatasetMeta(RuleOp):
     def dataset(self, ctx: RuleContext, node: DatasetNode):
         level_info = get_dataset_level_info(node.dataset)
         if level_info is None:
@@ -42,7 +42,7 @@ def dataset(self, ctx: RuleContext, node: DatasetNode):
         if not meta.version.startswith("1."):
             ctx.report(f"Unsupported {ML_META_FILENAME!r} meta-info version")

-        if meta.num_levels < 0:
+        if meta.num_levels <= 0:
             ctx.report(
                 f"Invalid 'num_levels' in {ML_META_FILENAME!r} meta-info:"
                 f" {meta.num_levels}"
             )
diff --git a/xrlint/plugins/xcube/rules/ml_dataset_time.py b/xrlint/plugins/xcube/rules/ml_dataset_time.py
new file mode 100644
index 0000000..0273af4
--- /dev/null
+++ b/xrlint/plugins/xcube/rules/ml_dataset_time.py
@@ -0,0 +1,39 @@
+from xrlint.node import DatasetNode
+from xrlint.plugins.xcube.constants import TIME_NAME
+from xrlint.plugins.xcube.plugin import plugin
+from xrlint.plugins.xcube.util import get_dataset_level_info, is_spatial_var
+from xrlint.rule import RuleContext, RuleOp
+from xrlint.util.formatting import format_seq
+
+
+@plugin.define_rule(
+    "ml-dataset-time",
+    version="1.0.0",
+    type="problem",
+    description=(
+        "The `time` dimension of multi-level datasets should use a chunk size of 1."
+        " This allows for faster image tile generation for visualisation."
+    ),
+    docs_url="https://xcube.readthedocs.io/en/latest/mldatasets.html#definition",
+)
+class MLDatasetTime(RuleOp):
+    def dataset(self, ctx: RuleContext, node: DatasetNode):
+        level_info = get_dataset_level_info(node.dataset)
+        if level_info is None:
+            # ok, this rule applies only to level datasets opened
+            # by the xcube multi-level processor
+            return
+
+        if TIME_NAME not in node.dataset.sizes or node.dataset.sizes[TIME_NAME] <= 1:
+            # ok, no time dimension used or no time extent
+            return
+
+        for var_name, var in node.dataset.data_vars.items():
+            if is_spatial_var(var) and TIME_NAME in var.dims and var.chunks is not None:
+                time_index = var.dims.index(TIME_NAME)
+                time_chunks = var.chunks[time_index]
+                if not all(c == 1 for c in time_chunks):
+                    ctx.report(
+                        f"Variable {var_name!r} uses chunking for {TIME_NAME!r}"
+                        f" that differs from one: {format_seq(time_chunks)}"
+                    )
diff --git a/xrlint/plugins/xcube/rules/ml_dataset_resolution.py b/xrlint/plugins/xcube/rules/ml_dataset_xy.py
similarity index 94%
rename from xrlint/plugins/xcube/rules/ml_dataset_resolution.py
rename to xrlint/plugins/xcube/rules/ml_dataset_xy.py
index b58c886..863f9a7 100644
--- a/xrlint/plugins/xcube/rules/ml_dataset_resolution.py
+++ b/xrlint/plugins/xcube/rules/ml_dataset_xy.py
@@ -7,7 +7,7 @@


 @plugin.define_rule(
-    "ml-dataset-resolution",
+    "ml-dataset-xy",
     version="1.0.0",
     type="problem",
     description=(
@@ -16,7 +16,7 @@
     ),
     docs_url="https://xcube.readthedocs.io/en/latest/mldatasets.html#definition",
 )
-class MultiLevelDatasetMeta(RuleOp):
+class MLDatasetXY(RuleOp):
     def dataset(self, ctx: RuleContext, node: DatasetNode):
         level_info = get_dataset_level_info(node.dataset)
         if level_info is None:
@@ -33,7 +33,7 @@ def dataset(self, ctx: RuleContext, node: DatasetNode):
         level_0_dataset, _ = datasets[0]
         l0_size = get_spatial_size(level_0_dataset)
         if l0_size is None:
-            # weird - maybe no data vars?
+            # ok, maybe no spatial data vars?
             return

         (x_name, level_0_width), (y_name, level_0_height) = l0_size
diff --git a/xrlint/plugins/xcube/util.py b/xrlint/plugins/xcube/util.py
index 373c75e..cb66758 100644
--- a/xrlint/plugins/xcube/util.py
+++ b/xrlint/plugins/xcube/util.py
@@ -8,7 +8,7 @@


 @dataclass(frozen=True, kw_only=True)
-class MultiLevelDatasetMeta(MappingConstructible):
+class LevelsMeta(MappingConstructible):
     """The contents of a xcube `.zlevels` meta-info file."""

     version: str
@@ -38,7 +38,7 @@ class LevelInfo:
     level dataset and level file path.
""" - meta: MultiLevelDatasetMeta | None = None + meta: LevelsMeta | None = None """Content of a `.zlevels` file, if file was found.""" @@ -50,6 +50,22 @@ def set_dataset_level_info(dataset: xr.Dataset, level_info: LevelInfo): dataset.attrs[ML_INFO_ATTR] = level_info +def attach_dataset_level_infos( + level_datasets: list[tuple[xr.Dataset, str]], + meta: LevelsMeta | None = None, +): + for level, (level_dataset, _) in enumerate(level_datasets): + set_dataset_level_info( + level_dataset, + LevelInfo( + level=level, + num_levels=len(level_datasets), + meta=meta, + datasets=level_datasets, + ), + ) + + def is_spatial_var(var: xr.DataArray) -> bool: """Return 'True' if `var` looks like a spatial 2+d variable.""" if var.ndim < 2: From 60f32af0881dd21503d29cbddac31fb13e5016cd Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Fri, 24 Jan 2025 15:56:24 +0100 Subject: [PATCH 15/17] removed unused fn --- xrlint/plugins/xcube/util.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/xrlint/plugins/xcube/util.py b/xrlint/plugins/xcube/util.py index cb66758..2d56d18 100644 --- a/xrlint/plugins/xcube/util.py +++ b/xrlint/plugins/xcube/util.py @@ -90,20 +90,6 @@ def get_spatial_size( return None -def get_spatial_size_for_dims( - dataset: xr.Dataset, x_name: str, y_name: str -) -> tuple[int, int]: - """Return (x_size, y_size) for given dataset and dim names.""" - x_size: int = 0 - y_size: int = 0 - for k, v in dataset.sizes.items(): - if k == x_name: - x_size = v - elif k == y_name: - y_size = v - return x_size, y_size - - def norm_path(level_path: str) -> str: parts = level_path.replace("\\", "/").split("/") level_path = "/".join( From 6df703516c44c6bdbf3d991de001d219f3e5189b Mon Sep 17 00:00:00 2001 From: Norman Fomferra Date: Fri, 24 Jan 2025 16:12:05 +0100 Subject: [PATCH 16/17] try making FS asynchronous --- .../xcube/processors/test_mldataset.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/plugins/xcube/processors/test_mldataset.py b/tests/plugins/xcube/processors/test_mldataset.py index f0511a6..7a9b005 100644 --- a/tests/plugins/xcube/processors/test_mldataset.py +++ b/tests/plugins/xcube/processors/test_mldataset.py @@ -4,6 +4,7 @@ import fsspec import pytest +import xarray as xr from tests.plugins.xcube.helpers import make_cube_levels from xrlint.plugins.xcube.constants import ML_INFO_ATTR @@ -89,15 +90,20 @@ def assert_levels_ok( else: self.assertIsNone(meta) - def test_preprocess(self): + def preprocess(self) -> list[tuple[xr.Dataset, str]]: processor = MultiLevelDatasetProcessor() - datasets = processor.preprocess(self.levels_dir, {}) + return processor.preprocess( + self.levels_dir, + {"backend_kwargs": {"storage_options": {"asynchronous": True}}}, + ) + + def test_preprocess(self): + datasets = self.preprocess() self.assert_levels_ok(datasets, expect_meta=True) def test_preprocess_no_meta(self): self.fs.delete(self.meta_path) - processor = MultiLevelDatasetProcessor() - datasets = processor.preprocess(self.levels_dir, {}) + datasets = self.preprocess() self.assert_levels_ok(datasets, expect_meta=False) def test_preprocess_with_link(self): @@ -108,24 +114,21 @@ def test_preprocess_with_link(self): ) self.fs.delete(f"{self.levels_dir}/0.zarr", recursive=True) self.fs.write_text(f"{self.levels_dir}/0.link", f"../{self.levels_name}.zarr") - processor = MultiLevelDatasetProcessor() - datasets = processor.preprocess(self.levels_dir, {}) + datasets = self.preprocess() self.assert_levels_ok(datasets, expect_meta=True, 

     def test_preprocess_fail_empty(self):
         for i in range(self.num_levels):
             self.fs.delete(f"{self.levels_dir}/{i}.zarr", recursive=True)
-        processor = MultiLevelDatasetProcessor()
         with pytest.raises(ValueError, match="empty multi-level dataset"):
-            processor.preprocess(self.levels_dir, {})
+            self.preprocess()

     def test_preprocess_fail_missing(self):
         self.fs.delete(f"{self.levels_dir}/1.zarr", recursive=True)
-        processor = MultiLevelDatasetProcessor()
         with pytest.raises(
             ValueError, match="missing dataset for level 1 in multi-level dataset"
         ):
-            processor.preprocess(self.levels_dir, {})
+            self.preprocess()

     def test_postprocess(self):
         processor = MultiLevelDatasetProcessor()

From 0955eda3e2f3acaca61da6e24ccbe4f4c1e1c3e7 Mon Sep 17 00:00:00 2001
From: Norman Fomferra
Date: Fri, 24 Jan 2025 16:34:42 +0100
Subject: [PATCH 17/17] pinned zarr

---
 CHANGES.md                                       | 3 +++
 environment.yml                                  | 2 +-
 pyproject.toml                                   | 2 +-
 tests/plugins/xcube/processors/test_mldataset.py | 9 +++++----
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 43c0291..a4c104e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -33,6 +33,9 @@
   `xrlint.plugins..plugin` module.
 - `JsonSerializable` now recognizes `dataclass` instances and no longer
   serializes property values that are also default values.
+- Pinned the zarr dependency to >=2.18, <3 until the test
+  `tests.plugins.xcube.processors.test_mldataset.MultiLevelDatasetProcessorTest`
+  is adjusted or fsspec's memory filesystem is updated.

 ## Version 0.3.0 (from 2025-01-20)
diff --git a/environment.yml b/environment.yml
index 9d45b3f..e5334fc 100644
--- a/environment.yml
+++ b/environment.yml
@@ -24,4 +24,4 @@ dependencies:
   - pandas
   - netcdf4
   - numpy
-  - zarr
+  - zarr >=2.18,<3 # tests fail with zarr 3+
diff --git a/pyproject.toml b/pyproject.toml
index 5097449..5b03f43 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -77,7 +77,7 @@ dev = [
     "netcdf4",
     "numpy",
     "pandas",
-    "zarr",
+    "zarr >=2.18,<3",
 ]
 doc = [
     "mkdocs",
diff --git a/tests/plugins/xcube/processors/test_mldataset.py b/tests/plugins/xcube/processors/test_mldataset.py
index 7a9b005..69005e4 100644
--- a/tests/plugins/xcube/processors/test_mldataset.py
+++ b/tests/plugins/xcube/processors/test_mldataset.py
@@ -15,6 +15,10 @@
 )
 from xrlint.result import Message

+# TODO: This test requires zarr >=2, <3, because it uses fsspec's
+# memory filesystem, which is not async, but zarr 3 expects all
+# filesystems to be async now.
+

 class MultiLevelDatasetProcessorTest(TestCase):
     levels_name = "xrlint-test"
@@ -92,10 +96,7 @@ def assert_levels_ok(
     def preprocess(self) -> list[tuple[xr.Dataset, str]]:
         processor = MultiLevelDatasetProcessor()
-        return processor.preprocess(
-            self.levels_dir,
-            {"backend_kwargs": {"storage_options": {"asynchronous": True}}},
-        )
+        return processor.preprocess(self.levels_dir, {})

     def test_preprocess(self):
         datasets = self.preprocess()
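
A few notes on the levels layout these patches lint. `MultiLevelDatasetProcessor.preprocess`
walks a directory of level datasets like the one `MultiLevelDatasetProcessorTest` builds
above; roughly the following structure (directory names are illustrative; the `.zlevels`
file name is taken from the `LevelsMeta` docstring, and the `.levels` suffix from the
xcube levels format linked in the rules' `docs_url`):

    xrlint-test.levels/
        .zlevels     # optional JSON meta-info, parsed into LevelsMeta
        0.zarr/      # level 0, full spatial resolution
        1.zarr/      # level 1, downsampled in x and y
        2.zarr/
        3.zarr/

If level 0 is stored elsewhere, `0.zarr` is replaced by a `0.link` text file holding a
relative path such as `../xrlint-test.zarr`, as exercised by `test_preprocess_with_link`.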
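The `.zlevels` file itself is plain JSON that `preprocess` feeds through
`LevelsMeta.from_value(json.load(stream))`. A minimal sketch of that round trip,
assuming JSON field names exactly matching the `LevelsMeta` dataclass attributes and
the values used by the test fixtures above:

    import json

    from xrlint.plugins.xcube.util import LevelsMeta

    text = """
    {
        "version": "1.0",
        "num_levels": 4,
        "use_saved_levels": true,
        "agg_methods": {"chl": "mean"}
    }
    """
    meta = LevelsMeta.from_value(json.loads(text))

    # ml-dataset-meta accepts such meta-info: the version starts with "1.",
    # num_levels is positive, and agg_methods covers the "chl" data variable.
    assert meta.version.startswith("1.")
    assert meta.num_levels > 0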
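Finally, what the new `xcube/ml-dataset-time` rule asks for can be achieved with plain
xarray/dask chunking; a minimal sketch (the cube below is illustrative, not one of the
test fixtures):

    import numpy as np
    import xarray as xr

    # A variable chunked with a time chunk size of 3 -- ml-dataset-time
    # would report this chunking.
    chl = xr.DataArray(
        np.zeros((6, 360, 720)), dims=["time", "lat", "lon"]
    ).chunk(time=3, lat=90, lon=90)
    ds = xr.Dataset(data_vars=dict(chl=chl))

    # Rechunk the time dimension to size 1, the layout the rule expects,
    # so single time steps can be read when generating image tiles.
    ds["chl"] = ds["chl"].chunk(time=1)

    time_index = ds["chl"].dims.index("time")
    assert all(c == 1 for c in ds["chl"].chunks[time_index])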