From 4f63867bdcbb38f50946f782bf9953729c8a7136 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:15:41 -0400 Subject: [PATCH 001/118] Add prospectus --- chunk-grid-prospectus.md | 395 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 395 insertions(+) create mode 100644 chunk-grid-prospectus.md diff --git a/chunk-grid-prospectus.md b/chunk-grid-prospectus.md new file mode 100644 index 0000000000..58e7810c75 --- /dev/null +++ b/chunk-grid-prospectus.md @@ -0,0 +1,395 @@ +# Prospectus: Unified Chunk Grid Design for zarr-python + +**Related:** +- [#3750](https://github.com/zarr-developers/zarr-python/issues/3750) (single ChunkGrid proposal) +- [#3534](https://github.com/zarr-developers/zarr-python/pull/3534) (rectilinear implementation) +- [#3735](https://github.com/zarr-developers/zarr-python/pull/3735) (chunk grid module/registry) +- [ZEP0003](https://github.com/zarr-developers/zeps/blob/main/draft/ZEP0003.md) (variable chunking spec) +- [zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) (sharding v1.1: non-divisible subchunks) +- [zarr-extensions#25](https://github.com/zarr-developers/zarr-extensions/pull/25) (rectilinear extension) +- [zarr-extensions#34](https://github.com/zarr-developers/zarr-extensions/issues/34) (sharding + rectilinear) + +## Problem + +The Zarr V3 spec defines `chunk_grid` as an extension point, suggesting chunk grids should be pluggable like codecs or data types. But chunk grids are fundamentally different: + +- **Codecs are independent:** supporting `zstd` tells you nothing about `gzip`. +- **Chunk grids form a hierarchy:** the rectilinear chunk grid is strictly more general than the regular chunk grid (and zarrs' regular-bounded grid). Any regular grid is expressible as a rectilinear grid. Supporting rectilinear means you support all known grid types for free. 
+ A registry-based plugin system adds complexity without clear benefit — there is no known chunk grid that (a) is more general than rectilinear and (b) retains the tessellation properties that Zarr assumes. All known grids are special cases of the rectilinear grid:
+
+| Grid type | Description | Rectilinear representation |
+|---|---|---|
+| Regular | All chunks same shape | All axes have a single repeated edge length |
+| Regular-bounded (zarrs) | Regular, but boundary chunks trimmed to array extent | Last edge length per axis is `shape % chunk_size` when nonzero (when the shape divides evenly, no trimming occurs and the grid is fully regular) |
+| HPC boundary-padded | Regular interior, larger boundary chunks | First/last edge lengths differ from interior |
+| Fully variable | Arbitrary per-chunk sizes | Direct representation |
+
+If a future grid cannot be expressed as rectilinear (e.g., non-axis-aligned chunking, space-filling curves), it would require fundamentally different indexing and storage. Speculative generality today adds cost without benefit.
+
+## Proposal
+
+Replace the current multi-class chunk grid architecture with a single `ChunkGrid` implementation that handles both regular and rectilinear chunking, and drop user-defined chunk grids.
+
+### Design principles
+
+1. **One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. It serializes to the simplest metadata — `"regular"` when all chunks are uniform, `"rectilinear"` otherwise.
+2. **No chunk grid registry.** Remove the entrypoint-based registration system. A simple name-based dispatch in `parse_chunk_grid()` is sufficient.
+3. **Fixed vs Varying per dimension.** Each axis is internally represented as either `FixedDimension(size)` (one integer — all chunks uniform) or `VaryingDimension(edges, cumulative)` (per-chunk edge lengths with precomputed prefix sums). This avoids expanding regular dimensions into lists of identical values.
+4. **Shape-free grid.** The chunk grid describes a tiling pattern, not a bound region.
It does not store the array shape. Methods that need the shape receive it as a parameter. This matches the Zarr V3 spec where `shape` and `chunk_grid` are independent fields. +5. **Transparent transitions.** Operations like `resize()` can move an array from regular to rectilinear chunking. This transition should be explicit and controllable. + +### Internal representation + +```python +@dataclass(frozen=True) +class FixedDimension: + """All chunks on this axis have the same size.""" + size: int # chunk edge length (> 0) + + def index_to_chunk(self, idx: int) -> int: + return idx // self.size + def chunk_offset(self, chunk_ix: int) -> int: + return chunk_ix * self.size + def chunk_size(self, chunk_ix: int, dim_len: int) -> int: + return min(self.size, dim_len - chunk_ix * self.size) + def indices_to_chunks(self, indices: NDArray) -> NDArray: + return indices // self.size + +@dataclass(frozen=True) +class VaryingDimension: + """Chunks on this axis have explicit per-chunk sizes.""" + edges: tuple[int, ...] # per-chunk edge lengths (all > 0) + cumulative: tuple[int, ...] # prefix sums for O(log n) lookup + + def index_to_chunk(self, idx: int) -> int: + return bisect.bisect_right(self.cumulative, idx) + def chunk_offset(self, chunk_ix: int) -> int: + return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 + def chunk_size(self, chunk_ix: int, dim_len: int) -> int: + return self.edges[chunk_ix] + def indices_to_chunks(self, indices: NDArray) -> NDArray: + return np.searchsorted(self.cumulative, indices, side='right') + +@dataclass(frozen=True) +class ChunkGrid: + dimensions: tuple[FixedDimension | VaryingDimension, ...] + + @property + def is_regular(self) -> bool: + return all(isinstance(d, FixedDimension) for d in self.dimensions) +``` + +`FixedDimension` and `VaryingDimension` share a common interface (`index_to_chunk`, `chunk_offset`, `chunk_size`, `indices_to_chunks`) used directly by the indexing pipeline. 
Memory usage scales with the number of *varying* dimensions and their chunk counts, not with the total number of chunks. + +### API surface + +#### Creating arrays + +```python +# Regular chunks — serializes as {"name": "regular", ...} +arr = zarr.create_array(shape=(100, 200), chunks=(10, 20)) + +# Rectilinear chunks — serializes as {"name": "rectilinear", ...} +arr = zarr.create_array(shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]]) + +# RLE shorthand for rectilinear +arr = zarr.create_array(shape=(1000,), chunks=[[[100, 10]]]) # 10 chunks of size 100 +``` + +#### Inspecting chunk grids + +```python +arr.chunk_grid # ChunkGrid instance (always) +arr.chunk_grid.is_regular # True if all dimensions are Fixed +arr.chunk_grid.chunk_shape # (10, 20) — only when is_regular, else raises +arr.chunk_grid.ndim # number of dimensions + +# Per-chunk queries (array shape passed as parameter): +arr.chunk_grid.get_chunk_shape(arr.shape, chunk_coord=(0, 1)) +arr.chunk_grid.get_chunk_origin(arr.shape, chunk_coord=(0, 1)) +arr.chunk_grid.all_chunk_coords(arr.shape) +arr.chunk_grid.grid_shape(arr.shape) # (10, 10) — chunks per dimension + +# Out-of-bounds returns None: +arr.chunk_grid.get_chunk_shape(arr.shape, chunk_coord=(99, 99)) # None +``` + +#### `.chunks` property + +`.chunks` is retained for regular grids, returning `tuple[int, ...]` as today. For rectilinear grids it raises `NotImplementedError`. `.chunk_grid` is the general-purpose API. 
+ Three different chunk tuple conventions exist in the ecosystem:
+
+| System | Type | Example |
+|---|---|---|
+| Zarr `arr.chunks` | `tuple[int, ...]` | `(256, 512)` |
+| Dask `arr.chunks` | `tuple[tuple[int, ...], ...]` | `((256, 256, 64), (512, 512))` |
+| xarray `.chunks` | `tuple[tuple[int, ...], ...]` | Same as dask |
+
+Switching `.chunks` to dask-style tuples would be a breaking change and risks [expensive materialization for large regular grids](https://github.com/zarr-developers/zarr-python/pull/3534#discussion_r2457283002). The least disruptive path: keep `.chunks` for regular grids (no deprecation), add `.chunk_grid` alongside it, and let downstream libraries migrate at their own pace.
+
+#### Serialization
+
+```python
+# Regular grid:
+{"name": "regular", "configuration": {"chunk_shape": [10, 20]}}
+
+# Rectilinear grid (with RLE compression):
+{"name": "rectilinear", "configuration": {"chunk_shapes": [[10, 20, 30], [[25, 4]]]}}
+```
+
+Both names produce the same `ChunkGrid` class. Unknown names raise an error (chunk grids must always be understood).
+
+#### Resize
+
+```python
+# Default: the new region becomes a single chunk spanning the growth, so the
+# grid becomes rectilinear unless that growth chunk matches the chunk size
+arr.resize((80, 100))  # growth chunk of 20 != chunk size 10 — becomes rectilinear
+
+# Explicit: specify chunks for the new region
+arr.resize((80, 100), chunks=[[10, 20, 30, 20], [25, 25, 25, 25]])
+
+# Staying regular: the growth chunk equals the existing regular chunk size
+arr.resize((70, 100))  # growth chunk of 10 == chunk size 10 — stays regular
+```
+
+### Indexing
+
+The indexing pipeline is deeply coupled to regular grid assumptions.
Every per-dimension indexer (`IntDimIndexer`, `SliceDimIndexer`, `BoolArrayDimIndexer`, `IntArrayDimIndexer`) takes a scalar `dim_chunk_len: int` and uses `//` and `*` for all arithmetic: + +```python +dim_chunk_ix = self.dim_sel // self.dim_chunk_len # IntDimIndexer +dim_offset = dim_chunk_ix * self.dim_chunk_len # SliceDimIndexer +dim_sel_chunk = dim_sel // dim_chunk_len # IntArrayDimIndexer (vectorized) +``` + +For `VaryingDimension`, element-to-chunk mapping becomes a binary search and offset-to-chunk becomes a prefix sum lookup. The indexers must work with either representation. + +**Recommended approach:** Replace `dim_chunk_len: int` with the dimension grid object (`FixedDimension | VaryingDimension`). The shared interface (`index_to_chunk`, `chunk_offset`, `chunk_size`, `indices_to_chunks`) means the indexer code structure stays the same — just replace `dim_sel // dim_chunk_len` with `dim_grid.index_to_chunk(dim_sel)`. This preserves O(1) arithmetic for regular dimensions and uses binary search only for varying ones. + +Alternatives considered: +- **Precompute arrays** (offsets, sizes) at indexer creation and branch on scalar vs array — awkward, two code paths per indexer. +- **Always use `np.searchsorted`** for both types — uniform code but penalizes regular grids. + +### Codec pipeline + +Once the indexers determine *which* chunks to read or write, the codec pipeline needs to know *what shape* each chunk is. Today, `ArrayV3Metadata.get_chunk_spec()` ignores `chunk_coords` entirely — it returns the same `ArraySpec(shape=chunk_grid.chunk_shape)` for every chunk, because all chunks have the same shape in a regular grid. + +For rectilinear grids, each chunk may have a different shape. `get_chunk_spec` must use the coordinates: + +```python +def get_chunk_spec(self, chunk_coords, array_config, prototype) -> ArraySpec: + chunk_shape = self.chunk_grid.get_chunk_shape(self.shape, chunk_coords) + return ArraySpec(shape=chunk_shape, ...) 
+``` + +The codec pipeline uses `ArraySpec.shape` to allocate buffers, decode data, and validate output, so the per-chunk shape must be correct. This is a mechanical change — the `chunk_coords` parameter already exists (currently prefixed with `_` to signal it's unused) — but it touches every read/write path. + +### Sharding + +PR #3534 marks sharding as incompatible with rectilinear chunk grids. This constraint is unnecessary once the design is understood as three independent grid levels: + +``` +Level 1 — Outer chunk grid (shard boundaries) + Can be regular or rectilinear. + e.g., chunks = [[5000, 5980], [5000, 5980]] + +Level 2 — Inner subchunk grid (within each shard) + Always regular, but boundary subchunks may be clipped to shard shape. + e.g., subchunk_shape = [512, 512] + +Level 3 — Shard index + ceil(shard_dim / subchunk_dim) entries per dimension, each (offset, size). +``` + +The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape and subchunk shape. It doesn't need to know whether the outer grid is regular or rectilinear — each shard is self-contained. + +[zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) (sharding v1.1) lifts the requirement that subchunk shapes evenly divide the shard shape. With the proposed `ChunkGrid`, this requires one change: remove the `shard_shape % subchunk_shape == 0` validation. `FixedDimension` already handles boundary clipping. 
+ +These two features compose independently: + +| Outer grid | Subchunk divisibility | Required change | +|---|---|---| +| Regular | Evenly divides (v1.0) | None (works today) | +| Regular | Non-divisible (v1.1) | Remove divisibility validation | +| Rectilinear | Evenly divides | Remove "sharding incompatible" guard | +| Rectilinear | Non-divisible | Both changes; no additional work | + +### What this replaces + +| Current design | Proposed design | +|---|---| +| `ChunkGrid` abstract base class | Single concrete `ChunkGrid` class | +| `RegularChunkGrid` subclass | `ChunkGrid` with `is_regular` property | +| `RectilinearChunkGrid` subclass (#3534) | Same `ChunkGrid` class | +| Chunk grid registry + entrypoints (#3735) | Removed — direct name dispatch | +| `arr.chunks` → `tuple[int, ...]` | Retained for regular grids; `arr.chunk_grid` for general use | + +## Design decisions + +### Why not store the array shape in ChunkGrid? + +[#3736](https://github.com/zarr-developers/zarr-python/issues/3736) proposes adding `array_shape` to the chunk grid, motivated by the awkwardness of passing and re-validating `array_shape` on every method call in PR #3534. zarrs takes the same approach, storing the shape at construction. This prospectus diverges. + +**For:** + +- Simpler method signatures (no repeated `array_shape` parameter). +- Enables precomputing chunk count and boundary sizes. +- Prevents callers from passing the wrong shape. +- Eliminates repeated validation. + +**Against:** + +- The chunk grid is a tiling pattern, not a bound region. In the Zarr V3 spec, `chunk_grid` and `shape` are independent metadata fields. Storing the shape conflates "how to tile" with "what to tile over." Sharding exposes this — the same subchunk configuration produces different `ChunkGrid` instances for different shard shapes. `VaryingDimension` doesn't need the shape at all (edges fully define the grid). +- TensorStore validates the separation in production, storing only `chunk_shape`. 
+- serialization becomes awkward — `to_dict()` would need to return the shape alongside the grid even though the spec doesn't couple them. + +The repeated-validation problem from #3534 is real but has a simpler fix: validate once at `ArrayV3Metadata` construction (where both `shape` and `chunk_grid` are available), then trust that callers pass the correct shape downstream. For `VaryingDimension`, most methods don't use the shape at all — the edges and cumulative sums are self-contained. For `FixedDimension`, only boundary chunk size and grid extent need the shape, and these are computed with a single scalar per dimension, not the full tuple. + +The cost of keeping them separate is one extra parameter on ~5 methods that are called O(1) times per operation. The benefit is a cleaner abstraction that's reusable across contexts (sharding, resize, serialization). + +### Why not a chunk grid registry? + +zarrs uses compile-time + runtime plugin registration. This makes sense for a library that explicitly supports user-defined extensions. For zarr-python, there is no known chunk grid outside the rectilinear family that retains the tessellation properties the codebase assumes. A simple `match` on the grid name in `parse_chunk_grid()` is sufficient and avoids entrypoint complexity. + +### Why a single class instead of a Protocol? + +zarrs uses independent types behind a shared trait. In Rust, the trait system enforces a uniform interface at zero runtime cost. In Python, a Protocol-based approach means every caller programs against an abstract interface, and adding a grid type requires implementing ~10 methods. Since all known grids are special cases of rectilinear, a single class is simpler while supporting the same metadata formats. If a genuinely novel grid type emerges, a Protocol can be extracted at that point. 
+ +## Prior art + +### zarrs (Rust) + +zarrs implements three independent chunk grid types (regular, regular-bounded, rectangular) behind a `ChunkGridTraits` trait. Key patterns adopted: + +- **Fixed vs Varying per dimension** — rectangular grid distinguishes `Fixed(size)` vs `Varying(Vec)` per axis +- **Prefix sums + binary search** — precomputed offsets with `partition_point` for O(log n) lookup +- **None for out-of-bounds** — chunk queries return `Option` instead of panicking +- **Non-zero chunk dimensions** — `NonZeroU64` makes zero-sized chunks unrepresentable +- **Sharding creates a separate grid** — `ShardingCodec` constructs an independent subchunk grid per shard + +### TensorStore (C++) + +TensorStore's `ChunkGridSpecification` stores only `chunk_shape`, not the array shape — validating the shape-free approach. It has both `RegularGridRef` and `IrregularGrid` internally (the latter with sorted breakpoints per dimension), but only the regular grid is used for Zarr V3. No chunk grid registry — the `"regular"` name is hardcoded. + +## Migration + +### Existing PRs + +**#3735** (chunk grid module, +313/−65, approved by @maxrjones) splits `chunk_grids.py` into a `chunk_grids/` package (`__init__.py`, `common.py`, `regular.py`) and adds a chunk grid registry. The module layout is reusable. The registry (`register_chunk_grid` / `get_chunk_grid_class` in `registry.py`) is not — it should be replaced with direct name dispatch before merging. + +**#3737** (chunk grid array shape, +514/−198, draft) implements #3736 by adding `array_shape` to `ChunkGrid`. Depends on #3735. The prospectus argues against storing the array shape in the grid (see Design decisions). This PR should be closed. + +**#3534** (rectilinear implementation, +5716/−408, extensive review) introduces `RectilinearChunkGrid` as a separate subclass. The prospectus proposes a different architecture (single `ChunkGrid` with `FixedDimension`/`VaryingDimension`). 
Reusable components: + +| #3534 component | Disposition | +|---|---| +| `_expand_run_length_encoding` / `_compress_run_length_encoding` | **Keep** as-is | +| `_normalize_rectilinear_chunks` / `_parse_chunk_shapes` | **Keep with modifications** — feed into `VaryingDimension` construction | +| `resolve_chunk_spec` / `ChunksLike` type alias | **Keep** — orthogonal to grid class design | +| `_validate_zarr_format_compatibility` | **Keep** — rectilinear is V3-only | +| `_validate_sharding_compatibility` | **Remove** — sharding is compatible with rectilinear | +| `_validate_data_compatibility` (`from_array` guard) | **Keep for now** — needs separate design work | +| `RectilinearChunkGrid` class / `ConfigurationDict` | **Replace** — single `ChunkGrid` class | +| `chunk_grid` property on `Array`/`AsyncArray` | **Keep** | +| `.chunks` raising for rectilinear | **Keep** | +| Tests | **Adapt** for single-class API | +| Indexing changes | **Insufficient** — `assert isinstance(chunk_grid, RegularChunkGrid)` guards remain | + +Given the scope of architectural changes, a **fresh PR** is more practical than adapting #3534. Rebasing and reworking its core classes would touch nearly every line of a 5700-line diff while inheriting review history that no longer applies. + +**#1483** (ZEP0003 POC, +346/−20, draft, V2) is @martindurant's original proof-of-concept for variable chunking on Zarr V2. It demonstrated feasibility but targets the V2 format and predates the V3 extension point design. Should be closed. + +### Plan + +1. **Amend and merge #3735.** Keep the `chunk_grids/` module layout. Replace the registry with direct name dispatch in `parse_chunk_grid()`. Remove `register_chunk_grid` / `get_chunk_grid_class` from `registry.py` and the entrypoint from `pyproject.toml`. + +2. **Open a new PR** implementing the prospectus: + - `FixedDimension` and `VaryingDimension` dataclasses with shared interface (`index_to_chunk`, `chunk_offset`, `chunk_size`, `indices_to_chunks`). 
+ - Single `ChunkGrid` class with `dimensions: tuple[FixedDimension | VaryingDimension, ...]` and `is_regular`. + - `parse_chunk_grid()` recognizes `"regular"` and `"rectilinear"`. + - Port RLE helpers, `resolve_chunk_spec`, `ChunksLike`, and validation functions from #3534. + - Refactor per-dimension indexers to accept `FixedDimension | VaryingDimension` instead of `dim_chunk_len: int`. + - Update `get_chunk_spec` to compute per-chunk shapes from coordinates. + - Add `arr.chunk_grid` property. Keep `.chunks` for regular grids, raise for rectilinear. + - Remove the "sharding incompatible with rectilinear" guard. + - Adapt tests from #3534. + +3. **Close trial PRs** with comments linking to the new PR and crediting contributions: + - **Close #3534** — credit RLE helpers, validation logic, chunk spec resolution, test cases, and review discussion that shaped the design. + - **Close #3737** — reference the shape-free design decision. + - **Close #1483** — credit as the original POC that motivated the work; superseded by the V3 implementation. + - **Close #3736** — respond with the shape-free design rationale. + +4. **Sharding v1.1** (after zarr-specs#370 is accepted) — separate PR removing the `shard_shape % subchunk_shape == 0` validation in `ShardingCodec`. + +### Downstream migration + +Four active PRs/issues in the ecosystem depend on zarr-python's rectilinear chunk grid support. All currently track #3534 as their upstream dependency. The unified `ChunkGrid` design is a narrower API surface than the two-class hierarchy, so the net effect is less integration work per downstream — but each needs updates. + +#### xarray ([pydata/xarray#10880](https://github.com/pydata/xarray/pull/10880)) + +Draft PR by @keewis (+26/−9 in `xarray/backends/zarr.py`) enabling variable-sized chunk writes and reads via the zarr backend. Currently imports `RectilinearChunkGrid` / `RegularChunkGrid` for feature detection and branches on `isinstance` checks. 
+ +**Required changes:** + +- **Feature detection.** Replace class-existence checks (`hasattr(zarr, 'RectilinearChunkGrid')`) with a version check or try-import of the unified `ChunkGrid`. Since the prospectus exports a single class, detection simplifies to checking whether `ChunkGrid` accepts non-uniform dimensions (or just `zarr.__version__`). +- **Write path.** Currently constructs chunk info that `RectilinearChunkGrid` understands. The prospectus's `chunks=[[10, 20, 30], [25, 25, 25, 25]]` API for `create_array` is a more natural fit — the xarray write path may get simpler. +- **Read path.** Replace `isinstance(chunk_grid, RectilinearChunkGrid)` with `not chunk_grid.is_regular`. Per-dimension chunk sizes come from `chunk_grid.dimensions[i].edges` (for `VaryingDimension`) or are computed from `chunk_grid.dimensions[i].size` (for `FixedDimension`). +- **`validate_grid_chunks_alignment`.** Still needs work regardless of class hierarchy — the approach is the same either way. + +**Effort:** ~1–2 days. The PR is small and the unified API is more ergonomic for xarray's use case. + +#### VirtualiZarr ([zarr-developers/VirtualiZarr#877](https://github.com/zarr-developers/VirtualiZarr/pull/877)) + +Draft PR by @maxrjones adding rectilinear support to `ManifestArray`, with a `has_rectilinear_chunk_grid_support` feature flag and vendored `_is_nested_sequence` helper from #3534. + +**Required changes:** + +- **Drop vendored `_is_nested_sequence`.** The prospectus eliminates `RectilinearChunkGrid` as a separate class, so nested-sequence detection for choosing grid type is unnecessary — just construct `ChunkGrid` with appropriate dimension types. +- **`isinstance` → `.is_regular`.** All `isinstance(chunk_grid, RectilinearChunkGrid)` checks become `not chunk_grid.is_regular`. +- **`ManifestArray.chunks`.** Currently returns `chunk_grid.chunk_shapes` for rectilinear grids. Under the prospectus, chunk shapes come from iterating dimension edges. 
The dask-style `tuple[tuple[int, ...], ...]` format VirtualiZarr uses internally is unaffected. +- **`copy_and_replace_metadata`.** Simplifies: no need to detect nested sequences to pick a grid class. +- **Test environment.** Currently pins jhamman's zarr-python fork — would track whatever branch implements the prospectus. + +**Effort:** ~1–2 days. Mostly mechanical type-check replacements plus dropping the vendored helper. Concat/stack logic is grid-type-agnostic once chunk shapes are available. + +#### Icechunk ([earth-mover/icechunk#1338](https://github.com/earth-mover/icechunk/issues/1338)) + +Investigation issue for supporting rectilinear grids in the IC2 on-disk format. The `DimensionShape { dim_length, chunk_length }` struct needs extension to encode per-chunk sizes. + +**Impact:** Minimal. Icechunk's format changes are driven by the *spec* (ZEP0003 / rectilinear extension), not zarr-python's class hierarchy. The unified `ChunkGrid` means Icechunk's Python-side metadata ingestion handles one type instead of two. The `shift_array` / `reindex` concerns raised in the discussion are orthogonal to this design. + +**Effort:** No change to the work already scoped. May marginally simplify the Python integration layer. + +#### cubed ([cubed-dev/cubed#876](https://github.com/cubed-dev/cubed/issues/876)) + +Draft by @TomNicholas using rectilinear intermediate stores to reduce rechunking stages (+142/−27 across storage adapter, blockwise, and ops). + +**Required changes:** + +- **Store creation.** `zarr_python_v3.py` currently creates `RectilinearChunkGrid` instances directly. Switch to constructing `ChunkGrid` via the prospectus's list-of-lists `chunks` API. +- **Chunk shape queries.** Any `isinstance` checks on grid type become `.is_regular` checks. +- The rechunking algorithm itself is independent of the class hierarchy — it operates on per-dimension chunk tuples internally. + +**Effort:** <1 day. 
Changes are concentrated in the storage adapter layer, and the prospectus's API is a natural fit for cubed's internal representation. + +#### Migration pattern + +All four downstreams follow the same pattern. The migration from the two-class API to the unified API is mechanical: + +| Two-class pattern | Unified pattern | +|---|---| +| `isinstance(cg, RegularChunkGrid)` | `cg.is_regular` | +| `isinstance(cg, RectilinearChunkGrid)` | `not cg.is_regular` | +| `cg.chunk_shape` (regular only) | `cg.chunk_shape` (raises if not regular) | +| `cg.chunk_shapes` (rectilinear) | `tuple(d.edges for d in cg.dimensions)` | +| `RegularChunkGrid(chunk_shape=(...))` | `ChunkGrid.from_regular((...))` or `chunks=(...)` in `create_array` | +| `RectilinearChunkGrid(chunk_shapes=(...))` | `ChunkGrid.from_rectilinear((...))` or `chunks=[[...], [...]]` in `create_array` | +| Feature detection via class import | Version check or `hasattr(ChunkGrid, 'is_regular')` | + +## Open questions + +1. **RLE in the Python API:** Should users pass RLE-encoded chunk specs directly, or only expanded lists? RLE is primarily a serialization concern, but for arrays with millions of chunks it matters at construction time too. +2. **Resize defaults:** When growing a regular array, should the default preserve regularity (extending the last chunk) or create a new chunk for the added region (transitioning to rectilinear)? 
From fb962071c0a636cbbb996e72ed8ed8e3936e50a8 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:11:53 -0400 Subject: [PATCH 002/118] Initial prospectus POC --- src/zarr/codecs/sharding.py | 7 +- src/zarr/core/array.py | 10 +- src/zarr/core/chunk_grids.py | 390 ++++++++++++++++++++---- src/zarr/core/indexing.py | 310 +++++++++++++------ src/zarr/core/metadata/v3.py | 41 ++- tests/test_array.py | 10 +- tests/test_unified_chunk_grid.py | 505 +++++++++++++++++++++++++++++++ 7 files changed, 1095 insertions(+), 178 deletions(-) create mode 100644 tests/test_unified_chunk_grid.py diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 85162c2f74..0b880fb7b8 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -387,9 +387,10 @@ def validate( raise ValueError( "The shard's `chunk_shape` and array's `shape` need to have the same number of dimensions." ) - if not isinstance(chunk_grid, RegularChunkGrid): - raise TypeError("Sharding is only compatible with regular chunk grids.") - if not all( + # Sharding works with both regular and rectilinear outer chunk grids. + # Each shard is self-contained — the ShardingCodec constructs an independent + # inner ChunkGrid per shard using the shard shape and subchunk shape. 
+ if chunk_grid.is_regular and not all( s % c == 0 for s, c in zip( chunk_grid.chunk_shape, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 564d0e915a..55ea25286b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -40,7 +40,7 @@ default_buffer_prototype, ) from zarr.core.buffer.cpu import buffer_prototype as cpu_buffer_prototype -from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks +from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -744,9 +744,12 @@ def _create_metadata_v3( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: DimensionNames = None, attributes: dict[str, JSON] | None = None, + chunk_grid: ChunkGrid | None = None, ) -> ArrayV3Metadata: """ Create an instance of ArrayV3Metadata. + + If `chunk_grid` is provided, it takes precedence over `chunk_shape`. """ filters: tuple[ArrayArrayCodec, ...] compressors: tuple[BytesBytesCodec, ...] 
@@ -774,7 +777,10 @@ def _create_metadata_v3( else: fill_value_parsed = fill_value - chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape) + if chunk_grid is not None: + chunk_grid_parsed: ChunkGrid = chunk_grid + else: + chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape) return ArrayV3Metadata( shape=shape, data_type=dtype, diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 2c7945fa64..33f8ed907b 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -1,16 +1,18 @@ from __future__ import annotations +import bisect import itertools import math import numbers import operator import warnings -from abc import abstractmethod +from collections.abc import Sequence from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, cast import numpy as np +import numpy.typing as npt import zarr from zarr.abc.metadata import Metadata @@ -31,6 +33,338 @@ from zarr.core.array import ShardsLike +# --------------------------------------------------------------------------- +# Per-dimension grid types +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class FixedDimension: + """All chunks on this axis have the same size.""" + + size: int # chunk edge length (> 0) + + def __post_init__(self) -> None: + if self.size < 0: + raise ValueError(f"FixedDimension size must be >= 0, got {self.size}") + + def index_to_chunk(self, idx: int) -> int: + if self.size == 0: + return 0 + return idx // self.size + + def chunk_offset(self, chunk_ix: int) -> int: + return chunk_ix * self.size + + def chunk_size(self, chunk_ix: int, dim_len: int) -> int: + if self.size == 0: + return 0 + return min(self.size, dim_len - chunk_ix * self.size) + + def nchunks(self, dim_len: int) -> int: + if self.size == 0: + return 1 if dim_len == 0 else 0 + return ceildiv(dim_len, self.size) + + def 
indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: + if self.size == 0: + return np.zeros_like(indices) + return indices // self.size + + +@dataclass(frozen=True) +class VaryingDimension: + """Chunks on this axis have explicit per-chunk sizes.""" + + edges: tuple[int, ...] # per-chunk edge lengths (all > 0) + cumulative: tuple[int, ...] # prefix sums for O(log n) lookup + + def __init__(self, edges: Sequence[int]) -> None: + edges_tuple = tuple(edges) + if not edges_tuple: + raise ValueError("VaryingDimension edges must not be empty") + if any(e <= 0 for e in edges_tuple): + raise ValueError(f"All edge lengths must be > 0, got {edges_tuple}") + cumulative = tuple(itertools.accumulate(edges_tuple)) + object.__setattr__(self, "edges", edges_tuple) + object.__setattr__(self, "cumulative", cumulative) + + def index_to_chunk(self, idx: int) -> int: + return bisect.bisect_right(self.cumulative, idx) + + def chunk_offset(self, chunk_ix: int) -> int: + return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 + + def chunk_size(self, chunk_ix: int, dim_len: int) -> int: + return self.edges[chunk_ix] + + def nchunks(self, dim_len: int) -> int: + return len(self.edges) + + def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: + return np.searchsorted(self.cumulative, indices, side="right") + + +DimensionGrid = FixedDimension | VaryingDimension + + +# --------------------------------------------------------------------------- +# RLE helpers (ported from #3534) +# --------------------------------------------------------------------------- + + +def _expand_rle(data: list[list[int]]) -> list[int]: + """Expand run-length encoded chunk sizes: [[size, count], ...] 
# ---------------------------------------------------------------------------
# RLE helpers (ported from #3534)
# ---------------------------------------------------------------------------


def _expand_rle(data: list[list[int]]) -> list[int]:
    """Expand run-length encoded chunk sizes: [[size, count], ...] -> [size, size, ...]

    Raises ValueError if an entry is not a two-element [size, count] pair.
    """
    result: list[int] = []
    for item in data:
        if len(item) != 2:
            raise ValueError(f"RLE entries must be [size, count], got {item}")
        size, count = item
        result.extend([size] * count)
    return result


def _compress_rle(sizes: Sequence[int]) -> list[list[int]]:
    """Compress chunk sizes to RLE: [10,10,10,20,20] -> [[10,3],[20,2]]"""
    if not sizes:
        return []
    result: list[list[int]] = []
    current = sizes[0]
    count = 1
    for s in sizes[1:]:
        if s == current:
            count += 1
        else:
            result.append([current, count])
            current = s
            count = 1
    result.append([current, count])
    return result


# ---------------------------------------------------------------------------
# Unified ChunkGrid
# ---------------------------------------------------------------------------

# Type alias for what users can pass as chunks to create_array
ChunksLike = tuple[int, ...] | list[list[int] | int] | int


@dataclass(frozen=True)
class ChunkGrid(Metadata):
    """
    Unified chunk grid supporting both regular and rectilinear chunking.

    Internally represents each dimension as either FixedDimension (uniform chunks)
    or VaryingDimension (per-chunk edge lengths with prefix sums). Serializes to
    ``"regular"`` metadata when all dimensions are fixed, ``"rectilinear"``
    otherwise.
    """

    dimensions: tuple[DimensionGrid, ...]

    def __init__(self, *, dimensions: tuple[DimensionGrid, ...]) -> None:
        # Frozen dataclass: bypass __setattr__.
        object.__setattr__(self, "dimensions", dimensions)

    @classmethod
    def from_regular(cls, chunk_shape: ShapeLike) -> ChunkGrid:
        """Create a ChunkGrid where all dimensions are fixed (regular)."""
        parsed = parse_shapelike(chunk_shape)
        dims = tuple(FixedDimension(size=s) for s in parsed)
        return cls(dimensions=dims)

    @classmethod
    def from_rectilinear(cls, chunk_shapes: Sequence[Sequence[int]]) -> ChunkGrid:
        """Create a ChunkGrid with per-dimension edge lists.

        Each element of chunk_shapes is a sequence of chunk sizes for that dimension.
        If all sizes in a dimension are identical, it's stored as FixedDimension.
        """
        dims: list[DimensionGrid] = []
        for edges in chunk_shapes:
            edges_list = list(edges)
            if not edges_list:
                raise ValueError("Each dimension must have at least one chunk")
            if all(e == edges_list[0] for e in edges_list):
                # NOTE(review): collapsing to FixedDimension discards the explicit
                # chunk count for this axis; see the round-trip note in to_dict.
                dims.append(FixedDimension(size=edges_list[0]))
            else:
                dims.append(VaryingDimension(edges_list))
        return cls(dimensions=tuple(dims))

    # -- Properties --

    @property
    def ndim(self) -> int:
        """Number of dimensions described by the grid."""
        return len(self.dimensions)

    @property
    def is_regular(self) -> bool:
        """True when every axis has a single uniform chunk size."""
        return all(isinstance(d, FixedDimension) for d in self.dimensions)

    @property
    def chunk_shape(self) -> tuple[int, ...]:
        """Return the uniform chunk shape. Raises if grid is not regular."""
        # Check for a stored _chunk_shape (set by RegularChunkGrid subclass)
        try:
            stored: tuple[int, ...] = object.__getattribute__(self, "_chunk_shape")
        except AttributeError:
            pass
        else:
            return stored
        if not self.is_regular:
            raise ValueError(
                "chunk_shape is only available for regular chunk grids. "
                "Use get_chunk_shape(array_shape, chunk_coords) for rectilinear grids."
            )
        return tuple(d.size for d in self.dimensions)  # type: ignore[union-attr]

    # -- Chunk queries (shape-free where possible) --

    def get_chunk_shape(
        self, array_shape: tuple[int, ...], chunk_coords: tuple[int, ...]
    ) -> tuple[int, ...] | None:
        """Return the shape of a specific chunk, or None if out of bounds."""
        result: list[int] = []
        for dim, dim_len, chunk_ix in zip(self.dimensions, array_shape, chunk_coords, strict=True):
            nch = dim.nchunks(dim_len)
            if chunk_ix < 0 or chunk_ix >= nch:
                return None
            result.append(dim.chunk_size(chunk_ix, dim_len))
        return tuple(result)

    def get_chunk_origin(
        self, array_shape: tuple[int, ...], chunk_coords: tuple[int, ...]
    ) -> tuple[int, ...] | None:
        """Return the origin (start indices) of a specific chunk, or None if OOB."""
        result: list[int] = []
        for dim, dim_len, chunk_ix in zip(self.dimensions, array_shape, chunk_coords, strict=True):
            nch = dim.nchunks(dim_len)
            if chunk_ix < 0 or chunk_ix >= nch:
                return None
            result.append(dim.chunk_offset(chunk_ix))
        return tuple(result)

    def grid_shape(self, array_shape: tuple[int, ...]) -> tuple[int, ...]:
        """Return the number of chunks per dimension."""
        return tuple(d.nchunks(s) for d, s in zip(self.dimensions, array_shape, strict=True))

    def all_chunk_coords(self, array_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
        """Iterate over every chunk coordinate of an array with the given shape."""
        return itertools.product(
            *(range(d.nchunks(s)) for d, s in zip(self.dimensions, array_shape, strict=True))
        )

    def get_nchunks(self, array_shape: tuple[int, ...]) -> int:
        """Total number of chunks for an array with the given shape."""
        return reduce(
            operator.mul,
            (d.nchunks(s) for d, s in zip(self.dimensions, array_shape, strict=True)),
            1,
        )

    # -- Serialization --

    @classmethod
    def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> ChunkGrid:
        """Parse ``"regular"`` or ``"rectilinear"`` chunk grid metadata."""
        if isinstance(data, ChunkGrid):
            # Handle both ChunkGrid and legacy RegularChunkGrid
            if isinstance(data, RegularChunkGrid):
                return ChunkGrid.from_regular(data.chunk_shape)
            return data

        name_parsed, configuration_parsed = parse_named_configuration(data)

        if name_parsed == "regular":
            chunk_shape_raw = configuration_parsed.get("chunk_shape")
            if chunk_shape_raw is None:
                raise ValueError("Regular chunk grid requires 'chunk_shape' configuration")
            # str is a Sequence but never a valid chunk shape — reject it explicitly.
            if isinstance(chunk_shape_raw, str) or not isinstance(chunk_shape_raw, Sequence):
                raise TypeError(f"chunk_shape must be a sequence, got {type(chunk_shape_raw)}")
            return cls.from_regular(cast("Sequence[int]", chunk_shape_raw))

        if name_parsed == "rectilinear":
            chunk_shapes_raw = configuration_parsed.get("chunk_shapes")
            if chunk_shapes_raw is None:
                raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration")
            if isinstance(chunk_shapes_raw, str) or not isinstance(chunk_shapes_raw, Sequence):
                raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}")
            # Decode RLE if present: a dimension whose first element is itself a
            # list is taken to be [[size, count], ...] run-length encoding.
            decoded: list[list[int]] = []
            for dim_spec in chunk_shapes_raw:
                if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list):
                    decoded.append(_expand_rle(dim_spec))
                elif isinstance(dim_spec, list):
                    decoded.append(dim_spec)
                else:
                    raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}")
            return cls.from_rectilinear(decoded)

        raise ValueError(f"Unknown chunk grid name: {name_parsed!r}")

    def to_dict(self) -> dict[str, JSON]:
        """Serialize to the simplest metadata form for this grid."""
        if self.is_regular:
            return {
                "name": "regular",
                "configuration": {"chunk_shape": tuple(self.chunk_shape)},
            }
        else:
            chunk_shapes: list[Any] = []
            for dim in self.dimensions:
                if isinstance(dim, FixedDimension):
                    # Single fixed size — store as RLE.
                    # NOTE(review): FixedDimension carries no chunk count, so a
                    # count of 1 is emitted here. from_rectilinear collapses any
                    # uniform dimension to FixedDimension, so round-tripping a
                    # mixed grid loses the explicit chunk count on that axis —
                    # confirm against the rectilinear extension spec.
                    chunk_shapes.append([[dim.size, 1]])
                else:
                    edges = list(dim.edges)
                    rle = _compress_rle(edges)
                    # Use RLE only when it actually shortens the representation.
                    # (The counts always sum to len(edges) by construction, so
                    # no separate consistency check is needed.)
                    if len(rle) < len(edges):
                        chunk_shapes.append(rle)
                    else:
                        chunk_shapes.append(edges)
            return {
                "name": "rectilinear",
                "configuration": {"chunk_shapes": chunk_shapes},
            }
# ---------------------------------------------------------------------------
# Backwards-compatible alias
# ---------------------------------------------------------------------------


class RegularChunkGrid(ChunkGrid):
    """Backwards-compatible wrapper. Prefer ChunkGrid.from_regular() for new code."""

    # Stored so the chunk_shape property returns exactly what was passed in.
    _chunk_shape: tuple[int, ...]

    def __init__(self, *, chunk_shape: ShapeLike) -> None:
        chunk_shape_parsed = parse_shapelike(chunk_shape)
        dims = tuple(FixedDimension(size=s) for s in chunk_shape_parsed)
        # Parent is a frozen dataclass: assign via object.__setattr__.
        object.__setattr__(self, "dimensions", dims)
        object.__setattr__(self, "_chunk_shape", chunk_shape_parsed)

    @classmethod
    def _from_dict(cls, data: dict[str, JSON] | NamedConfig[str, Any]) -> Self:
        """Build from ``{"name": "regular", "configuration": {...}}`` metadata."""
        _, configuration_parsed = parse_named_configuration(data, "regular")
        return cls(**configuration_parsed)  # type: ignore[arg-type]

    def to_dict(self) -> dict[str, JSON]:
        return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}}

    def all_chunk_coords(self, array_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
        """Iterate over every chunk coordinate of an array with the given shape."""
        # strict=True for consistency with get_nchunks: a rank mismatch between
        # array_shape and chunk_shape should raise, not silently truncate.
        return itertools.product(
            *(range(ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape, strict=True))
        )

    def get_nchunks(self, array_shape: tuple[int, ...]) -> int:
        """Total number of chunks for an array with the given shape."""
        return reduce(
            operator.mul,
            itertools.starmap(ceildiv, zip(array_shape, self.chunk_shape, strict=True)),
            1,
        )
Got {name_parsed}.") - - @abstractmethod - def all_chunk_coords(self, array_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - pass - - @abstractmethod - def get_nchunks(self, array_shape: tuple[int, ...]) -> int: - pass - - -@dataclass(frozen=True) -class RegularChunkGrid(ChunkGrid): - chunk_shape: tuple[int, ...] - - def __init__(self, *, chunk_shape: ShapeLike) -> None: - chunk_shape_parsed = parse_shapelike(chunk_shape) - - object.__setattr__(self, "chunk_shape", chunk_shape_parsed) - - @classmethod - def _from_dict(cls, data: dict[str, JSON] | NamedConfig[str, Any]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "regular") - - return cls(**configuration_parsed) # type: ignore[arg-type] - - def to_dict(self) -> dict[str, JSON]: - return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}} - - def all_chunk_coords(self, array_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - return itertools.product( - *(range(ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape, strict=False)) - ) - - def get_nchunks(self, array_shape: tuple[int, ...]) -> int: - return reduce( - operator.mul, - itertools.starmap(ceildiv, zip(array_shape, self.chunk_shape, strict=True)), - 1, - ) - - def _guess_num_chunks_per_axis_shard( chunk_shape: tuple[int, ...], item_size: int, max_bytes: int, array_shape: tuple[int, ...] 
def get_chunk_shape(chunk_grid: ChunkGrid) -> tuple[int, ...]:
    """Return the uniform chunk shape of a regular chunk grid.

    Raises
    ------
    ValueError
        If the grid is rectilinear — such grids have no single chunk shape.
    """
    if chunk_grid.is_regular:
        return chunk_grid.chunk_shape
    raise ValueError(
        "get_chunk_shape only works with regular chunk grids. "
        "Use chunk_grid.dimensions for rectilinear grids."
    )


def _get_dim_grids(chunk_grid: ChunkGrid) -> tuple[DimensionGrid, ...]:
    """Extract per-dimension grid objects from a ChunkGrid."""
    return chunk_grid.dimensions
__init__(self, dim_sel: slice, dim_len: int, dim_chunk_len: int) -> None: + def __init__( + self, + dim_sel: slice, + dim_len: int, + dim_chunk_len: int, + dim_grid: DimensionGrid | None = None, + ) -> None: # normalize start, stop, step = dim_sel.indices(dim_len) if step < 1: @@ -420,58 +443,92 @@ def __init__(self, dim_sel: slice, dim_len: int, dim_chunk_len: int) -> None: object.__setattr__(self, "dim_len", dim_len) object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "dim_grid", dim_grid) object.__setattr__(self, "nitems", max(0, ceildiv((stop - start), step))) - object.__setattr__(self, "nchunks", ceildiv(dim_len, dim_chunk_len)) - def __iter__(self) -> Iterator[ChunkDimProjection]: - # figure out the range of chunks we need to visit - dim_chunk_ix_from = 0 if self.start == 0 else self.start // self.dim_chunk_len - dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) - - # iterate over chunks in range - for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): - # compute offsets for chunk within overall array - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) - - # determine chunk length, accounting for trailing chunk - dim_chunk_len = dim_limit - dim_offset - - if self.start < dim_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - remainder = (dim_offset - self.start) % self.step - if remainder: - dim_chunk_sel_start += self.step - remainder - # compute number of previous items, provides offset into output array - dim_out_offset = ceildiv((dim_offset - self.start), self.step) - - else: - # selection starts within current chunk - dim_chunk_sel_start = self.start - dim_offset - dim_out_offset = 0 - - if self.stop > dim_limit: - # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_len - - else: - # selection ends within current chunk - dim_chunk_sel_stop = self.stop - dim_offset - - dim_chunk_sel = 
    def __iter__(self) -> Iterator[ChunkDimProjection]:
        """Yield one ChunkDimProjection per chunk overlapped by the slice.

        Each projection pairs a chunk index with the sub-slice of that chunk
        to read and the offset/length of the matching region in the output.
        """
        g = self.dim_grid
        if g is not None:
            # Use the dimension grid for chunk boundary lookups
            # (works for both fixed and varying chunk sizes via index_to_chunk /
            # chunk_offset / chunk_size).
            dim_chunk_ix_from = g.index_to_chunk(self.start) if self.start > 0 else 0
            # stop is exclusive, so the last touched index is stop - 1; the +1
            # makes the chunk range half-open. An empty selection (stop <= start)
            # produces an empty chunk range.
            dim_chunk_ix_to = g.index_to_chunk(self.stop - 1) + 1 if self.stop > 0 else 0

            for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to):
                dim_offset = g.chunk_offset(dim_chunk_ix)
                dim_chunk_len = g.chunk_size(dim_chunk_ix, self.dim_len)
                dim_limit = dim_offset + dim_chunk_len

                if self.start < dim_offset:
                    # Selection starts before this chunk: the first selected
                    # element inside the chunk lands on the next stride multiple.
                    dim_chunk_sel_start = 0
                    remainder = (dim_offset - self.start) % self.step
                    if remainder:
                        dim_chunk_sel_start += self.step - remainder
                    # Items already emitted by earlier chunks give the output offset.
                    dim_out_offset = ceildiv((dim_offset - self.start), self.step)
                else:
                    # Selection starts within this chunk.
                    dim_chunk_sel_start = self.start - dim_offset
                    dim_out_offset = 0

                if self.stop > dim_limit:
                    # Selection continues past this chunk.
                    dim_chunk_sel_stop = dim_chunk_len
                else:
                    # Selection ends within this chunk.
                    dim_chunk_sel_stop = self.stop - dim_offset

                dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step)
                dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step)

                # If there are no selected elements within this chunk, skip it.
                if dim_chunk_nitems == 0:
                    continue

                dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems)
                # NOTE(review): self.step comes from slice.indices() and is always
                # an int, so the None branch of this membership test looks vestigial.
                is_complete_chunk = (
                    dim_chunk_sel_start == 0 and (self.stop >= dim_limit) and self.step in [1, None]
                )
                yield ChunkDimProjection(
                    dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk
                )
        else:
            # Legacy path: scalar dim_chunk_len
            dim_chunk_ix_from = 0 if self.start == 0 else self.start // self.dim_chunk_len
            dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len)

            for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to):
                dim_offset = dim_chunk_ix * self.dim_chunk_len
                # Trailing chunk may be clipped by the array extent.
                dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len)
                dim_chunk_len = dim_limit - dim_offset

                if self.start < dim_offset:
                    # Selection starts before this chunk (see grid path above).
                    dim_chunk_sel_start = 0
                    remainder = (dim_offset - self.start) % self.step
                    if remainder:
                        dim_chunk_sel_start += self.step - remainder
                    dim_out_offset = ceildiv((dim_offset - self.start), self.step)
                else:
                    # Selection starts within this chunk.
                    dim_chunk_sel_start = self.start - dim_offset
                    dim_out_offset = 0

                if self.stop > dim_limit:
                    dim_chunk_sel_stop = dim_chunk_len
                else:
                    dim_chunk_sel_stop = self.stop - dim_offset

                dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step)
                dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step)

                if dim_chunk_nitems == 0:
                    continue

                dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems)
                is_complete_chunk = (
                    dim_chunk_sel_start == 0 and (self.stop >= dim_limit) and self.step in [1, None]
                )
                yield ChunkDimProjection(
                    dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk
                )
strict=True - ): + for dim_sel, dim_len, dim_grid in zip(selection_normalized, shape, dim_grids, strict=True): + from zarr.core.chunk_grids import FixedDimension + + dim_chunk_len = dim_grid.size if isinstance(dim_grid, FixedDimension) else 1 + dim_indexer: IntDimIndexer | SliceDimIndexer if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) elif is_slice(dim_sel): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) else: raise IndexError( @@ -636,6 +695,7 @@ class BoolArrayDimIndexer: dim_sel: npt.NDArray[np.bool_] dim_len: int dim_chunk_len: int + dim_grid: DimensionGrid | None nchunks: int chunk_nitems: npt.NDArray[Any] @@ -643,7 +703,13 @@ class BoolArrayDimIndexer: nitems: int dim_chunk_ixs: npt.NDArray[np.intp] - def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: int) -> None: + def __init__( + self, + dim_sel: npt.NDArray[np.bool_], + dim_len: int, + dim_chunk_len: int, + dim_grid: DimensionGrid | None = None, + ) -> None: # check number of dimensions if not is_bool_array(dim_sel, 1): raise IndexError("Boolean arrays in an orthogonal selection must be 1-dimensional only") @@ -654,13 +720,24 @@ def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: f"Boolean array has the wrong length for dimension; expected {dim_len}, got {dim_sel.shape[0]}" ) + g = dim_grid + + if g is not None: + nchunks = g.nchunks(dim_len) + else: + nchunks = ceildiv(dim_len, dim_chunk_len) + # precompute number of selected items for each chunk - nchunks = ceildiv(dim_len, dim_chunk_len) chunk_nitems = np.zeros(nchunks, dtype="i8") for dim_chunk_ix in range(nchunks): - dim_offset = dim_chunk_ix * dim_chunk_len + if g is not None: + dim_offset = g.chunk_offset(dim_chunk_ix) + chunk_len = g.chunk_size(dim_chunk_ix, dim_len) + 
else: + dim_offset = dim_chunk_ix * dim_chunk_len + chunk_len = dim_chunk_len chunk_nitems[dim_chunk_ix] = np.count_nonzero( - dim_sel[dim_offset : dim_offset + dim_chunk_len] + dim_sel[dim_offset : dim_offset + chunk_len] ) chunk_nitems_cumsum = np.cumsum(chunk_nitems) nitems = chunk_nitems_cumsum[-1] @@ -670,6 +747,7 @@ def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: object.__setattr__(self, "dim_sel", dim_sel) object.__setattr__(self, "dim_len", dim_len) object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "dim_grid", dim_grid) object.__setattr__(self, "nchunks", nchunks) object.__setattr__(self, "chunk_nitems", chunk_nitems) object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) @@ -677,14 +755,21 @@ def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) def __iter__(self) -> Iterator[ChunkDimProjection]: + g = self.dim_grid + # iterate over chunks with at least one item for dim_chunk_ix in self.dim_chunk_ixs: # find region in chunk - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + if g is not None: + dim_offset = g.chunk_offset(dim_chunk_ix) + chunk_len = g.chunk_size(dim_chunk_ix, self.dim_len) + else: + dim_offset = dim_chunk_ix * self.dim_chunk_len + chunk_len = self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + chunk_len] - # pad out if final chunk - if dim_chunk_sel.shape[0] < self.dim_chunk_len: + # pad out if final chunk (for fixed grids, actual chunk may be smaller than dim_chunk_len) + if g is None and dim_chunk_sel.shape[0] < self.dim_chunk_len: tmp = np.zeros(self.dim_chunk_len, dtype=bool) tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel dim_chunk_sel = tmp @@ -745,6 +830,7 @@ class IntArrayDimIndexer: dim_len: int dim_chunk_len: int + dim_grid: DimensionGrid | None nchunks: int nitems: int 
order: Order @@ -762,6 +848,7 @@ def __init__( wraparound: bool = True, boundscheck: bool = True, order: Order = Order.UNKNOWN, + dim_grid: DimensionGrid | None = None, ) -> None: # ensure 1d array dim_sel = np.asanyarray(dim_sel) @@ -769,7 +856,12 @@ def __init__( raise IndexError("integer arrays in an orthogonal selection must be 1-dimensional only") nitems = len(dim_sel) - nchunks = ceildiv(dim_len, dim_chunk_len) + g = dim_grid + + if g is not None: + nchunks = g.nchunks(dim_len) + else: + nchunks = ceildiv(dim_len, dim_chunk_len) # handle wraparound if wraparound: @@ -780,9 +872,10 @@ def __init__( boundscheck_indices(dim_sel, dim_len) # determine which chunk is needed for each selection item - # note: for dense integer selections, the division operation here is the - # bottleneck - dim_sel_chunk = dim_sel // dim_chunk_len + if g is not None: + dim_sel_chunk = g.indices_to_chunks(dim_sel) + else: + dim_sel_chunk = dim_sel // dim_chunk_len # determine order of indices if order == Order.UNKNOWN: @@ -793,7 +886,6 @@ def __init__( dim_out_sel = None elif order == Order.DECREASING: dim_sel = dim_sel[::-1] - # TODO should be possible to do this without creating an arange dim_out_sel = np.arange(nitems - 1, -1, -1) else: # sort indices to group by chunk @@ -812,6 +904,7 @@ def __init__( # store attributes object.__setattr__(self, "dim_len", dim_len) object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "dim_grid", dim_grid) object.__setattr__(self, "nchunks", nchunks) object.__setattr__(self, "nitems", nitems) object.__setattr__(self, "order", order) @@ -822,6 +915,8 @@ def __init__( object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) def __iter__(self) -> Iterator[ChunkDimProjection]: + g = self.dim_grid + for dim_chunk_ix in self.dim_chunk_ixs: dim_out_sel: slice | npt.NDArray[np.intp] # find region in output @@ -836,7 +931,10 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: dim_out_sel = 
self.dim_out_sel[start:stop] # find region in chunk - dim_offset = dim_chunk_ix * self.dim_chunk_len + if g is not None: + dim_offset = g.chunk_offset(dim_chunk_ix) + else: + dim_offset = dim_chunk_ix * self.dim_chunk_len dim_chunk_sel = self.dim_sel[start:stop] - dim_offset is_complete_chunk = False # TODO yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk) @@ -902,7 +1000,7 @@ class OrthogonalIndexer(Indexer): drop_axes: tuple[int, ...] def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: ChunkGrid) -> None: - chunk_shape = get_chunk_shape(chunk_grid) + dim_grids = _get_dim_grids(chunk_grid) # handle ellipsis selection = replace_ellipsis(selection, shape) @@ -914,19 +1012,24 @@ def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: Chu dim_indexers: list[ IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer ] = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): + for dim_sel, dim_len, dim_grid in zip(selection, shape, dim_grids, strict=True): + from zarr.core.chunk_grids import FixedDimension + + dim_chunk_len = dim_grid.size if isinstance(dim_grid, FixedDimension) else 1 dim_indexer: IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) elif isinstance(dim_sel, slice): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) elif is_integer_array(dim_sel): - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) elif is_bool_array(dim_sel): - dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = BoolArrayDimIndexer( + 
dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid + ) else: raise IndexError( @@ -948,6 +1051,11 @@ def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: Chu else: drop_axes = () + # Compute chunk_shape for ix_() compatibility in __iter__ + from zarr.core.chunk_grids import FixedDimension + + chunk_shape = tuple(g.size if isinstance(g, FixedDimension) else 1 for g in dim_grids) + object.__setattr__(self, "dim_indexers", dim_indexers) object.__setattr__(self, "shape", shape) object.__setattr__(self, "chunk_shape", chunk_shape) @@ -1037,6 +1145,7 @@ def __init__( self, selection: BasicSelection, shape: tuple[int, ...], chunk_grid: ChunkGrid ) -> None: chunk_shape = get_chunk_shape(chunk_grid) + dim_grids = _get_dim_grids(chunk_grid) # handle ellipsis selection_normalized = replace_ellipsis(selection, shape) @@ -1046,8 +1155,8 @@ def __init__( # setup per-dimension indexers dim_indexers = [] - for dim_sel, dim_len, dim_chunk_size in zip( - selection_normalized, shape, chunk_shape, strict=True + for dim_sel, dim_len, dim_chunk_size, dim_grid in zip( + selection_normalized, shape, chunk_shape, dim_grids, strict=True ): dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) @@ -1086,7 +1195,7 @@ def __init__( f"expected integer or slice, got {type(dim_sel)!r}" ) - dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size, dim_grid=dim_grid) dim_indexers.append(dim_indexer) if start >= dim_len or start < 0: @@ -1158,18 +1267,20 @@ class CoordinateIndexer(Indexer): chunk_mixs: tuple[npt.NDArray[np.intp], ...] shape: tuple[int, ...] chunk_shape: tuple[int, ...] + dim_grids: tuple[DimensionGrid, ...] drop_axes: tuple[int, ...] 
def __init__( self, selection: CoordinateSelection, shape: tuple[int, ...], chunk_grid: ChunkGrid ) -> None: - chunk_shape = get_chunk_shape(chunk_grid) + dim_grids = _get_dim_grids(chunk_grid) + chunk_shape = tuple(g.size if hasattr(g, "size") else 1 for g in dim_grids) cdata_shape: tuple[int, ...] if shape == (): cdata_shape = (1,) else: - cdata_shape = tuple(math.ceil(s / c) for s, c in zip(shape, chunk_shape, strict=True)) + cdata_shape = tuple(g.nchunks(s) for g, s in zip(dim_grids, shape, strict=True)) nchunks = reduce(operator.mul, cdata_shape, 1) # some initial normalization @@ -1199,8 +1310,8 @@ def __init__( # compute chunk index for each point in the selection chunks_multi_index = tuple( - dim_sel // dim_chunk_len - for (dim_sel, dim_chunk_len) in zip(selection_normalized, chunk_shape, strict=True) + g.indices_to_chunks(dim_sel) + for (dim_sel, g) in zip(selection_normalized, dim_grids, strict=True) ) # broadcast selection - this will raise error if array dimensions don't match @@ -1247,6 +1358,7 @@ def __init__( object.__setattr__(self, "chunk_rixs", chunk_rixs) object.__setattr__(self, "chunk_mixs", chunk_mixs) object.__setattr__(self, "chunk_shape", chunk_shape) + object.__setattr__(self, "dim_grids", dim_grids) object.__setattr__(self, "shape", shape) object.__setattr__(self, "drop_axes", ()) @@ -1266,8 +1378,8 @@ def __iter__(self) -> Iterator[ChunkProjection]: out_selection = self.sel_sort[start:stop] chunk_offsets = tuple( - dim_chunk_ix * dim_chunk_len - for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.chunk_shape, strict=True) + g.chunk_offset(dim_chunk_ix) + for dim_chunk_ix, g in zip(chunk_coords, self.dim_grids, strict=True) ) chunk_selection = tuple( dim_sel[start:stop] - dim_chunk_offset diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 5ce155bd9a..1f8d6b4674 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -262,7 +262,9 @@ def __init__( self._validate_metadata() def 
_validate_metadata(self) -> None: - if isinstance(self.chunk_grid, RegularChunkGrid) and len(self.shape) != len( + if hasattr(self.chunk_grid, "ndim") and len(self.shape) != self.chunk_grid.ndim: + raise ValueError("`chunk_grid` and `shape` need to have the same number of dimensions.") + elif isinstance(self.chunk_grid, RegularChunkGrid) and len(self.shape) != len( self.chunk_grid.chunk_shape ): raise ValueError( @@ -287,7 +289,7 @@ def dtype(self) -> ZDType[TBaseDType, TBaseScalar]: @property def chunks(self) -> tuple[int, ...]: - if isinstance(self.chunk_grid, RegularChunkGrid): + if self.chunk_grid.is_regular: from zarr.codecs.sharding import ShardingCodec if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): @@ -298,14 +300,14 @@ def chunks(self) -> tuple[int, ...]: return self.chunk_grid.chunk_shape msg = ( - f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`." - f"This array has a {self.chunk_grid} instead." + "The `chunks` attribute is only defined for arrays using regular chunk grids. " + "This array has a rectilinear chunk grid. Use `chunk_grid` for general access." ) raise NotImplementedError(msg) @property def shards(self) -> tuple[int, ...] | None: - if isinstance(self.chunk_grid, RegularChunkGrid): + if self.chunk_grid.is_regular: from zarr.codecs.sharding import ShardingCodec if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): @@ -314,28 +316,37 @@ def shards(self) -> tuple[int, ...] | None: return None msg = ( - f"The `shards` attribute is only defined for arrays using `RegularChunkGrid`." - f"This array has a {self.chunk_grid} instead." + "The `shards` attribute is only defined for arrays using regular chunk grids. " + "This array has a rectilinear chunk grid. Use `chunk_grid` for general access." 
) raise NotImplementedError(msg) @property def inner_codecs(self) -> tuple[Codec, ...]: - if isinstance(self.chunk_grid, RegularChunkGrid): - from zarr.codecs.sharding import ShardingCodec + from zarr.codecs.sharding import ShardingCodec - if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): - return self.codecs[0].codecs + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + return self.codecs[0].codecs return self.codecs def get_chunk_spec( self, _chunk_coords: tuple[int, ...], array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: - assert isinstance(self.chunk_grid, RegularChunkGrid), ( - "Currently, only regular chunk grid is supported" - ) + if self.chunk_grid.is_regular: + # Regular grids: return the uniform chunk shape for all chunks, + # including boundary chunks. The codec pipeline expects full-sized + # buffers and handles boundary trimming separately. + chunk_shape = self.chunk_grid.chunk_shape + else: + # Rectilinear grids: each chunk may have a different shape. 
+ chunk_shape_or_none = self.chunk_grid.get_chunk_shape(self.shape, _chunk_coords) + if chunk_shape_or_none is None: + raise ValueError( + f"Chunk coordinates {_chunk_coords} are out of bounds for shape {self.shape}" + ) + chunk_shape = chunk_shape_or_none return ArraySpec( - shape=self.chunk_grid.chunk_shape, + shape=chunk_shape, dtype=self.dtype, fill_value=self.fill_value, config=array_config, diff --git a/tests/test_array.py b/tests/test_array.py index 5b85c6ba1d..b57eea8fa1 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -786,8 +786,6 @@ def test_resize_growing_skips_chunk_enumeration( store: MemoryStore, zarr_format: ZarrFormat ) -> None: """Growing an array should not enumerate chunk coords for deletion (#3650 mitigation).""" - from zarr.core.chunk_grids import RegularChunkGrid - z = zarr.create( shape=(10, 10), chunks=(5, 5), @@ -798,9 +796,11 @@ def test_resize_growing_skips_chunk_enumeration( ) z[:] = np.ones((10, 10), dtype="i4") + grid_cls = type(z.metadata.chunk_grid) + # growth only - ensure no chunk coords are enumerated with mock.patch.object( - RegularChunkGrid, + grid_cls, "all_chunk_coords", wraps=z.metadata.chunk_grid.all_chunk_coords, ) as mock_coords: @@ -813,7 +813,7 @@ def test_resize_growing_skips_chunk_enumeration( # shrink - ensure no regression of behaviour with mock.patch.object( - RegularChunkGrid, + grid_cls, "all_chunk_coords", wraps=z.metadata.chunk_grid.all_chunk_coords, ) as mock_coords: @@ -836,7 +836,7 @@ def test_resize_growing_skips_chunk_enumeration( z2[:] = np.ones((10, 10), dtype="i4") with mock.patch.object( - RegularChunkGrid, + grid_cls, "all_chunk_coords", wraps=z2.metadata.chunk_grid.all_chunk_coords, ) as mock_coords: diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py new file mode 100644 index 0000000000..420e774342 --- /dev/null +++ b/tests/test_unified_chunk_grid.py @@ -0,0 +1,505 @@ +""" +Tests for the unified ChunkGrid design (POC). 
+ +Tests the core ChunkGrid with FixedDimension/VaryingDimension internals, +serialization round-trips, indexing with rectilinear grids, and end-to-end +array creation + read/write. +""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any + +import numpy as np +import pytest + +if TYPE_CHECKING: + from pathlib import Path + +from zarr.core.chunk_grids import ( + ChunkGrid, + FixedDimension, + RegularChunkGrid, + VaryingDimension, + _compress_rle, + _expand_rle, +) + +# --------------------------------------------------------------------------- +# FixedDimension +# --------------------------------------------------------------------------- + + +class TestFixedDimension: + def test_basic(self) -> None: + d = FixedDimension(size=10) + assert d.size == 10 + assert d.index_to_chunk(0) == 0 + assert d.index_to_chunk(9) == 0 + assert d.index_to_chunk(10) == 1 + assert d.index_to_chunk(25) == 2 + assert d.chunk_offset(0) == 0 + assert d.chunk_offset(1) == 10 + assert d.chunk_offset(3) == 30 + assert d.chunk_size(0, 100) == 10 + assert d.chunk_size(9, 100) == 10 + # boundary chunk + assert d.chunk_size(9, 95) == 5 + assert d.nchunks(100) == 10 + assert d.nchunks(95) == 10 + + def test_vectorized(self) -> None: + d = FixedDimension(size=10) + indices = np.array([0, 5, 10, 15, 99]) + chunks = d.indices_to_chunks(indices) + np.testing.assert_array_equal(chunks, [0, 0, 1, 1, 9]) + + def test_negative_size_rejected(self) -> None: + with pytest.raises(ValueError, match="must be >= 0"): + FixedDimension(size=-1) + + def test_zero_size_allowed(self) -> None: + d = FixedDimension(size=0) + assert d.size == 0 + + +# --------------------------------------------------------------------------- +# VaryingDimension +# --------------------------------------------------------------------------- + + +class TestVaryingDimension: + def test_basic(self) -> None: + d = VaryingDimension([10, 20, 30]) + assert d.edges == (10, 20, 30) + assert d.cumulative == 
(10, 30, 60) + assert d.nchunks(60) == 3 + + def test_index_to_chunk(self) -> None: + d = VaryingDimension([10, 20, 30]) + # First chunk: indices 0-9 + assert d.index_to_chunk(0) == 0 + assert d.index_to_chunk(9) == 0 + # Second chunk: indices 10-29 + assert d.index_to_chunk(10) == 1 + assert d.index_to_chunk(29) == 1 + # Third chunk: indices 30-59 + assert d.index_to_chunk(30) == 2 + assert d.index_to_chunk(59) == 2 + + def test_chunk_offset(self) -> None: + d = VaryingDimension([10, 20, 30]) + assert d.chunk_offset(0) == 0 + assert d.chunk_offset(1) == 10 + assert d.chunk_offset(2) == 30 + + def test_chunk_size(self) -> None: + d = VaryingDimension([10, 20, 30]) + assert d.chunk_size(0, 60) == 10 + assert d.chunk_size(1, 60) == 20 + assert d.chunk_size(2, 60) == 30 + + def test_vectorized(self) -> None: + d = VaryingDimension([10, 20, 30]) + indices = np.array([0, 9, 10, 29, 30, 59]) + chunks = d.indices_to_chunks(indices) + np.testing.assert_array_equal(chunks, [0, 0, 1, 1, 2, 2]) + + def test_empty_rejected(self) -> None: + with pytest.raises(ValueError, match="must not be empty"): + VaryingDimension([]) + + def test_zero_edge_rejected(self) -> None: + with pytest.raises(ValueError, match="must be > 0"): + VaryingDimension([10, 0, 5]) + + +# --------------------------------------------------------------------------- +# ChunkGrid construction +# --------------------------------------------------------------------------- + + +class TestChunkGridConstruction: + def test_from_regular(self) -> None: + g = ChunkGrid.from_regular((10, 20)) + assert g.is_regular + assert g.chunk_shape == (10, 20) + assert g.ndim == 2 + + def test_zero_dim(self) -> None: + """0-d arrays produce a ChunkGrid with no dimensions.""" + g = ChunkGrid.from_regular(()) + assert g.is_regular + assert g.chunk_shape == () + assert g.ndim == 0 + + def test_from_rectilinear(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + assert not g.is_regular + assert g.ndim == 
2 + with pytest.raises(ValueError, match="only available for regular"): + _ = g.chunk_shape + + def test_rectilinear_with_uniform_dim(self) -> None: + """A rectilinear grid with all-same sizes in one dim stores it as Fixed.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + assert isinstance(g.dimensions[0], VaryingDimension) + assert isinstance(g.dimensions[1], FixedDimension) + + def test_all_uniform_becomes_regular(self) -> None: + """If all dimensions have uniform sizes, the grid is regular.""" + g = ChunkGrid.from_rectilinear([[10, 10, 10], [25, 25]]) + assert g.is_regular + assert g.chunk_shape == (10, 25) + + +# --------------------------------------------------------------------------- +# ChunkGrid queries +# --------------------------------------------------------------------------- + + +class TestChunkGridQueries: + def test_regular_grid_shape(self) -> None: + g = ChunkGrid.from_regular((10, 20)) + assert g.grid_shape((100, 200)) == (10, 10) + assert g.grid_shape((95, 200)) == (10, 10) + + def test_rectilinear_grid_shape(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + assert g.grid_shape((60, 100)) == (3, 4) + + def test_regular_get_chunk_shape(self) -> None: + g = ChunkGrid.from_regular((10, 20)) + assert g.get_chunk_shape((100, 200), (0, 0)) == (10, 20) + assert g.get_chunk_shape((95, 200), (9, 0)) == (5, 20) # boundary + assert g.get_chunk_shape((100, 200), (99, 0)) is None # OOB + + def test_rectilinear_get_chunk_shape(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + assert g.get_chunk_shape((60, 100), (0, 0)) == (10, 25) + assert g.get_chunk_shape((60, 100), (1, 0)) == (20, 25) + assert g.get_chunk_shape((60, 100), (2, 3)) == (30, 25) + assert g.get_chunk_shape((60, 100), (3, 0)) is None # OOB + + def test_get_chunk_origin(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + assert g.get_chunk_origin((60, 100), (0, 0)) == (0, 0) 
+ assert g.get_chunk_origin((60, 100), (1, 0)) == (10, 0) + assert g.get_chunk_origin((60, 100), (2, 2)) == (30, 50) + + def test_all_chunk_coords(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + coords = list(g.all_chunk_coords((60, 100))) + assert len(coords) == 6 # 3 * 2 + assert coords[0] == (0, 0) + assert coords[-1] == (2, 1) + + def test_get_nchunks(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + assert g.get_nchunks((60, 100)) == 6 + + +# --------------------------------------------------------------------------- +# RLE helpers +# --------------------------------------------------------------------------- + + +class TestRLE: + def test_expand(self) -> None: + assert _expand_rle([[10, 3]]) == [10, 10, 10] + assert _expand_rle([[10, 2], [20, 1]]) == [10, 10, 20] + + def test_compress(self) -> None: + assert _compress_rle([10, 10, 10]) == [[10, 3]] + assert _compress_rle([10, 10, 20]) == [[10, 2], [20, 1]] + + def test_roundtrip(self) -> None: + original = [10, 10, 10, 20, 20, 30] + compressed = _compress_rle(original) + assert _expand_rle(compressed) == original + + +# --------------------------------------------------------------------------- +# Serialization +# --------------------------------------------------------------------------- + + +class TestSerialization: + def test_regular_roundtrip(self) -> None: + g = ChunkGrid.from_regular((10, 20)) + d = g.to_dict() + assert d["name"] == "regular" + config = d["configuration"] + assert isinstance(config, dict) + assert tuple(config["chunk_shape"]) == (10, 20) + g2 = ChunkGrid.from_dict(d) + assert g2.is_regular + assert g2.chunk_shape == (10, 20) + + def test_rectilinear_roundtrip(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + d = g.to_dict() + assert d["name"] == "rectilinear" + g2 = ChunkGrid.from_dict(d) + assert not g2.is_regular + # Verify the reconstructed grid produces correct shapes + assert 
g2.get_chunk_shape((60, 100), (0, 0)) == (10, 25) + assert g2.get_chunk_shape((60, 100), (1, 0)) == (20, 25) + assert g2.get_chunk_shape((60, 100), (2, 3)) == (30, 25) + + def test_rectilinear_rle_serialization(self) -> None: + """RLE should be used when it actually compresses.""" + g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]]) + d = g.to_dict() + # First dim: 10 identical chunks -> RLE + # Second dim: 4 identical chunks -> stored as FixedDimension -> RLE [[25, 1]] + assert d["name"] == "regular" # all uniform -> serializes as regular + + def test_rectilinear_rle_with_varying(self) -> None: + g = ChunkGrid.from_rectilinear([[100, 100, 100, 50], [25, 25, 25, 25]]) + d = g.to_dict() + assert d["name"] == "rectilinear" + # Check RLE used for first dimension + config = d["configuration"] + assert isinstance(config, dict) + chunk_shapes = config["chunk_shapes"] + assert isinstance(chunk_shapes, list) + # First dim: [100, 100, 100, 50] -> [[100, 3], [50, 1]] (RLE shorter) + assert chunk_shapes[0] == [[100, 3], [50, 1]] + + def test_json_roundtrip(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + d = g.to_dict() + json_str = json.dumps(d) + d2 = json.loads(json_str) + g2 = ChunkGrid.from_dict(d2) + assert g2.grid_shape((60, 100)) == (3, 2) + + def test_unknown_name_raises(self) -> None: + with pytest.raises(ValueError, match="Unknown chunk grid"): + ChunkGrid.from_dict({"name": "hexagonal", "configuration": {}}) + + +# --------------------------------------------------------------------------- +# Backwards compatibility +# --------------------------------------------------------------------------- + + +class TestBackwardsCompat: + def test_regular_chunk_grid_still_works(self) -> None: + g = RegularChunkGrid(chunk_shape=(10, 20)) + assert g.chunk_shape == (10, 20) + assert g.is_regular + assert isinstance(g, ChunkGrid) + + def test_from_dict_regular(self) -> None: + d: dict[str, Any] = {"name": "regular", "configuration": 
{"chunk_shape": [10, 20]}} + g = ChunkGrid.from_dict(d) + # from_dict now returns ChunkGrid, not RegularChunkGrid + assert isinstance(g, ChunkGrid) + assert g.is_regular + assert g.chunk_shape == (10, 20) + + def test_regular_chunk_grid_passed_to_from_dict(self) -> None: + """RegularChunkGrid instances should be convertible.""" + rcg = RegularChunkGrid(chunk_shape=(10, 20)) + g = ChunkGrid.from_dict(rcg) + assert isinstance(g, ChunkGrid) + assert g.is_regular + + +# --------------------------------------------------------------------------- +# Indexing with rectilinear grids +# --------------------------------------------------------------------------- + + +class TestRectilinearIndexing: + """Test that the indexing pipeline works with VaryingDimension.""" + + def test_basic_indexer_rectilinear(self) -> None: + from zarr.core.indexing import BasicIndexer + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + indexer = BasicIndexer( + selection=(slice(None), slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + # Should visit all 3*2=6 chunks + assert len(projections) == 6 + + # Check first chunk + p0 = projections[0] + assert p0.chunk_coords == (0, 0) + assert p0.chunk_selection == (slice(0, 10, 1), slice(0, 50, 1)) + + # Check second chunk on first axis + p1 = projections[2] # (1, 0) in product order + assert p1.chunk_coords == (1, 0) + assert p1.chunk_selection == (slice(0, 20, 1), slice(0, 50, 1)) + + def test_basic_indexer_int_selection(self) -> None: + from zarr.core.indexing import BasicIndexer + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + indexer = BasicIndexer( + selection=(15, slice(None)), # index 15 falls in chunk 1 (offset 10) + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) == 2 # 2 chunks in second dimension + assert projections[0].chunk_coords == (1, 0) + assert projections[0].chunk_selection == (5, slice(0, 50, 1)) # 15 - 10 = 5 + + def 
test_basic_indexer_slice_subset(self) -> None: + from zarr.core.indexing import BasicIndexer + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + indexer = BasicIndexer( + selection=(slice(5, 35), slice(0, 50)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + # slice(5, 35) spans chunks 0 (5:10), 1 (0:20), 2 (0:5) + chunk_coords_dim0 = sorted({p.chunk_coords[0] for p in projections}) + assert chunk_coords_dim0 == [0, 1, 2] + + def test_orthogonal_indexer_rectilinear(self) -> None: + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + indexer = OrthogonalIndexer( + selection=(slice(None), slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) == 6 + + +# --------------------------------------------------------------------------- +# End-to-end: array creation with rectilinear chunks +# --------------------------------------------------------------------------- + + +class TestEndToEnd: + """Test creating, writing, and reading arrays with rectilinear chunk grids.""" + + def test_create_regular_array(self, tmp_path: Path) -> None: + import zarr + + arr = zarr.create_array( + store=tmp_path / "regular.zarr", + shape=(100, 200), + chunks=(10, 20), + dtype="float32", + ) + assert arr.metadata.chunk_grid.is_regular + assert arr.chunks == (10, 20) + + def test_create_rectilinear_array(self, tmp_path: Path) -> None: + """Create an array with a rectilinear chunk grid via metadata.""" + from zarr.core.array import AsyncArray + from zarr.core.dtype import Float32 + from zarr.core.metadata.v3 import ArrayV3Metadata + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + + meta = AsyncArray._create_metadata_v3( + shape=(60, 100), + dtype=Float32(), + chunk_shape=(10, 20), # fallback, overridden by chunk_grid + chunk_grid=g, + ) + assert isinstance(meta, ArrayV3Metadata) + assert not meta.chunk_grid.is_regular + assert 
meta.chunk_grid.ndim == 2 + + def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: + """Verify metadata round-trips through JSON.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + d = g.to_dict() + g2 = ChunkGrid.from_dict(d) + assert g2.grid_shape((60, 100)) == g.grid_shape((60, 100)) + # All chunk shapes should match + for coord in g.all_chunk_coords((60, 100)): + assert g.get_chunk_shape((60, 100), coord) == g2.get_chunk_shape((60, 100), coord) + + def test_get_chunk_spec_regular(self, tmp_path: Path) -> None: + """get_chunk_spec works for regular grids.""" + from zarr.core.array import AsyncArray + from zarr.core.array_spec import ArrayConfig + from zarr.core.buffer.core import default_buffer_prototype + from zarr.core.dtype import Float32 + + meta = AsyncArray._create_metadata_v3( + shape=(100, 200), + dtype=Float32(), + chunk_shape=(10, 20), + ) + spec = meta.get_chunk_spec( + (0, 0), + ArrayConfig.from_dict({}), + default_buffer_prototype(), + ) + assert spec.shape == (10, 20) + + # Boundary chunk + spec_boundary = meta.get_chunk_spec( + (9, 9), + ArrayConfig.from_dict({}), + default_buffer_prototype(), + ) + assert spec_boundary.shape == (10, 20) + + def test_get_chunk_spec_rectilinear(self, tmp_path: Path) -> None: + """get_chunk_spec returns per-chunk shapes for rectilinear grids.""" + from zarr.core.array import AsyncArray + from zarr.core.array_spec import ArrayConfig + from zarr.core.buffer.core import default_buffer_prototype + from zarr.core.dtype import Float32 + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + meta = AsyncArray._create_metadata_v3( + shape=(60, 100), + dtype=Float32(), + chunk_shape=(10, 20), + chunk_grid=g, + ) + proto = default_buffer_prototype() + config = ArrayConfig.from_dict({}) + + spec0 = meta.get_chunk_spec((0, 0), config, proto) + assert spec0.shape == (10, 50) + + spec1 = meta.get_chunk_spec((1, 0), config, proto) + assert spec1.shape == (20, 50) + + spec2 = 
meta.get_chunk_spec((2, 1), config, proto) + assert spec2.shape == (30, 50) + + +# --------------------------------------------------------------------------- +# Sharding compatibility +# --------------------------------------------------------------------------- + + +class TestShardingCompat: + def test_sharding_accepts_rectilinear_outer_grid(self) -> None: + """ShardingCodec.validate should not reject rectilinear outer grids.""" + from zarr.codecs.sharding import ShardingCodec + from zarr.core.dtype import Float32 + + codec = ShardingCodec(chunk_shape=(5, 5)) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + + # Should not raise + codec.validate( + shape=(60, 100), + dtype=Float32(), + chunk_grid=g, + ) From b3e72ecf3bec6e168880d048955953de2d9acd0c Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:12:02 -0400 Subject: [PATCH 003/118] V2 prospectus --- chunk-grid-prospectus.md | 428 +++++++++++++++++---------------------- 1 file changed, 188 insertions(+), 240 deletions(-) diff --git a/chunk-grid-prospectus.md b/chunk-grid-prospectus.md index 58e7810c75..a8149d2ad2 100644 --- a/chunk-grid-prospectus.md +++ b/chunk-grid-prospectus.md @@ -1,6 +1,8 @@ # Prospectus: Unified Chunk Grid Design for zarr-python -**Related:** +Version: 2 + +**Related:** - [#3750](https://github.com/zarr-developers/zarr-python/issues/3750) (single ChunkGrid proposal) - [#3534](https://github.com/zarr-developers/zarr-python/pull/3534) (rectilinear implementation) - [#3735](https://github.com/zarr-developers/zarr-python/pull/3735) (chunk grid module/registry) @@ -11,123 +13,156 @@ ## Problem -The Zarr V3 spec defines `chunk_grid` as an extension point, suggesting chunk grids should be pluggable like codecs or data types. But chunk grids are fundamentally different: - -- **Codecs are independent:** supporting `zstd` tells you nothing about `gzip`. 
-- **Chunk grids form a hierarchy:** the rectilinear chunk grid is strictly more general than the regular chunk grid (and zarrs' regular-bounded grid). Any regular grid is expressible as a rectilinear grid. Supporting rectilinear means you support all known grid types for free. - -A registry-based plugin system adds complexity without clear benefit — there is no known chunk grid that is both (a) more general than rectilinear and (b) retains the tessellation properties that Zarr assumes. All known grids are special cases of the rectilinear grid: +The Zarr V3 spec defines `chunk_grid` as an extension point, but chunk grids are fundamentally different from codecs. Codecs are independent — supporting `zstd` tells you nothing about `gzip`. Chunk grids form a hierarchy — the rectilinear grid is strictly more general than the regular grid. Any regular grid is expressible as a rectilinear grid. -| Grid type | Description | Rectilinear representation | -|---|---|---| -| Regular | All chunks same shape | All axes have a single repeated edge length | -| Regular-bounded (zarrs) | Regular, but boundary chunks trimmed to array extent | Last edge length per axis is `shape % chunk_size` | -| HPC boundary-padded | Regular interior, larger boundary chunks | First/last edge lengths differ from interior | -| Fully variable | Arbitrary per-chunk sizes | Direct representation | +There is no known chunk grid that is both (a) more general than rectilinear and (b) retains the axis-aligned tessellation properties Zarr assumes. All known grids are special cases: -If a future grid cannot be expressed as rectilinear (e.g., non-axis-aligned chunking, space-filling curves), it would require fundamentally different indexing and storage. Speculative generality today adds cost without benefit. 
+| Grid type | Description | +|---|---| +| Regular | Uniform chunk size, boundary chunks padded with fill_value | +| Regular-bounded (zarrs) | Uniform chunk size, boundary chunks trimmed to array extent | +| HPC boundary-padded | Regular interior, larger boundary chunks | +| Fully variable | Arbitrary per-chunk sizes | -## Proposal +A registry-based plugin system adds complexity without clear benefit. -Replace the current multi-class chunk grid architecture with a single `ChunkGrid` implementation that handles both regular and rectilinear chunking, and drop user-defined chunk grids. +## Design -### Design principles +### Principles -1. **One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. It serializes to the simplest metadata — `"regular"` when all chunks are uniform, `"rectilinear"` otherwise. -2. **No chunk grid registry.** Remove the entrypoint-based registration system. A simple name-based dispatch in `parse_chunk_grid()` is sufficient. -3. **Fixed vs Varying per dimension.** Each axis is internally represented as either `FixedDimension(size)` (one integer — all chunks uniform) or `VaryingDimension(edges, cumulative)` (per-chunk edge lengths with precomputed prefix sums). This avoids expanding regular dimensions into lists of identical values. -4. **Shape-free grid.** The chunk grid describes a tiling pattern, not a bound region. It does not store the array shape. Methods that need the shape receive it as a parameter. This matches the Zarr V3 spec where `shape` and `chunk_grid` are independent fields. -5. **Transparent transitions.** Operations like `resize()` can move an array from regular to rectilinear chunking. This transition should be explicit and controllable. +1. **A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern — the specific partition of a specific array. The grid stores enough information to answer any question about any chunk without external parameters. +2. 
**One implementation, multiple serialization forms.** A single `ChunkGrid` class serializes as `"regular"` when all chunks are uniform, `"rectilinear"` otherwise. +3. **No chunk grid registry.** Simple name-based dispatch in `parse_chunk_grid()`. +4. **Fixed vs Varying per dimension.** `FixedDimension(size, extent)` for uniform chunks; `VaryingDimension(edges, cumulative)` for per-chunk edge lengths with precomputed prefix sums. Avoids expanding regular dimensions into lists of identical values. +5. **Transparent transitions.** Operations like `resize()` can move an array from regular to rectilinear chunking. ### Internal representation ```python @dataclass(frozen=True) class FixedDimension: - """All chunks on this axis have the same size.""" + """Uniform chunk size. Boundary chunks contain less data but are + encoded at full size by the codec pipeline.""" size: int # chunk edge length (> 0) + extent: int # array dimension length + + @property + def nchunks(self) -> int: + return ceildiv(self.extent, self.size) def index_to_chunk(self, idx: int) -> int: return idx // self.size def chunk_offset(self, chunk_ix: int) -> int: return chunk_ix * self.size - def chunk_size(self, chunk_ix: int, dim_len: int) -> int: - return min(self.size, dim_len - chunk_ix * self.size) + def chunk_size(self, chunk_ix: int) -> int: + return self.size # always uniform + def data_size(self, chunk_ix: int) -> int: + return min(self.size, self.extent - chunk_ix * self.size) # clipped at extent def indices_to_chunks(self, indices: NDArray) -> NDArray: return indices // self.size @dataclass(frozen=True) class VaryingDimension: - """Chunks on this axis have explicit per-chunk sizes.""" + """Explicit per-chunk sizes. No padding — each edge length is + both the codec size and the data size.""" edges: tuple[int, ...] # per-chunk edge lengths (all > 0) cumulative: tuple[int, ...] 
# prefix sums for O(log n) lookup + @property + def nchunks(self) -> int: + return len(self.edges) + @property + def extent(self) -> int: + return self.cumulative[-1] + def index_to_chunk(self, idx: int) -> int: return bisect.bisect_right(self.cumulative, idx) def chunk_offset(self, chunk_ix: int) -> int: return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 - def chunk_size(self, chunk_ix: int, dim_len: int) -> int: + def chunk_size(self, chunk_ix: int) -> int: return self.edges[chunk_ix] + def data_size(self, chunk_ix: int) -> int: + return self.edges[chunk_ix] # same as chunk_size def indices_to_chunks(self, indices: NDArray) -> NDArray: return np.searchsorted(self.cumulative, indices, side='right') +``` -@dataclass(frozen=True) -class ChunkGrid: - dimensions: tuple[FixedDimension | VaryingDimension, ...] +Both types share a common interface: `nchunks`, `extent`, `index_to_chunk`, `chunk_offset`, `chunk_size`, `data_size`, `indices_to_chunks`. Memory usage scales with the number of *varying* dimensions, not total chunks. - @property - def is_regular(self) -> bool: - return all(isinstance(d, FixedDimension) for d in self.dimensions) -``` +The two size methods serve different consumers: -`FixedDimension` and `VaryingDimension` share a common interface (`index_to_chunk`, `chunk_offset`, `chunk_size`, `indices_to_chunks`) used directly by the indexing pipeline. Memory usage scales with the number of *varying* dimensions and their chunk counts, not with the total number of chunks. +| Method | Returns | Consumer | +|---|---|---| +| `chunk_size` | Buffer size for codec processing | Codec pipeline (`ArraySpec.shape`) | +| `data_size` | Valid data region within the buffer | Indexing pipeline (`chunk_selection` slicing) | -### API surface +For `FixedDimension`, these differ only at the boundary. For `VaryingDimension`, they are identical. 
This matches current zarr-python behavior: `get_chunk_spec` passes the full `chunk_shape` to the codec for all chunks, and the indexer generates a `chunk_selection` that clips the decoded buffer. -#### Creating arrays +### ChunkSpec ```python -# Regular chunks — serializes as {"name": "regular", ...} -arr = zarr.create_array(shape=(100, 200), chunks=(10, 20)) +@dataclass(frozen=True) +class ChunkSpec: + slices: tuple[slice, ...] # valid data region in array coordinates + codec_shape: tuple[int, ...] # buffer shape for codec processing -# Rectilinear chunks — serializes as {"name": "rectilinear", ...} -arr = zarr.create_array(shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]]) + @property + def shape(self) -> tuple[int, ...]: + return tuple(s.stop - s.start for s in self.slices) -# RLE shorthand for rectilinear -arr = zarr.create_array(shape=(1000,), chunks=[[[100, 10]]]) # 10 chunks of size 100 + @property + def is_boundary(self) -> bool: + return self.shape != self.codec_shape ``` -#### Inspecting chunk grids +For interior chunks, `shape == codec_shape`. For boundary chunks of a regular grid, `codec_shape` is the full declared chunk size while `shape` is clipped. For rectilinear grids, `shape == codec_shape` always. 
+ +### API ```python -arr.chunk_grid # ChunkGrid instance (always) -arr.chunk_grid.is_regular # True if all dimensions are Fixed -arr.chunk_grid.chunk_shape # (10, 20) — only when is_regular, else raises -arr.chunk_grid.ndim # number of dimensions - -# Per-chunk queries (array shape passed as parameter): -arr.chunk_grid.get_chunk_shape(arr.shape, chunk_coord=(0, 1)) -arr.chunk_grid.get_chunk_origin(arr.shape, chunk_coord=(0, 1)) -arr.chunk_grid.all_chunk_coords(arr.shape) -arr.chunk_grid.grid_shape(arr.shape) # (10, 10) — chunks per dimension - -# Out-of-bounds returns None: -arr.chunk_grid.get_chunk_shape(arr.shape, chunk_coord=(99, 99)) # None -``` +# Creating arrays +arr = zarr.create_array(shape=(95, 200), chunks=(10, 25)) # regular +arr = zarr.create_array(shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]]) # rectilinear +arr = zarr.create_array(shape=(1000,), chunks=[[[100, 10]]]) # RLE shorthand -#### `.chunks` property +# ChunkGrid as a collection (using the regular array above) +grid = arr.chunk_grid # ChunkGrid instance +grid.shape # (10, 8) — number of chunks per dimension +grid.ndim # 2 +grid.is_regular # True if all dimensions are Fixed -`.chunks` is retained for regular grids, returning `tuple[int, ...]` as today. For rectilinear grids it raises `NotImplementedError`. `.chunk_grid` is the general-purpose API. 
+spec = grid[0, 1] # ChunkSpec for chunk at grid position (0, 1) +spec.slices # (slice(0, 10), slice(25, 50)) +spec.shape # (10, 25) — data shape +spec.codec_shape # (10, 25) — same for interior chunks -Three different chunk tuple conventions exist in the ecosystem: +boundary = grid[9, 0] # boundary chunk (extent=95, size=10) +boundary.shape # (5, 25) — 5 elements of real data +boundary.codec_shape # (10, 25) — codec sees full buffer -| System | Type | Example | -|---|---|---| -| Zarr `arr.chunks` | `tuple[int, ...]` | `(256, 512)` | -| Dask `arr.chunks` | `tuple[tuple[int, ...], ...]` | `((256, 256, 64), (512, 512))` | -| xarray `.chunks` | `tuple[tuple[int, ...], ...]` | Same as dask | +grid[99, 99] # None — out of bounds -Switching `.chunks` to dask-style tuples would be a breaking change and risks [expensive materialization for large regular grids](https://github.com/zarr-developers/zarr-python/pull/3534#discussion_r2457283002). The least disruptive path: keep `.chunks` for regular grids (no deprecation), add `.chunk_grid` alongside it, and let downstream libraries migrate at their own pace. +for spec in grid: # iterate all chunks + ... 
+ +# .chunks property: retained for regular grids, raises for rectilinear +arr.chunks # (10, 25) +``` + +`ChunkGrid.__getitem__` constructs `ChunkSpec` using `chunk_size` for `codec_shape` and `data_size` for `slices`: + +```python +def __getitem__(self, coords: tuple[int, ...]) -> ChunkSpec | None: + slices = [] + codec_shape = [] + for dim, ix in zip(self.dimensions, coords): + if ix < 0 or ix >= dim.nchunks: + return None + offset = dim.chunk_offset(ix) + slices.append(slice(offset, offset + dim.data_size(ix))) + codec_shape.append(dim.chunk_size(ix)) + return ChunkSpec(tuple(slices), tuple(codec_shape)) +``` #### Serialization @@ -139,257 +174,170 @@ Switching `.chunks` to dask-style tuples would be a breaking change and risks [e {"name": "rectilinear", "configuration": {"chunk_shapes": [[10, 20, 30], [[25, 4]]]}} ``` -Both names produce the same `ChunkGrid` class. Unknown names raise an error (chunk grids must always be understood). +Both names produce the same `ChunkGrid` class. The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. For rectilinear grids, the extent is redundant (`sum(edges)`) and is validated for consistency. #### Resize ```python -# Default: new region gets a single chunk spanning the growth arr.resize((80, 100)) # becomes rectilinear if not evenly divisible - -# Explicit: specify chunks for the new region -arr.resize((80, 100), chunks=[[10, 20, 30, 20], [25, 25, 25, 25]]) - -# Staying regular: if new shape is divisible by chunk size -arr.resize((70, 100)) # stays regular +arr.resize((80, 100), chunks=[[10, 20, 30, 20], [25, 25, 25, 25]]) # explicit chunks +arr.resize((70, 100)) # stays regular if divisible ``` +Resize constructs a new frozen `ChunkGrid`, replacing the old one. + ### Indexing -The indexing pipeline is deeply coupled to regular grid assumptions. 
Every per-dimension indexer (`IntDimIndexer`, `SliceDimIndexer`, `BoolArrayDimIndexer`, `IntArrayDimIndexer`) takes a scalar `dim_chunk_len: int` and uses `//` and `*` for all arithmetic: +The indexing pipeline is coupled to regular grid assumptions — every per-dimension indexer takes a scalar `dim_chunk_len: int` and uses `//` and `*`: ```python dim_chunk_ix = self.dim_sel // self.dim_chunk_len # IntDimIndexer dim_offset = dim_chunk_ix * self.dim_chunk_len # SliceDimIndexer -dim_sel_chunk = dim_sel // dim_chunk_len # IntArrayDimIndexer (vectorized) ``` -For `VaryingDimension`, element-to-chunk mapping becomes a binary search and offset-to-chunk becomes a prefix sum lookup. The indexers must work with either representation. - -**Recommended approach:** Replace `dim_chunk_len: int` with the dimension grid object (`FixedDimension | VaryingDimension`). The shared interface (`index_to_chunk`, `chunk_offset`, `chunk_size`, `indices_to_chunks`) means the indexer code structure stays the same — just replace `dim_sel // dim_chunk_len` with `dim_grid.index_to_chunk(dim_sel)`. This preserves O(1) arithmetic for regular dimensions and uses binary search only for varying ones. - -Alternatives considered: -- **Precompute arrays** (offsets, sizes) at indexer creation and branch on scalar vs array — awkward, two code paths per indexer. -- **Always use `np.searchsorted`** for both types — uniform code but penalizes regular grids. +Replace `dim_chunk_len: int` with the dimension object (`FixedDimension | VaryingDimension`). The shared interface means the indexer code structure stays the same — `dim_sel // dim_chunk_len` becomes `dim_grid.index_to_chunk(dim_sel)`. O(1) for regular, binary search for varying. ### Codec pipeline -Once the indexers determine *which* chunks to read or write, the codec pipeline needs to know *what shape* each chunk is. 
Today, `ArrayV3Metadata.get_chunk_spec()` ignores `chunk_coords` entirely — it returns the same `ArraySpec(shape=chunk_grid.chunk_shape)` for every chunk, because all chunks have the same shape in a regular grid. - -For rectilinear grids, each chunk may have a different shape. `get_chunk_spec` must use the coordinates: +Today, `get_chunk_spec()` returns the same `ArraySpec(shape=chunk_grid.chunk_shape)` for every chunk. For rectilinear grids, each chunk has a different codec shape: ```python def get_chunk_spec(self, chunk_coords, array_config, prototype) -> ArraySpec: - chunk_shape = self.chunk_grid.get_chunk_shape(self.shape, chunk_coords) - return ArraySpec(shape=chunk_shape, ...) + spec = self.chunk_grid[chunk_coords] + return ArraySpec(shape=spec.codec_shape, ...) ``` -The codec pipeline uses `ArraySpec.shape` to allocate buffers, decode data, and validate output, so the per-chunk shape must be correct. This is a mechanical change — the `chunk_coords` parameter already exists (currently prefixed with `_` to signal it's unused) — but it touches every read/write path. +Note `spec.codec_shape`, not `spec.shape`. For regular grids, `codec_shape` is uniform (preserving current behavior). The boundary clipping flow is unchanged: + +``` +Write: user data → pad to codec_shape with fill_value → encode → store +Read: store → decode to codec_shape → slice via chunk_selection → user data +``` ### Sharding -PR #3534 marks sharding as incompatible with rectilinear chunk grids. This constraint is unnecessary once the design is understood as three independent grid levels: +PR #3534 marks sharding as incompatible with rectilinear grids. This is unnecessary — sharding has three independent grid levels: ``` -Level 1 — Outer chunk grid (shard boundaries) - Can be regular or rectilinear. - e.g., chunks = [[5000, 5980], [5000, 5980]] - -Level 2 — Inner subchunk grid (within each shard) - Always regular, but boundary subchunks may be clipped to shard shape. 
- e.g., subchunk_shape = [512, 512] - -Level 3 — Shard index - ceil(shard_dim / subchunk_dim) entries per dimension, each (offset, size). +Level 1 — Outer chunk grid (shard boundaries): regular or rectilinear +Level 2 — Inner subchunk grid (within each shard): always regular +Level 3 — Shard index: ceil(shard_dim / subchunk_dim) entries per dimension ``` -The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape and subchunk shape. It doesn't need to know whether the outer grid is regular or rectilinear — each shard is self-contained. - -[zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) (sharding v1.1) lifts the requirement that subchunk shapes evenly divide the shard shape. With the proposed `ChunkGrid`, this requires one change: remove the `shard_shape % subchunk_shape == 0` validation. `FixedDimension` already handles boundary clipping. +The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as extent and the subchunk shape as `FixedDimension`. Each shard is self-contained — it doesn't need to know whether the outer grid is regular or rectilinear. -These two features compose independently: +[zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) lifts the requirement that subchunk shapes evenly divide the shard shape. With the proposed `ChunkGrid`, this just means removing the `shard_shape % subchunk_shape == 0` validation — `FixedDimension` already handles boundary clipping via `data_size`. 
| Outer grid | Subchunk divisibility | Required change | |---|---|---| -| Regular | Evenly divides (v1.0) | None (works today) | +| Regular | Evenly divides (v1.0) | None | | Regular | Non-divisible (v1.1) | Remove divisibility validation | | Rectilinear | Evenly divides | Remove "sharding incompatible" guard | -| Rectilinear | Non-divisible | Both changes; no additional work | +| Rectilinear | Non-divisible | Both changes | ### What this replaces -| Current design | Proposed design | +| Current | Proposed | |---|---| -| `ChunkGrid` abstract base class | Single concrete `ChunkGrid` class | -| `RegularChunkGrid` subclass | `ChunkGrid` with `is_regular` property | -| `RectilinearChunkGrid` subclass (#3534) | Same `ChunkGrid` class | -| Chunk grid registry + entrypoints (#3735) | Removed — direct name dispatch | -| `arr.chunks` → `tuple[int, ...]` | Retained for regular grids; `arr.chunk_grid` for general use | +| `ChunkGrid` ABC + `RegularChunkGrid` subclass | Single concrete `ChunkGrid` with `is_regular` | +| `RectilinearChunkGrid` (#3534) | Same `ChunkGrid` class | +| Chunk grid registry + entrypoints (#3735) | Direct name dispatch | +| `arr.chunks` | Retained for regular; `arr.chunk_grid` for general use | +| `get_chunk_shape(shape, coord)` | `grid[coord].codec_shape` or `grid[coord].shape` | ## Design decisions -### Why not store the array shape in ChunkGrid? +### Why store the extent in ChunkGrid? -[#3736](https://github.com/zarr-developers/zarr-python/issues/3736) proposes adding `array_shape` to the chunk grid, motivated by the awkwardness of passing and re-validating `array_shape` on every method call in PR #3534. zarrs takes the same approach, storing the shape at construction. This prospectus diverges. +The chunk grid is a concrete arrangement, not an abstract tiling pattern. A finite collection naturally has an extent. Storing it enables `__getitem__`, eliminates `dim_len` parameters from every method, and makes the grid self-describing. 
-**For:** +This does *not* mean `ArrayV3Metadata.shape` should delegate to the grid. The array shape remains an independent field in metadata. The extent is passed into the grid at construction time so it can answer boundary questions without external parameters. It is **not** included in `to_dict()` — it comes from the `shape` field in array metadata and is passed to `parse_chunk_grid()`. -- Simpler method signatures (no repeated `array_shape` parameter). -- Enables precomputing chunk count and boundary sizes. -- Prevents callers from passing the wrong shape. -- Eliminates repeated validation. +### Why distinguish chunk_size from data_size? -**Against:** +A chunk in a regular grid has two sizes. `chunk_size` is the buffer size the codec processes — always `size` for `FixedDimension`, even at the boundary (padded with `fill_value`). `data_size` is the valid data region — clipped to `extent % size` at the boundary. The indexing layer uses `data_size` to generate `chunk_selection` slices. -- The chunk grid is a tiling pattern, not a bound region. In the Zarr V3 spec, `chunk_grid` and `shape` are independent metadata fields. Storing the shape conflates "how to tile" with "what to tile over." Sharding exposes this — the same subchunk configuration produces different `ChunkGrid` instances for different shard shapes. `VaryingDimension` doesn't need the shape at all (edges fully define the grid). -- TensorStore validates the separation in production, storing only `chunk_shape`. -- serialization becomes awkward — `to_dict()` would need to return the shape alongside the grid even though the spec doesn't couple them. +This matches current zarr-python behavior and matters for: +1. **Backward compatibility.** Existing stores have boundary chunks encoded at full `chunk_shape`. +2. **Codec simplicity.** Codecs assume uniform input shapes for regular grids. +3. **Shard index correctness.** The index assumes `subchunk_dim`-sized entries. 
-The repeated-validation problem from #3534 is real but has a simpler fix: validate once at `ArrayV3Metadata` construction (where both `shape` and `chunk_grid` are available), then trust that callers pass the correct shape downstream. For `VaryingDimension`, most methods don't use the shape at all — the edges and cumulative sums are self-contained. For `FixedDimension`, only boundary chunk size and grid extent need the shape, and these are computed with a single scalar per dimension, not the full tuple. - -The cost of keeping them separate is one extra parameter on ~5 methods that are called O(1) times per operation. The benefit is a cleaner abstraction that's reusable across contexts (sharding, resize, serialization). +For `VaryingDimension`, `chunk_size == data_size` — no padding. This is the fundamental difference: `FixedDimension` has a declared size plus an extent that clips data; `VaryingDimension` has explicit sizes that *are* the extent. ### Why not a chunk grid registry? -zarrs uses compile-time + runtime plugin registration. This makes sense for a library that explicitly supports user-defined extensions. For zarr-python, there is no known chunk grid outside the rectilinear family that retains the tessellation properties the codebase assumes. A simple `match` on the grid name in `parse_chunk_grid()` is sufficient and avoids entrypoint complexity. +There is no known chunk grid outside the rectilinear family that retains the tessellation properties zarr-python assumes. A `match` on the grid name is sufficient. ### Why a single class instead of a Protocol? -zarrs uses independent types behind a shared trait. In Rust, the trait system enforces a uniform interface at zero runtime cost. In Python, a Protocol-based approach means every caller programs against an abstract interface, and adding a grid type requires implementing ~10 methods. Since all known grids are special cases of rectilinear, a single class is simpler while supporting the same metadata formats. 
If a genuinely novel grid type emerges, a Protocol can be extracted at that point. +All known grids are special cases of rectilinear. A Protocol-based approach means every caller programs against an abstract interface and adding a grid type requires implementing ~10 methods. A single class is simpler. If a genuinely novel grid type emerges, a Protocol can be extracted. ## Prior art -### zarrs (Rust) - -zarrs implements three independent chunk grid types (regular, regular-bounded, rectangular) behind a `ChunkGridTraits` trait. Key patterns adopted: +**zarrs (Rust):** Three independent grid types behind a `ChunkGridTraits` trait. Key patterns adopted: Fixed vs Varying per dimension, prefix sums + binary search, `Option` for out-of-bounds, `NonZeroU64` for chunk dimensions, separate subchunk grid per shard, array shape at construction. -- **Fixed vs Varying per dimension** — rectangular grid distinguishes `Fixed(size)` vs `Varying(Vec)` per axis -- **Prefix sums + binary search** — precomputed offsets with `partition_point` for O(log n) lookup -- **None for out-of-bounds** — chunk queries return `Option` instead of panicking -- **Non-zero chunk dimensions** — `NonZeroU64` makes zero-sized chunks unrepresentable -- **Sharding creates a separate grid** — `ShardingCodec` constructs an independent subchunk grid per shard - -### TensorStore (C++) - -TensorStore's `ChunkGridSpecification` stores only `chunk_shape`, not the array shape — validating the shape-free approach. It has both `RegularGridRef` and `IrregularGrid` internally (the latter with sorted breakpoints per dimension), but only the regular grid is used for Zarr V3. No chunk grid registry — the `"regular"` name is hardcoded. +**TensorStore (C++):** Stores only `chunk_shape` — boundary clipping via `valid_data_bounds` at query time. Both `RegularGridRef` and `IrregularGrid` internally. No registry. 
## Migration -### Existing PRs - -**#3735** (chunk grid module, +313/−65, approved by @maxrjones) splits `chunk_grids.py` into a `chunk_grids/` package (`__init__.py`, `common.py`, `regular.py`) and adds a chunk grid registry. The module layout is reusable. The registry (`register_chunk_grid` / `get_chunk_grid_class` in `registry.py`) is not — it should be replaced with direct name dispatch before merging. - -**#3737** (chunk grid array shape, +514/−198, draft) implements #3736 by adding `array_shape` to `ChunkGrid`. Depends on #3735. The prospectus argues against storing the array shape in the grid (see Design decisions). This PR should be closed. - -**#3534** (rectilinear implementation, +5716/−408, extensive review) introduces `RectilinearChunkGrid` as a separate subclass. The prospectus proposes a different architecture (single `ChunkGrid` with `FixedDimension`/`VaryingDimension`). Reusable components: - -| #3534 component | Disposition | -|---|---| -| `_expand_run_length_encoding` / `_compress_run_length_encoding` | **Keep** as-is | -| `_normalize_rectilinear_chunks` / `_parse_chunk_shapes` | **Keep with modifications** — feed into `VaryingDimension` construction | -| `resolve_chunk_spec` / `ChunksLike` type alias | **Keep** — orthogonal to grid class design | -| `_validate_zarr_format_compatibility` | **Keep** — rectilinear is V3-only | -| `_validate_sharding_compatibility` | **Remove** — sharding is compatible with rectilinear | -| `_validate_data_compatibility` (`from_array` guard) | **Keep for now** — needs separate design work | -| `RectilinearChunkGrid` class / `ConfigurationDict` | **Replace** — single `ChunkGrid` class | -| `chunk_grid` property on `Array`/`AsyncArray` | **Keep** | -| `.chunks` raising for rectilinear | **Keep** | -| Tests | **Adapt** for single-class API | -| Indexing changes | **Insufficient** — `assert isinstance(chunk_grid, RegularChunkGrid)` guards remain | - -Given the scope of architectural changes, a **fresh PR** is more 
practical than adapting #3534. Rebasing and reworking its core classes would touch nearly every line of a 5700-line diff while inheriting review history that no longer applies. - -**#1483** (ZEP0003 POC, +346/−20, draft, V2) is @martindurant's original proof-of-concept for variable chunking on Zarr V2. It demonstrated feasibility but targets the V2 format and predates the V3 extension point design. Should be closed. - ### Plan -1. **Amend and merge #3735.** Keep the `chunk_grids/` module layout. Replace the registry with direct name dispatch in `parse_chunk_grid()`. Remove `register_chunk_grid` / `get_chunk_grid_class` from `registry.py` and the entrypoint from `pyproject.toml`. - -2. **Open a new PR** implementing the prospectus: - - `FixedDimension` and `VaryingDimension` dataclasses with shared interface (`index_to_chunk`, `chunk_offset`, `chunk_size`, `indices_to_chunks`). - - Single `ChunkGrid` class with `dimensions: tuple[FixedDimension | VaryingDimension, ...]` and `is_regular`. - - `parse_chunk_grid()` recognizes `"regular"` and `"rectilinear"`. +1. **Amend and merge #3735.** Keep the `chunk_grids/` module layout. Replace the registry with direct name dispatch. Remove `register_chunk_grid` / `get_chunk_grid_class` and the entrypoint. +2. **Open a new PR** implementing this prospectus: + - `FixedDimension`, `VaryingDimension`, `ChunkSpec`, and `ChunkGrid` classes. + - `parse_chunk_grid(metadata, array_shape)` with `"regular"` and `"rectilinear"` dispatch. - Port RLE helpers, `resolve_chunk_spec`, `ChunksLike`, and validation functions from #3534. - - Refactor per-dimension indexers to accept `FixedDimension | VaryingDimension` instead of `dim_chunk_len: int`. - - Update `get_chunk_spec` to compute per-chunk shapes from coordinates. - - Add `arr.chunk_grid` property. Keep `.chunks` for regular grids, raise for rectilinear. + - Refactor per-dimension indexers to accept `FixedDimension | VaryingDimension`. 
+ - Update `get_chunk_spec` to use `grid[chunk_coords].codec_shape`. + - Add `arr.chunk_grid`. Keep `.chunks` for regular, raise for rectilinear. - Remove the "sharding incompatible with rectilinear" guard. - Adapt tests from #3534. +3. **Close trial PRs** with credits: + - **#3534** — RLE helpers, validation logic, chunk spec resolution, test cases, review discussion. + - **#3737** — extent-in-grid idea (adopted per-dimension). + - **#1483** — original POC; superseded by V3 implementation. + - **#3736** — resolved by storing extent per-dimension. +4. **Sharding v1.1** (separate PR, after zarr-specs#370) — remove `shard_shape % subchunk_shape == 0` validation. -3. **Close trial PRs** with comments linking to the new PR and crediting contributions: - - **Close #3534** — credit RLE helpers, validation logic, chunk spec resolution, test cases, and review discussion that shaped the design. - - **Close #3737** — reference the shape-free design decision. - - **Close #1483** — credit as the original POC that motivated the work; superseded by the V3 implementation. - - **Close #3736** — respond with the shape-free design rationale. - -4. **Sharding v1.1** (after zarr-specs#370 is accepted) — separate PR removing the `shard_shape % subchunk_shape == 0` validation in `ShardingCodec`. - -### Downstream migration - -Four active PRs/issues in the ecosystem depend on zarr-python's rectilinear chunk grid support. All currently track #3534 as their upstream dependency. The unified `ChunkGrid` design is a narrower API surface than the two-class hierarchy, so the net effect is less integration work per downstream — but each needs updates. - -#### xarray ([pydata/xarray#10880](https://github.com/pydata/xarray/pull/10880)) - -Draft PR by @keewis (+26/−9 in `xarray/backends/zarr.py`) enabling variable-sized chunk writes and reads via the zarr backend. Currently imports `RectilinearChunkGrid` / `RegularChunkGrid` for feature detection and branches on `isinstance` checks. 
- -**Required changes:** - -- **Feature detection.** Replace class-existence checks (`hasattr(zarr, 'RectilinearChunkGrid')`) with a version check or try-import of the unified `ChunkGrid`. Since the prospectus exports a single class, detection simplifies to checking whether `ChunkGrid` accepts non-uniform dimensions (or just `zarr.__version__`). -- **Write path.** Currently constructs chunk info that `RectilinearChunkGrid` understands. The prospectus's `chunks=[[10, 20, 30], [25, 25, 25, 25]]` API for `create_array` is a more natural fit — the xarray write path may get simpler. -- **Read path.** Replace `isinstance(chunk_grid, RectilinearChunkGrid)` with `not chunk_grid.is_regular`. Per-dimension chunk sizes come from `chunk_grid.dimensions[i].edges` (for `VaryingDimension`) or are computed from `chunk_grid.dimensions[i].size` (for `FixedDimension`). -- **`validate_grid_chunks_alignment`.** Still needs work regardless of class hierarchy — the approach is the same either way. +### Reusable components from #3534 -**Effort:** ~1–2 days. The PR is small and the unified API is more ergonomic for xarray's use case. - -#### VirtualiZarr ([zarr-developers/VirtualiZarr#877](https://github.com/zarr-developers/VirtualiZarr/pull/877)) - -Draft PR by @maxrjones adding rectilinear support to `ManifestArray`, with a `has_rectilinear_chunk_grid_support` feature flag and vendored `_is_nested_sequence` helper from #3534. - -**Required changes:** - -- **Drop vendored `_is_nested_sequence`.** The prospectus eliminates `RectilinearChunkGrid` as a separate class, so nested-sequence detection for choosing grid type is unnecessary — just construct `ChunkGrid` with appropriate dimension types. -- **`isinstance` → `.is_regular`.** All `isinstance(chunk_grid, RectilinearChunkGrid)` checks become `not chunk_grid.is_regular`. -- **`ManifestArray.chunks`.** Currently returns `chunk_grid.chunk_shapes` for rectilinear grids. Under the prospectus, chunk shapes come from iterating dimension edges. 
The dask-style `tuple[tuple[int, ...], ...]` format VirtualiZarr uses internally is unaffected. -- **`copy_and_replace_metadata`.** Simplifies: no need to detect nested sequences to pick a grid class. -- **Test environment.** Currently pins jhamman's zarr-python fork — would track whatever branch implements the prospectus. - -**Effort:** ~1–2 days. Mostly mechanical type-check replacements plus dropping the vendored helper. Concat/stack logic is grid-type-agnostic once chunk shapes are available. - -#### Icechunk ([earth-mover/icechunk#1338](https://github.com/earth-mover/icechunk/issues/1338)) - -Investigation issue for supporting rectilinear grids in the IC2 on-disk format. The `DimensionShape { dim_length, chunk_length }` struct needs extension to encode per-chunk sizes. - -**Impact:** Minimal. Icechunk's format changes are driven by the *spec* (ZEP0003 / rectilinear extension), not zarr-python's class hierarchy. The unified `ChunkGrid` means Icechunk's Python-side metadata ingestion handles one type instead of two. The `shift_array` / `reindex` concerns raised in the discussion are orthogonal to this design. - -**Effort:** No change to the work already scoped. May marginally simplify the Python integration layer. - -#### cubed ([cubed-dev/cubed#876](https://github.com/cubed-dev/cubed/issues/876)) - -Draft by @TomNicholas using rectilinear intermediate stores to reduce rechunking stages (+142/−27 across storage adapter, blockwise, and ops). - -**Required changes:** - -- **Store creation.** `zarr_python_v3.py` currently creates `RectilinearChunkGrid` instances directly. Switch to constructing `ChunkGrid` via the prospectus's list-of-lists `chunks` API. -- **Chunk shape queries.** Any `isinstance` checks on grid type become `.is_regular` checks. -- The rechunking algorithm itself is independent of the class hierarchy — it operates on per-dimension chunk tuples internally. 
+| Component | Disposition | +|---|---| +| RLE encode/decode helpers | **Keep** | +| `_normalize_rectilinear_chunks` / `_parse_chunk_shapes` | **Keep** — feed into `VaryingDimension` | +| `resolve_chunk_spec` / `ChunksLike` | **Keep** | +| `_validate_zarr_format_compatibility` | **Keep** — rectilinear is V3-only | +| `_validate_sharding_compatibility` | **Remove** — sharding is compatible | +| `RectilinearChunkGrid` class | **Replace** | +| Indexing changes | **Insufficient** — `isinstance` guards remain | -**Effort:** <1 day. Changes are concentrated in the storage adapter layer, and the prospectus's API is a natural fit for cubed's internal representation. +A **fresh PR** is more practical than adapting #3534's 5700-line diff. -#### Migration pattern +### Downstream migration -All four downstreams follow the same pattern. The migration from the two-class API to the unified API is mechanical: +All four downstream PRs/issues follow the same pattern: | Two-class pattern | Unified pattern | |---|---| | `isinstance(cg, RegularChunkGrid)` | `cg.is_regular` | | `isinstance(cg, RectilinearChunkGrid)` | `not cg.is_regular` | -| `cg.chunk_shape` (regular only) | `cg.chunk_shape` (raises if not regular) | -| `cg.chunk_shapes` (rectilinear) | `tuple(d.edges for d in cg.dimensions)` | -| `RegularChunkGrid(chunk_shape=(...))` | `ChunkGrid.from_regular((...))` or `chunks=(...)` in `create_array` | -| `RectilinearChunkGrid(chunk_shapes=(...))` | `ChunkGrid.from_rectilinear((...))` or `chunks=[[...], [...]]` in `create_array` | +| `cg.chunk_shape` | `cg.dimensions[i].size` or `cg[coord].shape` | +| `cg.chunk_shapes` | `tuple(d.edges for d in cg.dimensions)` | +| `RegularChunkGrid(chunk_shape=...)` | `ChunkGrid.from_regular(shape, chunks)` | +| `RectilinearChunkGrid(chunk_shapes=...)` | `ChunkGrid.from_rectilinear(edges)` | | Feature detection via class import | Version check or `hasattr(ChunkGrid, 'is_regular')` | +**[xarray#10880](https://github.com/pydata/xarray/pull/10880):** 
Replace `isinstance` checks with `.is_regular`. Write path simplifies with `chunks=[[...]]` API. ~1–2 days. + +**[VirtualiZarr#877](https://github.com/zarr-developers/VirtualiZarr/pull/877):** Drop vendored `_is_nested_sequence`. Replace `isinstance` checks. ~1–2 days. + +**[Icechunk#1338](https://github.com/earth-mover/icechunk/issues/1338):** Minimal impact — format changes driven by spec, not class hierarchy. + +**[cubed#876](https://github.com/cubed-dev/cubed/issues/876):** Switch store creation to `ChunkGrid` API. <1 day. + ## Open questions -1. **RLE in the Python API:** Should users pass RLE-encoded chunk specs directly, or only expanded lists? RLE is primarily a serialization concern, but for arrays with millions of chunks it matters at construction time too. -2. **Resize defaults:** When growing a regular array, should the default preserve regularity (extending the last chunk) or create a new chunk for the added region (transitioning to rectilinear)? +1. **RLE in the Python API:** Should users pass RLE-encoded chunk specs directly, or only expanded lists? +2. **Resize defaults:** When growing a regular array, should the default preserve regularity or transition to rectilinear? +3. **`ChunkSpec` complexity:** `ChunkSpec` carries both `slices` and `codec_shape`. Should the grid expose separate methods for codec vs data queries instead? +4. **`__getitem__` with slices:** Should `grid[0, :]` or `grid[0:3, :]` return a sub-grid or an iterator of `ChunkSpec`s? 
From d6d551af504a7c5740c0b8c50195e34b19b23535 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:55:20 -0400 Subject: [PATCH 004/118] V3 prospectus --- chunk-grid-prospectus.md | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/chunk-grid-prospectus.md b/chunk-grid-prospectus.md index a8149d2ad2..c764394a28 100644 --- a/chunk-grid-prospectus.md +++ b/chunk-grid-prospectus.md @@ -1,6 +1,6 @@ # Prospectus: Unified Chunk Grid Design for zarr-python -Version: 2 +Version: 3 **Related:** - [#3750](https://github.com/zarr-developers/zarr-python/issues/3750) (single ChunkGrid proposal) @@ -31,7 +31,7 @@ A registry-based plugin system adds complexity without clear benefit. ### Principles 1. **A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern — the specific partition of a specific array. The grid stores enough information to answer any question about any chunk without external parameters. -2. **One implementation, multiple serialization forms.** A single `ChunkGrid` class serializes as `"regular"` when all chunks are uniform, `"rectilinear"` otherwise. +2. **One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. The serialization format (`"regular"` vs `"rectilinear"`) is chosen by the metadata layer, not the grid. 3. **No chunk grid registry.** Simple name-based dispatch in `parse_chunk_grid()`. 4. **Fixed vs Varying per dimension.** `FixedDimension(size, extent)` for uniform chunks; `VaryingDimension(edges, cumulative)` for per-chunk edge lengths with precomputed prefix sums. Avoids expanding regular dimensions into lists of identical values. 5. **Transparent transitions.** Operations like `resize()` can move an array from regular to rectilinear chunking. 
@@ -174,17 +174,34 @@ def __getitem__(self, coords: tuple[int, ...]) -> ChunkSpec | None: {"name": "rectilinear", "configuration": {"chunk_shapes": [[10, 20, 30], [[25, 4]]]}} ``` -Both names produce the same `ChunkGrid` class. The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. For rectilinear grids, the extent is redundant (`sum(edges)`) and is validated for consistency. +Both names deserialize to the same `ChunkGrid` class. The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. For rectilinear grids, the extent is redundant (`sum(edges)`) and is validated for consistency. + +**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`, which already knows how to produce its JSON document. The flow is always: metadata document → `ChunkGrid` (via `parse_chunk_grid`), never the reverse. The grid is a pure runtime computation object. + +`ArrayV3Metadata` stores the chunk grid's JSON `name` from the original metadata document and uses it when serializing back. This gives round-trip fidelity for free — a store written as rectilinear with uniform edges stays rectilinear. + +The only place where a user needs to choose the format is when creating new metadata. For `create_array`, the format is inferred from the `chunks` argument: a flat tuple produces `"regular"`, a nested list produces `"rectilinear"`. For `resize`, the format can be specified explicitly via `chunk_grid_metadata`: + +```python +arr.resize( + (80, 100), + chunks=[[10, 20, 30, 20], [25, 25, 25, 25]], + chunk_grid_metadata="rectilinear", +) +``` + +`chunk_grid_metadata` is typed as `str`, not a closed literal — the Zarr V3 spec allows any registered chunk grid name. 
zarr-python supports `"regular"` and `"rectilinear"` natively; other names (e.g., zarrs' `"regular_bounded"`) would raise unless a handler is registered. If omitted, the format is inferred: `"rectilinear"` when chunks are non-uniform or explicitly nested, `"regular"` when chunks are a flat tuple and evenly divide the shape. Specifying `"regular"` when the chunks are non-uniform raises an error. #### Resize ```python -arr.resize((80, 100)) # becomes rectilinear if not evenly divisible +arr.resize((80, 100)) # inferred rectilinear if not evenly divisible arr.resize((80, 100), chunks=[[10, 20, 30, 20], [25, 25, 25, 25]]) # explicit chunks -arr.resize((70, 100)) # stays regular if divisible +arr.resize((70, 100)) # stays regular if divisible +arr.resize((100, 100), chunk_grid_metadata="rectilinear") # force rectilinear metadata ``` -Resize constructs a new frozen `ChunkGrid`, replacing the old one. +Resize creates new `ArrayV3Metadata` (and thus a new `ChunkGrid`). Since resize always creates new metadata, `chunk_grid_metadata` is the natural place to choose the serialization format. ### Indexing @@ -251,7 +268,7 @@ The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as The chunk grid is a concrete arrangement, not an abstract tiling pattern. A finite collection naturally has an extent. Storing it enables `__getitem__`, eliminates `dim_len` parameters from every method, and makes the grid self-describing. -This does *not* mean `ArrayV3Metadata.shape` should delegate to the grid. The array shape remains an independent field in metadata. The extent is passed into the grid at construction time so it can answer boundary questions without external parameters. It is **not** included in `to_dict()` — it comes from the `shape` field in array metadata and is passed to `parse_chunk_grid()`. +This does *not* mean `ArrayV3Metadata.shape` should delegate to the grid. The array shape remains an independent field in metadata. 
The extent is passed into the grid at construction time so it can answer boundary questions without external parameters. It is **not** serialized as part of the chunk grid JSON — it comes from the `shape` field in array metadata and is passed to `parse_chunk_grid()`. ### Why distinguish chunk_size from data_size? From 8b8af749bc9df23f420185a52cef87ad485655ac Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:05:01 -0400 Subject: [PATCH 005/118] Fastforward POC to V3 --- src/zarr/codecs/sharding.py | 12 +- src/zarr/core/array.py | 16 +- src/zarr/core/chunk_grids.py | 256 ++++++++++++----- src/zarr/core/indexing.py | 16 +- src/zarr/core/metadata/v2.py | 6 +- src/zarr/core/metadata/v3.py | 32 +-- src/zarr/testing/strategies.py | 4 +- tests/test_indexing.py | 4 +- tests/test_unified_chunk_grid.py | 459 ++++++++++++++++++++++++++----- 9 files changed, 616 insertions(+), 189 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 0b880fb7b8..1785199ec4 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -34,7 +34,7 @@ default_buffer_prototype, numpy_buffer_prototype, ) -from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid +from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import ( ShapeLike, parse_enum, @@ -416,7 +416,7 @@ async def _decode_single( indexer = BasicIndexer( tuple(slice(0, s) for s in shard_shape), shape=shard_shape, - chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + chunk_grid=ChunkGrid.from_regular(shard_shape, chunk_shape), ) # setup output array @@ -462,7 +462,7 @@ async def _decode_partial_single( indexer = get_indexer( selection, shape=shard_shape, - chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + chunk_grid=ChunkGrid.from_regular(shard_shape, chunk_shape), ) # setup output array @@ -537,7 +537,7 @@ async def _encode_single( BasicIndexer( tuple(slice(0, s) for s in shard_shape), 
shape=shard_shape, - chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + chunk_grid=ChunkGrid.from_regular(shard_shape, chunk_shape), ) ) @@ -586,7 +586,9 @@ async def _encode_partial_single( indexer = list( get_indexer( - selection, shape=shard_shape, chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape) + selection, + shape=shard_shape, + chunk_grid=ChunkGrid.from_regular(shard_shape, chunk_shape), ) ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 55ea25286b..96706c3a04 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -40,7 +40,12 @@ default_buffer_prototype, ) from zarr.core.buffer.cpu import buffer_prototype as cpu_buffer_prototype -from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid, _auto_partition, normalize_chunks +from zarr.core.chunk_grids import ( + ChunkGrid, + _auto_partition, + normalize_chunks, + parse_chunk_grid, +) from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -780,7 +785,7 @@ def _create_metadata_v3( if chunk_grid is not None: chunk_grid_parsed: ChunkGrid = chunk_grid else: - chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape) + chunk_grid_parsed = ChunkGrid.from_regular(shape, chunk_shape) return ArrayV3Metadata( shape=shape, data_type=dtype, @@ -4700,7 +4705,7 @@ async def init_array( sharding_codec.validate( shape=chunk_shape_parsed, dtype=zdtype, - chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), + chunk_grid=ChunkGrid.from_regular(shape_parsed, shard_shape_parsed), ) codecs_out = (sharding_codec,) chunks_out = shard_shape_parsed @@ -6001,8 +6006,9 @@ async def _resize( if delete_outside_chunks and not only_growing: # Remove all chunks outside of the new shape - old_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords(array.metadata.shape)) - new_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords(new_shape)) + old_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords()) + new_grid = 
parse_chunk_grid(array.metadata.chunk_grid, new_shape) + new_chunk_coords = set(new_grid.all_chunk_coords()) async def _delete_key(key: str) -> None: await (array.store_path / key).delete() diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 33f8ed907b..2070f118f6 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -40,13 +40,23 @@ @dataclass(frozen=True) class FixedDimension: - """All chunks on this axis have the same size.""" + """Uniform chunk size. Boundary chunks contain less data but are + encoded at full size by the codec pipeline.""" - size: int # chunk edge length (> 0) + size: int # chunk edge length (>= 0) + extent: int # array dimension length def __post_init__(self) -> None: if self.size < 0: raise ValueError(f"FixedDimension size must be >= 0, got {self.size}") + if self.extent < 0: + raise ValueError(f"FixedDimension extent must be >= 0, got {self.extent}") + + @property + def nchunks(self) -> int: + if self.size == 0: + return 1 if self.extent == 0 else 0 + return ceildiv(self.extent, self.size) def index_to_chunk(self, idx: int) -> int: if self.size == 0: @@ -56,15 +66,15 @@ def index_to_chunk(self, idx: int) -> int: def chunk_offset(self, chunk_ix: int) -> int: return chunk_ix * self.size - def chunk_size(self, chunk_ix: int, dim_len: int) -> int: - if self.size == 0: - return 0 - return min(self.size, dim_len - chunk_ix * self.size) + def chunk_size(self, chunk_ix: int) -> int: + """Buffer size for codec processing — always uniform.""" + return self.size - def nchunks(self, dim_len: int) -> int: + def data_size(self, chunk_ix: int) -> int: + """Valid data region within the buffer — clipped at extent.""" if self.size == 0: - return 1 if dim_len == 0 else 0 - return ceildiv(dim_len, self.size) + return 0 + return max(0, min(self.size, self.extent - chunk_ix * self.size)) def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: if self.size == 0: @@ -74,7 +84,8 @@ def 
indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int @dataclass(frozen=True) class VaryingDimension: - """Chunks on this axis have explicit per-chunk sizes.""" + """Explicit per-chunk sizes. No padding — each edge length is + both the codec size and the data size.""" edges: tuple[int, ...] # per-chunk edge lengths (all > 0) cumulative: tuple[int, ...] # prefix sums for O(log n) lookup @@ -89,17 +100,27 @@ def __init__(self, edges: Sequence[int]) -> None: object.__setattr__(self, "edges", edges_tuple) object.__setattr__(self, "cumulative", cumulative) + @property + def nchunks(self) -> int: + return len(self.edges) + + @property + def extent(self) -> int: + return self.cumulative[-1] + def index_to_chunk(self, idx: int) -> int: return bisect.bisect_right(self.cumulative, idx) def chunk_offset(self, chunk_ix: int) -> int: return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 - def chunk_size(self, chunk_ix: int, dim_len: int) -> int: + def chunk_size(self, chunk_ix: int) -> int: + """Buffer size for codec processing.""" return self.edges[chunk_ix] - def nchunks(self, dim_len: int) -> int: - return len(self.edges) + def data_size(self, chunk_ix: int) -> int: + """Valid data region — same as chunk_size for varying dims.""" + return self.edges[chunk_ix] def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: return np.searchsorted(self.cumulative, indices, side="right") @@ -108,6 +129,34 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int DimensionGrid = FixedDimension | VaryingDimension +# --------------------------------------------------------------------------- +# ChunkSpec +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class ChunkSpec: + """Specification of a single chunk's location and size. + + ``slices`` gives the valid data region in array coordinates. 
+ ``codec_shape`` gives the buffer shape for codec processing. + For interior chunks these are equal. For boundary chunks of a regular + grid, ``codec_shape`` is the full declared chunk size while ``shape`` + is clipped. For rectilinear grids, ``shape == codec_shape`` always. + """ + + slices: tuple[slice, ...] + codec_shape: tuple[int, ...] + + @property + def shape(self) -> tuple[int, ...]: + return tuple(s.stop - s.start for s in self.slices) + + @property + def is_boundary(self) -> bool: + return self.shape != self.codec_shape + + # --------------------------------------------------------------------------- # RLE helpers (ported from #3534) # --------------------------------------------------------------------------- @@ -155,6 +204,10 @@ class ChunkGrid(Metadata): """ Unified chunk grid supporting both regular and rectilinear chunking. + A chunk grid is a concrete arrangement of chunks for a specific array. + It stores the extent (array dimension length) per dimension, enabling + ``grid[coords]`` to return a ``ChunkSpec`` without external parameters. + Internally represents each dimension as either FixedDimension (uniform chunks) or VaryingDimension (per-chunk edge lengths with prefix sums). 
""" @@ -165,10 +218,19 @@ def __init__(self, *, dimensions: tuple[DimensionGrid, ...]) -> None: object.__setattr__(self, "dimensions", dimensions) @classmethod - def from_regular(cls, chunk_shape: ShapeLike) -> ChunkGrid: + def from_regular(cls, array_shape: ShapeLike, chunk_shape: ShapeLike) -> ChunkGrid: """Create a ChunkGrid where all dimensions are fixed (regular).""" - parsed = parse_shapelike(chunk_shape) - dims = tuple(FixedDimension(size=s) for s in parsed) + shape_parsed = parse_shapelike(array_shape) + chunks_parsed = parse_shapelike(chunk_shape) + if len(shape_parsed) != len(chunks_parsed): + raise ValueError( + f"array_shape and chunk_shape must have same ndim, " + f"got {len(shape_parsed)} vs {len(chunks_parsed)}" + ) + dims = tuple( + FixedDimension(size=c, extent=s) + for s, c in zip(shape_parsed, chunks_parsed, strict=True) + ) return cls(dimensions=dims) @classmethod @@ -177,14 +239,16 @@ def from_rectilinear(cls, chunk_shapes: Sequence[Sequence[int]]) -> ChunkGrid: Each element of chunk_shapes is a sequence of chunk sizes for that dimension. If all sizes in a dimension are identical, it's stored as FixedDimension. + The extent of each dimension is ``sum(edges)``. 
""" dims: list[DimensionGrid] = [] for edges in chunk_shapes: edges_list = list(edges) if not edges_list: raise ValueError("Each dimension must have at least one chunk") + extent = sum(edges_list) if all(e == edges_list[0] for e in edges_list): - dims.append(FixedDimension(size=edges_list[0])) + dims.append(FixedDimension(size=edges_list[0], extent=extent)) else: dims.append(VaryingDimension(edges_list)) return cls(dimensions=tuple(dims)) @@ -199,6 +263,11 @@ def ndim(self) -> int: def is_regular(self) -> bool: return all(isinstance(d, FixedDimension) for d in self.dimensions) + @property + def shape(self) -> tuple[int, ...]: + """Number of chunks per dimension.""" + return tuple(d.nchunks for d in self.dimensions) + @property def chunk_shape(self) -> tuple[int, ...]: """Return the uniform chunk shape. Raises if grid is not regular.""" @@ -212,60 +281,47 @@ def chunk_shape(self) -> tuple[int, ...]: if not self.is_regular: raise ValueError( "chunk_shape is only available for regular chunk grids. " - "Use get_chunk_shape(array_shape, chunk_coords) for rectilinear grids." + "Use grid[coords] for per-chunk sizes." ) return tuple(d.size for d in self.dimensions) # type: ignore[union-attr] - # -- Chunk queries (shape-free where possible) -- + # -- Collection interface -- - def get_chunk_shape( - self, array_shape: tuple[int, ...], chunk_coords: tuple[int, ...] - ) -> tuple[int, ...] | None: - """Return the shape of a specific chunk, or None if out of bounds.""" - result: list[int] = [] - for dim, dim_len, chunk_ix in zip(self.dimensions, array_shape, chunk_coords, strict=True): - nch = dim.nchunks(dim_len) - if chunk_ix < 0 or chunk_ix >= nch: - return None - result.append(dim.chunk_size(chunk_ix, dim_len)) - return tuple(result) - - def get_chunk_origin( - self, array_shape: tuple[int, ...], chunk_coords: tuple[int, ...] - ) -> tuple[int, ...] 
| None: - """Return the origin (start indices) of a specific chunk, or None if OOB.""" - result: list[int] = [] - for dim, dim_len, chunk_ix in zip(self.dimensions, array_shape, chunk_coords, strict=True): - nch = dim.nchunks(dim_len) - if chunk_ix < 0 or chunk_ix >= nch: + def __getitem__(self, coords: tuple[int, ...]) -> ChunkSpec | None: + """Return the ChunkSpec for a chunk at the given grid position, or None if OOB.""" + slices: list[slice] = [] + codec_shape: list[int] = [] + for dim, ix in zip(self.dimensions, coords, strict=True): + if ix < 0 or ix >= dim.nchunks: return None - result.append(dim.chunk_offset(chunk_ix)) - return tuple(result) + offset = dim.chunk_offset(ix) + slices.append(slice(offset, offset + dim.data_size(ix))) + codec_shape.append(dim.chunk_size(ix)) + return ChunkSpec(tuple(slices), tuple(codec_shape)) - def grid_shape(self, array_shape: tuple[int, ...]) -> tuple[int, ...]: - """Return the number of chunks per dimension.""" - return tuple(d.nchunks(s) for d, s in zip(self.dimensions, array_shape, strict=True)) + def __iter__(self) -> Iterator[ChunkSpec]: + """Iterate all chunks, yielding ChunkSpec for each.""" + for coords in itertools.product(*(range(d.nchunks) for d in self.dimensions)): + spec = self[coords] + if spec is not None: + yield spec - def all_chunk_coords(self, array_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - return itertools.product( - *(range(d.nchunks(s)) for d, s in zip(self.dimensions, array_shape, strict=True)) - ) + def all_chunk_coords(self) -> Iterator[tuple[int, ...]]: + return itertools.product(*(range(d.nchunks) for d in self.dimensions)) - def get_nchunks(self, array_shape: tuple[int, ...]) -> int: - return reduce( - operator.mul, - (d.nchunks(s) for d, s in zip(self.dimensions, array_shape, strict=True)), - 1, - ) + def get_nchunks(self) -> int: + return reduce(operator.mul, (d.nchunks for d in self.dimensions), 1) # -- Serialization -- @classmethod def from_dict(cls, data: dict[str, JSON] | 
ChunkGrid | NamedConfig[str, Any]) -> ChunkGrid: if isinstance(data, ChunkGrid): - # Handle both ChunkGrid and legacy RegularChunkGrid if isinstance(data, RegularChunkGrid): - return ChunkGrid.from_regular(data.chunk_shape) + return ChunkGrid.from_regular( + tuple(d.extent for d in data.dimensions), + data.chunk_shape, + ) return data name_parsed, configuration_parsed = parse_named_configuration(data) @@ -276,7 +332,12 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> raise ValueError("Regular chunk grid requires 'chunk_shape' configuration") if not isinstance(chunk_shape_raw, Sequence): raise TypeError(f"chunk_shape must be a sequence, got {type(chunk_shape_raw)}") - return cls.from_regular(cast("Sequence[int]", chunk_shape_raw)) + # Without array shape, create with extent=0 as placeholder. + # parse_chunk_grid() should be used when array shape is available. + dims = tuple( + FixedDimension(size=int(cast("int", s)), extent=0) for s in chunk_shape_raw + ) + return cls(dimensions=dims) if name_parsed == "rectilinear": chunk_shapes_raw = configuration_parsed.get("chunk_shapes") @@ -284,7 +345,6 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") if not isinstance(chunk_shapes_raw, Sequence): raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - # Decode RLE if present decoded: list[list[int]] = [] for dim_spec in chunk_shapes_raw: if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): @@ -307,12 +367,24 @@ def to_dict(self) -> dict[str, JSON]: chunk_shapes: list[Any] = [] for dim in self.dimensions: if isinstance(dim, FixedDimension): - # Single fixed size — store as RLE - chunk_shapes.append([[dim.size, 1]]) + # Serialize as uniform edges. 
The extent is reconstructed + # from sum(edges) on deserialization, so we use the actual + # data sizes to preserve the true extent (which may not be + # a multiple of chunk size at the boundary). + n = dim.nchunks + if n == 0: + chunk_shapes.append([]) + else: + last_data = dim.extent - (n - 1) * dim.size + if last_data == dim.size: + edges = [dim.size] * n + else: + edges = [dim.size] * (n - 1) + [last_data] + rle = _compress_rle(edges) + chunk_shapes.append(rle) else: edges = list(dim.edges) rle = _compress_rle(edges) - # Use RLE only if it actually compresses if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): chunk_shapes.append(rle) else: @@ -323,6 +395,55 @@ def to_dict(self) -> dict[str, JSON]: } +def parse_chunk_grid( + data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], + array_shape: tuple[int, ...], +) -> ChunkGrid: + """Create a ChunkGrid from a metadata dict, injecting array shape as extent. + + This is the primary entry point for constructing a ChunkGrid from serialized + metadata. Unlike ``ChunkGrid.from_dict``, this always produces a grid with + correct extent values. 
+ """ + if isinstance(data, ChunkGrid): + # Re-bind extent if array_shape differs from what's stored + dims: list[DimensionGrid] = [] + for dim, extent in zip(data.dimensions, array_shape, strict=True): + if isinstance(dim, FixedDimension): + dims.append(FixedDimension(size=dim.size, extent=extent)) + else: + dims.append(dim) # VaryingDimension has intrinsic extent + return ChunkGrid(dimensions=tuple(dims)) + + name_parsed, configuration_parsed = parse_named_configuration(data) + + if name_parsed == "regular": + chunk_shape_raw = configuration_parsed.get("chunk_shape") + if chunk_shape_raw is None: + raise ValueError("Regular chunk grid requires 'chunk_shape' configuration") + if not isinstance(chunk_shape_raw, Sequence): + raise TypeError(f"chunk_shape must be a sequence, got {type(chunk_shape_raw)}") + return ChunkGrid.from_regular(array_shape, cast("Sequence[int]", chunk_shape_raw)) + + if name_parsed == "rectilinear": + chunk_shapes_raw = configuration_parsed.get("chunk_shapes") + if chunk_shapes_raw is None: + raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") + if not isinstance(chunk_shapes_raw, Sequence): + raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") + decoded: list[list[int]] = [] + for dim_spec in chunk_shapes_raw: + if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): + decoded.append(_expand_rle(dim_spec)) + elif isinstance(dim_spec, list): + decoded.append(dim_spec) + else: + raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + return ChunkGrid.from_rectilinear(decoded) + + raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") + + # --------------------------------------------------------------------------- # Backwards-compatible alias # --------------------------------------------------------------------------- @@ -335,7 +456,8 @@ class RegularChunkGrid(ChunkGrid): def __init__(self, *, chunk_shape: ShapeLike) -> None: chunk_shape_parsed = 
parse_shapelike(chunk_shape) - dims = tuple(FixedDimension(size=s) for s in chunk_shape_parsed) + # Without array shape, use extent=0 as placeholder + dims = tuple(FixedDimension(size=s, extent=0) for s in chunk_shape_parsed) object.__setattr__(self, "dimensions", dims) object.__setattr__(self, "_chunk_shape", chunk_shape_parsed) @@ -347,15 +469,13 @@ def _from_dict(cls, data: dict[str, JSON] | NamedConfig[str, Any]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}} - def all_chunk_coords(self, array_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - return itertools.product( - *(range(ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape, strict=False)) - ) + def all_chunk_coords(self) -> Iterator[tuple[int, ...]]: + return itertools.product(*(range(d.nchunks) for d in self.dimensions)) - def get_nchunks(self, array_shape: tuple[int, ...]) -> int: + def get_nchunks(self) -> int: return reduce( operator.mul, - itertools.starmap(ceildiv, zip(array_shape, self.chunk_shape, strict=True)), + (d.nchunks for d in self.dimensions), 1, ) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index cedd5a14bc..8ec5620228 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -403,7 +403,7 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: dim_offset = g.chunk_offset(dim_chunk_ix) dim_chunk_sel = self.dim_sel - dim_offset dim_out_sel = None - is_complete_chunk = g.chunk_size(dim_chunk_ix, self.dim_len) == 1 + is_complete_chunk = g.data_size(dim_chunk_ix) == 1 else: dim_chunk_ix = self.dim_sel // self.dim_chunk_len dim_offset = dim_chunk_ix * self.dim_chunk_len @@ -447,7 +447,7 @@ def __init__( object.__setattr__(self, "nitems", max(0, ceildiv((stop - start), step))) if dim_grid is not None: - object.__setattr__(self, "nchunks", dim_grid.nchunks(dim_len)) + object.__setattr__(self, "nchunks", dim_grid.nchunks) else: object.__setattr__(self, 
"nchunks", ceildiv(dim_len, dim_chunk_len)) @@ -460,7 +460,7 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): dim_offset = g.chunk_offset(dim_chunk_ix) - dim_chunk_len = g.chunk_size(dim_chunk_ix, self.dim_len) + dim_chunk_len = g.data_size(dim_chunk_ix) dim_limit = dim_offset + dim_chunk_len if self.start < dim_offset: @@ -723,7 +723,7 @@ def __init__( g = dim_grid if g is not None: - nchunks = g.nchunks(dim_len) + nchunks = g.nchunks else: nchunks = ceildiv(dim_len, dim_chunk_len) @@ -732,7 +732,7 @@ def __init__( for dim_chunk_ix in range(nchunks): if g is not None: dim_offset = g.chunk_offset(dim_chunk_ix) - chunk_len = g.chunk_size(dim_chunk_ix, dim_len) + chunk_len = g.data_size(dim_chunk_ix) else: dim_offset = dim_chunk_ix * dim_chunk_len chunk_len = dim_chunk_len @@ -762,7 +762,7 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: # find region in chunk if g is not None: dim_offset = g.chunk_offset(dim_chunk_ix) - chunk_len = g.chunk_size(dim_chunk_ix, self.dim_len) + chunk_len = g.data_size(dim_chunk_ix) else: dim_offset = dim_chunk_ix * self.dim_chunk_len chunk_len = self.dim_chunk_len @@ -859,7 +859,7 @@ def __init__( g = dim_grid if g is not None: - nchunks = g.nchunks(dim_len) + nchunks = g.nchunks else: nchunks = ceildiv(dim_len, dim_chunk_len) @@ -1280,7 +1280,7 @@ def __init__( if shape == (): cdata_shape = (1,) else: - cdata_shape = tuple(g.nchunks(s) for g, s in zip(dim_grids, shape, strict=True)) + cdata_shape = tuple(g.nchunks for g in dim_grids) nchunks = reduce(operator.mul, cdata_shape, 1) # some initial normalization diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3204543426..1d58dfe8c4 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -7,7 +7,7 @@ from zarr.abc.metadata import Metadata from zarr.abc.numcodec import Numcodec, _is_numcodec -from zarr.core.chunk_grids import RegularChunkGrid +from 
zarr.core.chunk_grids import ChunkGrid from zarr.core.dtype import get_data_type_from_json from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 from zarr.errors import ZarrUserWarning @@ -118,8 +118,8 @@ def ndim(self) -> int: return len(self.shape) @cached_property - def chunk_grid(self) -> RegularChunkGrid: - return RegularChunkGrid(chunk_shape=self.chunks) + def chunk_grid(self) -> ChunkGrid: + return ChunkGrid.from_regular(self.shape, self.chunks) @property def shards(self) -> tuple[int, ...] | None: diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 1f8d6b4674..775e08e334 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -24,7 +24,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArrayConfig, ArraySpec -from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid +from zarr.core.chunk_grids import ChunkGrid, parse_chunk_grid from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -229,7 +229,7 @@ def __init__( """ shape_parsed = parse_shapelike(shape) - chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) + chunk_grid_parsed = parse_chunk_grid(chunk_grid, shape_parsed) chunk_key_encoding_parsed = parse_chunk_key_encoding(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # Note: relying on a type method is numpy-specific @@ -262,14 +262,8 @@ def __init__( self._validate_metadata() def _validate_metadata(self) -> None: - if hasattr(self.chunk_grid, "ndim") and len(self.shape) != self.chunk_grid.ndim: + if len(self.shape) != self.chunk_grid.ndim: raise ValueError("`chunk_grid` and `shape` need to have the same number of dimensions.") - elif isinstance(self.chunk_grid, RegularChunkGrid) and len(self.shape) != len( - self.chunk_grid.chunk_shape - ): - raise ValueError( - "`chunk_shape` and `shape` need to have the same number of dimensions." 
- ) if self.dimension_names is not None and len(self.shape) != len(self.dimension_names): raise ValueError( "`dimension_names` and `shape` need to have the same number of dimensions." @@ -332,21 +326,13 @@ def inner_codecs(self) -> tuple[Codec, ...]: def get_chunk_spec( self, _chunk_coords: tuple[int, ...], array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: - if self.chunk_grid.is_regular: - # Regular grids: return the uniform chunk shape for all chunks, - # including boundary chunks. The codec pipeline expects full-sized - # buffers and handles boundary trimming separately. - chunk_shape = self.chunk_grid.chunk_shape - else: - # Rectilinear grids: each chunk may have a different shape. - chunk_shape_or_none = self.chunk_grid.get_chunk_shape(self.shape, _chunk_coords) - if chunk_shape_or_none is None: - raise ValueError( - f"Chunk coordinates {_chunk_coords} are out of bounds for shape {self.shape}" - ) - chunk_shape = chunk_shape_or_none + spec = self.chunk_grid[_chunk_coords] + if spec is None: + raise ValueError( + f"Chunk coordinates {_chunk_coords} are out of bounds for shape {self.shape}" + ) return ArraySpec( - shape=chunk_shape, + shape=spec.codec_shape, dtype=self.dtype, fill_value=self.fill_value, config=array_config, diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 330f220b56..45c86eb3eb 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -14,7 +14,7 @@ from zarr.abc.store import RangeByteRequest, Store from zarr.codecs.bytes import BytesCodec from zarr.core.array import Array -from zarr.core.chunk_grids import RegularChunkGrid +from zarr.core.chunk_grids import ChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype @@ -160,7 +160,7 @@ def array_metadata( return ArrayV3Metadata( shape=shape, data_type=dtype, - 
chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + chunk_grid=ChunkGrid.from_regular(shape, chunk_shape), fill_value=fill_value, attributes=draw(attributes), # type: ignore[arg-type] dimension_names=draw(dimension_names(ndim=ndim)), diff --git a/tests/test_indexing.py b/tests/test_indexing.py index c0bf7dd270..11c0a49e7f 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1219,8 +1219,8 @@ def test_get_block_selection_1d(store: StorePath) -> None: _test_get_block_selection(a, z, selection, expected_idx) bad_selections = block_selections_1d_bad + [ - z.metadata.chunk_grid.get_nchunks(z.shape) + 1, # out of bounds - -(z.metadata.chunk_grid.get_nchunks(z.shape) + 1), # out of bounds + z.metadata.chunk_grid.get_nchunks() + 1, # out of bounds + -(z.metadata.chunk_grid.get_nchunks() + 1), # out of bounds ] for selection_bad in bad_selections: diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 420e774342..b7460881f1 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -2,14 +2,14 @@ Tests for the unified ChunkGrid design (POC). Tests the core ChunkGrid with FixedDimension/VaryingDimension internals, -serialization round-trips, indexing with rectilinear grids, and end-to-end -array creation + read/write. +ChunkSpec, serialization round-trips, indexing with rectilinear grids, +and end-to-end array creation + read/write. 
""" from __future__ import annotations import json -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import numpy as np import pytest @@ -17,8 +17,11 @@ if TYPE_CHECKING: from pathlib import Path + from zarr.core.common import JSON + from zarr.core.chunk_grids import ( ChunkGrid, + ChunkSpec, FixedDimension, RegularChunkGrid, VaryingDimension, @@ -33,8 +36,9 @@ class TestFixedDimension: def test_basic(self) -> None: - d = FixedDimension(size=10) + d = FixedDimension(size=10, extent=100) assert d.size == 10 + assert d.extent == 100 assert d.index_to_chunk(0) == 0 assert d.index_to_chunk(9) == 0 assert d.index_to_chunk(10) == 1 @@ -42,26 +46,38 @@ def test_basic(self) -> None: assert d.chunk_offset(0) == 0 assert d.chunk_offset(1) == 10 assert d.chunk_offset(3) == 30 - assert d.chunk_size(0, 100) == 10 - assert d.chunk_size(9, 100) == 10 - # boundary chunk - assert d.chunk_size(9, 95) == 5 - assert d.nchunks(100) == 10 - assert d.nchunks(95) == 10 + # chunk_size is always uniform (codec buffer) + assert d.chunk_size(0) == 10 + assert d.chunk_size(9) == 10 + # data_size clips at boundary + assert d.data_size(0) == 10 + assert d.data_size(9) == 10 + assert d.nchunks == 10 + + def test_boundary_data_size(self) -> None: + d = FixedDimension(size=10, extent=95) + assert d.nchunks == 10 + assert d.chunk_size(9) == 10 # codec buffer always full + assert d.data_size(9) == 5 # only 5 valid elements at boundary def test_vectorized(self) -> None: - d = FixedDimension(size=10) + d = FixedDimension(size=10, extent=100) indices = np.array([0, 5, 10, 15, 99]) chunks = d.indices_to_chunks(indices) np.testing.assert_array_equal(chunks, [0, 0, 1, 1, 9]) def test_negative_size_rejected(self) -> None: with pytest.raises(ValueError, match="must be >= 0"): - FixedDimension(size=-1) + FixedDimension(size=-1, extent=100) + + def test_negative_extent_rejected(self) -> None: + with pytest.raises(ValueError, match="must be >= 0"): + FixedDimension(size=10, extent=-1) def 
test_zero_size_allowed(self) -> None: - d = FixedDimension(size=0) + d = FixedDimension(size=0, extent=0) assert d.size == 0 + assert d.nchunks == 1 # 0-size with 0-extent = 1 chunk # --------------------------------------------------------------------------- @@ -74,17 +90,15 @@ def test_basic(self) -> None: d = VaryingDimension([10, 20, 30]) assert d.edges == (10, 20, 30) assert d.cumulative == (10, 30, 60) - assert d.nchunks(60) == 3 + assert d.nchunks == 3 + assert d.extent == 60 def test_index_to_chunk(self) -> None: d = VaryingDimension([10, 20, 30]) - # First chunk: indices 0-9 assert d.index_to_chunk(0) == 0 assert d.index_to_chunk(9) == 0 - # Second chunk: indices 10-29 assert d.index_to_chunk(10) == 1 assert d.index_to_chunk(29) == 1 - # Third chunk: indices 30-59 assert d.index_to_chunk(30) == 2 assert d.index_to_chunk(59) == 2 @@ -96,9 +110,16 @@ def test_chunk_offset(self) -> None: def test_chunk_size(self) -> None: d = VaryingDimension([10, 20, 30]) - assert d.chunk_size(0, 60) == 10 - assert d.chunk_size(1, 60) == 20 - assert d.chunk_size(2, 60) == 30 + assert d.chunk_size(0) == 10 + assert d.chunk_size(1) == 20 + assert d.chunk_size(2) == 30 + + def test_data_size(self) -> None: + d = VaryingDimension([10, 20, 30]) + # data_size == chunk_size for varying dims + assert d.data_size(0) == 10 + assert d.data_size(1) == 20 + assert d.data_size(2) == 30 def test_vectorized(self) -> None: d = VaryingDimension([10, 20, 30]) @@ -115,6 +136,29 @@ def test_zero_edge_rejected(self) -> None: VaryingDimension([10, 0, 5]) +# --------------------------------------------------------------------------- +# ChunkSpec +# --------------------------------------------------------------------------- + + +class TestChunkSpec: + def test_basic(self) -> None: + spec = ChunkSpec( + slices=(slice(0, 10), slice(0, 20)), + codec_shape=(10, 20), + ) + assert spec.shape == (10, 20) + assert not spec.is_boundary + + def test_boundary(self) -> None: + spec = ChunkSpec( + 
slices=(slice(90, 95), slice(0, 20)), + codec_shape=(10, 20), + ) + assert spec.shape == (5, 20) + assert spec.is_boundary + + # --------------------------------------------------------------------------- # ChunkGrid construction # --------------------------------------------------------------------------- @@ -122,14 +166,14 @@ def test_zero_edge_rejected(self) -> None: class TestChunkGridConstruction: def test_from_regular(self) -> None: - g = ChunkGrid.from_regular((10, 20)) + g = ChunkGrid.from_regular((100, 200), (10, 20)) assert g.is_regular assert g.chunk_shape == (10, 20) assert g.ndim == 2 def test_zero_dim(self) -> None: """0-d arrays produce a ChunkGrid with no dimensions.""" - g = ChunkGrid.from_regular(()) + g = ChunkGrid.from_regular((), ()) assert g.is_regular assert g.chunk_shape == () assert g.ndim == 0 @@ -160,44 +204,76 @@ def test_all_uniform_becomes_regular(self) -> None: class TestChunkGridQueries: - def test_regular_grid_shape(self) -> None: - g = ChunkGrid.from_regular((10, 20)) - assert g.grid_shape((100, 200)) == (10, 10) - assert g.grid_shape((95, 200)) == (10, 10) + def test_regular_shape(self) -> None: + g = ChunkGrid.from_regular((100, 200), (10, 20)) + assert g.shape == (10, 10) - def test_rectilinear_grid_shape(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) - assert g.grid_shape((60, 100)) == (3, 4) + def test_regular_shape_boundary(self) -> None: + g = ChunkGrid.from_regular((95, 200), (10, 20)) + assert g.shape == (10, 10) # ceildiv(95, 10) == 10 - def test_regular_get_chunk_shape(self) -> None: - g = ChunkGrid.from_regular((10, 20)) - assert g.get_chunk_shape((100, 200), (0, 0)) == (10, 20) - assert g.get_chunk_shape((95, 200), (9, 0)) == (5, 20) # boundary - assert g.get_chunk_shape((100, 200), (99, 0)) is None # OOB + def test_rectilinear_shape(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + assert g.shape == (3, 4) - def test_rectilinear_get_chunk_shape(self) 
-> None: + def test_regular_getitem(self) -> None: + g = ChunkGrid.from_regular((100, 200), (10, 20)) + spec = g[(0, 0)] + assert spec is not None + assert spec.shape == (10, 20) + assert spec.codec_shape == (10, 20) + assert not spec.is_boundary + + def test_regular_getitem_boundary(self) -> None: + g = ChunkGrid.from_regular((95, 200), (10, 20)) + spec = g[(9, 0)] + assert spec is not None + assert spec.shape == (5, 20) # data_size clipped + assert spec.codec_shape == (10, 20) # codec always full + assert spec.is_boundary + + def test_regular_getitem_oob(self) -> None: + g = ChunkGrid.from_regular((100, 200), (10, 20)) + assert g[(99, 0)] is None + + def test_rectilinear_getitem(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) - assert g.get_chunk_shape((60, 100), (0, 0)) == (10, 25) - assert g.get_chunk_shape((60, 100), (1, 0)) == (20, 25) - assert g.get_chunk_shape((60, 100), (2, 3)) == (30, 25) - assert g.get_chunk_shape((60, 100), (3, 0)) is None # OOB + spec0 = g[(0, 0)] + assert spec0 is not None + assert spec0.shape == (10, 25) + + spec1 = g[(1, 0)] + assert spec1 is not None + assert spec1.shape == (20, 25) - def test_get_chunk_origin(self) -> None: + spec2 = g[(2, 3)] + assert spec2 is not None + assert spec2.shape == (30, 25) + + assert g[(3, 0)] is None # OOB + + def test_getitem_slices(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) - assert g.get_chunk_origin((60, 100), (0, 0)) == (0, 0) - assert g.get_chunk_origin((60, 100), (1, 0)) == (10, 0) - assert g.get_chunk_origin((60, 100), (2, 2)) == (30, 50) + spec = g[(1, 2)] + assert spec is not None + assert spec.slices == (slice(10, 30), slice(50, 75)) def test_all_chunk_coords(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) - coords = list(g.all_chunk_coords((60, 100))) - assert len(coords) == 6 # 3 * 2 + coords = list(g.all_chunk_coords()) + assert len(coords) == 6 assert coords[0] == (0, 0) assert coords[-1] == 
(2, 1) def test_get_nchunks(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) - assert g.get_nchunks((60, 100)) == 6 + assert g.get_nchunks() == 6 + + def test_iter(self) -> None: + g = ChunkGrid.from_regular((30, 40), (10, 20)) + specs = list(g) + assert len(specs) == 6 # 3 * 2 + assert all(isinstance(s, ChunkSpec) for s in specs) # --------------------------------------------------------------------------- @@ -227,7 +303,7 @@ def test_roundtrip(self) -> None: class TestSerialization: def test_regular_roundtrip(self) -> None: - g = ChunkGrid.from_regular((10, 20)) + g = ChunkGrid.from_regular((100, 200), (10, 20)) d = g.to_dict() assert d["name"] == "regular" config = d["configuration"] @@ -243,29 +319,28 @@ def test_rectilinear_roundtrip(self) -> None: assert d["name"] == "rectilinear" g2 = ChunkGrid.from_dict(d) assert not g2.is_regular - # Verify the reconstructed grid produces correct shapes - assert g2.get_chunk_shape((60, 100), (0, 0)) == (10, 25) - assert g2.get_chunk_shape((60, 100), (1, 0)) == (20, 25) - assert g2.get_chunk_shape((60, 100), (2, 3)) == (30, 25) + # Verify the reconstructed grid has same dimensions + spec0 = g2[(0, 0)] + assert spec0 is not None + assert spec0.shape == (10, 25) + spec1 = g2[(1, 0)] + assert spec1 is not None + assert spec1.shape == (20, 25) def test_rectilinear_rle_serialization(self) -> None: """RLE should be used when it actually compresses.""" g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]]) d = g.to_dict() - # First dim: 10 identical chunks -> RLE - # Second dim: 4 identical chunks -> stored as FixedDimension -> RLE [[25, 1]] assert d["name"] == "regular" # all uniform -> serializes as regular def test_rectilinear_rle_with_varying(self) -> None: g = ChunkGrid.from_rectilinear([[100, 100, 100, 50], [25, 25, 25, 25]]) d = g.to_dict() assert d["name"] == "rectilinear" - # Check RLE used for first dimension config = d["configuration"] assert isinstance(config, dict) chunk_shapes = 
config["chunk_shapes"] assert isinstance(chunk_shapes, list) - # First dim: [100, 100, 100, 50] -> [[100, 3], [50, 1]] (RLE shorter) assert chunk_shapes[0] == [[100, 3], [50, 1]] def test_json_roundtrip(self) -> None: @@ -274,7 +349,7 @@ def test_json_roundtrip(self) -> None: json_str = json.dumps(d) d2 = json.loads(json_str) g2 = ChunkGrid.from_dict(d2) - assert g2.grid_shape((60, 100)) == (3, 2) + assert g2.shape == (3, 2) def test_unknown_name_raises(self) -> None: with pytest.raises(ValueError, match="Unknown chunk grid"): @@ -294,9 +369,8 @@ def test_regular_chunk_grid_still_works(self) -> None: assert isinstance(g, ChunkGrid) def test_from_dict_regular(self) -> None: - d: dict[str, Any] = {"name": "regular", "configuration": {"chunk_shape": [10, 20]}} + d: dict[str, JSON] = {"name": "regular", "configuration": {"chunk_shape": [10, 20]}} g = ChunkGrid.from_dict(d) - # from_dict now returns ChunkGrid, not RegularChunkGrid assert isinstance(g, ChunkGrid) assert g.is_regular assert g.chunk_shape == (10, 20) @@ -327,16 +401,13 @@ def test_basic_indexer_rectilinear(self) -> None: chunk_grid=g, ) projections = list(indexer) - # Should visit all 3*2=6 chunks assert len(projections) == 6 - # Check first chunk p0 = projections[0] assert p0.chunk_coords == (0, 0) assert p0.chunk_selection == (slice(0, 10, 1), slice(0, 50, 1)) - # Check second chunk on first axis - p1 = projections[2] # (1, 0) in product order + p1 = projections[2] assert p1.chunk_coords == (1, 0) assert p1.chunk_selection == (slice(0, 20, 1), slice(0, 50, 1)) @@ -345,14 +416,14 @@ def test_basic_indexer_int_selection(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) indexer = BasicIndexer( - selection=(15, slice(None)), # index 15 falls in chunk 1 (offset 10) + selection=(15, slice(None)), shape=(60, 100), chunk_grid=g, ) projections = list(indexer) - assert len(projections) == 2 # 2 chunks in second dimension + assert len(projections) == 2 assert projections[0].chunk_coords == (1, 
0) - assert projections[0].chunk_selection == (5, slice(0, 50, 1)) # 15 - 10 = 5 + assert projections[0].chunk_selection == (5, slice(0, 50, 1)) def test_basic_indexer_slice_subset(self) -> None: from zarr.core.indexing import BasicIndexer @@ -364,7 +435,6 @@ def test_basic_indexer_slice_subset(self) -> None: chunk_grid=g, ) projections = list(indexer) - # slice(5, 35) spans chunks 0 (5:10), 1 (0:20), 2 (0:5) chunk_coords_dim0 = sorted({p.chunk_coords[0] for p in projections}) assert chunk_coords_dim0 == [0, 1, 2] @@ -412,7 +482,7 @@ def test_create_rectilinear_array(self, tmp_path: Path) -> None: meta = AsyncArray._create_metadata_v3( shape=(60, 100), dtype=Float32(), - chunk_shape=(10, 20), # fallback, overridden by chunk_grid + chunk_shape=(10, 20), chunk_grid=g, ) assert isinstance(meta, ArrayV3Metadata) @@ -424,10 +494,13 @@ def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) d = g.to_dict() g2 = ChunkGrid.from_dict(d) - assert g2.grid_shape((60, 100)) == g.grid_shape((60, 100)) - # All chunk shapes should match - for coord in g.all_chunk_coords((60, 100)): - assert g.get_chunk_shape((60, 100), coord) == g2.get_chunk_shape((60, 100), coord) + assert g2.shape == g.shape + for coord in g.all_chunk_coords(): + orig_spec = g[coord] + new_spec = g2[coord] + assert orig_spec is not None + assert new_spec is not None + assert orig_spec.shape == new_spec.shape def test_get_chunk_spec_regular(self, tmp_path: Path) -> None: """get_chunk_spec works for regular grids.""" @@ -448,7 +521,6 @@ def test_get_chunk_spec_regular(self, tmp_path: Path) -> None: ) assert spec.shape == (10, 20) - # Boundary chunk spec_boundary = meta.get_chunk_spec( (9, 9), ArrayConfig.from_dict({}), @@ -497,9 +569,250 @@ def test_sharding_accepts_rectilinear_outer_grid(self) -> None: codec = ShardingCodec(chunk_shape=(5, 5)) g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) - # Should not raise codec.validate( 
shape=(60, 100), dtype=Float32(), chunk_grid=g, ) + + +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + """Edge cases around boundary chunks, zero-size dims, direct construction, + and serialization round-trips.""" + + # -- FixedDimension boundary (extent != size * nchunks) -- + + def test_fixed_dim_boundary_data_size(self) -> None: + """Boundary chunk's data_size is clipped to the remainder.""" + d = FixedDimension(size=10, extent=95) + assert d.nchunks == 10 + assert d.data_size(0) == 10 + assert d.data_size(9) == 5 # 95 - 9*10 = 5 + assert d.chunk_size(9) == 10 # codec buffer always full + + def test_fixed_dim_data_size_out_of_bounds(self) -> None: + """data_size clamps to 0 for out-of-bounds chunk indices.""" + d = FixedDimension(size=10, extent=100) + assert d.data_size(10) == 0 # exactly at boundary + assert d.data_size(11) == 0 # past boundary + assert d.data_size(999) == 0 + + def test_fixed_dim_data_size_boundary_oob(self) -> None: + """data_size for boundary grid, past last chunk.""" + d = FixedDimension(size=10, extent=95) + assert d.data_size(10) == 0 # past nchunks=10 + + def test_chunk_grid_boundary_getitem(self) -> None: + """ChunkGrid with boundary FixedDimension via direct construction.""" + g = ChunkGrid(dimensions=(FixedDimension(10, 95), FixedDimension(20, 40))) + spec = g[(9, 1)] + assert spec is not None + assert spec.shape == (5, 20) # data: (95-90, 40-20) + assert spec.codec_shape == (10, 20) # codec buffers are full + assert spec.is_boundary + + def test_chunk_grid_boundary_iter(self) -> None: + """Iterating a boundary grid yields correct boundary ChunkSpecs.""" + g = ChunkGrid(dimensions=(FixedDimension(10, 25),)) + specs = list(g) + assert len(specs) == 3 + assert specs[0].shape == (10,) + assert specs[1].shape == (10,) + assert specs[2].shape == (5,) + assert specs[2].is_boundary + assert 
not specs[0].is_boundary + + def test_chunk_grid_boundary_shape(self) -> None: + """shape property with boundary extent.""" + g = ChunkGrid(dimensions=(FixedDimension(10, 95),)) + assert g.shape == (10,) # ceildiv(95, 10) = 10 + + # -- Boundary FixedDimension in rectilinear serialization -- + + def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: + """A rectilinear grid with a boundary FixedDimension preserves extent.""" + g = ChunkGrid( + dimensions=( + VaryingDimension([10, 20, 30]), + FixedDimension(size=10, extent=95), + ) + ) + assert g.shape == (3, 10) + + d = g.to_dict() + assert d["name"] == "rectilinear" + # Second dim should serialize as edges that sum to 95 + config = d["configuration"] + assert isinstance(config, dict) + chunk_shapes = config["chunk_shapes"] + assert isinstance(chunk_shapes, list) + # Last edge should be 5, not 10 + dim1_shapes = chunk_shapes[1] + # Expand RLE to check + if isinstance(dim1_shapes[0], list): + expanded = _expand_rle(dim1_shapes) + else: + expanded = dim1_shapes + assert sum(expanded) == 95 # extent preserved + assert expanded[-1] == 5 # boundary chunk + + g2 = ChunkGrid.from_dict(d) + assert g2.shape == g.shape + # Round-tripped grid should have correct extent + for coord in g.all_chunk_coords(): + orig = g[coord] + new = g2[coord] + assert orig is not None + assert new is not None + assert orig.shape == new.shape + + def test_exact_extent_fixed_dim_rectilinear_roundtrip(self) -> None: + """No boundary: extent == size * nchunks round-trips cleanly.""" + g = ChunkGrid( + dimensions=( + VaryingDimension([10, 20]), + FixedDimension(size=25, extent=100), + ) + ) + d = g.to_dict() + g2 = ChunkGrid.from_dict(d) + assert g2.shape == g.shape + # All chunks should be uniform + for coord in g.all_chunk_coords(): + orig = g[coord] + new = g2[coord] + assert orig is not None + assert new is not None + assert orig.shape == new.shape + + # -- Zero-size and zero-extent -- + + def test_zero_size_zero_extent(self) -> None: + 
"""FixedDimension(size=0, extent=0) => 1 chunk of size 0.""" + d = FixedDimension(size=0, extent=0) + assert d.nchunks == 1 + assert d.chunk_size(0) == 0 + assert d.data_size(0) == 0 + + def test_zero_size_nonzero_extent(self) -> None: + """FixedDimension(size=0, extent=5) => 0 chunks (can't partition).""" + d = FixedDimension(size=0, extent=5) + assert d.nchunks == 0 + assert d.data_size(0) == 0 + assert d.chunk_size(0) == 0 + + def test_zero_extent_nonzero_size(self) -> None: + """FixedDimension(size=10, extent=0) => 0 chunks.""" + d = FixedDimension(size=10, extent=0) + assert d.nchunks == 0 + assert d.data_size(0) == 0 + + # -- 0-d grid -- + + def test_0d_grid_getitem(self) -> None: + """0-d grid has exactly one chunk at coords ().""" + g = ChunkGrid.from_regular((), ()) + spec = g[()] + assert spec is not None + assert spec.shape == () + assert spec.codec_shape == () + assert not spec.is_boundary + + def test_0d_grid_iter(self) -> None: + """0-d grid iteration yields a single ChunkSpec.""" + g = ChunkGrid.from_regular((), ()) + specs = list(g) + assert len(specs) == 1 + + def test_0d_grid_all_chunk_coords(self) -> None: + """0-d grid has one chunk coord: the empty tuple.""" + g = ChunkGrid.from_regular((), ()) + coords = list(g.all_chunk_coords()) + assert coords == [()] + + def test_0d_grid_nchunks(self) -> None: + g = ChunkGrid.from_regular((), ()) + assert g.get_nchunks() == 1 + + # -- parse_chunk_grid edge cases -- + + def test_parse_chunk_grid_preserves_varying_extent(self) -> None: + """parse_chunk_grid does not overwrite VaryingDimension extent.""" + from zarr.core.chunk_grids import parse_chunk_grid + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + # VaryingDimension extent is 60 (sum of edges) + assert isinstance(g.dimensions[0], VaryingDimension) + assert g.dimensions[0].extent == 60 + + # Re-binding with a different array shape should not change VaryingDimension + g2 = parse_chunk_grid(g, (60, 100)) + assert isinstance(g2.dimensions[0], 
VaryingDimension) + assert g2.dimensions[0].extent == 60 # unchanged + + def test_parse_chunk_grid_rebinds_fixed_extent(self) -> None: + """parse_chunk_grid updates FixedDimension extent from array shape.""" + from zarr.core.chunk_grids import parse_chunk_grid + + g = ChunkGrid.from_regular((100, 200), (10, 20)) + assert g.dimensions[0].extent == 100 + + g2 = parse_chunk_grid(g, (50, 100)) + assert isinstance(g2.dimensions[0], FixedDimension) + assert g2.dimensions[0].extent == 50 # re-bound + assert g2.shape == (5, 5) + + # -- ChunkGrid.__getitem__ validation -- + + def test_getitem_negative_index_returns_none(self) -> None: + g = ChunkGrid.from_regular((100,), (10,)) + assert g[(-1,)] is None + + def test_getitem_oob_returns_none(self) -> None: + g = ChunkGrid.from_regular((100,), (10,)) + assert g[(10,)] is None + assert g[(99,)] is None + + # -- ChunkSpec properties -- + + def test_chunk_spec_empty_slices(self) -> None: + """ChunkSpec with zero-width slice.""" + spec = ChunkSpec(slices=(slice(10, 10),), codec_shape=(0,)) + assert spec.shape == (0,) + assert not spec.is_boundary + + def test_chunk_spec_multidim_boundary(self) -> None: + """is_boundary only when shape != codec_shape.""" + spec = ChunkSpec( + slices=(slice(0, 10), slice(0, 5)), + codec_shape=(10, 10), + ) + assert spec.shape == (10, 5) + assert spec.is_boundary # second dim differs + + # -- Rectilinear with zero-nchunks FixedDimension in to_dict -- + + def test_zero_nchunks_fixed_dim_in_rectilinear_to_dict(self) -> None: + """A rectilinear grid with a 0-nchunks FixedDimension serializes.""" + g = ChunkGrid( + dimensions=( + VaryingDimension([10, 20]), + FixedDimension(size=10, extent=0), + ) + ) + assert g.shape == (2, 0) + d = g.to_dict() + assert d["name"] == "rectilinear" + + # -- VaryingDimension data_size -- + + def test_varying_dim_data_size_equals_chunk_size(self) -> None: + """For VaryingDimension, data_size == chunk_size (no padding).""" + d = VaryingDimension([10, 20, 5]) + for i in 
range(3): + assert d.data_size(i) == d.chunk_size(i) From 784f4e7b5747a846b16b61c6d81b3f889646e88d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:08:06 -0400 Subject: [PATCH 006/118] Remove prospectus --- chunk-grid-prospectus.md | 360 --------------------------------------- 1 file changed, 360 deletions(-) delete mode 100644 chunk-grid-prospectus.md diff --git a/chunk-grid-prospectus.md b/chunk-grid-prospectus.md deleted file mode 100644 index c764394a28..0000000000 --- a/chunk-grid-prospectus.md +++ /dev/null @@ -1,360 +0,0 @@ -# Prospectus: Unified Chunk Grid Design for zarr-python - -Version: 3 - -**Related:** -- [#3750](https://github.com/zarr-developers/zarr-python/issues/3750) (single ChunkGrid proposal) -- [#3534](https://github.com/zarr-developers/zarr-python/pull/3534) (rectilinear implementation) -- [#3735](https://github.com/zarr-developers/zarr-python/pull/3735) (chunk grid module/registry) -- [ZEP0003](https://github.com/zarr-developers/zeps/blob/main/draft/ZEP0003.md) (variable chunking spec) -- [zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) (sharding v1.1: non-divisible subchunks) -- [zarr-extensions#25](https://github.com/zarr-developers/zarr-extensions/pull/25) (rectilinear extension) -- [zarr-extensions#34](https://github.com/zarr-developers/zarr-extensions/issues/34) (sharding + rectilinear) - -## Problem - -The Zarr V3 spec defines `chunk_grid` as an extension point, but chunk grids are fundamentally different from codecs. Codecs are independent — supporting `zstd` tells you nothing about `gzip`. Chunk grids form a hierarchy — the rectilinear grid is strictly more general than the regular grid. Any regular grid is expressible as a rectilinear grid. - -There is no known chunk grid that is both (a) more general than rectilinear and (b) retains the axis-aligned tessellation properties Zarr assumes. 
All known grids are special cases: - -| Grid type | Description | -|---|---| -| Regular | Uniform chunk size, boundary chunks padded with fill_value | -| Regular-bounded (zarrs) | Uniform chunk size, boundary chunks trimmed to array extent | -| HPC boundary-padded | Regular interior, larger boundary chunks | -| Fully variable | Arbitrary per-chunk sizes | - -A registry-based plugin system adds complexity without clear benefit. - -## Design - -### Principles - -1. **A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern — the specific partition of a specific array. The grid stores enough information to answer any question about any chunk without external parameters. -2. **One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. The serialization format (`"regular"` vs `"rectilinear"`) is chosen by the metadata layer, not the grid. -3. **No chunk grid registry.** Simple name-based dispatch in `parse_chunk_grid()`. -4. **Fixed vs Varying per dimension.** `FixedDimension(size, extent)` for uniform chunks; `VaryingDimension(edges, cumulative)` for per-chunk edge lengths with precomputed prefix sums. Avoids expanding regular dimensions into lists of identical values. -5. **Transparent transitions.** Operations like `resize()` can move an array from regular to rectilinear chunking. - -### Internal representation - -```python -@dataclass(frozen=True) -class FixedDimension: - """Uniform chunk size. 
Boundary chunks contain less data but are - encoded at full size by the codec pipeline.""" - size: int # chunk edge length (> 0) - extent: int # array dimension length - - @property - def nchunks(self) -> int: - return ceildiv(self.extent, self.size) - - def index_to_chunk(self, idx: int) -> int: - return idx // self.size - def chunk_offset(self, chunk_ix: int) -> int: - return chunk_ix * self.size - def chunk_size(self, chunk_ix: int) -> int: - return self.size # always uniform - def data_size(self, chunk_ix: int) -> int: - return min(self.size, self.extent - chunk_ix * self.size) # clipped at extent - def indices_to_chunks(self, indices: NDArray) -> NDArray: - return indices // self.size - -@dataclass(frozen=True) -class VaryingDimension: - """Explicit per-chunk sizes. No padding — each edge length is - both the codec size and the data size.""" - edges: tuple[int, ...] # per-chunk edge lengths (all > 0) - cumulative: tuple[int, ...] # prefix sums for O(log n) lookup - - @property - def nchunks(self) -> int: - return len(self.edges) - @property - def extent(self) -> int: - return self.cumulative[-1] - - def index_to_chunk(self, idx: int) -> int: - return bisect.bisect_right(self.cumulative, idx) - def chunk_offset(self, chunk_ix: int) -> int: - return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 - def chunk_size(self, chunk_ix: int) -> int: - return self.edges[chunk_ix] - def data_size(self, chunk_ix: int) -> int: - return self.edges[chunk_ix] # same as chunk_size - def indices_to_chunks(self, indices: NDArray) -> NDArray: - return np.searchsorted(self.cumulative, indices, side='right') -``` - -Both types share a common interface: `nchunks`, `extent`, `index_to_chunk`, `chunk_offset`, `chunk_size`, `data_size`, `indices_to_chunks`. Memory usage scales with the number of *varying* dimensions, not total chunks. 
- -The two size methods serve different consumers: - -| Method | Returns | Consumer | -|---|---|---| -| `chunk_size` | Buffer size for codec processing | Codec pipeline (`ArraySpec.shape`) | -| `data_size` | Valid data region within the buffer | Indexing pipeline (`chunk_selection` slicing) | - -For `FixedDimension`, these differ only at the boundary. For `VaryingDimension`, they are identical. This matches current zarr-python behavior: `get_chunk_spec` passes the full `chunk_shape` to the codec for all chunks, and the indexer generates a `chunk_selection` that clips the decoded buffer. - -### ChunkSpec - -```python -@dataclass(frozen=True) -class ChunkSpec: - slices: tuple[slice, ...] # valid data region in array coordinates - codec_shape: tuple[int, ...] # buffer shape for codec processing - - @property - def shape(self) -> tuple[int, ...]: - return tuple(s.stop - s.start for s in self.slices) - - @property - def is_boundary(self) -> bool: - return self.shape != self.codec_shape -``` - -For interior chunks, `shape == codec_shape`. For boundary chunks of a regular grid, `codec_shape` is the full declared chunk size while `shape` is clipped. For rectilinear grids, `shape == codec_shape` always. 
- -### API - -```python -# Creating arrays -arr = zarr.create_array(shape=(100, 200), chunks=(10, 20)) # regular -arr = zarr.create_array(shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]]) # rectilinear -arr = zarr.create_array(shape=(1000,), chunks=[[[100, 10]]]) # RLE shorthand - -# ChunkGrid as a collection -grid = arr.chunk_grid # ChunkGrid instance -grid.shape # (10, 8) — number of chunks per dimension -grid.ndim # 2 -grid.is_regular # True if all dimensions are Fixed - -spec = grid[0, 1] # ChunkSpec for chunk at grid position (0, 1) -spec.slices # (slice(0, 10), slice(25, 50)) -spec.shape # (10, 25) — data shape -spec.codec_shape # (10, 25) — same for interior chunks - -boundary = grid[9, 0] # boundary chunk (extent=95, size=10) -boundary.shape # (5, 25) — 5 elements of real data -boundary.codec_shape # (10, 25) — codec sees full buffer - -grid[99, 99] # None — out of bounds - -for spec in grid: # iterate all chunks - ... - -# .chunks property: retained for regular grids, raises for rectilinear -arr.chunks # (10, 25) -``` - -`ChunkGrid.__getitem__` constructs `ChunkSpec` using `chunk_size` for `codec_shape` and `data_size` for `slices`: - -```python -def __getitem__(self, coords: tuple[int, ...]) -> ChunkSpec | None: - slices = [] - codec_shape = [] - for dim, ix in zip(self.dimensions, coords): - if ix < 0 or ix >= dim.nchunks: - return None - offset = dim.chunk_offset(ix) - slices.append(slice(offset, offset + dim.data_size(ix))) - codec_shape.append(dim.chunk_size(ix)) - return ChunkSpec(tuple(slices), tuple(codec_shape)) -``` - -#### Serialization - -```python -# Regular grid: -{"name": "regular", "configuration": {"chunk_shape": [10, 20]}} - -# Rectilinear grid (with RLE compression): -{"name": "rectilinear", "configuration": {"chunk_shapes": [[10, 20, 30], [[25, 4]]]}} -``` - -Both names deserialize to the same `ChunkGrid` class. 
The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. For rectilinear grids, the extent is redundant (`sum(edges)`) and is validated for consistency. - -**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`, which already knows how to produce its JSON document. The flow is always: metadata document → `ChunkGrid` (via `parse_chunk_grid`), never the reverse. The grid is a pure runtime computation object. - -`ArrayV3Metadata` stores the chunk grid's JSON `name` from the original metadata document and uses it when serializing back. This gives round-trip fidelity for free — a store written as rectilinear with uniform edges stays rectilinear. - -The only place where a user needs to choose the format is when creating new metadata. For `create_array`, the format is inferred from the `chunks` argument: a flat tuple produces `"regular"`, a nested list produces `"rectilinear"`. For `resize`, the format can be specified explicitly via `chunk_grid_metadata`: - -```python -arr.resize( - (80, 100), - chunks=[[10, 20, 30, 20], [25, 25, 25, 25]], - chunk_grid_metadata="rectilinear", -) -``` - -`chunk_grid_metadata` is typed as `str`, not a closed literal — the Zarr V3 spec allows any registered chunk grid name. zarr-python supports `"regular"` and `"rectilinear"` natively; other names (e.g., zarrs' `"regular_bounded"`) would raise unless a handler is registered. If omitted, the format is inferred: `"rectilinear"` when chunks are non-uniform or explicitly nested, `"regular"` when chunks are a flat tuple and evenly divide the shape. Specifying `"regular"` when the chunks are non-uniform raises an error. 
- -#### Resize - -```python -arr.resize((80, 100)) # inferred rectilinear if not evenly divisible -arr.resize((80, 100), chunks=[[10, 20, 30, 20], [25, 25, 25, 25]]) # explicit chunks -arr.resize((70, 100)) # stays regular if divisible -arr.resize((100, 100), chunk_grid_metadata="rectilinear") # force rectilinear metadata -``` - -Resize creates new `ArrayV3Metadata` (and thus a new `ChunkGrid`). Since resize always creates new metadata, `chunk_grid_metadata` is the natural place to choose the serialization format. - -### Indexing - -The indexing pipeline is coupled to regular grid assumptions — every per-dimension indexer takes a scalar `dim_chunk_len: int` and uses `//` and `*`: - -```python -dim_chunk_ix = self.dim_sel // self.dim_chunk_len # IntDimIndexer -dim_offset = dim_chunk_ix * self.dim_chunk_len # SliceDimIndexer -``` - -Replace `dim_chunk_len: int` with the dimension object (`FixedDimension | VaryingDimension`). The shared interface means the indexer code structure stays the same — `dim_sel // dim_chunk_len` becomes `dim_grid.index_to_chunk(dim_sel)`. O(1) for regular, binary search for varying. - -### Codec pipeline - -Today, `get_chunk_spec()` returns the same `ArraySpec(shape=chunk_grid.chunk_shape)` for every chunk. For rectilinear grids, each chunk has a different codec shape: - -```python -def get_chunk_spec(self, chunk_coords, array_config, prototype) -> ArraySpec: - spec = self.chunk_grid[chunk_coords] - return ArraySpec(shape=spec.codec_shape, ...) -``` - -Note `spec.codec_shape`, not `spec.shape`. For regular grids, `codec_shape` is uniform (preserving current behavior). The boundary clipping flow is unchanged: - -``` -Write: user data → pad to codec_shape with fill_value → encode → store -Read: store → decode to codec_shape → slice via chunk_selection → user data -``` - -### Sharding - -PR #3534 marks sharding as incompatible with rectilinear grids. 
This is unnecessary — sharding has three independent grid levels: - -``` -Level 1 — Outer chunk grid (shard boundaries): regular or rectilinear -Level 2 — Inner subchunk grid (within each shard): always regular -Level 3 — Shard index: ceil(shard_dim / subchunk_dim) entries per dimension -``` - -The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as extent and the subchunk shape as `FixedDimension`. Each shard is self-contained — it doesn't need to know whether the outer grid is regular or rectilinear. - -[zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) lifts the requirement that subchunk shapes evenly divide the shard shape. With the proposed `ChunkGrid`, this just means removing the `shard_shape % subchunk_shape == 0` validation — `FixedDimension` already handles boundary clipping via `data_size`. - -| Outer grid | Subchunk divisibility | Required change | -|---|---|---| -| Regular | Evenly divides (v1.0) | None | -| Regular | Non-divisible (v1.1) | Remove divisibility validation | -| Rectilinear | Evenly divides | Remove "sharding incompatible" guard | -| Rectilinear | Non-divisible | Both changes | - -### What this replaces - -| Current | Proposed | -|---|---| -| `ChunkGrid` ABC + `RegularChunkGrid` subclass | Single concrete `ChunkGrid` with `is_regular` | -| `RectilinearChunkGrid` (#3534) | Same `ChunkGrid` class | -| Chunk grid registry + entrypoints (#3735) | Direct name dispatch | -| `arr.chunks` | Retained for regular; `arr.chunk_grid` for general use | -| `get_chunk_shape(shape, coord)` | `grid[coord].codec_shape` or `grid[coord].shape` | - -## Design decisions - -### Why store the extent in ChunkGrid? - -The chunk grid is a concrete arrangement, not an abstract tiling pattern. A finite collection naturally has an extent. Storing it enables `__getitem__`, eliminates `dim_len` parameters from every method, and makes the grid self-describing. 
- -This does *not* mean `ArrayV3Metadata.shape` should delegate to the grid. The array shape remains an independent field in metadata. The extent is passed into the grid at construction time so it can answer boundary questions without external parameters. It is **not** serialized as part of the chunk grid JSON — it comes from the `shape` field in array metadata and is passed to `parse_chunk_grid()`. - -### Why distinguish chunk_size from data_size? - -A chunk in a regular grid has two sizes. `chunk_size` is the buffer size the codec processes — always `size` for `FixedDimension`, even at the boundary (padded with `fill_value`). `data_size` is the valid data region — clipped to `extent % size` at the boundary. The indexing layer uses `data_size` to generate `chunk_selection` slices. - -This matches current zarr-python behavior and matters for: -1. **Backward compatibility.** Existing stores have boundary chunks encoded at full `chunk_shape`. -2. **Codec simplicity.** Codecs assume uniform input shapes for regular grids. -3. **Shard index correctness.** The index assumes `subchunk_dim`-sized entries. - -For `VaryingDimension`, `chunk_size == data_size` — no padding. This is the fundamental difference: `FixedDimension` has a declared size plus an extent that clips data; `VaryingDimension` has explicit sizes that *are* the extent. - -### Why not a chunk grid registry? - -There is no known chunk grid outside the rectilinear family that retains the tessellation properties zarr-python assumes. A `match` on the grid name is sufficient. - -### Why a single class instead of a Protocol? - -All known grids are special cases of rectilinear. A Protocol-based approach means every caller programs against an abstract interface and adding a grid type requires implementing ~10 methods. A single class is simpler. If a genuinely novel grid type emerges, a Protocol can be extracted. - -## Prior art - -**zarrs (Rust):** Three independent grid types behind a `ChunkGridTraits` trait. 
Key patterns adopted: Fixed vs Varying per dimension, prefix sums + binary search, `Option` for out-of-bounds, `NonZeroU64` for chunk dimensions, separate subchunk grid per shard, array shape at construction. - -**TensorStore (C++):** Stores only `chunk_shape` — boundary clipping via `valid_data_bounds` at query time. Both `RegularGridRef` and `IrregularGrid` internally. No registry. - -## Migration - -### Plan - -1. **Amend and merge #3735.** Keep the `chunk_grids/` module layout. Replace the registry with direct name dispatch. Remove `register_chunk_grid` / `get_chunk_grid_class` and the entrypoint. -2. **Open a new PR** implementing this prospectus: - - `FixedDimension`, `VaryingDimension`, `ChunkSpec`, and `ChunkGrid` classes. - - `parse_chunk_grid(metadata, array_shape)` with `"regular"` and `"rectilinear"` dispatch. - - Port RLE helpers, `resolve_chunk_spec`, `ChunksLike`, and validation functions from #3534. - - Refactor per-dimension indexers to accept `FixedDimension | VaryingDimension`. - - Update `get_chunk_spec` to use `grid[chunk_coords].codec_shape`. - - Add `arr.chunk_grid`. Keep `.chunks` for regular, raise for rectilinear. - - Remove the "sharding incompatible with rectilinear" guard. - - Adapt tests from #3534. -3. **Close trial PRs** with credits: - - **#3534** — RLE helpers, validation logic, chunk spec resolution, test cases, review discussion. - - **#3737** — extent-in-grid idea (adopted per-dimension). - - **#1483** — original POC; superseded by V3 implementation. - - **#3736** — resolved by storing extent per-dimension. -4. **Sharding v1.1** (separate PR, after zarr-specs#370) — remove `shard_shape % subchunk_shape == 0` validation. 
- -### Reusable components from #3534 - -| Component | Disposition | -|---|---| -| RLE encode/decode helpers | **Keep** | -| `_normalize_rectilinear_chunks` / `_parse_chunk_shapes` | **Keep** — feed into `VaryingDimension` | -| `resolve_chunk_spec` / `ChunksLike` | **Keep** | -| `_validate_zarr_format_compatibility` | **Keep** — rectilinear is V3-only | -| `_validate_sharding_compatibility` | **Remove** — sharding is compatible | -| `RectilinearChunkGrid` class | **Replace** | -| Indexing changes | **Insufficient** — `isinstance` guards remain | - -A **fresh PR** is more practical than adapting #3534's 5700-line diff. - -### Downstream migration - -All four downstream PRs/issues follow the same pattern: - -| Two-class pattern | Unified pattern | -|---|---| -| `isinstance(cg, RegularChunkGrid)` | `cg.is_regular` | -| `isinstance(cg, RectilinearChunkGrid)` | `not cg.is_regular` | -| `cg.chunk_shape` | `cg.dimensions[i].size` or `cg[coord].shape` | -| `cg.chunk_shapes` | `tuple(d.edges for d in cg.dimensions)` | -| `RegularChunkGrid(chunk_shape=...)` | `ChunkGrid.from_regular(shape, chunks)` | -| `RectilinearChunkGrid(chunk_shapes=...)` | `ChunkGrid.from_rectilinear(edges)` | -| Feature detection via class import | Version check or `hasattr(ChunkGrid, 'is_regular')` | - -**[xarray#10880](https://github.com/pydata/xarray/pull/10880):** Replace `isinstance` checks with `.is_regular`. Write path simplifies with `chunks=[[...]]` API. ~1–2 days. - -**[VirtualiZarr#877](https://github.com/zarr-developers/VirtualiZarr/pull/877):** Drop vendored `_is_nested_sequence`. Replace `isinstance` checks. ~1–2 days. - -**[Icechunk#1338](https://github.com/earth-mover/icechunk/issues/1338):** Minimal impact — format changes driven by spec, not class hierarchy. - -**[cubed#876](https://github.com/cubed-dev/cubed/issues/876):** Switch store creation to `ChunkGrid` API. <1 day. - -## Open questions - -1. 
**RLE in the Python API:** Should users pass RLE-encoded chunk specs directly, or only expanded lists? -2. **Resize defaults:** When growing a regular array, should the default preserve regularity or transition to rectilinear? -3. **`ChunkSpec` complexity:** `ChunkSpec` carries both `slices` and `codec_shape`. Should the grid expose separate methods for codec vs data queries instead? -4. **`__getitem__` with slices:** Should `grid[0, :]` or `grid[0:3, :]` return a sub-grid or an iterator of `ChunkSpec`s? From f1a1bc3f419aab6104bf290a8e3bc282807084c7 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 20:35:55 -0400 Subject: [PATCH 007/118] Fix sharding --- src/zarr/core/metadata/v3.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 775e08e334..ac1578f10e 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -301,19 +301,14 @@ def chunks(self) -> tuple[int, ...]: @property def shards(self) -> tuple[int, ...] | None: - if self.chunk_grid.is_regular: - from zarr.codecs.sharding import ShardingCodec + if not self.chunk_grid.is_regular: + return None - if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): - return self.chunk_grid.chunk_shape - else: - return None + from zarr.codecs.sharding import ShardingCodec - msg = ( - "The `shards` attribute is only defined for arrays using regular chunk grids. " - "This array has a rectilinear chunk grid. Use `chunk_grid` for general access." 
- ) - raise NotImplementedError(msg) + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + return self.chunk_grid.chunk_shape + return None @property def inner_codecs(self) -> tuple[Codec, ...]: From 30fa86740d9785d6badf24ad8035de249459a45a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 22:14:38 -0400 Subject: [PATCH 008/118] Fix bugs --- src/zarr/codecs/sharding.py | 37 ++++++--- src/zarr/core/chunk_grids.py | 8 +- src/zarr/core/indexing.py | 8 +- tests/test_unified_chunk_grid.py | 128 +++++++++++++++++++++++++++++++ 4 files changed, 166 insertions(+), 15 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 1785199ec4..9e741d0710 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -390,18 +390,31 @@ def validate( # Sharding works with both regular and rectilinear outer chunk grids. # Each shard is self-contained — the ShardingCodec constructs an independent # inner ChunkGrid per shard using the shard shape and subchunk shape. - if chunk_grid.is_regular and not all( - s % c == 0 - for s, c in zip( - chunk_grid.chunk_shape, - self.chunk_shape, - strict=False, - ) - ): - raise ValueError( - f"The array's `chunk_shape` (got {chunk_grid.chunk_shape}) " - f"needs to be divisible by the shard's inner `chunk_shape` (got {self.chunk_shape})." - ) + if chunk_grid.is_regular: + if not all( + s % c == 0 + for s, c in zip( + chunk_grid.chunk_shape, + self.chunk_shape, + strict=False, + ) + ): + raise ValueError( + f"The array's `chunk_shape` (got {chunk_grid.chunk_shape}) " + f"needs to be divisible by the shard's inner `chunk_shape` (got {self.chunk_shape})." + ) + else: + # For rectilinear grids, every chunk's dimensions must be divisible + # by the inner chunk_shape. 
+ for coord in chunk_grid.all_chunk_coords(): + spec = chunk_grid[coord] + if spec is not None and not all( + s % c == 0 for s, c in zip(spec.codec_shape, self.chunk_shape, strict=False) + ): + raise ValueError( + f"Chunk at {coord} has shape {spec.codec_shape} which is not " + f"divisible by the shard's inner `chunk_shape` (got {self.chunk_shape})." + ) async def _decode_single( self, diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 2070f118f6..5dd024ae19 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -412,7 +412,13 @@ def parse_chunk_grid( if isinstance(dim, FixedDimension): dims.append(FixedDimension(size=dim.size, extent=extent)) else: - dims.append(dim) # VaryingDimension has intrinsic extent + # VaryingDimension has intrinsic extent — validate it matches + if dim.extent != extent: + raise ValueError( + f"VaryingDimension extent {dim.extent} does not match " + f"array shape extent {extent} for dimension {len(dims)}" + ) + dims.append(dim) return ChunkGrid(dimensions=tuple(dims)) name_parsed, configuration_parsed = parse_named_configuration(data) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 8ec5620228..125fcce97a 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1051,10 +1051,14 @@ def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: Chu else: drop_axes = () - # Compute chunk_shape for ix_() compatibility in __iter__ + # Compute chunk_shape for ix_() compatibility in __iter__. + # For VaryingDimension, use the max edge length so that + # slice_to_range produces correct ranges for the largest chunk. 
from zarr.core.chunk_grids import FixedDimension - chunk_shape = tuple(g.size if isinstance(g, FixedDimension) else 1 for g in dim_grids) + chunk_shape = tuple( + g.size if isinstance(g, FixedDimension) else max(g.edges) for g in dim_grids + ) object.__setattr__(self, "dim_indexers", dim_indexers) object.__setattr__(self, "shape", shape) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index b7460881f1..6557c6a3b6 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -816,3 +816,131 @@ def test_varying_dim_data_size_equals_chunk_size(self) -> None: d = VaryingDimension([10, 20, 5]) for i in range(3): assert d.data_size(i) == d.chunk_size(i) + + +# --------------------------------------------------------------------------- +# Bug: OrthogonalIndexer chunk_shape=1 for VaryingDimension +# --------------------------------------------------------------------------- + + +class TestOrthogonalIndexerRectilinear: + """OrthogonalIndexer must use correct per-chunk sizes for VaryingDimension, + not a hardcoded 1. The chunk_shape field is used by ix_() to convert slices + to ranges for advanced indexing.""" + + def test_orthogonal_int_array_selection_rectilinear(self) -> None: + """Integer array selection with rectilinear grid must produce correct + chunk-local selections.""" + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + indexer = OrthogonalIndexer( + selection=(np.array([5, 15, 35]), slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + # Grid: dim0 chunks [0..10), [10..30), [30..60); dim1 chunks [0..50), [50..100) + # Indices 5, 15, 35 land in chunks 0, 1, 2 respectively. + # Combined with slice(None) over 2 dim1 chunks, we get 6 projections. 
+ chunk_coords = [p.chunk_coords for p in projections] + assert chunk_coords == [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)] + + def test_orthogonal_bool_array_selection_rectilinear(self) -> None: + """Boolean array selection with rectilinear grid.""" + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + mask = np.zeros(60, dtype=bool) + mask[5] = True + mask[15] = True + mask[35] = True + indexer = OrthogonalIndexer( + selection=(mask, slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) > 0 + + def test_orthogonal_advanced_indexing_chunk_shape_not_one(self) -> None: + """Verify OrthogonalIndexer.chunk_shape reflects actual chunk sizes, + not a hardcoded 1 for VaryingDimension.""" + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + indexer = OrthogonalIndexer( + selection=(np.array([5, 15]), slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + # chunk_shape should NOT have 1 for the VaryingDimension + # The first dim has varying chunks [10, 20, 30] — we need a + # representative size for ix_() to work. Using the max is safe. 
+ assert indexer.chunk_shape[0] > 1 # was incorrectly 1 before fix + assert indexer.chunk_shape[1] == 50 + + +# --------------------------------------------------------------------------- +# Bug: Sharding validation skipped for rectilinear grids +# --------------------------------------------------------------------------- + + +class TestShardingValidationRectilinear: + """ShardingCodec.validate must check divisibility for rectilinear grids too.""" + + def test_sharding_rejects_non_divisible_rectilinear(self) -> None: + """Rectilinear shard sizes not divisible by inner chunk_shape should raise.""" + from zarr.codecs.sharding import ShardingCodec + from zarr.core.dtype import Float32 + + codec = ShardingCodec(chunk_shape=(5, 5)) + # 17 is not divisible by 5 + g = ChunkGrid.from_rectilinear([[10, 20, 17], [50, 50]]) + + with pytest.raises(ValueError, match="divisible"): + codec.validate( + shape=(47, 100), + dtype=Float32(), + chunk_grid=g, + ) + + def test_sharding_accepts_divisible_rectilinear(self) -> None: + """Rectilinear shard sizes all divisible by inner chunk_shape should pass.""" + from zarr.codecs.sharding import ShardingCodec + from zarr.core.dtype import Float32 + + codec = ShardingCodec(chunk_shape=(5, 5)) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + + # Should not raise + codec.validate( + shape=(60, 100), + dtype=Float32(), + chunk_grid=g, + ) + + +# --------------------------------------------------------------------------- +# Bug: parse_chunk_grid doesn't validate VaryingDimension extent vs array_shape +# --------------------------------------------------------------------------- + + +class TestParseChunkGridValidation: + """parse_chunk_grid should raise when VaryingDimension extent != array_shape.""" + + def test_varying_extent_mismatch_raises(self) -> None: + from zarr.core.chunk_grids import parse_chunk_grid + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + # VaryingDimension extent is 60, but array_shape says 100 + 
with pytest.raises(ValueError, match="extent"): + parse_chunk_grid(g, (100, 100)) + + def test_varying_extent_match_ok(self) -> None: + from zarr.core.chunk_grids import parse_chunk_grid + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + # Matching extents should work fine + g2 = parse_chunk_grid(g, (60, 100)) + assert g2.dimensions[0].extent == 60 From 1282c7bf36be838254514031040b63e798f49325 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 22:50:43 -0400 Subject: [PATCH 009/118] Support sequence in array functions --- src/zarr/api/synchronous.py | 4 ++-- src/zarr/core/array.py | 27 +++++++++++++++++++++++---- src/zarr/core/chunk_grids.py | 19 +++++++++++++++++++ src/zarr/core/group.py | 9 +++++---- 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 1204eba3c9..f5e6efbc22 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -13,7 +13,7 @@ from zarr.errors import ZarrDeprecationWarning if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence import numpy as np import numpy.typing as npt @@ -822,7 +822,7 @@ def create_array( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Literal["auto"] = "auto", + chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 96706c3a04..ecd2c1dd1f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4523,7 +4523,7 @@ async def init_array( store_path: StorePath, shape: ShapeLike, dtype: ZDTypeLike, - chunks: tuple[int, ...] | Literal["auto"] = "auto", + chunks: tuple[int, ...] 
| Sequence[Sequence[int]] | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -4639,6 +4639,24 @@ async def init_array( else: await ensure_no_existing_node(store_path, zarr_format=zarr_format) + # Detect rectilinear (nested list) chunks, e.g. [[10, 20, 30], [25, 25]] + from zarr.core.chunk_grids import _is_rectilinear_chunks + + rectilinear_grid: ChunkGrid | None = None + if _is_rectilinear_chunks(chunks): + if zarr_format == 2: + raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") + if shards is not None: + raise ValueError("Rectilinear chunk grids do not support sharding.") + rect_chunks = cast("Sequence[Sequence[int]]", chunks) + rectilinear_grid = ChunkGrid.from_rectilinear(rect_chunks) + # Use first chunk size per dim as placeholder for _auto_partition + chunks_flat: tuple[int, ...] | Literal["auto"] = tuple( + dim_edges[0] for dim_edges in rect_chunks + ) + else: + chunks_flat = cast("tuple[int, ...] | Literal['auto']", chunks) + item_size = 1 if isinstance(zdtype, HasItemSize): item_size = zdtype.item_size @@ -4646,7 +4664,7 @@ async def init_array( shard_shape_parsed, chunk_shape_parsed = _auto_partition( array_shape=shape_parsed, shard_shape=shards, - chunk_shape=chunks, + chunk_shape=chunks_flat, item_size=item_size, ) chunks_out: tuple[int, ...] @@ -4725,6 +4743,7 @@ async def init_array( codecs=codecs_out, dimension_names=dimension_names, attributes=attributes, + chunk_grid=rectilinear_grid, ) arr = AsyncArray(metadata=meta, store_path=store_path, config=config) @@ -4739,7 +4758,7 @@ async def create_array( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Literal["auto"] = "auto", + chunks: tuple[int, ...] 
| Sequence[Sequence[int]] | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -4881,7 +4900,7 @@ async def create_array( data=data_parsed, write_data=write_data, name=name, - chunks=chunks, + chunks=cast("Literal['auto', 'keep'] | tuple[int, ...]", chunks), shards=shards, filters=filters, compressors=compressors, diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 5dd024ae19..31ddab4716 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -199,6 +199,25 @@ def _compress_rle(sizes: Sequence[int]) -> list[list[int]]: ChunksLike = tuple[int, ...] | list[list[int] | int] | int +def _is_rectilinear_chunks(chunks: Any) -> bool: + """Check if chunks is a nested sequence (e.g. [[10, 20], [5, 5]]). + + Returns True for inputs like [[10, 20], [5, 5]] or [(10, 20), (5, 5)]. + Returns False for flat sequences like (10, 10) or [10, 10]. + """ + if isinstance(chunks, (str, int, ChunkGrid)): + return False + if not hasattr(chunks, "__iter__"): + return False + try: + first_elem = next(iter(chunks), None) + if first_elem is None: + return False + return hasattr(first_elem, "__iter__") and not isinstance(first_elem, (str, bytes, int)) + except (TypeError, StopIteration): + return False + + @dataclass(frozen=True) class ChunkGrid(Metadata): """ diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 9b5fee275b..302b82cdf2 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -71,6 +71,7 @@ Iterable, Iterator, Mapping, + Sequence, ) from typing import Any @@ -1022,7 +1023,7 @@ async def create_array( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Literal["auto"] = "auto", + chunks: tuple[int, ...] 
| Sequence[Sequence[int]] | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -2449,7 +2450,7 @@ def create( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Literal["auto"] = "auto", + chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -2593,7 +2594,7 @@ def create_array( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Literal["auto"] = "auto", + chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -2991,7 +2992,7 @@ def array( *, shape: ShapeLike, dtype: npt.DTypeLike, - chunks: tuple[int, ...] | Literal["auto"] = "auto", + chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", shards: tuple[int, ...] 
| Literal["auto"] | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", From ea89f33a8e70a35efb20e0d198bb58b1e7047f0f Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 22:50:56 -0400 Subject: [PATCH 010/118] Add end-to-end tests --- tests/test_unified_chunk_grid.py | 348 ++++++++++++++++++++++++++++++- 1 file changed, 347 insertions(+), 1 deletion(-) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 6557c6a3b6..1795c8ece2 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -9,11 +9,13 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import numpy as np import pytest +import zarr + if TYPE_CHECKING: from pathlib import Path @@ -944,3 +946,347 @@ def test_varying_extent_match_ok(self) -> None: # Matching extents should work fine g2 = parse_chunk_grid(g, (60, 100)) assert g2.dimensions[0].extent == 60 + + +# --------------------------------------------------------------------------- +# Full-pipeline read/write tests with rectilinear grids +# --------------------------------------------------------------------------- + + +class TestFullPipelineRectilinear: + """End-to-end read/write tests through the full Array pipeline.""" + + @staticmethod + def _make_1d(tmp_path: Path) -> tuple[zarr.Array[Any], np.ndarray[Any, Any]]: + a = np.arange(30, dtype="int32") + z = zarr.create_array( + store=tmp_path / "arr1d.zarr", + shape=(30,), + chunks=[[5, 10, 15]], + dtype="int32", + ) + z[:] = a + return z, a + + @staticmethod + def _make_2d(tmp_path: Path) -> tuple[zarr.Array[Any], np.ndarray[Any, Any]]: + a = np.arange(6000, dtype="int32").reshape(60, 100) + z = zarr.create_array( + store=tmp_path / "arr2d.zarr", + shape=(60, 100), + chunks=[[10, 20, 30], [25, 25, 25, 25]], + dtype="int32", + ) + z[:] = a + return z, a + + # --- Basic selection --- + 
+ def test_basic_selection_1d(self, tmp_path: Path) -> None: + z, a = self._make_1d(tmp_path) + sels: list[Any] = [0, 4, 5, 14, 15, 29, -1, slice(None), slice(3, 18), slice(0, 0)] + for sel in sels: + np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") + + def test_basic_selection_1d_strided(self, tmp_path: Path) -> None: + z, a = self._make_1d(tmp_path) + for sel in [slice(None, None, 2), slice(1, 25, 3), slice(0, 30, 7)]: + np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") + + def test_basic_selection_2d(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + selections: list[Any] = [ + 42, + -1, + (9, 24), + (10, 25), + (30, 50), + (59, 99), + slice(None), + (slice(5, 35), slice(20, 80)), + (slice(0, 10), slice(0, 25)), # within one chunk + (slice(10, 10), slice(None)), # empty + (slice(None, None, 3), slice(None, None, 7)), # strided + ] + for sel in selections: + np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") + + # --- Orthogonal selection --- + + def test_orthogonal_selection_1d_bool(self, tmp_path: Path) -> None: + z, a = self._make_1d(tmp_path) + ix = np.zeros(30, dtype=bool) + ix[[0, 4, 5, 14, 15, 29]] = True + np.testing.assert_array_equal(z.oindex[ix], a[ix]) + + def test_orthogonal_selection_1d_int(self, tmp_path: Path) -> None: + z, a = self._make_1d(tmp_path) + ix = np.array([0, 4, 5, 14, 15, 29]) + np.testing.assert_array_equal(z.oindex[ix], a[ix]) + ix_neg = np.array([0, -1, -15, -25]) + np.testing.assert_array_equal(z.oindex[ix_neg], a[ix_neg]) + + def test_orthogonal_selection_2d_bool(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + ix0 = np.zeros(60, dtype=bool) + ix0[[0, 9, 10, 29, 30, 59]] = True + ix1 = np.zeros(100, dtype=bool) + ix1[[0, 24, 25, 49, 50, 99]] = True + np.testing.assert_array_equal(z.oindex[ix0, ix1], a[np.ix_(ix0, ix1)]) + + def test_orthogonal_selection_2d_int(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + ix0 = np.array([0, 
9, 10, 29, 30, 59]) + ix1 = np.array([0, 24, 25, 49, 50, 99]) + np.testing.assert_array_equal(z.oindex[ix0, ix1], a[np.ix_(ix0, ix1)]) + + def test_orthogonal_selection_2d_mixed(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + ix = np.array([0, 9, 10, 29, 30, 59]) + np.testing.assert_array_equal(z.oindex[ix, slice(25, 75)], a[np.ix_(ix, np.arange(25, 75))]) + np.testing.assert_array_equal( + z.oindex[slice(10, 30), ix[:4]], a[np.ix_(np.arange(10, 30), ix[:4])] + ) + + # --- Coordinate (vindex) selection --- + + def test_coordinate_selection_1d(self, tmp_path: Path) -> None: + z, a = self._make_1d(tmp_path) + ix = np.array([0, 4, 5, 14, 15, 29]) + np.testing.assert_array_equal(z.vindex[ix], a[ix]) + + def test_coordinate_selection_2d(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + r = np.array([0, 9, 10, 29, 30, 59]) + c = np.array([0, 24, 25, 49, 50, 99]) + np.testing.assert_array_equal(z.vindex[r, c], a[r, c]) + + def test_coordinate_selection_2d_bool_mask(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + mask = a > 3000 + np.testing.assert_array_equal(z.vindex[mask], a[mask]) + + # --- Set selection --- + + def test_set_basic_selection(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + new_data = np.full((20, 50), -1, dtype="int32") + z[5:25, 10:60] = new_data + a[5:25, 10:60] = new_data + np.testing.assert_array_equal(z[:], a) + + def test_set_orthogonal_selection(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + rows = np.array([0, 10, 30]) + cols = np.array([0, 25, 50, 75]) + val = np.full((3, 4), -99, dtype="int32") + z.oindex[rows, cols] = val + a[np.ix_(rows, cols)] = val + np.testing.assert_array_equal(z[:], a) + + # --- Higher dimensions --- + + def test_3d_array(self, tmp_path: Path) -> None: + shape = (12, 20, 15) + chunk_shapes = [[4, 8], [5, 5, 10], [5, 10]] + a = np.arange(int(np.prod(shape)), dtype="int32").reshape(shape) + z = zarr.create_array( + store=tmp_path / 
"arr3d.zarr", + shape=shape, + chunks=chunk_shapes, + dtype="int32", + ) + z[:] = a + np.testing.assert_array_equal(z[:], a) + np.testing.assert_array_equal(z[2:10, 3:18, 4:14], a[2:10, 3:18, 4:14]) + + def test_1d_single_chunk(self, tmp_path: Path) -> None: + a = np.arange(20, dtype="int32") + z = zarr.create_array( + store=tmp_path / "arr1c.zarr", + shape=(20,), + chunks=[[20]], + dtype="int32", + ) + z[:] = a + np.testing.assert_array_equal(z[:], a) + + # --- Persistence roundtrip --- + + def test_persistence_roundtrip(self, tmp_path: Path) -> None: + _, a = self._make_2d(tmp_path) + z2 = zarr.open_array(store=tmp_path / "arr2d.zarr", mode="r") + assert not z2.metadata.chunk_grid.is_regular + np.testing.assert_array_equal(z2[:], a) + + # --- Highly irregular chunks --- + + def test_highly_irregular_chunks(self, tmp_path: Path) -> None: + shape = (100, 100) + chunk_shapes = [[5, 10, 15, 20, 50], [100]] + a = np.arange(10000, dtype="int32").reshape(shape) + z = zarr.create_array( + store=tmp_path / "irreg.zarr", + shape=shape, + chunks=chunk_shapes, + dtype="int32", + ) + z[:] = a + np.testing.assert_array_equal(z[:], a) + np.testing.assert_array_equal(z[3:97, 10:90], a[3:97, 10:90]) + + # --- API validation --- + + def test_v2_rejects_rectilinear(self, tmp_path: Path) -> None: + with pytest.raises(ValueError, match="Zarr format 2"): + zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(30,), + chunks=[[10, 20]], + dtype="int32", + zarr_format=2, + ) + + def test_sharding_rejects_rectilinear(self, tmp_path: Path) -> None: + with pytest.raises(ValueError, match="sharding"): + zarr.create_array( + store=tmp_path / "shard.zarr", + shape=(60, 100), + chunks=[[10, 20, 30], [25, 25, 25, 25]], + shards=(30, 50), + dtype="int32", + ) + + def test_nchunks(self, tmp_path: Path) -> None: + z, _ = self._make_2d(tmp_path) + assert z.metadata.chunk_grid.get_nchunks() == 12 + + +# --------------------------------------------------------------------------- +# Hypothesis 
property-based tests +# --------------------------------------------------------------------------- + +pytest.importorskip("hypothesis") + +import hypothesis.strategies as st # noqa: E402 +from hypothesis import event, given, settings # noqa: E402 + + +@st.composite +def rectilinear_chunks_st(draw: st.DrawFn, *, shape: tuple[int, ...]) -> list[list[int]]: + """Generate valid rectilinear chunk shapes for a given array shape.""" + chunk_shapes: list[list[int]] = [] + for size in shape: + assert size > 0 + max_chunks = min(size, 10) + nchunks = draw(st.integers(min_value=1, max_value=max_chunks)) + if nchunks == 1: + chunk_shapes.append([size]) + else: + dividers = sorted( + draw( + st.lists( + st.integers(min_value=1, max_value=size - 1), + min_size=nchunks - 1, + max_size=nchunks - 1, + unique=True, + ) + ) + ) + chunk_shapes.append( + [a - b for a, b in zip(dividers + [size], [0] + dividers, strict=False)] + ) + return chunk_shapes + + +@st.composite +def rectilinear_arrays_st(draw: st.DrawFn) -> tuple[zarr.Array[Any], np.ndarray[Any, Any]]: + """Generate a rectilinear zarr array with random data, shape, and chunks.""" + from zarr.storage import MemoryStore + + ndim = draw(st.integers(min_value=1, max_value=3)) + shape = draw(st.tuples(*[st.integers(min_value=2, max_value=20) for _ in range(ndim)])) + chunk_shapes = draw(rectilinear_chunks_st(shape=shape)) + event(f"ndim={ndim}, shape={shape}") + + a = np.arange(int(np.prod(shape)), dtype="int32").reshape(shape) + store = MemoryStore() + z = zarr.create_array(store=store, shape=shape, chunks=chunk_shapes, dtype="int32") + z[:] = a + return z, a + + +@settings(deadline=None, max_examples=50) +@given(data=st.data()) +def test_property_basic_indexing_rectilinear(data: st.DataObject) -> None: + """Property test: basic indexing on rectilinear arrays matches numpy.""" + z, a = data.draw(rectilinear_arrays_st()) + np.testing.assert_array_equal(z[:], a) + + slicers = [] + for size in a.shape: + start = 
data.draw(st.integers(min_value=0, max_value=size - 1)) + stop = data.draw(st.integers(min_value=start, max_value=size)) + slicers.append(slice(start, stop)) + sel = tuple(slicers) + np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") + + +@settings(deadline=None, max_examples=50) +@given(data=st.data()) +def test_property_oindex_rectilinear(data: st.DataObject) -> None: + """Property test: orthogonal int-array indexing matches numpy.""" + z, a = data.draw(rectilinear_arrays_st()) + + indexers_z = [] + indexers_np = [] + for size in a.shape: + n = data.draw(st.integers(min_value=1, max_value=min(size, 5))) + ix = np.array( + sorted( + data.draw( + st.lists( + st.integers(min_value=0, max_value=size - 1), + min_size=n, + max_size=n, + unique=True, + ) + ) + ) + ) + indexers_z.append(ix) + indexers_np.append(ix) + + result = z.oindex[tuple(indexers_z)] + expected = a[np.ix_(*indexers_np)] + np.testing.assert_array_equal(result, expected) + + +@settings(deadline=None, max_examples=50) +@given(data=st.data()) +def test_property_vindex_rectilinear(data: st.DataObject) -> None: + """Property test: vindex on rectilinear arrays matches numpy.""" + z, a = data.draw(rectilinear_arrays_st()) + + n = data.draw(st.integers(min_value=1, max_value=min(min(a.shape), 5))) + indexers = tuple( + np.array( + data.draw( + st.lists( + st.integers(min_value=0, max_value=size - 1), + min_size=n, + max_size=n, + ) + ) + ) + for size in a.shape + ) + np.testing.assert_array_equal(z.vindex[indexers], a[indexers]) + + +@settings(deadline=None, max_examples=50) +@given(data=st.data()) +def test_property_roundtrip_rectilinear(data: st.DataObject) -> None: + """Property test: write then read matches original data.""" + z, a = data.draw(rectilinear_arrays_st()) + np.testing.assert_array_equal(z[:], a) From 2eba460c333e4e2c9ed0d521afe3fc53de757979 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:00:56 -0400 Subject: 
[PATCH 011/118] Collapse indexing paths --- src/zarr/core/indexing.py | 231 +++++++++++--------------------------- 1 file changed, 68 insertions(+), 163 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 125fcce97a..4a827d0b50 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -384,42 +384,30 @@ class ChunkDimProjection(NamedTuple): class IntDimIndexer: dim_sel: int dim_len: int - dim_chunk_len: int # kept for backwards compat; unused if dim_grid is set - dim_grid: DimensionGrid | None + dim_grid: DimensionGrid nitems: int = 1 - def __init__( - self, dim_sel: int, dim_len: int, dim_chunk_len: int, dim_grid: DimensionGrid | None = None - ) -> None: + def __init__(self, dim_sel: int, dim_len: int, dim_grid: DimensionGrid) -> None: object.__setattr__(self, "dim_sel", normalize_integer_selection(dim_sel, dim_len)) object.__setattr__(self, "dim_len", dim_len) - object.__setattr__(self, "dim_chunk_len", dim_chunk_len) object.__setattr__(self, "dim_grid", dim_grid) def __iter__(self) -> Iterator[ChunkDimProjection]: g = self.dim_grid - if g is not None: - dim_chunk_ix = g.index_to_chunk(self.dim_sel) - dim_offset = g.chunk_offset(dim_chunk_ix) - dim_chunk_sel = self.dim_sel - dim_offset - dim_out_sel = None - is_complete_chunk = g.data_size(dim_chunk_ix) == 1 - else: - dim_chunk_ix = self.dim_sel // self.dim_chunk_len - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel - dim_offset - dim_out_sel = None - is_complete_chunk = self.dim_chunk_len == 1 + dim_chunk_ix = g.index_to_chunk(self.dim_sel) + dim_offset = g.chunk_offset(dim_chunk_ix) + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + is_complete_chunk = g.data_size(dim_chunk_ix) == 1 yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk) @dataclass(frozen=True) class SliceDimIndexer: dim_len: int - dim_chunk_len: int nitems: int nchunks: int - dim_grid: DimensionGrid | None + dim_grid: 
DimensionGrid start: int stop: int @@ -429,8 +417,7 @@ def __init__( self, dim_sel: slice, dim_len: int, - dim_chunk_len: int, - dim_grid: DimensionGrid | None = None, + dim_grid: DimensionGrid, ) -> None: # normalize start, stop, step = dim_sel.indices(dim_len) @@ -442,93 +429,46 @@ def __init__( object.__setattr__(self, "step", step) object.__setattr__(self, "dim_len", dim_len) - object.__setattr__(self, "dim_chunk_len", dim_chunk_len) object.__setattr__(self, "dim_grid", dim_grid) object.__setattr__(self, "nitems", max(0, ceildiv((stop - start), step))) - - if dim_grid is not None: - object.__setattr__(self, "nchunks", dim_grid.nchunks) - else: - object.__setattr__(self, "nchunks", ceildiv(dim_len, dim_chunk_len)) + object.__setattr__(self, "nchunks", dim_grid.nchunks) def __iter__(self) -> Iterator[ChunkDimProjection]: g = self.dim_grid - if g is not None: - # Use the dimension grid for chunk boundary lookups - dim_chunk_ix_from = g.index_to_chunk(self.start) if self.start > 0 else 0 - dim_chunk_ix_to = g.index_to_chunk(self.stop - 1) + 1 if self.stop > 0 else 0 - - for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): - dim_offset = g.chunk_offset(dim_chunk_ix) - dim_chunk_len = g.data_size(dim_chunk_ix) - dim_limit = dim_offset + dim_chunk_len - - if self.start < dim_offset: - dim_chunk_sel_start = 0 - remainder = (dim_offset - self.start) % self.step - if remainder: - dim_chunk_sel_start += self.step - remainder - dim_out_offset = ceildiv((dim_offset - self.start), self.step) - else: - dim_chunk_sel_start = self.start - dim_offset - dim_out_offset = 0 - - if self.stop > dim_limit: - dim_chunk_sel_stop = dim_chunk_len - else: - dim_chunk_sel_stop = self.stop - dim_offset - - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) - dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) - - if dim_chunk_nitems == 0: - continue - - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - 
is_complete_chunk = ( - dim_chunk_sel_start == 0 and (self.stop >= dim_limit) and self.step in [1, None] - ) - yield ChunkDimProjection( - dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk - ) - else: - # Legacy path: scalar dim_chunk_len - dim_chunk_ix_from = 0 if self.start == 0 else self.start // self.dim_chunk_len - dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) - - for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) - dim_chunk_len = dim_limit - dim_offset - - if self.start < dim_offset: - dim_chunk_sel_start = 0 - remainder = (dim_offset - self.start) % self.step - if remainder: - dim_chunk_sel_start += self.step - remainder - dim_out_offset = ceildiv((dim_offset - self.start), self.step) - else: - dim_chunk_sel_start = self.start - dim_offset - dim_out_offset = 0 - - if self.stop > dim_limit: - dim_chunk_sel_stop = dim_chunk_len - else: - dim_chunk_sel_stop = self.stop - dim_offset - - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) - dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) - - if dim_chunk_nitems == 0: - continue - - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - is_complete_chunk = ( - dim_chunk_sel_start == 0 and (self.stop >= dim_limit) and self.step in [1, None] - ) - yield ChunkDimProjection( - dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk - ) + dim_chunk_ix_from = g.index_to_chunk(self.start) if self.start > 0 else 0 + dim_chunk_ix_to = g.index_to_chunk(self.stop - 1) + 1 if self.stop > 0 else 0 + + for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): + dim_offset = g.chunk_offset(dim_chunk_ix) + dim_chunk_len = g.data_size(dim_chunk_ix) + dim_limit = dim_offset + dim_chunk_len + + if self.start < dim_offset: + dim_chunk_sel_start = 0 + remainder = (dim_offset - self.start) % self.step 
+ if remainder: + dim_chunk_sel_start += self.step - remainder + dim_out_offset = ceildiv((dim_offset - self.start), self.step) + else: + dim_chunk_sel_start = self.start - dim_offset + dim_out_offset = 0 + + if self.stop > dim_limit: + dim_chunk_sel_stop = dim_chunk_len + else: + dim_chunk_sel_stop = self.stop - dim_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) + dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + + if dim_chunk_nitems == 0: + continue + + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + is_complete_chunk = ( + dim_chunk_sel_start == 0 and (self.stop >= dim_limit) and self.step in [1, None] + ) + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk) def check_selection_length(selection: SelectionNormalized, shape: tuple[int, ...]) -> None: @@ -652,16 +592,12 @@ def __init__( # setup per-dimension indexers dim_indexers: list[IntDimIndexer | SliceDimIndexer] = [] for dim_sel, dim_len, dim_grid in zip(selection_normalized, shape, dim_grids, strict=True): - from zarr.core.chunk_grids import FixedDimension - - dim_chunk_len = dim_grid.size if isinstance(dim_grid, FixedDimension) else 1 - dim_indexer: IntDimIndexer | SliceDimIndexer if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_grid) elif is_slice(dim_sel): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_grid) else: raise IndexError( @@ -694,8 +630,7 @@ def __iter__(self) -> Iterator[ChunkProjection]: class BoolArrayDimIndexer: dim_sel: npt.NDArray[np.bool_] dim_len: int - dim_chunk_len: int - dim_grid: DimensionGrid | None + dim_grid: DimensionGrid nchunks: int chunk_nitems: npt.NDArray[Any] @@ -707,8 +642,7 @@ def __init__( self, dim_sel: npt.NDArray[np.bool_], dim_len: int, - 
dim_chunk_len: int, - dim_grid: DimensionGrid | None = None, + dim_grid: DimensionGrid, ) -> None: # check number of dimensions if not is_bool_array(dim_sel, 1): @@ -721,21 +655,13 @@ def __init__( ) g = dim_grid - - if g is not None: - nchunks = g.nchunks - else: - nchunks = ceildiv(dim_len, dim_chunk_len) + nchunks = g.nchunks # precompute number of selected items for each chunk chunk_nitems = np.zeros(nchunks, dtype="i8") for dim_chunk_ix in range(nchunks): - if g is not None: - dim_offset = g.chunk_offset(dim_chunk_ix) - chunk_len = g.data_size(dim_chunk_ix) - else: - dim_offset = dim_chunk_ix * dim_chunk_len - chunk_len = dim_chunk_len + dim_offset = g.chunk_offset(dim_chunk_ix) + chunk_len = g.data_size(dim_chunk_ix) chunk_nitems[dim_chunk_ix] = np.count_nonzero( dim_sel[dim_offset : dim_offset + chunk_len] ) @@ -746,7 +672,6 @@ def __init__( # store attributes object.__setattr__(self, "dim_sel", dim_sel) object.__setattr__(self, "dim_len", dim_len) - object.__setattr__(self, "dim_chunk_len", dim_chunk_len) object.__setattr__(self, "dim_grid", dim_grid) object.__setattr__(self, "nchunks", nchunks) object.__setattr__(self, "chunk_nitems", chunk_nitems) @@ -755,22 +680,20 @@ def __init__( object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) def __iter__(self) -> Iterator[ChunkDimProjection]: + from zarr.core.chunk_grids import FixedDimension + g = self.dim_grid # iterate over chunks with at least one item for dim_chunk_ix in self.dim_chunk_ixs: # find region in chunk - if g is not None: - dim_offset = g.chunk_offset(dim_chunk_ix) - chunk_len = g.data_size(dim_chunk_ix) - else: - dim_offset = dim_chunk_ix * self.dim_chunk_len - chunk_len = self.dim_chunk_len + dim_offset = g.chunk_offset(dim_chunk_ix) + chunk_len = g.data_size(dim_chunk_ix) dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + chunk_len] - # pad out if final chunk (for fixed grids, actual chunk may be smaller than dim_chunk_len) - if g is None and dim_chunk_sel.shape[0] < 
self.dim_chunk_len: - tmp = np.zeros(self.dim_chunk_len, dtype=bool) + # pad out if final chunk (for fixed grids, actual chunk may be smaller than declared size) + if isinstance(g, FixedDimension) and dim_chunk_sel.shape[0] < g.size: + tmp = np.zeros(g.size, dtype=bool) tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel dim_chunk_sel = tmp @@ -829,8 +752,7 @@ class IntArrayDimIndexer: """Integer array selection against a single dimension.""" dim_len: int - dim_chunk_len: int - dim_grid: DimensionGrid | None + dim_grid: DimensionGrid nchunks: int nitems: int order: Order @@ -844,11 +766,10 @@ def __init__( self, dim_sel: npt.NDArray[np.intp], dim_len: int, - dim_chunk_len: int, + dim_grid: DimensionGrid, wraparound: bool = True, boundscheck: bool = True, order: Order = Order.UNKNOWN, - dim_grid: DimensionGrid | None = None, ) -> None: # ensure 1d array dim_sel = np.asanyarray(dim_sel) @@ -857,11 +778,7 @@ def __init__( nitems = len(dim_sel) g = dim_grid - - if g is not None: - nchunks = g.nchunks - else: - nchunks = ceildiv(dim_len, dim_chunk_len) + nchunks = g.nchunks # handle wraparound if wraparound: @@ -872,10 +789,7 @@ def __init__( boundscheck_indices(dim_sel, dim_len) # determine which chunk is needed for each selection item - if g is not None: - dim_sel_chunk = g.indices_to_chunks(dim_sel) - else: - dim_sel_chunk = dim_sel // dim_chunk_len + dim_sel_chunk = g.indices_to_chunks(dim_sel) # determine order of indices if order == Order.UNKNOWN: @@ -903,7 +817,6 @@ def __init__( # store attributes object.__setattr__(self, "dim_len", dim_len) - object.__setattr__(self, "dim_chunk_len", dim_chunk_len) object.__setattr__(self, "dim_grid", dim_grid) object.__setattr__(self, "nchunks", nchunks) object.__setattr__(self, "nitems", nitems) @@ -931,10 +844,7 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: dim_out_sel = self.dim_out_sel[start:stop] # find region in chunk - if g is not None: - dim_offset = g.chunk_offset(dim_chunk_ix) - else: - dim_offset = dim_chunk_ix * 
self.dim_chunk_len + dim_offset = g.chunk_offset(dim_chunk_ix) dim_chunk_sel = self.dim_sel[start:stop] - dim_offset is_complete_chunk = False # TODO yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk) @@ -1013,23 +923,18 @@ def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: Chu IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer ] = [] for dim_sel, dim_len, dim_grid in zip(selection, shape, dim_grids, strict=True): - from zarr.core.chunk_grids import FixedDimension - - dim_chunk_len = dim_grid.size if isinstance(dim_grid, FixedDimension) else 1 dim_indexer: IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_grid) elif isinstance(dim_sel, slice): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_grid) elif is_integer_array(dim_sel): - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid) + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_grid) elif is_bool_array(dim_sel): - dim_indexer = BoolArrayDimIndexer( - dim_sel, dim_len, dim_chunk_len, dim_grid=dim_grid - ) + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_grid) else: raise IndexError( @@ -1199,7 +1104,7 @@ def __init__( f"expected integer or slice, got {type(dim_sel)!r}" ) - dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size, dim_grid=dim_grid) + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_grid) dim_indexers.append(dim_indexer) if start >= dim_len or start < 0: From fa0739618425fe13a1a2fd69054c1b9988fa3395 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:04:52 -0400 Subject: [PATCH 012/118] Add DimensionGrid protocol --- 
src/zarr/core/chunk_grids.py | 24 ++++++++++++++++++++---- src/zarr/core/indexing.py | 9 +++++++-- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 31ddab4716..95231b4094 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -9,7 +9,7 @@ from collections.abc import Sequence from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, Protocol, cast, runtime_checkable import numpy as np import numpy.typing as npt @@ -126,7 +126,19 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int return np.searchsorted(self.cumulative, indices, side="right") -DimensionGrid = FixedDimension | VaryingDimension +@runtime_checkable +class DimensionGrid(Protocol): + """Structural interface shared by FixedDimension and VaryingDimension.""" + + @property + def nchunks(self) -> int: ... + @property + def extent(self) -> int: ... + def index_to_chunk(self, idx: int) -> int: ... + def chunk_offset(self, chunk_ix: int) -> int: ... + def chunk_size(self, chunk_ix: int) -> int: ... + def data_size(self, chunk_ix: int) -> int: ... + def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: ... # --------------------------------------------------------------------------- @@ -302,7 +314,11 @@ def chunk_shape(self) -> tuple[int, ...]: "chunk_shape is only available for regular chunk grids. " "Use grid[coords] for per-chunk sizes." 
) - return tuple(d.size for d in self.dimensions) # type: ignore[union-attr] + return tuple( + d.size + for d in self.dimensions + if isinstance(d, FixedDimension) # guaranteed by is_regular + ) # -- Collection interface -- @@ -401,7 +417,7 @@ def to_dict(self) -> dict[str, JSON]: edges = [dim.size] * (n - 1) + [last_data] rle = _compress_rle(edges) chunk_shapes.append(rle) - else: + elif isinstance(dim, VaryingDimension): edges = list(dim.edges) rle = _compress_rle(edges) if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 4a827d0b50..1b96710149 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -959,10 +959,15 @@ def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: Chu # Compute chunk_shape for ix_() compatibility in __iter__. # For VaryingDimension, use the max edge length so that # slice_to_range produces correct ranges for the largest chunk. 
- from zarr.core.chunk_grids import FixedDimension + from zarr.core.chunk_grids import FixedDimension, VaryingDimension chunk_shape = tuple( - g.size if isinstance(g, FixedDimension) else max(g.edges) for g in dim_grids + g.size + if isinstance(g, FixedDimension) + else max(g.edges) + if isinstance(g, VaryingDimension) + else g.chunk_size(0) + for g in dim_grids ) object.__setattr__(self, "dim_indexers", dim_indexers) From a0acd95c0e575d345f34a9827792bb973ed0cd6d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:07:28 -0400 Subject: [PATCH 013/118] Remove the try/except escape hatch from ChunkGrid.chunk_shape --- src/zarr/core/chunk_grids.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 95231b4094..b0d13f5ecc 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -302,13 +302,6 @@ def shape(self) -> tuple[int, ...]: @property def chunk_shape(self) -> tuple[int, ...]: """Return the uniform chunk shape. Raises if grid is not regular.""" - # Check for a stored _chunk_shape (set by RegularChunkGrid subclass) - try: - stored: tuple[int, ...] = object.__getattribute__(self, "_chunk_shape") - except AttributeError: - pass - else: - return stored if not self.is_regular: raise ValueError( "chunk_shape is only available for regular chunk grids. 
" @@ -502,6 +495,11 @@ def __init__(self, *, chunk_shape: ShapeLike) -> None: object.__setattr__(self, "dimensions", dims) object.__setattr__(self, "_chunk_shape", chunk_shape_parsed) + @property + def chunk_shape(self) -> tuple[int, ...]: + """Return the stored chunk shape (extent may be 0 as placeholder).""" + return self._chunk_shape + @classmethod def _from_dict(cls, data: dict[str, JSON] | NamedConfig[str, Any]) -> Self: _, configuration_parsed = parse_named_configuration(data, "regular") From ce0527d39d7e01fc58dcc446738120bcb702d53c Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:08:36 -0400 Subject: [PATCH 014/118] Cache is_regular --- src/zarr/core/chunk_grids.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index b0d13f5ecc..ac23446f88 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -244,9 +244,13 @@ class ChunkGrid(Metadata): """ dimensions: tuple[DimensionGrid, ...] 
+ _is_regular: bool def __init__(self, *, dimensions: tuple[DimensionGrid, ...]) -> None: object.__setattr__(self, "dimensions", dimensions) + object.__setattr__( + self, "_is_regular", all(isinstance(d, FixedDimension) for d in dimensions) + ) @classmethod def from_regular(cls, array_shape: ShapeLike, chunk_shape: ShapeLike) -> ChunkGrid: @@ -292,7 +296,7 @@ def ndim(self) -> int: @property def is_regular(self) -> bool: - return all(isinstance(d, FixedDimension) for d in self.dimensions) + return self._is_regular @property def shape(self) -> tuple[int, ...]: @@ -492,7 +496,7 @@ def __init__(self, *, chunk_shape: ShapeLike) -> None: chunk_shape_parsed = parse_shapelike(chunk_shape) # Without array shape, use extent=0 as placeholder dims = tuple(FixedDimension(size=s, extent=0) for s in chunk_shape_parsed) - object.__setattr__(self, "dimensions", dims) + super().__init__(dimensions=dims) object.__setattr__(self, "_chunk_shape", chunk_shape_parsed) @property From f43366829b61634dfe7149e7e8da37953b312e47 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:11:01 -0400 Subject: [PATCH 015/118] Produce RLE directly --- src/zarr/core/chunk_grids.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ac23446f88..b142d74e02 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -399,21 +399,22 @@ def to_dict(self) -> dict[str, JSON]: chunk_shapes: list[Any] = [] for dim in self.dimensions: if isinstance(dim, FixedDimension): - # Serialize as uniform edges. The extent is reconstructed - # from sum(edges) on deserialization, so we use the actual - # data sizes to preserve the true extent (which may not be - # a multiple of chunk size at the boundary). + # Produce RLE directly without allocating a full edge list. 
n = dim.nchunks if n == 0: chunk_shapes.append([]) else: last_data = dim.extent - (n - 1) * dim.size if last_data == dim.size: - edges = [dim.size] * n + # All chunks uniform + chunk_shapes.append([[dim.size, n]]) else: - edges = [dim.size] * (n - 1) + [last_data] - rle = _compress_rle(edges) - chunk_shapes.append(rle) + # n-1 full chunks + 1 boundary chunk + rle: list[list[int]] = [] + if n > 1: + rle.append([dim.size, n - 1]) + rle.append([last_data, 1]) + chunk_shapes.append(rle) elif isinstance(dim, VaryingDimension): edges = list(dim.edges) rle = _compress_rle(edges) From 02fd7c51ff1e35400986197f6ba3994129a77d01 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:31:34 -0400 Subject: [PATCH 016/118] Fix bugs --- src/zarr/core/array.py | 4 ++ src/zarr/core/chunk_grids.py | 32 +++++++++----- src/zarr/core/metadata/v3.py | 3 +- tests/test_unified_chunk_grid.py | 75 ++++++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 13 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ecd2c1dd1f..24a50b5d60 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -6018,6 +6018,10 @@ async def _resize( """ new_shape = parse_shapelike(new_shape) assert len(new_shape) == len(array.metadata.shape) + + if not array.metadata.chunk_grid.is_regular: + raise ValueError("Resize is not supported for arrays with rectilinear chunk grids.") + new_metadata = array.metadata.update_shape(new_shape) # ensure deletion is only run if array is shrinking as the delete_outside_chunks path is unbounded in memory diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index b142d74e02..64ec38606f 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -364,12 +364,10 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> raise ValueError("Regular chunk grid requires 'chunk_shape' configuration") if not 
isinstance(chunk_shape_raw, Sequence): raise TypeError(f"chunk_shape must be a sequence, got {type(chunk_shape_raw)}") - # Without array shape, create with extent=0 as placeholder. - # parse_chunk_grid() should be used when array shape is available. - dims = tuple( - FixedDimension(size=int(cast("int", s)), extent=0) for s in chunk_shape_raw - ) - return cls(dimensions=dims) + # Without array shape, return a RegularChunkGrid that preserves + # chunk_shape but raises on extent-dependent operations. + # Use parse_chunk_grid() when array shape is available. + return RegularChunkGrid(chunk_shape=tuple(int(cast("int", s)) for s in chunk_shape_raw)) if name_parsed == "rectilinear": chunk_shapes_raw = configuration_parsed.get("chunk_shapes") @@ -513,15 +511,25 @@ def _from_dict(cls, data: dict[str, JSON] | NamedConfig[str, Any]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}} + def _raise_no_extent(self) -> None: + raise ValueError( + "RegularChunkGrid does not have array shape information. " + "Use ChunkGrid.from_regular(array_shape, chunk_shape) or " + "parse_chunk_grid() to create a grid with extent." 
+ ) + + @property + def shape(self) -> tuple[int, ...]: + self._raise_no_extent() + raise AssertionError # unreachable, for mypy + def all_chunk_coords(self) -> Iterator[tuple[int, ...]]: - return itertools.product(*(range(d.nchunks) for d in self.dimensions)) + self._raise_no_extent() + raise AssertionError # unreachable, for mypy def get_nchunks(self) -> int: - return reduce( - operator.mul, - (d.nchunks for d in self.dimensions), - 1, - ) + self._raise_no_extent() + raise AssertionError # unreachable, for mypy # --------------------------------------------------------------------------- diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index ac1578f10e..7c9644aff6 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -428,7 +428,8 @@ def to_dict(self) -> dict[str, JSON]: return out_dict def update_shape(self, shape: tuple[int, ...]) -> Self: - return replace(self, shape=shape) + new_grid = parse_chunk_grid(self.chunk_grid, shape) + return replace(self, shape=shape, chunk_grid=new_grid) def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 1795c8ece2..6e12118152 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -1290,3 +1290,78 @@ def test_property_roundtrip_rectilinear(data: st.DataObject) -> None: """Property test: write then read matches original data.""" z, a = data.draw(rectilinear_arrays_st()) np.testing.assert_array_equal(z[:], a) + + +# --------------------------------------------------------------------------- +# Bug #3: _resize with rectilinear grids +# --------------------------------------------------------------------------- + + +class TestResizeRectilinear: + def test_resize_regular_preserves_chunk_grid(self, tmp_path: Path) -> None: + """Resize a regular array — chunk_grid extents must match new shape.""" + z = 
zarr.create_array( + store=tmp_path / "regular.zarr", + shape=(100,), + chunks=(10,), + dtype="int32", + ) + z[:] = np.arange(100, dtype="int32") + z.resize(50) + assert z.shape == (50,) + # The chunk grid's extent must agree with the new shape + assert z.metadata.chunk_grid.dimensions[0].extent == 50 + + def test_resize_rectilinear_raises(self, tmp_path: Path) -> None: + """Resize should raise for rectilinear grids (not yet supported).""" + z = zarr.create_array( + store=tmp_path / "rect.zarr", + shape=(30,), + chunks=[[5, 10, 15]], + dtype="int32", + ) + z[:] = np.arange(30, dtype="int32") + with pytest.raises((ValueError, NotImplementedError)): + z.resize(20) + + +# --------------------------------------------------------------------------- +# Bug #4: extent=0 placeholder in RegularChunkGrid / from_dict +# --------------------------------------------------------------------------- + + +class TestExtentPlaceholder: + def test_regular_chunk_grid_chunk_shape_preserved(self) -> None: + """RegularChunkGrid preserves chunk_shape.""" + g = RegularChunkGrid(chunk_shape=(10, 20)) + assert g.chunk_shape == (10, 20) + + def test_regular_chunk_grid_nchunks_raises(self) -> None: + """RegularChunkGrid raises on get_nchunks() (no extent info).""" + g = RegularChunkGrid(chunk_shape=(10, 20)) + with pytest.raises(ValueError, match="array shape"): + g.get_nchunks() + + def test_regular_chunk_grid_shape_raises(self) -> None: + """RegularChunkGrid raises on .shape (no extent info).""" + g = RegularChunkGrid(chunk_shape=(10, 20)) + with pytest.raises(ValueError, match="array shape"): + _ = g.shape + + def test_regular_chunk_grid_all_chunk_coords_raises(self) -> None: + """RegularChunkGrid raises on all_chunk_coords() (no extent info).""" + g = RegularChunkGrid(chunk_shape=(10, 20)) + with pytest.raises(ValueError, match="array shape"): + list(g.all_chunk_coords()) + + def test_from_dict_regular_raises_on_extent_ops(self) -> None: + """ChunkGrid.from_dict for regular grids raises on 
extent-dependent ops.""" + g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) + assert g.chunk_shape == (10, 20) + with pytest.raises(ValueError, match="array shape"): + g.get_nchunks() + + def test_from_dict_regular_is_regular_chunk_grid(self) -> None: + """ChunkGrid.from_dict for regular grids returns a RegularChunkGrid.""" + g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) + assert isinstance(g, RegularChunkGrid) From 42ef639df134c43ac86bb445d46e07737133d670 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:46:25 -0400 Subject: [PATCH 017/118] Separate chunk grid serialization --- src/zarr/core/chunk_grids.py | 118 ++++++++++------ src/zarr/core/metadata/v3.py | 23 ++- tests/test_unified_chunk_grid.py | 235 ++++++++++++++++++++++++------- 3 files changed, 286 insertions(+), 90 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 64ec38606f..0bc7efbe23 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -15,7 +15,6 @@ import numpy.typing as npt import zarr -from zarr.abc.metadata import Metadata from zarr.core.common import ( JSON, NamedConfig, @@ -231,7 +230,7 @@ def _is_rectilinear_chunks(chunks: Any) -> bool: @dataclass(frozen=True) -class ChunkGrid(Metadata): +class ChunkGrid: """ Unified chunk grid supporting both regular and rectilinear chunking. @@ -387,43 +386,8 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") - def to_dict(self) -> dict[str, JSON]: - if self.is_regular: - return { - "name": "regular", - "configuration": {"chunk_shape": tuple(self.chunk_shape)}, - } - else: - chunk_shapes: list[Any] = [] - for dim in self.dimensions: - if isinstance(dim, FixedDimension): - # Produce RLE directly without allocating a full edge list. 
- n = dim.nchunks - if n == 0: - chunk_shapes.append([]) - else: - last_data = dim.extent - (n - 1) * dim.size - if last_data == dim.size: - # All chunks uniform - chunk_shapes.append([[dim.size, n]]) - else: - # n-1 full chunks + 1 boundary chunk - rle: list[list[int]] = [] - if n > 1: - rle.append([dim.size, n - 1]) - rle.append([last_data, 1]) - chunk_shapes.append(rle) - elif isinstance(dim, VaryingDimension): - edges = list(dim.edges) - rle = _compress_rle(edges) - if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): - chunk_shapes.append(rle) - else: - chunk_shapes.append(edges) - return { - "name": "rectilinear", - "configuration": {"chunk_shapes": chunk_shapes}, - } + # ChunkGrid does not serialize itself. The format choice ("regular" vs + # "rectilinear") belongs to the metadata layer. Use serialize_chunk_grid(). def parse_chunk_grid( @@ -476,11 +440,84 @@ def parse_chunk_grid( decoded.append(dim_spec) else: raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + if len(decoded) != len(array_shape): + raise ValueError( + f"chunk_shapes has {len(decoded)} dimensions but array shape " + f"has {len(array_shape)} dimensions" + ) + for i, (edges, extent) in enumerate(zip(decoded, array_shape, strict=True)): + edge_sum = sum(edges) + if edge_sum != extent: + raise ValueError( + f"Rectilinear chunk edges for dimension {i} sum to {edge_sum} " + f"but array shape extent is {extent}" + ) return ChunkGrid.from_rectilinear(decoded) raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") +def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: + """Serialize a ChunkGrid to a metadata dict using the given format name. + + The format choice ("regular" vs "rectilinear") belongs to the metadata layer, + not the grid itself. This function is called by ArrayV3Metadata.to_dict(). + """ + if name == "regular": + if not grid.is_regular: + raise ValueError( + "Cannot serialize a non-regular chunk grid as 'regular'. 
Use 'rectilinear' instead." + ) + return { + "name": "regular", + "configuration": {"chunk_shape": tuple(grid.chunk_shape)}, + } + + if name == "rectilinear": + chunk_shapes: list[Any] = [] + for dim in grid.dimensions: + if isinstance(dim, FixedDimension): + # Produce RLE directly without allocating a full edge list. + n = dim.nchunks + if n == 0: + chunk_shapes.append([]) + else: + last_data = dim.extent - (n - 1) * dim.size + if last_data == dim.size: + chunk_shapes.append([[dim.size, n]]) + else: + rle: list[list[int]] = [] + if n > 1: + rle.append([dim.size, n - 1]) + rle.append([last_data, 1]) + chunk_shapes.append(rle) + elif isinstance(dim, VaryingDimension): + edges = list(dim.edges) + rle = _compress_rle(edges) + if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): + chunk_shapes.append(rle) + else: + chunk_shapes.append(edges) + return { + "name": "rectilinear", + "configuration": {"chunk_shapes": chunk_shapes}, + } + + raise ValueError(f"Unknown chunk grid name for serialization: {name!r}") + + +def _infer_chunk_grid_name( + data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], + grid: ChunkGrid, +) -> str: + """Extract or infer the chunk grid serialization name from the input.""" + if isinstance(data, dict): + name, _ = parse_named_configuration(data) + return name + # ChunkGrid passed directly — infer from structure + return "regular" if grid.is_regular else "rectilinear" + + # --------------------------------------------------------------------------- # Backwards-compatible alias # --------------------------------------------------------------------------- @@ -508,9 +545,6 @@ def _from_dict(cls, data: dict[str, JSON] | NamedConfig[str, Any]) -> Self: _, configuration_parsed = parse_named_configuration(data, "regular") return cls(**configuration_parsed) # type: ignore[arg-type] - def to_dict(self) -> dict[str, JSON]: - return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}} - def 
_raise_no_extent(self) -> None: raise ValueError( "RegularChunkGrid does not have array shape information. " diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 7c9644aff6..9cdfbbe254 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -24,7 +24,12 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArrayConfig, ArraySpec -from zarr.core.chunk_grids import ChunkGrid, parse_chunk_grid +from zarr.core.chunk_grids import ( + ChunkGrid, + _infer_chunk_grid_name, + parse_chunk_grid, + serialize_chunk_grid, +) from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -200,6 +205,7 @@ class ArrayV3Metadata(Metadata): shape: tuple[int, ...] data_type: ZDType[TBaseDType, TBaseScalar] chunk_grid: ChunkGrid + chunk_grid_name: str chunk_key_encoding: ChunkKeyEncoding fill_value: Any codecs: tuple[Codec, ...] @@ -221,6 +227,7 @@ def __init__( codecs: Iterable[Codec | dict[str, JSON] | NamedConfig[str, Any] | str], attributes: dict[str, JSON] | None, dimension_names: DimensionNames, + chunk_grid_name: str | None = None, storage_transformers: Iterable[dict[str, JSON]] | None = None, extra_fields: Mapping[str, AllowedExtraField] | None = None, ) -> None: @@ -230,6 +237,11 @@ def __init__( shape_parsed = parse_shapelike(shape) chunk_grid_parsed = parse_chunk_grid(chunk_grid, shape_parsed) + chunk_grid_name_parsed = ( + chunk_grid_name + if chunk_grid_name is not None + else _infer_chunk_grid_name(chunk_grid, chunk_grid_parsed) + ) chunk_key_encoding_parsed = parse_chunk_key_encoding(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # Note: relying on a type method is numpy-specific @@ -251,6 +263,7 @@ def __init__( object.__setattr__(self, "shape", shape_parsed) object.__setattr__(self, "data_type", data_type) object.__setattr__(self, "chunk_grid", chunk_grid_parsed) + object.__setattr__(self, 
"chunk_grid_name", chunk_grid_name_parsed) object.__setattr__(self, "chunk_key_encoding", chunk_key_encoding_parsed) object.__setattr__(self, "codecs", codecs_parsed) object.__setattr__(self, "dimension_names", dimension_names_parsed) @@ -407,6 +420,14 @@ def to_dict(self) -> dict[str, JSON]: extra_fields = out_dict.pop("extra_fields") out_dict = out_dict | extra_fields # type: ignore[operator] + # Serialize chunk_grid using the stored name (not the grid's own logic). + # This gives round-trip fidelity: a store written as "rectilinear" with + # uniform edges stays "rectilinear". + out_dict["chunk_grid"] = serialize_chunk_grid(self.chunk_grid, self.chunk_grid_name) + + # chunk_grid_name is internal — not part of the Zarr metadata document + out_dict.pop("chunk_grid_name", None) + out_dict["fill_value"] = self.data_type.to_json_scalar( self.fill_value, zarr_format=self.zarr_format ) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 6e12118152..c552340fc7 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -29,6 +29,8 @@ VaryingDimension, _compress_rle, _expand_rle, + parse_chunk_grid, + serialize_chunk_grid, ) # --------------------------------------------------------------------------- @@ -306,7 +308,7 @@ def test_roundtrip(self) -> None: class TestSerialization: def test_regular_roundtrip(self) -> None: g = ChunkGrid.from_regular((100, 200), (10, 20)) - d = g.to_dict() + d = serialize_chunk_grid(g, "regular") assert d["name"] == "regular" config = d["configuration"] assert isinstance(config, dict) @@ -317,7 +319,7 @@ def test_regular_roundtrip(self) -> None: def test_rectilinear_roundtrip(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) - d = g.to_dict() + d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" g2 = ChunkGrid.from_dict(d) assert not g2.is_regular @@ -332,12 +334,20 @@ def test_rectilinear_roundtrip(self) -> None: def 
test_rectilinear_rle_serialization(self) -> None: """RLE should be used when it actually compresses.""" g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]]) - d = g.to_dict() - assert d["name"] == "regular" # all uniform -> serializes as regular + # All uniform, but name is chosen by the metadata layer, not the grid. + # Serializing as "regular" works because is_regular is True. + d = serialize_chunk_grid(g, "regular") + assert d["name"] == "regular" + + def test_rectilinear_uniform_stays_rectilinear(self) -> None: + """A rectilinear grid with uniform edges stays rectilinear if the name says so.""" + g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]]) + d = serialize_chunk_grid(g, "rectilinear") + assert d["name"] == "rectilinear" def test_rectilinear_rle_with_varying(self) -> None: g = ChunkGrid.from_rectilinear([[100, 100, 100, 50], [25, 25, 25, 25]]) - d = g.to_dict() + d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" config = d["configuration"] assert isinstance(config, dict) @@ -347,7 +357,7 @@ def test_rectilinear_rle_with_varying(self) -> None: def test_json_roundtrip(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) - d = g.to_dict() + d = serialize_chunk_grid(g, "rectilinear") json_str = json.dumps(d) d2 = json.loads(json_str) g2 = ChunkGrid.from_dict(d2) @@ -357,6 +367,88 @@ def test_unknown_name_raises(self) -> None: with pytest.raises(ValueError, match="Unknown chunk grid"): ChunkGrid.from_dict({"name": "hexagonal", "configuration": {}}) + def test_serialize_non_regular_as_regular_raises(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + with pytest.raises(ValueError, match="Cannot serialize a non-regular chunk grid"): + serialize_chunk_grid(g, "regular") + + def test_serialize_unknown_name_raises(self) -> None: + g = ChunkGrid.from_regular((100,), (10,)) + with pytest.raises(ValueError, match="Unknown chunk grid name for serialization"): + 
serialize_chunk_grid(g, "hexagonal") + + +class TestParseChunkGridValidation: + def test_varying_extent_mismatch_raises(self) -> None: + from zarr.core.chunk_grids import parse_chunk_grid + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + # VaryingDimension extent is 60, but array_shape says 100 + with pytest.raises(ValueError, match="extent"): + parse_chunk_grid(g, (100, 100)) + + def test_varying_extent_match_ok(self) -> None: + from zarr.core.chunk_grids import parse_chunk_grid + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + # Matching extents should work fine + g2 = parse_chunk_grid(g, (60, 100)) + assert g2.dimensions[0].extent == 60 + + def test_rectilinear_extent_mismatch_raises(self) -> None: + """sum(edges) must match the array shape for each dimension.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"chunk_shapes": [[10, 20, 30], [25, 25]]}, + } + # sum([10,20,30])=60, sum([25,25])=50 — array shape (100, 50) mismatches dim 0 + with pytest.raises(ValueError, match="sum to 60 but array shape extent is 100"): + parse_chunk_grid(data, (100, 50)) + + def test_rectilinear_extent_mismatch_second_dim(self) -> None: + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"chunk_shapes": [[50, 50], [10, 20]]}, + } + # dim 0 OK (100), dim 1: sum([10,20])=30 != 50 + with pytest.raises(ValueError, match="dimension 1 sum to 30 but array shape extent is 50"): + parse_chunk_grid(data, (100, 50)) + + def test_rectilinear_extent_match_passes(self) -> None: + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"chunk_shapes": [[10, 20, 30], [25, 25]]}, + } + g = parse_chunk_grid(data, (60, 50)) + assert g.shape == (3, 2) + + def test_rectilinear_ndim_mismatch_raises(self) -> None: + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"chunk_shapes": [[10, 20], [25, 25]]}, + } + with pytest.raises(ValueError, match="2 dimensions but array shape has 3"): + 
parse_chunk_grid(data, (30, 50, 100)) + + def test_rectilinear_rle_extent_validated(self) -> None: + """RLE-encoded edges are expanded before validation.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"chunk_shapes": [[[10, 5]], [[25, 2]]]}, + } + # sum = 50 and 50 — match (50, 50) + g = parse_chunk_grid(data, (50, 50)) + assert g.shape == (5, 2) + # mismatch + with pytest.raises(ValueError, match="sum to 50 but array shape extent is 100"): + parse_chunk_grid(data, (100, 50)) + + def test_varying_dimension_extent_mismatch_on_chunkgrid_input(self) -> None: + """When passing a ChunkGrid directly, VaryingDimension extent is validated.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + with pytest.raises(ValueError, match="does not match"): + parse_chunk_grid(g, (100, 50)) + # --------------------------------------------------------------------------- # Backwards compatibility @@ -494,7 +586,7 @@ def test_create_rectilinear_array(self, tmp_path: Path) -> None: def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: """Verify metadata round-trips through JSON.""" g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) - d = g.to_dict() + d = serialize_chunk_grid(g, "rectilinear") g2 = ChunkGrid.from_dict(d) assert g2.shape == g.shape for coord in g.all_chunk_coords(): @@ -504,6 +596,90 @@ def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: assert new_spec is not None assert orig_spec.shape == new_spec.shape + def test_chunk_grid_name_regular(self, tmp_path: Path) -> None: + """Regular arrays store chunk_grid_name='regular'.""" + from zarr.core.array import AsyncArray + from zarr.core.dtype import Float32 + + meta = AsyncArray._create_metadata_v3( + shape=(100, 200), + dtype=Float32(), + chunk_shape=(10, 20), + ) + assert meta.chunk_grid_name == "regular" + d = meta.to_dict() + chunk_grid_dict = d["chunk_grid"] + assert isinstance(chunk_grid_dict, dict) + assert chunk_grid_dict["name"] == 
"regular" + + def test_chunk_grid_name_rectilinear(self, tmp_path: Path) -> None: + """Rectilinear arrays store chunk_grid_name='rectilinear'.""" + from zarr.core.array import AsyncArray + from zarr.core.dtype import Float32 + + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + meta = AsyncArray._create_metadata_v3( + shape=(60, 100), + dtype=Float32(), + chunk_shape=(10, 20), + chunk_grid=g, + ) + assert meta.chunk_grid_name == "rectilinear" + d = meta.to_dict() + chunk_grid_dict = d["chunk_grid"] + assert isinstance(chunk_grid_dict, dict) + assert chunk_grid_dict["name"] == "rectilinear" + + def test_chunk_grid_name_roundtrip_preserves_rectilinear(self, tmp_path: Path) -> None: + """A rectilinear grid with uniform edges stays 'rectilinear' through to_dict/from_dict.""" + from zarr.core.metadata.v3 import ArrayV3Metadata + + meta_dict: dict[str, Any] = { + "zarr_format": 3, + "node_type": "array", + "shape": [100, 100], + "chunk_grid": { + "name": "rectilinear", + "configuration": {"chunk_shapes": [[[50, 2]], [[25, 4]]]}, + }, + "chunk_key_encoding": {"name": "default"}, + "data_type": "float32", + "fill_value": 0.0, + "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], + } + meta = ArrayV3Metadata.from_dict(meta_dict) + # Grid is uniform (all Fixed), but name should stay "rectilinear" + assert meta.chunk_grid.is_regular + assert meta.chunk_grid_name == "rectilinear" + d = meta.to_dict() + chunk_grid_dict = d["chunk_grid"] + assert isinstance(chunk_grid_dict, dict) + assert chunk_grid_dict["name"] == "rectilinear" + + def test_chunk_grid_name_regular_from_dict(self, tmp_path: Path) -> None: + """A 'regular' chunk grid name is preserved through from_dict.""" + from zarr.core.metadata.v3 import ArrayV3Metadata + + meta_dict: dict[str, Any] = { + "zarr_format": 3, + "node_type": "array", + "shape": [100, 100], + "chunk_grid": { + "name": "regular", + "configuration": {"chunk_shape": [50, 25]}, + }, + "chunk_key_encoding": {"name": 
"default"}, + "data_type": "float32", + "fill_value": 0.0, + "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], + } + meta = ArrayV3Metadata.from_dict(meta_dict) + assert meta.chunk_grid_name == "regular" + d = meta.to_dict() + chunk_grid_dict = d["chunk_grid"] + assert isinstance(chunk_grid_dict, dict) + assert chunk_grid_dict["name"] == "regular" + def test_get_chunk_spec_regular(self, tmp_path: Path) -> None: """get_chunk_spec works for regular grids.""" from zarr.core.array import AsyncArray @@ -646,7 +822,7 @@ def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: ) assert g.shape == (3, 10) - d = g.to_dict() + d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" # Second dim should serialize as edges that sum to 95 config = d["configuration"] @@ -681,7 +857,7 @@ def test_exact_extent_fixed_dim_rectilinear_roundtrip(self) -> None: FixedDimension(size=25, extent=100), ) ) - d = g.to_dict() + d = serialize_chunk_grid(g, "rectilinear") g2 = ChunkGrid.from_dict(d) assert g2.shape == g.shape # All chunks should be uniform @@ -797,9 +973,9 @@ def test_chunk_spec_multidim_boundary(self) -> None: assert spec.shape == (10, 5) assert spec.is_boundary # second dim differs - # -- Rectilinear with zero-nchunks FixedDimension in to_dict -- + # -- Rectilinear with zero-nchunks FixedDimension in serialize_chunk_grid -- - def test_zero_nchunks_fixed_dim_in_rectilinear_to_dict(self) -> None: + def test_zero_nchunks_fixed_dim_in_rectilinear_serialize(self) -> None: """A rectilinear grid with a 0-nchunks FixedDimension serializes.""" g = ChunkGrid( dimensions=( @@ -808,7 +984,7 @@ def test_zero_nchunks_fixed_dim_in_rectilinear_to_dict(self) -> None: ) ) assert g.shape == (2, 0) - d = g.to_dict() + d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" # -- VaryingDimension data_size -- @@ -820,11 +996,6 @@ def test_varying_dim_data_size_equals_chunk_size(self) -> None: assert d.data_size(i) == 
d.chunk_size(i) -# --------------------------------------------------------------------------- -# Bug: OrthogonalIndexer chunk_shape=1 for VaryingDimension -# --------------------------------------------------------------------------- - - class TestOrthogonalIndexerRectilinear: """OrthogonalIndexer must use correct per-chunk sizes for VaryingDimension, not a hardcoded 1. The chunk_shape field is used by ix_() to convert slices @@ -883,11 +1054,6 @@ def test_orthogonal_advanced_indexing_chunk_shape_not_one(self) -> None: assert indexer.chunk_shape[1] == 50 -# --------------------------------------------------------------------------- -# Bug: Sharding validation skipped for rectilinear grids -# --------------------------------------------------------------------------- - - class TestShardingValidationRectilinear: """ShardingCodec.validate must check divisibility for rectilinear grids too.""" @@ -923,31 +1089,6 @@ def test_sharding_accepts_divisible_rectilinear(self) -> None: ) -# --------------------------------------------------------------------------- -# Bug: parse_chunk_grid doesn't validate VaryingDimension extent vs array_shape -# --------------------------------------------------------------------------- - - -class TestParseChunkGridValidation: - """parse_chunk_grid should raise when VaryingDimension extent != array_shape.""" - - def test_varying_extent_mismatch_raises(self) -> None: - from zarr.core.chunk_grids import parse_chunk_grid - - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) - # VaryingDimension extent is 60, but array_shape says 100 - with pytest.raises(ValueError, match="extent"): - parse_chunk_grid(g, (100, 100)) - - def test_varying_extent_match_ok(self) -> None: - from zarr.core.chunk_grids import parse_chunk_grid - - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) - # Matching extents should work fine - g2 = parse_chunk_grid(g, (60, 100)) - assert g2.dimensions[0].extent == 60 - - # 
--------------------------------------------------------------------------- # Full-pipeline read/write tests with rectilinear grids # --------------------------------------------------------------------------- From 55e720b52e5a2381ad34baf8fc0ad1742e5f6ccf Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 10 Mar 2026 00:06:54 -0400 Subject: [PATCH 018/118] Retain comments --- src/zarr/core/indexing.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 1b96710149..020a04e6b1 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -434,33 +434,43 @@ def __init__( object.__setattr__(self, "nchunks", dim_grid.nchunks) def __iter__(self) -> Iterator[ChunkDimProjection]: + # figure out the range of chunks we need to visit g = self.dim_grid dim_chunk_ix_from = g.index_to_chunk(self.start) if self.start > 0 else 0 dim_chunk_ix_to = g.index_to_chunk(self.stop - 1) + 1 if self.stop > 0 else 0 + # iterate over chunks in range for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): + # compute offsets for chunk within overall array dim_offset = g.chunk_offset(dim_chunk_ix) + # determine chunk length, accounting for trailing chunk dim_chunk_len = g.data_size(dim_chunk_ix) dim_limit = dim_offset + dim_chunk_len if self.start < dim_offset: + # selection starts before current chunk dim_chunk_sel_start = 0 remainder = (dim_offset - self.start) % self.step if remainder: dim_chunk_sel_start += self.step - remainder + # compute number of previous items, provides offset into output array dim_out_offset = ceildiv((dim_offset - self.start), self.step) else: + # selection starts within current chunk dim_chunk_sel_start = self.start - dim_offset dim_out_offset = 0 if self.stop > dim_limit: + # selection ends after current chunk dim_chunk_sel_stop = dim_chunk_len else: + # selection ends within current chunk dim_chunk_sel_stop = self.stop - 
dim_offset dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + # If there are no elements on the selection within this chunk, then skip if dim_chunk_nitems == 0: continue @@ -789,6 +799,8 @@ def __init__( boundscheck_indices(dim_sel, dim_len) # determine which chunk is needed for each selection item + # note: for dense integer selections, the division operation here is the + # bottleneck dim_sel_chunk = g.indices_to_chunks(dim_sel) # determine order of indices @@ -800,6 +812,7 @@ def __init__( dim_out_sel = None elif order == Order.DECREASING: dim_sel = dim_sel[::-1] + # TODO should be possible to do this without creating an arange dim_out_sel = np.arange(nitems - 1, -1, -1) else: # sort indices to group by chunk From 1b7871de628bd71bc05dde7cd7d0d44de1e05585 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 10 Mar 2026 00:15:52 -0400 Subject: [PATCH 019/118] Update block and coordinate indexing --- src/zarr/core/indexing.py | 20 +++----- tests/test_unified_chunk_grid.py | 86 ++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 13 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 020a04e6b1..7a4490c7e4 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1071,7 +1071,6 @@ class BlockIndexer(Indexer): def __init__( self, selection: BasicSelection, shape: tuple[int, ...], chunk_grid: ChunkGrid ) -> None: - chunk_shape = get_chunk_shape(chunk_grid) dim_grids = _get_dim_grids(chunk_grid) # handle ellipsis @@ -1082,17 +1081,15 @@ def __init__( # setup per-dimension indexers dim_indexers = [] - for dim_sel, dim_len, dim_chunk_size, dim_grid in zip( - selection_normalized, shape, chunk_shape, dim_grids, strict=True - ): - dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) + for dim_sel, dim_len, dim_grid in zip(selection_normalized, shape, 
dim_grids, strict=True): + dim_numchunks = dim_grid.nchunks if is_integer(dim_sel): if dim_sel < 0: dim_sel = dim_numchunks + dim_sel - start = dim_sel * dim_chunk_size - stop = start + dim_chunk_size + start = dim_grid.chunk_offset(dim_sel) + stop = start + dim_grid.chunk_size(dim_sel) slice_ = slice(start, stop) elif is_slice(dim_sel): @@ -1112,8 +1109,8 @@ def __init__( if stop < 0: stop = dim_numchunks + stop - start *= dim_chunk_size - stop *= dim_chunk_size + start = dim_grid.chunk_offset(start) if start < dim_numchunks else dim_len + stop = dim_grid.chunk_offset(stop) if stop < dim_numchunks else dim_len slice_ = slice(start, stop) else: @@ -1125,7 +1122,7 @@ def __init__( dim_indexer = SliceDimIndexer(slice_, dim_len, dim_grid) dim_indexers.append(dim_indexer) - if start >= dim_len or start < 0: + if slice_.start >= dim_len or slice_.start < 0: msg = f"index out of bounds for dimension with length {dim_len}" raise BoundsCheckError(msg) @@ -1193,7 +1190,6 @@ class CoordinateIndexer(Indexer): chunk_rixs: npt.NDArray[np.intp] chunk_mixs: tuple[npt.NDArray[np.intp], ...] shape: tuple[int, ...] - chunk_shape: tuple[int, ...] dim_grids: tuple[DimensionGrid, ...] drop_axes: tuple[int, ...] @@ -1201,7 +1197,6 @@ def __init__( self, selection: CoordinateSelection, shape: tuple[int, ...], chunk_grid: ChunkGrid ) -> None: dim_grids = _get_dim_grids(chunk_grid) - chunk_shape = tuple(g.size if hasattr(g, "size") else 1 for g in dim_grids) cdata_shape: tuple[int, ...] 
if shape == (): @@ -1284,7 +1279,6 @@ def __init__( object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) object.__setattr__(self, "chunk_rixs", chunk_rixs) object.__setattr__(self, "chunk_mixs", chunk_mixs) - object.__setattr__(self, "chunk_shape", chunk_shape) object.__setattr__(self, "dim_grids", dim_grids) object.__setattr__(self, "shape", shape) object.__setattr__(self, "drop_axes", ()) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index c552340fc7..d732711d94 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -1207,6 +1207,68 @@ def test_coordinate_selection_2d_bool_mask(self, tmp_path: Path) -> None: mask = a > 3000 np.testing.assert_array_equal(z.vindex[mask], a[mask]) + # --- Block selection --- + + def test_block_selection_1d(self, tmp_path: Path) -> None: + z, a = self._make_1d(tmp_path) + # chunks: [5, 10, 15] -> offsets 0, 5, 15 + # block 0: a[0:5], block 1: a[5:15], block 2: a[15:30] + np.testing.assert_array_equal(z.blocks[0], a[0:5]) + np.testing.assert_array_equal(z.blocks[1], a[5:15]) + np.testing.assert_array_equal(z.blocks[2], a[15:30]) + np.testing.assert_array_equal(z.blocks[-1], a[15:30]) + # slice of blocks + np.testing.assert_array_equal(z.blocks[0:2], a[0:15]) + np.testing.assert_array_equal(z.blocks[1:3], a[5:30]) + np.testing.assert_array_equal(z.blocks[:], a[:]) + + def test_block_selection_2d(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + # dim0 chunks: [10, 20, 30] -> offsets 0, 10, 30 + # dim1 chunks: [25, 25, 25, 25] -> offsets 0, 25, 50, 75 + np.testing.assert_array_equal(z.blocks[0, 0], a[0:10, 0:25]) + np.testing.assert_array_equal(z.blocks[1, 2], a[10:30, 50:75]) + np.testing.assert_array_equal(z.blocks[2, 3], a[30:60, 75:100]) + np.testing.assert_array_equal(z.blocks[-1, -1], a[30:60, 75:100]) + # slice of blocks + np.testing.assert_array_equal(z.blocks[0:2, 1:3], a[0:30, 25:75]) + np.testing.assert_array_equal(z.blocks[:, 
:], a[:, :]) + + def test_set_block_selection_1d(self, tmp_path: Path) -> None: + z, a = self._make_1d(tmp_path) + # overwrite block 1 (a[5:15]) + val = np.full(10, -1, dtype="int32") + z.blocks[1] = val + a[5:15] = val + np.testing.assert_array_equal(z[:], a) + + def test_set_block_selection_2d(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + # overwrite blocks [0:2, 1:3] -> a[0:30, 25:75] + val = np.full((30, 50), -99, dtype="int32") + z.blocks[0:2, 1:3] = val + a[0:30, 25:75] = val + np.testing.assert_array_equal(z[:], a) + + # --- Set coordinate selection --- + + def test_set_coordinate_selection_1d(self, tmp_path: Path) -> None: + z, a = self._make_1d(tmp_path) + ix = np.array([0, 4, 5, 14, 15, 29]) + val = np.full(len(ix), -7, dtype="int32") + z.vindex[ix] = val + a[ix] = val + np.testing.assert_array_equal(z[:], a) + + def test_set_coordinate_selection_2d(self, tmp_path: Path) -> None: + z, a = self._make_2d(tmp_path) + r = np.array([0, 9, 10, 29, 30, 59]) + c = np.array([0, 24, 25, 49, 50, 99]) + val = np.full(len(r), -42, dtype="int32") + z.vindex[r, c] = val + a[r, c] = val + np.testing.assert_array_equal(z[:], a) + # --- Set selection --- def test_set_basic_selection(self, tmp_path: Path) -> None: @@ -1425,6 +1487,30 @@ def test_property_vindex_rectilinear(data: st.DataObject) -> None: np.testing.assert_array_equal(z.vindex[indexers], a[indexers]) +@settings(deadline=None, max_examples=50) +@given(data=st.data()) +def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: + """Property test: block indexing on rectilinear arrays matches numpy.""" + z, a = data.draw(rectilinear_arrays_st()) + grid = z.metadata.chunk_grid + + # Pick a random block per dimension and verify it matches the expected slice + for dim in range(a.ndim): + dim_grid = grid.dimensions[dim] + block_ix = data.draw(st.integers(min_value=0, max_value=dim_grid.nchunks - 1)) + sel = [slice(None)] * a.ndim + start = dim_grid.chunk_offset(block_ix) + stop = 
start + dim_grid.data_size(block_ix) + sel[dim] = slice(start, stop) + block_sel: list[slice | int] = [slice(None)] * a.ndim + block_sel[dim] = block_ix + np.testing.assert_array_equal( + z.blocks[tuple(block_sel)], + a[tuple(sel)], + err_msg=f"dim={dim}, block={block_ix}", + ) + + @settings(deadline=None, max_examples=50) @given(data=st.data()) def test_property_roundtrip_rectilinear(data: st.DataObject) -> None: From 9c0f582f40a27eae1b75501f197cf9a52d8f453b Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 10 Mar 2026 11:05:21 -0400 Subject: [PATCH 020/118] POC: TiledDimension --- src/zarr/core/chunk_grids.py | 278 ++++++++++++++++--- tests/test_unified_chunk_grid.py | 455 +++++++++++++++++++++++++++++++ 2 files changed, 700 insertions(+), 33 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 0bc7efbe23..b88396a66f 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -125,9 +125,131 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int return np.searchsorted(self.cumulative, indices, side="right") +@dataclass(frozen=True) +class TiledDimension: + """Periodic chunk pattern repeated N times, with an optional trailing remainder. + + Exploits periodicity for O(1) chunk_offset/chunk_size and O(log pattern_len) + index_to_chunk, regardless of total chunk count. Memory is O(pattern_len) + instead of O(n_chunks). + + Example: 30 years of monthly chunks (days per month): + TiledDimension(pattern=(31,28,31,30,31,30,31,31,30,31,30,31), repeats=30) + """ + + pattern: tuple[int, ...] # one period's edge lengths (all > 0) + repeats: int # number of full repetitions (>= 1) + remainder: tuple[int, ...] # trailing partial period (all > 0, may be empty) + + # Precomputed + _pattern_cumulative: tuple[int, ...] 
# prefix sums within one period + _period_extent: int # sum(pattern) + _pattern_nchunks: int # len(pattern) + _remainder_cumulative: tuple[int, ...] # prefix sums of remainder + _total_nchunks: int + _total_extent: int + + def __init__( + self, + pattern: Sequence[int], + repeats: int = 1, + remainder: Sequence[int] = (), + ) -> None: + pattern_t = tuple(pattern) + remainder_t = tuple(remainder) + if not pattern_t: + raise ValueError("TiledDimension pattern must not be empty") + if repeats < 1: + raise ValueError(f"TiledDimension repeats must be >= 1, got {repeats}") + if any(e <= 0 for e in pattern_t): + raise ValueError(f"All pattern edge lengths must be > 0, got {pattern_t}") + if any(e <= 0 for e in remainder_t): + raise ValueError(f"All remainder edge lengths must be > 0, got {remainder_t}") + + pattern_cum = tuple(itertools.accumulate(pattern_t)) + period_extent = pattern_cum[-1] + remainder_cum = tuple(itertools.accumulate(remainder_t)) if remainder_t else () + total_nchunks = len(pattern_t) * repeats + len(remainder_t) + total_extent = period_extent * repeats + (remainder_cum[-1] if remainder_cum else 0) + + object.__setattr__(self, "pattern", pattern_t) + object.__setattr__(self, "repeats", repeats) + object.__setattr__(self, "remainder", remainder_t) + object.__setattr__(self, "_pattern_cumulative", pattern_cum) + object.__setattr__(self, "_period_extent", period_extent) + object.__setattr__(self, "_pattern_nchunks", len(pattern_t)) + object.__setattr__(self, "_remainder_cumulative", remainder_cum) + object.__setattr__(self, "_total_nchunks", total_nchunks) + object.__setattr__(self, "_total_extent", total_extent) + + @property + def nchunks(self) -> int: + return self._total_nchunks + + @property + def extent(self) -> int: + return self._total_extent + + def chunk_offset(self, chunk_ix: int) -> int: + period, offset = divmod(chunk_ix, self._pattern_nchunks) + if period < self.repeats: + base = period * self._period_extent + return base + 
(self._pattern_cumulative[offset - 1] if offset > 0 else 0) + # In the remainder + rem_ix = chunk_ix - self.repeats * self._pattern_nchunks + return self.repeats * self._period_extent + ( + self._remainder_cumulative[rem_ix - 1] if rem_ix > 0 else 0 + ) + + def chunk_size(self, chunk_ix: int) -> int: + """Buffer size for codec processing.""" + period, offset = divmod(chunk_ix, self._pattern_nchunks) + if period < self.repeats: + return self.pattern[offset] + return self.remainder[chunk_ix - self.repeats * self._pattern_nchunks] + + def data_size(self, chunk_ix: int) -> int: + """Valid data region — same as chunk_size for tiled dims.""" + return self.chunk_size(chunk_ix) + + def index_to_chunk(self, idx: int) -> int: + period, within = divmod(idx, self._period_extent) + if period < self.repeats: + local = bisect.bisect_right(self._pattern_cumulative, within) + return period * self._pattern_nchunks + local + # In the remainder region + rem_idx = idx - self.repeats * self._period_extent + local = bisect.bisect_right(self._remainder_cumulative, rem_idx) + return self.repeats * self._pattern_nchunks + local + + def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: + periods, withins = np.divmod(indices, self._period_extent) + result = np.empty_like(indices) + + # Chunks in the repeating region + in_repeat = periods < self.repeats + if np.any(in_repeat): + local = np.searchsorted(self._pattern_cumulative, withins[in_repeat], side="right") + result[in_repeat] = periods[in_repeat] * self._pattern_nchunks + local + + # Chunks in the remainder region + in_remainder = ~in_repeat + if np.any(in_remainder) and self._remainder_cumulative: + rem_indices = indices[in_remainder] - self.repeats * self._period_extent + local = np.searchsorted(self._remainder_cumulative, rem_indices, side="right") + result[in_remainder] = self.repeats * self._pattern_nchunks + local + + return result + + @property + def edges(self) -> tuple[int, ...]: + """Expand to full 
edge list (for compatibility with VaryingDimension).""" + return self.pattern * self.repeats + self.remainder + + @runtime_checkable class DimensionGrid(Protocol): - """Structural interface shared by FixedDimension and VaryingDimension.""" + """Structural interface shared by FixedDimension, VaryingDimension, and TiledDimension.""" @property def nchunks(self) -> int: ... @@ -202,6 +324,48 @@ def _compress_rle(sizes: Sequence[int]) -> list[list[int]]: return result +# --------------------------------------------------------------------------- +# Tile helpers +# --------------------------------------------------------------------------- + + +def _detect_tile_pattern( + edges: Sequence[int], +) -> tuple[tuple[int, ...], int, tuple[int, ...]] | None: + """Detect the shortest repeating tile pattern in an edge list. + + Returns (pattern, repeats, remainder) if a tile pattern saves space over + the flat representation, otherwise None. + + A pattern must repeat at least 2 times to qualify. + """ + n = len(edges) + if n < 4: + return None + + # Try pattern lengths from 2 up to n//2 + for plen in range(2, n // 2 + 1): + pattern = tuple(edges[:plen]) + full_repeats = n // plen + if full_repeats < 2: + break + # Check all full repetitions match + match = True + for r in range(1, full_repeats): + start = r * plen + if tuple(edges[start : start + plen]) != pattern: + match = False + break + if not match: + continue + remainder = tuple(edges[full_repeats * plen :]) + # Only use tile if it's more compact: pattern + remainder < flat list + tile_cost = plen + len(remainder) + 2 # +2 for repeats field + overhead + if tile_cost < n: + return pattern, full_repeats, remainder + return None + + # --------------------------------------------------------------------------- # Unified ChunkGrid # --------------------------------------------------------------------------- @@ -238,8 +402,9 @@ class ChunkGrid: It stores the extent (array dimension length) per dimension, enabling ``grid[coords]`` 
to return a ``ChunkSpec`` without external parameters. - Internally represents each dimension as either FixedDimension (uniform chunks) - or VaryingDimension (per-chunk edge lengths with prefix sums). + Internally represents each dimension as FixedDimension (uniform chunks), + VaryingDimension (per-chunk edge lengths with prefix sums), or + TiledDimension (periodic pattern repeated N times). """ dimensions: tuple[DimensionGrid, ...] @@ -374,15 +539,11 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") if not isinstance(chunk_shapes_raw, Sequence): raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - decoded: list[list[int]] = [] + dims_list: list[DimensionGrid] = [] for dim_spec in chunk_shapes_raw: - if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): - decoded.append(_expand_rle(dim_spec)) - elif isinstance(dim_spec, list): - decoded.append(dim_spec) - else: - raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") - return cls.from_rectilinear(decoded) + parsed = _parse_dim_spec(dim_spec) + dims_list.append(_build_dimension(parsed)) + return cls(dimensions=tuple(dims_list)) raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") @@ -390,6 +551,41 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> # "rectilinear") belongs to the metadata layer. Use serialize_chunk_grid(). +def _parse_dim_spec(dim_spec: Any) -> list[int] | TiledDimension: + """Parse a single dimension's chunk_shapes entry. + + Returns either a flat edge list or a TiledDimension (for tile-encoded entries). + Handles: flat list of ints, RLE ([[size, count], ...]), and tile dicts. 
+ """ + if isinstance(dim_spec, dict): + # Tile encoding: {"tile": [...], "repeat": N, "remainder": [...]} + tile_pattern = dim_spec.get("tile") + if tile_pattern is None: + raise ValueError(f"Tile-encoded dim_spec must have 'tile' key, got {dim_spec}") + repeat = dim_spec.get("repeat", 1) + remainder = dim_spec.get("remainder", []) + return TiledDimension( + pattern=tile_pattern, + repeats=repeat, + remainder=remainder, + ) + if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): + return _expand_rle(dim_spec) + if isinstance(dim_spec, list): + return dim_spec + raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + + +def _build_dimension(dim_spec_parsed: list[int] | TiledDimension) -> DimensionGrid: + """Build a DimensionGrid from a parsed dim spec.""" + if isinstance(dim_spec_parsed, TiledDimension): + return dim_spec_parsed + edges = dim_spec_parsed + if all(e == edges[0] for e in edges): + return FixedDimension(size=edges[0], extent=sum(edges)) + return VaryingDimension(edges) + + def parse_chunk_grid( data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], array_shape: tuple[int, ...], @@ -407,10 +603,11 @@ def parse_chunk_grid( if isinstance(dim, FixedDimension): dims.append(FixedDimension(size=dim.size, extent=extent)) else: - # VaryingDimension has intrinsic extent — validate it matches + # VaryingDimension/TiledDimension have intrinsic extent — validate if dim.extent != extent: + dim_type = type(dim).__name__ raise ValueError( - f"VaryingDimension extent {dim.extent} does not match " + f"{dim_type} extent {dim.extent} does not match " f"array shape extent {extent} for dimension {len(dims)}" ) dims.append(dim) @@ -432,27 +629,22 @@ def parse_chunk_grid( raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") if not isinstance(chunk_shapes_raw, Sequence): raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - decoded: list[list[int]] = [] - for dim_spec in 
chunk_shapes_raw: - if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): - decoded.append(_expand_rle(dim_spec)) - elif isinstance(dim_spec, list): - decoded.append(dim_spec) - else: - raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") - if len(decoded) != len(array_shape): + if len(chunk_shapes_raw) != len(array_shape): raise ValueError( - f"chunk_shapes has {len(decoded)} dimensions but array shape " + f"chunk_shapes has {len(chunk_shapes_raw)} dimensions but array shape " f"has {len(array_shape)} dimensions" ) - for i, (edges, extent) in enumerate(zip(decoded, array_shape, strict=True)): - edge_sum = sum(edges) - if edge_sum != extent: + dims_built: list[DimensionGrid] = [] + for i, dim_spec in enumerate(chunk_shapes_raw): + parsed = _parse_dim_spec(dim_spec) + dim = _build_dimension(parsed) + if dim.extent != array_shape[i]: raise ValueError( - f"Rectilinear chunk edges for dimension {i} sum to {edge_sum} " - f"but array shape extent is {extent}" + f"Rectilinear chunk edges for dimension {i} sum to {dim.extent} " + f"but array shape extent is {array_shape[i]}" ) - return ChunkGrid.from_rectilinear(decoded) + dims_built.append(dim) + return ChunkGrid(dimensions=tuple(dims_built)) raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") @@ -491,13 +683,33 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: rle.append([dim.size, n - 1]) rle.append([last_data, 1]) chunk_shapes.append(rle) + elif isinstance(dim, TiledDimension): + tile_dict: dict[str, Any] = { + "tile": list(dim.pattern), + "repeat": dim.repeats, + } + if dim.remainder: + tile_dict["remainder"] = list(dim.remainder) + chunk_shapes.append(tile_dict) elif isinstance(dim, VaryingDimension): edges = list(dim.edges) - rle = _compress_rle(edges) - if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): - chunk_shapes.append(rle) + # Try tile compression first (more compact for periodic patterns) + tile_result = 
_detect_tile_pattern(edges) + if tile_result is not None: + pattern, repeats, remainder = tile_result + tile_dict_v: dict[str, Any] = { + "tile": list(pattern), + "repeat": repeats, + } + if remainder: + tile_dict_v["remainder"] = list(remainder) + chunk_shapes.append(tile_dict_v) else: - chunk_shapes.append(edges) + rle = _compress_rle(edges) + if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): + chunk_shapes.append(rle) + else: + chunk_shapes.append(edges) return { "name": "rectilinear", "configuration": {"chunk_shapes": chunk_shapes}, diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index d732711d94..234c2acc0a 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -26,8 +26,10 @@ ChunkSpec, FixedDimension, RegularChunkGrid, + TiledDimension, VaryingDimension, _compress_rle, + _detect_tile_pattern, _expand_rle, parse_chunk_grid, serialize_chunk_grid, @@ -1592,3 +1594,456 @@ def test_from_dict_regular_is_regular_chunk_grid(self) -> None: """ChunkGrid.from_dict for regular grids returns a RegularChunkGrid.""" g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) assert isinstance(g, RegularChunkGrid) + + +# --------------------------------------------------------------------------- +# TiledDimension +# --------------------------------------------------------------------------- + +# Days per month (non-leap year) +MONTHS_365 = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) +# Days per month (leap year) +MONTHS_366 = (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) + + +class TestTiledDimension: + def test_basic(self) -> None: + d = TiledDimension(pattern=(10, 20, 30), repeats=3) + assert d.nchunks == 9 + assert d.extent == 180 # (10+20+30) * 3 + + def test_with_remainder(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(15,)) + assert d.nchunks == 5 # 2*2 + 1 + assert d.extent == 75 # (10+20)*2 + 15 + + def 
test_chunk_size(self) -> None: + d = TiledDimension(pattern=(10, 20, 30), repeats=3) + # Period 0 + assert d.chunk_size(0) == 10 + assert d.chunk_size(1) == 20 + assert d.chunk_size(2) == 30 + # Period 1 (same pattern) + assert d.chunk_size(3) == 10 + assert d.chunk_size(4) == 20 + assert d.chunk_size(5) == 30 + # Period 2 + assert d.chunk_size(6) == 10 + assert d.chunk_size(7) == 20 + assert d.chunk_size(8) == 30 + + def test_chunk_size_with_remainder(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(7, 3)) + assert d.chunk_size(0) == 10 + assert d.chunk_size(1) == 20 + assert d.chunk_size(2) == 10 + assert d.chunk_size(3) == 20 + assert d.chunk_size(4) == 7 # remainder + assert d.chunk_size(5) == 3 # remainder + + def test_data_size_equals_chunk_size(self) -> None: + d = TiledDimension(pattern=(10, 20, 30), repeats=2) + for i in range(d.nchunks): + assert d.data_size(i) == d.chunk_size(i) + + def test_chunk_offset(self) -> None: + d = TiledDimension(pattern=(10, 20, 30), repeats=2) + assert d.chunk_offset(0) == 0 + assert d.chunk_offset(1) == 10 + assert d.chunk_offset(2) == 30 + assert d.chunk_offset(3) == 60 # period 1 starts + assert d.chunk_offset(4) == 70 + assert d.chunk_offset(5) == 90 + + def test_chunk_offset_with_remainder(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(5,)) + assert d.chunk_offset(0) == 0 + assert d.chunk_offset(1) == 10 + assert d.chunk_offset(2) == 30 + assert d.chunk_offset(3) == 40 + assert d.chunk_offset(4) == 60 # remainder starts + + def test_index_to_chunk(self) -> None: + d = TiledDimension(pattern=(10, 20, 30), repeats=2) + # Period 0 + assert d.index_to_chunk(0) == 0 + assert d.index_to_chunk(9) == 0 + assert d.index_to_chunk(10) == 1 + assert d.index_to_chunk(29) == 1 + assert d.index_to_chunk(30) == 2 + assert d.index_to_chunk(59) == 2 + # Period 1 + assert d.index_to_chunk(60) == 3 + assert d.index_to_chunk(69) == 3 + assert d.index_to_chunk(70) == 4 + assert 
d.index_to_chunk(119) == 5 + + def test_index_to_chunk_with_remainder(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(5,)) + assert d.index_to_chunk(60) == 4 # first element of remainder + assert d.index_to_chunk(64) == 4 # last element of remainder + + def test_indices_to_chunks_vectorized(self) -> None: + d = TiledDimension(pattern=(10, 20, 30), repeats=2) + indices = np.array([0, 9, 10, 29, 30, 59, 60, 69, 70, 119], dtype=np.intp) + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 5], dtype=np.intp) + result = d.indices_to_chunks(indices) + np.testing.assert_array_equal(result, expected) + + def test_indices_to_chunks_with_remainder(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(5,)) + indices = np.array([0, 10, 30, 40, 60, 64], dtype=np.intp) + expected = np.array([0, 1, 2, 3, 4, 4], dtype=np.intp) + result = d.indices_to_chunks(indices) + np.testing.assert_array_equal(result, expected) + + def test_edges_property(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=3, remainder=(5,)) + assert d.edges == (10, 20, 10, 20, 10, 20, 5) + + def test_monthly_pattern(self) -> None: + """30 years of non-leap-year monthly chunks.""" + d = TiledDimension(pattern=MONTHS_365, repeats=30) + assert d.nchunks == 360 + assert d.extent == 365 * 30 + # January of year 0 + assert d.chunk_size(0) == 31 + assert d.chunk_offset(0) == 0 + # February of year 0 + assert d.chunk_size(1) == 28 + assert d.chunk_offset(1) == 31 + # January of year 1 + assert d.chunk_size(12) == 31 + assert d.chunk_offset(12) == 365 + # December of year 29 + assert d.chunk_size(359) == 31 + + def test_monthly_pattern_index_lookup(self) -> None: + """Verify index_to_chunk for day-of-year lookups across years.""" + d = TiledDimension(pattern=MONTHS_365, repeats=30) + # Day 0 of year 0 -> chunk 0 (January) + assert d.index_to_chunk(0) == 0 + # Day 31 of year 0 -> chunk 1 (February) + assert d.index_to_chunk(31) == 1 + # Day 0 of year 1 -> chunk 
12 + assert d.index_to_chunk(365) == 12 + # Day 31 of year 1 -> chunk 13 (February year 1) + assert d.index_to_chunk(365 + 31) == 13 + + def test_quasi_periodic_leap_years(self) -> None: + """4-year cycle with one leap year, repeated 7 times + 2-year remainder.""" + four_year = MONTHS_365 * 3 + MONTHS_366 # 48 months + d = TiledDimension( + pattern=four_year, + repeats=7, + remainder=MONTHS_365 * 2, # 2 extra non-leap years + ) + assert d.nchunks == 48 * 7 + 24 # 360 months = 30 years + four_year_days = 365 * 3 + 366 + assert d.extent == four_year_days * 7 + 365 * 2 + + def test_validation_empty_pattern(self) -> None: + with pytest.raises(ValueError, match="pattern must not be empty"): + TiledDimension(pattern=()) + + def test_validation_zero_repeats(self) -> None: + with pytest.raises(ValueError, match="repeats must be >= 1"): + TiledDimension(pattern=(10,), repeats=0) + + def test_validation_negative_edge(self) -> None: + with pytest.raises(ValueError, match="pattern edge lengths must be > 0"): + TiledDimension(pattern=(10, -5)) + + def test_validation_negative_remainder(self) -> None: + with pytest.raises(ValueError, match="remainder edge lengths must be > 0"): + TiledDimension(pattern=(10,), repeats=2, remainder=(-1,)) + + def test_consistency_with_varying(self) -> None: + """TiledDimension should produce identical results to VaryingDimension.""" + pattern = (10, 20, 30) + repeats = 4 + remainder = (15, 25) + tiled = TiledDimension(pattern=pattern, repeats=repeats, remainder=remainder) + expanded_edges = list(pattern) * repeats + list(remainder) + varying = VaryingDimension(expanded_edges) + + assert tiled.nchunks == varying.nchunks + assert tiled.extent == varying.extent + + for i in range(tiled.nchunks): + assert tiled.chunk_size(i) == varying.chunk_size(i) + assert tiled.data_size(i) == varying.data_size(i) + assert tiled.chunk_offset(i) == varying.chunk_offset(i) + + # Test index_to_chunk for all valid indices + for idx in range(tiled.extent): + assert 
tiled.index_to_chunk(idx) == varying.index_to_chunk(idx), f"idx={idx}" + + def test_consistency_vectorized(self) -> None: + """Vectorized indices_to_chunks matches scalar version.""" + d = TiledDimension(pattern=(10, 20, 30), repeats=5, remainder=(15,)) + all_indices = np.arange(d.extent, dtype=np.intp) + vectorized = d.indices_to_chunks(all_indices) + scalar = np.array([d.index_to_chunk(int(i)) for i in all_indices], dtype=np.intp) + np.testing.assert_array_equal(vectorized, scalar) + + +# --------------------------------------------------------------------------- +# Tile detection +# --------------------------------------------------------------------------- + + +class TestDetectTilePattern: + def test_simple_tile(self) -> None: + edges = [10, 20, 10, 20, 10, 20] + result = _detect_tile_pattern(edges) + assert result is not None + pattern, repeats, remainder = result + assert pattern == (10, 20) + assert repeats == 3 + assert remainder == () + + def test_tile_with_remainder(self) -> None: + edges = [10, 20, 30, 10, 20, 30, 10] + result = _detect_tile_pattern(edges) + assert result is not None + pattern, repeats, remainder = result + assert pattern == (10, 20, 30) + assert repeats == 2 + assert remainder == (10,) + + def test_no_pattern(self) -> None: + edges = [10, 20, 30, 40] + result = _detect_tile_pattern(edges) + assert result is None + + def test_too_short(self) -> None: + edges = [10, 20] + result = _detect_tile_pattern(edges) + assert result is None + + def test_monthly_pattern(self) -> None: + edges = list(MONTHS_365) * 10 + result = _detect_tile_pattern(edges) + assert result is not None + pattern, repeats, remainder = result + assert pattern == MONTHS_365 + assert repeats == 10 + assert remainder == () + + def test_rle_beats_tile(self) -> None: + """All-same values: RLE is better, tile should not be preferred.""" + edges = [10] * 20 + result = _detect_tile_pattern(edges) + # Tile might detect this with pattern=(10,) but we require pattern len >= 2 + # 
and repeats >= 2. (10,) repeated 20 times has pattern len 1, which our + # detector skips (starts at plen=2). RLE handles this better. + # The detector may find (10,10) x 10 — that's fine but suboptimal vs RLE. + # The serializer prefers tile only when it's more compact. + if result is not None: + pattern, _repeats, _remainder = result + assert len(pattern) >= 2 + + +# --------------------------------------------------------------------------- +# Tile serialization round-trip +# --------------------------------------------------------------------------- + + +class TestTileSerialization: + @staticmethod + def _get_chunk_shapes(grid: ChunkGrid) -> list[JSON]: + """Serialize grid and extract chunk_shapes list with type narrowing.""" + serialized = serialize_chunk_grid(grid, "rectilinear") + config = serialized["configuration"] + assert isinstance(config, dict) + chunk_shapes = config["chunk_shapes"] + assert isinstance(chunk_shapes, list) + return chunk_shapes + + def test_round_trip_tiled_dimension(self) -> None: + """TiledDimension serializes as tile dict and round-trips.""" + dim = TiledDimension(pattern=(10, 20, 30), repeats=4) + grid = ChunkGrid(dimensions=(dim,)) + chunk_shapes = self._get_chunk_shapes(grid) + # Should be a tile dict + first = chunk_shapes[0] + assert isinstance(first, dict) + assert first["tile"] == [10, 20, 30] + assert first["repeat"] == 4 + assert "remainder" not in first + + def test_round_trip_tiled_with_remainder(self) -> None: + dim = TiledDimension(pattern=(10, 20), repeats=3, remainder=(15,)) + grid = ChunkGrid(dimensions=(dim,)) + chunk_shapes = self._get_chunk_shapes(grid) + first = chunk_shapes[0] + assert isinstance(first, dict) + assert first["tile"] == [10, 20] + assert first["repeat"] == 3 + assert first["remainder"] == [15] + + def test_parse_tile_dict(self) -> None: + """Tile dict in chunk_shapes is parsed into TiledDimension.""" + metadata: dict[str, Any] = { + "name": "rectilinear", + "configuration": { + "chunk_shapes": [ + 
{"tile": [10, 20, 30], "repeat": 4}, + ] + }, + } + grid = parse_chunk_grid(metadata, (240,)) + assert len(grid.dimensions) == 1 + dim = grid.dimensions[0] + assert isinstance(dim, TiledDimension) + assert dim.pattern == (10, 20, 30) + assert dim.repeats == 4 + assert dim.remainder == () + + def test_parse_tile_dict_with_remainder(self) -> None: + metadata: dict[str, Any] = { + "name": "rectilinear", + "configuration": { + "chunk_shapes": [ + {"tile": [10, 20], "repeat": 3, "remainder": [15]}, + ] + }, + } + grid = parse_chunk_grid(metadata, (105,)) # (10+20)*3 + 15 = 105 + dim = grid.dimensions[0] + assert isinstance(dim, TiledDimension) + assert dim.pattern == (10, 20) + assert dim.repeats == 3 + assert dim.remainder == (15,) + + def test_full_round_trip(self) -> None: + """Serialize -> JSON -> parse -> verify structure preserved.""" + original = TiledDimension(pattern=MONTHS_365, repeats=30) + grid = ChunkGrid(dimensions=(original,)) + serialized = serialize_chunk_grid(grid, "rectilinear") + json_str = json.dumps(serialized) + parsed_dict = json.loads(json_str) + grid2 = parse_chunk_grid(parsed_dict, (365 * 30,)) + dim = grid2.dimensions[0] + assert isinstance(dim, TiledDimension) + assert dim.pattern == MONTHS_365 + assert dim.repeats == 30 + assert dim.extent == 365 * 30 + + def test_mixed_dimensions(self) -> None: + """Grid with tiled, fixed, and varying dimensions.""" + tiled = TiledDimension(pattern=(10, 20), repeats=5) + fixed = FixedDimension(size=100, extent=500) + varying = VaryingDimension([30, 40, 30]) + grid = ChunkGrid(dimensions=(tiled, fixed, varying)) + + assert not grid.is_regular + chunk_shapes = self._get_chunk_shapes(grid) + # Tiled -> dict + assert isinstance(chunk_shapes[0], dict) + # Fixed -> RLE + assert isinstance(chunk_shapes[1], list) + # Varying -> flat or RLE + assert isinstance(chunk_shapes[2], list) + + # Round-trip + serialized = serialize_chunk_grid(grid, "rectilinear") + grid2 = parse_chunk_grid(serialized, (150, 500, 100)) + 
assert isinstance(grid2.dimensions[0], TiledDimension) + assert isinstance(grid2.dimensions[1], FixedDimension) + + def test_varying_dimension_auto_tile_detection(self) -> None: + """VaryingDimension with a periodic pattern serializes as tile.""" + edges = list(MONTHS_365) * 10 # 120 edges, periodic + varying = VaryingDimension(edges) + grid = ChunkGrid(dimensions=(varying,)) + chunk_shapes = self._get_chunk_shapes(grid) + # Should detect the tile pattern and serialize as dict + first = chunk_shapes[0] + assert isinstance(first, dict) + assert tuple(first["tile"]) == MONTHS_365 + assert first["repeat"] == 10 + + def test_from_dict_tile(self) -> None: + """ChunkGrid.from_dict handles tile-encoded entries.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": { + "chunk_shapes": [ + {"tile": [10, 20], "repeat": 5}, + ] + }, + } + grid = ChunkGrid.from_dict(data) + dim = grid.dimensions[0] + assert isinstance(dim, TiledDimension) + assert dim.pattern == (10, 20) + assert dim.repeats == 5 + + +# --------------------------------------------------------------------------- +# TiledDimension in ChunkGrid queries +# --------------------------------------------------------------------------- + + +class TestTiledChunkGrid: + def test_getitem(self) -> None: + d = TiledDimension(pattern=(10, 20, 30), repeats=2) + grid = ChunkGrid(dimensions=(d,)) + spec = grid[(0,)] + assert spec is not None + assert spec.shape == (10,) + assert spec.codec_shape == (10,) + assert spec.slices == (slice(0, 10),) + + spec = grid[(1,)] + assert spec is not None + assert spec.shape == (20,) + assert spec.slices == (slice(10, 30),) + + spec = grid[(3,)] # period 1, chunk 0 + assert spec is not None + assert spec.shape == (10,) + assert spec.slices == (slice(60, 70),) + + def test_getitem_oob(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=2) + grid = ChunkGrid(dimensions=(d,)) + assert grid[(4,)] is None + assert grid[(-1,)] is None + + def test_grid_shape(self) -> 
None: + d = TiledDimension(pattern=(10, 20), repeats=3, remainder=(5,)) + grid = ChunkGrid(dimensions=(d,)) + assert grid.shape == (7,) + + def test_not_regular(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=3) + grid = ChunkGrid(dimensions=(d,)) + assert not grid.is_regular + + def test_iterate_all(self) -> None: + d = TiledDimension(pattern=(10, 20), repeats=2) + grid = ChunkGrid(dimensions=(d,)) + specs = list(grid) + assert len(specs) == 4 + assert specs[0].shape == (10,) + assert specs[1].shape == (20,) + assert specs[2].shape == (10,) + assert specs[3].shape == (20,) + + def test_2d_mixed(self) -> None: + """2D grid: tiled time dimension x fixed spatial dimension.""" + time_dim = TiledDimension(pattern=(10, 20), repeats=3) + space_dim = FixedDimension(size=100, extent=100) + grid = ChunkGrid(dimensions=(time_dim, space_dim)) + assert grid.shape == (6, 1) + + spec = grid[(0, 0)] + assert spec is not None + assert spec.shape == (10, 100) + + spec = grid[(1, 0)] + assert spec is not None + assert spec.shape == (20, 100) From 1e2fa976ca451e93c33be23563bbbe49ca12a8c8 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 10 Mar 2026 12:21:03 -0400 Subject: [PATCH 021/118] Support rectilinear shards --- src/zarr/core/array.py | 41 ++++++++++++++++++++------ src/zarr/core/chunk_grids.py | 2 +- tests/test_unified_chunk_grid.py | 49 ++++++++++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 11 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 24a50b5d60..944966aa72 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,7 +3,7 @@ import json import warnings from asyncio import gather -from collections.abc import Iterable, Mapping +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, field, replace from itertools import starmap from logging import getLogger @@ -139,7 +139,7 @@ from zarr.storage._utils import 
_relativize_path if TYPE_CHECKING: - from collections.abc import Iterator, Sequence + from collections.abc import Iterator from typing import Self import numpy.typing as npt @@ -4245,7 +4245,9 @@ class ShardsConfigParam(TypedDict): index_location: ShardingCodecIndexLocation | None -ShardsLike: TypeAlias = tuple[int, ...] | ShardsConfigParam | Literal["auto"] +ShardsLike: TypeAlias = ( + tuple[int, ...] | Sequence[Sequence[int]] | ShardsConfigParam | Literal["auto"] +) async def from_array( @@ -4639,15 +4641,22 @@ async def init_array( else: await ensure_no_existing_node(store_path, zarr_format=zarr_format) - # Detect rectilinear (nested list) chunks, e.g. [[10, 20, 30], [25, 25]] + # Detect rectilinear (nested list) chunks or shards, e.g. [[10, 20, 30], [25, 25]] from zarr.core.chunk_grids import _is_rectilinear_chunks rectilinear_grid: ChunkGrid | None = None - if _is_rectilinear_chunks(chunks): + rectilinear_shards = _is_rectilinear_chunks(shards) + rectilinear_chunks = _is_rectilinear_chunks(chunks) + + if rectilinear_chunks: if zarr_format == 2: raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") if shards is not None: - raise ValueError("Rectilinear chunk grids do not support sharding.") + raise ValueError( + "Rectilinear chunks with sharding is not supported. " + "Use rectilinear shards instead: " + "chunks=(inner_size, ...), shards=[[shard_sizes], ...]" + ) rect_chunks = cast("Sequence[Sequence[int]]", chunks) rectilinear_grid = ChunkGrid.from_rectilinear(rect_chunks) # Use first chunk size per dim as placeholder for _auto_partition @@ -4657,13 +4666,24 @@ async def init_array( else: chunks_flat = cast("tuple[int, ...] 
| Literal['auto']", chunks) + # Handle rectilinear shards: shards=[[60, 40, 20], [50, 50]] + # means variable-sized shard boundaries with uniform inner chunks + shards_for_partition: ShardsLike | None = shards + if rectilinear_shards: + if zarr_format == 2: + raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") + rect_shards = cast("Sequence[Sequence[int]]", shards) + rectilinear_grid = ChunkGrid.from_rectilinear(rect_shards) + # Use first shard size per dim as placeholder for _auto_partition + shards_for_partition = tuple(dim_edges[0] for dim_edges in rect_shards) + item_size = 1 if isinstance(zdtype, HasItemSize): item_size = zdtype.item_size shard_shape_parsed, chunk_shape_parsed = _auto_partition( array_shape=shape_parsed, - shard_shape=shards, + shard_shape=shards_for_partition, chunk_shape=chunks_flat, item_size=item_size, ) @@ -4720,10 +4740,15 @@ async def init_array( sharding_codec = ShardingCodec( chunk_shape=chunk_shape_parsed, codecs=sub_codecs, index_location=index_location ) + # Use rectilinear grid for validation when shards are rectilinear + if rectilinear_shards and rectilinear_grid is not None: + validation_grid = rectilinear_grid + else: + validation_grid = ChunkGrid.from_regular(shape_parsed, shard_shape_parsed) sharding_codec.validate( shape=chunk_shape_parsed, dtype=zdtype, - chunk_grid=ChunkGrid.from_regular(shape_parsed, shard_shape_parsed), + chunk_grid=validation_grid, ) codecs_out = (sharding_codec,) chunks_out = shard_shape_parsed diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index b88396a66f..04c1b5ba6d 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -998,6 +998,6 @@ def _auto_partition( elif isinstance(shard_shape, dict): _shards_out = tuple(shard_shape["shape"]) else: - _shards_out = shard_shape + _shards_out = cast("tuple[int, ...]", shard_shape) return _shards_out, _chunks_out diff --git a/tests/test_unified_chunk_grid.py 
b/tests/test_unified_chunk_grid.py index 234c2acc0a..e0b04e46a0 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -1352,8 +1352,9 @@ def test_v2_rejects_rectilinear(self, tmp_path: Path) -> None: zarr_format=2, ) - def test_sharding_rejects_rectilinear(self, tmp_path: Path) -> None: - with pytest.raises(ValueError, match="sharding"): + def test_sharding_rejects_rectilinear_chunks_with_shards(self, tmp_path: Path) -> None: + """Rectilinear chunks (inner) with sharding is not supported.""" + with pytest.raises(ValueError, match="Rectilinear chunks with sharding"): zarr.create_array( store=tmp_path / "shard.zarr", shape=(60, 100), @@ -1362,6 +1363,50 @@ def test_sharding_rejects_rectilinear(self, tmp_path: Path) -> None: dtype="int32", ) + def test_rectilinear_shards_roundtrip(self, tmp_path: Path) -> None: + """Rectilinear shards with uniform inner chunks: full write/read roundtrip.""" + import numpy as np + + data = np.arange(120 * 100, dtype="int32").reshape(120, 100) + arr = zarr.create_array( + store=tmp_path / "rect_shards.zarr", + shape=(120, 100), + chunks=(10, 10), # uniform inner chunks + shards=[[60, 40, 20], [50, 50]], # rectilinear shard boundaries + dtype="int32", + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, data) + + def test_rectilinear_shards_partial_read(self, tmp_path: Path) -> None: + """Partial reads across rectilinear shard boundaries.""" + import numpy as np + + data = np.arange(120 * 100, dtype="float64").reshape(120, 100) + arr = zarr.create_array( + store=tmp_path / "rect_shards.zarr", + shape=(120, 100), + chunks=(10, 10), + shards=[[60, 40, 20], [50, 50]], + dtype="float64", + ) + arr[:] = data + # Read a slice crossing shard boundaries + result = arr[50:70, 40:60] + np.testing.assert_array_equal(result, data[50:70, 40:60]) + + def test_rectilinear_shards_validates_divisibility(self, tmp_path: Path) -> None: + """Inner chunk_shape must divide every shard's dimensions.""" 
+ with pytest.raises(ValueError, match="divisible"): + zarr.create_array( + store=tmp_path / "bad.zarr", + shape=(120, 100), + chunks=(10, 10), + shards=[[60, 45, 15], [50, 50]], # 45 not divisible by 10 + dtype="int32", + ) + def test_nchunks(self, tmp_path: Path) -> None: z, _ = self._make_2d(tmp_path) assert z.metadata.chunk_grid.get_nchunks() == 12 From 1f424b07510987e11b41cba48a21048ab391d8b6 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 10:21:26 -0400 Subject: [PATCH 022/118] Revert "POC: TiledDimension" This reverts commit 9c0f582f40a27eae1b75501f197cf9a52d8f453b. --- src/zarr/core/chunk_grids.py | 278 +++---------------- tests/test_unified_chunk_grid.py | 455 ------------------------------- 2 files changed, 33 insertions(+), 700 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 04c1b5ba6d..a160fa2d5a 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -125,131 +125,9 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int return np.searchsorted(self.cumulative, indices, side="right") -@dataclass(frozen=True) -class TiledDimension: - """Periodic chunk pattern repeated N times, with an optional trailing remainder. - - Exploits periodicity for O(1) chunk_offset/chunk_size and O(log pattern_len) - index_to_chunk, regardless of total chunk count. Memory is O(pattern_len) - instead of O(n_chunks). - - Example: 30 years of monthly chunks (days per month): - TiledDimension(pattern=(31,28,31,30,31,30,31,31,30,31,30,31), repeats=30) - """ - - pattern: tuple[int, ...] # one period's edge lengths (all > 0) - repeats: int # number of full repetitions (>= 1) - remainder: tuple[int, ...] # trailing partial period (all > 0, may be empty) - - # Precomputed - _pattern_cumulative: tuple[int, ...] 
# prefix sums within one period - _period_extent: int # sum(pattern) - _pattern_nchunks: int # len(pattern) - _remainder_cumulative: tuple[int, ...] # prefix sums of remainder - _total_nchunks: int - _total_extent: int - - def __init__( - self, - pattern: Sequence[int], - repeats: int = 1, - remainder: Sequence[int] = (), - ) -> None: - pattern_t = tuple(pattern) - remainder_t = tuple(remainder) - if not pattern_t: - raise ValueError("TiledDimension pattern must not be empty") - if repeats < 1: - raise ValueError(f"TiledDimension repeats must be >= 1, got {repeats}") - if any(e <= 0 for e in pattern_t): - raise ValueError(f"All pattern edge lengths must be > 0, got {pattern_t}") - if any(e <= 0 for e in remainder_t): - raise ValueError(f"All remainder edge lengths must be > 0, got {remainder_t}") - - pattern_cum = tuple(itertools.accumulate(pattern_t)) - period_extent = pattern_cum[-1] - remainder_cum = tuple(itertools.accumulate(remainder_t)) if remainder_t else () - total_nchunks = len(pattern_t) * repeats + len(remainder_t) - total_extent = period_extent * repeats + (remainder_cum[-1] if remainder_cum else 0) - - object.__setattr__(self, "pattern", pattern_t) - object.__setattr__(self, "repeats", repeats) - object.__setattr__(self, "remainder", remainder_t) - object.__setattr__(self, "_pattern_cumulative", pattern_cum) - object.__setattr__(self, "_period_extent", period_extent) - object.__setattr__(self, "_pattern_nchunks", len(pattern_t)) - object.__setattr__(self, "_remainder_cumulative", remainder_cum) - object.__setattr__(self, "_total_nchunks", total_nchunks) - object.__setattr__(self, "_total_extent", total_extent) - - @property - def nchunks(self) -> int: - return self._total_nchunks - - @property - def extent(self) -> int: - return self._total_extent - - def chunk_offset(self, chunk_ix: int) -> int: - period, offset = divmod(chunk_ix, self._pattern_nchunks) - if period < self.repeats: - base = period * self._period_extent - return base + 
(self._pattern_cumulative[offset - 1] if offset > 0 else 0) - # In the remainder - rem_ix = chunk_ix - self.repeats * self._pattern_nchunks - return self.repeats * self._period_extent + ( - self._remainder_cumulative[rem_ix - 1] if rem_ix > 0 else 0 - ) - - def chunk_size(self, chunk_ix: int) -> int: - """Buffer size for codec processing.""" - period, offset = divmod(chunk_ix, self._pattern_nchunks) - if period < self.repeats: - return self.pattern[offset] - return self.remainder[chunk_ix - self.repeats * self._pattern_nchunks] - - def data_size(self, chunk_ix: int) -> int: - """Valid data region — same as chunk_size for tiled dims.""" - return self.chunk_size(chunk_ix) - - def index_to_chunk(self, idx: int) -> int: - period, within = divmod(idx, self._period_extent) - if period < self.repeats: - local = bisect.bisect_right(self._pattern_cumulative, within) - return period * self._pattern_nchunks + local - # In the remainder region - rem_idx = idx - self.repeats * self._period_extent - local = bisect.bisect_right(self._remainder_cumulative, rem_idx) - return self.repeats * self._pattern_nchunks + local - - def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: - periods, withins = np.divmod(indices, self._period_extent) - result = np.empty_like(indices) - - # Chunks in the repeating region - in_repeat = periods < self.repeats - if np.any(in_repeat): - local = np.searchsorted(self._pattern_cumulative, withins[in_repeat], side="right") - result[in_repeat] = periods[in_repeat] * self._pattern_nchunks + local - - # Chunks in the remainder region - in_remainder = ~in_repeat - if np.any(in_remainder) and self._remainder_cumulative: - rem_indices = indices[in_remainder] - self.repeats * self._period_extent - local = np.searchsorted(self._remainder_cumulative, rem_indices, side="right") - result[in_remainder] = self.repeats * self._pattern_nchunks + local - - return result - - @property - def edges(self) -> tuple[int, ...]: - """Expand to full 
edge list (for compatibility with VaryingDimension).""" - return self.pattern * self.repeats + self.remainder - - @runtime_checkable class DimensionGrid(Protocol): - """Structural interface shared by FixedDimension, VaryingDimension, and TiledDimension.""" + """Structural interface shared by FixedDimension and VaryingDimension.""" @property def nchunks(self) -> int: ... @@ -324,48 +202,6 @@ def _compress_rle(sizes: Sequence[int]) -> list[list[int]]: return result -# --------------------------------------------------------------------------- -# Tile helpers -# --------------------------------------------------------------------------- - - -def _detect_tile_pattern( - edges: Sequence[int], -) -> tuple[tuple[int, ...], int, tuple[int, ...]] | None: - """Detect the shortest repeating tile pattern in an edge list. - - Returns (pattern, repeats, remainder) if a tile pattern saves space over - the flat representation, otherwise None. - - A pattern must repeat at least 2 times to qualify. - """ - n = len(edges) - if n < 4: - return None - - # Try pattern lengths from 2 up to n//2 - for plen in range(2, n // 2 + 1): - pattern = tuple(edges[:plen]) - full_repeats = n // plen - if full_repeats < 2: - break - # Check all full repetitions match - match = True - for r in range(1, full_repeats): - start = r * plen - if tuple(edges[start : start + plen]) != pattern: - match = False - break - if not match: - continue - remainder = tuple(edges[full_repeats * plen :]) - # Only use tile if it's more compact: pattern + remainder < flat list - tile_cost = plen + len(remainder) + 2 # +2 for repeats field + overhead - if tile_cost < n: - return pattern, full_repeats, remainder - return None - - # --------------------------------------------------------------------------- # Unified ChunkGrid # --------------------------------------------------------------------------- @@ -402,9 +238,8 @@ class ChunkGrid: It stores the extent (array dimension length) per dimension, enabling ``grid[coords]`` 
to return a ``ChunkSpec`` without external parameters. - Internally represents each dimension as FixedDimension (uniform chunks), - VaryingDimension (per-chunk edge lengths with prefix sums), or - TiledDimension (periodic pattern repeated N times). + Internally represents each dimension as either FixedDimension (uniform chunks) + or VaryingDimension (per-chunk edge lengths with prefix sums). """ dimensions: tuple[DimensionGrid, ...] @@ -539,11 +374,15 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") if not isinstance(chunk_shapes_raw, Sequence): raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - dims_list: list[DimensionGrid] = [] + decoded: list[list[int]] = [] for dim_spec in chunk_shapes_raw: - parsed = _parse_dim_spec(dim_spec) - dims_list.append(_build_dimension(parsed)) - return cls(dimensions=tuple(dims_list)) + if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): + decoded.append(_expand_rle(dim_spec)) + elif isinstance(dim_spec, list): + decoded.append(dim_spec) + else: + raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + return cls.from_rectilinear(decoded) raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") @@ -551,41 +390,6 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> # "rectilinear") belongs to the metadata layer. Use serialize_chunk_grid(). -def _parse_dim_spec(dim_spec: Any) -> list[int] | TiledDimension: - """Parse a single dimension's chunk_shapes entry. - - Returns either a flat edge list or a TiledDimension (for tile-encoded entries). - Handles: flat list of ints, RLE ([[size, count], ...]), and tile dicts. 
- """ - if isinstance(dim_spec, dict): - # Tile encoding: {"tile": [...], "repeat": N, "remainder": [...]} - tile_pattern = dim_spec.get("tile") - if tile_pattern is None: - raise ValueError(f"Tile-encoded dim_spec must have 'tile' key, got {dim_spec}") - repeat = dim_spec.get("repeat", 1) - remainder = dim_spec.get("remainder", []) - return TiledDimension( - pattern=tile_pattern, - repeats=repeat, - remainder=remainder, - ) - if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): - return _expand_rle(dim_spec) - if isinstance(dim_spec, list): - return dim_spec - raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") - - -def _build_dimension(dim_spec_parsed: list[int] | TiledDimension) -> DimensionGrid: - """Build a DimensionGrid from a parsed dim spec.""" - if isinstance(dim_spec_parsed, TiledDimension): - return dim_spec_parsed - edges = dim_spec_parsed - if all(e == edges[0] for e in edges): - return FixedDimension(size=edges[0], extent=sum(edges)) - return VaryingDimension(edges) - - def parse_chunk_grid( data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], array_shape: tuple[int, ...], @@ -603,11 +407,10 @@ def parse_chunk_grid( if isinstance(dim, FixedDimension): dims.append(FixedDimension(size=dim.size, extent=extent)) else: - # VaryingDimension/TiledDimension have intrinsic extent — validate + # VaryingDimension has intrinsic extent — validate it matches if dim.extent != extent: - dim_type = type(dim).__name__ raise ValueError( - f"{dim_type} extent {dim.extent} does not match " + f"VaryingDimension extent {dim.extent} does not match " f"array shape extent {extent} for dimension {len(dims)}" ) dims.append(dim) @@ -629,22 +432,27 @@ def parse_chunk_grid( raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") if not isinstance(chunk_shapes_raw, Sequence): raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - if len(chunk_shapes_raw) != len(array_shape): + decoded: 
list[list[int]] = [] + for dim_spec in chunk_shapes_raw: + if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): + decoded.append(_expand_rle(dim_spec)) + elif isinstance(dim_spec, list): + decoded.append(dim_spec) + else: + raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + if len(decoded) != len(array_shape): raise ValueError( - f"chunk_shapes has {len(chunk_shapes_raw)} dimensions but array shape " + f"chunk_shapes has {len(decoded)} dimensions but array shape " f"has {len(array_shape)} dimensions" ) - dims_built: list[DimensionGrid] = [] - for i, dim_spec in enumerate(chunk_shapes_raw): - parsed = _parse_dim_spec(dim_spec) - dim = _build_dimension(parsed) - if dim.extent != array_shape[i]: + for i, (edges, extent) in enumerate(zip(decoded, array_shape, strict=True)): + edge_sum = sum(edges) + if edge_sum != extent: raise ValueError( - f"Rectilinear chunk edges for dimension {i} sum to {dim.extent} " - f"but array shape extent is {array_shape[i]}" + f"Rectilinear chunk edges for dimension {i} sum to {edge_sum} " + f"but array shape extent is {extent}" ) - dims_built.append(dim) - return ChunkGrid(dimensions=tuple(dims_built)) + return ChunkGrid.from_rectilinear(decoded) raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") @@ -683,33 +491,13 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: rle.append([dim.size, n - 1]) rle.append([last_data, 1]) chunk_shapes.append(rle) - elif isinstance(dim, TiledDimension): - tile_dict: dict[str, Any] = { - "tile": list(dim.pattern), - "repeat": dim.repeats, - } - if dim.remainder: - tile_dict["remainder"] = list(dim.remainder) - chunk_shapes.append(tile_dict) elif isinstance(dim, VaryingDimension): edges = list(dim.edges) - # Try tile compression first (more compact for periodic patterns) - tile_result = _detect_tile_pattern(edges) - if tile_result is not None: - pattern, repeats, remainder = tile_result - tile_dict_v: dict[str, Any] = { - "tile": 
list(pattern), - "repeat": repeats, - } - if remainder: - tile_dict_v["remainder"] = list(remainder) - chunk_shapes.append(tile_dict_v) + rle = _compress_rle(edges) + if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): + chunk_shapes.append(rle) else: - rle = _compress_rle(edges) - if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): - chunk_shapes.append(rle) - else: - chunk_shapes.append(edges) + chunk_shapes.append(edges) return { "name": "rectilinear", "configuration": {"chunk_shapes": chunk_shapes}, diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index e0b04e46a0..281dc0ebef 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -26,10 +26,8 @@ ChunkSpec, FixedDimension, RegularChunkGrid, - TiledDimension, VaryingDimension, _compress_rle, - _detect_tile_pattern, _expand_rle, parse_chunk_grid, serialize_chunk_grid, @@ -1639,456 +1637,3 @@ def test_from_dict_regular_is_regular_chunk_grid(self) -> None: """ChunkGrid.from_dict for regular grids returns a RegularChunkGrid.""" g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) assert isinstance(g, RegularChunkGrid) - - -# --------------------------------------------------------------------------- -# TiledDimension -# --------------------------------------------------------------------------- - -# Days per month (non-leap year) -MONTHS_365 = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) -# Days per month (leap year) -MONTHS_366 = (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) - - -class TestTiledDimension: - def test_basic(self) -> None: - d = TiledDimension(pattern=(10, 20, 30), repeats=3) - assert d.nchunks == 9 - assert d.extent == 180 # (10+20+30) * 3 - - def test_with_remainder(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(15,)) - assert d.nchunks == 5 # 2*2 + 1 - assert d.extent == 75 # (10+20)*2 + 15 - - def test_chunk_size(self) 
-> None: - d = TiledDimension(pattern=(10, 20, 30), repeats=3) - # Period 0 - assert d.chunk_size(0) == 10 - assert d.chunk_size(1) == 20 - assert d.chunk_size(2) == 30 - # Period 1 (same pattern) - assert d.chunk_size(3) == 10 - assert d.chunk_size(4) == 20 - assert d.chunk_size(5) == 30 - # Period 2 - assert d.chunk_size(6) == 10 - assert d.chunk_size(7) == 20 - assert d.chunk_size(8) == 30 - - def test_chunk_size_with_remainder(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(7, 3)) - assert d.chunk_size(0) == 10 - assert d.chunk_size(1) == 20 - assert d.chunk_size(2) == 10 - assert d.chunk_size(3) == 20 - assert d.chunk_size(4) == 7 # remainder - assert d.chunk_size(5) == 3 # remainder - - def test_data_size_equals_chunk_size(self) -> None: - d = TiledDimension(pattern=(10, 20, 30), repeats=2) - for i in range(d.nchunks): - assert d.data_size(i) == d.chunk_size(i) - - def test_chunk_offset(self) -> None: - d = TiledDimension(pattern=(10, 20, 30), repeats=2) - assert d.chunk_offset(0) == 0 - assert d.chunk_offset(1) == 10 - assert d.chunk_offset(2) == 30 - assert d.chunk_offset(3) == 60 # period 1 starts - assert d.chunk_offset(4) == 70 - assert d.chunk_offset(5) == 90 - - def test_chunk_offset_with_remainder(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(5,)) - assert d.chunk_offset(0) == 0 - assert d.chunk_offset(1) == 10 - assert d.chunk_offset(2) == 30 - assert d.chunk_offset(3) == 40 - assert d.chunk_offset(4) == 60 # remainder starts - - def test_index_to_chunk(self) -> None: - d = TiledDimension(pattern=(10, 20, 30), repeats=2) - # Period 0 - assert d.index_to_chunk(0) == 0 - assert d.index_to_chunk(9) == 0 - assert d.index_to_chunk(10) == 1 - assert d.index_to_chunk(29) == 1 - assert d.index_to_chunk(30) == 2 - assert d.index_to_chunk(59) == 2 - # Period 1 - assert d.index_to_chunk(60) == 3 - assert d.index_to_chunk(69) == 3 - assert d.index_to_chunk(70) == 4 - assert d.index_to_chunk(119) == 5 - - 
def test_index_to_chunk_with_remainder(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(5,)) - assert d.index_to_chunk(60) == 4 # first element of remainder - assert d.index_to_chunk(64) == 4 # last element of remainder - - def test_indices_to_chunks_vectorized(self) -> None: - d = TiledDimension(pattern=(10, 20, 30), repeats=2) - indices = np.array([0, 9, 10, 29, 30, 59, 60, 69, 70, 119], dtype=np.intp) - expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 5], dtype=np.intp) - result = d.indices_to_chunks(indices) - np.testing.assert_array_equal(result, expected) - - def test_indices_to_chunks_with_remainder(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=2, remainder=(5,)) - indices = np.array([0, 10, 30, 40, 60, 64], dtype=np.intp) - expected = np.array([0, 1, 2, 3, 4, 4], dtype=np.intp) - result = d.indices_to_chunks(indices) - np.testing.assert_array_equal(result, expected) - - def test_edges_property(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=3, remainder=(5,)) - assert d.edges == (10, 20, 10, 20, 10, 20, 5) - - def test_monthly_pattern(self) -> None: - """30 years of non-leap-year monthly chunks.""" - d = TiledDimension(pattern=MONTHS_365, repeats=30) - assert d.nchunks == 360 - assert d.extent == 365 * 30 - # January of year 0 - assert d.chunk_size(0) == 31 - assert d.chunk_offset(0) == 0 - # February of year 0 - assert d.chunk_size(1) == 28 - assert d.chunk_offset(1) == 31 - # January of year 1 - assert d.chunk_size(12) == 31 - assert d.chunk_offset(12) == 365 - # December of year 29 - assert d.chunk_size(359) == 31 - - def test_monthly_pattern_index_lookup(self) -> None: - """Verify index_to_chunk for day-of-year lookups across years.""" - d = TiledDimension(pattern=MONTHS_365, repeats=30) - # Day 0 of year 0 -> chunk 0 (January) - assert d.index_to_chunk(0) == 0 - # Day 31 of year 0 -> chunk 1 (February) - assert d.index_to_chunk(31) == 1 - # Day 0 of year 1 -> chunk 12 - assert 
d.index_to_chunk(365) == 12 - # Day 31 of year 1 -> chunk 13 (February year 1) - assert d.index_to_chunk(365 + 31) == 13 - - def test_quasi_periodic_leap_years(self) -> None: - """4-year cycle with one leap year, repeated 7 times + 2-year remainder.""" - four_year = MONTHS_365 * 3 + MONTHS_366 # 48 months - d = TiledDimension( - pattern=four_year, - repeats=7, - remainder=MONTHS_365 * 2, # 2 extra non-leap years - ) - assert d.nchunks == 48 * 7 + 24 # 360 months = 30 years - four_year_days = 365 * 3 + 366 - assert d.extent == four_year_days * 7 + 365 * 2 - - def test_validation_empty_pattern(self) -> None: - with pytest.raises(ValueError, match="pattern must not be empty"): - TiledDimension(pattern=()) - - def test_validation_zero_repeats(self) -> None: - with pytest.raises(ValueError, match="repeats must be >= 1"): - TiledDimension(pattern=(10,), repeats=0) - - def test_validation_negative_edge(self) -> None: - with pytest.raises(ValueError, match="pattern edge lengths must be > 0"): - TiledDimension(pattern=(10, -5)) - - def test_validation_negative_remainder(self) -> None: - with pytest.raises(ValueError, match="remainder edge lengths must be > 0"): - TiledDimension(pattern=(10,), repeats=2, remainder=(-1,)) - - def test_consistency_with_varying(self) -> None: - """TiledDimension should produce identical results to VaryingDimension.""" - pattern = (10, 20, 30) - repeats = 4 - remainder = (15, 25) - tiled = TiledDimension(pattern=pattern, repeats=repeats, remainder=remainder) - expanded_edges = list(pattern) * repeats + list(remainder) - varying = VaryingDimension(expanded_edges) - - assert tiled.nchunks == varying.nchunks - assert tiled.extent == varying.extent - - for i in range(tiled.nchunks): - assert tiled.chunk_size(i) == varying.chunk_size(i) - assert tiled.data_size(i) == varying.data_size(i) - assert tiled.chunk_offset(i) == varying.chunk_offset(i) - - # Test index_to_chunk for all valid indices - for idx in range(tiled.extent): - assert 
tiled.index_to_chunk(idx) == varying.index_to_chunk(idx), f"idx={idx}" - - def test_consistency_vectorized(self) -> None: - """Vectorized indices_to_chunks matches scalar version.""" - d = TiledDimension(pattern=(10, 20, 30), repeats=5, remainder=(15,)) - all_indices = np.arange(d.extent, dtype=np.intp) - vectorized = d.indices_to_chunks(all_indices) - scalar = np.array([d.index_to_chunk(int(i)) for i in all_indices], dtype=np.intp) - np.testing.assert_array_equal(vectorized, scalar) - - -# --------------------------------------------------------------------------- -# Tile detection -# --------------------------------------------------------------------------- - - -class TestDetectTilePattern: - def test_simple_tile(self) -> None: - edges = [10, 20, 10, 20, 10, 20] - result = _detect_tile_pattern(edges) - assert result is not None - pattern, repeats, remainder = result - assert pattern == (10, 20) - assert repeats == 3 - assert remainder == () - - def test_tile_with_remainder(self) -> None: - edges = [10, 20, 30, 10, 20, 30, 10] - result = _detect_tile_pattern(edges) - assert result is not None - pattern, repeats, remainder = result - assert pattern == (10, 20, 30) - assert repeats == 2 - assert remainder == (10,) - - def test_no_pattern(self) -> None: - edges = [10, 20, 30, 40] - result = _detect_tile_pattern(edges) - assert result is None - - def test_too_short(self) -> None: - edges = [10, 20] - result = _detect_tile_pattern(edges) - assert result is None - - def test_monthly_pattern(self) -> None: - edges = list(MONTHS_365) * 10 - result = _detect_tile_pattern(edges) - assert result is not None - pattern, repeats, remainder = result - assert pattern == MONTHS_365 - assert repeats == 10 - assert remainder == () - - def test_rle_beats_tile(self) -> None: - """All-same values: RLE is better, tile should not be preferred.""" - edges = [10] * 20 - result = _detect_tile_pattern(edges) - # Tile might detect this with pattern=(10,) but we require pattern len >= 2 - # 
and repeats >= 2. (10,) repeated 20 times has pattern len 1, which our - # detector skips (starts at plen=2). RLE handles this better. - # The detector may find (10,10) x 10 — that's fine but suboptimal vs RLE. - # The serializer prefers tile only when it's more compact. - if result is not None: - pattern, _repeats, _remainder = result - assert len(pattern) >= 2 - - -# --------------------------------------------------------------------------- -# Tile serialization round-trip -# --------------------------------------------------------------------------- - - -class TestTileSerialization: - @staticmethod - def _get_chunk_shapes(grid: ChunkGrid) -> list[JSON]: - """Serialize grid and extract chunk_shapes list with type narrowing.""" - serialized = serialize_chunk_grid(grid, "rectilinear") - config = serialized["configuration"] - assert isinstance(config, dict) - chunk_shapes = config["chunk_shapes"] - assert isinstance(chunk_shapes, list) - return chunk_shapes - - def test_round_trip_tiled_dimension(self) -> None: - """TiledDimension serializes as tile dict and round-trips.""" - dim = TiledDimension(pattern=(10, 20, 30), repeats=4) - grid = ChunkGrid(dimensions=(dim,)) - chunk_shapes = self._get_chunk_shapes(grid) - # Should be a tile dict - first = chunk_shapes[0] - assert isinstance(first, dict) - assert first["tile"] == [10, 20, 30] - assert first["repeat"] == 4 - assert "remainder" not in first - - def test_round_trip_tiled_with_remainder(self) -> None: - dim = TiledDimension(pattern=(10, 20), repeats=3, remainder=(15,)) - grid = ChunkGrid(dimensions=(dim,)) - chunk_shapes = self._get_chunk_shapes(grid) - first = chunk_shapes[0] - assert isinstance(first, dict) - assert first["tile"] == [10, 20] - assert first["repeat"] == 3 - assert first["remainder"] == [15] - - def test_parse_tile_dict(self) -> None: - """Tile dict in chunk_shapes is parsed into TiledDimension.""" - metadata: dict[str, Any] = { - "name": "rectilinear", - "configuration": { - "chunk_shapes": [ - 
{"tile": [10, 20, 30], "repeat": 4}, - ] - }, - } - grid = parse_chunk_grid(metadata, (240,)) - assert len(grid.dimensions) == 1 - dim = grid.dimensions[0] - assert isinstance(dim, TiledDimension) - assert dim.pattern == (10, 20, 30) - assert dim.repeats == 4 - assert dim.remainder == () - - def test_parse_tile_dict_with_remainder(self) -> None: - metadata: dict[str, Any] = { - "name": "rectilinear", - "configuration": { - "chunk_shapes": [ - {"tile": [10, 20], "repeat": 3, "remainder": [15]}, - ] - }, - } - grid = parse_chunk_grid(metadata, (105,)) # (10+20)*3 + 15 = 105 - dim = grid.dimensions[0] - assert isinstance(dim, TiledDimension) - assert dim.pattern == (10, 20) - assert dim.repeats == 3 - assert dim.remainder == (15,) - - def test_full_round_trip(self) -> None: - """Serialize -> JSON -> parse -> verify structure preserved.""" - original = TiledDimension(pattern=MONTHS_365, repeats=30) - grid = ChunkGrid(dimensions=(original,)) - serialized = serialize_chunk_grid(grid, "rectilinear") - json_str = json.dumps(serialized) - parsed_dict = json.loads(json_str) - grid2 = parse_chunk_grid(parsed_dict, (365 * 30,)) - dim = grid2.dimensions[0] - assert isinstance(dim, TiledDimension) - assert dim.pattern == MONTHS_365 - assert dim.repeats == 30 - assert dim.extent == 365 * 30 - - def test_mixed_dimensions(self) -> None: - """Grid with tiled, fixed, and varying dimensions.""" - tiled = TiledDimension(pattern=(10, 20), repeats=5) - fixed = FixedDimension(size=100, extent=500) - varying = VaryingDimension([30, 40, 30]) - grid = ChunkGrid(dimensions=(tiled, fixed, varying)) - - assert not grid.is_regular - chunk_shapes = self._get_chunk_shapes(grid) - # Tiled -> dict - assert isinstance(chunk_shapes[0], dict) - # Fixed -> RLE - assert isinstance(chunk_shapes[1], list) - # Varying -> flat or RLE - assert isinstance(chunk_shapes[2], list) - - # Round-trip - serialized = serialize_chunk_grid(grid, "rectilinear") - grid2 = parse_chunk_grid(serialized, (150, 500, 100)) - 
assert isinstance(grid2.dimensions[0], TiledDimension) - assert isinstance(grid2.dimensions[1], FixedDimension) - - def test_varying_dimension_auto_tile_detection(self) -> None: - """VaryingDimension with a periodic pattern serializes as tile.""" - edges = list(MONTHS_365) * 10 # 120 edges, periodic - varying = VaryingDimension(edges) - grid = ChunkGrid(dimensions=(varying,)) - chunk_shapes = self._get_chunk_shapes(grid) - # Should detect the tile pattern and serialize as dict - first = chunk_shapes[0] - assert isinstance(first, dict) - assert tuple(first["tile"]) == MONTHS_365 - assert first["repeat"] == 10 - - def test_from_dict_tile(self) -> None: - """ChunkGrid.from_dict handles tile-encoded entries.""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": { - "chunk_shapes": [ - {"tile": [10, 20], "repeat": 5}, - ] - }, - } - grid = ChunkGrid.from_dict(data) - dim = grid.dimensions[0] - assert isinstance(dim, TiledDimension) - assert dim.pattern == (10, 20) - assert dim.repeats == 5 - - -# --------------------------------------------------------------------------- -# TiledDimension in ChunkGrid queries -# --------------------------------------------------------------------------- - - -class TestTiledChunkGrid: - def test_getitem(self) -> None: - d = TiledDimension(pattern=(10, 20, 30), repeats=2) - grid = ChunkGrid(dimensions=(d,)) - spec = grid[(0,)] - assert spec is not None - assert spec.shape == (10,) - assert spec.codec_shape == (10,) - assert spec.slices == (slice(0, 10),) - - spec = grid[(1,)] - assert spec is not None - assert spec.shape == (20,) - assert spec.slices == (slice(10, 30),) - - spec = grid[(3,)] # period 1, chunk 0 - assert spec is not None - assert spec.shape == (10,) - assert spec.slices == (slice(60, 70),) - - def test_getitem_oob(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=2) - grid = ChunkGrid(dimensions=(d,)) - assert grid[(4,)] is None - assert grid[(-1,)] is None - - def test_grid_shape(self) -> 
None: - d = TiledDimension(pattern=(10, 20), repeats=3, remainder=(5,)) - grid = ChunkGrid(dimensions=(d,)) - assert grid.shape == (7,) - - def test_not_regular(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=3) - grid = ChunkGrid(dimensions=(d,)) - assert not grid.is_regular - - def test_iterate_all(self) -> None: - d = TiledDimension(pattern=(10, 20), repeats=2) - grid = ChunkGrid(dimensions=(d,)) - specs = list(grid) - assert len(specs) == 4 - assert specs[0].shape == (10,) - assert specs[1].shape == (20,) - assert specs[2].shape == (10,) - assert specs[3].shape == (20,) - - def test_2d_mixed(self) -> None: - """2D grid: tiled time dimension x fixed spatial dimension.""" - time_dim = TiledDimension(pattern=(10, 20), repeats=3) - space_dim = FixedDimension(size=100, extent=100) - grid = ChunkGrid(dimensions=(time_dim, space_dim)) - assert grid.shape == (6, 1) - - spec = grid[(0, 0)] - assert spec is not None - assert spec.shape == (10, 100) - - spec = grid[(1, 0)] - assert spec is not None - assert spec.shape == (20, 100) From 0967e53ee4ad5870c14ed11d597a93e2103d0db2 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 10:53:03 -0400 Subject: [PATCH 023/118] Fix __getitem__ for 1d chunk grids --- src/zarr/core/chunk_grids.py | 9 ++++++- tests/test_unified_chunk_grid.py | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index a160fa2d5a..cd107b1cfa 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -318,8 +318,15 @@ def chunk_shape(self) -> tuple[int, ...]: # -- Collection interface -- - def __getitem__(self, coords: tuple[int, ...]) -> ChunkSpec | None: + def __getitem__(self, coords: int | tuple[int, ...]) -> ChunkSpec | None: """Return the ChunkSpec for a chunk at the given grid position, or None if OOB.""" + if isinstance(coords, int): + coords = 
(coords,) + if len(coords) != self.ndim: + raise ValueError( + f"Expected {self.ndim} coordinate(s) for a {self.ndim}-d chunk grid, " + f"got {len(coords)}." + ) slices: list[slice] = [] codec_shape: list[int] = [] for dim, ix in zip(self.dimensions, coords, strict=True): diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 281dc0ebef..f321fcca17 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -947,6 +947,49 @@ def test_parse_chunk_grid_rebinds_fixed_extent(self) -> None: # -- ChunkGrid.__getitem__ validation -- + def test_getitem_int_1d_regular(self) -> None: + """Integer indexing works for 1-d regular grids.""" + g = ChunkGrid.from_regular((100,), (10,)) + spec = g[0] + assert spec is not None + assert spec.shape == (10,) + assert spec.slices == (slice(0, 10),) + # Boundary chunk + spec = g[9] + assert spec is not None + assert spec.shape == (10,) + + def test_getitem_int_1d_rectilinear(self) -> None: + """Integer indexing works for 1-d rectilinear grids.""" + g = ChunkGrid.from_rectilinear([[20, 30, 50]]) + spec = g[0] + assert spec is not None + assert spec.shape == (20,) + spec = g[1] + assert spec is not None + assert spec.shape == (30,) + spec = g[2] + assert spec is not None + assert spec.shape == (50,) + + def test_getitem_int_0d_raises(self) -> None: + """Integer indexing raises ValueError for 0-d grids (ndim mismatch).""" + g = ChunkGrid.from_regular((), ()) + with pytest.raises(ValueError, match="Expected 0 coordinate.*got 1"): + g[0] + + def test_getitem_int_2d_raises(self) -> None: + """Integer indexing raises ValueError for 2-d grids (ndim mismatch).""" + g = ChunkGrid.from_regular((100, 200), (10, 20)) + with pytest.raises(ValueError, match="Expected 2 coordinate.*got 1"): + g[0] + + def test_getitem_int_oob_returns_none(self) -> None: + """Integer OOB returns None for 1-d grid.""" + g = ChunkGrid.from_regular((100,), (10,)) + assert g[10] is None + assert g[99] is None + def 
test_getitem_negative_index_returns_none(self) -> None: g = ChunkGrid.from_regular((100,), (10,)) assert g[(-1,)] is None From 67a684dc30a60f01f08bcb19d3c7fcee98d7b649 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 14:22:02 -0400 Subject: [PATCH 024/118] Implement resize --- src/zarr/core/array.py | 7 +- src/zarr/core/chunk_grids.py | 56 +++++- src/zarr/core/metadata/v3.py | 2 +- tests/test_unified_chunk_grid.py | 286 +++++++++++++++++++++++++++++-- 4 files changed, 325 insertions(+), 26 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 944966aa72..ff5c41a0ea 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -44,7 +44,6 @@ ChunkGrid, _auto_partition, normalize_chunks, - parse_chunk_grid, ) from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, @@ -6044,9 +6043,6 @@ async def _resize( new_shape = parse_shapelike(new_shape) assert len(new_shape) == len(array.metadata.shape) - if not array.metadata.chunk_grid.is_regular: - raise ValueError("Resize is not supported for arrays with rectilinear chunk grids.") - new_metadata = array.metadata.update_shape(new_shape) # ensure deletion is only run if array is shrinking as the delete_outside_chunks path is unbounded in memory @@ -6055,8 +6051,7 @@ async def _resize( if delete_outside_chunks and not only_growing: # Remove all chunks outside of the new shape old_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords()) - new_grid = parse_chunk_grid(array.metadata.chunk_grid, new_shape) - new_chunk_coords = set(new_grid.all_chunk_coords()) + new_chunk_coords = set(new_metadata.chunk_grid.all_chunk_coords()) async def _delete_key(key: str) -> None: await (array.store_path / key).delete() diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index cd107b1cfa..5faa3e7f48 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -350,6 +350,54 @@ def 
all_chunk_coords(self) -> Iterator[tuple[int, ...]]: def get_nchunks(self) -> int: return reduce(operator.mul, (d.nchunks for d in self.dimensions), 1) + # -- Resize -- + + def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: + """Return a new ChunkGrid adjusted for *new_shape*. + + For regular (FixedDimension) axes the extent is simply re-bound. + For varying (VaryingDimension) axes: + * **grow**: a new chunk whose size equals the growth is appended. + * **shrink**: trailing chunks that lie entirely beyond *new_shape* are + dropped; the last retained chunk is the one whose cumulative offset + first reaches or exceeds the new extent. + * **no change**: the dimension is kept as-is. + + Raises + ------ + ValueError + If *new_shape* has the wrong number of dimensions. + """ + if len(new_shape) != self.ndim: + raise ValueError( + f"new_shape has {len(new_shape)} dimensions but " + f"chunk grid has {self.ndim} dimensions" + ) + dims: list[DimensionGrid] = [] + for dim, new_extent in zip(self.dimensions, new_shape, strict=True): + if isinstance(dim, FixedDimension): + dims.append(FixedDimension(size=dim.size, extent=new_extent)) + elif isinstance(dim, VaryingDimension): + old_extent = dim.extent + if new_extent == old_extent: + dims.append(dim) + elif new_extent > old_extent: + expanded_edges = list(dim.edges) + [new_extent - old_extent] + dims.append(VaryingDimension(expanded_edges)) + else: + # Shrink: keep chunks whose cumulative offset covers new_extent + shrunk_edges: list[int] = [] + total = 0 + for edge in dim.edges: + shrunk_edges.append(edge) + total += edge + if total >= new_extent: + break + dims.append(VaryingDimension(shrunk_edges)) + else: + raise TypeError(f"Unexpected dimension type: {type(dim)}") + return ChunkGrid(dimensions=tuple(dims)) + # -- Serialization -- @classmethod @@ -414,10 +462,12 @@ def parse_chunk_grid( if isinstance(dim, FixedDimension): dims.append(FixedDimension(size=dim.size, extent=extent)) else: - # VaryingDimension has 
intrinsic extent — validate it matches - if dim.extent != extent: + # VaryingDimension has intrinsic extent (sum of edges). + # After resize/shrink the last chunk may extend past the array + # boundary, so extent >= array_shape is valid (like regular grids). + if dim.extent < extent: raise ValueError( - f"VaryingDimension extent {dim.extent} does not match " + f"VaryingDimension extent {dim.extent} is less than " f"array shape extent {extent} for dimension {len(dims)}" ) dims.append(dim) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 9cdfbbe254..4548703798 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -449,7 +449,7 @@ def to_dict(self) -> dict[str, JSON]: return out_dict def update_shape(self, shape: tuple[int, ...]) -> Self: - new_grid = parse_chunk_grid(self.chunk_grid, shape) + new_grid = self.chunk_grid.update_shape(shape) return replace(self, shape=shape, chunk_grid=new_grid) def update_attributes(self, attributes: dict[str, JSON]) -> Self: diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index f321fcca17..5e481e1846 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -33,6 +33,17 @@ serialize_chunk_grid, ) + +def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: + """Extract the per-chunk edge lengths for *dim* from a ChunkGrid.""" + d = grid.dimensions[dim] + if isinstance(d, FixedDimension): + return (d.size,) * d.nchunks + if isinstance(d, VaryingDimension): + return d.edges + raise TypeError(f"Unexpected dimension type: {type(d)}") + + # --------------------------------------------------------------------------- # FixedDimension # --------------------------------------------------------------------------- @@ -444,9 +455,13 @@ def test_rectilinear_rle_extent_validated(self) -> None: parse_chunk_grid(data, (100, 50)) def test_varying_dimension_extent_mismatch_on_chunkgrid_input(self) -> None: - """When passing a ChunkGrid 
directly, VaryingDimension extent is validated.""" + """When passing a ChunkGrid directly, VaryingDimension extent is validated. + + After resize, extent >= array_shape is allowed (last chunk extends past + boundary). But extent < array_shape is still an error. + """ g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) - with pytest.raises(ValueError, match="does not match"): + with pytest.raises(ValueError, match="less than"): parse_chunk_grid(g, (100, 50)) @@ -1608,12 +1623,75 @@ def test_property_roundtrip_rectilinear(data: st.DataObject) -> None: # --------------------------------------------------------------------------- -# Bug #3: _resize with rectilinear grids +# Resize / append for rectilinear grids # --------------------------------------------------------------------------- -class TestResizeRectilinear: - def test_resize_regular_preserves_chunk_grid(self, tmp_path: Path) -> None: +class TestUpdateShape: + """Unit tests for ChunkGrid.update_shape().""" + + def test_no_change(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + new_grid = grid.update_shape((60, 50)) + assert _edges(new_grid, 0) == (10, 20, 30) + assert _edges(new_grid, 1) == (25, 25) + + def test_grow_single_dim(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + new_grid = grid.update_shape((80, 50)) + assert _edges(new_grid, 0) == (10, 20, 30, 20) + assert _edges(new_grid, 1) == (25, 25) + + def test_grow_multiple_dims(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20], [20, 30]]) + # from (30, 50) to (45, 65) + new_grid = grid.update_shape((45, 65)) + assert _edges(new_grid, 0) == (10, 20, 15) + assert _edges(new_grid, 1) == (20, 30, 15) + + def test_shrink_single_dim(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20, 30, 40], [25, 25]]) + new_grid = grid.update_shape((35, 50)) + # 10+20=30 < 35, 10+20+30=60 >= 35 → keep (10, 20, 30) + assert _edges(new_grid, 0) == (10, 20, 30) + assert _edges(new_grid, 1) == 
(25, 25) + + def test_shrink_to_single_chunk(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + new_grid = grid.update_shape((5, 50)) + assert _edges(new_grid, 0) == (10,) + assert _edges(new_grid, 1) == (25, 25) + + def test_shrink_multiple_dims(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 10, 15, 5], [20, 25, 15]]) + # from (40, 60) to (25, 35) + new_grid = grid.update_shape((25, 35)) + # dim 0: 10+10=20 < 25, 10+10+15=35 >= 25 → keep (10, 10, 15) + assert _edges(new_grid, 0) == (10, 10, 15) + # dim 1: 20 < 35, 20+25=45 >= 35 → keep (20, 25) + assert _edges(new_grid, 1) == (20, 25) + + def test_dimension_mismatch_error(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20], [30, 40]]) + with pytest.raises(ValueError, match="dimensions"): + grid.update_shape((30, 70, 100)) + + def test_boundary_cases(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25]]) + # Grow to exact chunk boundary on dim 0, add 25 to dim 1 + new_grid = grid.update_shape((60, 65)) + assert _edges(new_grid, 0) == (10, 20, 30) # no change (60 == sum) + assert _edges(new_grid, 1) == (15, 25, 25) # added chunk of 25 + + # Shrink to exact chunk boundary + grid2 = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25, 10]]) + new_grid2 = grid2.update_shape((30, 40)) + # dim 0: 10+20=30 >= 30 → keep (10, 20) + assert _edges(new_grid2, 0) == (10, 20) + # dim 1: 15+25=40 >= 40 → keep (15, 25) + assert _edges(new_grid2, 1) == (15, 25) + + def test_regular_preserves_extents(self, tmp_path: Path) -> None: """Resize a regular array — chunk_grid extents must match new shape.""" z = zarr.create_array( store=tmp_path / "regular.zarr", @@ -1624,20 +1702,196 @@ def test_resize_regular_preserves_chunk_grid(self, tmp_path: Path) -> None: z[:] = np.arange(100, dtype="int32") z.resize(50) assert z.shape == (50,) - # The chunk grid's extent must agree with the new shape assert z.metadata.chunk_grid.dimensions[0].extent == 50 - def 
test_resize_rectilinear_raises(self, tmp_path: Path) -> None: - """Resize should raise for rectilinear grids (not yet supported).""" - z = zarr.create_array( - store=tmp_path / "rect.zarr", - shape=(30,), - chunks=[[5, 10, 15]], - dtype="int32", + +class TestResizeRectilinear: + """End-to-end resize tests on rectilinear arrays.""" + + async def test_async_resize_grow(self) -> None: + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(30, 40), + chunks=[[10, 20], [20, 20]], + dtype="i4", + zarr_format=3, + ) + data = np.arange(30 * 40, dtype="i4").reshape(30, 40) + await arr.setitem(slice(None), data) + + await arr.resize((50, 60)) + assert arr.shape == (50, 60) + assert _edges(arr.metadata.chunk_grid, 0) == (10, 20, 20) + assert _edges(arr.metadata.chunk_grid, 1) == (20, 20, 20) + result = await arr.getitem((slice(0, 30), slice(0, 40))) + np.testing.assert_array_equal(result, data) + + async def test_async_resize_shrink(self) -> None: + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(60, 50), + chunks=[[10, 20, 30], [25, 25]], + dtype="f4", + zarr_format=3, + ) + data = np.arange(60 * 50, dtype="f4").reshape(60, 50) + await arr.setitem(slice(None), data) + + await arr.resize((25, 30)) + assert arr.shape == (25, 30) + result = await arr.getitem(slice(None)) + np.testing.assert_array_equal(result, data[:25, :30]) + + def test_sync_resize_grow(self) -> None: + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(20, 30), + chunks=[[8, 12], [10, 20]], + dtype="u1", + zarr_format=3, + ) + data = np.arange(20 * 30, dtype="u1").reshape(20, 30) + arr[:] = data + arr.resize((35, 45)) + assert arr.shape == (35, 45) + np.testing.assert_array_equal(arr[:20, :30], data) + + def test_sync_resize_shrink(self) -> None: + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(40, 50), + chunks=[[10, 
15, 15], [20, 30]], + dtype="i2", + zarr_format=3, + ) + data = np.arange(40 * 50, dtype="i2").reshape(40, 50) + arr[:] = data + arr.resize((15, 30)) + assert arr.shape == (15, 30) + np.testing.assert_array_equal(arr[:], data[:15, :30]) + + +class TestAppendRectilinear: + """End-to-end append tests on rectilinear arrays.""" + + async def test_append_first_axis(self) -> None: + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(30, 20), + chunks=[[10, 20], [10, 10]], + dtype="i4", + zarr_format=3, + ) + initial = np.arange(30 * 20, dtype="i4").reshape(30, 20) + await arr.setitem(slice(None), initial) + + append_data = np.arange(30 * 20, 45 * 20, dtype="i4").reshape(15, 20) + await arr.append(append_data, axis=0) + assert arr.shape == (45, 20) + + result = await arr.getitem(slice(None)) + np.testing.assert_array_equal(result, np.vstack([initial, append_data])) + + async def test_append_second_axis(self) -> None: + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(20, 30), + chunks=[[10, 10], [10, 20]], + dtype="f4", + zarr_format=3, + ) + initial = np.arange(20 * 30, dtype="f4").reshape(20, 30) + await arr.setitem(slice(None), initial) + + append_data = np.arange(20 * 30, 20 * 45, dtype="f4").reshape(20, 15) + await arr.append(append_data, axis=1) + assert arr.shape == (20, 45) + + result = await arr.getitem(slice(None)) + np.testing.assert_array_equal(result, np.hstack([initial, append_data])) + + def test_sync_append(self) -> None: + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(20, 20), + chunks=[[8, 12], [7, 13]], + dtype="u2", + zarr_format=3, + ) + initial = np.arange(20 * 20, dtype="u2").reshape(20, 20) + arr[:] = initial + + append_data = np.arange(20 * 20, 25 * 20, dtype="u2").reshape(5, 20) + arr.append(append_data, axis=0) + assert arr.shape == (25, 20) + np.testing.assert_array_equal(arr[:20, :], 
initial) + np.testing.assert_array_equal(arr[20:, :], append_data) + + async def test_multiple_appends(self) -> None: + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(10, 10), + chunks=[[3, 7], [4, 6]], + dtype="i4", + zarr_format=3, + ) + initial = np.arange(10 * 10, dtype="i4").reshape(10, 10) + await arr.setitem(slice(None), initial) + + all_data = [initial] + for i in range(3): + chunk = np.full((5, 10), i + 100, dtype="i4") + await arr.append(chunk, axis=0) + all_data.append(chunk) + + assert arr.shape == (25, 10) + result = await arr.getitem(slice(None)) + np.testing.assert_array_equal(result, np.vstack(all_data)) + + async def test_append_with_partial_edge_chunks(self) -> None: + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(25, 30), + chunks=[[10, 15], [12, 18]], + dtype="f8", + zarr_format=3, + ) + initial = np.random.default_rng(42).random((25, 30)) + await arr.setitem(slice(None), initial) + + append_data = np.random.default_rng(43).random((10, 30)) + await arr.append(append_data, axis=0) + assert arr.shape == (35, 30) + + result = await arr.getitem(slice(None)) + np.testing.assert_array_almost_equal(result, np.vstack([initial, append_data])) # type: ignore[arg-type] + + async def test_append_small_data(self) -> None: + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(20, 20), + chunks=[[8, 12], [7, 13]], + dtype="i4", + zarr_format=3, ) - z[:] = np.arange(30, dtype="int32") - with pytest.raises((ValueError, NotImplementedError)): - z.resize(20) + data = np.arange(20 * 20, dtype="i4").reshape(20, 20) + await arr.setitem(slice(None), data) + + small = np.full((3, 20), 999, dtype="i4") + await arr.append(small, axis=0) + assert arr.shape == (23, 20) + result = await arr.getitem((slice(20, 23), slice(None))) + np.testing.assert_array_equal(result, small) # 
--------------------------------------------------------------------------- From 61c48a42d9c344a6e8efd7aa9d700721f97c9918 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 14:46:51 -0400 Subject: [PATCH 025/118] Fix spec compliance --- src/zarr/core/chunk_grids.py | 107 +++++++++++++++++++++++-------- tests/test_unified_chunk_grid.py | 95 +++++++++++++++++++++++++-- 2 files changed, 169 insertions(+), 33 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 5faa3e7f48..d2ca780d88 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -173,14 +173,22 @@ def is_boundary(self) -> bool: # --------------------------------------------------------------------------- -def _expand_rle(data: list[list[int]]) -> list[int]: - """Expand run-length encoded chunk sizes: [[size, count], ...] -> [size, size, ...]""" +def _expand_rle(data: Sequence[list[int] | int]) -> list[int]: + """Expand a mixed array of bare integers and RLE pairs. + + Per the rectilinear chunk grid spec, each element can be: + - a bare integer (an explicit edge length) + - a two-element array ``[value, count]`` (run-length encoded) + """ result: list[int] = [] for item in data: - if len(item) != 2: - raise ValueError(f"RLE entries must be [size, count], got {item}") - size, count = item - result.extend([size] * count) + if isinstance(item, int): + result.append(item) + elif len(item) == 2: + size, count = item + result.extend([size] * count) + else: + raise ValueError(f"RLE entries must be an integer or [size, count], got {item}") return result @@ -202,6 +210,61 @@ def _compress_rle(sizes: Sequence[int]) -> list[list[int]]: return result +def _validate_rectilinear_kind(configuration: dict[str, JSON]) -> None: + """Validate the ``kind`` field of a rectilinear chunk grid configuration. + + The spec requires ``kind: "inline"``. 
+ """ + kind = configuration.get("kind") + if kind is None: + raise ValueError( + "Rectilinear chunk grid configuration requires a 'kind' field. " + "Only 'inline' is currently supported." + ) + if kind != "inline": + raise ValueError( + f"Unsupported rectilinear chunk grid kind: {kind!r}. " + f"Only 'inline' is currently supported." + ) + + +def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[int]: + """Decode a single dimension's chunk edge specification per the rectilinear spec. + + Per the spec, each element of ``chunk_shapes`` can be: + - a bare integer ``m``: repeat ``m`` until the sum >= array extent + - an array of bare integers and/or ``[value, count]`` RLE pairs + + Parameters + ---------- + dim_spec + The raw JSON value for one dimension's chunk edges. + array_extent + Array length along this dimension. Required when *dim_spec* is a bare + integer (to know how many repetitions). May be ``None`` when the extent + is not yet known (e.g. ``from_dict`` without array shape). + """ + if isinstance(dim_spec, int): + if array_extent is None: + raise ValueError( + "Integer chunk_shapes shorthand requires array shape to expand. " + "Use parse_chunk_grid() instead of ChunkGrid.from_dict()." 
+ ) + if dim_spec <= 0: + raise ValueError(f"Integer chunk edge length must be > 0, got {dim_spec}") + n = ceildiv(array_extent, dim_spec) + return [dim_spec] * n + if isinstance(dim_spec, list): + # Check if the list contains any sub-lists (RLE pairs) or is all bare ints + has_sublists = any(isinstance(e, list) for e in dim_spec) + if has_sublists: + return _expand_rle(dim_spec) + else: + # All bare integers — explicit edge lengths + return [int(e) for e in dim_spec] + raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + + # --------------------------------------------------------------------------- # Unified ChunkGrid # --------------------------------------------------------------------------- @@ -424,19 +487,13 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> return RegularChunkGrid(chunk_shape=tuple(int(cast("int", s)) for s in chunk_shape_raw)) if name_parsed == "rectilinear": + _validate_rectilinear_kind(configuration_parsed) chunk_shapes_raw = configuration_parsed.get("chunk_shapes") if chunk_shapes_raw is None: raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") if not isinstance(chunk_shapes_raw, Sequence): raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - decoded: list[list[int]] = [] - for dim_spec in chunk_shapes_raw: - if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): - decoded.append(_expand_rle(dim_spec)) - elif isinstance(dim_spec, list): - decoded.append(dim_spec) - else: - raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + decoded = [_decode_dim_spec(dim_spec) for dim_spec in chunk_shapes_raw] return cls.from_rectilinear(decoded) raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") @@ -484,30 +541,26 @@ def parse_chunk_grid( return ChunkGrid.from_regular(array_shape, cast("Sequence[int]", chunk_shape_raw)) if name_parsed == "rectilinear": + _validate_rectilinear_kind(configuration_parsed) 
chunk_shapes_raw = configuration_parsed.get("chunk_shapes") if chunk_shapes_raw is None: raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") if not isinstance(chunk_shapes_raw, Sequence): raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - decoded: list[list[int]] = [] - for dim_spec in chunk_shapes_raw: - if isinstance(dim_spec, list) and dim_spec and isinstance(dim_spec[0], list): - decoded.append(_expand_rle(dim_spec)) - elif isinstance(dim_spec, list): - decoded.append(dim_spec) - else: - raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") - if len(decoded) != len(array_shape): + if len(chunk_shapes_raw) != len(array_shape): raise ValueError( - f"chunk_shapes has {len(decoded)} dimensions but array shape " + f"chunk_shapes has {len(chunk_shapes_raw)} dimensions but array shape " f"has {len(array_shape)} dimensions" ) + decoded: list[list[int]] = [] + for dim_spec, extent in zip(chunk_shapes_raw, array_shape, strict=True): + decoded.append(_decode_dim_spec(dim_spec, array_extent=extent)) for i, (edges, extent) in enumerate(zip(decoded, array_shape, strict=True)): edge_sum = sum(edges) - if edge_sum != extent: + if edge_sum < extent: raise ValueError( f"Rectilinear chunk edges for dimension {i} sum to {edge_sum} " - f"but array shape extent is {extent}" + f"but array shape extent is {extent} (edge sum must be >= extent)" ) return ChunkGrid.from_rectilinear(decoded) @@ -557,7 +610,7 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: chunk_shapes.append(edges) return { "name": "rectilinear", - "configuration": {"chunk_shapes": chunk_shapes}, + "configuration": {"kind": "inline", "chunk_shapes": chunk_shapes}, } raise ValueError(f"Unknown chunk grid name for serialization: {name!r}") diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 5e481e1846..fbbbeddea1 100644 --- a/tests/test_unified_chunk_grid.py +++ 
b/tests/test_unified_chunk_grid.py @@ -389,6 +389,89 @@ def test_serialize_unknown_name_raises(self) -> None: serialize_chunk_grid(g, "hexagonal") +class TestSpecCompliance: + """Tests for compliance with the rectilinear chunk grid extension spec + (zarr-extensions PR #25).""" + + def test_kind_inline_required_on_deserialize(self) -> None: + """Deserialization requires kind: 'inline'.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"chunk_shapes": [[10, 20], [15, 15]]}, + } + with pytest.raises(ValueError, match="requires a 'kind' field"): + ChunkGrid.from_dict(data) + + def test_kind_unknown_rejected(self) -> None: + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "reference", "chunk_shapes": [[10, 20], [15, 15]]}, + } + with pytest.raises(ValueError, match="Unsupported rectilinear chunk grid kind"): + ChunkGrid.from_dict(data) + + def test_kind_inline_in_serialized_output(self) -> None: + """Serialization includes kind: 'inline'.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + d = serialize_chunk_grid(g, "rectilinear") + config = d["configuration"] + assert isinstance(config, dict) + assert config["kind"] == "inline" + + def test_integer_shorthand_per_dimension(self) -> None: + """A bare integer in chunk_shapes means repeat until >= extent.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [4, [1, 2, 3]]}, + } + g = parse_chunk_grid(data, (6, 6)) + # 4 repeated: ceildiv(6, 4) = 2 → [4, 4] + assert _edges(g, 0) == (4, 4) + assert _edges(g, 1) == (1, 2, 3) + + def test_mixed_rle_and_bare_integers(self) -> None: + """An array can mix bare integers and [value, count] RLE pairs.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[[1, 3], 3]]}, + } + # [[1, 3], 3] → [1, 1, 1, 3] → sum = 6 + g = parse_chunk_grid(data, (6,)) + assert _edges(g, 0) == (1, 1, 1, 3) + + def 
test_overflow_chunks_allowed(self) -> None: + """Edge sum >= extent is valid (overflow chunks permitted).""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[4, 4, 4]]}, + } + # sum = 12 > extent = 6 — allowed per spec + g = parse_chunk_grid(data, (6,)) + assert _edges(g, 0) == (4, 4, 4) + + def test_spec_example(self) -> None: + """The full example from the spec README.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": { + "kind": "inline", + "chunk_shapes": [ + 4, # integer shorthand → [4, 4] + [1, 2, 3], # explicit list + [[4, 2]], # pure RLE → [4, 4] + [[1, 3], 3], # mixed RLE + bare → [1, 1, 1, 3] + [4, 4, 4], # explicit list with overflow + ], + }, + } + g = parse_chunk_grid(data, (6, 6, 6, 6, 6)) + assert _edges(g, 0) == (4, 4) + assert _edges(g, 1) == (1, 2, 3) + assert _edges(g, 2) == (4, 4) + assert _edges(g, 3) == (1, 1, 1, 3) + assert _edges(g, 4) == (4, 4, 4) + + class TestParseChunkGridValidation: def test_varying_extent_mismatch_raises(self) -> None: from zarr.core.chunk_grids import parse_chunk_grid @@ -410,7 +493,7 @@ def test_rectilinear_extent_mismatch_raises(self) -> None: """sum(edges) must match the array shape for each dimension.""" data: dict[str, Any] = { "name": "rectilinear", - "configuration": {"chunk_shapes": [[10, 20, 30], [25, 25]]}, + "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [25, 25]]}, } # sum([10,20,30])=60, sum([25,25])=50 — array shape (100, 50) mismatches dim 0 with pytest.raises(ValueError, match="sum to 60 but array shape extent is 100"): @@ -419,7 +502,7 @@ def test_rectilinear_extent_mismatch_raises(self) -> None: def test_rectilinear_extent_mismatch_second_dim(self) -> None: data: dict[str, Any] = { "name": "rectilinear", - "configuration": {"chunk_shapes": [[50, 50], [10, 20]]}, + "configuration": {"kind": "inline", "chunk_shapes": [[50, 50], [10, 20]]}, } # dim 0 OK (100), dim 1: sum([10,20])=30 != 50 with 
pytest.raises(ValueError, match="dimension 1 sum to 30 but array shape extent is 50"): @@ -428,7 +511,7 @@ def test_rectilinear_extent_mismatch_second_dim(self) -> None: def test_rectilinear_extent_match_passes(self) -> None: data: dict[str, Any] = { "name": "rectilinear", - "configuration": {"chunk_shapes": [[10, 20, 30], [25, 25]]}, + "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [25, 25]]}, } g = parse_chunk_grid(data, (60, 50)) assert g.shape == (3, 2) @@ -436,7 +519,7 @@ def test_rectilinear_extent_match_passes(self) -> None: def test_rectilinear_ndim_mismatch_raises(self) -> None: data: dict[str, Any] = { "name": "rectilinear", - "configuration": {"chunk_shapes": [[10, 20], [25, 25]]}, + "configuration": {"kind": "inline", "chunk_shapes": [[10, 20], [25, 25]]}, } with pytest.raises(ValueError, match="2 dimensions but array shape has 3"): parse_chunk_grid(data, (30, 50, 100)) @@ -445,7 +528,7 @@ def test_rectilinear_rle_extent_validated(self) -> None: """RLE-encoded edges are expanded before validation.""" data: dict[str, Any] = { "name": "rectilinear", - "configuration": {"chunk_shapes": [[[10, 5]], [[25, 2]]]}, + "configuration": {"kind": "inline", "chunk_shapes": [[[10, 5]], [[25, 2]]]}, } # sum = 50 and 50 — match (50, 50) g = parse_chunk_grid(data, (50, 50)) @@ -655,7 +738,7 @@ def test_chunk_grid_name_roundtrip_preserves_rectilinear(self, tmp_path: Path) - "shape": [100, 100], "chunk_grid": { "name": "rectilinear", - "configuration": {"chunk_shapes": [[[50, 2]], [[25, 4]]]}, + "configuration": {"kind": "inline", "chunk_shapes": [[[50, 2]], [[25, 4]]]}, }, "chunk_key_encoding": {"name": "default"}, "data_type": "float32", From 7d5ebb8a36af4333a2248508d79d2a371db12e68 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 14:52:25 -0400 Subject: [PATCH 026/118] Fix .info --- src/zarr/core/_info.py | 2 +- src/zarr/core/array.py | 3 ++- tests/test_unified_chunk_grid.py | 21 
+++++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index fef424346a..1503f05b26 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -117,7 +117,7 @@ def __repr__(self) -> str: if self._chunk_shape is None: # for non-regular chunk grids - kwargs["chunk_shape"] = "" + kwargs["_chunk_shape"] = "" template += "\nFilters : {_filters}" diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ff5c41a0ea..25894e4dad 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1932,6 +1932,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: + chunk_shape = self.chunks if self.metadata.chunk_grid.is_regular else None return ArrayInfo( _zarr_format=self.metadata.zarr_format, _data_type=self._zdtype, @@ -1939,7 +1940,7 @@ def _info( _shape=self.shape, _order=self.order, _shard_shape=self.shards, - _chunk_shape=self.chunks, + _chunk_shape=chunk_shape, _read_only=self.read_only, _compressors=self.compressors, _filters=self.filters, diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index fbbbeddea1..5548d477d8 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -1705,6 +1705,27 @@ def test_property_roundtrip_rectilinear(data: st.DataObject) -> None: np.testing.assert_array_equal(z[:], a) +# --------------------------------------------------------------------------- +# .info display for rectilinear grids +# --------------------------------------------------------------------------- + + +def test_info_display_rectilinear() -> None: + """Array.info should not crash for rectilinear grids.""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(30,), + chunks=[[10, 20]], + dtype="i4", + zarr_format=3, + ) + info = arr.info + text = repr(info) + assert "" in text + assert 
"Array" in text + + # --------------------------------------------------------------------------- # Resize / append for rectilinear grids # --------------------------------------------------------------------------- From e74586a7a70ad7fa6c492bc9a2f38c5c76399942 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 14:56:28 -0400 Subject: [PATCH 027/118] Fix typing --- src/zarr/core/array.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 25894e4dad..304c11ef38 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -2671,12 +2671,12 @@ def __array__( raise ValueError(msg) arr = self[...] - arr_np: NDArrayLike = np.array(arr, dtype=dtype) + arr_np = np.array(arr, dtype=dtype) if dtype is not None: arr_np = arr_np.astype(dtype) - return arr_np + return cast("NDArrayLike", arr_np) def __getitem__(self, selection: Selection) -> NDArrayLikeOrScalar: """Retrieve data for an item or region of the array. 
@@ -3691,7 +3691,7 @@ def get_coordinate_selection( if hasattr(out_array, "shape"): # restore shape - out_array = np.array(out_array).reshape(indexer.sel_shape) + out_array = cast("NDArrayLikeOrScalar", np.array(out_array).reshape(indexer.sel_shape)) return out_array def set_coordinate_selection( @@ -5878,7 +5878,7 @@ async def _get_coordinate_selection( if hasattr(out_array, "shape"): # restore shape - out_array = np.array(out_array).reshape(indexer.sel_shape) + out_array = cast("NDArrayLikeOrScalar", np.array(out_array).reshape(indexer.sel_shape)) return out_array From 8ab3ca862ed75c514748c6235cedef4eb201ed84 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:04:03 -0400 Subject: [PATCH 028/118] Adopt joe's property testing strategy --- src/zarr/testing/strategies.py | 55 ++++++++++++++++++++++- tests/test_properties.py | 17 +++++-- tests/test_unified_chunk_grid.py | 76 -------------------------------- 3 files changed, 68 insertions(+), 80 deletions(-) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 45c86eb3eb..a16bd0b1b6 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -228,7 +228,7 @@ def np_array_and_chunks( draw: st.DrawFn, *, arrays: st.SearchStrategy[npt.NDArray[Any]] = numpy_arrays(), # noqa: B008 -) -> tuple[np.ndarray, tuple[int, ...]]: # type: ignore[type-arg] +) -> tuple[np.ndarray[Any, Any], tuple[int, ...]]: """A hypothesis strategy to generate small sized random arrays. Returns: a tuple of the array and a suitable random chunking for it. @@ -324,6 +324,59 @@ def simple_arrays( ) +@st.composite +def rectilinear_chunks(draw: st.DrawFn, *, shape: tuple[int, ...]) -> list[list[int]]: + """Generate valid rectilinear chunk shapes for a given array shape. + + Each dimension is partitioned into 1..min(size, 10) chunks by drawing + unique divider points within [1, size-1]. 
+ """ + chunk_shapes: list[list[int]] = [] + for size in shape: + assert size > 0 + max_chunks = min(size, 10) + nchunks = draw(st.integers(min_value=1, max_value=max_chunks)) + if nchunks == 1: + chunk_shapes.append([size]) + else: + dividers = sorted( + draw( + st.lists( + st.integers(min_value=1, max_value=size - 1), + min_size=nchunks - 1, + max_size=nchunks - 1, + unique=True, + ) + ) + ) + chunk_shapes.append( + [a - b for a, b in zip(dividers + [size], [0] + dividers, strict=False)] + ) + return chunk_shapes + + +# Rectilinear arrays need min_side >= 2 so divider generation works +_rectilinear_shapes = npst.array_shapes(max_dims=3, min_side=2, max_side=20) + + +@st.composite +def rectilinear_arrays( + draw: st.DrawFn, + *, + shapes: st.SearchStrategy[tuple[int, ...]] = _rectilinear_shapes, +) -> Any: + """Generate a zarr v3 array with rectilinear (variable) chunk grid.""" + shape = draw(shapes) + chunk_shapes = draw(rectilinear_chunks(shape=shape)) + + nparray = np.arange(int(np.prod(shape)), dtype="int32").reshape(shape) + store = MemoryStore() + a = zarr.create_array(store=store, shape=shape, chunks=chunk_shapes, dtype="int32") + a[:] = nparray + + return a + + def is_negative_slice(idx: Any) -> bool: return isinstance(idx, slice) and idx.step is not None and idx.step < 0 diff --git a/tests/test_properties.py b/tests/test_properties.py index bab659c976..4b6c151382 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -25,6 +25,7 @@ basic_indices, numpy_arrays, orthogonal_indices, + rectilinear_arrays, simple_arrays, stores, zarr_formats, @@ -111,7 +112,7 @@ def test_array_creates_implicit_groups(array): @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @given(data=st.data()) async def test_basic_indexing(data: st.DataObject) -> None: - zarray = data.draw(simple_arrays()) + zarray = data.draw(st.one_of(simple_arrays(), rectilinear_arrays())) nparray = zarray[:] indexer = 
data.draw(basic_indices(shape=nparray.shape)) @@ -138,7 +139,12 @@ async def test_basic_indexing(data: st.DataObject) -> None: @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") async def test_oindex(data: st.DataObject) -> None: # integer_array_indices can't handle 0-size dimensions. - zarray = data.draw(simple_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1))) + zarray = data.draw( + st.one_of( + simple_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)), + rectilinear_arrays(shapes=npst.array_shapes(max_dims=3, min_side=2, max_side=20)), + ) + ) nparray = zarray[:] zindexer, npindexer = data.draw(orthogonal_indices(shape=nparray.shape)) @@ -170,7 +176,12 @@ async def test_oindex(data: st.DataObject) -> None: @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") async def test_vindex(data: st.DataObject) -> None: # integer_array_indices can't handle 0-size dimensions. - zarray = data.draw(simple_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1))) + zarray = data.draw( + st.one_of( + simple_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)), + rectilinear_arrays(shapes=npst.array_shapes(max_dims=3, min_side=2, max_side=20)), + ) + ) nparray = zarray[:] indexer = data.draw( npst.integer_array_indices( diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 5548d477d8..c409e81ed6 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -1605,74 +1605,6 @@ def rectilinear_arrays_st(draw: st.DrawFn) -> tuple[zarr.Array[Any], np.ndarray[ return z, a -@settings(deadline=None, max_examples=50) -@given(data=st.data()) -def test_property_basic_indexing_rectilinear(data: st.DataObject) -> None: - """Property test: basic indexing on rectilinear arrays matches numpy.""" - z, a = data.draw(rectilinear_arrays_st()) - np.testing.assert_array_equal(z[:], a) - - slicers = [] - for size in a.shape: - start = 
data.draw(st.integers(min_value=0, max_value=size - 1)) - stop = data.draw(st.integers(min_value=start, max_value=size)) - slicers.append(slice(start, stop)) - sel = tuple(slicers) - np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") - - -@settings(deadline=None, max_examples=50) -@given(data=st.data()) -def test_property_oindex_rectilinear(data: st.DataObject) -> None: - """Property test: orthogonal int-array indexing matches numpy.""" - z, a = data.draw(rectilinear_arrays_st()) - - indexers_z = [] - indexers_np = [] - for size in a.shape: - n = data.draw(st.integers(min_value=1, max_value=min(size, 5))) - ix = np.array( - sorted( - data.draw( - st.lists( - st.integers(min_value=0, max_value=size - 1), - min_size=n, - max_size=n, - unique=True, - ) - ) - ) - ) - indexers_z.append(ix) - indexers_np.append(ix) - - result = z.oindex[tuple(indexers_z)] - expected = a[np.ix_(*indexers_np)] - np.testing.assert_array_equal(result, expected) - - -@settings(deadline=None, max_examples=50) -@given(data=st.data()) -def test_property_vindex_rectilinear(data: st.DataObject) -> None: - """Property test: vindex on rectilinear arrays matches numpy.""" - z, a = data.draw(rectilinear_arrays_st()) - - n = data.draw(st.integers(min_value=1, max_value=min(min(a.shape), 5))) - indexers = tuple( - np.array( - data.draw( - st.lists( - st.integers(min_value=0, max_value=size - 1), - min_size=n, - max_size=n, - ) - ) - ) - for size in a.shape - ) - np.testing.assert_array_equal(z.vindex[indexers], a[indexers]) - - @settings(deadline=None, max_examples=50) @given(data=st.data()) def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: @@ -1697,14 +1629,6 @@ def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: ) -@settings(deadline=None, max_examples=50) -@given(data=st.data()) -def test_property_roundtrip_rectilinear(data: st.DataObject) -> None: - """Property test: write then read matches original data.""" - z, a = 
data.draw(rectilinear_arrays_st()) - np.testing.assert_array_equal(z[:], a) - - # --------------------------------------------------------------------------- # .info display for rectilinear grids # --------------------------------------------------------------------------- From 80d8280e1d74fc9227f46aff35ac01bfc95e413d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:27:20 -0400 Subject: [PATCH 029/118] Remove RegularChunkGrid --- src/zarr/core/array.py | 16 ++++---- src/zarr/core/chunk_grids.py | 62 ++---------------------------- tests/conftest.py | 6 +-- tests/test_cli/test_migrate_v3.py | 3 +- tests/test_unified_chunk_grid.py | 63 ++++++++++--------------------- 5 files changed, 35 insertions(+), 115 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 304c11ef38..3977939c07 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1054,8 +1054,8 @@ def chunks(self) -> tuple[int, ...]: """Returns the chunk shape of the Array. If sharding is used the inner chunk shape is returned. - Only defined for arrays using using `RegularChunkGrid`. - If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + Only defined for arrays using a regular chunk grid. + If array uses a rectilinear chunk grid, `NotImplementedError` is raised. Returns ------- @@ -1069,8 +1069,8 @@ def shards(self) -> tuple[int, ...] | None: """Returns the shard shape of the Array. Returns None if sharding is not used. - Only defined for arrays using using `RegularChunkGrid`. - If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + Only defined for arrays using a regular chunk grid. + If array uses a rectilinear chunk grid, `NotImplementedError` is raised. Returns ------- @@ -2275,8 +2275,8 @@ def chunks(self) -> tuple[int, ...]: """Returns a tuple of integers describing the length of each dimension of a chunk of the array. 
If sharding is used the inner chunk shape is returned. - Only defined for arrays using using `RegularChunkGrid`. - If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + Only defined for arrays using a regular chunk grid. + If array uses a rectilinear chunk grid, `NotImplementedError` is raised. Returns ------- @@ -2290,8 +2290,8 @@ def shards(self) -> tuple[int, ...] | None: """Returns a tuple of integers describing the length of each dimension of a shard of the array. Returns None if sharding is not used. - Only defined for arrays using using `RegularChunkGrid`. - If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + Only defined for arrays using a regular chunk grid. + If array uses a rectilinear chunk grid, `NotImplementedError` is raised. Returns ------- diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index d2ca780d88..43f1ec9052 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -27,7 +27,6 @@ if TYPE_CHECKING: from collections.abc import Iterator - from typing import Self from zarr.core.array import ShardsLike @@ -466,11 +465,6 @@ def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: @classmethod def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> ChunkGrid: if isinstance(data, ChunkGrid): - if isinstance(data, RegularChunkGrid): - return ChunkGrid.from_regular( - tuple(d.extent for d in data.dimensions), - data.chunk_shape, - ) return data name_parsed, configuration_parsed = parse_named_configuration(data) @@ -481,10 +475,10 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> raise ValueError("Regular chunk grid requires 'chunk_shape' configuration") if not isinstance(chunk_shape_raw, Sequence): raise TypeError(f"chunk_shape must be a sequence, got {type(chunk_shape_raw)}") - # Without array shape, return a RegularChunkGrid that preserves - # chunk_shape but raises on extent-dependent 
operations. - # Use parse_chunk_grid() when array shape is available. - return RegularChunkGrid(chunk_shape=tuple(int(cast("int", s)) for s in chunk_shape_raw)) + chunk_shape = tuple(int(cast("int", s)) for s in chunk_shape_raw) + # Without array shape we cannot compute extents. Use a sentinel + # extent of 0; callers that need extent should use parse_chunk_grid(). + return cls(dimensions=tuple(FixedDimension(size=s, extent=0) for s in chunk_shape)) if name_parsed == "rectilinear": _validate_rectilinear_kind(configuration_parsed) @@ -628,54 +622,6 @@ def _infer_chunk_grid_name( return "regular" if grid.is_regular else "rectilinear" -# --------------------------------------------------------------------------- -# Backwards-compatible alias -# --------------------------------------------------------------------------- - - -class RegularChunkGrid(ChunkGrid): - """Backwards-compatible wrapper. Prefer ChunkGrid.from_regular() for new code.""" - - _chunk_shape: tuple[int, ...] - - def __init__(self, *, chunk_shape: ShapeLike) -> None: - chunk_shape_parsed = parse_shapelike(chunk_shape) - # Without array shape, use extent=0 as placeholder - dims = tuple(FixedDimension(size=s, extent=0) for s in chunk_shape_parsed) - super().__init__(dimensions=dims) - object.__setattr__(self, "_chunk_shape", chunk_shape_parsed) - - @property - def chunk_shape(self) -> tuple[int, ...]: - """Return the stored chunk shape (extent may be 0 as placeholder).""" - return self._chunk_shape - - @classmethod - def _from_dict(cls, data: dict[str, JSON] | NamedConfig[str, Any]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "regular") - return cls(**configuration_parsed) # type: ignore[arg-type] - - def _raise_no_extent(self) -> None: - raise ValueError( - "RegularChunkGrid does not have array shape information. " - "Use ChunkGrid.from_regular(array_shape, chunk_shape) or " - "parse_chunk_grid() to create a grid with extent." 
- ) - - @property - def shape(self) -> tuple[int, ...]: - self._raise_no_extent() - raise AssertionError # unreachable, for mypy - - def all_chunk_coords(self) -> Iterator[tuple[int, ...]]: - self._raise_no_extent() - raise AssertionError # unreachable, for mypy - - def get_nchunks(self) -> int: - self._raise_no_extent() - raise AssertionError # unreachable, for mypy - - # --------------------------------------------------------------------------- # Chunk guessing / normalization (unchanged) # --------------------------------------------------------------------------- diff --git a/tests/conftest.py b/tests/conftest.py index 23a1e87d0a..f1cdec08e5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,7 @@ _parse_chunk_encoding_v3, _parse_chunk_key_encoding, ) -from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition +from zarr.core.chunk_grids import ChunkGrid, _auto_partition from zarr.core.common import ( JSON, DimensionNames, @@ -379,7 +379,7 @@ def create_array_metadata( sharding_codec.validate( shape=chunk_shape_parsed, dtype=dtype_parsed, - chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), + chunk_grid=ChunkGrid.from_regular(chunk_shape_parsed, shard_shape_parsed), ) codecs_out = (sharding_codec,) chunks_out = shard_shape_parsed @@ -390,7 +390,7 @@ def create_array_metadata( return ArrayV3Metadata( shape=shape_parsed, data_type=dtype_parsed, - chunk_grid=RegularChunkGrid(chunk_shape=chunks_out), + chunk_grid={"name": "regular", "configuration": {"chunk_shape": chunks_out}}, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value, codecs=codecs_out, diff --git a/tests/test_cli/test_migrate_v3.py b/tests/test_cli/test_migrate_v3.py index 8bda31d208..dd3ca02549 100644 --- a/tests/test_cli/test_migrate_v3.py +++ b/tests/test_cli/test_migrate_v3.py @@ -16,7 +16,6 @@ from zarr.codecs.numcodecs import LZMA, Delta from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec -from 
zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.npy.int import UInt8, UInt16 @@ -63,7 +62,7 @@ def test_migrate_array(local_store: LocalStore) -> None: expected_metadata = ArrayV3Metadata( shape=shape, data_type=UInt16(endianness="little"), - chunk_grid=RegularChunkGrid(chunk_shape=chunks), + chunk_grid={"name": "regular", "configuration": {"chunk_shape": chunks}}, chunk_key_encoding=V2ChunkKeyEncoding(separator="."), fill_value=fill_value, codecs=( diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index c409e81ed6..6b2795c3e1 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -25,7 +25,6 @@ ChunkGrid, ChunkSpec, FixedDimension, - RegularChunkGrid, VaryingDimension, _compress_rle, _expand_rle, @@ -554,12 +553,6 @@ def test_varying_dimension_extent_mismatch_on_chunkgrid_input(self) -> None: class TestBackwardsCompat: - def test_regular_chunk_grid_still_works(self) -> None: - g = RegularChunkGrid(chunk_shape=(10, 20)) - assert g.chunk_shape == (10, 20) - assert g.is_regular - assert isinstance(g, ChunkGrid) - def test_from_dict_regular(self) -> None: d: dict[str, JSON] = {"name": "regular", "configuration": {"chunk_shape": [10, 20]}} g = ChunkGrid.from_dict(d) @@ -567,12 +560,11 @@ def test_from_dict_regular(self) -> None: assert g.is_regular assert g.chunk_shape == (10, 20) - def test_regular_chunk_grid_passed_to_from_dict(self) -> None: - """RegularChunkGrid instances should be convertible.""" - rcg = RegularChunkGrid(chunk_shape=(10, 20)) - g = ChunkGrid.from_dict(rcg) - assert isinstance(g, ChunkGrid) - assert g.is_regular + def test_from_dict_regular_extent_zero(self) -> None: + """from_dict without array shape produces extent=0 sentinel.""" + g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) + # Extent is 0 (unknown) — use 
parse_chunk_grid() to bind real extents + assert all(d.extent == 0 for d in g.dimensions) # --------------------------------------------------------------------------- @@ -1923,42 +1915,25 @@ async def test_append_small_data(self) -> None: # --------------------------------------------------------------------------- -# Bug #4: extent=0 placeholder in RegularChunkGrid / from_dict +# from_dict extent=0 sentinel # --------------------------------------------------------------------------- -class TestExtentPlaceholder: - def test_regular_chunk_grid_chunk_shape_preserved(self) -> None: - """RegularChunkGrid preserves chunk_shape.""" - g = RegularChunkGrid(chunk_shape=(10, 20)) +class TestFromDictExtentSentinel: + def test_from_dict_regular_chunk_shape_preserved(self) -> None: + """from_dict preserves chunk_shape even with extent=0 sentinel.""" + g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) assert g.chunk_shape == (10, 20) - def test_regular_chunk_grid_nchunks_raises(self) -> None: - """RegularChunkGrid raises on get_nchunks() (no extent info).""" - g = RegularChunkGrid(chunk_shape=(10, 20)) - with pytest.raises(ValueError, match="array shape"): - g.get_nchunks() - - def test_regular_chunk_grid_shape_raises(self) -> None: - """RegularChunkGrid raises on .shape (no extent info).""" - g = RegularChunkGrid(chunk_shape=(10, 20)) - with pytest.raises(ValueError, match="array shape"): - _ = g.shape - - def test_regular_chunk_grid_all_chunk_coords_raises(self) -> None: - """RegularChunkGrid raises on all_chunk_coords() (no extent info).""" - g = RegularChunkGrid(chunk_shape=(10, 20)) - with pytest.raises(ValueError, match="array shape"): - list(g.all_chunk_coords()) - - def test_from_dict_regular_raises_on_extent_ops(self) -> None: - """ChunkGrid.from_dict for regular grids raises on extent-dependent ops.""" + def test_from_dict_regular_is_chunk_grid(self) -> None: + """from_dict for regular grids returns a ChunkGrid.""" g = 
ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) - assert g.chunk_shape == (10, 20) - with pytest.raises(ValueError, match="array shape"): - g.get_nchunks() + assert isinstance(g, ChunkGrid) + assert g.is_regular - def test_from_dict_regular_is_regular_chunk_grid(self) -> None: - """ChunkGrid.from_dict for regular grids returns a RegularChunkGrid.""" + def test_parse_chunk_grid_binds_extent(self) -> None: + """parse_chunk_grid resolves extent=0 from from_dict.""" g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) - assert isinstance(g, RegularChunkGrid) + resolved = parse_chunk_grid(g, (100, 200)) + assert resolved.shape == (10, 10) + assert resolved.get_nchunks() == 100 From ffc7805743a825425e7347e7bedcf5bccefb3cd0 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:33:09 -0400 Subject: [PATCH 030/118] Use none rather than sentinel value --- src/zarr/core/chunk_grids.py | 42 +++++++++++++++++++++++--------- tests/test_unified_chunk_grid.py | 26 +++++++++++--------- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 43f1ec9052..12070af17d 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -39,22 +39,36 @@ @dataclass(frozen=True) class FixedDimension: """Uniform chunk size. Boundary chunks contain less data but are - encoded at full size by the codec pipeline.""" + encoded at full size by the codec pipeline. + + ``extent`` is ``None`` when the array shape is not yet known (e.g. when + constructed via ``ChunkGrid.from_dict`` without array shape). Calling any + method that depends on extent will raise ``ValueError`` in that case. 
+ """ size: int # chunk edge length (>= 0) - extent: int # array dimension length + extent: int | None # array dimension length, or None if unknown def __post_init__(self) -> None: if self.size < 0: raise ValueError(f"FixedDimension size must be >= 0, got {self.size}") - if self.extent < 0: + if self.extent is not None and self.extent < 0: raise ValueError(f"FixedDimension extent must be >= 0, got {self.extent}") + def _require_extent(self) -> int: + if self.extent is None: + raise ValueError( + "FixedDimension extent is unknown. " + "Use parse_chunk_grid() to bind array shape before calling this method." + ) + return self.extent + @property def nchunks(self) -> int: + extent = self._require_extent() if self.size == 0: - return 1 if self.extent == 0 else 0 - return ceildiv(self.extent, self.size) + return 1 if extent == 0 else 0 + return ceildiv(extent, self.size) def index_to_chunk(self, idx: int) -> int: if self.size == 0: @@ -70,9 +84,10 @@ def chunk_size(self, chunk_ix: int) -> int: def data_size(self, chunk_ix: int) -> int: """Valid data region within the buffer — clipped at extent.""" + extent = self._require_extent() if self.size == 0: return 0 - return max(0, min(self.size, self.extent - chunk_ix * self.size)) + return max(0, min(self.size, extent - chunk_ix * self.size)) def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: if self.size == 0: @@ -131,7 +146,7 @@ class DimensionGrid(Protocol): @property def nchunks(self) -> int: ... @property - def extent(self) -> int: ... + def extent(self) -> int | None: ... def index_to_chunk(self, idx: int) -> int: ... def chunk_offset(self, chunk_ix: int) -> int: ... def chunk_size(self, chunk_ix: int) -> int: ... 
@@ -476,9 +491,9 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> if not isinstance(chunk_shape_raw, Sequence): raise TypeError(f"chunk_shape must be a sequence, got {type(chunk_shape_raw)}") chunk_shape = tuple(int(cast("int", s)) for s in chunk_shape_raw) - # Without array shape we cannot compute extents. Use a sentinel - # extent of 0; callers that need extent should use parse_chunk_grid(). - return cls(dimensions=tuple(FixedDimension(size=s, extent=0) for s in chunk_shape)) + # Without array shape we cannot compute extents; use extent=None. + # Callers that need extent should use parse_chunk_grid(). + return cls(dimensions=tuple(FixedDimension(size=s, extent=None) for s in chunk_shape)) if name_parsed == "rectilinear": _validate_rectilinear_kind(configuration_parsed) @@ -512,7 +527,7 @@ def parse_chunk_grid( for dim, extent in zip(data.dimensions, array_shape, strict=True): if isinstance(dim, FixedDimension): dims.append(FixedDimension(size=dim.size, extent=extent)) - else: + elif isinstance(dim, VaryingDimension): # VaryingDimension has intrinsic extent (sum of edges). # After resize/shrink the last chunk may extend past the array # boundary, so extent >= array_shape is valid (like regular grids). @@ -522,6 +537,8 @@ def parse_chunk_grid( f"array shape extent {extent} for dimension {len(dims)}" ) dims.append(dim) + else: + raise TypeError(f"Unexpected dimension type: {type(dim)}") return ChunkGrid(dimensions=tuple(dims)) name_parsed, configuration_parsed = parse_named_configuration(data) @@ -582,11 +599,12 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: for dim in grid.dimensions: if isinstance(dim, FixedDimension): # Produce RLE directly without allocating a full edge list. 
+ extent = dim._require_extent() n = dim.nchunks if n == 0: chunk_shapes.append([]) else: - last_data = dim.extent - (n - 1) * dim.size + last_data = extent - (n - 1) * dim.size if last_data == dim.size: chunk_shapes.append([[dim.size, n]]) else: diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 6b2795c3e1..0575ebae49 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -560,11 +560,10 @@ def test_from_dict_regular(self) -> None: assert g.is_regular assert g.chunk_shape == (10, 20) - def test_from_dict_regular_extent_zero(self) -> None: - """from_dict without array shape produces extent=0 sentinel.""" + def test_from_dict_regular_extent_none(self) -> None: + """from_dict without array shape produces extent=None.""" g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) - # Extent is 0 (unknown) — use parse_chunk_grid() to bind real extents - assert all(d.extent == 0 for d in g.dimensions) + assert all(d.extent is None for d in g.dimensions) # --------------------------------------------------------------------------- @@ -1919,20 +1918,25 @@ async def test_append_small_data(self) -> None: # --------------------------------------------------------------------------- -class TestFromDictExtentSentinel: +class TestFromDictExtentNone: def test_from_dict_regular_chunk_shape_preserved(self) -> None: - """from_dict preserves chunk_shape even with extent=0 sentinel.""" + """from_dict preserves chunk_shape even without extent.""" g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) assert g.chunk_shape == (10, 20) - def test_from_dict_regular_is_chunk_grid(self) -> None: - """from_dict for regular grids returns a ChunkGrid.""" + def test_from_dict_regular_extent_is_none(self) -> None: + """from_dict without array shape sets extent=None.""" g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) - assert 
isinstance(g, ChunkGrid) - assert g.is_regular + assert all(d.extent is None for d in g.dimensions) + + def test_from_dict_regular_nchunks_raises(self) -> None: + """Extent-dependent operations raise on shapeless grids.""" + g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) + with pytest.raises(ValueError, match="extent is unknown"): + g.get_nchunks() def test_parse_chunk_grid_binds_extent(self) -> None: - """parse_chunk_grid resolves extent=0 from from_dict.""" + """parse_chunk_grid resolves extent=None from from_dict.""" g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) resolved = parse_chunk_grid(g, (100, 200)) assert resolved.shape == (10, 10) From 9beaee6b917f9e89f3cc94f7782b14c0be84fc4c Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 17:05:27 -0400 Subject: [PATCH 031/118] Remove regular chunk grid --- src/zarr/core/chunk_grids.py | 116 ++++++++++--------------------- tests/test_unified_chunk_grid.py | 80 ++++++--------------- 2 files changed, 57 insertions(+), 139 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 12070af17d..dcceb376c8 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -39,36 +39,22 @@ @dataclass(frozen=True) class FixedDimension: """Uniform chunk size. Boundary chunks contain less data but are - encoded at full size by the codec pipeline. - - ``extent`` is ``None`` when the array shape is not yet known (e.g. when - constructed via ``ChunkGrid.from_dict`` without array shape). Calling any - method that depends on extent will raise ``ValueError`` in that case. 
- """ + encoded at full size by the codec pipeline.""" size: int # chunk edge length (>= 0) - extent: int | None # array dimension length, or None if unknown + extent: int # array dimension length def __post_init__(self) -> None: if self.size < 0: raise ValueError(f"FixedDimension size must be >= 0, got {self.size}") - if self.extent is not None and self.extent < 0: + if self.extent < 0: raise ValueError(f"FixedDimension extent must be >= 0, got {self.extent}") - def _require_extent(self) -> int: - if self.extent is None: - raise ValueError( - "FixedDimension extent is unknown. " - "Use parse_chunk_grid() to bind array shape before calling this method." - ) - return self.extent - @property def nchunks(self) -> int: - extent = self._require_extent() if self.size == 0: - return 1 if extent == 0 else 0 - return ceildiv(extent, self.size) + return 1 if self.extent == 0 else 0 + return ceildiv(self.extent, self.size) def index_to_chunk(self, idx: int) -> int: if self.size == 0: @@ -84,10 +70,9 @@ def chunk_size(self, chunk_ix: int) -> int: def data_size(self, chunk_ix: int) -> int: """Valid data region within the buffer — clipped at extent.""" - extent = self._require_extent() if self.size == 0: return 0 - return max(0, min(self.size, extent - chunk_ix * self.size)) + return max(0, min(self.size, self.extent - chunk_ix * self.size)) def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: if self.size == 0: @@ -146,7 +131,7 @@ class DimensionGrid(Protocol): @property def nchunks(self) -> int: ... @property - def extent(self) -> int | None: ... + def extent(self) -> int: ... def index_to_chunk(self, idx: int) -> int: ... def chunk_offset(self, chunk_ix: int) -> int: ... def chunk_size(self, chunk_ix: int) -> int: ... 
@@ -206,21 +191,27 @@ def _expand_rle(data: Sequence[list[int] | int]) -> list[int]: return result -def _compress_rle(sizes: Sequence[int]) -> list[list[int]]: - """Compress chunk sizes to RLE: [10,10,10,20,20] -> [[10,3],[20,2]]""" +def _compress_rle(sizes: Sequence[int]) -> list[list[int] | int]: + """Compress chunk sizes to mixed RLE format per the rectilinear spec. + + Runs of length > 1 are emitted as ``[value, count]`` pairs; runs of + length 1 are emitted as bare integers:: + + [10, 10, 10, 5] -> [[10, 3], 5] + """ if not sizes: return [] - result: list[list[int]] = [] + result: list[list[int] | int] = [] current = sizes[0] count = 1 for s in sizes[1:]: if s == current: count += 1 else: - result.append([current, count]) + result.append([current, count] if count > 1 else current) current = s count = 1 - result.append([current, count]) + result.append([current, count] if count > 1 else current) return result @@ -255,15 +246,11 @@ def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[in The raw JSON value for one dimension's chunk edges. array_extent Array length along this dimension. Required when *dim_spec* is a bare - integer (to know how many repetitions). May be ``None`` when the extent - is not yet known (e.g. ``from_dict`` without array shape). + integer (to know how many repetitions). """ if isinstance(dim_spec, int): if array_extent is None: - raise ValueError( - "Integer chunk_shapes shorthand requires array shape to expand. " - "Use parse_chunk_grid() instead of ChunkGrid.from_dict()." 
- ) + raise ValueError("Integer chunk_shapes shorthand requires array shape to expand.") if dim_spec <= 0: raise ValueError(f"Integer chunk edge length must be > 0, got {dim_spec}") n = ceildiv(array_extent, dim_spec) @@ -475,51 +462,19 @@ def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: raise TypeError(f"Unexpected dimension type: {type(dim)}") return ChunkGrid(dimensions=tuple(dims)) - # -- Serialization -- - - @classmethod - def from_dict(cls, data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any]) -> ChunkGrid: - if isinstance(data, ChunkGrid): - return data - - name_parsed, configuration_parsed = parse_named_configuration(data) - - if name_parsed == "regular": - chunk_shape_raw = configuration_parsed.get("chunk_shape") - if chunk_shape_raw is None: - raise ValueError("Regular chunk grid requires 'chunk_shape' configuration") - if not isinstance(chunk_shape_raw, Sequence): - raise TypeError(f"chunk_shape must be a sequence, got {type(chunk_shape_raw)}") - chunk_shape = tuple(int(cast("int", s)) for s in chunk_shape_raw) - # Without array shape we cannot compute extents; use extent=None. - # Callers that need extent should use parse_chunk_grid(). - return cls(dimensions=tuple(FixedDimension(size=s, extent=None) for s in chunk_shape)) - - if name_parsed == "rectilinear": - _validate_rectilinear_kind(configuration_parsed) - chunk_shapes_raw = configuration_parsed.get("chunk_shapes") - if chunk_shapes_raw is None: - raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") - if not isinstance(chunk_shapes_raw, Sequence): - raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - decoded = [_decode_dim_spec(dim_spec) for dim_spec in chunk_shapes_raw] - return cls.from_rectilinear(decoded) - - raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") - # ChunkGrid does not serialize itself. The format choice ("regular" vs - # "rectilinear") belongs to the metadata layer. 
Use serialize_chunk_grid(). + # "rectilinear") belongs to the metadata layer. Use serialize_chunk_grid() + # for output and parse_chunk_grid() for input. def parse_chunk_grid( data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], array_shape: tuple[int, ...], ) -> ChunkGrid: - """Create a ChunkGrid from a metadata dict, injecting array shape as extent. + """Create a ChunkGrid from a metadata dict or existing grid, binding array shape. This is the primary entry point for constructing a ChunkGrid from serialized - metadata. Unlike ``ChunkGrid.from_dict``, this always produces a grid with - correct extent values. + metadata. It always produces a grid with correct extent values. """ if isinstance(data, ChunkGrid): # Re-bind extent if array_shape differs from what's stored @@ -598,25 +553,28 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: chunk_shapes: list[Any] = [] for dim in grid.dimensions: if isinstance(dim, FixedDimension): - # Produce RLE directly without allocating a full edge list. - extent = dim._require_extent() + # Produce the most compact spec representation. 
n = dim.nchunks if n == 0: chunk_shapes.append([]) else: - last_data = extent - (n - 1) * dim.size + last_data = dim.extent - (n - 1) * dim.size if last_data == dim.size: - chunk_shapes.append([[dim.size, n]]) + # All chunks uniform → integer shorthand + chunk_shapes.append(dim.size) + elif n == 1: + # Single boundary chunk → bare integer + chunk_shapes.append([last_data]) + elif n == 2: + # One full chunk + one boundary → bare integers + chunk_shapes.append([dim.size, last_data]) else: - rle: list[list[int]] = [] - if n > 1: - rle.append([dim.size, n - 1]) - rle.append([last_data, 1]) - chunk_shapes.append(rle) + # RLE for the uniform run + bare int for boundary + chunk_shapes.append([[dim.size, n - 1], last_data]) elif isinstance(dim, VaryingDimension): edges = list(dim.edges) rle = _compress_rle(edges) - if sum(count for _, count in rle) == len(edges) and len(rle) < len(edges): + if len(rle) < len(edges): chunk_shapes.append(rle) else: chunk_shapes.append(edges) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 0575ebae49..ec70740267 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -19,8 +19,6 @@ if TYPE_CHECKING: from pathlib import Path - from zarr.core.common import JSON - from zarr.core.chunk_grids import ( ChunkGrid, ChunkSpec, @@ -302,7 +300,9 @@ def test_expand(self) -> None: def test_compress(self) -> None: assert _compress_rle([10, 10, 10]) == [[10, 3]] - assert _compress_rle([10, 10, 20]) == [[10, 2], [20, 1]] + assert _compress_rle([10, 10, 20]) == [[10, 2], 20] + assert _compress_rle([5]) == [5] + assert _compress_rle([10, 20, 30]) == [10, 20, 30] def test_roundtrip(self) -> None: original = [10, 10, 10, 20, 20, 30] @@ -323,7 +323,7 @@ def test_regular_roundtrip(self) -> None: config = d["configuration"] assert isinstance(config, dict) assert tuple(config["chunk_shape"]) == (10, 20) - g2 = ChunkGrid.from_dict(d) + g2 = parse_chunk_grid(d, (100, 200)) assert g2.is_regular 
assert g2.chunk_shape == (10, 20) @@ -331,7 +331,7 @@ def test_rectilinear_roundtrip(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" - g2 = ChunkGrid.from_dict(d) + g2 = parse_chunk_grid(d, (60, 100)) assert not g2.is_regular # Verify the reconstructed grid has same dimensions spec0 = g2[(0, 0)] @@ -363,19 +363,19 @@ def test_rectilinear_rle_with_varying(self) -> None: assert isinstance(config, dict) chunk_shapes = config["chunk_shapes"] assert isinstance(chunk_shapes, list) - assert chunk_shapes[0] == [[100, 3], [50, 1]] + assert chunk_shapes[0] == [[100, 3], 50] def test_json_roundtrip(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) d = serialize_chunk_grid(g, "rectilinear") json_str = json.dumps(d) d2 = json.loads(json_str) - g2 = ChunkGrid.from_dict(d2) + g2 = parse_chunk_grid(d2, (60, 100)) assert g2.shape == (3, 2) def test_unknown_name_raises(self) -> None: with pytest.raises(ValueError, match="Unknown chunk grid"): - ChunkGrid.from_dict({"name": "hexagonal", "configuration": {}}) + parse_chunk_grid({"name": "hexagonal", "configuration": {}}, (10,)) def test_serialize_non_regular_as_regular_raises(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) @@ -399,7 +399,7 @@ def test_kind_inline_required_on_deserialize(self) -> None: "configuration": {"chunk_shapes": [[10, 20], [15, 15]]}, } with pytest.raises(ValueError, match="requires a 'kind' field"): - ChunkGrid.from_dict(data) + parse_chunk_grid(data, (30, 30)) def test_kind_unknown_rejected(self) -> None: data: dict[str, Any] = { @@ -407,7 +407,7 @@ def test_kind_unknown_rejected(self) -> None: "configuration": {"kind": "reference", "chunk_shapes": [[10, 20], [15, 15]]}, } with pytest.raises(ValueError, match="Unsupported rectilinear chunk grid kind"): - ChunkGrid.from_dict(data) + parse_chunk_grid(data, (30, 30)) def 
test_kind_inline_in_serialized_output(self) -> None: """Serialization includes kind: 'inline'.""" @@ -547,25 +547,6 @@ def test_varying_dimension_extent_mismatch_on_chunkgrid_input(self) -> None: parse_chunk_grid(g, (100, 50)) -# --------------------------------------------------------------------------- -# Backwards compatibility -# --------------------------------------------------------------------------- - - -class TestBackwardsCompat: - def test_from_dict_regular(self) -> None: - d: dict[str, JSON] = {"name": "regular", "configuration": {"chunk_shape": [10, 20]}} - g = ChunkGrid.from_dict(d) - assert isinstance(g, ChunkGrid) - assert g.is_regular - assert g.chunk_shape == (10, 20) - - def test_from_dict_regular_extent_none(self) -> None: - """from_dict without array shape produces extent=None.""" - g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) - assert all(d.extent is None for d in g.dimensions) - - # --------------------------------------------------------------------------- # Indexing with rectilinear grids # --------------------------------------------------------------------------- @@ -676,7 +657,7 @@ def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: """Verify metadata round-trips through JSON.""" g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) d = serialize_chunk_grid(g, "rectilinear") - g2 = ChunkGrid.from_dict(d) + g2 = parse_chunk_grid(d, (60, 100)) assert g2.shape == g.shape for coord in g.all_chunk_coords(): orig_spec = g[coord] @@ -928,7 +909,7 @@ def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: assert sum(expanded) == 95 # extent preserved assert expanded[-1] == 5 # boundary chunk - g2 = ChunkGrid.from_dict(d) + g2 = parse_chunk_grid(d, (60, 95)) assert g2.shape == g.shape # Round-tripped grid should have correct extent for coord in g.all_chunk_coords(): @@ -947,7 +928,7 @@ def test_exact_extent_fixed_dim_rectilinear_roundtrip(self) -> None: ) ) d = 
serialize_chunk_grid(g, "rectilinear") - g2 = ChunkGrid.from_dict(d) + g2 = parse_chunk_grid(d, (30, 100)) assert g2.shape == g.shape # All chunks should be uniform for coord in g.all_chunk_coords(): @@ -1912,32 +1893,11 @@ async def test_append_small_data(self) -> None: result = await arr.getitem((slice(20, 23), slice(None))) np.testing.assert_array_equal(result, small) - -# --------------------------------------------------------------------------- -# from_dict extent=0 sentinel -# --------------------------------------------------------------------------- - - -class TestFromDictExtentNone: - def test_from_dict_regular_chunk_shape_preserved(self) -> None: - """from_dict preserves chunk_shape even without extent.""" - g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) + def test_parse_chunk_grid_regular_from_dict(self) -> None: + """parse_chunk_grid constructs a regular grid from a metadata dict.""" + d: dict[str, Any] = {"name": "regular", "configuration": {"chunk_shape": [10, 20]}} + g = parse_chunk_grid(d, (100, 200)) + assert g.is_regular assert g.chunk_shape == (10, 20) - - def test_from_dict_regular_extent_is_none(self) -> None: - """from_dict without array shape sets extent=None.""" - g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) - assert all(d.extent is None for d in g.dimensions) - - def test_from_dict_regular_nchunks_raises(self) -> None: - """Extent-dependent operations raise on shapeless grids.""" - g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) - with pytest.raises(ValueError, match="extent is unknown"): - g.get_nchunks() - - def test_parse_chunk_grid_binds_extent(self) -> None: - """parse_chunk_grid resolves extent=None from from_dict.""" - g = ChunkGrid.from_dict({"name": "regular", "configuration": {"chunk_shape": [10, 20]}}) - resolved = parse_chunk_grid(g, (100, 200)) - assert resolved.shape == (10, 10) - assert 
resolved.get_nchunks() == 100 + assert g.shape == (10, 10) + assert g.get_nchunks() == 100 From da2c08b04bec2872549799188d68895cdde1df04 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 21:19:08 -0400 Subject: [PATCH 032/118] Fix boundary handling in VaryingDimension --- src/zarr/core/chunk_grids.py | 94 ++++++++--- src/zarr/core/indexing.py | 33 ++-- tests/test_unified_chunk_grid.py | 263 ++++++++++++++++++++++++++++--- 3 files changed, 328 insertions(+), 62 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index dcceb376c8..0ea99e0ec8 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -82,30 +82,35 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int @dataclass(frozen=True) class VaryingDimension: - """Explicit per-chunk sizes. No padding — each edge length is - both the codec size and the data size.""" + """Explicit per-chunk sizes. The last chunk may extend past the array + extent, in which case ``data_size`` clips to the valid region while + ``chunk_size`` returns the full edge length for codec processing.""" edges: tuple[int, ...] # per-chunk edge lengths (all > 0) cumulative: tuple[int, ...] 
# prefix sums for O(log n) lookup + extent: int # array dimension length (may be < sum(edges) after resize) - def __init__(self, edges: Sequence[int]) -> None: + def __init__(self, edges: Sequence[int], extent: int) -> None: edges_tuple = tuple(edges) if not edges_tuple: raise ValueError("VaryingDimension edges must not be empty") if any(e <= 0 for e in edges_tuple): raise ValueError(f"All edge lengths must be > 0, got {edges_tuple}") cumulative = tuple(itertools.accumulate(edges_tuple)) + if extent < 0: + raise ValueError(f"VaryingDimension extent must be >= 0, got {extent}") + if extent > cumulative[-1]: + raise ValueError( + f"VaryingDimension extent {extent} exceeds sum of edges {cumulative[-1]}" + ) object.__setattr__(self, "edges", edges_tuple) object.__setattr__(self, "cumulative", cumulative) + object.__setattr__(self, "extent", extent) @property def nchunks(self) -> int: return len(self.edges) - @property - def extent(self) -> int: - return self.cumulative[-1] - def index_to_chunk(self, idx: int) -> int: return bisect.bisect_right(self.cumulative, idx) @@ -117,8 +122,9 @@ def chunk_size(self, chunk_ix: int) -> int: return self.edges[chunk_ix] def data_size(self, chunk_ix: int) -> int: - """Valid data region — same as chunk_size for varying dims.""" - return self.edges[chunk_ix] + """Valid data region within the buffer — clipped at extent.""" + offset = self.chunk_offset(chunk_ix) + return max(0, min(self.edges[chunk_ix], self.extent - offset)) def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: return np.searchsorted(self.cumulative, indices, side="right") @@ -348,7 +354,7 @@ def from_rectilinear(cls, chunk_shapes: Sequence[Sequence[int]]) -> ChunkGrid: if all(e == edges_list[0] for e in edges_list): dims.append(FixedDimension(size=edges_list[0], extent=extent)) else: - dims.append(VaryingDimension(edges_list)) + dims.append(VaryingDimension(edges_list, extent=extent)) return cls(dimensions=tuple(dims)) # -- Properties -- @@ 
-408,8 +414,57 @@ def __iter__(self) -> Iterator[ChunkSpec]: if spec is not None: yield spec - def all_chunk_coords(self) -> Iterator[tuple[int, ...]]: - return itertools.product(*(range(d.nchunks) for d in self.dimensions)) + def all_chunk_coords( + self, + *, + origin: Sequence[int] | None = None, + selection_shape: Sequence[int] | None = None, + ) -> Iterator[tuple[int, ...]]: + """Iterate over chunk coordinates, optionally restricted to a subregion. + + Parameters + ---------- + origin : Sequence[int] | None + The first chunk coordinate to return. Defaults to the grid origin. + selection_shape : Sequence[int] | None + The number of chunks per dimension to iterate. Defaults to the + remaining extent from origin. + """ + if origin is None: + origin_parsed = (0,) * self.ndim + else: + origin_parsed = tuple(origin) + if selection_shape is None: + selection_shape_parsed = tuple( + g - o for o, g in zip(origin_parsed, self.shape, strict=True) + ) + else: + selection_shape_parsed = tuple(selection_shape) + ranges = tuple( + range(o, o + s) for o, s in zip(origin_parsed, selection_shape_parsed, strict=True) + ) + return itertools.product(*ranges) + + def iter_chunk_regions( + self, + *, + origin: Sequence[int] | None = None, + selection_shape: Sequence[int] | None = None, + ) -> Iterator[tuple[slice, ...]]: + """Iterate over the data regions (slices) spanned by each chunk. + + Parameters + ---------- + origin : Sequence[int] | None + The first chunk coordinate to return. Defaults to the grid origin. + selection_shape : Sequence[int] | None + The number of chunks per dimension to iterate. Defaults to the + remaining extent from origin. 
+ """ + for coords in self.all_chunk_coords(origin=origin, selection_shape=selection_shape): + spec = self[coords] + if spec is not None: + yield spec.slices def get_nchunks(self) -> int: return reduce(operator.mul, (d.nchunks for d in self.dimensions), 1) @@ -447,7 +502,7 @@ def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: dims.append(dim) elif new_extent > old_extent: expanded_edges = list(dim.edges) + [new_extent - old_extent] - dims.append(VaryingDimension(expanded_edges)) + dims.append(VaryingDimension(expanded_edges, extent=new_extent)) else: # Shrink: keep chunks whose cumulative offset covers new_extent shrunk_edges: list[int] = [] @@ -457,7 +512,7 @@ def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: total += edge if total >= new_extent: break - dims.append(VaryingDimension(shrunk_edges)) + dims.append(VaryingDimension(shrunk_edges, extent=new_extent)) else: raise TypeError(f"Unexpected dimension type: {type(dim)}") return ChunkGrid(dimensions=tuple(dims)) @@ -483,15 +538,16 @@ def parse_chunk_grid( if isinstance(dim, FixedDimension): dims.append(FixedDimension(size=dim.size, extent=extent)) elif isinstance(dim, VaryingDimension): - # VaryingDimension has intrinsic extent (sum of edges). # After resize/shrink the last chunk may extend past the array - # boundary, so extent >= array_shape is valid (like regular grids). - if dim.extent < extent: + # boundary, so sum(edges) >= array_shape is valid (like regular grids). 
+ edge_sum = sum(dim.edges) + if edge_sum < extent: raise ValueError( - f"VaryingDimension extent {dim.extent} is less than " + f"VaryingDimension edge sum {edge_sum} is less than " f"array shape extent {extent} for dimension {len(dims)}" ) - dims.append(dim) + # Re-bind extent to the actual array shape + dims.append(VaryingDimension(dim.edges, extent=extent)) else: raise TypeError(f"Unexpected dimension type: {type(dim)}") return ChunkGrid(dimensions=tuple(dims)) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 7a4490c7e4..8ea8b06751 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -690,8 +690,6 @@ def __init__( object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) def __iter__(self) -> Iterator[ChunkDimProjection]: - from zarr.core.chunk_grids import FixedDimension - g = self.dim_grid # iterate over chunks with at least one item @@ -701,9 +699,10 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: chunk_len = g.data_size(dim_chunk_ix) dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + chunk_len] - # pad out if final chunk (for fixed grids, actual chunk may be smaller than declared size) - if isinstance(g, FixedDimension) and dim_chunk_sel.shape[0] < g.size: - tmp = np.zeros(g.size, dtype=bool) + # pad out if boundary chunk (codec buffer may be larger than valid data region) + codec_size = g.chunk_size(dim_chunk_ix) + if dim_chunk_sel.shape[0] < codec_size: + tmp = np.zeros(codec_size, dtype=bool) tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel dim_chunk_sel = tmp @@ -917,8 +916,8 @@ def oindex_set(a: npt.NDArray[Any], selection: Selection, value: Any) -> None: @dataclass(frozen=True) class OrthogonalIndexer(Indexer): dim_indexers: list[IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer] + dim_grids: tuple[DimensionGrid, ...] shape: tuple[int, ...] - chunk_shape: tuple[int, ...] is_advanced: bool drop_axes: tuple[int, ...] 
@@ -969,23 +968,9 @@ def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: Chu else: drop_axes = () - # Compute chunk_shape for ix_() compatibility in __iter__. - # For VaryingDimension, use the max edge length so that - # slice_to_range produces correct ranges for the largest chunk. - from zarr.core.chunk_grids import FixedDimension, VaryingDimension - - chunk_shape = tuple( - g.size - if isinstance(g, FixedDimension) - else max(g.edges) - if isinstance(g, VaryingDimension) - else g.chunk_size(0) - for g in dim_grids - ) - object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__(self, "dim_grids", dim_grids) object.__setattr__(self, "shape", shape) - object.__setattr__(self, "chunk_shape", chunk_shape) object.__setattr__(self, "is_advanced", is_advanced) object.__setattr__(self, "drop_axes", drop_axes) @@ -1005,7 +990,11 @@ def __iter__(self) -> Iterator[ChunkProjection]: # so need to work around via np.ix_. Also np.ix_ does not support a # mixture of arrays and slices or integers, so need to convert slices # and integers into ranges. 
- chunk_selection = ix_(chunk_selection, self.chunk_shape) + chunk_shape = tuple( + g.chunk_size(p.dim_chunk_ix) + for g, p in zip(self.dim_grids, dim_projections, strict=True) + ) + chunk_selection = ix_(chunk_selection, chunk_shape) # special case for non-monotonic indices if not is_basic_selection(out_selection): diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index ec70740267..c4cf3889d8 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -99,14 +99,14 @@ def test_zero_size_allowed(self) -> None: class TestVaryingDimension: def test_basic(self) -> None: - d = VaryingDimension([10, 20, 30]) + d = VaryingDimension([10, 20, 30], extent=60) assert d.edges == (10, 20, 30) assert d.cumulative == (10, 30, 60) assert d.nchunks == 3 assert d.extent == 60 def test_index_to_chunk(self) -> None: - d = VaryingDimension([10, 20, 30]) + d = VaryingDimension([10, 20, 30], extent=60) assert d.index_to_chunk(0) == 0 assert d.index_to_chunk(9) == 0 assert d.index_to_chunk(10) == 1 @@ -115,37 +115,37 @@ def test_index_to_chunk(self) -> None: assert d.index_to_chunk(59) == 2 def test_chunk_offset(self) -> None: - d = VaryingDimension([10, 20, 30]) + d = VaryingDimension([10, 20, 30], extent=60) assert d.chunk_offset(0) == 0 assert d.chunk_offset(1) == 10 assert d.chunk_offset(2) == 30 def test_chunk_size(self) -> None: - d = VaryingDimension([10, 20, 30]) + d = VaryingDimension([10, 20, 30], extent=60) assert d.chunk_size(0) == 10 assert d.chunk_size(1) == 20 assert d.chunk_size(2) == 30 def test_data_size(self) -> None: - d = VaryingDimension([10, 20, 30]) - # data_size == chunk_size for varying dims + d = VaryingDimension([10, 20, 30], extent=60) + # data_size == chunk_size when extent == sum(edges) (no boundary) assert d.data_size(0) == 10 assert d.data_size(1) == 20 assert d.data_size(2) == 30 def test_vectorized(self) -> None: - d = VaryingDimension([10, 20, 30]) + d = VaryingDimension([10, 20, 30], 
extent=60) indices = np.array([0, 9, 10, 29, 30, 59]) chunks = d.indices_to_chunks(indices) np.testing.assert_array_equal(chunks, [0, 0, 1, 1, 2, 2]) def test_empty_rejected(self) -> None: with pytest.raises(ValueError, match="must not be empty"): - VaryingDimension([]) + VaryingDimension([], extent=0) def test_zero_edge_rejected(self) -> None: with pytest.raises(ValueError, match="must be > 0"): - VaryingDimension([10, 0, 5]) + VaryingDimension([10, 0, 5], extent=15) # --------------------------------------------------------------------------- @@ -886,7 +886,7 @@ def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: """A rectilinear grid with a boundary FixedDimension preserves extent.""" g = ChunkGrid( dimensions=( - VaryingDimension([10, 20, 30]), + VaryingDimension([10, 20, 30], extent=60), FixedDimension(size=10, extent=95), ) ) @@ -923,7 +923,7 @@ def test_exact_extent_fixed_dim_rectilinear_roundtrip(self) -> None: """No boundary: extent == size * nchunks round-trips cleanly.""" g = ChunkGrid( dimensions=( - VaryingDimension([10, 20]), + VaryingDimension([10, 20], extent=30), FixedDimension(size=25, extent=100), ) ) @@ -1092,7 +1092,7 @@ def test_zero_nchunks_fixed_dim_in_rectilinear_serialize(self) -> None: """A rectilinear grid with a 0-nchunks FixedDimension serializes.""" g = ChunkGrid( dimensions=( - VaryingDimension([10, 20]), + VaryingDimension([10, 20], extent=30), FixedDimension(size=10, extent=0), ) ) @@ -1104,7 +1104,7 @@ def test_zero_nchunks_fixed_dim_in_rectilinear_serialize(self) -> None: def test_varying_dim_data_size_equals_chunk_size(self) -> None: """For VaryingDimension, data_size == chunk_size (no padding).""" - d = VaryingDimension([10, 20, 5]) + d = VaryingDimension([10, 20, 5], extent=35) for i in range(3): assert d.data_size(i) == d.chunk_size(i) @@ -1149,9 +1149,9 @@ def test_orthogonal_bool_array_selection_rectilinear(self) -> None: projections = list(indexer) assert len(projections) > 0 - def 
test_orthogonal_advanced_indexing_chunk_shape_not_one(self) -> None: - """Verify OrthogonalIndexer.chunk_shape reflects actual chunk sizes, - not a hardcoded 1 for VaryingDimension.""" + def test_orthogonal_advanced_indexing_produces_correct_projections(self) -> None: + """Verify OrthogonalIndexer produces correct chunk projections + for advanced indexing with VaryingDimension.""" from zarr.core.indexing import OrthogonalIndexer g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) @@ -1160,11 +1160,16 @@ def test_orthogonal_advanced_indexing_chunk_shape_not_one(self) -> None: shape=(60, 100), chunk_grid=g, ) - # chunk_shape should NOT have 1 for the VaryingDimension - # The first dim has varying chunks [10, 20, 30] — we need a - # representative size for ix_() to work. Using the max is safe. - assert indexer.chunk_shape[0] > 1 # was incorrectly 1 before fix - assert indexer.chunk_shape[1] == 50 + projections = list(indexer) + # index 5 is in chunk 0 (edges [10,...]), index 15 is in chunk 1 (edges [...,20,...]) + # dim 1 slice(None) covers both chunks [50, 50] + # cartesian product: 2 chunks in dim 0 x 2 chunks in dim 1 = 4 projections + assert len(projections) == 4 + coords = [p.chunk_coords for p in projections] + assert (0, 0) in coords + assert (0, 1) in coords + assert (1, 0) in coords + assert (1, 1) in coords class TestShardingValidationRectilinear: @@ -1901,3 +1906,219 @@ def test_parse_chunk_grid_regular_from_dict(self) -> None: assert g.chunk_shape == (10, 20) assert g.shape == (10, 10) assert g.get_nchunks() == 100 + + +# --------------------------------------------------------------------------- +# Boundary chunk tests +# --------------------------------------------------------------------------- + + +class TestVaryingDimensionBoundary: + """VaryingDimension with extent < sum(edges), mirroring how FixedDimension + handles boundary chunks.""" + + def test_extent_parameter(self) -> None: + d = VaryingDimension([10, 20, 30], extent=50) + assert d.extent 
== 50 + assert d.chunk_size(2) == 30 # codec buffer: full edge + assert d.data_size(2) == 20 # valid data: clipped to extent + + def test_extent_equals_sum_no_clipping(self) -> None: + d = VaryingDimension([10, 20, 30], extent=60) + assert d.extent == 60 + assert d.data_size(2) == 30 # no clipping when extent == sum(edges) + + def test_data_size_interior_chunks_unaffected(self) -> None: + d = VaryingDimension([10, 20, 30], extent=50) + assert d.data_size(0) == 10 # fully within extent + assert d.data_size(1) == 20 # fully within extent (offset 10, ends at 30) + + def test_data_size_at_exact_boundary(self) -> None: + d = VaryingDimension([10, 20, 30], extent=60) + # extent == sum(edges), so no clipping + assert d.data_size(2) == 30 + + def test_data_size_single_element_boundary(self) -> None: + d = VaryingDimension([10, 20, 30], extent=31) + assert d.data_size(0) == 10 + assert d.data_size(1) == 20 + assert d.data_size(2) == 1 # only 1 element in last chunk + + def test_extent_exceeds_sum_rejected(self) -> None: + with pytest.raises(ValueError, match="exceeds sum of edges"): + VaryingDimension([10, 20], extent=50) + + def test_negative_extent_rejected(self) -> None: + with pytest.raises(ValueError, match="must be >= 0"): + VaryingDimension([10, 20], extent=-1) + + def test_chunk_spec_boundary_varying(self) -> None: + """ChunkGrid with a boundary VaryingDimension produces correct ChunkSpec.""" + g = ChunkGrid(dimensions=(VaryingDimension([10, 20, 30], extent=50),)) + spec = g[(2,)] + assert spec is not None + assert spec.codec_shape == (30,) # full edge + assert spec.shape == (20,) # clipped to extent + assert spec.is_boundary is True + + def test_chunk_spec_interior_varying(self) -> None: + g = ChunkGrid(dimensions=(VaryingDimension([10, 20, 30], extent=50),)) + spec = g[(0,)] + assert spec is not None + assert spec.codec_shape == (10,) + assert spec.shape == (10,) + assert spec.is_boundary is False + + +class TestBoundaryIndexing: + """Indexing operations on 
boundary chunks for both FixedDimension and + VaryingDimension, ensuring the isinstance cleanup works correctly.""" + + def test_bool_indexer_fixed_boundary(self) -> None: + """BoolArrayDimIndexer pads to codec size for FixedDimension boundary.""" + from zarr.core.indexing import BoolArrayDimIndexer + + # array extent 7, chunk size 5 → 2 chunks, last has data_size=2 + dim = FixedDimension(size=5, extent=7) + mask = np.array([False, False, False, False, False, True, True]) + indexer = BoolArrayDimIndexer(mask, 7, dim) + projections = list(indexer) + assert len(projections) == 1 + p = projections[0] + assert p.dim_chunk_ix == 1 + # boolean selection should be padded to chunk_size (5) + sel = p.dim_chunk_sel + assert isinstance(sel, np.ndarray) + assert sel.shape[0] == 5 + assert sel[0] is np.True_ + assert sel[1] is np.True_ + assert sel[2] is np.False_ # padding + + def test_bool_indexer_varying_boundary(self) -> None: + """BoolArrayDimIndexer pads to codec size for VaryingDimension boundary.""" + from zarr.core.indexing import BoolArrayDimIndexer + + # edges [5, 10], extent=7 -> last chunk has data_size=2, chunk_size=10 + dim = VaryingDimension([5, 10], extent=7) + mask = np.array([False, False, False, False, False, True, True]) + indexer = BoolArrayDimIndexer(mask, 7, dim) + projections = list(indexer) + assert len(projections) == 1 + p = projections[0] + assert p.dim_chunk_ix == 1 + # boolean selection should be padded to chunk_size (10) + sel = p.dim_chunk_sel + assert isinstance(sel, np.ndarray) + assert sel.shape[0] == 10 + assert sel[0] is np.True_ + assert sel[1] is np.True_ + assert sel[2] is np.False_ # padding + + def test_bool_indexer_no_padding_interior(self) -> None: + """No padding needed for interior chunks.""" + from zarr.core.indexing import BoolArrayDimIndexer + + dim = FixedDimension(size=5, extent=10) + mask = np.array([True, False, False, False, False, False, False, False, False, False]) + indexer = BoolArrayDimIndexer(mask, 10, dim) + 
projections = list(indexer) + assert len(projections) == 1 + p = projections[0] + assert p.dim_chunk_ix == 0 + sel = p.dim_chunk_sel + assert isinstance(sel, np.ndarray) + assert sel.shape[0] == 5 # equals chunk_size, no padding needed + + def test_slice_indexer_varying_boundary(self) -> None: + """SliceDimIndexer clips to data_size at boundary for VaryingDimension.""" + from zarr.core.indexing import SliceDimIndexer + + dim = VaryingDimension([5, 10], extent=7) + # select all elements + indexer = SliceDimIndexer(slice(None), 7, dim) + projections = list(indexer) + assert len(projections) == 2 + # chunk 0: full chunk + assert projections[0].dim_chunk_sel == slice(0, 5, 1) + # chunk 1: clipped to data_size (2), not chunk_size (10) + assert projections[1].dim_chunk_sel == slice(0, 2, 1) + + def test_int_array_indexer_varying_boundary(self) -> None: + """IntArrayDimIndexer handles indices near boundary correctly.""" + from zarr.core.indexing import IntArrayDimIndexer + + dim = VaryingDimension([5, 10], extent=7) + indices = np.array([6]) # in chunk 1, offset 5, so chunk-local = 1 + indexer = IntArrayDimIndexer(indices, 7, dim) + projections = list(indexer) + assert len(projections) == 1 + assert projections[0].dim_chunk_ix == 1 + sel = projections[0].dim_chunk_sel + assert isinstance(sel, np.ndarray) + np.testing.assert_array_equal(sel, [1]) + + def test_orthogonal_indexer_varying_boundary_advanced(self) -> None: + """OrthogonalIndexer with advanced indexing uses per-chunk chunk_size + for ix_() conversion, not a precomputed max.""" + from zarr.core.indexing import OrthogonalIndexer + + # 2D: dim 0 has boundary chunk, dim 1 is regular + g = ChunkGrid( + dimensions=( + VaryingDimension([5, 10], extent=7), + FixedDimension(size=4, extent=8), + ) + ) + indexer = OrthogonalIndexer( + selection=(np.array([0, 6]), slice(None)), + shape=(7, 8), + chunk_grid=g, + ) + projections = list(indexer) + # index 0 → chunk 0, index 6 → chunk 1; dim 1 has 2 chunks + assert 
len(projections) == 4 + coords = {p.chunk_coords for p in projections} + assert coords == {(0, 0), (0, 1), (1, 0), (1, 1)} + + +class TestUpdateShapeBoundary: + """Resize creates boundary VaryingDimensions with correct extent.""" + + def test_shrink_creates_boundary(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20, 30]]) + new_grid = grid.update_shape((45,)) + dim = new_grid.dimensions[0] + assert isinstance(dim, VaryingDimension) + assert dim.edges == (10, 20, 30) # last chunk kept (cumulative 60 >= 45) + assert dim.extent == 45 + assert dim.chunk_size(2) == 30 # codec buffer + assert dim.data_size(2) == 15 # clipped: 45 - 30 = 15 + + def test_shrink_to_exact_boundary(self) -> None: + grid = ChunkGrid.from_rectilinear([[10, 20, 30]]) + new_grid = grid.update_shape((30,)) + dim = new_grid.dimensions[0] + assert isinstance(dim, VaryingDimension) + assert dim.edges == (10, 20) # chunk 2 dropped entirely + assert dim.extent == 30 + assert dim.data_size(1) == 20 # no clipping needed + + def test_shrink_chunk_spec(self) -> None: + """After shrink, ChunkSpec reflects boundary correctly.""" + grid = ChunkGrid.from_rectilinear([[10, 20, 30]]) + new_grid = grid.update_shape((45,)) + spec = new_grid[(2,)] + assert spec is not None + assert spec.codec_shape == (30,) + assert spec.shape == (15,) + assert spec.is_boundary is True + + def test_parse_chunk_grid_rebinds_extent(self) -> None: + """parse_chunk_grid re-binds VaryingDimension extent to array shape.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30]]) + # sum(edges)=60, array_shape=50 → re-bind extent + g2 = parse_chunk_grid(g, (50,)) + dim = g2.dimensions[0] + assert isinstance(dim, VaryingDimension) + assert dim.extent == 50 + assert dim.data_size(2) == 20 # 50 - 30 = 20 From 6d9de38df182e6a25c14d164554df3745fd25fcc Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 22:04:53 -0400 Subject: [PATCH 033/118] Add chunk_sizes property --- 
src/zarr/core/array.py | 46 ++++++++++++++++++++++++++++++++ src/zarr/core/chunk_grids.py | 16 +++++++++++ tests/test_unified_chunk_grid.py | 39 +++++++++++++++++++++++++++ 3 files changed, 101 insertions(+) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 3977939c07..3f775ad741 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1064,6 +1064,31 @@ def chunks(self) -> tuple[int, ...]: """ return self.metadata.chunks + @property + def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: + """Per-dimension chunk sizes for the array. + + Returns the data size of each chunk along every dimension, + including the final boundary chunk. Works for both regular + and rectilinear chunk grids. + + Returns + ------- + tuple[tuple[int, ...], ...] + One inner tuple per dimension containing chunk sizes. + + Examples + -------- + >>> arr = zarr.create_array(store, shape=(100, 80), chunks=(30, 40)) + >>> arr.chunk_sizes + ((30, 30, 30, 10), (40, 40)) + + >>> arr = zarr.create_array(store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]]) + >>> arr.chunk_sizes + ((10, 20, 30), (50, 50)) + """ + return self.metadata.chunk_grid.chunk_sizes + @property def shards(self) -> tuple[int, ...] | None: """Returns the shard shape of the Array. @@ -2285,6 +2310,27 @@ def chunks(self) -> tuple[int, ...]: """ return self.async_array.chunks + @property + def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: + """Per-dimension chunk sizes for the array. + + Returns the data size of each chunk along every dimension, + including the final boundary chunk. Works for both regular + and rectilinear chunk grids. + + Returns + ------- + tuple[tuple[int, ...], ...] + One inner tuple per dimension containing chunk sizes. + + Examples + -------- + >>> arr = zarr.open_array(store) + >>> arr.chunk_sizes + ((30, 30, 30, 10), (40, 40)) + """ + return self.async_array.chunk_sizes + @property def shards(self) -> tuple[int, ...] 
| None: """Returns a tuple of integers describing the length of each dimension of a shard of the array. diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 0ea99e0ec8..55155691f7 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -386,6 +386,22 @@ def chunk_shape(self) -> tuple[int, ...]: if isinstance(d, FixedDimension) # guaranteed by is_regular ) + @property + def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: + """Per-dimension chunk sizes, including the final boundary chunk. + + Returns the actual data size of each chunk (clipped at the array + extent), matching the dask ``Array.chunks`` convention. Works for + both regular and rectilinear grids. + + Returns + ------- + tuple[tuple[int, ...], ...] + One inner tuple per dimension, each containing the data size + of every chunk along that dimension. + """ + return tuple(tuple(d.data_size(i) for i in range(d.nchunks)) for d in self.dimensions) + # -- Collection interface -- def __getitem__(self, coords: int | tuple[int, ...]) -> ChunkSpec | None: diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index c4cf3889d8..aab60dd03f 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -1606,6 +1606,45 @@ def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: ) +# --------------------------------------------------------------------------- +# .chunk_sizes property +# --------------------------------------------------------------------------- + + +class TestChunkSizes: + """Tests for ChunkGrid.chunk_sizes and Array.chunk_sizes.""" + + def test_regular_grid(self) -> None: + grid = ChunkGrid.from_regular((100, 80), (30, 40)) + assert grid.chunk_sizes == ((30, 30, 30, 10), (40, 40)) + + def test_regular_grid_exact(self) -> None: + grid = ChunkGrid.from_regular((90, 80), (30, 40)) + assert grid.chunk_sizes == ((30, 30, 30), (40, 40)) + + def test_rectilinear_grid(self) -> None: + grid = 
ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + assert grid.chunk_sizes == ((10, 20, 30), (50, 50)) + + def test_single_chunk(self) -> None: + grid = ChunkGrid.from_regular((10,), (10,)) + assert grid.chunk_sizes == ((10,),) + + def test_array_property_regular(self) -> None: + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, shape=(100, 80), chunks=(30, 40), dtype="i4", zarr_format=3 + ) + assert arr.chunk_sizes == ((30, 30, 30, 10), (40, 40)) + + def test_array_property_rectilinear(self) -> None: + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]], dtype="i4", zarr_format=3 + ) + assert arr.chunk_sizes == ((10, 20, 30), (50, 50)) + + # --------------------------------------------------------------------------- # .info display for rectilinear grids # --------------------------------------------------------------------------- From cc2999aaf71fada1371c518b8a84065c00ac8d77 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Mar 2026 22:23:31 -0400 Subject: [PATCH 034/118] Add docs --- docs/user-guide/arrays.md | 149 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index cd6a93cac9..7469432c28 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -592,6 +592,155 @@ In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is This means that `10*10` chunks are stored in each shard, and there are `10*10` shards in total. Without the `shards` argument, there would be 10,000 chunks stored as individual files. +## Rectilinear (variable) chunk grids + +!!! warning "Experimental" + Rectilinear chunk grids are an experimental feature and may change in + future releases. This feature is expected to stabilize in Zarr version 3.3. 
+ +By default, Zarr arrays use a regular chunk grid where every chunk along a +given dimension has the same size (except possibly the final boundary chunk). +Rectilinear chunk grids allow each chunk along a dimension to have a different +size. This is useful when the natural partitioning of the data is not uniform — +for example, satellite swaths of varying width, time series with irregular +intervals, or spatial tiles of different extents. + +### Creating arrays with rectilinear chunks + +To create an array with rectilinear chunks, pass a nested list to the `chunks` +parameter where each inner list gives the chunk sizes along one dimension: + +```python exec="true" session="arrays" source="above" result="ansi" +z = zarr.create_array( + store=zarr.storage.MemoryStore(), + shape=(60, 100), + chunks=[[10, 20, 30], [50, 50]], + dtype='int32', +) +print(z.info) +``` + +In this example the first dimension is split into three chunks of sizes 10, 20, +and 30, while the second dimension is split into two equal chunks of size 50. + +### Reading and writing data + +Rectilinear arrays support the same indexing interface as regular arrays. +Reads and writes that cross chunk boundaries of different sizes are handled +automatically: + +```python exec="true" session="arrays" source="above" result="ansi" +import numpy as np +data = np.arange(60 * 100, dtype='int32').reshape(60, 100) +z[:] = data +# Read a slice that spans the first two chunks (sizes 10 and 20) along axis 0 +print(z[5:25, 0:5]) +``` + +### Inspecting chunk sizes + +The `.chunk_sizes` property returns the actual data size of each chunk along +every dimension. 
It works for both regular and rectilinear arrays and returns +a tuple of tuples: + +```python exec="true" session="arrays" source="above" result="ansi" +print(z.chunk_sizes) +``` + +For regular arrays, this includes the boundary chunk: + +```python exec="true" session="arrays" source="above" result="ansi" +z_regular = zarr.create_array( + store=zarr.storage.MemoryStore(), + shape=(100, 80), + chunks=(30, 40), + dtype='int32', +) +print(z_regular.chunk_sizes) +``` + +Note that the `.chunks` property is only available for regular chunk grids. For +rectilinear arrays, use `.chunk_sizes` instead. + +### Resizing and appending + +Rectilinear arrays can be resized. When growing, a new chunk is appended with +the size of the added region. When shrinking, trailing chunks are dropped: + +```python exec="true" session="arrays" source="above" result="ansi" +z = zarr.create_array( + store=zarr.storage.MemoryStore(), + shape=(30,), + chunks=[[10, 20]], + dtype='float64', +) +z[:] = np.arange(30, dtype='float64') +print(f"Before resize: chunk_sizes={z.chunk_sizes}") +z.resize((50,)) +print(f"After resize: chunk_sizes={z.chunk_sizes}") +``` + +The `append` method also works with rectilinear arrays: + +```python exec="true" session="arrays" source="above" result="ansi" +z.append(np.arange(10, dtype='float64')) +print(f"After append: shape={z.shape}, chunk_sizes={z.chunk_sizes}") +``` + +### Compressors and filters + +Rectilinear arrays work with all codecs — compressors, filters, and checksums. 
+Since each chunk may have a different size, the codec pipeline processes each +chunk independently: + +```python exec="true" session="arrays" source="above" result="ansi" +z = zarr.create_array( + store=zarr.storage.MemoryStore(), + shape=(60, 100), + chunks=[[10, 20, 30], [50, 50]], + dtype='float64', + filters=[zarr.codecs.TransposeCodec(order=(1, 0))], + compressors=[zarr.codecs.BloscCodec(cname='zstd', clevel=3)], +) +z[:] = np.arange(60 * 100, dtype='float64').reshape(60, 100) +np.testing.assert_array_equal(z[:], np.arange(60 * 100, dtype='float64').reshape(60, 100)) +print("Roundtrip OK") +``` + +### Rectilinear shard boundaries + +Rectilinear chunk grids can also be used for shard boundaries when combined +with sharding. In this case, the outer grid (shards) is rectilinear while the +inner chunks remain regular. Each shard dimension must be divisible by the +corresponding inner chunk size: + +```python exec="true" session="arrays" source="above" result="ansi" +z = zarr.create_array( + store=zarr.storage.MemoryStore(), + shape=(120, 100), + chunks=(10, 10), + shards=[[60, 40, 20], [50, 50]], + dtype='int32', +) +z[:] = np.arange(120 * 100, dtype='int32').reshape(120, 100) +print(z[50:70, 40:60]) +``` + +Note that rectilinear inner chunks with sharding are not supported — only the +shard boundaries can be rectilinear. + +### Metadata format + +Rectilinear chunk grid metadata uses run-length encoding (RLE) for compact +serialization. When reading metadata, both bare integers and `[value, count]` +pairs are accepted: + +- `[10, 20, 30]` — three chunks with explicit sizes +- `[[10, 3]]` — three chunks of size 10 (RLE shorthand) +- `[[10, 3], 5]` — three chunks of size 10, then one chunk of size 5 + +When writing, Zarr automatically compresses repeated values into RLE format. + ## Missing features in 3.0 The following features have not been ported to 3.0 yet. 
From b47ddbafc179b1471f9db76ec62896d6d8593749 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 10:12:28 -0400 Subject: [PATCH 035/118] Improve polymorphism --- src/zarr/core/chunk_grids.py | 155 +++++++++++++++++++---------------- 1 file changed, 86 insertions(+), 69 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 55155691f7..4d3b889734 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -79,6 +79,14 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int return np.zeros_like(indices) return indices // self.size + def with_extent(self, new_extent: int) -> FixedDimension: + """Return a copy re-bound to *new_extent*.""" + return FixedDimension(size=self.size, extent=new_extent) + + def resize(self, new_extent: int) -> FixedDimension: + """Return a copy adjusted for a new array extent (same as with_extent for fixed).""" + return FixedDimension(size=self.size, extent=new_extent) + @dataclass(frozen=True) class VaryingDimension: @@ -129,6 +137,34 @@ def data_size(self, chunk_ix: int) -> int: def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: return np.searchsorted(self.cumulative, indices, side="right") + def with_extent(self, new_extent: int) -> VaryingDimension: + """Return a copy re-bound to *new_extent*, validating edge coverage.""" + edge_sum = sum(self.edges) + if edge_sum < new_extent: + raise ValueError( + f"VaryingDimension edge sum {edge_sum} is less than new extent {new_extent}" + ) + return VaryingDimension(self.edges, extent=new_extent) + + def resize(self, new_extent: int) -> VaryingDimension: + """Return a copy adjusted for a new array extent (grow/shrink).""" + old_extent = self.extent + if new_extent == old_extent: + return self + elif new_extent > old_extent: + expanded_edges = list(self.edges) + [new_extent - old_extent] + return 
VaryingDimension(expanded_edges, extent=new_extent) + else: + # Shrink: keep chunks whose cumulative offset covers new_extent + shrunk_edges: list[int] = [] + total = 0 + for edge in self.edges: + shrunk_edges.append(edge) + total += edge + if total >= new_extent: + break + return VaryingDimension(shrunk_edges, extent=new_extent) + @runtime_checkable class DimensionGrid(Protocol): @@ -143,6 +179,8 @@ def chunk_offset(self, chunk_ix: int) -> int: ... def chunk_size(self, chunk_ix: int) -> int: ... def data_size(self, chunk_ix: int) -> int: ... def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: ... + def with_extent(self, new_extent: int) -> DimensionGrid: ... + def resize(self, new_extent: int) -> DimensionGrid: ... # --------------------------------------------------------------------------- @@ -197,7 +235,7 @@ def _expand_rle(data: Sequence[list[int] | int]) -> list[int]: return result -def _compress_rle(sizes: Sequence[int]) -> list[list[int] | int]: +def _compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: """Compress chunk sizes to mixed RLE format per the rectilinear spec. Runs of length > 1 are emitted as ``[value, count]`` pairs; runs of @@ -207,7 +245,7 @@ def _compress_rle(sizes: Sequence[int]) -> list[list[int] | int]: """ if not sizes: return [] - result: list[list[int] | int] = [] + result: list[int | list[int]] = [] current = sizes[0] count = 1 for s in sizes[1:]: @@ -221,6 +259,37 @@ def _compress_rle(sizes: Sequence[int]) -> list[list[int] | int]: return result +# A single dimension's rectilinear chunk spec: bare int (uniform shorthand), +# list of ints (explicit edges), or mixed RLE (e.g. [[10, 3], 5]). 
+RectilinearDimSpec = int | list[int | list[int]] + + +def _serialize_fixed_dim(dim: FixedDimension) -> RectilinearDimSpec: + """Compact rectilinear representation for a fixed-size dimension.""" + n = dim.nchunks + if n == 0: + return [] + last_data = dim.extent - (n - 1) * dim.size + if last_data == dim.size: + return dim.size + elif n == 1: + return [last_data] + elif n == 2: + return [dim.size, last_data] + else: + return [[dim.size, n - 1], last_data] + + +def _serialize_varying_dim(dim: VaryingDimension) -> RectilinearDimSpec: + """RLE-compressed rectilinear representation for a varying dimension.""" + edges = list(dim.edges) + rle = _compress_rle(edges) + if len(rle) < len(edges): + return rle + # mypy: list[int] is invariant, so it won't widen to list[int | list[int]] + return cast("RectilinearDimSpec", edges) + + def _validate_rectilinear_kind(configuration: dict[str, JSON]) -> None: """Validate the ``kind`` field of a rectilinear chunk grid configuration. @@ -508,30 +577,11 @@ def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: f"new_shape has {len(new_shape)} dimensions but " f"chunk grid has {self.ndim} dimensions" ) - dims: list[DimensionGrid] = [] - for dim, new_extent in zip(self.dimensions, new_shape, strict=True): - if isinstance(dim, FixedDimension): - dims.append(FixedDimension(size=dim.size, extent=new_extent)) - elif isinstance(dim, VaryingDimension): - old_extent = dim.extent - if new_extent == old_extent: - dims.append(dim) - elif new_extent > old_extent: - expanded_edges = list(dim.edges) + [new_extent - old_extent] - dims.append(VaryingDimension(expanded_edges, extent=new_extent)) - else: - # Shrink: keep chunks whose cumulative offset covers new_extent - shrunk_edges: list[int] = [] - total = 0 - for edge in dim.edges: - shrunk_edges.append(edge) - total += edge - if total >= new_extent: - break - dims.append(VaryingDimension(shrunk_edges, extent=new_extent)) - else: - raise TypeError(f"Unexpected dimension type: {type(dim)}") 
- return ChunkGrid(dimensions=tuple(dims)) + dims = tuple( + dim.resize(new_extent) + for dim, new_extent in zip(self.dimensions, new_shape, strict=True) + ) + return ChunkGrid(dimensions=dims) # ChunkGrid does not serialize itself. The format choice ("regular" vs # "rectilinear") belongs to the metadata layer. Use serialize_chunk_grid() @@ -549,24 +599,11 @@ def parse_chunk_grid( """ if isinstance(data, ChunkGrid): # Re-bind extent if array_shape differs from what's stored - dims: list[DimensionGrid] = [] - for dim, extent in zip(data.dimensions, array_shape, strict=True): - if isinstance(dim, FixedDimension): - dims.append(FixedDimension(size=dim.size, extent=extent)) - elif isinstance(dim, VaryingDimension): - # After resize/shrink the last chunk may extend past the array - # boundary, so sum(edges) >= array_shape is valid (like regular grids). - edge_sum = sum(dim.edges) - if edge_sum < extent: - raise ValueError( - f"VaryingDimension edge sum {edge_sum} is less than " - f"array shape extent {extent} for dimension {len(dims)}" - ) - # Re-bind extent to the actual array shape - dims.append(VaryingDimension(dim.edges, extent=extent)) - else: - raise TypeError(f"Unexpected dimension type: {type(dim)}") - return ChunkGrid(dimensions=tuple(dims)) + dims = tuple( + dim.with_extent(extent) + for dim, extent in zip(data.dimensions, array_shape, strict=True) + ) + return ChunkGrid(dimensions=dims) name_parsed, configuration_parsed = parse_named_configuration(data) @@ -622,34 +659,14 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: } if name == "rectilinear": - chunk_shapes: list[Any] = [] + chunk_shapes: list[RectilinearDimSpec] = [] for dim in grid.dimensions: if isinstance(dim, FixedDimension): - # Produce the most compact spec representation. 
- n = dim.nchunks - if n == 0: - chunk_shapes.append([]) - else: - last_data = dim.extent - (n - 1) * dim.size - if last_data == dim.size: - # All chunks uniform → integer shorthand - chunk_shapes.append(dim.size) - elif n == 1: - # Single boundary chunk → bare integer - chunk_shapes.append([last_data]) - elif n == 2: - # One full chunk + one boundary → bare integers - chunk_shapes.append([dim.size, last_data]) - else: - # RLE for the uniform run + bare int for boundary - chunk_shapes.append([[dim.size, n - 1], last_data]) + chunk_shapes.append(_serialize_fixed_dim(dim)) elif isinstance(dim, VaryingDimension): - edges = list(dim.edges) - rle = _compress_rle(edges) - if len(rle) < len(edges): - chunk_shapes.append(rle) - else: - chunk_shapes.append(edges) + chunk_shapes.append(_serialize_varying_dim(dim)) + else: + raise TypeError(f"Unexpected dimension type: {type(dim)}") return { "name": "rectilinear", "configuration": {"kind": "inline", "chunk_shapes": chunk_shapes}, From 2caa927e173cf6608e10c3fa34d1e1a4d444555d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 11:22:53 -0400 Subject: [PATCH 036/118] always return based on inner chunks --- src/zarr/core/array.py | 25 ++++++++++++++++++++--- src/zarr/core/metadata/v3.py | 2 +- tests/test_unified_chunk_grid.py | 34 ++++++++++++++++++++++++++++---- 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0f9dfd0cab..e27bd98ad6 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1293,7 +1293,18 @@ def _chunk_grid_shape(self) -> tuple[int, ...]: tuple[int, ...] The number of chunks along each dimension. """ - return tuple(starmap(ceildiv, zip(self.shape, self.chunks, strict=True))) + # TODO: refactor — extract a sharding_codec property on ArrayV3Metadata + # to replace the repeated `len == 1 and isinstance` pattern. 
+ from zarr.codecs.sharding import ShardingCodec + + codecs: tuple[Codec, ...] = getattr(self.metadata, "codecs", ()) + if len(codecs) == 1 and isinstance(codecs[0], ShardingCodec): + chunk_shape = codecs[0].chunk_shape + elif self.metadata.chunk_grid.is_regular: + chunk_shape = self.metadata.chunk_grid.chunk_shape + else: + return self.metadata.chunk_grid.shape + return tuple(starmap(ceildiv, zip(self.shape, chunk_shape, strict=True))) @property def _shard_grid_shape(self) -> tuple[int, ...]: @@ -5562,8 +5573,16 @@ def _iter_chunk_regions( A tuple of slice objects representing the region spanned by each shard in the selection. """ - return _iter_regions( - array.shape, array.chunks, origin=origin, selection_shape=selection_shape, trim_excess=True + if array.metadata.chunk_grid.is_regular: + return _iter_regions( + array.shape, + array.chunks, + origin=origin, + selection_shape=selection_shape, + trim_excess=True, + ) + return array.metadata.chunk_grid.iter_chunk_regions( + origin=origin, selection_shape=selection_shape ) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 4548703798..96a6f7d1ea 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -308,7 +308,7 @@ def chunks(self) -> tuple[int, ...]: msg = ( "The `chunks` attribute is only defined for arrays using regular chunk grids. " - "This array has a rectilinear chunk grid. Use `chunk_grid` for general access." + "This array has a rectilinear chunk grid. Use `chunk_sizes` for general access." 
) raise NotImplementedError(msg) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index aab60dd03f..a2d27d412f 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -15,10 +15,6 @@ import pytest import zarr - -if TYPE_CHECKING: - from pathlib import Path - from zarr.core.chunk_grids import ( ChunkGrid, ChunkSpec, @@ -29,6 +25,10 @@ parse_chunk_grid, serialize_chunk_grid, ) +from zarr.storage import MemoryStore + +if TYPE_CHECKING: + from pathlib import Path def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: @@ -2161,3 +2161,29 @@ def test_parse_chunk_grid_rebinds_extent(self) -> None: assert isinstance(dim, VaryingDimension) assert dim.extent == 50 assert dim.data_size(2) == 20 # 50 - 30 = 20 + + +class TestNchunksWorksForRectilinear: + def test_nchunks_returns_correct_count(self) -> None: + """nchunks should work for rectilinear arrays.""" + store = MemoryStore() + a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") + assert a.nchunks == 2 + + def test_nchunks_2d_rectilinear(self) -> None: + store = MemoryStore() + a = zarr.create_array(store, shape=(30, 40), chunks=[[10, 20], [15, 25]], dtype="int32") + assert a.nchunks == 4 # 2 chunks x 2 chunks + + +class TestIterChunkRegionsWorksForRectilinear: + def test_iter_chunk_regions_rectilinear(self) -> None: + """_iter_chunk_regions should work for rectilinear arrays.""" + from zarr.core.array import _iter_chunk_regions + + store = MemoryStore() + a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") + regions = list(_iter_chunk_regions(a)) + assert len(regions) == 2 + assert regions[0] == (slice(0, 10),) + assert regions[1] == (slice(10, 30),) From 6af91a6c0753b6a730d3faae72bc0b535799bad4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 12:56:12 -0400 Subject: [PATCH 037/118] Fix from_array --- src/zarr/api/synchronous.py | 9 +++++---- 
src/zarr/core/array.py | 22 +++++++++++++--------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index f5e6efbc22..e0af472169 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -993,7 +993,7 @@ def from_array( data: AnyArray | npt.ArrayLike, write_data: bool = True, name: str | None = None, - chunks: Literal["auto", "keep"] | tuple[int, ...] = "keep", + chunks: Literal["auto", "keep"] | tuple[int, ...] | Sequence[Sequence[int]] = "keep", shards: ShardsLike | None | Literal["keep"] = "keep", filters: FiltersLike | Literal["keep"] = "keep", compressors: CompressorsLike | Literal["keep"] = "keep", @@ -1025,13 +1025,14 @@ def from_array( name : str or None, optional The name of the array within the store. If ``name`` is ``None``, the array will be located at the root of the store. - chunks : tuple[int, ...] or "auto" or "keep", optional + chunks : tuple[int, ...] or Sequence[Sequence[int]] or "auto" or "keep", optional Chunk shape of the array. Following values are supported: - "auto": Automatically determine the chunk shape based on the array's shape and dtype. - - "keep": Retain the chunk shape of the data array if it is a zarr Array. - - tuple[int, ...]: A tuple of integers representing the chunk shape. + - "keep": Retain the chunk grid of the data array if it is a zarr Array. + - tuple[int, ...]: A tuple of integers representing the chunk shape (regular grid). + - Sequence[Sequence[int]]: Per-dimension chunk edge lists (rectilinear grid). If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". 
shards : tuple[int, ...], optional diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e27bd98ad6..f57de704f3 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4326,7 +4326,7 @@ async def from_array( data: AnyArray | npt.ArrayLike, write_data: bool = True, name: str | None = None, - chunks: Literal["auto", "keep"] | tuple[int, ...] = "keep", + chunks: Literal["auto", "keep"] | tuple[int, ...] | Sequence[Sequence[int]] = "keep", shards: ShardsLike | None | Literal["keep"] = "keep", filters: FiltersLike | Literal["keep"] = "keep", compressors: CompressorsLike | Literal["keep"] = "keep", @@ -4358,13 +4358,14 @@ async def from_array( name : str or None, optional The name of the array within the store. If ``name`` is ``None``, the array will be located at the root of the store. - chunks : tuple[int, ...] or "auto" or "keep", optional + chunks : tuple[int, ...] or Sequence[Sequence[int]] or "auto" or "keep", optional Chunk shape of the array. Following values are supported: - "auto": Automatically determine the chunk shape based on the array's shape and dtype. - - "keep": Retain the chunk shape of the data array if it is a zarr Array. - - tuple[int, ...]: A tuple of integers representing the chunk shape. + - "keep": Retain the chunk grid of the data array if it is a zarr Array. + - tuple[int, ...]: A tuple of integers representing the chunk shape (regular grid). + - Sequence[Sequence[int]]: Per-dimension chunk edge lists (rectilinear grid). If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". 
shards : tuple[int, ...], optional @@ -4995,7 +4996,7 @@ async def create_array( data=data_parsed, write_data=write_data, name=name, - chunks=cast("Literal['auto', 'keep'] | tuple[int, ...]", chunks), + chunks=chunks, shards=shards, filters=filters, compressors=compressors, @@ -5038,7 +5039,7 @@ async def create_array( def _parse_keep_array_attr( data: AnyArray | npt.ArrayLike, - chunks: Literal["auto", "keep"] | tuple[int, ...], + chunks: Literal["auto", "keep"] | tuple[int, ...] | Sequence[Sequence[int]], shards: ShardsLike | None | Literal["keep"], filters: FiltersLike | Literal["keep"], compressors: CompressorsLike | Literal["keep"], @@ -5049,7 +5050,7 @@ def _parse_keep_array_attr( chunk_key_encoding: ChunkKeyEncodingLike | None, dimension_names: DimensionNames, ) -> tuple[ - tuple[int, ...] | Literal["auto"], + tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"], ShardsLike | None, FiltersLike, CompressorsLike, @@ -5062,9 +5063,12 @@ def _parse_keep_array_attr( ]: if isinstance(data, Array): if chunks == "keep": - chunks = data.chunks + if data.metadata.chunk_grid.is_regular: + chunks = data.chunks + else: + chunks = data.chunk_sizes if shards == "keep": - shards = data.shards + shards = data.shards if data.metadata.chunk_grid.is_regular else None if zarr_format is None: zarr_format = data.metadata.zarr_format if filters == "keep": From e04d86474c315ce85901add9dad4620f4183b18d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 16:00:48 -0400 Subject: [PATCH 038/118] Add V3 of the prospectus --- docs/design/chunk-grid.md | 416 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 416 insertions(+) create mode 100644 docs/design/chunk-grid.md diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md new file mode 100644 index 0000000000..63f95bcd78 --- /dev/null +++ b/docs/design/chunk-grid.md @@ -0,0 +1,416 @@ +# Prospectus: Unified Chunk Grid Design for zarr-python + +Version: 3 + 
+ +**Related:** +- [#3750](https://github.com/zarr-developers/zarr-python/issues/3750) (single ChunkGrid proposal) +- [#3534](https://github.com/zarr-developers/zarr-python/pull/3534) (rectilinear implementation) +- [#3735](https://github.com/zarr-developers/zarr-python/pull/3735) (chunk grid module/registry) +- [ZEP0003](https://github.com/zarr-developers/zeps/blob/main/draft/ZEP0003.md) (variable chunking spec) +- [zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) (sharding v1.1: non-divisible subchunks) +- [zarr-extensions#25](https://github.com/zarr-developers/zarr-extensions/pull/25) (rectilinear extension) +- [zarr-extensions#34](https://github.com/zarr-developers/zarr-extensions/issues/34) (sharding + rectilinear) + +## Problem + +The Zarr V3 spec defines `chunk_grid` as an extension point, but chunk grids are fundamentally different from codecs. Codecs are independent — supporting `zstd` tells you nothing about `gzip`. Chunk grids form a hierarchy — the rectilinear grid is strictly more general than the regular grid. Any regular grid is expressible as a rectilinear grid. + +There is no known chunk grid that both (a) is more general than rectilinear and (b) retains the axis-aligned tessellation properties Zarr assumes. All known grids are special cases: + +| Grid type | Description | +|---|---| +| Regular | Uniform chunk size, boundary chunks padded with fill_value | +| Regular-bounded (zarrs) | Uniform chunk size, boundary chunks trimmed to array extent | +| HPC boundary-padded | Regular interior, larger boundary chunks | +| Fully variable | Arbitrary per-chunk sizes | + +A registry-based plugin system adds complexity without clear benefit. + +## Design + +### Principles + +1. **A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern — the specific partition of a specific array. The grid stores enough information to answer any question about any chunk without external parameters. +2. 
**One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. The serialization format (`"regular"` vs `"rectilinear"`) is chosen by the metadata layer, not the grid. +3. **No chunk grid registry.** Simple name-based dispatch in `parse_chunk_grid()`. +4. **Fixed vs Varying per dimension.** `FixedDimension(size, extent)` for uniform chunks; `VaryingDimension(edges, cumulative)` for per-chunk edge lengths with precomputed prefix sums. Avoids expanding regular dimensions into lists of identical values. +5. **Transparent transitions.** Operations like `resize()` can move an array from regular to rectilinear chunking. + +### Internal representation + +```python +@dataclass(frozen=True) +class FixedDimension: + """Uniform chunk size. Boundary chunks contain less data but are + encoded at full size by the codec pipeline.""" + size: int # chunk edge length (> 0) + extent: int # array dimension length + + @property + def nchunks(self) -> int: + return ceildiv(self.extent, self.size) + + def index_to_chunk(self, idx: int) -> int: + return idx // self.size + def chunk_offset(self, chunk_ix: int) -> int: + return chunk_ix * self.size + def chunk_size(self, chunk_ix: int) -> int: + return self.size # always uniform + def data_size(self, chunk_ix: int) -> int: + return min(self.size, self.extent - chunk_ix * self.size) # clipped at extent + def indices_to_chunks(self, indices: NDArray) -> NDArray: + return indices // self.size + +@dataclass(frozen=True) +class VaryingDimension: + """Explicit per-chunk sizes. No padding — each edge length is + both the codec size and the data size.""" + edges: tuple[int, ...] # per-chunk edge lengths (all > 0) + cumulative: tuple[int, ...] 
# prefix sums for O(log n) lookup + + @property + def nchunks(self) -> int: + return len(self.edges) + @property + def extent(self) -> int: + return self.cumulative[-1] + + def index_to_chunk(self, idx: int) -> int: + return bisect.bisect_right(self.cumulative, idx) + def chunk_offset(self, chunk_ix: int) -> int: + return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 + def chunk_size(self, chunk_ix: int) -> int: + return self.edges[chunk_ix] + def data_size(self, chunk_ix: int) -> int: + return self.edges[chunk_ix] # same as chunk_size + def indices_to_chunks(self, indices: NDArray) -> NDArray: + return np.searchsorted(self.cumulative, indices, side='right') +``` + +Both types share a common interface: `nchunks`, `extent`, `index_to_chunk`, `chunk_offset`, `chunk_size`, `data_size`, `indices_to_chunks`. Memory usage scales with the number of *varying* dimensions, not total chunks. + +The two size methods serve different consumers: + +| Method | Returns | Consumer | +|---|---|---| +| `chunk_size` | Buffer size for codec processing | Codec pipeline (`ArraySpec.shape`) | +| `data_size` | Valid data region within the buffer | Indexing pipeline (`chunk_selection` slicing) | + +For `FixedDimension`, these differ only at the boundary. For `VaryingDimension`, they are identical. This matches current zarr-python behavior: `get_chunk_spec` passes the full `chunk_shape` to the codec for all chunks, and the indexer generates a `chunk_selection` that clips the decoded buffer. + +### ChunkSpec + +```python +@dataclass(frozen=True) +class ChunkSpec: + slices: tuple[slice, ...] # valid data region in array coordinates + codec_shape: tuple[int, ...] # buffer shape for codec processing + + @property + def shape(self) -> tuple[int, ...]: + return tuple(s.stop - s.start for s in self.slices) + + @property + def is_boundary(self) -> bool: + return self.shape != self.codec_shape +``` + +For interior chunks, `shape == codec_shape`. 
For boundary chunks of a regular grid, `codec_shape` is the full declared chunk size while `shape` is clipped. For rectilinear grids, `shape == codec_shape` always. + +### API + +```python +# Creating arrays +arr = zarr.create_array(shape=(100, 200), chunks=(10, 20)) # regular +arr = zarr.create_array(shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]]) # rectilinear +arr = zarr.create_array(shape=(1000,), chunks=[[[100, 10]]]) # RLE shorthand + +# ChunkGrid as a collection +grid = arr.chunk_grid # ChunkGrid instance +grid.shape # (10, 8) — number of chunks per dimension +grid.ndim # 2 +grid.is_regular # True if all dimensions are Fixed + +spec = grid[0, 1] # ChunkSpec for chunk at grid position (0, 1) +spec.slices # (slice(0, 10), slice(25, 50)) +spec.shape # (10, 25) — data shape +spec.codec_shape # (10, 25) — same for interior chunks + +boundary = grid[9, 0] # boundary chunk (extent=95, size=10) +boundary.shape # (5, 25) — 5 elements of real data +boundary.codec_shape # (10, 25) — codec sees full buffer + +grid[99, 99] # None — out of bounds + +for spec in grid: # iterate all chunks + ... 
+ +# .chunks property: retained for regular grids, raises for rectilinear +arr.chunks # (10, 25) +``` + +`ChunkGrid.__getitem__` constructs `ChunkSpec` using `chunk_size` for `codec_shape` and `data_size` for `slices`: + +```python +def __getitem__(self, coords: tuple[int, ...]) -> ChunkSpec | None: + slices = [] + codec_shape = [] + for dim, ix in zip(self.dimensions, coords): + if ix < 0 or ix >= dim.nchunks: + return None + offset = dim.chunk_offset(ix) + slices.append(slice(offset, offset + dim.data_size(ix))) + codec_shape.append(dim.chunk_size(ix)) + return ChunkSpec(tuple(slices), tuple(codec_shape)) +``` + +#### Serialization + +```python +# Regular grid: +{"name": "regular", "configuration": {"chunk_shape": [10, 20]}} + +# Rectilinear grid (with RLE compression): +{"name": "rectilinear", "configuration": {"chunk_shapes": [[10, 20, 30], [[25, 4]]]}} +``` + +Both names deserialize to the same `ChunkGrid` class. The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. For rectilinear grids, the extent is redundant (`sum(edges)`) and is validated for consistency. + +**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`, which already knows how to produce its JSON document. The flow is always: metadata document → `ChunkGrid` (via `parse_chunk_grid`), never the reverse. The grid is a pure runtime computation object. + +`ArrayV3Metadata` stores the chunk grid's JSON `name` from the original metadata document and uses it when serializing back. This gives round-trip fidelity for free — a store written as rectilinear with uniform edges stays rectilinear. + +The only place where a user needs to choose the format is when creating new metadata. For `create_array`, the format is inferred from the `chunks` argument: a flat tuple produces `"regular"`, a nested list produces `"rectilinear"`. 
For `resize`, the format can be specified explicitly via `chunk_grid_metadata`: + +```python +arr.resize( + (80, 100), + chunks=[[10, 20, 30, 20], [25, 25, 25, 25]], + chunk_grid_metadata="rectilinear", +) +``` + +`chunk_grid_metadata` is typed as `str`, not a closed literal — the Zarr V3 spec allows any registered chunk grid name. zarr-python supports `"regular"` and `"rectilinear"` natively; other names (e.g., zarrs' `"regular_bounded"`) would raise unless a handler is registered. If omitted, the format is inferred: `"rectilinear"` when chunks are non-uniform or explicitly nested, `"regular"` when chunks are a flat tuple and evenly divide the shape. Specifying `"regular"` when the chunks are non-uniform raises an error. + +#### Resize + +```python +arr.resize((80, 100)) # inferred rectilinear if not evenly divisible +arr.resize((80, 100), chunks=[[10, 20, 30, 20], [25, 25, 25, 25]]) # explicit chunks +arr.resize((70, 100)) # stays regular if divisible +arr.resize((100, 100), chunk_grid_metadata="rectilinear") # force rectilinear metadata +``` + +Resize creates new `ArrayV3Metadata` (and thus a new `ChunkGrid`). Since resize always creates new metadata, `chunk_grid_metadata` is the natural place to choose the serialization format. + +### Indexing + +The indexing pipeline is coupled to regular grid assumptions — every per-dimension indexer takes a scalar `dim_chunk_len: int` and uses `//` and `*`: + +```python +dim_chunk_ix = self.dim_sel // self.dim_chunk_len # IntDimIndexer +dim_offset = dim_chunk_ix * self.dim_chunk_len # SliceDimIndexer +``` + +Replace `dim_chunk_len: int` with the dimension object (`FixedDimension | VaryingDimension`). The shared interface means the indexer code structure stays the same — `dim_sel // dim_chunk_len` becomes `dim_grid.index_to_chunk(dim_sel)`. O(1) for regular, binary search for varying. + +### Codec pipeline + +Today, `get_chunk_spec()` returns the same `ArraySpec(shape=chunk_grid.chunk_shape)` for every chunk. 
For rectilinear grids, each chunk has a different codec shape: + +```python +def get_chunk_spec(self, chunk_coords, array_config, prototype) -> ArraySpec: + spec = self.chunk_grid[chunk_coords] + return ArraySpec(shape=spec.codec_shape, ...) +``` + +Note `spec.codec_shape`, not `spec.shape`. For regular grids, `codec_shape` is uniform (preserving current behavior). The boundary clipping flow is unchanged: + +``` +Write: user data → pad to codec_shape with fill_value → encode → store +Read: store → decode to codec_shape → slice via chunk_selection → user data +``` + +### Sharding + +PR #3534 marks sharding as incompatible with rectilinear grids. This is unnecessary — sharding has three independent grid levels: + +``` +Level 1 — Outer chunk grid (shard boundaries): regular or rectilinear +Level 2 — Inner subchunk grid (within each shard): always regular +Level 3 — Shard index: ceil(shard_dim / subchunk_dim) entries per dimension +``` + +The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as extent and the subchunk shape as `FixedDimension`. Each shard is self-contained — it doesn't need to know whether the outer grid is regular or rectilinear. + +[zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) lifts the requirement that subchunk shapes evenly divide the shard shape. With the proposed `ChunkGrid`, this just means removing the `shard_shape % subchunk_shape == 0` validation — `FixedDimension` already handles boundary clipping via `data_size`. 
+ +| Outer grid | Subchunk divisibility | Required change | +|---|---|---| +| Regular | Evenly divides (v1.0) | None | +| Regular | Non-divisible (v1.1) | Remove divisibility validation | +| Rectilinear | Evenly divides | Remove "sharding incompatible" guard | +| Rectilinear | Non-divisible | Both changes | + +### What this replaces + +| Current | Proposed | +|---|---| +| `ChunkGrid` ABC + `RegularChunkGrid` subclass | Single concrete `ChunkGrid` with `is_regular` | +| `RectilinearChunkGrid` (#3534) | Same `ChunkGrid` class | +| Chunk grid registry + entrypoints (#3735) | Direct name dispatch | +| `arr.chunks` | Retained for regular; `arr.chunk_grid` for general use | +| `get_chunk_shape(shape, coord)` | `grid[coord].codec_shape` or `grid[coord].shape` | + +## Design decisions + +### Why store the extent in ChunkGrid? + +The chunk grid is a concrete arrangement, not an abstract tiling pattern. A finite collection naturally has an extent. Storing it enables `__getitem__`, eliminates `dim_len` parameters from every method, and makes the grid self-describing. + +This does *not* mean `ArrayV3Metadata.shape` should delegate to the grid. The array shape remains an independent field in metadata. The extent is passed into the grid at construction time so it can answer boundary questions without external parameters. It is **not** serialized as part of the chunk grid JSON — it comes from the `shape` field in array metadata and is passed to `parse_chunk_grid()`. + +### Why distinguish chunk_size from data_size? + +A chunk in a regular grid has two sizes. `chunk_size` is the buffer size the codec processes — always `size` for `FixedDimension`, even at the boundary (padded with `fill_value`). `data_size` is the valid data region — clipped to `extent % size` at the boundary. The indexing layer uses `data_size` to generate `chunk_selection` slices. + +This matches current zarr-python behavior and matters for: +1. 
**Backward compatibility.** Existing stores have boundary chunks encoded at full `chunk_shape`. +2. **Codec simplicity.** Codecs assume uniform input shapes for regular grids. +3. **Shard index correctness.** The index assumes `subchunk_dim`-sized entries. + +For `VaryingDimension`, `chunk_size == data_size` — no padding. This is the fundamental difference: `FixedDimension` has a declared size plus an extent that clips data; `VaryingDimension` has explicit sizes that *are* the extent. + +### Why not a chunk grid registry? + +There is no known chunk grid outside the rectilinear family that retains the tessellation properties zarr-python assumes. A `match` on the grid name is sufficient. + +### Why a single class instead of a Protocol? + +All known grids are special cases of rectilinear. A Protocol-based approach means every caller programs against an abstract interface and adding a grid type requires implementing ~10 methods. A single class is simpler. If a genuinely novel grid type emerges, a Protocol can be extracted. + +## Prior art + +**zarrs (Rust):** Three independent grid types behind a `ChunkGridTraits` trait. Key patterns adopted: Fixed vs Varying per dimension, prefix sums + binary search, `Option` for out-of-bounds, `NonZeroU64` for chunk dimensions, separate subchunk grid per shard, array shape at construction. + +**TensorStore (C++):** Stores only `chunk_shape` — boundary clipping via `valid_data_bounds` at query time. Both `RegularGridRef` and `IrregularGrid` internally. No registry. + +## Migration + +### Plan + +1. **Amend and merge #3735.** Keep the `chunk_grids/` module layout. Replace the registry with direct name dispatch. Remove `register_chunk_grid` / `get_chunk_grid_class` and the entrypoint. +2. **Open a new PR** implementing this prospectus: + - `FixedDimension`, `VaryingDimension`, `ChunkSpec`, and `ChunkGrid` classes. + - `parse_chunk_grid(metadata, array_shape)` with `"regular"` and `"rectilinear"` dispatch. 
+ - Port RLE helpers, `resolve_chunk_spec`, `ChunksLike`, and validation functions from #3534. + - Refactor per-dimension indexers to accept `FixedDimension | VaryingDimension`. + - Update `get_chunk_spec` to use `grid[chunk_coords].codec_shape`. + - Add `arr.chunk_grid`. Keep `.chunks` for regular, raise for rectilinear. + - Remove the "sharding incompatible with rectilinear" guard. + - Adapt tests from #3534. +3. **Close trial PRs** with credits: + - **#3534** — RLE helpers, validation logic, chunk spec resolution, test cases, review discussion. + - **#3737** — extent-in-grid idea (adopted per-dimension). + - **#1483** — original POC; superseded by V3 implementation. + - **#3736** — resolved by storing extent per-dimension. +4. **Sharding v1.1** (separate PR, after zarr-specs#370) — remove `shard_shape % subchunk_shape == 0` validation. + +### Reusable components from #3534 + +| Component | Disposition | +|---|---| +| RLE encode/decode helpers | **Keep** | +| `_normalize_rectilinear_chunks` / `_parse_chunk_shapes` | **Keep** — feed into `VaryingDimension` | +| `resolve_chunk_spec` / `ChunksLike` | **Keep** | +| `_validate_zarr_format_compatibility` | **Keep** — rectilinear is V3-only | +| `_validate_sharding_compatibility` | **Remove** — sharding is compatible | +| `RectilinearChunkGrid` class | **Replace** | +| Indexing changes | **Insufficient** — `isinstance` guards remain | + +A **fresh PR** is more practical than adapting #3534's 5700-line diff. 
+ +### Downstream migration + +All four downstream PRs/issues follow the same pattern: + +| Two-class pattern | Unified pattern | +|---|---| +| `isinstance(cg, RegularChunkGrid)` | `cg.is_regular` | +| `isinstance(cg, RectilinearChunkGrid)` | `not cg.is_regular` | +| `cg.chunk_shape` | `cg.dimensions[i].size` or `cg[coord].shape` | +| `cg.chunk_shapes` | `tuple(d.edges for d in cg.dimensions)` | +| `RegularChunkGrid(chunk_shape=...)` | `ChunkGrid.from_regular(shape, chunks)` | +| `RectilinearChunkGrid(chunk_shapes=...)` | `ChunkGrid.from_rectilinear(edges)` | +| Feature detection via class import | Version check or `hasattr(ChunkGrid, 'is_regular')` | + +**[xarray#10880](https://github.com/pydata/xarray/pull/10880):** Replace `isinstance` checks with `.is_regular`. Write path simplifies with `chunks=[[...]]` API. ~1–2 days. + +**[VirtualiZarr#877](https://github.com/zarr-developers/VirtualiZarr/pull/877):** Drop vendored `_is_nested_sequence`. Replace `isinstance` checks. ~1–2 days. + +**[Icechunk#1338](https://github.com/earth-mover/icechunk/issues/1338):** Minimal impact — format changes driven by spec, not class hierarchy. + +**[cubed#876](https://github.com/cubed-dev/cubed/issues/876):** Switch store creation to `ChunkGrid` API. <1 day. + +## Open questions + +1. **RLE in the Python API:** Should users pass RLE-encoded chunk specs directly, or only expanded lists? +2. **Resize defaults:** When growing a regular array, should the default preserve regularity or transition to rectilinear? +3. **`ChunkSpec` complexity:** `ChunkSpec` carries both `slices` and `codec_shape`. Should the grid expose separate methods for codec vs data queries instead? +4. **`__getitem__` with slices:** Should `grid[0, :]` or `grid[0:3, :]` return a sub-grid or an iterator of `ChunkSpec`s? 
+ +## Proofs of concepts + +- Zarr-Python: + - branch - https://github.com/maxrjones/zarr-python/tree/poc/unified-chunk-grid + - diff - https://github.com/zarr-developers/zarr-python/compare/main...maxrjones:zarr-python:poc/unified-chunk-grid?expand=1 +- Xarray: + - branch - https://github.com/maxrjones/xarray/tree/poc/unified-zarr-chunk-grid + - diff - https://github.com/pydata/xarray/compare/main...maxrjones:xarray:poc/unified-zarr-chunk-grid?expand=1 +- VirtualiZarr: + - branch - https://github.com/maxrjones/VirtualiZarr/tree/poc/unified-chunk-grid + - diff - https://github.com/zarr-developers/VirtualiZarr/compare/main...maxrjones:VirtualiZarr:poc/unified-chunk-grid?expand=1 +- Virtual TIFF: + - branch - https://github.com/virtual-zarr/virtual-tiff/tree/poc/unified-chunk-grid + - diff - https://github.com/virtual-zarr/virtual-tiff/compare/main...poc/unified-chunk-grid?expand=1 +- Microbenchmarks: + - https://github.com/maxrjones/zarr-chunk-grid-tests/tree/unified-chunk-grid +## Breaking POC into reviewable PRs + +### PR 1: Per-dimension grid types and `ChunkSpec` (pure additions) + +**Files**: `chunk_grids.py` (new types only) +**Scope**: Add `FixedDimension`, `VaryingDimension`, `DimensionGrid` protocol, `ChunkSpec`, and RLE helpers (`_expand_rle`, `_compress_rle`). Unit tests for these types. No existing code changes — purely additive. + +### PR 2: Unified `ChunkGrid` class (replaces old hierarchy) + +**Files**: `chunk_grids.py` (new `ChunkGrid` class + `RegularChunkGrid` compat wrapper) +**Scope**: New `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `all_chunk_coords()` (no shape arg), `is_regular`, `chunk_shape`. Keep `RegularChunkGrid` as backwards-compat subclass. Add `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()`. Tests for the grid class itself. 
+ +### PR 3: Indexing generalization + +**Files**: `indexing.py` +**Scope**: Refactor `IntDimIndexer`, `SliceDimIndexer`, `BoolArrayDimIndexer`, `BasicIndexer`, `OrthogonalIndexer`, `CoordinateIndexer` to accept `DimensionGrid` instead of `dim_chunk_len: int`. Replace `get_chunk_shape()` calls with `_get_dim_grids()`. Tests for indexing with both regular and rectilinear grids. + +### PR 4: Metadata and array integration + +**Files**: `metadata/v3.py`, `metadata/v2.py`, `array.py`, `group.py`, `api/synchronous.py` +**Scope**: Wire the new `ChunkGrid` into `ArrayV3Metadata` (add `chunk_grid_name`, use `serialize_chunk_grid` in `to_dict`, use `parse_chunk_grid` in constructor). Update `init_array`/`create_array` to accept rectilinear chunks. Update `_resize` to guard against rectilinear grids. + +### PR 5: Sharding codec compatibility + +**Files**: `codecs/sharding.py` +**Scope**: Update `ShardingCodec.validate` to handle rectilinear outer grids (validate every chunk is divisible). Replace `RegularChunkGrid(chunk_shape=...)` calls with `ChunkGrid.from_regular(...)`. + +### PR 6: End-to-end tests + +**Files**: `tests/test_unified_chunk_grid.py`, updates to `tests/test_array.py`, `tests/test_indexing.py` +**Scope**: Full integration tests — round-trip create/read/write with rectilinear arrays, serialization fidelity, hypothesis strategies. + +## Notes + +- PRs 1–2 are purely additive and low-risk. +- PR 3 is the biggest behavioral change. +- PRs 4–5 wire things together. +- PR 6 adds comprehensive test coverage. +- Each PR builds on the previous but is independently reviewable. 
\ No newline at end of file From 8dcea81df0ffb4df959aa4a9376a6248b63a51e4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 16:07:47 -0400 Subject: [PATCH 039/118] Fastforward design docs --- docs/design/chunk-grid.md | 215 +++++++++++++++++++++++++++----------- mkdocs.yml | 2 + 2 files changed, 156 insertions(+), 61 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 63f95bcd78..3df1cb8d74 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -1,8 +1,9 @@ -# Prospectus: Unified Chunk Grid Design for zarr-python +# Unified Chunk Grid -Version: 3 +Version: 4 **Related:** + - [#3750](https://github.com/zarr-developers/zarr-python/issues/3750) (single ChunkGrid proposal) - [#3534](https://github.com/zarr-developers/zarr-python/pull/3534) (rectilinear implementation) - [#3735](https://github.com/zarr-developers/zarr-python/pull/3735) (chunk grid module/registry) @@ -33,7 +34,7 @@ A registry-based plugin system adds complexity without clear benefit. 1. **A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern — the specific partition of a specific array. The grid stores enough information to answer any question about any chunk without external parameters. 2. **One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. The serialization format (`"regular"` vs `"rectilinear"`) is chosen by the metadata layer, not the grid. 3. **No chunk grid registry.** Simple name-based dispatch in `parse_chunk_grid()`. -4. **Fixed vs Varying per dimension.** `FixedDimension(size, extent)` for uniform chunks; `VaryingDimension(edges, cumulative)` for per-chunk edge lengths with precomputed prefix sums. Avoids expanding regular dimensions into lists of identical values. +4. 
**Fixed vs Varying per dimension.** `FixedDimension(size, extent)` for uniform chunks; `VaryingDimension(edges, extent)` for per-chunk edge lengths with precomputed prefix sums. Avoids expanding regular dimensions into lists of identical values. 5. **Transparent transitions.** Operations like `resize()` can move an array from regular to rectilinear chunking. ### Internal representation @@ -43,11 +44,16 @@ A registry-based plugin system adds complexity without clear benefit. class FixedDimension: """Uniform chunk size. Boundary chunks contain less data but are encoded at full size by the codec pipeline.""" - size: int # chunk edge length (> 0) + size: int # chunk edge length (>= 0) extent: int # array dimension length + def __post_init__(self) -> None: + # validates size >= 0 and extent >= 0 + @property def nchunks(self) -> int: + if self.size == 0: + return 1 if self.extent == 0 else 0 return ceildiv(self.extent, self.size) def index_to_chunk(self, idx: int) -> int: @@ -57,23 +63,31 @@ class FixedDimension: def chunk_size(self, chunk_ix: int) -> int: return self.size # always uniform def data_size(self, chunk_ix: int) -> int: - return min(self.size, self.extent - chunk_ix * self.size) # clipped at extent + return max(0, min(self.size, self.extent - chunk_ix * self.size)) def indices_to_chunks(self, indices: NDArray) -> NDArray: return indices // self.size + def with_extent(self, new_extent: int) -> FixedDimension: + return FixedDimension(size=self.size, extent=new_extent) + def resize(self, new_extent: int) -> FixedDimension: + return FixedDimension(size=self.size, extent=new_extent) @dataclass(frozen=True) class VaryingDimension: - """Explicit per-chunk sizes. No padding — each edge length is - both the codec size and the data size.""" + """Explicit per-chunk sizes. The last chunk may extend past the array + extent, in which case data_size clips to the valid region while + chunk_size returns the full edge length for codec processing.""" edges: tuple[int, ...] 
# per-chunk edge lengths (all > 0) cumulative: tuple[int, ...] # prefix sums for O(log n) lookup + extent: int # array dimension length (may be < sum(edges)) + + def __init__(self, edges: Sequence[int], extent: int) -> None: + # validates edges non-empty, all > 0, extent >= 0, extent <= sum(edges) + # computes cumulative via itertools.accumulate + # uses object.__setattr__ for frozen dataclass @property def nchunks(self) -> int: return len(self.edges) - @property - def extent(self) -> int: - return self.cumulative[-1] def index_to_chunk(self, idx: int) -> int: return bisect.bisect_right(self.cumulative, idx) @@ -82,12 +96,19 @@ class VaryingDimension: def chunk_size(self, chunk_ix: int) -> int: return self.edges[chunk_ix] def data_size(self, chunk_ix: int) -> int: - return self.edges[chunk_ix] # same as chunk_size + offset = self.chunk_offset(chunk_ix) + return max(0, min(self.edges[chunk_ix], self.extent - offset)) def indices_to_chunks(self, indices: NDArray) -> NDArray: return np.searchsorted(self.cumulative, indices, side='right') + def with_extent(self, new_extent: int) -> VaryingDimension: + # validates edge_sum >= new_extent, re-binds extent + return VaryingDimension(self.edges, extent=new_extent) + def resize(self, new_extent: int) -> VaryingDimension: + # grow: append chunk of size (new_extent - old_extent) + # shrink: drop trailing chunks, keep those up to new_extent ``` -Both types share a common interface: `nchunks`, `extent`, `index_to_chunk`, `chunk_offset`, `chunk_size`, `data_size`, `indices_to_chunks`. Memory usage scales with the number of *varying* dimensions, not total chunks. +Both types implement the `DimensionGrid` protocol: `nchunks`, `extent`, `index_to_chunk`, `chunk_offset`, `chunk_size`, `data_size`, `indices_to_chunks`, `with_extent`, `resize`. Memory usage scales with the number of *varying* dimensions, not total chunks. 
The two size methods serve different consumers: @@ -96,7 +117,29 @@ The two size methods serve different consumers: | `chunk_size` | Buffer size for codec processing | Codec pipeline (`ArraySpec.shape`) | | `data_size` | Valid data region within the buffer | Indexing pipeline (`chunk_selection` slicing) | -For `FixedDimension`, these differ only at the boundary. For `VaryingDimension`, they are identical. This matches current zarr-python behavior: `get_chunk_spec` passes the full `chunk_shape` to the codec for all chunks, and the indexer generates a `chunk_selection` that clips the decoded buffer. +For `FixedDimension`, these differ only at the boundary. For `VaryingDimension`, these differ only when the last chunk extends past the extent (i.e., `extent < sum(edges)`). This matches current zarr-python behavior: `get_chunk_spec` passes the full `chunk_shape` to the codec for all chunks, and the indexer generates a `chunk_selection` that clips the decoded buffer. + +### DimensionGrid Protocol + +```python +@runtime_checkable +class DimensionGrid(Protocol): + """Structural interface shared by FixedDimension and VaryingDimension.""" + + @property + def nchunks(self) -> int: ... + @property + def extent(self) -> int: ... + def index_to_chunk(self, idx: int) -> int: ... + def chunk_offset(self, chunk_ix: int) -> int: ... + def chunk_size(self, chunk_ix: int) -> int: ... + def data_size(self, chunk_ix: int) -> int: ... + def indices_to_chunks(self, indices: NDArray[np.intp]) -> NDArray[np.intp]: ... + def with_extent(self, new_extent: int) -> DimensionGrid: ... + def resize(self, new_extent: int) -> DimensionGrid: ... +``` + +The protocol is `@runtime_checkable`, enabling polymorphic handling of both dimension types without `isinstance` checks. ### ChunkSpec @@ -115,7 +158,7 @@ class ChunkSpec: return self.shape != self.codec_shape ``` -For interior chunks, `shape == codec_shape`. 
For boundary chunks of a regular grid, `codec_shape` is the full declared chunk size while `shape` is clipped. For rectilinear grids, `shape == codec_shape` always. +For interior chunks, `shape == codec_shape`. For boundary chunks of a regular grid, `codec_shape` is the full declared chunk size while `shape` is clipped. For rectilinear grids, `shape == codec_shape` unless the last chunk extends past the extent. ### API @@ -123,36 +166,40 @@ For interior chunks, `shape == codec_shape`. For boundary chunks of a regular gr # Creating arrays arr = zarr.create_array(shape=(100, 200), chunks=(10, 20)) # regular arr = zarr.create_array(shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]]) # rectilinear -arr = zarr.create_array(shape=(1000,), chunks=[[[100, 10]]]) # RLE shorthand # ChunkGrid as a collection -grid = arr.chunk_grid # ChunkGrid instance -grid.shape # (10, 8) — number of chunks per dimension -grid.ndim # 2 -grid.is_regular # True if all dimensions are Fixed +grid = arr.metadata.chunk_grid # ChunkGrid instance +grid.shape # (10, 10) — number of chunks per dimension +grid.ndim # 2 +grid.is_regular # True if all dimensions are Fixed -spec = grid[0, 1] # ChunkSpec for chunk at grid position (0, 1) -spec.slices # (slice(0, 10), slice(25, 50)) -spec.shape # (10, 25) — data shape -spec.codec_shape # (10, 25) — same for interior chunks +spec = grid[0, 1] # ChunkSpec for chunk at grid position (0, 1) +spec.slices # (slice(0, 10), slice(20, 40)) +spec.shape # (10, 20) — data shape +spec.codec_shape # (10, 20) — same for interior chunks -boundary = grid[9, 0] # boundary chunk (extent=95, size=10) -boundary.shape # (5, 25) — 5 elements of real data -boundary.codec_shape # (10, 25) — codec sees full buffer +boundary = grid[9, 0] # boundary chunk (extent=100, size=10) +boundary.shape # (10, 20) — data shape +boundary.codec_shape # (10, 20) — codec sees full buffer -grid[99, 99] # None — out of bounds +grid[99, 99] # None — out of bounds -for spec in grid: # iterate all 
chunks +for spec in grid: # iterate all chunks ... -# .chunks property: retained for regular grids, raises for rectilinear -arr.chunks # (10, 25) +# .chunks property: retained for regular grids, raises NotImplementedError for rectilinear +arr.chunks # (10, 20) + +# .chunk_sizes property: works for all grids (dask-style) +arr.chunk_sizes # ((10, 10, ..., 10), (20, 20, ..., 20)) ``` `ChunkGrid.__getitem__` constructs `ChunkSpec` using `chunk_size` for `codec_shape` and `data_size` for `slices`: ```python -def __getitem__(self, coords: tuple[int, ...]) -> ChunkSpec | None: +def __getitem__(self, coords: int | tuple[int, ...]) -> ChunkSpec | None: + if isinstance(coords, int): + coords = (coords,) slices = [] codec_shape = [] for dim, ix in zip(self.dimensions, coords): @@ -164,44 +211,92 @@ def __getitem__(self, coords: tuple[int, ...]) -> ChunkSpec | None: return ChunkSpec(tuple(slices), tuple(codec_shape)) ``` +#### Construction + +```python +# Regular grid — all FixedDimension +grid = ChunkGrid.from_regular(array_shape=(100, 200), chunk_shape=(10, 20)) + +# Rectilinear grid — per-dimension edge lists +# Dims with all-identical edges are stored as FixedDimension (optimization) +grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + +# Direct construction +grid = ChunkGrid(dimensions=(FixedDimension(10, 100), VaryingDimension([10, 20, 30], 60))) +``` + #### Serialization ```python # Regular grid: {"name": "regular", "configuration": {"chunk_shape": [10, 20]}} -# Rectilinear grid (with RLE compression): -{"name": "rectilinear", "configuration": {"chunk_shapes": [[10, 20, 30], [[25, 4]]]}} +# Rectilinear grid (with RLE compression and "kind" field): +{"name": "rectilinear", "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [[25, 4]]]}} ``` -Both names deserialize to the same `ChunkGrid` class. 
The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. For rectilinear grids, the extent is redundant (`sum(edges)`) and is validated for consistency. +Both names deserialize to the same `ChunkGrid` class. The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. + +**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`, which stores the chunk grid's JSON `name` in the `chunk_grid_name` field. `serialize_chunk_grid(grid, name)` is called by `ArrayV3Metadata.to_dict()`. This gives round-trip fidelity — a store written as rectilinear with uniform edges stays rectilinear. + +For `create_array`, the format is inferred from the `chunks` argument: a flat tuple produces `"regular"`, a nested list produces `"rectilinear"`. The `_is_rectilinear_chunks()` helper detects nested sequences like `[[10, 20], [5, 5]]`. -**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`, which already knows how to produce its JSON document. The flow is always: metadata document → `ChunkGrid` (via `parse_chunk_grid`), never the reverse. The grid is a pure runtime computation object. +##### Rectilinear spec compliance -`ArrayV3Metadata` stores the chunk grid's JSON `name` from the original metadata document and uses it when serializing back. This gives round-trip fidelity for free — a store written as rectilinear with uniform edges stays rectilinear. +The rectilinear format requires `"kind": "inline"` (validated by `_validate_rectilinear_kind()`). Per the spec, each element of `chunk_shapes` can be: -The only place where a user needs to choose the format is when creating new metadata. 
For `create_array`, the format is inferred from the `chunks` argument: a flat tuple produces `"regular"`, a nested list produces `"rectilinear"`. For `resize`, the format can be specified explicitly via `chunk_grid_metadata`: +- A bare integer `m`: repeated until `sum >= array_extent` +- A list of bare integers: explicit per-chunk sizes +- A mixed array of bare integers and `[value, count]` RLE pairs + +RLE compression is used when serializing: runs of identical sizes become `[value, count]` pairs, singletons stay as bare integers. ```python -arr.resize( - (80, 100), - chunks=[[10, 20, 30, 20], [25, 25, 25, 25]], - chunk_grid_metadata="rectilinear", -) +# _compress_rle([10, 10, 10, 5]) -> [[10, 3], 5] +# _expand_rle([[10, 3], 5]) -> [10, 10, 10, 5] ``` -`chunk_grid_metadata` is typed as `str`, not a closed literal — the Zarr V3 spec allows any registered chunk grid name. zarr-python supports `"regular"` and `"rectilinear"` natively; other names (e.g., zarrs' `"regular_bounded"`) would raise unless a handler is registered. If omitted, the format is inferred: `"rectilinear"` when chunks are non-uniform or explicitly nested, `"regular"` when chunks are a flat tuple and evenly divide the shape. Specifying `"regular"` when the chunks are non-uniform raises an error. +For `FixedDimension` serialized as rectilinear, `_serialize_fixed_dim()` produces a compact representation: bare integer when evenly divisible, `[size, last_data]` for two chunks, `[[size, n-1], last_data]` for more. + +#### chunk_sizes + +The `chunk_sizes` property provides universal access to per-dimension chunk data sizes, matching the dask `Array.chunks` convention. 
It works for both regular and rectilinear grids: + +```python +>>> arr = zarr.create_array(store, shape=(100, 80), chunks=(30, 40)) +>>> arr.chunk_sizes +((30, 30, 30, 10), (40, 40)) + +>>> arr = zarr.create_array(store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]]) +>>> arr.chunk_sizes +((10, 20, 30), (50, 50)) +``` #### Resize ```python -arr.resize((80, 100)) # inferred rectilinear if not evenly divisible -arr.resize((80, 100), chunks=[[10, 20, 30, 20], [25, 25, 25, 25]]) # explicit chunks -arr.resize((70, 100)) # stays regular if divisible -arr.resize((100, 100), chunk_grid_metadata="rectilinear") # force rectilinear metadata +arr.resize((80, 100)) # re-binds extent; FixedDimension stays fixed +arr.resize((200, 100)) # VaryingDimension grows by appending a new chunk +arr.resize((30, 100)) # VaryingDimension shrinks by dropping trailing chunks +``` + +Resize uses `ChunkGrid.update_shape(new_shape)`, which delegates to each dimension's `.resize()` method: +- `FixedDimension.resize()`: simply re-binds the extent (identical to `with_extent`) +- `VaryingDimension.resize()`: grow appends a chunk of size `new_extent - old_extent`; shrink drops trailing chunks whose cumulative offset lies beyond the new extent + +#### from_array + +The `from_array()` function handles both regular and rectilinear source arrays: + +```python +src = zarr.create_array(store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]]) +new = zarr.from_array(data=src, store=new_store, chunks="keep") +# Preserves rectilinear structure: new.chunk_sizes == ((10, 20, 30), (50, 50)) ``` -Resize creates new `ArrayV3Metadata` (and thus a new `ChunkGrid`). Since resize always creates new metadata, `chunk_grid_metadata` is the natural place to choose the serialization format. 
+When `chunks="keep"`, the logic checks `data.metadata.chunk_grid.is_regular`: +- Regular: extracts `data.chunks` (flat tuple) and preserves shards +- Rectilinear: extracts `data.chunk_sizes` (nested tuples) and forces shards to None ### Indexing @@ -233,7 +328,7 @@ Read: store → decode to codec_shape → slice via chunk_selection → user da ### Sharding -PR #3534 marks sharding as incompatible with rectilinear grids. This is unnecessary — sharding has three independent grid levels: +The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as extent and the subchunk shape as `FixedDimension`. Each shard is self-contained — it doesn't need to know whether the outer grid is regular or rectilinear. Rectilinear chunks with sharding currently raise `ValueError` pending further validation work. ``` Level 1 — Outer chunk grid (shard boundaries): regular or rectilinear Level 2 — Inner subchunk grid (within each shard): always regular Level 3 — Shard index: ceil(shard_dim / subchunk_dim) entries per dimension ``` -The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as extent and the subchunk shape as `FixedDimension`. Each shard is self-contained — it doesn't need to know whether the outer grid is regular or rectilinear. - [zarr-specs#370](https://github.com/zarr-developers/zarr-specs/pull/370) lifts the requirement that subchunk shapes evenly divide the shard shape. With the proposed `ChunkGrid`, this just means removing the `shard_shape % subchunk_shape == 0` validation — `FixedDimension` already handles boundary clipping via `data_size`. 
| Outer grid | Subchunk divisibility | Required change | @@ -259,7 +352,7 @@ The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as | `ChunkGrid` ABC + `RegularChunkGrid` subclass | Single concrete `ChunkGrid` with `is_regular` | | `RectilinearChunkGrid` (#3534) | Same `ChunkGrid` class | | Chunk grid registry + entrypoints (#3735) | Direct name dispatch | -| `arr.chunks` | Retained for regular; `arr.chunk_grid` for general use | +| `arr.chunks` | Retained for regular; `arr.chunk_sizes` for general use | | `get_chunk_shape(shape, coord)` | `grid[coord].codec_shape` or `grid[coord].shape` | ## Design decisions @@ -279,7 +372,7 @@ This matches current zarr-python behavior and matters for: 2. **Codec simplicity.** Codecs assume uniform input shapes for regular grids. 3. **Shard index correctness.** The index assumes `subchunk_dim`-sized entries. -For `VaryingDimension`, `chunk_size == data_size` — no padding. This is the fundamental difference: `FixedDimension` has a declared size plus an extent that clips data; `VaryingDimension` has explicit sizes that *are* the extent. +For `VaryingDimension`, `chunk_size == data_size` when `extent == sum(edges)`. When `extent < sum(edges)` (e.g., after a resize that keeps the last chunk oversized), `data_size` clips the last chunk. This is the fundamental difference: `FixedDimension` has a declared size plus an extent that clips data; `VaryingDimension` has explicit sizes that normally *are* the extent but can also extend past it. ### Why not a chunk grid registry? @@ -301,12 +394,12 @@ All known grids are special cases of rectilinear. A Protocol-based approach mean 1. **Amend and merge #3735.** Keep the `chunk_grids/` module layout. Replace the registry with direct name dispatch. Remove `register_chunk_grid` / `get_chunk_grid_class` and the entrypoint. 2. **Open a new PR** implementing this prospectus: - - `FixedDimension`, `VaryingDimension`, `ChunkSpec`, and `ChunkGrid` classes. 
+ - `FixedDimension`, `VaryingDimension`, `DimensionGrid` protocol, `ChunkSpec`, and `ChunkGrid` classes. - `parse_chunk_grid(metadata, array_shape)` with `"regular"` and `"rectilinear"` dispatch. - Port RLE helpers, `resolve_chunk_spec`, `ChunksLike`, and validation functions from #3534. - Refactor per-dimension indexers to accept `FixedDimension | VaryingDimension`. - Update `get_chunk_spec` to use `grid[chunk_coords].codec_shape`. - - Add `arr.chunk_grid`. Keep `.chunks` for regular, raise for rectilinear. + - Add `arr.chunk_sizes`. Keep `.chunks` for regular, raise for rectilinear. - Remove the "sharding incompatible with rectilinear" guard. - Adapt tests from #3534. 3. **Close trial PRs** with credits: @@ -354,10 +447,10 @@ All four downstream PRs/issues follow the same pattern: ## Open questions -1. **RLE in the Python API:** Should users pass RLE-encoded chunk specs directly, or only expanded lists? -2. **Resize defaults:** When growing a regular array, should the default preserve regularity or transition to rectilinear? -3. **`ChunkSpec` complexity:** `ChunkSpec` carries both `slices` and `codec_shape`. Should the grid expose separate methods for codec vs data queries instead? -4. **`__getitem__` with slices:** Should `grid[0, :]` or `grid[0:3, :]` return a sub-grid or an iterator of `ChunkSpec`s? +1. **Resize defaults:** When growing a regular array, should the default preserve regularity or transition to rectilinear? +2. **`ChunkSpec` complexity:** `ChunkSpec` carries both `slices` and `codec_shape`. Should the grid expose separate methods for codec vs data queries instead? +3. **`__getitem__` with slices:** Should `grid[0, :]` or `grid[0:3, :]` return a sub-grid or an iterator of `ChunkSpec`s? +4. **Rectilinear + sharding:** The current POC raises `ValueError` for rectilinear chunks with sharding. When should this be relaxed? 
## Proofs of concepts @@ -385,7 +478,7 @@ All four downstream PRs/issues follow the same pattern: ### PR 2: Unified `ChunkGrid` class (replaces old hierarchy) **Files**: `chunk_grids.py` (new `ChunkGrid` class + `RegularChunkGrid` compat wrapper) -**Scope**: New `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `all_chunk_coords()` (no shape arg), `is_regular`, `chunk_shape`. Keep `RegularChunkGrid` as backwards-compat subclass. Add `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()`. Tests for the grid class itself. +**Scope**: New `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `all_chunk_coords()` (no shape arg), `is_regular`, `chunk_shape`, `chunk_sizes`. Keep `RegularChunkGrid` as backwards-compat subclass. Add `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()`. Tests for the grid class itself. ### PR 3: Indexing generalization @@ -413,4 +506,4 @@ All four downstream PRs/issues follow the same pattern: - PR 3 is the biggest behavioral change. - PRs 4–5 wire things together. - PR 6 adds comprehensive test coverage. -- Each PR builds on the previous but is independently reviewable. \ No newline at end of file +- Each PR builds on the previous but is independently reviewable. 
diff --git a/mkdocs.yml b/mkdocs.yml index 61872b6234..5e261dd1ae 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -75,6 +75,8 @@ nav: - Creation sub-module: api/zarr/deprecated/creation.md - release-notes.md - contributing.md + - Design documents: + - design/chunk-grid.md watch: - src/zarr - docs From 4eb01c54d2edc5994d76257c4679576fe3ed41fc Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 16:26:35 -0400 Subject: [PATCH 040/118] Require array extent --- docs/design/chunk-grid.md | 17 ++++-- src/zarr/core/array.py | 4 +- src/zarr/core/chunk_grids.py | 43 +++++++++++--- tests/test_unified_chunk_grid.py | 98 ++++++++++++++++---------------- 4 files changed, 99 insertions(+), 63 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 3df1cb8d74..cf50afbda8 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -213,18 +213,25 @@ def __getitem__(self, coords: int | tuple[int, ...]) -> ChunkSpec | None: #### Construction +Both `from_regular` and `from_rectilinear` require `array_shape`, binding the extent per dimension at construction time. This is a core design choice: a chunk grid is a concrete arrangement for a specific array, not an abstract tiling pattern. 
+ ```python # Regular grid — all FixedDimension grid = ChunkGrid.from_regular(array_shape=(100, 200), chunk_shape=(10, 20)) -# Rectilinear grid — per-dimension edge lists -# Dims with all-identical edges are stored as FixedDimension (optimization) -grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) +# Rectilinear grid — extent = sum(edges) when shape matches +grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) + +# Rectilinear grid with boundary clipping — last chunk extends past array extent +# e.g., shape=(55, 90) but edges sum to (60, 100): data_size clips at extent +grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(55, 90)) # Direct construction -grid = ChunkGrid(dimensions=(FixedDimension(10, 100), VaryingDimension([10, 20, 30], 60))) +grid = ChunkGrid(dimensions=(FixedDimension(10, 100), VaryingDimension([10, 20, 30], 55))) ``` +When `extent < sum(edges)`, the dimension is always stored as `VaryingDimension` (even if all edges are identical) to preserve the explicit edge count. The last chunk's `chunk_size` returns the full declared edge (codec buffer) while `data_size` clips to the extent. This mirrors how `FixedDimension` handles boundary chunks in regular grids. + #### Serialization ```python @@ -434,7 +441,7 @@ All four downstream PRs/issues follow the same pattern: | `cg.chunk_shape` | `cg.dimensions[i].size` or `cg[coord].shape` | | `cg.chunk_shapes` | `tuple(d.edges for d in cg.dimensions)` | | `RegularChunkGrid(chunk_shape=...)` | `ChunkGrid.from_regular(shape, chunks)` | -| `RectilinearChunkGrid(chunk_shapes=...)` | `ChunkGrid.from_rectilinear(edges)` | +| `RectilinearChunkGrid(chunk_shapes=...)` | `ChunkGrid.from_rectilinear(edges, shape)` | | Feature detection via class import | Version check or `hasattr(ChunkGrid, 'is_regular')` | **[xarray#10880](https://github.com/pydata/xarray/pull/10880):** Replace `isinstance` checks with `.is_regular`. 
Write path simplifies with `chunks=[[...]]` API. ~1–2 days. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f57de704f3..0eeb5901a0 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4729,7 +4729,7 @@ async def init_array( "chunks=(inner_size, ...), shards=[[shard_sizes], ...]" ) rect_chunks = cast("Sequence[Sequence[int]]", chunks) - rectilinear_grid = ChunkGrid.from_rectilinear(rect_chunks) + rectilinear_grid = ChunkGrid.from_rectilinear(rect_chunks, array_shape=shape_parsed) # Use first chunk size per dim as placeholder for _auto_partition chunks_flat: tuple[int, ...] | Literal["auto"] = tuple( dim_edges[0] for dim_edges in rect_chunks @@ -4744,7 +4744,7 @@ async def init_array( if zarr_format == 2: raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") rect_shards = cast("Sequence[Sequence[int]]", shards) - rectilinear_grid = ChunkGrid.from_rectilinear(rect_shards) + rectilinear_grid = ChunkGrid.from_rectilinear(rect_shards, array_shape=shape_parsed) # Use first shard size per dim as placeholder for _auto_partition shards_for_partition = tuple(dim_edges[0] for dim_edges in rect_shards) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 4d3b889734..b912cacbc7 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -196,7 +196,8 @@ class ChunkSpec: ``codec_shape`` gives the buffer shape for codec processing. For interior chunks these are equal. For boundary chunks of a regular grid, ``codec_shape`` is the full declared chunk size while ``shape`` - is clipped. For rectilinear grids, ``shape == codec_shape`` always. + is clipped. For rectilinear grids, ``shape == codec_shape`` unless the + last chunk extends past the array extent. """ slices: tuple[slice, ...] 
@@ -407,20 +408,46 @@ def from_regular(cls, array_shape: ShapeLike, chunk_shape: ShapeLike) -> ChunkGr return cls(dimensions=dims) @classmethod - def from_rectilinear(cls, chunk_shapes: Sequence[Sequence[int]]) -> ChunkGrid: + def from_rectilinear( + cls, + chunk_shapes: Sequence[Sequence[int]], + array_shape: ShapeLike, + ) -> ChunkGrid: """Create a ChunkGrid with per-dimension edge lists. Each element of chunk_shapes is a sequence of chunk sizes for that dimension. - If all sizes in a dimension are identical, it's stored as FixedDimension. - The extent of each dimension is ``sum(edges)``. + If all sizes in a dimension are identical *and* the extent equals + ``sum(edges)``, the dimension is stored as ``FixedDimension``. + Otherwise it is stored as ``VaryingDimension``, preserving the + explicit edge count (important when the last chunk extends past + the array boundary). + + Parameters + ---------- + chunk_shapes + Per-dimension sequences of chunk edge lengths. + array_shape + The array shape to bind as the extent per dimension. The last + chunk along each dimension may extend past the array boundary + (the edge is the codec buffer size; ``data_size`` clips to the + extent). """ + extents = parse_shapelike(array_shape) + if len(extents) != len(chunk_shapes): + raise ValueError( + f"array_shape has {len(extents)} dimensions but chunk_shapes " + f"has {len(chunk_shapes)} dimensions" + ) dims: list[DimensionGrid] = [] - for edges in chunk_shapes: + for edges, extent in zip(chunk_shapes, extents, strict=True): edges_list = list(edges) if not edges_list: raise ValueError("Each dimension must have at least one chunk") - extent = sum(edges_list) - if all(e == edges_list[0] for e in edges_list): + edge_sum = sum(edges_list) + # Only collapse to FixedDimension when edges are uniform AND + # extent equals edge_sum. When extent < edge_sum the explicit + # edge count matters (overflow chunks), so use VaryingDimension. 
+ if all(e == edges_list[0] for e in edges_list) and extent == edge_sum: dims.append(FixedDimension(size=edges_list[0], extent=extent)) else: dims.append(VaryingDimension(edges_list, extent=extent)) @@ -637,7 +664,7 @@ def parse_chunk_grid( f"Rectilinear chunk edges for dimension {i} sum to {edge_sum} " f"but array shape extent is {extent} (edge sum must be >= extent)" ) - return ChunkGrid.from_rectilinear(decoded) + return ChunkGrid.from_rectilinear(decoded, array_shape=array_shape) raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index a2d27d412f..5c3bcdc930 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -191,7 +191,7 @@ def test_zero_dim(self) -> None: assert g.ndim == 0 def test_from_rectilinear(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) assert not g.is_regular assert g.ndim == 2 with pytest.raises(ValueError, match="only available for regular"): @@ -199,13 +199,13 @@ def test_from_rectilinear(self) -> None: def test_rectilinear_with_uniform_dim(self) -> None: """A rectilinear grid with all-same sizes in one dim stores it as Fixed.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) assert isinstance(g.dimensions[0], VaryingDimension) assert isinstance(g.dimensions[1], FixedDimension) def test_all_uniform_becomes_regular(self) -> None: """If all dimensions have uniform sizes, the grid is regular.""" - g = ChunkGrid.from_rectilinear([[10, 10, 10], [25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 10, 10], [25, 25]], array_shape=(30, 50)) assert g.is_regular assert g.chunk_shape == (10, 25) @@ -225,7 +225,7 @@ def test_regular_shape_boundary(self) -> None: assert g.shape == (10, 10) # ceildiv(95, 
10) == 10 def test_rectilinear_shape(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) assert g.shape == (3, 4) def test_regular_getitem(self) -> None: @@ -249,7 +249,7 @@ def test_regular_getitem_oob(self) -> None: assert g[(99, 0)] is None def test_rectilinear_getitem(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) spec0 = g[(0, 0)] assert spec0 is not None assert spec0.shape == (10, 25) @@ -265,20 +265,20 @@ def test_rectilinear_getitem(self) -> None: assert g[(3, 0)] is None # OOB def test_getitem_slices(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) spec = g[(1, 2)] assert spec is not None assert spec.slices == (slice(10, 30), slice(50, 75)) def test_all_chunk_coords(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) coords = list(g.all_chunk_coords()) assert len(coords) == 6 assert coords[0] == (0, 0) assert coords[-1] == (2, 1) def test_get_nchunks(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) assert g.get_nchunks() == 6 def test_iter(self) -> None: @@ -328,7 +328,7 @@ def test_regular_roundtrip(self) -> None: assert g2.chunk_shape == (10, 20) def test_rectilinear_roundtrip(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" g2 = parse_chunk_grid(d, (60, 100)) @@ -343,7 +343,7 @@ def 
test_rectilinear_roundtrip(self) -> None: def test_rectilinear_rle_serialization(self) -> None: """RLE should be used when it actually compresses.""" - g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]], array_shape=(1000, 100)) # All uniform, but name is chosen by the metadata layer, not the grid. # Serializing as "regular" works because is_regular is True. d = serialize_chunk_grid(g, "regular") @@ -351,12 +351,14 @@ def test_rectilinear_rle_serialization(self) -> None: def test_rectilinear_uniform_stays_rectilinear(self) -> None: """A rectilinear grid with uniform edges stays rectilinear if the name says so.""" - g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]], array_shape=(1000, 100)) d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" def test_rectilinear_rle_with_varying(self) -> None: - g = ChunkGrid.from_rectilinear([[100, 100, 100, 50], [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear( + [[100, 100, 100, 50], [25, 25, 25, 25]], array_shape=(350, 100) + ) d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" config = d["configuration"] @@ -366,7 +368,7 @@ def test_rectilinear_rle_with_varying(self) -> None: assert chunk_shapes[0] == [[100, 3], 50] def test_json_roundtrip(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) d = serialize_chunk_grid(g, "rectilinear") json_str = json.dumps(d) d2 = json.loads(json_str) @@ -378,7 +380,7 @@ def test_unknown_name_raises(self) -> None: parse_chunk_grid({"name": "hexagonal", "configuration": {}}, (10,)) def test_serialize_non_regular_as_regular_raises(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 
100)) with pytest.raises(ValueError, match="Cannot serialize a non-regular chunk grid"): serialize_chunk_grid(g, "regular") @@ -411,7 +413,7 @@ def test_kind_unknown_rejected(self) -> None: def test_kind_inline_in_serialized_output(self) -> None: """Serialization includes kind: 'inline'.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) d = serialize_chunk_grid(g, "rectilinear") config = d["configuration"] assert isinstance(config, dict) @@ -475,7 +477,7 @@ class TestParseChunkGridValidation: def test_varying_extent_mismatch_raises(self) -> None: from zarr.core.chunk_grids import parse_chunk_grid - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) # VaryingDimension extent is 60, but array_shape says 100 with pytest.raises(ValueError, match="extent"): parse_chunk_grid(g, (100, 100)) @@ -483,7 +485,7 @@ def test_varying_extent_mismatch_raises(self) -> None: def test_varying_extent_match_ok(self) -> None: from zarr.core.chunk_grids import parse_chunk_grid - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) # Matching extents should work fine g2 = parse_chunk_grid(g, (60, 100)) assert g2.dimensions[0].extent == 60 @@ -542,7 +544,7 @@ def test_varying_dimension_extent_mismatch_on_chunkgrid_input(self) -> None: After resize, extent >= array_shape is allowed (last chunk extends past boundary). But extent < array_shape is still an error. 
""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) with pytest.raises(ValueError, match="less than"): parse_chunk_grid(g, (100, 50)) @@ -558,7 +560,7 @@ class TestRectilinearIndexing: def test_basic_indexer_rectilinear(self) -> None: from zarr.core.indexing import BasicIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) indexer = BasicIndexer( selection=(slice(None), slice(None)), shape=(60, 100), @@ -578,7 +580,7 @@ def test_basic_indexer_rectilinear(self) -> None: def test_basic_indexer_int_selection(self) -> None: from zarr.core.indexing import BasicIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) indexer = BasicIndexer( selection=(15, slice(None)), shape=(60, 100), @@ -592,7 +594,7 @@ def test_basic_indexer_int_selection(self) -> None: def test_basic_indexer_slice_subset(self) -> None: from zarr.core.indexing import BasicIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) indexer = BasicIndexer( selection=(slice(5, 35), slice(0, 50)), shape=(60, 100), @@ -605,7 +607,7 @@ def test_basic_indexer_slice_subset(self) -> None: def test_orthogonal_indexer_rectilinear(self) -> None: from zarr.core.indexing import OrthogonalIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) indexer = OrthogonalIndexer( selection=(slice(None), slice(None)), shape=(60, 100), @@ -641,7 +643,7 @@ def test_create_rectilinear_array(self, tmp_path: Path) -> None: from zarr.core.dtype import Float32 from zarr.core.metadata.v3 import ArrayV3Metadata - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = 
ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) meta = AsyncArray._create_metadata_v3( shape=(60, 100), @@ -655,7 +657,7 @@ def test_create_rectilinear_array(self, tmp_path: Path) -> None: def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: """Verify metadata round-trips through JSON.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) d = serialize_chunk_grid(g, "rectilinear") g2 = parse_chunk_grid(d, (60, 100)) assert g2.shape == g.shape @@ -687,7 +689,7 @@ def test_chunk_grid_name_rectilinear(self, tmp_path: Path) -> None: from zarr.core.array import AsyncArray from zarr.core.dtype import Float32 - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) meta = AsyncArray._create_metadata_v3( shape=(60, 100), dtype=Float32(), @@ -783,7 +785,7 @@ def test_get_chunk_spec_rectilinear(self, tmp_path: Path) -> None: from zarr.core.buffer.core import default_buffer_prototype from zarr.core.dtype import Float32 - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) meta = AsyncArray._create_metadata_v3( shape=(60, 100), dtype=Float32(), @@ -815,7 +817,7 @@ def test_sharding_accepts_rectilinear_outer_grid(self) -> None: from zarr.core.dtype import Float32 codec = ShardingCodec(chunk_shape=(5, 5)) - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) codec.validate( shape=(60, 100), @@ -993,7 +995,7 @@ def test_parse_chunk_grid_preserves_varying_extent(self) -> None: """parse_chunk_grid does not overwrite VaryingDimension extent.""" from zarr.core.chunk_grids import parse_chunk_grid - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 
20, 30], [50, 50]], array_shape=(60, 100)) # VaryingDimension extent is 60 (sum of edges) assert isinstance(g.dimensions[0], VaryingDimension) assert g.dimensions[0].extent == 60 @@ -1031,7 +1033,7 @@ def test_getitem_int_1d_regular(self) -> None: def test_getitem_int_1d_rectilinear(self) -> None: """Integer indexing works for 1-d rectilinear grids.""" - g = ChunkGrid.from_rectilinear([[20, 30, 50]]) + g = ChunkGrid.from_rectilinear([[20, 30, 50]], array_shape=(100,)) spec = g[0] assert spec is not None assert spec.shape == (20,) @@ -1119,7 +1121,7 @@ def test_orthogonal_int_array_selection_rectilinear(self) -> None: chunk-local selections.""" from zarr.core.indexing import OrthogonalIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) indexer = OrthogonalIndexer( selection=(np.array([5, 15, 35]), slice(None)), shape=(60, 100), @@ -1136,7 +1138,7 @@ def test_orthogonal_bool_array_selection_rectilinear(self) -> None: """Boolean array selection with rectilinear grid.""" from zarr.core.indexing import OrthogonalIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) mask = np.zeros(60, dtype=bool) mask[5] = True mask[15] = True @@ -1154,7 +1156,7 @@ def test_orthogonal_advanced_indexing_produces_correct_projections(self) -> None for advanced indexing with VaryingDimension.""" from zarr.core.indexing import OrthogonalIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) indexer = OrthogonalIndexer( selection=(np.array([5, 15]), slice(None)), shape=(60, 100), @@ -1182,7 +1184,7 @@ def test_sharding_rejects_non_divisible_rectilinear(self) -> None: codec = ShardingCodec(chunk_shape=(5, 5)) # 17 is not divisible by 5 - g = ChunkGrid.from_rectilinear([[10, 20, 17], [50, 50]]) + g = 
ChunkGrid.from_rectilinear([[10, 20, 17], [50, 50]], array_shape=(47, 100)) with pytest.raises(ValueError, match="divisible"): codec.validate( @@ -1197,7 +1199,7 @@ def test_sharding_accepts_divisible_rectilinear(self) -> None: from zarr.core.dtype import Float32 codec = ShardingCodec(chunk_shape=(5, 5)) - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) # Should not raise codec.validate( @@ -1623,7 +1625,7 @@ def test_regular_grid_exact(self) -> None: assert grid.chunk_sizes == ((30, 30, 30), (40, 40)) def test_rectilinear_grid(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) assert grid.chunk_sizes == ((10, 20, 30), (50, 50)) def test_single_chunk(self) -> None: @@ -1675,39 +1677,39 @@ class TestUpdateShape: """Unit tests for ChunkGrid.update_shape().""" def test_no_change(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) new_grid = grid.update_shape((60, 50)) assert _edges(new_grid, 0) == (10, 20, 30) assert _edges(new_grid, 1) == (25, 25) def test_grow_single_dim(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) new_grid = grid.update_shape((80, 50)) assert _edges(new_grid, 0) == (10, 20, 30, 20) assert _edges(new_grid, 1) == (25, 25) def test_grow_multiple_dims(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20], [20, 30]]) + grid = ChunkGrid.from_rectilinear([[10, 20], [20, 30]], array_shape=(30, 50)) # from (30, 50) to (45, 65) new_grid = grid.update_shape((45, 65)) assert _edges(new_grid, 0) == (10, 20, 15) assert _edges(new_grid, 1) == (20, 30, 15) def test_shrink_single_dim(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 
20, 30, 40], [25, 25]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30, 40], [25, 25]], array_shape=(100, 50)) new_grid = grid.update_shape((35, 50)) # 10+20=30 < 35, 10+20+30=60 >= 35 → keep (10, 20, 30) assert _edges(new_grid, 0) == (10, 20, 30) assert _edges(new_grid, 1) == (25, 25) def test_shrink_to_single_chunk(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) new_grid = grid.update_shape((5, 50)) assert _edges(new_grid, 0) == (10,) assert _edges(new_grid, 1) == (25, 25) def test_shrink_multiple_dims(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 10, 15, 5], [20, 25, 15]]) + grid = ChunkGrid.from_rectilinear([[10, 10, 15, 5], [20, 25, 15]], array_shape=(40, 60)) # from (40, 60) to (25, 35) new_grid = grid.update_shape((25, 35)) # dim 0: 10+10=20 < 25, 10+10+15=35 >= 25 → keep (10, 10, 15) @@ -1716,19 +1718,19 @@ def test_shrink_multiple_dims(self) -> None: assert _edges(new_grid, 1) == (20, 25) def test_dimension_mismatch_error(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20], [30, 40]]) + grid = ChunkGrid.from_rectilinear([[10, 20], [30, 40]], array_shape=(30, 70)) with pytest.raises(ValueError, match="dimensions"): grid.update_shape((30, 70, 100)) def test_boundary_cases(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25]], array_shape=(60, 40)) # Grow to exact chunk boundary on dim 0, add 25 to dim 1 new_grid = grid.update_shape((60, 65)) assert _edges(new_grid, 0) == (10, 20, 30) # no change (60 == sum) assert _edges(new_grid, 1) == (15, 25, 25) # added chunk of 25 # Shrink to exact chunk boundary - grid2 = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25, 10]]) + grid2 = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25, 10]], array_shape=(60, 50)) new_grid2 = grid2.update_shape((30, 40)) # dim 0: 10+20=30 >= 30 → keep (10, 20) assert 
_edges(new_grid2, 0) == (10, 20) @@ -2124,7 +2126,7 @@ class TestUpdateShapeBoundary: """Resize creates boundary VaryingDimensions with correct extent.""" def test_shrink_creates_boundary(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) new_grid = grid.update_shape((45,)) dim = new_grid.dimensions[0] assert isinstance(dim, VaryingDimension) @@ -2134,7 +2136,7 @@ def test_shrink_creates_boundary(self) -> None: assert dim.data_size(2) == 15 # clipped: 45 - 30 = 15 def test_shrink_to_exact_boundary(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) new_grid = grid.update_shape((30,)) dim = new_grid.dimensions[0] assert isinstance(dim, VaryingDimension) @@ -2144,7 +2146,7 @@ def test_shrink_to_exact_boundary(self) -> None: def test_shrink_chunk_spec(self) -> None: """After shrink, ChunkSpec reflects boundary correctly.""" - grid = ChunkGrid.from_rectilinear([[10, 20, 30]]) + grid = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) new_grid = grid.update_shape((45,)) spec = new_grid[(2,)] assert spec is not None @@ -2154,7 +2156,7 @@ def test_shrink_chunk_spec(self) -> None: def test_parse_chunk_grid_rebinds_extent(self) -> None: """parse_chunk_grid re-binds VaryingDimension extent to array shape.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30]]) + g = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) # sum(edges)=60, array_shape=50 → re-bind extent g2 = parse_chunk_grid(g, (50,)) dim = g2.dimensions[0] From d893d6f58495b0340c0d0d0ae166f048c4919303 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 16:29:49 -0400 Subject: [PATCH 041/118] Add overflow chunk tests --- tests/test_unified_chunk_grid.py | 84 ++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 3 deletions(-) diff --git 
a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 5c3bcdc930..e2f5f8a892 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -35,9 +35,9 @@ def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: """Extract the per-chunk edge lengths for *dim* from a ChunkGrid.""" d = grid.dimensions[dim] if isinstance(d, FixedDimension): - return (d.size,) * d.nchunks + return tuple(d.size for _ in range(d.nchunks)) if isinstance(d, VaryingDimension): - return d.edges + return tuple(d.edges) raise TypeError(f"Unexpected dimension type: {type(d)}") @@ -1919,7 +1919,7 @@ async def test_append_with_partial_edge_chunks(self) -> None: assert arr.shape == (35, 30) result = await arr.getitem(slice(None)) - np.testing.assert_array_almost_equal(result, np.vstack([initial, append_data])) # type: ignore[arg-type] + np.testing.assert_array_almost_equal(result, np.vstack([initial, append_data])) async def test_append_small_data(self) -> None: store = zarr.storage.MemoryStore() @@ -2011,6 +2011,84 @@ def test_chunk_spec_interior_varying(self) -> None: assert spec.is_boundary is False +class TestMultipleOverflowChunks: + """Rectilinear grids where multiple chunks extend past the array extent.""" + + def test_multiple_chunks_past_extent(self) -> None: + """Chunks 2 is partial, chunk 3 is entirely past the extent.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) + d = g.dimensions[0] + assert d.nchunks == 4 + assert d.data_size(0) == 10 # fully within + assert d.data_size(1) == 20 # fully within + assert d.data_size(2) == 20 # partial: 50 - 30 = 20 + assert d.data_size(3) == 0 # entirely past + assert d.chunk_size(2) == 30 # codec buffer: full edge + assert d.chunk_size(3) == 40 # codec buffer: full edge + + def test_chunk_spec_entirely_past_extent(self) -> None: + """ChunkSpec for a chunk entirely past the extent has zero-size shape.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) 
+ spec = g[(3,)] + assert spec is not None + assert spec.shape == (0,) + assert spec.codec_shape == (40,) + assert spec.is_boundary is True + + def test_chunk_spec_partial_overflow(self) -> None: + """ChunkSpec for a partially-overflowing chunk clips correctly.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) + spec = g[(2,)] + assert spec is not None + assert spec.shape == (20,) + assert spec.codec_shape == (30,) + assert spec.is_boundary is True + assert spec.slices == (slice(30, 50),) + + def test_chunk_sizes_with_overflow(self) -> None: + """chunk_sizes returns clipped data sizes including zero for past-extent chunks.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) + assert g.chunk_sizes == ((10, 20, 20, 0),) + + def test_multidim_overflow(self) -> None: + """Overflow in multiple dimensions simultaneously.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30], [40, 40, 40]], array_shape=(45, 100)) + # dim 0: edges sum to 60, extent 45 → chunk 2 partial (45-30=15) + # dim 1: edges sum to 120, extent 100 → chunk 2 partial (100-80=20) + assert g.chunk_sizes == ((10, 20, 15), (40, 40, 20)) + spec = g[(2, 2)] + assert spec is not None + assert spec.shape == (15, 20) + assert spec.codec_shape == (30, 40) + + def test_uniform_edges_with_overflow_stays_varying(self) -> None: + """Uniform edges with extent < sum(edges) must stay VaryingDimension.""" + g = ChunkGrid.from_rectilinear([[10, 10, 10, 10]], array_shape=(35,)) + assert isinstance(g.dimensions[0], VaryingDimension) + assert not g.is_regular # can't collapse to FixedDimension + assert g.chunk_sizes == ((10, 10, 10, 5),) + assert g.dimensions[0].nchunks == 4 + + def test_serialization_roundtrip_overflow(self) -> None: + """Overflow chunks survive serialization round-trip.""" + g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) + serialized = serialize_chunk_grid(g, "rectilinear") + assert serialized == { + "name": "rectilinear", + 
"configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30, 40]]}, + } + g2 = parse_chunk_grid(serialized, (50,)) + assert g2.dimensions[0].nchunks == 4 + assert g2.chunk_sizes == ((10, 20, 20, 0),) + + def test_index_to_chunk_near_extent(self) -> None: + """Index lookup near and at the extent boundary.""" + d = VaryingDimension([10, 20, 30, 40], extent=50) + assert d.index_to_chunk(29) == 1 # last index in chunk 1 + assert d.index_to_chunk(30) == 2 # first index in chunk 2 + assert d.index_to_chunk(49) == 2 # last valid index + + class TestBoundaryIndexing: """Indexing operations on boundary chunks for both FixedDimension and VaryingDimension, ensuring the isinstance cleanup works correctly.""" From 308bb245aff9a436b347b48ed841357b0e55f68e Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 16:43:01 -0400 Subject: [PATCH 042/118] Design doc for chunk grid metadata separation --- docs/design/chunk-grid-metadata-separation.md | 253 ++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 docs/design/chunk-grid-metadata-separation.md diff --git a/docs/design/chunk-grid-metadata-separation.md b/docs/design/chunk-grid-metadata-separation.md new file mode 100644 index 0000000000..767090fe46 --- /dev/null +++ b/docs/design/chunk-grid-metadata-separation.md @@ -0,0 +1,253 @@ +# Chunk Grid: Metadata / Array Separation + +**Related:** + +- [chunk-grid.md](chunk-grid.md) (unified chunk grid design) + +## Problem + +`ArrayV3Metadata` stores `chunk_grid: ChunkGrid` — a behavioral object with index-to-chunk math, iteration, resize logic, and per-dimension grid types. Metadata should be a serializable data bag. The behavioral `ChunkGrid` carries runtime state (extent per dimension, prefix sums) that belongs on the array, not on the metadata record. + +This coupling causes several issues: + +1. 
**Metadata is not a simple DTO.** Constructing `ArrayV3Metadata` triggers `parse_chunk_grid()` which builds `FixedDimension`/`VaryingDimension` objects, computes prefix sums, and validates edge coverage. Metadata round-trips (`from_dict` → `to_dict`) pay this cost unnecessarily. +2. **Codec validation is misplaced.** `_validate_metadata()` calls `codec.validate(chunk_grid=...)`, passing the behavioral object. This conflates structural metadata validation with runtime array validation. +3. **`update_shape` lives on metadata.** Shape changes require constructing a new `ChunkGrid` with updated extents, serializing it back, and creating new metadata. The metadata layer shouldn't know about resize semantics. +4. **Redundant state.** `chunk_grid_name: str` exists solely to preserve serialization format because the `ChunkGrid` object doesn't carry its own name. With a plain dict, the name is just `chunk_grid["name"]`. + +## Design + +### Principles + +1. **Metadata stores what's on disk.** `ArrayV3Metadata.chunk_grid` holds the JSON dict exactly as it appears in `zarr.json`. No parsing, no behavioral objects, no computed state. +2. **The array owns the behavioral grid.** `AsyncArray` constructs a `ChunkGrid` from the metadata dict + shape at init time. All chunk-related behavior (indexing, iteration, resize) goes through the array's grid. +3. **Codec validation happens at array construction.** The array has the full context (shape + grid + dtype) needed to validate codecs. Metadata validates only structural properties (correct keys, matching ndim). +4. **No signature changes to downstream consumers.** Indexers and codecs still receive `ChunkGrid`. Only the *source* of the grid changes — from `metadata.chunk_grid` to `array.chunk_grid`. + +### Current architecture + +``` +ArrayV3Metadata + ├── chunk_grid: ChunkGrid ← behavioral object + ├── chunk_grid_name: str ← "regular" | "rectilinear" + └── _validate_metadata() → codec.validate(chunk_grid=...) 
+ get_chunk_spec() → chunk_grid[coords] + update_shape() → chunk_grid.update_shape() + to_dict() → serialize_chunk_grid(chunk_grid, name) + chunks → chunk_grid.chunk_shape + shards → chunk_grid.chunk_shape + +AsyncArray + └── self.metadata.chunk_grid ← delegates everything + +Indexers + └── __init__(chunk_grid: ChunkGrid) + +Codec.validate() + └── validate(shape, dtype, chunk_grid: ChunkGrid) +``` + +### Proposed architecture + +``` +ArrayV3Metadata + ├── chunk_grid: dict[str, JSON] ← plain serialized form + └── _validate_metadata() → structural checks only (ndim, required keys) + to_dict() → return self.chunk_grid + from_dict() → store dict as-is + +AsyncArray + ├── chunk_grid: ChunkGrid ← behavioral object, constructed on init + └── get_chunk_spec() ← moved from metadata + _validate_codecs() ← moved from metadata + update_shape() ← moved from metadata + +Indexers ← unchanged +Codec.validate() ← unchanged +``` + +### Metadata changes + +`ArrayV3Metadata` becomes simpler: + +```python +@dataclass(frozen=True, kw_only=True) +class ArrayV3Metadata(Metadata): + shape: tuple[int, ...] + data_type: ZDType[TBaseDType, TBaseScalar] + chunk_grid: dict[str, JSON] # plain JSON dict + chunk_key_encoding: ChunkKeyEncoding + fill_value: Any + codecs: tuple[Codec, ...] + # ... + + def __init__(self, *, chunk_grid, **kwargs): + # Store the dict directly. Validate structure only: + # - has "name" and "configuration" keys + # - ndim matches shape + # No parse_chunk_grid(), no ChunkGrid construction. + object.__setattr__(self, "chunk_grid", chunk_grid) + + def _validate_metadata(self): + # Structural: ndim from dict matches len(shape) + # No codec validation — that moves to the array. + name, config = parse_named_configuration(self.chunk_grid) + if name == "regular": + ndim = len(config["chunk_shape"]) + elif name == "rectilinear": + ndim = len(config["chunk_shapes"]) + if ndim != len(self.shape): + raise ValueError(...) 
+ + @property + def chunks(self) -> tuple[int, ...]: + name, config = parse_named_configuration(self.chunk_grid) + if name == "regular": + return tuple(config["chunk_shape"]) + raise NotImplementedError(...) + + def to_dict(self): + d = super().to_dict() + # chunk_grid is already a dict — no serialize_chunk_grid() needed + return d +``` + +The `chunk_grid_name` field is removed. Round-trip fidelity is preserved because the original dict is stored verbatim. + +### Array changes + +`AsyncArray` constructs the behavioral `ChunkGrid` and owns all chunk-related operations: + +```python +@dataclass(frozen=True) +class AsyncArray: + metadata: ArrayV2Metadata | ArrayV3Metadata + # ... + + def __init__(self, metadata, store_path, config): + # ... existing init ... + chunk_grid = parse_chunk_grid(metadata.chunk_grid, metadata.shape) + object.__setattr__(self, "_chunk_grid", chunk_grid) + # Codec validation moves here: + self._validate_codecs() + + @property + def chunk_grid(self) -> ChunkGrid: + return self._chunk_grid + + def get_chunk_spec(self, chunk_coords, array_config, prototype) -> ArraySpec: + spec = self.chunk_grid[chunk_coords] + if spec is None: + raise ValueError(...) + return ArraySpec(shape=spec.codec_shape, ...) + + def _validate_codecs(self) -> None: + for codec in self.metadata.codecs: + codec.validate( + shape=self.metadata.shape, + dtype=self.metadata.data_type, + chunk_grid=self.chunk_grid, + ) +``` + +For resize, the array constructs a new `ChunkGrid` with the new shape. For regular grids, the metadata dict doesn't change on resize — only the extent changes, which is runtime state not serialized in the chunk grid JSON. The array rebuilds its `ChunkGrid` with the new shape: + +```python +async def _resize(self, new_shape): + new_grid = self.chunk_grid.update_shape(new_shape) + # For regular grids, metadata.chunk_grid dict stays the same. 
+ # For rectilinear grids that grew/shrank, serialize back: + new_chunk_grid_dict = serialize_chunk_grid(new_grid, self.metadata.chunk_grid["name"]) + new_metadata = replace(self.metadata, shape=new_shape, chunk_grid=new_chunk_grid_dict) + # ... +``` + +### V2 metadata + +`ArrayV2Metadata` already stores `chunks: tuple[int, ...]` as plain data. Its `chunk_grid` property (which constructs a `ChunkGrid`) is removed. The array handles construction for both V2 and V3: + +```python +@property +def chunk_grid(self) -> ChunkGrid: + if isinstance(self.metadata, ArrayV2Metadata): + return ChunkGrid.from_regular(self.metadata.shape, self.metadata.chunks) + return parse_chunk_grid(self.metadata.chunk_grid, self.metadata.shape) +``` + +### Call site migration + +All `self.metadata.chunk_grid` references in `array.py` (~30 sites) change to `self.chunk_grid`: + +```python +# Before +indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + +# After +indexer = BasicIndexer(selection, self.shape, self.chunk_grid) +``` + +Indexers, codecs, and the codec pipeline are unchanged — they still receive `ChunkGrid` as a parameter. Only the *source* changes. + +### What does NOT change + +| Component | Status | +|---|---| +| `ChunkGrid` class | Unchanged — all behavior stays | +| `FixedDimension`, `VaryingDimension` | Unchanged | +| `parse_chunk_grid()`, `serialize_chunk_grid()` | Unchanged — called from array instead of metadata | +| Indexer classes | Unchanged — still receive `ChunkGrid` | +| Codec `validate()` signature | Unchanged — still receives `ChunkGrid` | +| On-disk format | No spec change | + +## Design decisions + +### Why not keep ChunkGrid on metadata with a lazy property? + +A lazy `@cached_property` on metadata would defer the cost but not fix the fundamental issue: metadata would still own behavioral state. Resize logic, codec validation, and `get_chunk_spec` would remain on metadata. The goal is a clean separation — metadata is data, the array is behavior. 
+ +### Why move codec validation to the array? + +Codec validation needs the behavioral `ChunkGrid` (sharding checks divisibility, iterates chunk coords). With a plain dict on metadata, the array is the first place where a `ChunkGrid` exists. Validating there is natural — the array is the runtime boundary where all pieces (shape, dtype, grid, codecs) come together. + +This means `ArrayV3Metadata` can be constructed with an invalid codec/grid combination without error. This is acceptable: metadata is a data transfer object. Validation at the array boundary catches errors at the same point users interact with the data. + +### Why store the raw dict instead of a TypedDict or NamedTuple? + +The dict is exactly what's in `zarr.json`. Storing it verbatim gives: +- Zero-cost round-trips (`to_dict` returns it as-is) +- No `chunk_grid_name` field needed (it's `chunk_grid["name"]`) +- Forward compatibility with unknown chunk grid types (metadata can store and round-trip grids it doesn't understand) + +### How does resize work with a plain dict? + +For regular grids, the chunk grid JSON doesn't change on resize — `{"name": "regular", "configuration": {"chunk_shape": [10, 20]}}` is the same regardless of array shape. The extent is runtime state derived from `shape`. The array rebuilds its `ChunkGrid` with the new shape. + +For rectilinear grids, resize may add or remove chunks. The array resizes the `ChunkGrid`, serializes it back to a dict via `serialize_chunk_grid()`, and creates new metadata with the updated dict. + +## Migration + +### PR 1: Add `chunk_grid` property to `AsyncArray` (non-breaking) + +**Files:** `array.py` +**Scope:** Add `chunk_grid` property that delegates to `self.metadata.chunk_grid`. Migrate all `self.metadata.chunk_grid` references in `array.py` to `self.chunk_grid`. Purely mechanical, no behavioral change. 
+ +### PR 2: Move codec validation to array + +**Files:** `v3.py`, `array.py` +**Scope:** Remove `codec.validate()` calls from `ArrayV3Metadata._validate_metadata()`. Add `_validate_codecs()` to `AsyncArray.__init__`. Move `get_chunk_spec()` from metadata to array. The codec `validate()` signature is unchanged. + +### PR 3: Make metadata chunk_grid a plain dict + +**Files:** `v3.py`, `v2.py`, `array.py` +**Scope:** Replace `chunk_grid: ChunkGrid` with `chunk_grid: dict[str, JSON]` on `ArrayV3Metadata`. Remove `chunk_grid_name` field. Update `__init__`, `from_dict`, `to_dict`, `_validate_metadata`, `chunks`, `shards`. Construct `ChunkGrid` in `AsyncArray.__init__` via `parse_chunk_grid()`. Remove `chunk_grid` property from `ArrayV2Metadata`. Update `update_shape` / resize flow. + +### PR 4: Update tests + +**Files:** `tests/` +**Scope:** Update tests that construct `ArrayV3Metadata` directly and access `.chunk_grid` as a `ChunkGrid`. Tests that go through `Array` / `AsyncArray` should mostly work unchanged. + +## Open questions + +1. **Convenience properties on metadata.** Should `ArrayV3Metadata` expose `chunk_shape` parsed from the raw dict? Or should all chunk access go through the array? Exposing it avoids constructing a full `ChunkGrid` for simple queries, but adds dict-parsing logic to metadata. +2. **Downstream breakage.** Code that accesses `metadata.chunk_grid` as a `ChunkGrid` (e.g., xarray, VirtualiZarr) will break. Migration path: `array.chunk_grid` for behavioral access, `metadata.chunk_grid` for the raw dict. Downstream PRs needed for xarray, VirtualiZarr, Icechunk. +3. **Rectilinear resize serialization.** When a rectilinear array resizes, the array must serialize the updated `ChunkGrid` back to a dict. Should this use `serialize_chunk_grid()` (which applies RLE compression), or should the array manipulate the dict directly? The former is cleaner; the latter avoids a round-trip through `ChunkGrid`. 
From 0f528226a02f103dd16ad5b57db7ef612b8e4880 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 12 Mar 2026 16:45:37 -0400 Subject: [PATCH 043/118] minor simplifications --- src/zarr/core/array.py | 16 +++------------- src/zarr/core/chunk_grids.py | 2 +- tests/test_unified_chunk_grid.py | 12 ++++++------ 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0eeb5901a0..a1371e8d7c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1299,12 +1299,10 @@ def _chunk_grid_shape(self) -> tuple[int, ...]: codecs: tuple[Codec, ...] = getattr(self.metadata, "codecs", ()) if len(codecs) == 1 and isinstance(codecs[0], ShardingCodec): + # When sharding, count inner chunks across the whole array chunk_shape = codecs[0].chunk_shape - elif self.metadata.chunk_grid.is_regular: - chunk_shape = self.metadata.chunk_grid.chunk_shape - else: - return self.metadata.chunk_grid.shape - return tuple(starmap(ceildiv, zip(self.shape, chunk_shape, strict=True))) + return tuple(starmap(ceildiv, zip(self.shape, chunk_shape, strict=True))) + return self.metadata.chunk_grid.shape @property def _shard_grid_shape(self) -> tuple[int, ...]: @@ -5577,14 +5575,6 @@ def _iter_chunk_regions( A tuple of slice objects representing the region spanned by each shard in the selection. 
""" - if array.metadata.chunk_grid.is_regular: - return _iter_regions( - array.shape, - array.chunks, - origin=origin, - selection_shape=selection_shape, - trim_excess=True, - ) return array.metadata.chunk_grid.iter_chunk_regions( origin=origin, selection_shape=selection_shape ) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index b912cacbc7..aeb3e5b325 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -515,7 +515,7 @@ def __getitem__(self, coords: int | tuple[int, ...]) -> ChunkSpec | None: if ix < 0 or ix >= dim.nchunks: return None offset = dim.chunk_offset(ix) - slices.append(slice(offset, offset + dim.data_size(ix))) + slices.append(slice(offset, offset + dim.data_size(ix), 1)) codec_shape.append(dim.chunk_size(ix)) return ChunkSpec(tuple(slices), tuple(codec_shape)) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index e2f5f8a892..9b5a4427b5 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -268,7 +268,7 @@ def test_getitem_slices(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) spec = g[(1, 2)] assert spec is not None - assert spec.slices == (slice(10, 30), slice(50, 75)) + assert spec.slices == (slice(10, 30, 1), slice(50, 75, 1)) def test_all_chunk_coords(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) @@ -1025,7 +1025,7 @@ def test_getitem_int_1d_regular(self) -> None: spec = g[0] assert spec is not None assert spec.shape == (10,) - assert spec.slices == (slice(0, 10),) + assert spec.slices == (slice(0, 10, 1),) # Boundary chunk spec = g[9] assert spec is not None @@ -1918,7 +1918,7 @@ async def test_append_with_partial_edge_chunks(self) -> None: await arr.append(append_data, axis=0) assert arr.shape == (35, 30) - result = await arr.getitem(slice(None)) + result = np.asarray(await arr.getitem(slice(None))) 
np.testing.assert_array_almost_equal(result, np.vstack([initial, append_data])) async def test_append_small_data(self) -> None: @@ -2043,7 +2043,7 @@ def test_chunk_spec_partial_overflow(self) -> None: assert spec.shape == (20,) assert spec.codec_shape == (30,) assert spec.is_boundary is True - assert spec.slices == (slice(30, 50),) + assert spec.slices == (slice(30, 50, 1),) def test_chunk_sizes_with_overflow(self) -> None: """chunk_sizes returns clipped data sizes including zero for past-extent chunks.""" @@ -2265,5 +2265,5 @@ def test_iter_chunk_regions_rectilinear(self) -> None: a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") regions = list(_iter_chunk_regions(a)) assert len(regions) == 2 - assert regions[0] == (slice(0, 10),) - assert regions[1] == (slice(10, 30),) + assert regions[0] == (slice(0, 10, 1),) + assert regions[1] == (slice(10, 30, 1),) From 5823fbb80943765471ef75d4b8911be276a1b22c Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 14:16:48 -0400 Subject: [PATCH 044/118] Gatekeep rectilinear chunks behind feature flag --- docs/user-guide/arrays.md | 13 +++++++++ docs/user-guide/config.md | 1 + src/zarr/api/synchronous.py | 9 +++++- src/zarr/core/array.py | 3 ++ src/zarr/core/chunk_grids.py | 17 +++++++++++ src/zarr/core/config.py | 1 + src/zarr/testing/strategies.py | 5 ++-- tests/test_config.py | 1 + tests/test_unified_chunk_grid.py | 48 ++++++++++++++++++++++++++++++++ 9 files changed, 95 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index 2a8fe08100..7bc965665b 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -605,6 +605,18 @@ Without the `shards` argument, there would be 10,000 chunks stored as individual Rectilinear chunk grids are an experimental feature and may change in future releases. This feature is expected to stabilize in Zarr version 3.3. 
+ Because the feature is still stabilizing, it is disabled by default and + must be explicitly enabled: + + ```python + import zarr + zarr.config.set({"array.rectilinear_chunks": True}) + ``` + + Or via the environment variable `ZARR_ARRAY__RECTILINEAR_CHUNKS=True`. + + The examples below assume this config has been set. + By default, Zarr arrays use a regular chunk grid where every chunk along a given dimension has the same size (except possibly the final boundary chunk). Rectilinear chunk grids allow each chunk along a dimension to have a different @@ -618,6 +630,7 @@ To create an array with rectilinear chunks, pass a nested list to the `chunks` parameter where each inner list gives the chunk sizes along one dimension: ```python exec="true" session="arrays" source="above" result="ansi" +zarr.config.set({"array.rectilinear_chunks": True}) z = zarr.create_array( store=zarr.storage.MemoryStore(), shape=(60, 100), diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index 21fe9b5def..e0cce321be 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -30,6 +30,7 @@ Configuration options include the following: - Default Zarr format `default_zarr_version` - Default array order in memory `array.order` - Whether empty chunks are written to storage `array.write_empty_chunks` +- Enable experimental rectilinear chunk grids `array.rectilinear_chunks` - Async and threading options, e.g. `async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index e0af472169..06c70ceb0b 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -858,9 +858,13 @@ def create_array( data : np.ndarray, optional Array-like data to use for initializing the array. 
If this parameter is provided, the ``shape`` and ``dtype`` parameters must be ``None``. - chunks : tuple[int, ...] | Literal["auto"], default="auto" + chunks : tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"], default="auto" Chunk shape of the array. If chunks is "auto", a chunk shape is guessed based on the shape of the array and the dtype. + A nested list of per-dimension edge sizes creates a rectilinear grid. + Rectilinear chunk grids are experimental and must be explicitly enabled + with ``zarr.config.set({'array.rectilinear_chunks': True})`` while the + feature is stabilizing. shards : tuple[int, ...], optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec] | Literal["auto"], optional @@ -1033,6 +1037,9 @@ def from_array( - "keep": Retain the chunk grid of the data array if it is a zarr Array. - tuple[int, ...]: A tuple of integers representing the chunk shape (regular grid). - Sequence[Sequence[int]]: Per-dimension chunk edge lists (rectilinear grid). + Rectilinear chunk grids are experimental and must be explicitly enabled + with ``zarr.config.set({'array.rectilinear_chunks': True})`` while the + feature is stabilizing. If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". shards : tuple[int, ...], optional diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a1371e8d7c..d78668068d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4364,6 +4364,9 @@ async def from_array( - "keep": Retain the chunk grid of the data array if it is a zarr Array. - tuple[int, ...]: A tuple of integers representing the chunk shape (regular grid). - Sequence[Sequence[int]]: Per-dimension chunk edge lists (rectilinear grid). + Rectilinear chunk grids are experimental and must be explicitly enabled + with ``zarr.config.set({'array.rectilinear_chunks': True})`` while the + feature is stabilizing. 
If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". shards : tuple[int, ...], optional diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index aeb3e5b325..e0ee5e9cc4 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -431,7 +431,20 @@ def from_rectilinear( chunk along each dimension may extend past the array boundary (the edge is the codec buffer size; ``data_size`` clips to the extent). + + Raises + ------ + ValueError + If the ``array.rectilinear_chunks`` config option is not enabled. """ + from zarr.core.config import config + + if not config.get("array.rectilinear_chunks"): + raise ValueError( + "Rectilinear chunk grids are experimental and disabled by default. " + "Enable them with: zarr.config.set({'array.rectilinear_chunks': True}) " + "or set the environment variable ZARR_ARRAY__RECTILINEAR_CHUNKS=True" + ) extents = parse_shapelike(array_shape) if len(extents) != len(chunk_shapes): raise ValueError( @@ -623,6 +636,10 @@ def parse_chunk_grid( This is the primary entry point for constructing a ChunkGrid from serialized metadata. It always produces a grid with correct extent values. + + Both ``"regular"`` and ``"rectilinear"`` grid names are supported. Rectilinear + grids are experimental and require the ``array.rectilinear_chunks`` config + option to be enabled; a ``ValueError`` is raised otherwise. 
""" if isinstance(data, ChunkGrid): # Re-bind extent if array_shape differs from what's stored diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index f8f8ea4f5f..fceb3657b2 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -97,6 +97,7 @@ def enable_gpu(self) -> ConfigSet: "order": "C", "write_empty_chunks": False, "target_shard_size_bytes": None, + "rectilinear_chunks": False, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index a16bd0b1b6..ea2736b3c9 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -371,8 +371,9 @@ def rectilinear_arrays( nparray = np.arange(int(np.prod(shape)), dtype="int32").reshape(shape) store = MemoryStore() - a = zarr.create_array(store=store, shape=shape, chunks=chunk_shapes, dtype="int32") - a[:] = nparray + with zarr.config.set({"array.rectilinear_chunks": True}): + a = zarr.create_array(store=store, shape=shape, chunks=chunk_shapes, dtype="int32") + a[:] = nparray return a diff --git a/tests/test_config.py b/tests/test_config.py index c3102e8efe..2704505bc8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -54,6 +54,7 @@ def test_config_defaults_set() -> None: "order": "C", "write_empty_chunks": False, "target_shard_size_bytes": None, + "rectilinear_chunks": False, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 9b5a4427b5..109cebd0cf 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -28,9 +28,17 @@ from zarr.storage import MemoryStore if TYPE_CHECKING: + from collections.abc import Generator from pathlib import Path +@pytest.fixture(autouse=True) +def _enable_rectilinear_chunks() -> Generator[None, None, None]: + """Enable rectilinear chunks for all tests in this module.""" + with 
zarr.config.set({"array.rectilinear_chunks": True}): + yield + + def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: """Extract the per-chunk edge lengths for *dim* from a ChunkGrid.""" d = grid.dimensions[dim] @@ -41,6 +49,46 @@ def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: raise TypeError(f"Unexpected dimension type: {type(d)}") +# --------------------------------------------------------------------------- +# Feature flag gating +# --------------------------------------------------------------------------- + + +class TestRectilinearFeatureFlag: + """Test that rectilinear chunks are gated behind the config flag.""" + + def test_disabled_by_default(self) -> None: + with zarr.config.set({"array.rectilinear_chunks": False}): + with pytest.raises(ValueError, match="experimental and disabled by default"): + ChunkGrid.from_rectilinear([[10, 20], [25, 25]], array_shape=(30, 50)) + + def test_enabled_via_config(self) -> None: + with zarr.config.set({"array.rectilinear_chunks": True}): + g = ChunkGrid.from_rectilinear([[10, 20], [25, 25]], array_shape=(30, 50)) + assert g.ndim == 2 + + def test_create_array_blocked(self) -> None: + with zarr.config.set({"array.rectilinear_chunks": False}): + store = MemoryStore() + with pytest.raises(ValueError, match="experimental and disabled by default"): + zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") + + def test_parse_chunk_grid_blocked(self) -> None: + """Opening a rectilinear array from metadata is also gated.""" + with zarr.config.set({"array.rectilinear_chunks": False}): + with pytest.raises(ValueError, match="experimental and disabled by default"): + parse_chunk_grid( + { + "name": "rectilinear", + "configuration": { + "kind": "inline", + "chunk_shapes": [[10, 20, 30], [50, 50]], + }, + }, + array_shape=(60, 100), + ) + + # --------------------------------------------------------------------------- # FixedDimension # 
--------------------------------------------------------------------------- From 27f28e7d5f54b0a0d57cddb9abff722cb28fdae1 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 14:31:13 -0400 Subject: [PATCH 045/118] Fix off-by-one bug --- src/zarr/core/chunk_grids.py | 2 ++ tests/test_unified_chunk_grid.py | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index e0ee5e9cc4..4c28b73bfe 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -120,6 +120,8 @@ def nchunks(self) -> int: return len(self.edges) def index_to_chunk(self, idx: int) -> int: + if idx < 0 or idx >= self.extent: + raise IndexError(f"Index {idx} out of bounds for dimension with extent {self.extent}") return bisect.bisect_right(self.cumulative, idx) def chunk_offset(self, chunk_ix: int) -> int: diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 109cebd0cf..dab7abcefd 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -49,6 +49,28 @@ def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: raise TypeError(f"Unexpected dimension type: {type(d)}") +# --------------------------------------------------------------------------- +# Index to chunk +# --------------------------------------------------------------------------- + + +class TestVaryingDimensionIndexToChunkBounds: + def test_index_at_extent_raises(self) -> None: + """index_to_chunk(extent) should raise since extent is out of bounds.""" + dim = VaryingDimension([10, 20, 30], extent=60) + with pytest.raises(IndexError, match="out of bounds"): + dim.index_to_chunk(60) + + def test_index_past_extent_raises(self) -> None: + dim = VaryingDimension([10, 20, 30], extent=60) + with pytest.raises(IndexError, match="out of bounds"): + dim.index_to_chunk(100) + + def test_last_valid_index_works(self) -> None: + dim = 
VaryingDimension([10, 20, 30], extent=60) + assert dim.index_to_chunk(59) == 2 + + # --------------------------------------------------------------------------- # Feature flag gating # --------------------------------------------------------------------------- From a35cf5624cafff0c43440f1398c2db5e1b9bccc1 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 16:05:57 -0400 Subject: [PATCH 046/118] Fix chunk indexing boundary checks --- src/zarr/core/chunk_grids.py | 4 ++++ tests/test_unified_chunk_grid.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 4c28b73bfe..ce95b75fa3 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -57,6 +57,10 @@ def nchunks(self) -> int: return ceildiv(self.extent, self.size) def index_to_chunk(self, idx: int) -> int: + if idx < 0: + raise IndexError(f"Negative index {idx} is not allowed") + if idx >= self.extent: + raise IndexError(f"Index {idx} is out of bounds for extent {self.extent}") if self.size == 0: return 0 return idx // self.size diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index dab7abcefd..66b4a027f3 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -71,6 +71,23 @@ def test_last_valid_index_works(self) -> None: assert dim.index_to_chunk(59) == 2 +class TestFixedDimensionIndexToChunkBounds: + def test_negative_index_raises(self) -> None: + """index_to_chunk(-1) should raise, not silently return -1.""" + dim = FixedDimension(size=10, extent=95) + with pytest.raises(IndexError, match="Negative"): + dim.index_to_chunk(-1) + + def test_index_at_extent_raises(self) -> None: + dim = FixedDimension(size=10, extent=95) + with pytest.raises(IndexError, match="out of bounds"): + dim.index_to_chunk(95) + + def test_last_valid_index_works(self) -> None: + dim = FixedDimension(size=10, 
extent=95) + assert dim.index_to_chunk(94) == 9 + + # --------------------------------------------------------------------------- # Feature flag gating # --------------------------------------------------------------------------- From cbb28fe5d4959eefbe78448294ed8f46b2077769 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 16:07:55 -0400 Subject: [PATCH 047/118] Standardize docstrings --- src/zarr/core/array.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index d78668068d..33ba0f7999 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4889,9 +4889,13 @@ async def create_array( data : np.ndarray, optional Array-like data to use for initializing the array. If this parameter is provided, the ``shape`` and ``dtype`` parameters must be ``None``. - chunks : tuple[int, ...] | Literal["auto"], default="auto" + chunks : tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"], default="auto" Chunk shape of the array. If chunks is "auto", a chunk shape is guessed based on the shape of the array and the dtype. + A nested list of per-dimension edge sizes creates a rectilinear grid. + Rectilinear chunk grids are experimental and must be explicitly enabled + with ``zarr.config.set({'array.rectilinear_chunks': True})`` while the + feature is stabilizing. shards : tuple[int, ...], optional Shard shape of the array. The default value of ``None`` results in no sharding at all. 
filters : Iterable[Codec] | Literal["auto"], optional From 280eb685a79240c247ecf42a9cbf35c5287249f4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 16:53:19 -0400 Subject: [PATCH 048/118] fix spec compliance --- src/zarr/core/chunk_grids.py | 31 ++++++++++---------- tests/test_unified_chunk_grid.py | 49 +++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ce95b75fa3..538a14fbd1 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -272,19 +272,16 @@ def _compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: def _serialize_fixed_dim(dim: FixedDimension) -> RectilinearDimSpec: - """Compact rectilinear representation for a fixed-size dimension.""" - n = dim.nchunks - if n == 0: + """Compact rectilinear representation for a fixed-size dimension. + + Per the rectilinear spec, a bare integer is repeated until the sum + >= extent. This preserves the full codec buffer size for boundary + chunks, matching the regular grid spec ("chunks at the border always + have the full chunk size"). + """ + if dim.nchunks == 0: return [] - last_data = dim.extent - (n - 1) * dim.size - if last_data == dim.size: - return dim.size - elif n == 1: - return [last_data] - elif n == 2: - return [dim.size, last_data] - else: - return [[dim.size, n - 1], last_data] + return dim.size def _serialize_varying_dim(dim: VaryingDimension) -> RectilinearDimSpec: @@ -463,10 +460,12 @@ def from_rectilinear( if not edges_list: raise ValueError("Each dimension must have at least one chunk") edge_sum = sum(edges_list) - # Only collapse to FixedDimension when edges are uniform AND - # extent equals edge_sum. When extent < edge_sum the explicit - # edge count matters (overflow chunks), so use VaryingDimension. 
- if all(e == edges_list[0] for e in edges_list) and extent == edge_sum: + # Collapse to FixedDimension when edges are uniform AND either + # extent == edge_sum (exact fit) or the number of edges matches + # ceildiv(extent, edge) (regular grid with boundary overflow). + if all(e == edges_list[0] for e in edges_list) and ( + extent == edge_sum or len(edges_list) == ceildiv(extent, edges_list[0]) + ): dims.append(FixedDimension(size=edges_list[0], extent=extent)) else: dims.append(VaryingDimension(edges_list, extent=extent)) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 66b4a027f3..87ebef5325 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -636,6 +636,33 @@ def test_varying_dimension_extent_mismatch_on_chunkgrid_input(self) -> None: parse_chunk_grid(g, (100, 50)) +class TestRectilinearRoundTripPreservesCodecShape: + def test_boundary_chunk_codec_size_preserved(self) -> None: + """Round-tripping through rectilinear should not change codec buffer sizes.""" + grid = ChunkGrid.from_regular((95,), (10,)) + original_codec_size = grid.dimensions[0].chunk_size(9) + assert original_codec_size == 10 + + serialized = serialize_chunk_grid(grid, "rectilinear") + parsed = parse_chunk_grid(serialized, (95,)) + + roundtripped_codec_size = parsed.dimensions[0].chunk_size(9) + assert roundtripped_codec_size == original_codec_size, ( + f"codec buffer changed from {original_codec_size} to " + f"{roundtripped_codec_size} after round-trip" + ) + + def test_single_chunk_boundary_codec_size_preserved(self) -> None: + """shape=7, chunk_size=10: single chunk's codec buffer should stay 10.""" + grid = ChunkGrid.from_regular((7,), (10,)) + assert grid.dimensions[0].chunk_size(0) == 10 + + serialized = serialize_chunk_grid(grid, "rectilinear") + parsed = parse_chunk_grid(serialized, (7,)) + + assert parsed.dimensions[0].chunk_size(0) == 10 + + # 
--------------------------------------------------------------------------- # Indexing with rectilinear grids # --------------------------------------------------------------------------- @@ -983,20 +1010,14 @@ def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" - # Second dim should serialize as edges that sum to 95 + # Second dim serializes as bare integer (per rectilinear spec, + # a bare integer repeats until sum >= extent, preserving full + # codec buffer size for boundary chunks). config = d["configuration"] assert isinstance(config, dict) chunk_shapes = config["chunk_shapes"] assert isinstance(chunk_shapes, list) - # Last edge should be 5, not 10 - dim1_shapes = chunk_shapes[1] - # Expand RLE to check - if isinstance(dim1_shapes[0], list): - expanded = _expand_rle(dim1_shapes) - else: - expanded = dim1_shapes - assert sum(expanded) == 95 # extent preserved - assert expanded[-1] == 5 # boundary chunk + assert chunk_shapes[1] == 10 # bare integer shorthand g2 = parse_chunk_grid(d, (60, 95)) assert g2.shape == g.shape @@ -2148,11 +2169,11 @@ def test_multidim_overflow(self) -> None: assert spec.shape == (15, 20) assert spec.codec_shape == (30, 40) - def test_uniform_edges_with_overflow_stays_varying(self) -> None: - """Uniform edges with extent < sum(edges) must stay VaryingDimension.""" + def test_uniform_edges_with_overflow_collapses_to_fixed(self) -> None: + """Uniform edges where len == ceildiv(extent, edge) collapse to FixedDimension.""" g = ChunkGrid.from_rectilinear([[10, 10, 10, 10]], array_shape=(35,)) - assert isinstance(g.dimensions[0], VaryingDimension) - assert not g.is_regular # can't collapse to FixedDimension + assert isinstance(g.dimensions[0], FixedDimension) + assert g.is_regular assert g.chunk_sizes == ((10, 10, 10, 5),) assert g.dimensions[0].nchunks == 4 From e88c06bd48341d942813edb613c530469132c722 Mon Sep 17 00:00:00 2001 From: Max Jones 
<14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 16:57:13 -0400 Subject: [PATCH 049/118] Handle integer floats --- src/zarr/core/chunk_grids.py | 8 ++++---- tests/test_unified_chunk_grid.py | 11 +++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 538a14fbd1..a171cedc5a 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -232,10 +232,10 @@ def _expand_rle(data: Sequence[list[int] | int]) -> list[int]: """ result: list[int] = [] for item in data: - if isinstance(item, int): - result.append(item) - elif len(item) == 2: - size, count = item + if isinstance(item, (int, float)) and not isinstance(item, bool): + result.append(int(item)) + elif isinstance(item, list) and len(item) == 2: + size, count = int(item[0]), int(item[1]) result.extend([size] * count) else: raise ValueError(f"RLE entries must be an integer or [size, count], got {item}") diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 87ebef5325..ffe0b6fb30 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -397,6 +397,17 @@ def test_roundtrip(self) -> None: assert _expand_rle(compressed) == original +class TestExpandRleHandlesJsonFloats: + def test_bare_integer_floats_accepted(self) -> None: + """JSON parsers may emit 10.0 for the integer 10; _expand_rle should handle it.""" + result = _expand_rle([10.0, 20.0]) # type: ignore[list-item] + assert result == [10, 20] + + def test_rle_pair_with_float_count(self) -> None: + result = _expand_rle([[10, 3.0]]) # type: ignore[list-item] + assert result == [10, 10, 10] + + # --------------------------------------------------------------------------- # Serialization # --------------------------------------------------------------------------- From 58bd33680fde890aa2c2436fdfced545c4e707c9 Mon Sep 17 00:00:00 2001 From: Max Jones 
<14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:02:58 -0400 Subject: [PATCH 050/118] More spec compliance --- src/zarr/core/chunk_grids.py | 7 +++++-- tests/test_unified_chunk_grid.py | 14 ++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index a171cedc5a..f9c0799d04 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -279,8 +279,6 @@ def _serialize_fixed_dim(dim: FixedDimension) -> RectilinearDimSpec: chunks, matching the regular grid spec ("chunks at the border always have the full chunk size"). """ - if dim.nchunks == 0: - return [] return dim.size @@ -708,6 +706,11 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: } if name == "rectilinear": + if any(d.extent == 0 for d in grid.dimensions): + raise ValueError( + "Cannot serialize a zero-extent grid as 'rectilinear': " + "the spec requires all edge lengths to be positive integers." 
+ ) chunk_shapes: list[RectilinearDimSpec] = [] for dim in grid.dimensions: if isinstance(dim, FixedDimension): diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index ffe0b6fb30..45ec799d73 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -487,6 +487,12 @@ def test_serialize_unknown_name_raises(self) -> None: with pytest.raises(ValueError, match="Unknown chunk grid name for serialization"): serialize_chunk_grid(g, "hexagonal") + def test_zero_extent_rectilinear_raises(self) -> None: + """Zero-extent grids cannot be serialized as rectilinear (spec requires positive edges).""" + grid = ChunkGrid.from_regular((0,), (10,)) + with pytest.raises(ValueError, match="zero-extent"): + serialize_chunk_grid(grid, "rectilinear") + class TestSpecCompliance: """Tests for compliance with the rectilinear chunk grid extension spec @@ -1209,8 +1215,8 @@ def test_chunk_spec_multidim_boundary(self) -> None: # -- Rectilinear with zero-nchunks FixedDimension in serialize_chunk_grid -- - def test_zero_nchunks_fixed_dim_in_rectilinear_serialize(self) -> None: - """A rectilinear grid with a 0-nchunks FixedDimension serializes.""" + def test_zero_nchunks_fixed_dim_in_rectilinear_serialize_raises(self) -> None: + """A rectilinear grid with a 0-extent dimension cannot be serialized.""" g = ChunkGrid( dimensions=( VaryingDimension([10, 20], extent=30), @@ -1218,8 +1224,8 @@ def test_zero_nchunks_fixed_dim_in_rectilinear_serialize(self) -> None: ) ) assert g.shape == (2, 0) - d = serialize_chunk_grid(g, "rectilinear") - assert d["name"] == "rectilinear" + with pytest.raises(ValueError, match="zero-extent"): + serialize_chunk_grid(g, "rectilinear") # -- VaryingDimension data_size -- From e0fbab4c90d8eb9e0591a204fa7cded2b854f955 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:05:46 -0400 Subject: [PATCH 051/118] Fix block indexing error --- 
src/zarr/core/indexing.py | 5 +++++ tests/test_unified_chunk_grid.py | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 8ea8b06751..57ccdf13b2 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1077,6 +1077,11 @@ def __init__( if dim_sel < 0: dim_sel = dim_numchunks + dim_sel + if dim_sel < 0 or dim_sel >= dim_numchunks: + raise BoundsCheckError( + f"block index out of bounds for dimension with {dim_numchunks} chunk(s)" + ) + start = dim_grid.chunk_offset(dim_sel) stop = start + dim_grid.chunk_size(dim_sel) slice_ = slice(start, stop) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 45ec799d73..f03a49c354 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -25,6 +25,7 @@ parse_chunk_grid, serialize_chunk_grid, ) +from zarr.errors import BoundsCheckError from zarr.storage import MemoryStore if TYPE_CHECKING: @@ -747,6 +748,13 @@ def test_orthogonal_indexer_rectilinear(self) -> None: projections = list(indexer) assert len(projections) == 6 + def test_oob_block_raises_bounds_check_error(self) -> None: + """Out-of-bounds block index should raise BoundsCheckError, not IndexError.""" + store = MemoryStore() + a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") + with pytest.raises(BoundsCheckError): + a.get_block_selection((2,)) + # --------------------------------------------------------------------------- # End-to-end: array creation with rectilinear chunks From 52777394f194b21a4bff636619f4d450cd0ad8ae Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:14:22 -0400 Subject: [PATCH 052/118] Add V2 regression tests --- tests/test_unified_chunk_grid.py | 281 ++++++++++++++++++++++++++++++- 1 file changed, 279 insertions(+), 2 deletions(-) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 
f03a49c354..998f0c264c 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -21,7 +21,10 @@ FixedDimension, VaryingDimension, _compress_rle, + _decode_dim_spec, _expand_rle, + _infer_chunk_grid_name, + _is_rectilinear_chunks, parse_chunk_grid, serialize_chunk_grid, ) @@ -365,6 +368,40 @@ def test_all_chunk_coords(self) -> None: assert coords[0] == (0, 0) assert coords[-1] == (2, 1) + def test_all_chunk_coords_with_origin(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) + coords = list(g.all_chunk_coords(origin=(1, 0))) + assert len(coords) == 4 # 2 remaining in dim0 * 2 in dim1 + assert coords[0] == (1, 0) + assert coords[-1] == (2, 1) + + def test_all_chunk_coords_with_selection_shape(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) + coords = list(g.all_chunk_coords(selection_shape=(2, 1))) + assert len(coords) == 2 + assert coords == [(0, 0), (1, 0)] + + def test_all_chunk_coords_with_origin_and_selection_shape(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) + coords = list(g.all_chunk_coords(origin=(1, 1), selection_shape=(2, 1))) + assert coords == [(1, 1), (2, 1)] + + def test_all_chunk_coords_origin_at_last_chunk(self) -> None: + g = ChunkGrid.from_regular((30, 40), (10, 20)) + coords = list(g.all_chunk_coords(origin=(2, 1))) + assert coords == [(2, 1)] + + def test_all_chunk_coords_selection_shape_zero(self) -> None: + g = ChunkGrid.from_regular((30, 40), (10, 20)) + coords = list(g.all_chunk_coords(selection_shape=(0, 0))) + assert coords == [] + + def test_all_chunk_coords_single_dim_slice(self) -> None: + """Origin shifts one dim, selection_shape restricts the other.""" + g = ChunkGrid.from_regular((60, 80), (20, 20)) # 3x4 + coords = list(g.all_chunk_coords(origin=(0, 2), selection_shape=(3, 1))) + assert coords == [(0, 2), (1, 2), (2, 2)] + def test_get_nchunks(self) -> None: 
g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) assert g.get_nchunks() == 6 @@ -401,14 +438,143 @@ def test_roundtrip(self) -> None: class TestExpandRleHandlesJsonFloats: def test_bare_integer_floats_accepted(self) -> None: """JSON parsers may emit 10.0 for the integer 10; _expand_rle should handle it.""" - result = _expand_rle([10.0, 20.0]) # type: ignore[list-item] + result = _expand_rle([10.0, 20.0]) assert result == [10, 20] def test_rle_pair_with_float_count(self) -> None: - result = _expand_rle([[10, 3.0]]) # type: ignore[list-item] + result = _expand_rle([[10, 3.0]]) assert result == [10, 10, 10] +# --------------------------------------------------------------------------- +# _decode_dim_spec edge cases +# --------------------------------------------------------------------------- + + +class TestDecodeDimSpec: + """Edge cases for _decode_dim_spec: floats, empty lists, negatives, missing extent.""" + + def test_bare_integer(self) -> None: + assert _decode_dim_spec(10, array_extent=25) == [10, 10, 10] + + def test_bare_integer_exact_fit(self) -> None: + assert _decode_dim_spec(5, array_extent=10) == [5, 5] + + def test_bare_integer_no_extent_raises(self) -> None: + with pytest.raises(ValueError, match="requires array shape"): + _decode_dim_spec(10, array_extent=None) + + def test_bare_integer_zero_raises(self) -> None: + with pytest.raises(ValueError, match="must be > 0"): + _decode_dim_spec(0, array_extent=10) + + def test_bare_integer_negative_raises(self) -> None: + with pytest.raises(ValueError, match="must be > 0"): + _decode_dim_spec(-5, array_extent=10) + + def test_bare_float_raises(self) -> None: + """A bare float (not in a list) is not int or list — should raise.""" + with pytest.raises(ValueError, match="Invalid chunk_shapes entry"): + _decode_dim_spec(10.0, array_extent=10) + + def test_explicit_integer_list(self) -> None: + assert _decode_dim_spec([10, 20, 30]) == [10, 20, 30] + + def test_empty_list(self) -> None: 
+ """An empty list has no sub-lists, so it returns an empty explicit list.""" + assert _decode_dim_spec([]) == [] + + def test_list_with_rle(self) -> None: + assert _decode_dim_spec([[5, 3], 10]) == [5, 5, 5, 10] + + def test_string_raises(self) -> None: + with pytest.raises(ValueError, match="Invalid chunk_shapes entry"): + _decode_dim_spec("auto", array_extent=10) + + def test_none_raises(self) -> None: + with pytest.raises(ValueError, match="Invalid chunk_shapes entry"): + _decode_dim_spec(None, array_extent=10) + + +# --------------------------------------------------------------------------- +# _is_rectilinear_chunks edge cases +# --------------------------------------------------------------------------- + + +class TestIsRectilinearChunks: + """Edge cases for _is_rectilinear_chunks.""" + + def test_nested_lists(self) -> None: + assert _is_rectilinear_chunks([[10, 20], [5, 5]]) is True + + def test_nested_tuples(self) -> None: + assert _is_rectilinear_chunks(((10, 20), (5, 5))) is True + + def test_flat_tuple(self) -> None: + assert _is_rectilinear_chunks((10, 20)) is False + + def test_flat_list(self) -> None: + assert _is_rectilinear_chunks([10, 20]) is False + + def test_single_int(self) -> None: + assert _is_rectilinear_chunks(10) is False + + def test_string(self) -> None: + assert _is_rectilinear_chunks("auto") is False + + def test_empty_list(self) -> None: + assert _is_rectilinear_chunks([]) is False + + def test_empty_nested_list(self) -> None: + """First element is an empty list — it's iterable and not str/int.""" + assert _is_rectilinear_chunks([[]]) is True + + def test_chunk_grid_instance(self) -> None: + g = ChunkGrid.from_regular((10,), (5,)) + assert _is_rectilinear_chunks(g) is False + + def test_none(self) -> None: + assert _is_rectilinear_chunks(None) is False + + def test_float(self) -> None: + assert _is_rectilinear_chunks(3.14) is False + + +# --------------------------------------------------------------------------- +# 
_infer_chunk_grid_name edge cases +# --------------------------------------------------------------------------- + + +class TestInferChunkGridName: + """Edge cases for _infer_chunk_grid_name.""" + + def test_regular_grid(self) -> None: + g = ChunkGrid.from_regular((100,), (10,)) + assert _infer_chunk_grid_name(g, g) == "regular" + + @pytest.fixture(autouse=True) + def _enable_rectilinear(self) -> Any: + with zarr.config.set({"array.rectilinear_chunks": True}): + yield + + def test_rectilinear_grid(self) -> None: + g = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) + assert _infer_chunk_grid_name(g, g) == "rectilinear" + + def test_dict_with_regular_name(self) -> None: + g = ChunkGrid.from_regular((100,), (10,)) + d: dict[str, Any] = {"name": "regular", "configuration": {"chunk_shape": [10]}} + assert _infer_chunk_grid_name(d, g) == "regular" + + def test_dict_with_rectilinear_name(self) -> None: + g = ChunkGrid.from_regular((100,), (10,)) + d: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [10]}, + } + assert _infer_chunk_grid_name(d, g) == "rectilinear" + + # --------------------------------------------------------------------------- # Serialization # --------------------------------------------------------------------------- @@ -1741,6 +1907,117 @@ def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: ) +# --------------------------------------------------------------------------- +# V2 regression tests +# --------------------------------------------------------------------------- + + +class TestV2Regression: + """Verify V2 arrays still work correctly after the ChunkGrid refactor. + + V2 only supports regular chunks. These tests ensure the V2 metadata + round-trip (create → write → read) and chunk_grid property work as + expected with the unified ChunkGrid infrastructure. 
+ """ + + def test_v2_create_and_readback(self, tmp_path: Path) -> None: + """Basic V2 array: create, write, read back.""" + data = np.arange(60, dtype="float64").reshape(6, 10) + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=data.shape, + chunks=(3, 5), + dtype=data.dtype, + zarr_format=2, + ) + a[:] = data + np.testing.assert_array_equal(a[:], data) + + def test_v2_chunk_grid_is_regular(self, tmp_path: Path) -> None: + """V2 metadata.chunk_grid produces a regular ChunkGrid with FixedDimensions.""" + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(20, 30), + chunks=(10, 15), + dtype="int32", + zarr_format=2, + ) + grid = a.metadata.chunk_grid + assert grid.is_regular + assert grid.chunk_shape == (10, 15) + assert grid.shape == (2, 2) + assert all(isinstance(d, FixedDimension) for d in grid.dimensions) + + def test_v2_boundary_chunks(self, tmp_path: Path) -> None: + """V2 boundary chunks: codec buffer size stays full, data is clipped.""" + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(25,), + chunks=(10,), + dtype="int32", + zarr_format=2, + ) + grid = a.metadata.chunk_grid + assert grid.dimensions[0].nchunks == 3 + assert grid.dimensions[0].chunk_size(2) == 10 # full codec buffer + assert grid.dimensions[0].data_size(2) == 5 # clipped to extent + + def test_v2_slicing_with_boundary(self, tmp_path: Path) -> None: + """V2 array slicing across boundary chunks returns correct data.""" + data = np.arange(25, dtype="int32") + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(25,), + chunks=(10,), + dtype="int32", + zarr_format=2, + ) + a[:] = data + np.testing.assert_array_equal(a[18:25], data[18:25]) + np.testing.assert_array_equal(a[:], data) + + def test_v2_metadata_roundtrip(self, tmp_path: Path) -> None: + """V2 metadata survives store close and reopen.""" + store_path = tmp_path / "v2.zarr" + data = np.arange(12, dtype="float32").reshape(3, 4) + a = zarr.create_array( + store=store_path, + shape=data.shape, 
+ chunks=(2, 2), + dtype=data.dtype, + zarr_format=2, + ) + a[:] = data + + # Reopen from store + b = zarr.open_array(store=store_path, mode="r") + assert b.metadata.zarr_format == 2 + assert b.chunks == (2, 2) + assert b.metadata.chunk_grid.chunk_shape == (2, 2) + np.testing.assert_array_equal(b[:], data) + + def test_v2_chunk_spec_via_grid(self, tmp_path: Path) -> None: + """ChunkSpec from V2 grid has correct slices and codec_shape.""" + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(15, 20), + chunks=(10, 10), + dtype="int32", + zarr_format=2, + ) + grid = a.metadata.chunk_grid + # Interior chunk + spec = grid[(0, 0)] + assert spec is not None + assert spec.shape == (10, 10) + assert spec.codec_shape == (10, 10) + # Boundary chunk + spec = grid[(1, 1)] + assert spec is not None + assert spec.shape == (5, 10) # clipped data + assert spec.codec_shape == (10, 10) # full buffer + + # --------------------------------------------------------------------------- # .chunk_sizes property # --------------------------------------------------------------------------- From c9858c06a92aad0c9253b6c5dad4cc294206e9aa Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:17:44 -0400 Subject: [PATCH 053/118] Add comments --- src/zarr/core/chunk_grids.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index f9c0799d04..c1d83e90c3 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -700,12 +700,19 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: raise ValueError( "Cannot serialize a non-regular chunk grid as 'regular'. Use 'rectilinear' instead." ) + # The regular grid spec encodes only chunk_shape, not per-axis edges, + # so zero-extent dimensions are valid (they simply produce zero chunks). 
return { "name": "regular", "configuration": {"chunk_shape": tuple(grid.chunk_shape)}, } if name == "rectilinear": + # Zero-extent dimensions cannot be represented as rectilinear because + # the spec requires at least one positive-integer edge length per axis. + # This is intentionally asymmetric with the regular grid, which encodes + # only chunk_shape (no per-axis edges) and thus handles zero-extent + # arrays without issue. if any(d.extent == 0 for d in grid.dimensions): raise ValueError( "Cannot serialize a zero-extent grid as 'rectilinear': " From 9e4fa30b11df82ad6855fdedc1630f0d4400fb55 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:28:05 -0400 Subject: [PATCH 054/118] Consistent bounds checking between dimension types --- src/zarr/core/chunk_grids.py | 9 ++++++ tests/test_unified_chunk_grid.py | 50 ++++++++++++++++++++++++++------ 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index c1d83e90c3..12183e2eec 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -65,15 +65,24 @@ def index_to_chunk(self, idx: int) -> int: return 0 return idx // self.size + def _check_chunk_ix(self, chunk_ix: int) -> None: + if chunk_ix < 0 or chunk_ix >= self.nchunks: + raise IndexError( + f"Chunk index {chunk_ix} out of bounds for dimension with {self.nchunks} chunks" + ) + def chunk_offset(self, chunk_ix: int) -> int: + self._check_chunk_ix(chunk_ix) return chunk_ix * self.size def chunk_size(self, chunk_ix: int) -> int: """Buffer size for codec processing — always uniform.""" + self._check_chunk_ix(chunk_ix) return self.size def data_size(self, chunk_ix: int) -> int: """Valid data region within the buffer — clipped at extent.""" + self._check_chunk_ix(chunk_ix) if self.size == 0: return 0 return max(0, min(self.size, self.extent - chunk_ix * self.size)) diff --git a/tests/test_unified_chunk_grid.py 
b/tests/test_unified_chunk_grid.py index 998f0c264c..7a656b9b95 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -182,6 +182,27 @@ def test_zero_size_allowed(self) -> None: assert d.size == 0 assert d.nchunks == 1 # 0-size with 0-extent = 1 chunk + def test_chunk_offset_oob_raises(self) -> None: + d = FixedDimension(size=10, extent=100) + with pytest.raises(IndexError, match="out of bounds"): + d.chunk_offset(10) + with pytest.raises(IndexError, match="out of bounds"): + d.chunk_offset(-1) + + def test_chunk_size_oob_raises(self) -> None: + d = FixedDimension(size=10, extent=100) + with pytest.raises(IndexError, match="out of bounds"): + d.chunk_size(10) + with pytest.raises(IndexError, match="out of bounds"): + d.chunk_size(-1) + + def test_data_size_oob_raises(self) -> None: + d = FixedDimension(size=10, extent=100) + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(10) + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(-1) + # --------------------------------------------------------------------------- # VaryingDimension @@ -1151,16 +1172,22 @@ def test_fixed_dim_boundary_data_size(self) -> None: assert d.chunk_size(9) == 10 # codec buffer always full def test_fixed_dim_data_size_out_of_bounds(self) -> None: - """data_size clamps to 0 for out-of-bounds chunk indices.""" + """data_size raises IndexError for out-of-bounds chunk indices.""" d = FixedDimension(size=10, extent=100) - assert d.data_size(10) == 0 # exactly at boundary - assert d.data_size(11) == 0 # past boundary - assert d.data_size(999) == 0 + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(10) # exactly at boundary + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(11) # past boundary + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(999) + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(-1) def test_fixed_dim_data_size_boundary_oob(self) -> 
None: - """data_size for boundary grid, past last chunk.""" + """data_size raises IndexError past last chunk.""" d = FixedDimension(size=10, extent=95) - assert d.data_size(10) == 0 # past nchunks=10 + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(10) # past nchunks=10 def test_chunk_grid_boundary_getitem(self) -> None: """ChunkGrid with boundary FixedDimension via direct construction.""" @@ -1252,14 +1279,19 @@ def test_zero_size_nonzero_extent(self) -> None: """FixedDimension(size=0, extent=5) => 0 chunks (can't partition).""" d = FixedDimension(size=0, extent=5) assert d.nchunks == 0 - assert d.data_size(0) == 0 - assert d.chunk_size(0) == 0 + # No valid chunk index exists on a 0-chunk dimension + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(0) + with pytest.raises(IndexError, match="out of bounds"): + d.chunk_size(0) def test_zero_extent_nonzero_size(self) -> None: """FixedDimension(size=10, extent=0) => 0 chunks.""" d = FixedDimension(size=10, extent=0) assert d.nchunks == 0 - assert d.data_size(0) == 0 + # No valid chunk index exists on a 0-chunk dimension + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(0) # -- 0-d grid -- From a21d5879d29ba43b68e2fea7dd6d395cd9236f33 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:29:40 -0400 Subject: [PATCH 055/118] use pre-computed extent --- src/zarr/core/chunk_grids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 12183e2eec..30104688de 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -154,7 +154,7 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int def with_extent(self, new_extent: int) -> VaryingDimension: """Return a copy re-bound to *new_extent*, validating edge coverage.""" - edge_sum = sum(self.edges) + edge_sum = self.cumulative[-1] if 
edge_sum < new_extent: raise ValueError( f"VaryingDimension edge sum {edge_sum} is less than new extent {new_extent}" From e0625803946d1b9519bea24833eb2c7b63232c45 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:38:20 -0400 Subject: [PATCH 056/118] Improve sharding validation logic --- src/zarr/codecs/sharding.py | 24 +++++++++++++----------- src/zarr/core/chunk_grids.py | 18 +++++++++++++++++- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 9e741d0710..6497386ed4 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -404,17 +404,19 @@ def validate( f"needs to be divisible by the shard's inner `chunk_shape` (got {self.chunk_shape})." ) else: - # For rectilinear grids, every chunk's dimensions must be divisible - # by the inner chunk_shape. - for coord in chunk_grid.all_chunk_coords(): - spec = chunk_grid[coord] - if spec is not None and not all( - s % c == 0 for s, c in zip(spec.codec_shape, self.chunk_shape, strict=False) - ): - raise ValueError( - f"Chunk at {coord} has shape {spec.codec_shape} which is not " - f"divisible by the shard's inner `chunk_shape` (got {self.chunk_shape})." - ) + # For rectilinear grids, every unique edge length per dimension + # must be divisible by the corresponding inner chunk size. + # unique_edge_lengths is a lazy generator that short-circuits + # deduplication, and we short-circuit on the first failure. + for i, (dim, inner) in enumerate( + zip(chunk_grid.dimensions, self.chunk_shape, strict=False) + ): + for edge in dim.unique_edge_lengths: + if edge % inner != 0: + raise ValueError( + f"Chunk edge length {edge} in dimension {i} is not " + f"divisible by the shard's inner chunk size {inner}." 
+ ) async def _decode_single( self, diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 30104688de..87632c38c5 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -6,7 +6,7 @@ import numbers import operator import warnings -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass from functools import reduce from typing import TYPE_CHECKING, Any, Literal, Protocol, cast, runtime_checkable @@ -87,6 +87,11 @@ def data_size(self, chunk_ix: int) -> int: return 0 return max(0, min(self.size, self.extent - chunk_ix * self.size)) + @property + def unique_edge_lengths(self) -> Iterable[int]: + """Distinct chunk edge lengths for this dimension.""" + return (self.size,) + def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: if self.size == 0: return np.zeros_like(indices) @@ -149,6 +154,15 @@ def data_size(self, chunk_ix: int) -> int: offset = self.chunk_offset(chunk_ix) return max(0, min(self.edges[chunk_ix], self.extent - offset)) + @property + def unique_edge_lengths(self) -> Iterable[int]: + """Distinct chunk edge lengths for this dimension (lazily deduplicated).""" + seen: set[int] = set() + for e in self.edges: + if e not in seen: + seen.add(e) + yield e + def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: return np.searchsorted(self.cumulative, indices, side="right") @@ -194,6 +208,8 @@ def chunk_offset(self, chunk_ix: int) -> int: ... def chunk_size(self, chunk_ix: int) -> int: ... def data_size(self, chunk_ix: int) -> int: ... def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: ... + @property + def unique_edge_lengths(self) -> Iterable[int]: ... def with_extent(self, new_extent: int) -> DimensionGrid: ... def resize(self, new_extent: int) -> DimensionGrid: ... 
From 3591734f4e25385fc54b94797754ddc0ad316179 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:46:33 -0400 Subject: [PATCH 057/118] Improve sharding validation logic --- src/zarr/core/chunk_grids.py | 36 ++++++++++++++++++++++++++++ tests/test_unified_chunk_grid.py | 40 ++++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 87632c38c5..ebf6ef6199 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -994,3 +994,39 @@ def _auto_partition( _shards_out = cast("tuple[int, ...]", shard_shape) return _shards_out, _chunks_out + + +# --------------------------------------------------------------------------- +# Backwards-compatibility shim for RegularChunkGrid +# --------------------------------------------------------------------------- + + +class _RegularChunkGridMeta(type): + """Metaclass that makes ``isinstance(obj, RegularChunkGrid)`` work. + + Returns True when *obj* is a ``ChunkGrid`` whose ``is_regular`` flag is set. + """ + + def __instancecheck__(cls, instance: object) -> bool: + return isinstance(instance, ChunkGrid) and instance.is_regular + + +class RegularChunkGrid(metaclass=_RegularChunkGridMeta): + """Deprecated compatibility shim. + + .. deprecated:: 3.1 + Use ``ChunkGrid.from_regular(array_shape, chunk_shape)`` instead. + Use ``grid.is_regular`` instead of ``isinstance(grid, RegularChunkGrid)``. + """ + + def __new__(cls, *, chunk_shape: ShapeLike) -> ChunkGrid: # type: ignore[misc] + warnings.warn( + "RegularChunkGrid is deprecated. " + "Use ChunkGrid.from_regular(array_shape, chunk_shape) instead.", + DeprecationWarning, + stacklevel=2, + ) + # Without array_shape we cannot bind extents, so use chunk_shape as extent. + # This matches the old behavior where RegularChunkGrid was shape-unaware. 
+ parsed = parse_shapelike(chunk_shape) + return ChunkGrid.from_regular(array_shape=parsed, chunk_shape=parsed) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 7a656b9b95..9ae6d48d68 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -132,6 +132,42 @@ def test_parse_chunk_grid_blocked(self) -> None: ) +# --------------------------------------------------------------------------- +# RegularChunkGrid backwards compatibility +# --------------------------------------------------------------------------- + + +class TestRegularChunkGridCompat: + """The deprecated RegularChunkGrid shim should work for common patterns.""" + + def test_construction_emits_deprecation_warning(self) -> None: + from zarr.core.chunk_grids import RegularChunkGrid + + with pytest.warns(DeprecationWarning, match="RegularChunkGrid is deprecated"): + grid = RegularChunkGrid(chunk_shape=(10, 20)) + assert isinstance(grid, ChunkGrid) + assert grid.is_regular + assert grid.chunk_shape == (10, 20) + + def test_isinstance_check(self) -> None: + from zarr.core.chunk_grids import RegularChunkGrid + + grid = ChunkGrid.from_regular((100, 200), (10, 20)) + assert isinstance(grid, RegularChunkGrid) + + def test_isinstance_false_for_rectilinear(self) -> None: + from zarr.core.chunk_grids import RegularChunkGrid + + grid = ChunkGrid.from_rectilinear([[10, 20], [25, 25]], array_shape=(30, 50)) + assert not isinstance(grid, RegularChunkGrid) + + def test_isinstance_false_for_unrelated_types(self) -> None: + from zarr.core.chunk_grids import RegularChunkGrid + + assert not isinstance("hello", RegularChunkGrid) + assert not isinstance(42, RegularChunkGrid) + + # --------------------------------------------------------------------------- # FixedDimension # --------------------------------------------------------------------------- @@ -459,11 +495,11 @@ def test_roundtrip(self) -> None: class TestExpandRleHandlesJsonFloats: def 
test_bare_integer_floats_accepted(self) -> None: """JSON parsers may emit 10.0 for the integer 10; _expand_rle should handle it.""" - result = _expand_rle([10.0, 20.0]) + result = _expand_rle([10.0, 20.0]) # type: ignore[list-item] assert result == [10, 20] def test_rle_pair_with_float_count(self) -> None: - result = _expand_rle([[10, 3.0]]) + result = _expand_rle([[10, 3.0]]) # type: ignore[list-item] assert result == [10, 10, 10] From 4be96b047def6acca2c21b03bca05dead32c7b49 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:48:35 -0400 Subject: [PATCH 058/118] Remove deferred design --- docs/design/chunk-grid-metadata-separation.md | 253 ------------------ 1 file changed, 253 deletions(-) delete mode 100644 docs/design/chunk-grid-metadata-separation.md diff --git a/docs/design/chunk-grid-metadata-separation.md b/docs/design/chunk-grid-metadata-separation.md deleted file mode 100644 index 767090fe46..0000000000 --- a/docs/design/chunk-grid-metadata-separation.md +++ /dev/null @@ -1,253 +0,0 @@ -# Chunk Grid: Metadata / Array Separation - -**Related:** - -- [chunk-grid.md](chunk-grid.md) (unified chunk grid design) - -## Problem - -`ArrayV3Metadata` stores `chunk_grid: ChunkGrid` — a behavioral object with index-to-chunk math, iteration, resize logic, and per-dimension grid types. Metadata should be a serializable data bag. The behavioral `ChunkGrid` carries runtime state (extent per dimension, prefix sums) that belongs on the array, not on the metadata record. - -This coupling causes several issues: - -1. **Metadata is not a simple DTO.** Constructing `ArrayV3Metadata` triggers `parse_chunk_grid()` which builds `FixedDimension`/`VaryingDimension` objects, computes prefix sums, and validates edge coverage. Metadata round-trips (`from_dict` → `to_dict`) pay this cost unnecessarily. -2. 
**Codec validation is misplaced.** `_validate_metadata()` calls `codec.validate(chunk_grid=...)`, passing the behavioral object. This conflates structural metadata validation with runtime array validation. -3. **`update_shape` lives on metadata.** Shape changes require constructing a new `ChunkGrid` with updated extents, serializing it back, and creating new metadata. The metadata layer shouldn't know about resize semantics. -4. **Redundant state.** `chunk_grid_name: str` exists solely to preserve serialization format because the `ChunkGrid` object doesn't carry its own name. With a plain dict, the name is just `chunk_grid["name"]`. - -## Design - -### Principles - -1. **Metadata stores what's on disk.** `ArrayV3Metadata.chunk_grid` holds the JSON dict exactly as it appears in `zarr.json`. No parsing, no behavioral objects, no computed state. -2. **The array owns the behavioral grid.** `AsyncArray` constructs a `ChunkGrid` from the metadata dict + shape at init time. All chunk-related behavior (indexing, iteration, resize) goes through the array's grid. -3. **Codec validation happens at array construction.** The array has the full context (shape + grid + dtype) needed to validate codecs. Metadata validates only structural properties (correct keys, matching ndim). -4. **No signature changes to downstream consumers.** Indexers and codecs still receive `ChunkGrid`. Only the *source* of the grid changes — from `metadata.chunk_grid` to `array.chunk_grid`. - -### Current architecture - -``` -ArrayV3Metadata - ├── chunk_grid: ChunkGrid ← behavioral object - ├── chunk_grid_name: str ← "regular" | "rectilinear" - └── _validate_metadata() → codec.validate(chunk_grid=...) 
- get_chunk_spec() → chunk_grid[coords] - update_shape() → chunk_grid.update_shape() - to_dict() → serialize_chunk_grid(chunk_grid, name) - chunks → chunk_grid.chunk_shape - shards → chunk_grid.chunk_shape - -AsyncArray - └── self.metadata.chunk_grid ← delegates everything - -Indexers - └── __init__(chunk_grid: ChunkGrid) - -Codec.validate() - └── validate(shape, dtype, chunk_grid: ChunkGrid) -``` - -### Proposed architecture - -``` -ArrayV3Metadata - ├── chunk_grid: dict[str, JSON] ← plain serialized form - └── _validate_metadata() → structural checks only (ndim, required keys) - to_dict() → return self.chunk_grid - from_dict() → store dict as-is - -AsyncArray - ├── chunk_grid: ChunkGrid ← behavioral object, constructed on init - └── get_chunk_spec() ← moved from metadata - _validate_codecs() ← moved from metadata - update_shape() ← moved from metadata - -Indexers ← unchanged -Codec.validate() ← unchanged -``` - -### Metadata changes - -`ArrayV3Metadata` becomes simpler: - -```python -@dataclass(frozen=True, kw_only=True) -class ArrayV3Metadata(Metadata): - shape: tuple[int, ...] - data_type: ZDType[TBaseDType, TBaseScalar] - chunk_grid: dict[str, JSON] # plain JSON dict - chunk_key_encoding: ChunkKeyEncoding - fill_value: Any - codecs: tuple[Codec, ...] - # ... - - def __init__(self, *, chunk_grid, **kwargs): - # Store the dict directly. Validate structure only: - # - has "name" and "configuration" keys - # - ndim matches shape - # No parse_chunk_grid(), no ChunkGrid construction. - object.__setattr__(self, "chunk_grid", chunk_grid) - - def _validate_metadata(self): - # Structural: ndim from dict matches len(shape) - # No codec validation — that moves to the array. - name, config = parse_named_configuration(self.chunk_grid) - if name == "regular": - ndim = len(config["chunk_shape"]) - elif name == "rectilinear": - ndim = len(config["chunk_shapes"]) - if ndim != len(self.shape): - raise ValueError(...) 
- - @property - def chunks(self) -> tuple[int, ...]: - name, config = parse_named_configuration(self.chunk_grid) - if name == "regular": - return tuple(config["chunk_shape"]) - raise NotImplementedError(...) - - def to_dict(self): - d = super().to_dict() - # chunk_grid is already a dict — no serialize_chunk_grid() needed - return d -``` - -The `chunk_grid_name` field is removed. Round-trip fidelity is preserved because the original dict is stored verbatim. - -### Array changes - -`AsyncArray` constructs the behavioral `ChunkGrid` and owns all chunk-related operations: - -```python -@dataclass(frozen=True) -class AsyncArray: - metadata: ArrayV2Metadata | ArrayV3Metadata - # ... - - def __init__(self, metadata, store_path, config): - # ... existing init ... - chunk_grid = parse_chunk_grid(metadata.chunk_grid, metadata.shape) - object.__setattr__(self, "_chunk_grid", chunk_grid) - # Codec validation moves here: - self._validate_codecs() - - @property - def chunk_grid(self) -> ChunkGrid: - return self._chunk_grid - - def get_chunk_spec(self, chunk_coords, array_config, prototype) -> ArraySpec: - spec = self.chunk_grid[chunk_coords] - if spec is None: - raise ValueError(...) - return ArraySpec(shape=spec.codec_shape, ...) - - def _validate_codecs(self) -> None: - for codec in self.metadata.codecs: - codec.validate( - shape=self.metadata.shape, - dtype=self.metadata.data_type, - chunk_grid=self.chunk_grid, - ) -``` - -For resize, the array constructs a new `ChunkGrid` with the new shape. For regular grids, the metadata dict doesn't change on resize — only the extent changes, which is runtime state not serialized in the chunk grid JSON. The array rebuilds its `ChunkGrid` with the new shape: - -```python -async def _resize(self, new_shape): - new_grid = self.chunk_grid.update_shape(new_shape) - # For regular grids, metadata.chunk_grid dict stays the same. 
- # For rectilinear grids that grew/shrank, serialize back: - new_chunk_grid_dict = serialize_chunk_grid(new_grid, self.metadata.chunk_grid["name"]) - new_metadata = replace(self.metadata, shape=new_shape, chunk_grid=new_chunk_grid_dict) - # ... -``` - -### V2 metadata - -`ArrayV2Metadata` already stores `chunks: tuple[int, ...]` as plain data. Its `chunk_grid` property (which constructs a `ChunkGrid`) is removed. The array handles construction for both V2 and V3: - -```python -@property -def chunk_grid(self) -> ChunkGrid: - if isinstance(self.metadata, ArrayV2Metadata): - return ChunkGrid.from_regular(self.metadata.shape, self.metadata.chunks) - return parse_chunk_grid(self.metadata.chunk_grid, self.metadata.shape) -``` - -### Call site migration - -All `self.metadata.chunk_grid` references in `array.py` (~30 sites) change to `self.chunk_grid`: - -```python -# Before -indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) - -# After -indexer = BasicIndexer(selection, self.shape, self.chunk_grid) -``` - -Indexers, codecs, and the codec pipeline are unchanged — they still receive `ChunkGrid` as a parameter. Only the *source* changes. - -### What does NOT change - -| Component | Status | -|---|---| -| `ChunkGrid` class | Unchanged — all behavior stays | -| `FixedDimension`, `VaryingDimension` | Unchanged | -| `parse_chunk_grid()`, `serialize_chunk_grid()` | Unchanged — called from array instead of metadata | -| Indexer classes | Unchanged — still receive `ChunkGrid` | -| Codec `validate()` signature | Unchanged — still receives `ChunkGrid` | -| On-disk format | No spec change | - -## Design decisions - -### Why not keep ChunkGrid on metadata with a lazy property? - -A lazy `@cached_property` on metadata would defer the cost but not fix the fundamental issue: metadata would still own behavioral state. Resize logic, codec validation, and `get_chunk_spec` would remain on metadata. The goal is a clean separation — metadata is data, the array is behavior. 
- -### Why move codec validation to the array? - -Codec validation needs the behavioral `ChunkGrid` (sharding checks divisibility, iterates chunk coords). With a plain dict on metadata, the array is the first place where a `ChunkGrid` exists. Validating there is natural — the array is the runtime boundary where all pieces (shape, dtype, grid, codecs) come together. - -This means `ArrayV3Metadata` can be constructed with an invalid codec/grid combination without error. This is acceptable: metadata is a data transfer object. Validation at the array boundary catches errors at the same point users interact with the data. - -### Why store the raw dict instead of a TypedDict or NamedTuple? - -The dict is exactly what's in `zarr.json`. Storing it verbatim gives: -- Zero-cost round-trips (`to_dict` returns it as-is) -- No `chunk_grid_name` field needed (it's `chunk_grid["name"]`) -- Forward compatibility with unknown chunk grid types (metadata can store and round-trip grids it doesn't understand) - -### How does resize work with a plain dict? - -For regular grids, the chunk grid JSON doesn't change on resize — `{"name": "regular", "configuration": {"chunk_shape": [10, 20]}}` is the same regardless of array shape. The extent is runtime state derived from `shape`. The array rebuilds its `ChunkGrid` with the new shape. - -For rectilinear grids, resize may add or remove chunks. The array resizes the `ChunkGrid`, serializes it back to a dict via `serialize_chunk_grid()`, and creates new metadata with the updated dict. - -## Migration - -### PR 1: Add `chunk_grid` property to `AsyncArray` (non-breaking) - -**Files:** `array.py` -**Scope:** Add `chunk_grid` property that delegates to `self.metadata.chunk_grid`. Migrate all `self.metadata.chunk_grid` references in `array.py` to `self.chunk_grid`. Purely mechanical, no behavioral change. 
- -### PR 2: Move codec validation to array - -**Files:** `v3.py`, `array.py` -**Scope:** Remove `codec.validate()` calls from `ArrayV3Metadata._validate_metadata()`. Add `_validate_codecs()` to `AsyncArray.__init__`. Move `get_chunk_spec()` from metadata to array. The codec `validate()` signature is unchanged. - -### PR 3: Make metadata chunk_grid a plain dict - -**Files:** `v3.py`, `v2.py`, `array.py` -**Scope:** Replace `chunk_grid: ChunkGrid` with `chunk_grid: dict[str, JSON]` on `ArrayV3Metadata`. Remove `chunk_grid_name` field. Update `__init__`, `from_dict`, `to_dict`, `_validate_metadata`, `chunks`, `shards`. Construct `ChunkGrid` in `AsyncArray.__init__` via `parse_chunk_grid()`. Remove `chunk_grid` property from `ArrayV2Metadata`. Update `update_shape` / resize flow. - -### PR 4: Update tests - -**Files:** `tests/` -**Scope:** Update tests that construct `ArrayV3Metadata` directly and access `.chunk_grid` as a `ChunkGrid`. Tests that go through `Array` / `AsyncArray` should mostly work unchanged. - -## Open questions - -1. **Convenience properties on metadata.** Should `ArrayV3Metadata` expose `chunk_shape` parsed from the raw dict? Or should all chunk access go through the array? Exposing it avoids constructing a full `ChunkGrid` for simple queries, but adds dict-parsing logic to metadata. -2. **Downstream breakage.** Code that accesses `metadata.chunk_grid` as a `ChunkGrid` (e.g., xarray, VirtualiZarr) will break. Migration path: `array.chunk_grid` for behavioral access, `metadata.chunk_grid` for the raw dict. Downstream PRs needed for xarray, VirtualiZarr, Icechunk. -3. **Rectilinear resize serialization.** When a rectilinear array resizes, the array must serialize the updated `ChunkGrid` back to a dict. Should this use `serialize_chunk_grid()` (which applies RLE compression), or should the array manipulate the dict directly? The former is cleaner; the latter avoids a round-trip through `ChunkGrid`. 
From 087382b3e0ce8f560aac44f41669f166c4114c0e Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:58:09 -0400 Subject: [PATCH 059/118] Update design doc --- docs/design/chunk-grid.md | 92 ++++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 20 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index cf50afbda8..9be6d0cdfb 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -1,6 +1,6 @@ # Unified Chunk Grid -Version: 4 +Version: 5 **Related:** @@ -57,13 +57,16 @@ class FixedDimension: return ceildiv(self.extent, self.size) def index_to_chunk(self, idx: int) -> int: - return idx // self.size + return idx // self.size # maps array index to chunk; no bounds check def chunk_offset(self, chunk_ix: int) -> int: - return chunk_ix * self.size + return chunk_ix * self.size # raises IndexError if OOB def chunk_size(self, chunk_ix: int) -> int: - return self.size # always uniform + return self.size # always uniform; raises IndexError if OOB def data_size(self, chunk_ix: int) -> int: - return max(0, min(self.size, self.extent - chunk_ix * self.size)) + return max(0, min(self.size, self.extent - chunk_ix * self.size)) # raises IndexError if OOB + @property + def unique_edge_lengths(self) -> Iterable[int]: + return (self.size,) # O(1) def indices_to_chunks(self, indices: NDArray) -> NDArray: return indices // self.size def with_extent(self, new_extent: int) -> FixedDimension: @@ -90,25 +93,30 @@ class VaryingDimension: return len(self.edges) def index_to_chunk(self, idx: int) -> int: - return bisect.bisect_right(self.cumulative, idx) + return bisect.bisect_right(self.cumulative, idx) # maps array index to chunk; no bounds check def chunk_offset(self, chunk_ix: int) -> int: - return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 + return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 # raises IndexError if OOB def chunk_size(self, chunk_ix: int) -> int: - return
self.edges[chunk_ix] + return self.edges[chunk_ix] # raises IndexError if OOB def data_size(self, chunk_ix: int) -> int: offset = self.chunk_offset(chunk_ix) - return max(0, min(self.edges[chunk_ix], self.extent - offset)) + return max(0, min(self.edges[chunk_ix], self.extent - offset)) # raises IndexError if OOB + @property + def unique_edge_lengths(self) -> Iterable[int]: + # lazy generator: yields unseen values, short-circuits deduplication def indices_to_chunks(self, indices: NDArray) -> NDArray: return np.searchsorted(self.cumulative, indices, side='right') def with_extent(self, new_extent: int) -> VaryingDimension: - # validates edge_sum >= new_extent, re-binds extent + # validates cumulative[-1] >= new_extent (O(1)), re-binds extent return VaryingDimension(self.edges, extent=new_extent) def resize(self, new_extent: int) -> VaryingDimension: # grow: append chunk of size (new_extent - old_extent) # shrink: drop trailing chunks, keep those up to new_extent ``` -Both types implement the `DimensionGrid` protocol: `nchunks`, `extent`, `index_to_chunk`, `chunk_offset`, `chunk_size`, `data_size`, `indices_to_chunks`, `with_extent`, `resize`. Memory usage scales with the number of *varying* dimensions, not total chunks. +Both types implement the `DimensionGrid` protocol: `nchunks`, `extent`, `index_to_chunk`, `chunk_offset`, `chunk_size`, `data_size`, `indices_to_chunks`, `unique_edge_lengths`, `with_extent`, `resize`. Memory usage scales with the number of *varying* dimensions, not total chunks. + +All per-chunk methods (`chunk_offset`, `chunk_size`, `data_size`) raise `IndexError` for out-of-bounds chunk indices, providing consistent fail-fast behavior across both dimension types. The two size methods serve different consumers: @@ -130,10 +138,12 @@ class DimensionGrid(Protocol): def nchunks(self) -> int: ... @property def extent(self) -> int: ... + @property + def unique_edge_lengths(self) -> Iterable[int]: ... def index_to_chunk(self, idx: int) -> int: ... 
- def chunk_offset(self, chunk_ix: int) -> int: ... - def chunk_size(self, chunk_ix: int) -> int: ... - def data_size(self, chunk_ix: int) -> int: ... + def chunk_offset(self, chunk_ix: int) -> int: ... # raises IndexError if OOB + def chunk_size(self, chunk_ix: int) -> int: ... # raises IndexError if OOB + def data_size(self, chunk_ix: int) -> int: ... # raises IndexError if OOB def indices_to_chunks(self, indices: NDArray[np.intp]) -> NDArray[np.intp]: ... def with_extent(self, new_extent: int) -> DimensionGrid: ... def resize(self, new_extent: int) -> DimensionGrid: ... @@ -265,6 +275,8 @@ RLE compression is used when serializing: runs of identical sizes become `[value For `FixedDimension` serialized as rectilinear, `_serialize_fixed_dim()` produces a compact representation: bare integer when evenly divisible, `[size, last_data]` for two chunks, `[[size, n-1], last_data]` for more. +**Zero-extent handling:** Regular grids serialize zero-extent dimensions without issue (the format encodes only `chunk_shape`, no edges). Rectilinear grids reject zero-extent dimensions because the spec requires at least one positive-integer edge length per axis. This asymmetry is intentional and spec-compliant — documented in `serialize_chunk_grid()`. + #### chunk_sizes The `chunk_sizes` property provides universal access to per-dimension chunk data sizes, matching the dask `Array.chunks` convention. It works for both regular and rectilinear grids: @@ -291,6 +303,17 @@ Resize uses `ChunkGrid.update_shape(new_shape)`, which delegates to each dimensi - `FixedDimension.resize()`: simply re-binds the extent (identical to `with_extent`) - `VaryingDimension.resize()`: grow appends a chunk of size `new_extent - old_extent`; shrink drops trailing chunks whose cumulative offset lies beyond the new extent +**Known limitation (deferred):** When growing a `VaryingDimension`, the current implementation always appends a single chunk covering the new region. 
For example, `[10, 10, 10]` resized from 30 to 45 produces `[10, 10, 10, 15]` instead of the more natural `[10, 10, 10, 10, 10]` (whose final chunk is only partially filled — valid here because the design allows a dimension's edge sum to exceed its extent). A future improvement should add an optional `chunks` parameter to `resize()` that controls how the new region is partitioned, with a sane default (e.g., repeating the last chunk size). This is safely deferrable because: +- `FixedDimension` already handles resize correctly (regular grids stay regular) +- The single-chunk default produces valid state, just suboptimal chunk layout +- Rectilinear arrays are behind an experimental feature flag +- Adding an optional parameter is backwards-compatible + +Open design questions for the `chunks` parameter: +- Does it describe the new region only, or the entire post-resize array? +- Must the overlapping portion agree with existing chunks (no rechunking)? +- What is the type? Same as `chunks` in `create_array`? + #### from_array The `from_array()` function handles both regular and rectilinear source arrays: @@ -335,7 +358,7 @@ Read: store → decode to codec_shape → slice via chunk_selection → user da ### Sharding -The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as extent and the subchunk shape as `FixedDimension`. Each shard is self-contained — it doesn't need to know whether the outer grid is regular or rectilinear. Rectilinear chunks with sharding currently raises `ValueError` pending further validation work. +The `ShardingCodec` constructs a `ChunkGrid` per shard using the shard shape as extent and the subchunk shape as `FixedDimension`. Each shard is self-contained — it doesn't need to know whether the outer grid is regular or rectilinear. Validation checks that every unique edge length per dimension is divisible by the inner chunk size, using `dim.unique_edge_lengths` for efficient polymorphic iteration (O(1) for fixed dimensions, lazy-deduplicated for varying). 
``` Level 1 — Outer chunk grid (shard boundaries): regular or rectilinear @@ -389,6 +412,16 @@ There is no known chunk grid outside the rectilinear family that retains the tes All known grids are special cases of rectilinear. A Protocol-based approach means every caller programs against an abstract interface and adding a grid type requires implementing ~10 methods. A single class is simpler. If a genuinely novel grid type emerges, a Protocol can be extracted. +### Deferred: Metadata / Array separation + +An earlier design doc proposed decoupling `ChunkGrid` (behavioral) from `ArrayV3Metadata` (data), so that metadata would store only a plain dict and the array layer would construct the `ChunkGrid`. This was deferred because: + +1. **Scope.** The unified chunk grid is already a large change spanning chunk grids, indexing, codecs, metadata, and the array API. Adding a metadata refactor would increase the review surface and risk without a concrete payoff for this PR. +2. **No blocking issue.** The current coupling — `ArrayV3Metadata` stores a `ChunkGrid` and calls `serialize_chunk_grid()` / `parse_chunk_grid()` — works correctly. The grid is constructed once from metadata + `shape` and round-trips cleanly. +3. **Independent concern.** Separating metadata DTOs from behavioral objects is a general architectural goal that applies beyond chunk grids (e.g., codec pipelines). It's better addressed holistically than piecemeal. + +The current design stores `chunk_grid: ChunkGrid` and `chunk_grid_name: str` on `ArrayV3Metadata`. The name controls serialization format; the grid handles all behavioral queries. If a future refactor makes metadata a pure DTO, the `ChunkGrid` construction would move to the array layer and `parse_chunk_grid()` already provides the right entry point. + ## Prior art **zarrs (Rust):** Three independent grid types behind a `ChunkGridTraits` trait. 
Key patterns adopted: Fixed vs Varying per dimension, prefix sums + binary search, `Option` for out-of-bounds, `NonZeroU64` for chunk dimensions, separate subchunk grid per shard, array shape at construction. @@ -430,13 +463,29 @@ All known grids are special cases of rectilinear. A Protocol-based approach mean A **fresh PR** is more practical than adapting #3534's 5700-line diff. +### Backwards compatibility + +A `RegularChunkGrid` shim is provided for downstream code that imports or type-checks against the old class: + +```python +from zarr.core.chunk_grids import RegularChunkGrid # works (no ImportError) + +# Construction emits DeprecationWarning, returns a real ChunkGrid +grid = RegularChunkGrid(chunk_shape=(10, 20)) + +# isinstance works via __instancecheck__ metaclass +isinstance(grid, RegularChunkGrid) # True for any regular ChunkGrid +``` + +The shim uses `chunk_shape` as extent (matching the old shape-unaware behavior). The deprecation warning directs users to `ChunkGrid.from_regular()`. + ### Downstream migration All four downstream PRs/issues follow the same pattern: | Two-class pattern | Unified pattern | |---|---| -| `isinstance(cg, RegularChunkGrid)` | `cg.is_regular` | +| `isinstance(cg, RegularChunkGrid)` | `cg.is_regular` (or keep `isinstance` — shim handles it) | | `isinstance(cg, RectilinearChunkGrid)` | `not cg.is_regular` | | `cg.chunk_shape` | `cg.dimensions[i].size` or `cg[coord].shape` | | `cg.chunk_shapes` | `tuple(d.edges for d in cg.dimensions)` | @@ -454,10 +503,13 @@ All four downstream PRs/issues follow the same pattern: ## Open questions -1. **Resize defaults:** When growing a regular array, should the default preserve regularity or transition to rectilinear? +1. **Resize defaults (deferred):** When growing a rectilinear array, should `resize()` accept an optional `chunks` parameter? See the [Resize section](#resize) for details and open design questions. Regular arrays already stay regular on resize. 2. 
**`ChunkSpec` complexity:** `ChunkSpec` carries both `slices` and `codec_shape`. Should the grid expose separate methods for codec vs data queries instead? 3. **`__getitem__` with slices:** Should `grid[0, :]` or `grid[0:3, :]` return a sub-grid or an iterator of `ChunkSpec`s? -4. **Rectilinear + sharding:** The current POC raises `ValueError` for rectilinear chunks with sharding. When should this be relaxed? + +### Resolved + +4. ~~**Rectilinear + sharding:** The current POC raises `ValueError` for rectilinear chunks with sharding. When should this be relaxed?~~ **Resolved.** Sharding now validates divisibility polymorphically via `dim.unique_edge_lengths`. ## Proofs of concepts @@ -484,8 +536,8 @@ All four downstream PRs/issues follow the same pattern: ### PR 2: Unified `ChunkGrid` class (replaces old hierarchy) -**Files**: `chunk_grids.py` (new `ChunkGrid` class + `RegularChunkGrid` compat wrapper) -**Scope**: New `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `all_chunk_coords()` (no shape arg), `is_regular`, `chunk_shape`, `chunk_sizes`. Keep `RegularChunkGrid` as backwards-compat subclass. Add `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()`. Tests for the grid class itself. +**Files**: `chunk_grids.py` (new `ChunkGrid` class + `RegularChunkGrid` deprecation shim) +**Scope**: New `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `all_chunk_coords()` (no shape arg), `is_regular`, `chunk_shape`, `chunk_sizes`. Add `RegularChunkGrid` deprecation shim (metaclass-based `isinstance` support, `DeprecationWarning` on construction). Add `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()`. Tests for the grid class itself. 
### PR 3: Indexing generalization From 38fd5aaf2874b505889b8420d7d3f0dbc722c6e4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:00:43 -0400 Subject: [PATCH 060/118] Remove unnecessary casts --- src/zarr/core/array.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 33ba0f7999..ebe5ec62bf 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -2739,12 +2739,12 @@ def __array__( raise ValueError(msg) arr = self[...] - arr_np = np.array(arr, dtype=dtype) + arr_np: NDArrayLike = np.array(arr, dtype=dtype) if dtype is not None: arr_np = arr_np.astype(dtype) - return cast("NDArrayLike", arr_np) + return arr_np def __getitem__(self, selection: Selection) -> NDArrayLikeOrScalar: """Retrieve data for an item or region of the array. @@ -3759,7 +3759,7 @@ def get_coordinate_selection( if hasattr(out_array, "shape"): # restore shape - out_array = cast("NDArrayLikeOrScalar", np.array(out_array).reshape(indexer.sel_shape)) + out_array = np.array(out_array).reshape(indexer.sel_shape) return out_array def set_coordinate_selection( From bbc0703027fbb6c488f73318236daede568236a4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:08:07 -0400 Subject: [PATCH 061/118] Improve typing --- src/zarr/core/chunk_grids.py | 9 ++++++--- src/zarr/core/metadata/v3.py | 7 +++++-- tests/test_unified_chunk_grid.py | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ebf6ef6199..f41e94f9e1 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -295,6 +295,9 @@ def _compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: # list of ints (explicit edges), or mixed RLE (e.g. [[10, 3], 5]). 
RectilinearDimSpec = int | list[int | list[int]] +# The serialization format name for a chunk grid. +ChunkGridName = Literal["regular", "rectilinear"] + def _serialize_fixed_dim(dim: FixedDimension) -> RectilinearDimSpec: """Compact rectilinear representation for a fixed-size dimension. @@ -714,7 +717,7 @@ def parse_chunk_grid( raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") -def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: +def serialize_chunk_grid(grid: ChunkGrid, name: ChunkGridName) -> dict[str, JSON]: """Serialize a ChunkGrid to a metadata dict using the given format name. The format choice ("regular" vs "rectilinear") belongs to the metadata layer, @@ -762,11 +765,11 @@ def serialize_chunk_grid(grid: ChunkGrid, name: str) -> dict[str, JSON]: def _infer_chunk_grid_name( data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], grid: ChunkGrid, -) -> str: +) -> ChunkGridName: """Extract or infer the chunk grid serialization name from the input.""" if isinstance(data, dict): name, _ = parse_named_configuration(data) - return name + return cast("ChunkGridName", name) # ChunkGrid passed directly — infer from structure return "regular" if grid.is_regular else "rectilinear" diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 96a6f7d1ea..06a6e15090 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -26,6 +26,7 @@ from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import ( ChunkGrid, + ChunkGridName, _infer_chunk_grid_name, parse_chunk_grid, serialize_chunk_grid, @@ -205,7 +206,9 @@ class ArrayV3Metadata(Metadata): shape: tuple[int, ...] data_type: ZDType[TBaseDType, TBaseScalar] chunk_grid: ChunkGrid - chunk_grid_name: str + chunk_grid_name: ( + ChunkGridName # serialization format; tracked internally for round-trip fidelity + ) chunk_key_encoding: ChunkKeyEncoding fill_value: Any codecs: tuple[Codec, ...] 
@@ -227,7 +230,7 @@ def __init__( codecs: Iterable[Codec | dict[str, JSON] | NamedConfig[str, Any] | str], attributes: dict[str, JSON] | None, dimension_names: DimensionNames, - chunk_grid_name: str | None = None, + chunk_grid_name: ChunkGridName | None = None, storage_transformers: Iterable[dict[str, JSON]] | None = None, extra_fields: Mapping[str, AllowedExtraField] | None = None, ) -> None: diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 9ae6d48d68..2280fcb7ce 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -709,7 +709,7 @@ def test_serialize_non_regular_as_regular_raises(self) -> None: def test_serialize_unknown_name_raises(self) -> None: g = ChunkGrid.from_regular((100,), (10,)) with pytest.raises(ValueError, match="Unknown chunk grid name for serialization"): - serialize_chunk_grid(g, "hexagonal") + serialize_chunk_grid(g, "hexagonal") # type: ignore[arg-type] def test_zero_extent_rectilinear_raises(self) -> None: """Zero-extent grids cannot be serialized as rectilinear (spec requires positive edges).""" From 460d6835372a910477acb9ca79a6b42784b3f00f Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:11:51 -0400 Subject: [PATCH 062/118] Add another deferred item --- docs/design/chunk-grid.md | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 9be6d0cdfb..1cf6a1349c 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -18,12 +18,12 @@ The Zarr V3 spec defines `chunk_grid` as an extension point, but chunk grids are There is no known chunk grid that is both (a) more general than rectilinear and (b) retains the axis-aligned tessellation properties Zarr assumes. 
All known grids are special cases: -| Grid type | Description | -|---|---| -| Regular | Uniform chunk size, boundary chunks padded with fill_value | -| Regular-bounded (zarrs) | Uniform chunk size, boundary chunks trimmed to array extent | -| HPC boundary-padded | Regular interior, larger boundary chunks | -| Fully variable | Arbitrary per-chunk sizes | +| Grid type | Description | Example | +|---|---|---| +| Regular | Uniform chunk size, boundary chunks padded with fill_value | `[10, 10, 10, 10]` | +| Regular-bounded (zarrs) | Uniform chunk size, boundary chunks trimmed to array extent | `[10, 10, 10, 5]` | +| HPC boundary-padded | Regular interior, larger boundary chunks ([VirtualiZarr#217](https://github.com/zarr-developers/VirtualiZarr/issues/217)) | `[10, 8, 8, 8, 10]` | +| Fully variable | Arbitrary per-chunk sizes | `[5, 12, 3, 20]` | A registry-based plugin system adds complexity without clear benefit. @@ -412,6 +412,20 @@ There is no known chunk grid outside the rectilinear family that retains the tes All known grids are special cases of rectilinear. A Protocol-based approach means every caller programs against an abstract interface and adding a grid type requires implementing ~10 methods. A single class is simpler. If a genuinely novel grid type emerges, a Protocol can be extracted. +### Deferred: Tiled/periodic chunk patterns + +[#3750 discussion](https://github.com/zarr-developers/zarr-python/issues/3750) identified periodic chunk patterns as a use case not efficiently served by RLE alone. RLE compresses runs of identical values (`np.repeat`), but periodic patterns like days-per-month (`[31, 28, 31, 30, ...]` repeated 30 years) need a tile encoding (`np.tile`). 
Real-world examples include: + +- **Oceanographic models** (ROMS): HPC boundary-padded chunks like `[10, 8, 8, 8, 10]` — handled by RLE +- **Temporal axes**: days-per-month, hours-per-day — need tile encoding for compact metadata +- **Temporal-aware grids**: date/time-aware chunk grids that layer over other axes (raised by @LDeakin) + +A `TiledDimension` prototype was built ([commit 9c0f582](https://github.com/maxrjones/zarr-python/commit/9c0f582f)) demonstrating that the per-dimension design supports this without changes to indexing or the codec pipeline. However, it was intentionally excluded from this release because: + +1. **Metadata format must come first.** Tile encoding requires a new `kind` value in the rectilinear spec (currently only `"inline"` is defined). This should go through [zarr-extensions#25](https://github.com/zarr-developers/zarr-extensions/pull/25), not zarr-python unilaterally. +2. **The per-dimension architecture doesn't preclude it.** A future `TiledDimension` can implement the `DimensionGrid` protocol alongside `FixedDimension` and `VaryingDimension` with no changes to indexing, codecs, or the `ChunkGrid` class. +3. **RLE covers the MVP.** Most real-world variable chunk patterns (HPC boundaries, irregular partitions) are efficiently encoded with RLE. Tile encoding is an optimization for a specific (temporal) subset. + ### Deferred: Metadata / Array separation An earlier design doc proposed decoupling `ChunkGrid` (behavioral) from `ArrayV3Metadata` (data), so that metadata would store only a plain dict and the array layer would construct the `ChunkGrid`. 
This was deferred because: From 73164b6ced66037596803fe5fb23fe536eee808a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:14:58 -0400 Subject: [PATCH 063/118] Add to design doc --- docs/design/chunk-grid.md | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 1cf6a1349c..d3d92bcb19 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -408,10 +408,40 @@ For `VaryingDimension`, `chunk_size == data_size` when `extent == sum(edges)`. W There is no known chunk grid outside the rectilinear family that retains the tessellation properties zarr-python assumes. A `match` on the grid name is sufficient. +### Why a single class instead of RegularChunkGrid + RectilinearChunkGrid? + +[Discussed in #3534.](https://github.com/zarr-developers/zarr-python/pull/3534) @d-v-b argued that `RegularChunkGrid` is unnecessary since rectilinear is more general; @dcherian argued that downstream libraries need a fast way to detect regular grids without inspecting potentially millions of chunk edges (see [xarray#9808](https://github.com/pydata/xarray/pull/9808)). + +The resolution: a single `ChunkGrid` class with an `is_regular` property (O(1), cached at construction). This gives downstream code the fast-path detection @dcherian needed without the class hierarchy complexity @d-v-b wanted to avoid. The metadata document's `name` field (`"regular"` vs `"rectilinear"`) is also available for clients who inspect JSON directly. + +A `RegularChunkGrid` deprecation shim preserves `isinstance` checks for existing code — see [Backwards compatibility](#backwards-compatibility). + ### Why a single class instead of a Protocol? All known grids are special cases of rectilinear. A Protocol-based approach means every caller programs against an abstract interface and adding a grid type requires implementing ~10 methods. 
A single class is simpler. If a genuinely novel grid type emerges, a Protocol can be extracted. +### Why `.chunks` raises for rectilinear grids + +[Debated in #3534.](https://github.com/zarr-developers/zarr-python/pull/3534) @d-v-b suggested making `.chunks` return `tuple[tuple[int, ...], ...]` (dask-style) for all grids. @dcherian strongly objected: every downstream consumer expects `tuple[int, ...]`, and silently returning a different type would be worse than raising. Materializing O(10M) chunk edges into a Python tuple is also a real performance risk ([xarray#8902](https://github.com/pydata/xarray/issues/8902#issuecomment-2546127373)). + +The resolution: +- `.chunks` is retained for regular grids (returns `tuple[int, ...]` as before) +- `.chunks` raises `NotImplementedError` for rectilinear grids with a message pointing to `.chunk_sizes` +- `.chunk_sizes` returns `tuple[tuple[int, ...], ...]` (dask convention) for all grids + +@maxrjones noted in review that deprecating `.chunks` for regular grids was not desirable. The current branch does not deprecate it. + +### User control over grid serialization format + +@d-v-b raised in #3534 that users need a way to say "these chunks are regular, but serialize as rectilinear" (e.g., to allow future append/extend workflows without format changes). @jhamman initially made nested-list input always produce `RectilinearChunkGrid`. + +The current branch resolves this via `chunk_grid_name: ChunkGridName` on `ArrayV3Metadata`. The name is stored internally for round-trip fidelity; it selects which spec-defined serialization form is emitted (the metadata document's `name` field) rather than adding any new field to the Zarr spec metadata. 
Current inference behavior: +- `chunks=(10, 20)` (flat tuple) → infers `"regular"` +- `chunks=[[10, 20], [5, 5]]` (nested lists with varying sizes) → infers `"rectilinear"` +- `chunks=[[10, 10], [20, 20]]` (nested lists with uniform sizes) → `from_rectilinear` collapses to `FixedDimension`, so `is_regular=True` and infers `"regular"` + +**Open question:** Should uniform nested lists preserve `"rectilinear"` to support future append workflows without a format change? This could be addressed by checking the input form before collapsing, or by allowing users to pass `chunk_grid_name` explicitly through the `create_array` API. + ### Deferred: Tiled/periodic chunk patterns [#3750 discussion](https://github.com/zarr-developers/zarr-python/issues/3750) identified periodic chunk patterns as a use case not efficiently served by RLE alone. RLE compresses runs of identical values (`np.repeat`), but periodic patterns like days-per-month (`[31, 28, 31, 30, ...]` repeated 30 years) need a tile encoding (`np.tile`). Real-world examples include: @@ -513,13 +543,17 @@ All four downstream PRs/issues follow the same pattern: **[Icechunk#1338](https://github.com/earth-mover/icechunk/issues/1338):** Minimal impact — format changes driven by spec, not class hierarchy. -**[cubed#876](https://github.com/cubed-dev/cubed/issues/876):** Switch store creation to `ChunkGrid` API. <1 day. +**[cubed#876](https://github.com/cubed-dev/cubed/issues/876):** Switch store creation to `ChunkGrid` API. <1 day. @tomwhite confirmed in #3534 that rechunking with variable-sized intermediate chunks works. + +**HEALPix use case:** @tinaok demonstrated in #3534 that variable-chunked arrays arise naturally when grouping HEALPix cells by parent pixel — the chunk sizes come from `np.unique(parents, return_counts=True)`. ## Open questions 1. **Resize defaults (deferred):** When growing a rectilinear array, should `resize()` accept an optional `chunks` parameter? 
See the [Resize section](#resize) for details and open design questions. Regular arrays already stay regular on resize. 2. **`ChunkSpec` complexity:** `ChunkSpec` carries both `slices` and `codec_shape`. Should the grid expose separate methods for codec vs data queries instead? 3. **`__getitem__` with slices:** Should `grid[0, :]` or `grid[0:3, :]` return a sub-grid or an iterator of `ChunkSpec`s? +4. **Uniform nested lists:** Should `chunks=[[10, 10], [20, 20]]` serialize as `"rectilinear"` (preserving user intent for future append) or `"regular"` (current behavior, collapses uniform edges)? See [User control over grid serialization format](#user-control-over-grid-serialization-format). +5. **`zarr.open` with rectilinear:** @tomwhite noted in #3534 that `zarr.open(mode="w")` doesn't support rectilinear chunks directly. This could be addressed in a follow-up. ### Resolved From aec0abdcc60af50a932f41972eae26eabb36a2e0 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:26:34 -0400 Subject: [PATCH 064/118] Add design principles --- docs/design/chunk-grid.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index d3d92bcb19..4dcc2b5558 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -27,9 +27,20 @@ There is no known chunk grid that is both (a) more general than rectilinear and A registry-based plugin system adds complexity without clear benefit. +## Goals + +1. **Follow the zarr extension proposal.** The implementation should conform to the [rectilinear chunk grid spec](https://github.com/zarr-developers/zarr-extensions/pull/25), not innovate on the metadata format. +2. **Minimize changes to the public API.** Users creating regular arrays should see no difference. Rectilinear is additive. +3. 
**Maintain backwards compatibility.** Existing code using `RegularChunkGrid`, `.chunks`, or `isinstance` checks should continue to work (with deprecation warnings where appropriate). +4. **Design for future iteration.** The internal architecture should allow refactoring (e.g., metadata/array separation, new dimension types) without breaking the public API. +5. **Minimize downstream changes.** xarray, VirtualiZarr, Icechunk, Cubed, etc. should need minimal updates. +6. **Minimize time to stable release.** Ship behind a feature flag, stabilize through real-world usage, promote to stable API. +7. **The new API should be useful.** `chunk_sizes`, `ChunkGrid.__getitem__`, `is_regular` — these should solve real problems, not just expose internals. +8. **Extensible for other serialization structures.** The per-dimension design should support future encodings (tile, temporal) without changes to indexing or codecs. + ## Design -### Principles +### Design choices 1. **A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern — the specific partition of a specific array. The grid stores enough information to answer any question about any chunk without external parameters. 2. **One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. The serialization format (`"regular"` vs `"rectilinear"`) is chosen by the metadata layer, not the grid. 
From abb9d9de92d64b477911992261b8ebda68e5a71e Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:36:08 -0400 Subject: [PATCH 065/118] Polish design doc --- docs/design/chunk-grid.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 4dcc2b5558..7d4fdfde00 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -427,9 +427,13 @@ The resolution: a single `ChunkGrid` class with an `is_regular` property (O(1), A `RegularChunkGrid` deprecation shim preserves `isinstance` checks for existing code — see [Backwards compatibility](#backwards-compatibility). -### Why a single class instead of a Protocol? +### Why is ChunkGrid a concrete class instead of a Protocol/ABC? -All known grids are special cases of rectilinear. A Protocol-based approach means every caller programs against an abstract interface and adding a grid type requires implementing ~10 methods. A single class is simpler. If a genuinely novel grid type emerges, a Protocol can be extracted. +The old design had `ChunkGrid` as an ABC with `RegularChunkGrid` as a subclass. #3534 added `RectilinearChunkGrid` as a second subclass. This branch makes `ChunkGrid` a single concrete class instead. + +All known grids are special cases of rectilinear, so there's no need for a class hierarchy at the grid level. A `ChunkGrid` Protocol/ABC would mean every caller programs against an abstract interface and adding a grid type requires implementing ~15 methods. A single class is simpler. + +Note: the *dimension* types (`FixedDimension`, `VaryingDimension`) do use a `DimensionGrid` Protocol — that's where the polymorphism lives. The grid-level class is concrete; the dimension-level types are polymorphic. If a genuinely novel grid type emerges that can't be expressed as a combination of per-dimension types, a grid-level Protocol can be extracted. 
### Why `.chunks` raises for rectilinear grids From aa002c8353cfb99eb1143f8c77fda5f8da97d4da Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:42:06 -0400 Subject: [PATCH 066/118] Update migration sequence --- docs/design/chunk-grid.md | 87 +++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 7d4fdfde00..33f1d3d4a6 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -489,42 +489,9 @@ The current design stores `chunk_grid: ChunkGrid` and `chunk_grid_name: str` on ## Migration -### Plan - -1. **Amend and merge #3735.** Keep the `chunk_grids/` module layout. Replace the registry with direct name dispatch. Remove `register_chunk_grid` / `get_chunk_grid_class` and the entrypoint. -2. **Open a new PR** implementing this prospectus: - - `FixedDimension`, `VaryingDimension`, `DimensionGrid` protocol, `ChunkSpec`, and `ChunkGrid` classes. - - `parse_chunk_grid(metadata, array_shape)` with `"regular"` and `"rectilinear"` dispatch. - - Port RLE helpers, `resolve_chunk_spec`, `ChunksLike`, and validation functions from #3534. - - Refactor per-dimension indexers to accept `FixedDimension | VaryingDimension`. - - Update `get_chunk_spec` to use `grid[chunk_coords].codec_shape`. - - Add `arr.chunk_sizes`. Keep `.chunks` for regular, raise for rectilinear. - - Remove the "sharding incompatible with rectilinear" guard. - - Adapt tests from #3534. -3. **Close trial PRs** with credits: - - **#3534** — RLE helpers, validation logic, chunk spec resolution, test cases, review discussion. - - **#3737** — extent-in-grid idea (adopted per-dimension). - - **#1483** — original POC; superseded by V3 implementation. - - **#3736** — resolved by storing extent per-dimension. -4. **Sharding v1.1** (separate PR, after zarr-specs#370) — remove `shard_shape % subchunk_shape == 0` validation. 
- -### Reusable components from #3534 - -| Component | Disposition | -|---|---| -| RLE encode/decode helpers | **Keep** | -| `_normalize_rectilinear_chunks` / `_parse_chunk_shapes` | **Keep** — feed into `VaryingDimension` | -| `resolve_chunk_spec` / `ChunksLike` | **Keep** | -| `_validate_zarr_format_compatibility` | **Keep** — rectilinear is V3-only | -| `_validate_sharding_compatibility` | **Remove** — sharding is compatible | -| `RectilinearChunkGrid` class | **Replace** | -| Indexing changes | **Insufficient** — `isinstance` guards remain | - -A **fresh PR** is more practical than adapting #3534's 5700-line diff. - ### Backwards compatibility -A `RegularChunkGrid` shim is provided for downstream code that imports or type-checks against the old class: +A `RegularChunkGrid` deprecation shim preserves the three common usage patterns: ```python from zarr.core.chunk_grids import RegularChunkGrid # works (no ImportError) @@ -540,8 +507,6 @@ The shim uses `chunk_shape` as extent (matching the old shape-unaware behavior). ### Downstream migration -All four downstream PRs/issues follow the same pattern: - | Two-class pattern | Unified pattern | |---|---| | `isinstance(cg, RegularChunkGrid)` | `cg.is_regular` (or keep `isinstance` — shim handles it) | @@ -552,16 +517,60 @@ All four downstream PRs/issues follow the same pattern: | `RectilinearChunkGrid(chunk_shapes=...)` | `ChunkGrid.from_rectilinear(edges, shape)` | | Feature detection via class import | Version check or `hasattr(ChunkGrid, 'is_regular')` | -**[xarray#10880](https://github.com/pydata/xarray/pull/10880):** Replace `isinstance` checks with `.is_regular`. Write path simplifies with `chunks=[[...]]` API. ~1–2 days. +**[xarray#10880](https://github.com/pydata/xarray/pull/10880):** Replace `isinstance` checks with `.is_regular`. Write path simplifies with `chunks=[[...]]` API. -**[VirtualiZarr#877](https://github.com/zarr-developers/VirtualiZarr/pull/877):** Drop vendored `_is_nested_sequence`. 
Replace `isinstance` checks. ~1–2 days. +**[VirtualiZarr#877](https://github.com/zarr-developers/VirtualiZarr/pull/877):** Drop vendored `_is_nested_sequence`. Replace `isinstance` checks. **[Icechunk#1338](https://github.com/earth-mover/icechunk/issues/1338):** Minimal impact — format changes driven by spec, not class hierarchy. -**[cubed#876](https://github.com/cubed-dev/cubed/issues/876):** Switch store creation to `ChunkGrid` API. <1 day. @tomwhite confirmed in #3534 that rechunking with variable-sized intermediate chunks works. +**[cubed#876](https://github.com/cubed-dev/cubed/issues/876):** Switch store creation to `ChunkGrid` API. @tomwhite confirmed in #3534 that rechunking with variable-sized intermediate chunks works. **HEALPix use case:** @tinaok demonstrated in #3534 that variable-chunked arrays arise naturally when grouping HEALPix cells by parent pixel — the chunk sizes come from `np.unique(parents, return_counts=True)`. +### Credits + +This implementation builds on prior work: + +- **[#3534](https://github.com/zarr-developers/zarr-python/pull/3534)** (@jhamman) — RLE helpers, validation logic, test cases, and the review discussion that shaped the architecture. +- **[#3737](https://github.com/zarr-developers/zarr-python/pull/3737)** — extent-in-grid idea (adopted per-dimension). +- **[#1483](https://github.com/zarr-developers/zarr-python/pull/1483)** — original variable chunking POC. +- **[#3736](https://github.com/zarr-developers/zarr-python/pull/3736)** — resolved by storing extent per-dimension. + +### Suggested PR sequence + +If the design is accepted, the POC branch can be split into 5 incremental PRs. PRs 1–2 are where the design decisions are reviewed; PRs 3–5 are mechanical consequences. 
+ +**PR 1: Per-dimension types + ChunkSpec** (purely additive) +- `FixedDimension`, `VaryingDimension`, `DimensionGrid` protocol, `ChunkSpec` +- RLE helpers (`_expand_rle`, `_compress_rle`, `_decode_dim_spec`) +- `ChunkGridName` type alias +- Unit tests for all new types +- Zero changes to existing code + +**PR 2: Unified ChunkGrid class + serialization** (replaces hierarchy) +- `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `__iter__`, `all_chunk_coords`, `is_regular`, `chunk_shape`, `chunk_sizes`, `unique_edge_lengths` +- `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()` +- `RegularChunkGrid` deprecation shim +- `chunk_grid_name: ChunkGridName` on `ArrayV3Metadata` +- Feature flag (`array.rectilinear_chunks`) + +**PR 3: Indexing generalization** +- Replace `dim_chunk_len: int` with `dim_grid: DimensionGrid` in all per-dimension indexers +- Vectorized `indices_to_chunks()` in `IntArrayDimIndexer` and `CoordinateIndexer` + +**PR 4: Array, codec pipeline, and sharding integration** +- Wire `ChunkGrid` into `create_array` / `init_array` +- `get_chunk_spec()` → `grid[chunk_coords].codec_shape` +- Sharding validation via `dim.unique_edge_lengths` +- `arr.chunk_sizes`, `from_array` with `chunks="keep"`, resize support +- Hypothesis strategies for rectilinear grids + +**PR 5: End-to-end tests + docs** +- Full pipeline tests (create → write → read → verify) +- V2 backwards compatibility regression tests +- Boundary/overflow/edge case tests +- Design doc and user guide updates + ## Open questions 1. **Resize defaults (deferred):** When growing a rectilinear array, should `resize()` accept an optional `chunks` parameter? See the [Resize section](#resize) for details and open design questions. Regular arrays already stay regular on resize. 
From fffe4da63638bd6059670760c8c718ee9a22bd0a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:43:05 -0400 Subject: [PATCH 067/118] Remove stale sections --- docs/design/chunk-grid.md | 43 --------------------------------------- 1 file changed, 43 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 33f1d3d4a6..39bafc5dda 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -579,10 +579,6 @@ If the design is accepted, the POC branch can be split into 5 incremental PRs. P 4. **Uniform nested lists:** Should `chunks=[[10, 10], [20, 20]]` serialize as `"rectilinear"` (preserving user intent for future append) or `"regular"` (current behavior, collapses uniform edges)? See [User control over grid serialization format](#user-control-over-grid-serialization-format). 5. **`zarr.open` with rectilinear:** @tomwhite noted in #3534 that `zarr.open(mode="w")` doesn't support rectilinear chunks directly. This could be addressed in a follow-up. -### Resolved - -4. ~~**Rectilinear + sharding:** The current POC raises `ValueError` for rectilinear chunks with sharding. When should this be relaxed?~~ **Resolved.** Sharding now validates divisibility polymorphically via `dim.unique_edge_lengths`. - ## Proofs of concepts - Zarr-Python: @@ -599,42 +595,3 @@ If the design is accepted, the POC branch can be split into 5 incremental PRs. P - diff - https://github.com/virtual-zarr/virtual-tiff/compare/main...poc/unified-chunk-grid?expand=1 - Microbenchmarks: - https://github.com/maxrjones/zarr-chunk-grid-tests/tree/unified-chunk-grid -## Breaking POC into reviewable PRs - -### PR 1: Per-dimension grid types and `ChunkSpec` (pure additions) - -**Files**: `chunk_grids.py` (new types only) -**Scope**: Add `FixedDimension`, `VaryingDimension`, `DimensionGrid` protocol, `ChunkSpec`, and RLE helpers (`_expand_rle`, `_compress_rle`). Unit tests for these types. 
No existing code changes — purely additive. - -### PR 2: Unified `ChunkGrid` class (replaces old hierarchy) - -**Files**: `chunk_grids.py` (new `ChunkGrid` class + `RegularChunkGrid` deprecation shim) -**Scope**: New `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `all_chunk_coords()` (no shape arg), `is_regular`, `chunk_shape`, `chunk_sizes`. Add `RegularChunkGrid` deprecation shim (metaclass-based `isinstance` support, `DeprecationWarning` on construction). Add `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()`. Tests for the grid class itself. - -### PR 3: Indexing generalization - -**Files**: `indexing.py` -**Scope**: Refactor `IntDimIndexer`, `SliceDimIndexer`, `BoolArrayDimIndexer`, `BasicIndexer`, `OrthogonalIndexer`, `CoordinateIndexer` to accept `DimensionGrid` instead of `dim_chunk_len: int`. Replace `get_chunk_shape()` calls with `_get_dim_grids()`. Tests for indexing with both regular and rectilinear grids. - -### PR 4: Metadata and array integration - -**Files**: `metadata/v3.py`, `metadata/v2.py`, `array.py`, `group.py`, `api/synchronous.py` -**Scope**: Wire the new `ChunkGrid` into `ArrayV3Metadata` (add `chunk_grid_name`, use `serialize_chunk_grid` in `to_dict`, use `parse_chunk_grid` in constructor). Update `init_array`/`create_array` to accept rectilinear chunks. Update `_resize` to guard against rectilinear grids. - -### PR 5: Sharding codec compatibility - -**Files**: `codecs/sharding.py` -**Scope**: Update `ShardingCodec.validate` to handle rectilinear outer grids (validate every chunk is divisible). Replace `RegularChunkGrid(chunk_shape=...)` calls with `ChunkGrid.from_regular(...)`. - -### PR 6: End-to-end tests - -**Files**: `tests/test_unified_chunk_grid.py`, updates to `tests/test_array.py`, `tests/test_indexing.py` -**Scope**: Full integration tests — round-trip create/read/write with rectilinear arrays, serialization fidelity, hypothesis strategies. 
- -## Notes - -- PRs 1–2 are purely additive and low-risk. -- PR 3 is the biggest behavioral change. -- PRs 4–5 wire things together. -- PR 6 adds comprehensive test coverage. -- Each PR builds on the previous but is independently reviewable. From 6777ec5e5279d2340c66c07d1a871eb10685a8f9 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 16:06:17 -0400 Subject: [PATCH 068/118] Use TypeGuard --- src/zarr/core/array.py | 17 ++++++----------- src/zarr/core/chunk_grids.py | 4 ++-- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ebe5ec62bf..3a626e65a5 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4718,9 +4718,8 @@ async def init_array( rectilinear_grid: ChunkGrid | None = None rectilinear_shards = _is_rectilinear_chunks(shards) - rectilinear_chunks = _is_rectilinear_chunks(chunks) - if rectilinear_chunks: + if _is_rectilinear_chunks(chunks): if zarr_format == 2: raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") if shards is not None: @@ -4729,25 +4728,21 @@ async def init_array( "Use rectilinear shards instead: " "chunks=(inner_size, ...), shards=[[shard_sizes], ...]" ) - rect_chunks = cast("Sequence[Sequence[int]]", chunks) - rectilinear_grid = ChunkGrid.from_rectilinear(rect_chunks, array_shape=shape_parsed) + rectilinear_grid = ChunkGrid.from_rectilinear(chunks, array_shape=shape_parsed) # Use first chunk size per dim as placeholder for _auto_partition - chunks_flat: tuple[int, ...] | Literal["auto"] = tuple( - dim_edges[0] for dim_edges in rect_chunks - ) + chunks_flat: tuple[int, ...] | Literal["auto"] = tuple(dim_edges[0] for dim_edges in chunks) else: chunks_flat = cast("tuple[int, ...] 
| Literal['auto']", chunks) # Handle rectilinear shards: shards=[[60, 40, 20], [50, 50]] # means variable-sized shard boundaries with uniform inner chunks shards_for_partition: ShardsLike | None = shards - if rectilinear_shards: + if _is_rectilinear_chunks(shards): if zarr_format == 2: raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") - rect_shards = cast("Sequence[Sequence[int]]", shards) - rectilinear_grid = ChunkGrid.from_rectilinear(rect_shards, array_shape=shape_parsed) + rectilinear_grid = ChunkGrid.from_rectilinear(shards, array_shape=shape_parsed) # Use first shard size per dim as placeholder for _auto_partition - shards_for_partition = tuple(dim_edges[0] for dim_edges in rect_shards) + shards_for_partition = tuple(dim_edges[0] for dim_edges in shards) item_size = 1 if isinstance(zdtype, HasItemSize): diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index f41e94f9e1..25d37955e4 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -9,7 +9,7 @@ from collections.abc import Iterable, Sequence from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any, Literal, Protocol, cast, runtime_checkable +from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeGuard, cast, runtime_checkable import numpy as np import numpy.typing as npt @@ -379,7 +379,7 @@ def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[in ChunksLike = tuple[int, ...] | list[list[int] | int] | int -def _is_rectilinear_chunks(chunks: Any) -> bool: +def _is_rectilinear_chunks(chunks: Any) -> TypeGuard[Sequence[Sequence[int]]]: """Check if chunks is a nested sequence (e.g. [[10, 20], [5, 5]]). Returns True for inputs like [[10, 20], [5, 5]] or [(10, 20), (5, 5)]. 
From adec422a33c8b18a12ce153648e43bf5045dc003 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 16:13:23 -0400 Subject: [PATCH 069/118] Cache nchunks --- src/zarr/core/chunk_grids.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 25d37955e4..aa426c8b4e 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -7,7 +7,7 @@ import operator import warnings from collections.abc import Iterable, Sequence -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import reduce from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeGuard, cast, runtime_checkable @@ -43,18 +43,17 @@ class FixedDimension: size: int # chunk edge length (>= 0) extent: int # array dimension length + nchunks: int = field(init=False, repr=False) def __post_init__(self) -> None: if self.size < 0: raise ValueError(f"FixedDimension size must be >= 0, got {self.size}") if self.extent < 0: raise ValueError(f"FixedDimension extent must be >= 0, got {self.extent}") - - @property - def nchunks(self) -> int: if self.size == 0: - return 1 if self.extent == 0 else 0 - return ceildiv(self.extent, self.size) + object.__setattr__(self, "nchunks", 1 if self.extent == 0 else 0) + else: + object.__setattr__(self, "nchunks", ceildiv(self.extent, self.size)) def index_to_chunk(self, idx: int) -> int: if idx < 0: @@ -65,6 +64,13 @@ def index_to_chunk(self, idx: int) -> int: return 0 return idx // self.size + # Bounds checking: all callers (ChunkGrid.__getitem__, indexers) validate + # chunk indices before calling these methods, so the checks here are + # redundant on the hot path. They are retained for safety when methods + # are called directly. 
If profiling shows this overhead matters, the + # checks can be removed — VaryingDimension gets natural IndexError from + # tuple indexing, and FixedDimension would silently return wrong values. + def _check_chunk_ix(self, chunk_ix: int) -> None: if chunk_ix < 0 or chunk_ix >= self.nchunks: raise IndexError( From 4903b09c14490cf16c3c65ae193ff3760b158e9d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 19:30:51 -0400 Subject: [PATCH 070/118] Add cubed example --- docs/design/chunk-grid.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 39bafc5dda..eca8025fcb 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -593,5 +593,7 @@ If the design is accepted, the POC branch can be split into 5 incremental PRs. P - Virtual TIFF: - branch - https://github.com/virtual-zarr/virtual-tiff/tree/poc/unified-chunk-grid - diff - https://github.com/virtual-zarr/virtual-tiff/compare/main...poc/unified-chunk-grid?expand=1 +- Cubed: + - branch - https://github.com/maxrjones/cubed/tree/poc/unified-chunk-grid - Microbenchmarks: - https://github.com/maxrjones/zarr-chunk-grid-tests/tree/unified-chunk-grid From 67e540c5e868f0c4f06673e3adc2469519fab220 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Fri, 20 Mar 2026 20:57:39 +0100 Subject: [PATCH 071/118] move chunk grid off metadata (#6) --- src/zarr/abc/codec.py | 14 +- src/zarr/codecs/sharding.py | 24 +- src/zarr/codecs/transpose.py | 4 +- src/zarr/core/array.py | 167 ++++++++++---- src/zarr/core/chunk_grids.py | 21 ++ src/zarr/core/codec_pipeline.py | 4 +- src/zarr/core/metadata/v2.py | 6 - src/zarr/core/metadata/v3.py | 337 ++++++++++++++++++++++------- src/zarr/testing/strategies.py | 30 ++- tests/test_array.py | 10 +- tests/test_codecs/test_sharding.py | 5 +- tests/test_indexing.py | 10 +- 12 files changed, 455 insertions(+), 177 deletions(-) diff --git a/src/zarr/abc/codec.py 
b/src/zarr/abc/codec.py index 3ec5ec522b..f43430b055 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -17,10 +17,10 @@ from zarr.abc.store import ByteGetter, ByteSetter, Store from zarr.core.array_spec import ArraySpec - from zarr.core.chunk_grids import ChunkGrid from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from zarr.core.indexing import SelectorTuple from zarr.core.metadata import ArrayMetadata + from zarr.core.metadata.v3 import ChunkGridMetadata __all__ = [ "ArrayArrayCodec", @@ -140,7 +140,7 @@ def validate( *, shape: tuple[int, ...], dtype: ZDType[TBaseDType, TBaseScalar], - chunk_grid: ChunkGrid, + chunk_grid: ChunkGridMetadata, ) -> None: """Validates that the codec configuration is compatible with the array metadata. Raises errors when the codec configuration is not compatible. @@ -151,8 +151,8 @@ def validate( The array shape dtype : np.dtype[Any] The array data type - chunk_grid : ChunkGrid - The array chunk grid + chunk_grid : ChunkGridMetadata + The array chunk grid metadata """ async def _decode_single(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: @@ -357,7 +357,7 @@ def validate( *, shape: tuple[int, ...], dtype: ZDType[TBaseDType, TBaseScalar], - chunk_grid: ChunkGrid, + chunk_grid: ChunkGridMetadata, ) -> None: """Validates that all codec configurations are compatible with the array metadata. Raises errors when a codec configuration is not compatible. @@ -368,8 +368,8 @@ def validate( The array shape dtype : np.dtype[Any] The array data type - chunk_grid : ChunkGrid - The array chunk grid + chunk_grid : ChunkGridMetadata + The array chunk grid metadata """ ... 
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 6497386ed4..43f48cd87e 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -52,7 +52,12 @@ get_indexer, morton_order_iter, ) -from zarr.core.metadata.v3 import parse_codecs +from zarr.core.metadata.v3 import ( + ChunkGridMetadata, + RectilinearChunkGrid, + RegularChunkGrid, + parse_codecs, +) from zarr.registry import get_ndbuffer_class, get_pipeline_class from zarr.storage._utils import _normalize_byte_range_index @@ -381,16 +386,13 @@ def validate( *, shape: tuple[int, ...], dtype: ZDType[TBaseDType, TBaseScalar], - chunk_grid: ChunkGrid, + chunk_grid: ChunkGridMetadata, ) -> None: if len(self.chunk_shape) != len(shape): raise ValueError( "The shard's `chunk_shape` and array's `shape` need to have the same number of dimensions." ) - # Sharding works with both regular and rectilinear outer chunk grids. - # Each shard is self-contained — the ShardingCodec constructs an independent - # inner ChunkGrid per shard using the shard shape and subchunk shape. - if chunk_grid.is_regular: + if isinstance(chunk_grid, RegularChunkGrid): if not all( s % c == 0 for s, c in zip( @@ -403,15 +405,13 @@ def validate( f"The array's `chunk_shape` (got {chunk_grid.chunk_shape}) " f"needs to be divisible by the shard's inner `chunk_shape` (got {self.chunk_shape})." ) - else: + elif isinstance(chunk_grid, RectilinearChunkGrid): # For rectilinear grids, every unique edge length per dimension # must be divisible by the corresponding inner chunk size. - # unique_edge_lengths is a lazy generator that short-circuits - # deduplication, and we short-circuit on the first failure. 
- for i, (dim, inner) in enumerate( - zip(chunk_grid.dimensions, self.chunk_shape, strict=False) + for i, (edges, inner) in enumerate( + zip(chunk_grid.chunk_shapes, self.chunk_shape, strict=False) ): - for edge in dim.unique_edge_lengths: + for edge in set(edges): if edge % inner != 0: raise ValueError( f"Chunk edge length {edge} in dimension {i} is not " diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 609448a59c..5756fba2b4 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -14,8 +14,8 @@ from typing import Self from zarr.core.buffer import NDBuffer - from zarr.core.chunk_grids import ChunkGrid from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + from zarr.core.metadata.v3 import ChunkGridMetadata def parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: @@ -51,7 +51,7 @@ def validate( self, shape: tuple[int, ...], dtype: ZDType[TBaseDType, TBaseScalar], - chunk_grid: ChunkGrid, + chunk_grid: ChunkGridMetadata, ) -> None: if len(self.order) != len(shape): raise ValueError( diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 3a626e65a5..a675406e60 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -30,7 +30,7 @@ from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -120,7 +120,12 @@ parse_compressor, parse_filters, ) -from zarr.core.metadata.v3 import parse_node_type_array +from zarr.core.metadata.v3 import ( + ChunkGridMetadata, + RectilinearChunkGrid, + RegularChunkGrid, + parse_node_type_array, +) from zarr.core.sync import sync from zarr.errors import ( ArrayNotFoundError, @@ -306,11 
+311,14 @@ class AsyncArray(Generic[T_ArrayMetadata]): The codec pipeline used for encoding and decoding chunks. config : ArrayConfig The runtime configuration of the array. + chunk_grid : ChunkGrid + The behavioral chunk grid bound to this array's shape. """ metadata: T_ArrayMetadata store_path: StorePath codec_pipeline: CodecPipeline = field(init=False) + chunk_grid: ChunkGrid = field(init=False) config: ArrayConfig @overload @@ -341,6 +349,7 @@ def __init__( object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) object.__setattr__(self, "config", config_parsed) + object.__setattr__(self, "chunk_grid", ChunkGrid.from_metadata(metadata_parsed)) object.__setattr__( self, "codec_pipeline", @@ -748,7 +757,7 @@ def _create_metadata_v3( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: DimensionNames = None, attributes: dict[str, JSON] | None = None, - chunk_grid: ChunkGrid | None = None, + chunk_grid: ChunkGridMetadata | None = None, ) -> ArrayV3Metadata: """ Create an instance of ArrayV3Metadata. @@ -781,14 +790,15 @@ def _create_metadata_v3( else: fill_value_parsed = fill_value - if chunk_grid is not None: - chunk_grid_parsed: ChunkGrid = chunk_grid - else: - chunk_grid_parsed = ChunkGrid.from_regular(shape, chunk_shape) + chunk_grid_meta = ( + chunk_grid + if chunk_grid is not None + else RegularChunkGrid(chunk_shape=parse_shapelike(chunk_shape)) + ) return ArrayV3Metadata( shape=shape, data_type=dtype, - chunk_grid=chunk_grid_parsed, + chunk_grid=chunk_grid_meta, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, codecs=codecs_parsed, # type: ignore[arg-type] @@ -1062,6 +1072,7 @@ def chunks(self) -> tuple[int, ...]: tuple[int, ...]: The chunk shape of the Array. 
""" + # TODO: move sharding awareness out of metadata return self.metadata.chunks @property @@ -1087,7 +1098,7 @@ def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: >>> arr.chunk_sizes ((10, 20, 30), (50, 50)) """ - return self.metadata.chunk_grid.chunk_sizes + return self.chunk_grid.chunk_sizes @property def shards(self) -> tuple[int, ...] | None: @@ -1302,7 +1313,7 @@ def _chunk_grid_shape(self) -> tuple[int, ...]: # When sharding, count inner chunks across the whole array chunk_shape = codecs[0].chunk_shape return tuple(starmap(ceildiv, zip(self.shape, chunk_shape, strict=True))) - return self.metadata.chunk_grid.shape + return self.chunk_grid.shape @property def _shard_grid_shape(self) -> tuple[int, ...]: @@ -1630,6 +1641,7 @@ async def _get_selection( self.metadata, self.codec_pipeline, self.config, + self.chunk_grid, indexer, prototype=prototype, out=out, @@ -1684,6 +1696,7 @@ async def example(): self.metadata, self.codec_pipeline, self.config, + self.chunk_grid, selection, prototype=prototype, ) @@ -1701,6 +1714,7 @@ async def get_orthogonal_selection( self.metadata, self.codec_pipeline, self.config, + self.chunk_grid, selection, out=out, fields=fields, @@ -1720,6 +1734,7 @@ async def get_mask_selection( self.metadata, self.codec_pipeline, self.config, + self.chunk_grid, mask, out=out, fields=fields, @@ -1739,6 +1754,7 @@ async def get_coordinate_selection( self.metadata, self.codec_pipeline, self.config, + self.chunk_grid, selection, out=out, fields=fields, @@ -1764,6 +1780,7 @@ async def _set_selection( self.metadata, self.codec_pipeline, self.config, + self.chunk_grid, indexer, value, prototype=prototype, @@ -1814,6 +1831,7 @@ async def setitem( self.metadata, self.codec_pipeline, self.config, + self.chunk_grid, selection, value, prototype=prototype, @@ -1970,7 +1988,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - chunk_shape = self.chunks if 
self.metadata.chunk_grid.is_regular else None + chunk_shape = self.chunks if self.chunk_grid.is_regular else None return ArrayInfo( _zarr_format=self.metadata.zarr_format, _data_type=self._zdtype, @@ -2022,6 +2040,11 @@ def config(self) -> ArrayConfig: """ return self.async_array.config + @property + def chunk_grid(self) -> ChunkGrid: + """The behavioral chunk grid for this array, bound to the array's shape.""" + return self.async_array.chunk_grid + @classmethod @deprecated("Use zarr.create_array instead.", category=ZarrDeprecationWarning) def create( @@ -3130,7 +3153,7 @@ def get_basic_selection( prototype = default_buffer_prototype() return sync( self.async_array._get_selection( - BasicIndexer(selection, self.shape, self.metadata.chunk_grid), + BasicIndexer(selection, self.shape, self.chunk_grid), out=out, fields=fields, prototype=prototype, @@ -3237,7 +3260,7 @@ def set_basic_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BasicIndexer(selection, self.shape, self.chunk_grid) sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_orthogonal_selection( @@ -3365,7 +3388,7 @@ def get_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self.chunk_grid) return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3484,7 +3507,7 @@ def set_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self.chunk_grid) return sync( self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype) ) @@ -3572,7 +3595,7 @@ def 
get_mask_selection( if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self.chunk_grid) return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3662,7 +3685,7 @@ def set_mask_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self.chunk_grid) sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_coordinate_selection( @@ -3750,7 +3773,7 @@ def get_coordinate_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self.chunk_grid) out_array = sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3843,7 +3866,7 @@ def set_coordinate_selection( if prototype is None: prototype = default_buffer_prototype() # setup indexer - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self.chunk_grid) # handle value - need ndarray-like flatten value if not is_scalar(value, self.dtype): @@ -3965,7 +3988,7 @@ def get_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self.chunk_grid) return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -4066,7 +4089,7 @@ def set_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BlockIndexer(selection, 
self.shape, self.chunk_grid) sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @property @@ -4716,7 +4739,7 @@ async def init_array( # Detect rectilinear (nested list) chunks or shards, e.g. [[10, 20, 30], [25, 25]] from zarr.core.chunk_grids import _is_rectilinear_chunks - rectilinear_grid: ChunkGrid | None = None + rectilinear_meta: RectilinearChunkGrid | None = None rectilinear_shards = _is_rectilinear_chunks(shards) if _is_rectilinear_chunks(chunks): @@ -4728,7 +4751,9 @@ async def init_array( "Use rectilinear shards instead: " "chunks=(inner_size, ...), shards=[[shard_sizes], ...]" ) - rectilinear_grid = ChunkGrid.from_rectilinear(chunks, array_shape=shape_parsed) + rectilinear_meta = RectilinearChunkGrid( + chunk_shapes=tuple(tuple(dim_edges) for dim_edges in chunks) + ) # Use first chunk size per dim as placeholder for _auto_partition chunks_flat: tuple[int, ...] | Literal["auto"] = tuple(dim_edges[0] for dim_edges in chunks) else: @@ -4740,7 +4765,9 @@ async def init_array( if _is_rectilinear_chunks(shards): if zarr_format == 2: raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") - rectilinear_grid = ChunkGrid.from_rectilinear(shards, array_shape=shape_parsed) + rectilinear_meta = RectilinearChunkGrid( + chunk_shapes=tuple(tuple(dim_edges) for dim_edges in shards) + ) # Use first shard size per dim as placeholder for _auto_partition shards_for_partition = tuple(dim_edges[0] for dim_edges in shards) @@ -4808,10 +4835,10 @@ async def init_array( chunk_shape=chunk_shape_parsed, codecs=sub_codecs, index_location=index_location ) # Use rectilinear grid for validation when shards are rectilinear - if rectilinear_shards and rectilinear_grid is not None: - validation_grid = rectilinear_grid + if rectilinear_shards and rectilinear_meta is not None: + validation_grid: ChunkGridMetadata = rectilinear_meta else: - validation_grid = ChunkGrid.from_regular(shape_parsed, shard_shape_parsed) + validation_grid 
= RegularChunkGrid(chunk_shape=shard_shape_parsed) sharding_codec.validate( shape=chunk_shape_parsed, dtype=zdtype, @@ -4835,7 +4862,7 @@ async def init_array( codecs=codecs_out, dimension_names=dimension_names, attributes=attributes, - chunk_grid=rectilinear_grid, + chunk_grid=rectilinear_meta, ) arr = AsyncArray(metadata=meta, store_path=store_path, config=config) @@ -5063,12 +5090,12 @@ def _parse_keep_array_attr( ]: if isinstance(data, Array): if chunks == "keep": - if data.metadata.chunk_grid.is_regular: + if data.chunk_grid.is_regular: chunks = data.chunks else: chunks = data.chunk_sizes if shards == "keep": - shards = data.shards if data.metadata.chunk_grid.is_regular else None + shards = data.shards if data.chunk_grid.is_regular else None if zarr_format is None: zarr_format = data.metadata.zarr_format if filters == "keep": @@ -5577,9 +5604,7 @@ def _iter_chunk_regions( A tuple of slice objects representing the region spanned by each shard in the selection. """ - return array.metadata.chunk_grid.iter_chunk_regions( - origin=origin, selection_shape=selection_shape - ) + return array.chunk_grid.iter_chunk_regions(origin=origin, selection_shape=selection_shape) async def _nchunks_initialized( @@ -5652,11 +5677,32 @@ async def _nbytes_stored( return await store_path.store.getsize_prefix(store_path.path) +def _get_chunk_spec( + metadata: ArrayMetadata, + chunk_grid: ChunkGrid, + chunk_coords: tuple[int, ...], + array_config: ArrayConfig, + prototype: BufferPrototype, +) -> ArraySpec: + """Build an ArraySpec for a single chunk using the behavioral ChunkGrid.""" + spec = chunk_grid[chunk_coords] + if spec is None: + raise IndexError(f"Chunk coordinates {chunk_coords} are out of bounds.") + return ArraySpec( + shape=spec.codec_shape, + dtype=metadata.dtype, + fill_value=metadata.fill_value, + config=array_config, + prototype=prototype, + ) + + async def _get_selection( store_path: StorePath, metadata: ArrayMetadata, codec_pipeline: CodecPipeline, config: 
ArrayConfig, + chunk_grid: ChunkGrid, indexer: Indexer, *, prototype: BufferPrototype, @@ -5733,7 +5779,7 @@ async def _get_selection( [ ( store_path / metadata.encode_chunk_key(chunk_coords), - metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), + _get_chunk_spec(metadata, chunk_grid, chunk_coords, _config, prototype), chunk_selection, out_selection, is_complete_chunk, @@ -5753,6 +5799,7 @@ async def _getitem( metadata: ArrayMetadata, codec_pipeline: CodecPipeline, config: ArrayConfig, + chunk_grid: ChunkGrid, selection: BasicSelection, *, prototype: BufferPrototype | None = None, @@ -5770,6 +5817,8 @@ async def _getitem( The codec pipeline for encoding/decoding. config : ArrayConfig The array configuration. + chunk_grid : ChunkGrid + The behavioral chunk grid. selection : BasicSelection A selection object specifying the subset of data to retrieve. prototype : BufferPrototype, optional @@ -5785,10 +5834,10 @@ async def _getitem( indexer = BasicIndexer( selection, shape=metadata.shape, - chunk_grid=metadata.chunk_grid, + chunk_grid=chunk_grid, ) return await _get_selection( - store_path, metadata, codec_pipeline, config, indexer, prototype=prototype + store_path, metadata, codec_pipeline, config, chunk_grid, indexer, prototype=prototype ) @@ -5797,6 +5846,7 @@ async def _get_orthogonal_selection( metadata: ArrayMetadata, codec_pipeline: CodecPipeline, config: ArrayConfig, + chunk_grid: ChunkGrid, selection: OrthogonalSelection, *, out: NDBuffer | None = None, @@ -5816,6 +5866,8 @@ async def _get_orthogonal_selection( The codec pipeline for encoding/decoding. config : ArrayConfig The array configuration. + chunk_grid : ChunkGrid + The behavioral chunk grid. selection : OrthogonalSelection The orthogonal selection specification. 
out : NDBuffer | None, optional @@ -5832,12 +5884,13 @@ async def _get_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, metadata.shape, metadata.chunk_grid) + indexer = OrthogonalIndexer(selection, metadata.shape, chunk_grid) return await _get_selection( store_path, metadata, codec_pipeline, config, + chunk_grid, indexer=indexer, out=out, fields=fields, @@ -5850,6 +5903,7 @@ async def _get_mask_selection( metadata: ArrayMetadata, codec_pipeline: CodecPipeline, config: ArrayConfig, + chunk_grid: ChunkGrid, mask: MaskSelection, *, out: NDBuffer | None = None, @@ -5869,6 +5923,8 @@ async def _get_mask_selection( The codec pipeline for encoding/decoding. config : ArrayConfig The array configuration. + chunk_grid : ChunkGrid + The behavioral chunk grid. mask : MaskSelection The boolean mask specifying the selection. out : NDBuffer | None, optional @@ -5885,12 +5941,13 @@ async def _get_mask_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, metadata.shape, metadata.chunk_grid) + indexer = MaskIndexer(mask, metadata.shape, chunk_grid) return await _get_selection( store_path, metadata, codec_pipeline, config, + chunk_grid, indexer=indexer, out=out, fields=fields, @@ -5903,6 +5960,7 @@ async def _get_coordinate_selection( metadata: ArrayMetadata, codec_pipeline: CodecPipeline, config: ArrayConfig, + chunk_grid: ChunkGrid, selection: CoordinateSelection, *, out: NDBuffer | None = None, @@ -5922,6 +5980,8 @@ async def _get_coordinate_selection( The codec pipeline for encoding/decoding. config : ArrayConfig The array configuration. + chunk_grid : ChunkGrid + The behavioral chunk grid. selection : CoordinateSelection The coordinate selection specification. 
out : NDBuffer | None, optional @@ -5938,12 +5998,13 @@ async def _get_coordinate_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, metadata.shape, metadata.chunk_grid) + indexer = CoordinateIndexer(selection, metadata.shape, chunk_grid) out_array = await _get_selection( store_path, metadata, codec_pipeline, config, + chunk_grid, indexer=indexer, out=out, fields=fields, @@ -5961,6 +6022,7 @@ async def _set_selection( metadata: ArrayMetadata, codec_pipeline: CodecPipeline, config: ArrayConfig, + chunk_grid: ChunkGrid, indexer: Indexer, value: npt.ArrayLike, *, @@ -5980,6 +6042,8 @@ async def _set_selection( The codec pipeline for encoding/decoding. config : ArrayConfig The array configuration. + chunk_grid : ChunkGrid + The behavioral chunk grid. indexer : Indexer The indexer specifying the selection. value : npt.ArrayLike @@ -6043,7 +6107,7 @@ async def _set_selection( [ ( store_path / metadata.encode_chunk_key(chunk_coords), - metadata.get_chunk_spec(chunk_coords, _config, prototype), + _get_chunk_spec(metadata, chunk_grid, chunk_coords, _config, prototype), chunk_selection, out_selection, is_complete_chunk, @@ -6060,6 +6124,7 @@ async def _setitem( metadata: ArrayMetadata, codec_pipeline: CodecPipeline, config: ArrayConfig, + chunk_grid: ChunkGrid, selection: BasicSelection, value: npt.ArrayLike, prototype: BufferPrototype | None = None, @@ -6077,6 +6142,8 @@ async def _setitem( The codec pipeline for encoding/decoding. config : ArrayConfig The array configuration. + chunk_grid : ChunkGrid + The behavioral chunk grid. selection : BasicSelection The selection defining the region of the array to set. 
value : npt.ArrayLike @@ -6090,10 +6157,17 @@ async def _setitem( indexer = BasicIndexer( selection, shape=metadata.shape, - chunk_grid=metadata.chunk_grid, + chunk_grid=chunk_grid, ) return await _set_selection( - store_path, metadata, codec_pipeline, config, indexer, value, prototype=prototype + store_path, + metadata, + codec_pipeline, + config, + chunk_grid, + indexer, + value, + prototype=prototype, ) @@ -6119,14 +6193,15 @@ async def _resize( assert len(new_shape) == len(array.metadata.shape) new_metadata = array.metadata.update_shape(new_shape) + new_chunk_grid = ChunkGrid.from_metadata(new_metadata) # ensure deletion is only run if array is shrinking as the delete_outside_chunks path is unbounded in memory only_growing = all(new >= old for new, old in zip(new_shape, array.metadata.shape, strict=True)) if delete_outside_chunks and not only_growing: # Remove all chunks outside of the new shape - old_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords()) - new_chunk_coords = set(new_metadata.chunk_grid.all_chunk_coords()) + old_chunk_coords = set(array.chunk_grid.all_chunk_coords()) + new_chunk_coords = set(new_chunk_grid.all_chunk_coords()) async def _delete_key(key: str) -> None: await (array.store_path / key).delete() @@ -6143,8 +6218,9 @@ async def _delete_key(key: str) -> None: # Write new metadata await save_metadata(array.store_path, new_metadata) - # Update metadata (in place) + # Update metadata and chunk_grid (in place) object.__setattr__(array, "metadata", new_metadata) + object.__setattr__(array, "chunk_grid", new_chunk_grid) async def _append( @@ -6210,6 +6286,7 @@ async def _append( array.metadata, array.codec_pipeline, array.config, + array.chunk_grid, append_selection, data, ) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index aa426c8b4e..ad2f9d3d4f 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -29,6 +29,7 @@ from collections.abc import Iterator from zarr.core.array import 
ShardsLike + from zarr.core.metadata import ArrayMetadata # --------------------------------------------------------------------------- @@ -426,6 +427,26 @@ def __init__(self, *, dimensions: tuple[DimensionGrid, ...]) -> None: self, "_is_regular", all(isinstance(d, FixedDimension) for d in dimensions) ) + @classmethod + def from_metadata(cls, metadata: ArrayMetadata) -> ChunkGrid: + """Construct a behavioral ChunkGrid from array metadata. + + For v2 metadata, builds from shape and chunks. + For v3 metadata, dispatches on the chunk grid type. + """ + from zarr.core.metadata import ArrayV2Metadata + from zarr.core.metadata.v3 import RectilinearChunkGrid, RegularChunkGrid + + if isinstance(metadata, ArrayV2Metadata): + return cls.from_regular(metadata.shape, metadata.chunks) + chunk_grid_meta = metadata.chunk_grid + if isinstance(chunk_grid_meta, RegularChunkGrid): + return cls.from_regular(metadata.shape, chunk_grid_meta.chunk_shape) + elif isinstance(chunk_grid_meta, RectilinearChunkGrid): + return cls.from_rectilinear(chunk_grid_meta.chunk_shapes, metadata.shape) + else: + raise TypeError(f"Unknown chunk grid metadata type: {type(chunk_grid_meta)}") + @classmethod def from_regular(cls, array_shape: ShapeLike, chunk_shape: ShapeLike) -> ChunkGrid: """Create a ChunkGrid where all dimensions are fixed (regular).""" diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index fd557ac43e..d370775d12 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -27,8 +27,8 @@ from zarr.abc.store import ByteGetter, ByteSetter from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer - from zarr.core.chunk_grids import ChunkGrid from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + from zarr.core.metadata.v3 import ChunkGridMetadata T = TypeVar("T") U = TypeVar("U") @@ -138,7 +138,7 @@ def validate( *, shape: tuple[int, ...], dtype: ZDType[TBaseDType, 
TBaseScalar], - chunk_grid: ChunkGrid, + chunk_grid: ChunkGridMetadata, ) -> None: for codec in self: codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 1d58dfe8c4..36dbd27c3b 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -2,12 +2,10 @@ import warnings from collections.abc import Iterable, Sequence -from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast from zarr.abc.metadata import Metadata from zarr.abc.numcodec import Numcodec, _is_numcodec -from zarr.core.chunk_grids import ChunkGrid from zarr.core.dtype import get_data_type_from_json from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 from zarr.errors import ZarrUserWarning @@ -117,10 +115,6 @@ def __init__( def ndim(self) -> int: return len(self.shape) - @cached_property - def chunk_grid(self) -> ChunkGrid: - return ChunkGrid.from_regular(self.shape, self.chunks) - @property def shards(self) -> tuple[int, ...] 
| None: return None diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 06a6e15090..eb1a3bf3ef 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -1,36 +1,14 @@ from __future__ import annotations -from collections.abc import Mapping -from typing import TYPE_CHECKING, NotRequired, TypedDict, TypeGuard, cast - -from zarr.abc.metadata import Metadata -from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype import VariableLengthUTF8, ZDType, get_data_type_from_json -from zarr.core.dtype.common import check_dtype_spec_v3 - -if TYPE_CHECKING: - from typing import Self - - from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.chunk_grids import ChunkGrid - from zarr.core.common import JSON - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar - - import json -from collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, field, replace -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal, NotRequired, TypedDict, TypeGuard, cast from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec +from zarr.abc.metadata import Metadata from zarr.core.array_spec import ArrayConfig, ArraySpec -from zarr.core.chunk_grids import ( - ChunkGrid, - ChunkGridName, - _infer_chunk_grid_name, - parse_chunk_grid, - serialize_chunk_grid, -) +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -41,14 +19,23 @@ ZARR_JSON, DimensionNames, NamedConfig, + NamedRequiredConfig, parse_named_configuration, parse_shapelike, ) from zarr.core.config import config +from zarr.core.dtype import VariableLengthUTF8, ZDType, get_data_type_from_json +from zarr.core.dtype.common import check_dtype_spec_v3 from zarr.core.metadata.common import parse_attributes from zarr.errors import 
MetadataValidationError, NodeTypeValidationError, UnknownCodecError from zarr.registry import get_codec_class +if TYPE_CHECKING: + from typing import Self + + from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar + def parse_zarr_format(data: object) -> Literal[3]: if data == 3: @@ -180,6 +167,233 @@ def parse_extra_fields( return dict(data) +# --------------------------------------------------------------------------- +# Chunk grid metadata types (pure DTOs — no array shape, no behavioral logic) +# --------------------------------------------------------------------------- + +# JSON type for a single dimension's rectilinear spec: +# bare int (uniform shorthand), or list of ints / [value, count] RLE pairs. +RectilinearDimSpecJSON = int | list[int | list[int]] + + +class RegularChunkGridConfig(TypedDict): + chunk_shape: tuple[int, ...] + + +class RectilinearChunkGridConfig(TypedDict): + kind: Literal["inline"] + chunk_shapes: tuple[RectilinearDimSpecJSON, ...] + + +RegularChunkGridJSON = NamedRequiredConfig[Literal["regular"], RegularChunkGridConfig] +RectilinearChunkGridJSON = NamedRequiredConfig[Literal["rectilinear"], RectilinearChunkGridConfig] + +ChunkGridJSON = RegularChunkGridJSON | RectilinearChunkGridJSON + + +def _parse_chunk_shape(chunk_shape: Iterable[int]) -> tuple[int, ...]: + """Validate and normalize a regular chunk shape. All elements must be >= 1. + + The spec defines chunk indexing via modular arithmetic with the chunk + edge length, so zero is not a valid edge length. + """ + as_tup = tuple(chunk_shape) + problems = [idx for idx, val in enumerate(as_tup) if val < 1] + if len(problems) == 1: + idx = problems[0] + raise ValueError(f"Invalid chunk shape {as_tup[idx]} at index {idx}.") + elif len(problems) > 1: + raise ValueError( + f"Invalid chunk shapes {[as_tup[idx] for idx in problems]} at indices {problems}." 
+ ) + return as_tup + + +def _validate_rectilinear_kind(kind: str | None) -> None: + """The rectilinear spec requires ``kind: "inline"``.""" + if kind is None: + raise ValueError( + "Rectilinear chunk grid configuration requires a 'kind' field. " + "Only 'inline' is currently supported." + ) + if kind != "inline": + raise ValueError( + f"Unsupported rectilinear chunk grid kind: {kind!r}. " + "Only 'inline' is currently supported." + ) + + +def _expand_rle(data: Sequence[int | list[int]]) -> list[int]: + """Expand a mixed array of bare integers and RLE pairs. + + Per the rectilinear chunk grid spec, each element can be: + - a bare integer (an explicit edge length) + - a two-element array ``[value, count]`` (run-length encoded) + """ + result: list[int] = [] + for item in data: + if isinstance(item, (int, float)) and not isinstance(item, bool): + result.append(int(item)) + elif isinstance(item, list) and len(item) == 2: + size, count = int(item[0]), int(item[1]) + result.extend([size] * count) + else: + raise ValueError(f"RLE entries must be an integer or [size, count], got {item}") + return result + + +def _compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: + """Compress chunk sizes to mixed RLE format per the rectilinear spec. + + Runs of length > 1 are emitted as ``[value, count]`` pairs; runs of + length 1 are emitted as bare integers:: + + [10, 10, 10, 5] -> [[10, 3], 5] + """ + if not sizes: + return [] + result: list[int | list[int]] = [] + current = sizes[0] + count = 1 + for s in sizes[1:]: + if s == current: + count += 1 + else: + result.append([current, count] if count > 1 else current) + current = s + count = 1 + result.append([current, count] if count > 1 else current) + return result + + +def _validate_chunk_shapes( + chunk_shapes: Sequence[Sequence[int]], +) -> tuple[tuple[int, ...], ...]: + """Validate expanded per-dimension edge lists. All edges must be >= 1. 
+ + Unlike regular grids, rectilinear grids list explicit per-chunk edges, + so zero-sized edges are not meaningful. + """ + result: list[tuple[int, ...]] = [] + for dim_idx, edges in enumerate(chunk_shapes): + edges_tup = tuple(edges) + if not edges_tup: + raise ValueError(f"Dimension {dim_idx} has no chunk edges.") + bad = [i for i, e in enumerate(edges_tup) if e < 1] + if bad: + raise ValueError( + f"Dimension {dim_idx} has invalid edge lengths at indices {bad}: " + f"{[edges_tup[i] for i in bad]}" + ) + result.append(edges_tup) + return tuple(result) + + +@dataclass(frozen=True, kw_only=True) +class RegularChunkGrid(Metadata): + """Metadata-only description of a regular chunk grid. + + Stores just the chunk shape — no array extent, no behavioral logic. + This is what lives on ``ArrayV3Metadata.chunk_grid``. + """ + + chunk_shape: tuple[int, ...] + + def __post_init__(self) -> None: + chunk_shape_parsed = _parse_chunk_shape(self.chunk_shape) + object.__setattr__(self, "chunk_shape", chunk_shape_parsed) + + @property + def ndim(self) -> int: + return len(self.chunk_shape) + + def to_dict(self) -> RegularChunkGridJSON: # type: ignore[override] + return { + "name": "regular", + "configuration": {"chunk_shape": self.chunk_shape}, + } + + @classmethod + def from_dict(cls, data: RegularChunkGridJSON) -> Self: # type: ignore[override] + _, configuration = parse_named_configuration(data, "regular") + return cls(chunk_shape=tuple(configuration["chunk_shape"])) + + +@dataclass(frozen=True, kw_only=True) +class RectilinearChunkGrid(Metadata): + """Metadata-only description of a rectilinear chunk grid. + + Stores the per-dimension chunk edge lengths as expanded integer tuples + (no RLE). Serialization re-compresses to RLE via ``to_dict``. + This is what lives on ``ArrayV3Metadata.chunk_grid``. + """ + + chunk_shapes: tuple[tuple[int, ...], ...] 
+ + def __post_init__(self) -> None: + chunk_shapes_parsed = _validate_chunk_shapes(self.chunk_shapes) + object.__setattr__(self, "chunk_shapes", chunk_shapes_parsed) + + @property + def ndim(self) -> int: + return len(self.chunk_shapes) + + def to_dict(self) -> RectilinearChunkGridJSON: # type: ignore[override] + serialized_dims: list[RectilinearDimSpecJSON] = [] + for edges in self.chunk_shapes: + rle = _compress_rle(edges) + # Use RLE only if it's actually shorter + if len(rle) < len(edges): + serialized_dims.append(rle) + else: + serialized_dims.append(list(edges)) + return { + "name": "rectilinear", + "configuration": { + "kind": "inline", + "chunk_shapes": tuple(serialized_dims), + }, + } + + @classmethod + def from_dict(cls, data: RectilinearChunkGridJSON) -> Self: # type: ignore[override] + _, configuration = parse_named_configuration(data, "rectilinear") + _validate_rectilinear_kind(configuration.get("kind")) + raw_shapes = configuration["chunk_shapes"] + expanded: list[tuple[int, ...]] = [] + for dim_spec in raw_shapes: + if isinstance(dim_spec, int): + # Bare int shorthand — uniform edge length for this dimension. + # The DTO stores the single edge length; the behavioral ChunkGrid + # will repeat it to match the array extent when constructed. 
+ if dim_spec < 1: + raise ValueError(f"Integer chunk edge length must be >= 1, got {dim_spec}") + expanded.append((dim_spec,)) + elif isinstance(dim_spec, list): + expanded.append(tuple(_expand_rle(dim_spec))) + else: + raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + return cls(chunk_shapes=tuple(expanded)) + + +ChunkGridMetadata = RegularChunkGrid | RectilinearChunkGrid + + +def parse_chunk_grid( + data: dict[str, JSON] | ChunkGridMetadata | NamedConfig[str, Any], +) -> ChunkGridMetadata: + """Parse a chunk grid from a metadata dict or pass through an existing instance.""" + if isinstance(data, (RegularChunkGrid, RectilinearChunkGrid)): + return data + + name, _ = parse_named_configuration(data) + if name == "regular": + return RegularChunkGrid.from_dict(data) # type: ignore[arg-type] + if name == "rectilinear": + return RectilinearChunkGrid.from_dict(data) # type: ignore[arg-type] + raise ValueError(f"Unknown chunk grid name: {name!r}") + + class ArrayMetadataJSON_V3(TypedDict): """ A typed dictionary model for zarr v3 metadata. @@ -205,10 +419,7 @@ class ArrayMetadataJSON_V3(TypedDict): class ArrayV3Metadata(Metadata): shape: tuple[int, ...] data_type: ZDType[TBaseDType, TBaseScalar] - chunk_grid: ChunkGrid - chunk_grid_name: ( - ChunkGridName # serialization format; tracked internally for round-trip fidelity - ) + chunk_grid: ChunkGridMetadata chunk_key_encoding: ChunkKeyEncoding fill_value: Any codecs: tuple[Codec, ...] 
@@ -224,13 +435,12 @@ def __init__( *, shape: Iterable[int], data_type: ZDType[TBaseDType, TBaseScalar], - chunk_grid: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], + chunk_grid: dict[str, JSON] | ChunkGridMetadata | NamedConfig[str, Any], chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, codecs: Iterable[Codec | dict[str, JSON] | NamedConfig[str, Any] | str], attributes: dict[str, JSON] | None, dimension_names: DimensionNames, - chunk_grid_name: ChunkGridName | None = None, storage_transformers: Iterable[dict[str, JSON]] | None = None, extra_fields: Mapping[str, AllowedExtraField] | None = None, ) -> None: @@ -239,12 +449,7 @@ def __init__( """ shape_parsed = parse_shapelike(shape) - chunk_grid_parsed = parse_chunk_grid(chunk_grid, shape_parsed) - chunk_grid_name_parsed = ( - chunk_grid_name - if chunk_grid_name is not None - else _infer_chunk_grid_name(chunk_grid, chunk_grid_parsed) - ) + chunk_grid_parsed = parse_chunk_grid(chunk_grid) chunk_key_encoding_parsed = parse_chunk_key_encoding(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # Note: relying on a type method is numpy-specific @@ -266,7 +471,6 @@ def __init__( object.__setattr__(self, "shape", shape_parsed) object.__setattr__(self, "data_type", data_type) object.__setattr__(self, "chunk_grid", chunk_grid_parsed) - object.__setattr__(self, "chunk_grid_name", chunk_grid_name_parsed) object.__setattr__(self, "chunk_key_encoding", chunk_key_encoding_parsed) object.__setattr__(self, "codecs", codecs_parsed) object.__setattr__(self, "dimension_names", dimension_names_parsed) @@ -297,27 +501,27 @@ def ndim(self) -> int: def dtype(self) -> ZDType[TBaseDType, TBaseScalar]: return self.data_type + # TODO: move these behavioral properties to the Array class. + # They require knowledge of codecs (ShardingCodec) and don't belong on a metadata DTO. 
+ @property def chunks(self) -> tuple[int, ...]: - if self.chunk_grid.is_regular: - from zarr.codecs.sharding import ShardingCodec + if not isinstance(self.chunk_grid, RegularChunkGrid): + msg = ( + "The `chunks` attribute is only defined for arrays using regular chunk grids. " + "This array has a rectilinear chunk grid. Use `chunk_sizes` for general access." + ) + raise NotImplementedError(msg) - if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): - sharding_codec = self.codecs[0] - assert isinstance(sharding_codec, ShardingCodec) # for mypy - return sharding_codec.chunk_shape - else: - return self.chunk_grid.chunk_shape + from zarr.codecs.sharding import ShardingCodec - msg = ( - "The `chunks` attribute is only defined for arrays using regular chunk grids. " - "This array has a rectilinear chunk grid. Use `chunk_sizes` for general access." - ) - raise NotImplementedError(msg) + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + return self.codecs[0].chunk_shape + return self.chunk_grid.chunk_shape @property def shards(self) -> tuple[int, ...] 
| None: - if not self.chunk_grid.is_regular: + if not isinstance(self.chunk_grid, RegularChunkGrid): return None from zarr.codecs.sharding import ShardingCodec @@ -334,22 +538,6 @@ def inner_codecs(self) -> tuple[Codec, ...]: return self.codecs[0].codecs return self.codecs - def get_chunk_spec( - self, _chunk_coords: tuple[int, ...], array_config: ArrayConfig, prototype: BufferPrototype - ) -> ArraySpec: - spec = self.chunk_grid[_chunk_coords] - if spec is None: - raise ValueError( - f"Chunk coordinates {_chunk_coords} are out of bounds for shape {self.shape}" - ) - return ArraySpec( - shape=spec.codec_shape, - dtype=self.dtype, - fill_value=self.fill_value, - config=array_config, - prototype=prototype, - ) - def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) @@ -423,13 +611,7 @@ def to_dict(self) -> dict[str, JSON]: extra_fields = out_dict.pop("extra_fields") out_dict = out_dict | extra_fields # type: ignore[operator] - # Serialize chunk_grid using the stored name (not the grid's own logic). - # This gives round-trip fidelity: a store written as "rectilinear" with - # uniform edges stays "rectilinear". 
- out_dict["chunk_grid"] = serialize_chunk_grid(self.chunk_grid, self.chunk_grid_name) - - # chunk_grid_name is internal — not part of the Zarr metadata document - out_dict.pop("chunk_grid_name", None) + out_dict["chunk_grid"] = self.chunk_grid.to_dict() out_dict["fill_value"] = self.data_type.to_json_scalar( self.fill_value, zarr_format=self.zarr_format @@ -452,8 +634,7 @@ def to_dict(self) -> dict[str, JSON]: return out_dict def update_shape(self, shape: tuple[int, ...]) -> Self: - new_grid = self.chunk_grid.update_shape(shape) - return replace(self, shape=shape, chunk_grid=new_grid) + return replace(self, shape=shape) def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index ea2736b3c9..3a0cc58df0 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -14,11 +14,11 @@ from zarr.abc.store import RangeByteRequest, Store from zarr.codecs.bytes import BytesCodec from zarr.core.array import Array -from zarr.core.chunk_grids import ChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v3 import RegularChunkGrid from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike from zarr.storage._common import _dereference_path @@ -140,7 +140,7 @@ def array_metadata( # separator = draw(st.sampled_from(['/', '\\'])) shape = draw(array_shapes()) ndim = len(shape) - chunk_shape = draw(array_shapes(min_dims=ndim, max_dims=ndim)) + chunk_shape = draw(array_shapes(min_dims=ndim, max_dims=ndim, min_side=1)) np_dtype = draw(dtypes()) dtype = get_data_type_from_native_dtype(np_dtype) fill_value = draw(npst.from_dtype(np_dtype)) @@ -160,7 +160,7 @@ def array_metadata( return ArrayV3Metadata( 
shape=shape, data_type=dtype, - chunk_grid=ChunkGrid.from_regular(shape, chunk_shape), + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), fill_value=fill_value, attributes=draw(attributes), # type: ignore[arg-type] dimension_names=draw(dimension_names(ndim=ndim)), @@ -194,11 +194,17 @@ def chunk_shapes(draw: st.DrawFn, *, shape: tuple[int, ...]) -> tuple[int, ...]: # We want this strategy to shrink towards arrays with smaller number of chunks # 1. st.integers() shrinks towards smaller values. So we use that to generate number of chunks numchunks = draw( - st.tuples(*[st.integers(min_value=0 if size == 0 else 1, max_value=size) for size in shape]) + st.tuples( + *[ + st.integers(min_value=0 if size == 0 else 1, max_value=max(size, 1)) + for size in shape + ] + ) ) # 2. and now generate the chunks tuple + # Chunk sizes must be >= 1 per spec; for zero-extent dimensions use 1. chunks = tuple( - size // nchunks if nchunks > 0 else 0 + max(1, size // nchunks) if nchunks > 0 else 1 for size, nchunks in zip(shape, numchunks, strict=True) ) @@ -260,14 +266,14 @@ def arrays( nparray = draw(arrays, label="array data") chunk_shape = draw(chunk_shapes(shape=nparray.shape), label="chunk shape") dim_names: None | list[str | None] = None - if zarr_format == 3 and all(c > 0 for c in chunk_shape): - shard_shape = draw( - st.none() | shard_shapes(shape=nparray.shape, chunk_shape=chunk_shape), - label="shard shape", - ) + shard_shape = None + if zarr_format == 3: dim_names = draw(dimension_names(ndim=nparray.ndim), label="dimension names") - else: - shard_shape = None + if all(s > 0 for s in nparray.shape) and all(c > 0 for c in chunk_shape): + shard_shape = draw( + st.none() | shard_shapes(shape=nparray.shape, chunk_shape=chunk_shape), + label="shard shape", + ) # test that None works too. 
fill_value = draw(st.one_of([st.none(), npst.from_dtype(nparray.dtype)])) # compressor = draw(compressors) diff --git a/tests/test_array.py b/tests/test_array.py index b57eea8fa1..9dae878a90 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -796,13 +796,13 @@ def test_resize_growing_skips_chunk_enumeration( ) z[:] = np.ones((10, 10), dtype="i4") - grid_cls = type(z.metadata.chunk_grid) + grid_cls = type(z.chunk_grid) # growth only - ensure no chunk coords are enumerated with mock.patch.object( grid_cls, "all_chunk_coords", - wraps=z.metadata.chunk_grid.all_chunk_coords, + wraps=z.chunk_grid.all_chunk_coords, ) as mock_coords: z.resize((20, 20)) mock_coords.assert_not_called() @@ -815,7 +815,7 @@ def test_resize_growing_skips_chunk_enumeration( with mock.patch.object( grid_cls, "all_chunk_coords", - wraps=z.metadata.chunk_grid.all_chunk_coords, + wraps=z.chunk_grid.all_chunk_coords, ) as mock_coords: z.resize((5, 5)) assert mock_coords.call_count > 0 @@ -838,7 +838,7 @@ def test_resize_growing_skips_chunk_enumeration( with mock.patch.object( grid_cls, "all_chunk_coords", - wraps=z2.metadata.chunk_grid.all_chunk_coords, + wraps=z2.chunk_grid.all_chunk_coords, ) as mock_coords: z2.resize((20, 5)) assert mock_coords.call_count > 0 @@ -1576,7 +1576,7 @@ async def test_with_data(impl: Literal["sync", "async"], store: Store) -> None: elif impl == "async": arr = await create_array(store, name=name, data=data, zarr_format=3) stored = await arr._get_selection( - BasicIndexer(..., shape=arr.shape, chunk_grid=arr.metadata.chunk_grid), + BasicIndexer(..., shape=arr.shape, chunk_grid=arr.chunk_grid), prototype=default_buffer_prototype(), ) else: diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index d0e2d09b7c..dfecb6d57f 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,5 +1,4 @@ import pickle -import re from typing import Any import numpy as np @@ -489,9 +488,7 @@ def 
test_invalid_metadata(store: Store) -> None: def test_invalid_shard_shape() -> None: with pytest.raises( ValueError, - match=re.escape( - "The array's `chunk_shape` (got (16, 16)) needs to be divisible by the shard's inner `chunk_shape` (got (9,))." - ), + match="needs to be divisible by the shard's inner", ): zarr.create_array( {}, diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 11c0a49e7f..7392df54fa 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1219,8 +1219,8 @@ def test_get_block_selection_1d(store: StorePath) -> None: _test_get_block_selection(a, z, selection, expected_idx) bad_selections = block_selections_1d_bad + [ - z.metadata.chunk_grid.get_nchunks() + 1, # out of bounds - -(z.metadata.chunk_grid.get_nchunks() + 1), # out of bounds + z.chunk_grid.get_nchunks() + 1, # out of bounds + -(z.chunk_grid.get_nchunks() + 1), # out of bounds ] for selection_bad in bad_selections: @@ -1933,9 +1933,11 @@ def test_indexing_with_zarr_array(store: StorePath) -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) -@pytest.mark.parametrize("shape", [(0, 2, 3), (0), (3, 0)]) +@pytest.mark.parametrize("shape", [(0, 2, 3), (0,), (3, 0)]) def test_zero_sized_chunks(store: StorePath, shape: list[int]) -> None: - z = zarr.create_array(store=store, shape=shape, chunks=shape, zarr_format=3, dtype="f8") + # Chunk sizes must be >= 1 per spec; use 1 for zero-extent dimensions. + chunks = tuple(max(1, s) for s in shape) + z = zarr.create_array(store=store, shape=shape, chunks=chunks, zarr_format=3, dtype="f8") z[...] 
= 42 assert_array_equal(z[...], np.zeros(shape, dtype="f8")) From 14370e64c82f4c0efd253b271538c270869fd72a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 18:03:20 -0400 Subject: [PATCH 072/118] Fixup after refactor --- src/zarr/core/array.py | 84 ++++++++--- src/zarr/core/chunk_grids.py | 43 +++--- src/zarr/core/metadata/v3.py | 37 ++++- tests/test_unified_chunk_grid.py | 251 +++++++++++++++---------------- 4 files changed, 248 insertions(+), 167 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a675406e60..f3d65db349 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -185,6 +185,18 @@ class DefaultFillValue: DEFAULT_FILL_VALUE = DefaultFillValue() +def _chunk_sizes_from_shape( + array_shape: tuple[int, ...], chunk_shape: tuple[int, ...] +) -> tuple[tuple[int, ...], ...]: + """Compute dask-style chunk sizes from an array shape and uniform chunk shape.""" + result: list[tuple[int, ...]] = [] + for s, c in zip(array_shape, chunk_shape, strict=True): + nchunks = ceildiv(s, c) + sizes = tuple(min(c, s - i * c) for i in range(nchunks)) + result.append(sizes) + return tuple(result) + + def parse_array_metadata(data: Any) -> ArrayMetadata: if isinstance(data, ArrayMetadata): return data @@ -1076,12 +1088,11 @@ def chunks(self) -> tuple[int, ...]: return self.metadata.chunks @property - def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: - """Per-dimension chunk sizes for the array. + def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: + """Per-dimension sizes of chunks used for reading. - Returns the data size of each chunk along every dimension, - including the final boundary chunk. Works for both regular - and rectilinear chunk grids. + When sharding is used, returns the inner chunk sizes. + Otherwise, returns the outer chunk sizes (same as ``write_chunk_sizes``). 
Returns ------- @@ -1091,12 +1102,33 @@ def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: Examples -------- >>> arr = zarr.create_array(store, shape=(100, 80), chunks=(30, 40)) - >>> arr.chunk_sizes + >>> arr.read_chunk_sizes ((30, 30, 30, 10), (40, 40)) + """ + from zarr.codecs.sharding import ShardingCodec - >>> arr = zarr.create_array(store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]]) - >>> arr.chunk_sizes - ((10, 20, 30), (50, 50)) + codecs: tuple[Codec, ...] = getattr(self.metadata, "codecs", ()) + if len(codecs) == 1 and isinstance(codecs[0], ShardingCodec): + inner_chunk_shape = codecs[0].chunk_shape + return _chunk_sizes_from_shape(self.shape, inner_chunk_shape) + return self.chunk_grid.chunk_sizes + + @property + def write_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: + """Per-dimension sizes of chunks used for writing (storage chunks). + + Always returns the outer chunk sizes, regardless of sharding. + + Returns + ------- + tuple[tuple[int, ...], ...] + One inner tuple per dimension containing chunk sizes. + + Examples + -------- + >>> arr = zarr.create_array(store, shape=(100, 80), chunks=(30, 40)) + >>> arr.write_chunk_sizes + ((30, 30, 30, 10), (40, 40)) """ return self.chunk_grid.chunk_sizes @@ -2347,12 +2379,30 @@ def chunks(self) -> tuple[int, ...]: return self.async_array.chunks @property - def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: - """Per-dimension chunk sizes for the array. + def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: + """Per-dimension sizes of chunks used for reading. + + When sharding is used, returns the inner chunk sizes. + Otherwise, returns the outer chunk sizes (same as ``write_chunk_sizes``). + + Returns + ------- + tuple[tuple[int, ...], ...] + One inner tuple per dimension containing chunk sizes. 
+ + Examples + -------- + >>> arr = zarr.open_array(store) + >>> arr.read_chunk_sizes + ((30, 30, 30, 10), (40, 40)) + """ + return self.async_array.read_chunk_sizes + + @property + def write_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: + """Per-dimension sizes of chunks used for writing (storage chunks). - Returns the data size of each chunk along every dimension, - including the final boundary chunk. Works for both regular - and rectilinear chunk grids. + Always returns the outer chunk sizes, regardless of sharding. Returns ------- @@ -2362,10 +2412,10 @@ def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: Examples -------- >>> arr = zarr.open_array(store) - >>> arr.chunk_sizes + >>> arr.write_chunk_sizes ((30, 30, 30, 10), (40, 40)) """ - return self.async_array.chunk_sizes + return self.async_array.write_chunk_sizes @property def shards(self) -> tuple[int, ...] | None: @@ -5093,7 +5143,7 @@ def _parse_keep_array_attr( if data.chunk_grid.is_regular: chunks = data.chunks else: - chunks = data.chunk_sizes + chunks = data.write_chunk_sizes if shards == "keep": shards = data.shards if data.chunk_grid.is_regular else None if zarr_format is None: diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ad2f9d3d4f..858167e645 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -45,6 +45,7 @@ class FixedDimension: size: int # chunk edge length (>= 0) extent: int # array dimension length nchunks: int = field(init=False, repr=False) + ngridcells: int = field(init=False, repr=False) def __post_init__(self) -> None: if self.size < 0: @@ -52,9 +53,11 @@ def __post_init__(self) -> None: if self.extent < 0: raise ValueError(f"FixedDimension extent must be >= 0, got {self.extent}") if self.size == 0: - object.__setattr__(self, "nchunks", 1 if self.extent == 0 else 0) + n = 1 if self.extent == 0 else 0 else: - object.__setattr__(self, "nchunks", ceildiv(self.extent, self.size)) + n = ceildiv(self.extent, self.size) + 
object.__setattr__(self, "nchunks", n) + object.__setattr__(self, "ngridcells", n) def index_to_chunk(self, idx: int) -> int: if idx < 0: @@ -141,9 +144,17 @@ def __init__(self, edges: Sequence[int], extent: int) -> None: object.__setattr__(self, "extent", extent) @property - def nchunks(self) -> int: + def ngridcells(self) -> int: + """Total grid cells including those past the array extent.""" return len(self.edges) + @property + def nchunks(self) -> int: + """Number of chunks that contain data (overlap [0, extent)).""" + if self.extent == 0: + return 0 + return bisect.bisect_left(self.cumulative, self.extent) + 1 + def index_to_chunk(self, idx: int) -> int: if idx < 0 or idx >= self.extent: raise IndexError(f"Index {idx} out of bounds for dimension with extent {self.extent}") @@ -183,23 +194,19 @@ def with_extent(self, new_extent: int) -> VaryingDimension: return VaryingDimension(self.edges, extent=new_extent) def resize(self, new_extent: int) -> VaryingDimension: - """Return a copy adjusted for a new array extent (grow/shrink).""" - old_extent = self.extent - if new_extent == old_extent: + """Return a copy adjusted for a new array extent (grow/shrink). + + Grow past existing edges: appends a chunk for the additional extent. + Shrink or grow within existing edges: preserves all edges and re-binds + the extent. The spec allows trailing edges beyond the array extent. 
+ """ + if new_extent == self.extent: return self - elif new_extent > old_extent: - expanded_edges = list(self.edges) + [new_extent - old_extent] + elif new_extent > self.cumulative[-1]: + expanded_edges = list(self.edges) + [new_extent - self.cumulative[-1]] return VaryingDimension(expanded_edges, extent=new_extent) else: - # Shrink: keep chunks whose cumulative offset covers new_extent - shrunk_edges: list[int] = [] - total = 0 - for edge in self.edges: - shrunk_edges.append(edge) - total += edge - if total >= new_extent: - break - return VaryingDimension(shrunk_edges, extent=new_extent) + return VaryingDimension(self.edges, extent=new_extent) @runtime_checkable @@ -209,6 +216,8 @@ class DimensionGrid(Protocol): @property def nchunks(self) -> int: ... @property + def ngridcells(self) -> int: ... + @property def extent(self) -> int: ... def index_to_chunk(self, idx: int) -> int: ... def chunk_offset(self, chunk_ix: int) -> int: ... diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index eb1a3bf3ef..a609bda692 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -315,8 +315,9 @@ def to_dict(self) -> RegularChunkGridJSON: # type: ignore[override] @classmethod def from_dict(cls, data: RegularChunkGridJSON) -> Self: # type: ignore[override] - _, configuration = parse_named_configuration(data, "regular") - return cls(chunk_shape=tuple(configuration["chunk_shape"])) + parse_named_configuration(data, "regular") # validate name + configuration = data["configuration"] + return cls(chunk_shape=_parse_chunk_shape(configuration["chunk_shape"])) @dataclass(frozen=True, kw_only=True) @@ -355,9 +356,28 @@ def to_dict(self) -> RectilinearChunkGridJSON: # type: ignore[override] }, } + def update_shape( + self, old_shape: tuple[int, ...], new_shape: tuple[int, ...] + ) -> RectilinearChunkGrid: + """Return a new RectilinearChunkGrid with edges adjusted for *new_shape*. 
+ + Grow past existing edges: appends a chunk covering the additional extent. + Shrink or grow within existing edges: edges are kept as-is (the spec + allows trailing edges beyond the array extent). + """ + new_chunk_shapes: list[tuple[int, ...]] = [] + for edges, new_ext in zip(self.chunk_shapes, new_shape, strict=True): + edge_sum = sum(edges) + if new_ext > edge_sum: + new_chunk_shapes.append((*edges, new_ext - edge_sum)) + else: + new_chunk_shapes.append(edges) + return RectilinearChunkGrid(chunk_shapes=tuple(new_chunk_shapes)) + @classmethod def from_dict(cls, data: RectilinearChunkGridJSON) -> Self: # type: ignore[override] - _, configuration = parse_named_configuration(data, "rectilinear") + parse_named_configuration(data, "rectilinear") # validate name + configuration = data["configuration"] _validate_rectilinear_kind(configuration.get("kind")) raw_shapes = configuration["chunk_shapes"] expanded: list[tuple[int, ...]] = [] @@ -372,7 +392,9 @@ def from_dict(cls, data: RectilinearChunkGridJSON) -> Self: # type: ignore[over elif isinstance(dim_spec, list): expanded.append(tuple(_expand_rle(dim_spec))) else: - raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") + raise TypeError( + f"Invalid chunk_shapes entry: expected int or list, got {type(dim_spec)}" + ) return cls(chunk_shapes=tuple(expanded)) @@ -509,7 +531,7 @@ def chunks(self) -> tuple[int, ...]: if not isinstance(self.chunk_grid, RegularChunkGrid): msg = ( "The `chunks` attribute is only defined for arrays using regular chunk grids. " - "This array has a rectilinear chunk grid. Use `chunk_sizes` for general access." + "This array has a rectilinear chunk grid. Use `read_chunk_sizes` or `write_chunk_sizes` for general access." 
) raise NotImplementedError(msg) @@ -634,7 +656,10 @@ def to_dict(self) -> dict[str, JSON]: return out_dict def update_shape(self, shape: tuple[int, ...]) -> Self: - return replace(self, shape=shape) + chunk_grid = self.chunk_grid + if isinstance(chunk_grid, RectilinearChunkGrid): + chunk_grid = chunk_grid.update_shape(self.shape, shape) + return replace(self, shape=shape, chunk_grid=chunk_grid) def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 2280fcb7ce..298b3de61d 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -996,26 +996,23 @@ def test_create_regular_array(self, tmp_path: Path) -> None: chunks=(10, 20), dtype="float32", ) - assert arr.metadata.chunk_grid.is_regular + assert arr.chunk_grid.is_regular assert arr.chunks == (10, 20) def test_create_rectilinear_array(self, tmp_path: Path) -> None: """Create an array with a rectilinear chunk grid via metadata.""" - from zarr.core.array import AsyncArray - from zarr.core.dtype import Float32 - from zarr.core.metadata.v3 import ArrayV3Metadata - - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) + from zarr.core.metadata.v3 import ArrayV3Metadata, RectilinearChunkGrid - meta = AsyncArray._create_metadata_v3( + arr = zarr.create_array( + store=tmp_path / "rect.zarr", shape=(60, 100), - dtype=Float32(), - chunk_shape=(10, 20), - chunk_grid=g, + chunks=[[10, 20, 30], [50, 50]], + dtype="float32", ) - assert isinstance(meta, ArrayV3Metadata) - assert not meta.chunk_grid.is_regular - assert meta.chunk_grid.ndim == 2 + assert isinstance(arr.metadata, ArrayV3Metadata) + assert isinstance(arr.metadata.chunk_grid, RectilinearChunkGrid) + assert not arr.chunk_grid.is_regular + assert arr.chunk_grid.ndim == 2 def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: """Verify metadata round-trips 
through JSON.""" @@ -1030,43 +1027,43 @@ def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: assert new_spec is not None assert orig_spec.shape == new_spec.shape - def test_chunk_grid_name_regular(self, tmp_path: Path) -> None: - """Regular arrays store chunk_grid_name='regular'.""" - from zarr.core.array import AsyncArray - from zarr.core.dtype import Float32 + def test_chunk_grid_serializes_regular(self, tmp_path: Path) -> None: + """Regular arrays serialize with name='regular'.""" + from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid - meta = AsyncArray._create_metadata_v3( + arr = zarr.create_array( + store=tmp_path / "regular.zarr", shape=(100, 200), - dtype=Float32(), - chunk_shape=(10, 20), + chunks=(10, 20), + dtype="float32", ) - assert meta.chunk_grid_name == "regular" - d = meta.to_dict() + assert isinstance(arr.metadata, ArrayV3Metadata) + assert isinstance(arr.metadata.chunk_grid, RegularChunkGrid) + d = arr.metadata.to_dict() chunk_grid_dict = d["chunk_grid"] assert isinstance(chunk_grid_dict, dict) assert chunk_grid_dict["name"] == "regular" - def test_chunk_grid_name_rectilinear(self, tmp_path: Path) -> None: - """Rectilinear arrays store chunk_grid_name='rectilinear'.""" - from zarr.core.array import AsyncArray - from zarr.core.dtype import Float32 + def test_chunk_grid_serializes_rectilinear(self, tmp_path: Path) -> None: + """Rectilinear arrays serialize with name='rectilinear'.""" + from zarr.core.metadata.v3 import ArrayV3Metadata, RectilinearChunkGrid - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - meta = AsyncArray._create_metadata_v3( + arr = zarr.create_array( + store=tmp_path / "rect.zarr", shape=(60, 100), - dtype=Float32(), - chunk_shape=(10, 20), - chunk_grid=g, + chunks=[[10, 20, 30], [50, 50]], + dtype="float32", ) - assert meta.chunk_grid_name == "rectilinear" - d = meta.to_dict() + assert isinstance(arr.metadata, ArrayV3Metadata) + assert 
isinstance(arr.metadata.chunk_grid, RectilinearChunkGrid) + d = arr.metadata.to_dict() chunk_grid_dict = d["chunk_grid"] assert isinstance(chunk_grid_dict, dict) assert chunk_grid_dict["name"] == "rectilinear" def test_chunk_grid_name_roundtrip_preserves_rectilinear(self, tmp_path: Path) -> None: """A rectilinear grid with uniform edges stays 'rectilinear' through to_dict/from_dict.""" - from zarr.core.metadata.v3 import ArrayV3Metadata + from zarr.core.metadata.v3 import ArrayV3Metadata, RectilinearChunkGrid meta_dict: dict[str, Any] = { "zarr_format": 3, @@ -1082,9 +1079,7 @@ def test_chunk_grid_name_roundtrip_preserves_rectilinear(self, tmp_path: Path) - "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], } meta = ArrayV3Metadata.from_dict(meta_dict) - # Grid is uniform (all Fixed), but name should stay "rectilinear" - assert meta.chunk_grid.is_regular - assert meta.chunk_grid_name == "rectilinear" + assert isinstance(meta.chunk_grid, RectilinearChunkGrid) d = meta.to_dict() chunk_grid_dict = d["chunk_grid"] assert isinstance(chunk_grid_dict, dict) @@ -1092,7 +1087,7 @@ def test_chunk_grid_name_roundtrip_preserves_rectilinear(self, tmp_path: Path) - def test_chunk_grid_name_regular_from_dict(self, tmp_path: Path) -> None: """A 'regular' chunk grid name is preserved through from_dict.""" - from zarr.core.metadata.v3 import ArrayV3Metadata + from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid meta_dict: dict[str, Any] = { "zarr_format": 3, @@ -1108,62 +1103,38 @@ def test_chunk_grid_name_regular_from_dict(self, tmp_path: Path) -> None: "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], } meta = ArrayV3Metadata.from_dict(meta_dict) - assert meta.chunk_grid_name == "regular" + assert isinstance(meta.chunk_grid, RegularChunkGrid) d = meta.to_dict() chunk_grid_dict = d["chunk_grid"] assert isinstance(chunk_grid_dict, dict) assert chunk_grid_dict["name"] == "regular" def test_get_chunk_spec_regular(self, tmp_path: 
Path) -> None: - """get_chunk_spec works for regular grids.""" - from zarr.core.array import AsyncArray - from zarr.core.array_spec import ArrayConfig - from zarr.core.buffer.core import default_buffer_prototype - from zarr.core.dtype import Float32 + """ChunkGrid indexing works for regular grids.""" + grid = ChunkGrid.from_regular((100, 200), (10, 20)) - meta = AsyncArray._create_metadata_v3( - shape=(100, 200), - dtype=Float32(), - chunk_shape=(10, 20), - ) - spec = meta.get_chunk_spec( - (0, 0), - ArrayConfig.from_dict({}), - default_buffer_prototype(), - ) + spec = grid[(0, 0)] + assert spec is not None assert spec.shape == (10, 20) - spec_boundary = meta.get_chunk_spec( - (9, 9), - ArrayConfig.from_dict({}), - default_buffer_prototype(), - ) + spec_boundary = grid[(9, 9)] + assert spec_boundary is not None assert spec_boundary.shape == (10, 20) def test_get_chunk_spec_rectilinear(self, tmp_path: Path) -> None: - """get_chunk_spec returns per-chunk shapes for rectilinear grids.""" - from zarr.core.array import AsyncArray - from zarr.core.array_spec import ArrayConfig - from zarr.core.buffer.core import default_buffer_prototype - from zarr.core.dtype import Float32 - - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - meta = AsyncArray._create_metadata_v3( - shape=(60, 100), - dtype=Float32(), - chunk_shape=(10, 20), - chunk_grid=g, - ) - proto = default_buffer_prototype() - config = ArrayConfig.from_dict({}) + """ChunkGrid indexing returns per-chunk shapes for rectilinear grids.""" + grid = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - spec0 = meta.get_chunk_spec((0, 0), config, proto) + spec0 = grid[(0, 0)] + assert spec0 is not None assert spec0.shape == (10, 50) - spec1 = meta.get_chunk_spec((1, 0), config, proto) + spec1 = grid[(1, 0)] + assert spec1 is not None assert spec1.shape == (20, 50) - spec2 = meta.get_chunk_spec((2, 1), config, proto) + spec2 = grid[(2, 1)] + assert spec2 is not None 
assert spec2.shape == (30, 50) @@ -1177,14 +1148,15 @@ def test_sharding_accepts_rectilinear_outer_grid(self) -> None: """ShardingCodec.validate should not reject rectilinear outer grids.""" from zarr.codecs.sharding import ShardingCodec from zarr.core.dtype import Float32 + from zarr.core.metadata.v3 import RectilinearChunkGrid codec = ShardingCodec(chunk_shape=(5, 5)) - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) + grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 30), (50, 50))) codec.validate( shape=(60, 100), dtype=Float32(), - chunk_grid=g, + chunk_grid=grid_meta, ) @@ -1548,31 +1520,33 @@ def test_sharding_rejects_non_divisible_rectilinear(self) -> None: """Rectilinear shard sizes not divisible by inner chunk_shape should raise.""" from zarr.codecs.sharding import ShardingCodec from zarr.core.dtype import Float32 + from zarr.core.metadata.v3 import RectilinearChunkGrid codec = ShardingCodec(chunk_shape=(5, 5)) # 17 is not divisible by 5 - g = ChunkGrid.from_rectilinear([[10, 20, 17], [50, 50]], array_shape=(47, 100)) + grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 17), (50, 50))) with pytest.raises(ValueError, match="divisible"): codec.validate( shape=(47, 100), dtype=Float32(), - chunk_grid=g, + chunk_grid=grid_meta, ) def test_sharding_accepts_divisible_rectilinear(self) -> None: """Rectilinear shard sizes all divisible by inner chunk_shape should pass.""" from zarr.codecs.sharding import ShardingCodec from zarr.core.dtype import Float32 + from zarr.core.metadata.v3 import RectilinearChunkGrid codec = ShardingCodec(chunk_shape=(5, 5)) - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) + grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 30), (50, 50))) # Should not raise codec.validate( shape=(60, 100), dtype=Float32(), - chunk_grid=g, + chunk_grid=grid_meta, ) @@ -1806,7 +1780,7 @@ def test_1d_single_chunk(self, tmp_path: Path) -> None: def 
test_persistence_roundtrip(self, tmp_path: Path) -> None: _, a = self._make_2d(tmp_path) z2 = zarr.open_array(store=tmp_path / "arr2d.zarr", mode="r") - assert not z2.metadata.chunk_grid.is_regular + assert not z2.chunk_grid.is_regular np.testing.assert_array_equal(z2[:], a) # --- Highly irregular chunks --- @@ -1894,7 +1868,7 @@ def test_rectilinear_shards_validates_divisibility(self, tmp_path: Path) -> None def test_nchunks(self, tmp_path: Path) -> None: z, _ = self._make_2d(tmp_path) - assert z.metadata.chunk_grid.get_nchunks() == 12 + assert z.chunk_grid.get_nchunks() == 12 # --------------------------------------------------------------------------- @@ -1956,7 +1930,7 @@ def rectilinear_arrays_st(draw: st.DrawFn) -> tuple[zarr.Array[Any], np.ndarray[ def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: """Property test: block indexing on rectilinear arrays matches numpy.""" z, a = data.draw(rectilinear_arrays_st()) - grid = z.metadata.chunk_grid + grid = z.chunk_grid # Pick a random block per dimension and verify it matches the expected slice for dim in range(a.ndim): @@ -2002,7 +1976,7 @@ def test_v2_create_and_readback(self, tmp_path: Path) -> None: np.testing.assert_array_equal(a[:], data) def test_v2_chunk_grid_is_regular(self, tmp_path: Path) -> None: - """V2 metadata.chunk_grid produces a regular ChunkGrid with FixedDimensions.""" + """V2 chunk_grid produces a regular ChunkGrid with FixedDimensions.""" a = zarr.create_array( store=tmp_path / "v2.zarr", shape=(20, 30), @@ -2010,7 +1984,7 @@ def test_v2_chunk_grid_is_regular(self, tmp_path: Path) -> None: dtype="int32", zarr_format=2, ) - grid = a.metadata.chunk_grid + grid = a.chunk_grid assert grid.is_regular assert grid.chunk_shape == (10, 15) assert grid.shape == (2, 2) @@ -2025,7 +1999,7 @@ def test_v2_boundary_chunks(self, tmp_path: Path) -> None: dtype="int32", zarr_format=2, ) - grid = a.metadata.chunk_grid + grid = a.chunk_grid assert grid.dimensions[0].nchunks == 3 assert 
grid.dimensions[0].chunk_size(2) == 10 # full codec buffer assert grid.dimensions[0].data_size(2) == 5 # clipped to extent @@ -2061,7 +2035,7 @@ def test_v2_metadata_roundtrip(self, tmp_path: Path) -> None: b = zarr.open_array(store=store_path, mode="r") assert b.metadata.zarr_format == 2 assert b.chunks == (2, 2) - assert b.metadata.chunk_grid.chunk_shape == (2, 2) + assert b.chunk_grid.chunk_shape == (2, 2) np.testing.assert_array_equal(b[:], data) def test_v2_chunk_spec_via_grid(self, tmp_path: Path) -> None: @@ -2073,7 +2047,7 @@ def test_v2_chunk_spec_via_grid(self, tmp_path: Path) -> None: dtype="int32", zarr_format=2, ) - grid = a.metadata.chunk_grid + grid = a.chunk_grid # Interior chunk spec = grid[(0, 0)] assert spec is not None @@ -2087,12 +2061,12 @@ def test_v2_chunk_spec_via_grid(self, tmp_path: Path) -> None: # --------------------------------------------------------------------------- -# .chunk_sizes property +# .read_chunk_sizes / .write_chunk_sizes properties # --------------------------------------------------------------------------- class TestChunkSizes: - """Tests for ChunkGrid.chunk_sizes and Array.chunk_sizes.""" + """Tests for ChunkGrid.chunk_sizes and Array.read_chunk_sizes / write_chunk_sizes.""" def test_regular_grid(self) -> None: grid = ChunkGrid.from_regular((100, 80), (30, 40)) @@ -2110,19 +2084,36 @@ def test_single_chunk(self) -> None: grid = ChunkGrid.from_regular((10,), (10,)) assert grid.chunk_sizes == ((10,),) - def test_array_property_regular(self) -> None: + def test_array_read_chunk_sizes_regular(self) -> None: store = zarr.storage.MemoryStore() arr = zarr.create_array( store=store, shape=(100, 80), chunks=(30, 40), dtype="i4", zarr_format=3 ) - assert arr.chunk_sizes == ((30, 30, 30, 10), (40, 40)) + assert arr.read_chunk_sizes == ((30, 30, 30, 10), (40, 40)) + assert arr.write_chunk_sizes == ((30, 30, 30, 10), (40, 40)) - def test_array_property_rectilinear(self) -> None: + def test_array_read_chunk_sizes_rectilinear(self) 
-> None: store = zarr.storage.MemoryStore() arr = zarr.create_array( store=store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]], dtype="i4", zarr_format=3 ) - assert arr.chunk_sizes == ((10, 20, 30), (50, 50)) + assert arr.read_chunk_sizes == ((10, 20, 30), (50, 50)) + assert arr.write_chunk_sizes == ((10, 20, 30), (50, 50)) + + def test_array_sharded_chunk_sizes(self) -> None: + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(120, 80), + chunks=(60, 40), + shards=(120, 80), + dtype="i4", + zarr_format=3, + ) + # read_chunk_sizes returns inner chunks + assert arr.read_chunk_sizes == ((60, 60), (40, 40)) + # write_chunk_sizes returns outer (shard) chunks + assert arr.write_chunk_sizes == ((120,), (80,)) # --------------------------------------------------------------------------- @@ -2176,24 +2167,30 @@ def test_grow_multiple_dims(self) -> None: def test_shrink_single_dim(self) -> None: grid = ChunkGrid.from_rectilinear([[10, 20, 30, 40], [25, 25]], array_shape=(100, 50)) new_grid = grid.update_shape((35, 50)) - # 10+20=30 < 35, 10+20+30=60 >= 35 → keep (10, 20, 30) - assert _edges(new_grid, 0) == (10, 20, 30) + # All edges preserved (spec allows trailing edges beyond extent) + assert _edges(new_grid, 0) == (10, 20, 30, 40) + # But only 3 chunks are active (10+20+30=60 >= 35) + assert new_grid.dimensions[0].nchunks == 3 assert _edges(new_grid, 1) == (25, 25) def test_shrink_to_single_chunk(self) -> None: grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) new_grid = grid.update_shape((5, 50)) - assert _edges(new_grid, 0) == (10,) + # All edges preserved + assert _edges(new_grid, 0) == (10, 20, 30) + # But only 1 chunk is active (10 >= 5) + assert new_grid.dimensions[0].nchunks == 1 assert _edges(new_grid, 1) == (25, 25) def test_shrink_multiple_dims(self) -> None: grid = ChunkGrid.from_rectilinear([[10, 10, 15, 5], [20, 25, 15]], array_shape=(40, 60)) # from (40, 60) to (25, 35) new_grid = 
grid.update_shape((25, 35)) - # dim 0: 10+10=20 < 25, 10+10+15=35 >= 25 → keep (10, 10, 15) - assert _edges(new_grid, 0) == (10, 10, 15) - # dim 1: 20 < 35, 20+25=45 >= 35 → keep (20, 25) - assert _edges(new_grid, 1) == (20, 25) + # All edges preserved, but nchunks reflects active chunks + assert _edges(new_grid, 0) == (10, 10, 15, 5) + assert new_grid.dimensions[0].nchunks == 3 # 10+10+15=35 >= 25 + assert _edges(new_grid, 1) == (20, 25, 15) + assert new_grid.dimensions[1].nchunks == 2 # 20+25=45 >= 35 def test_dimension_mismatch_error(self) -> None: grid = ChunkGrid.from_rectilinear([[10, 20], [30, 40]], array_shape=(30, 70)) @@ -2207,13 +2204,14 @@ def test_boundary_cases(self) -> None: assert _edges(new_grid, 0) == (10, 20, 30) # no change (60 == sum) assert _edges(new_grid, 1) == (15, 25, 25) # added chunk of 25 - # Shrink to exact chunk boundary + # Shrink to exact chunk boundary — edges preserved, nchunks adjusts grid2 = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25, 10]], array_shape=(60, 50)) new_grid2 = grid2.update_shape((30, 40)) - # dim 0: 10+20=30 >= 30 → keep (10, 20) - assert _edges(new_grid2, 0) == (10, 20) - # dim 1: 15+25=40 >= 40 → keep (15, 25) - assert _edges(new_grid2, 1) == (15, 25) + # All edges preserved + assert _edges(new_grid2, 0) == (10, 20, 30) + assert new_grid2.dimensions[0].nchunks == 2 # 10+20=30 >= 30 + assert _edges(new_grid2, 1) == (15, 25, 10) + assert new_grid2.dimensions[1].nchunks == 2 # 15+25=40 >= 40 def test_regular_preserves_extents(self, tmp_path: Path) -> None: """Resize a regular array — chunk_grid extents must match new shape.""" @@ -2226,7 +2224,7 @@ def test_regular_preserves_extents(self, tmp_path: Path) -> None: z[:] = np.arange(100, dtype="int32") z.resize(50) assert z.shape == (50,) - assert z.metadata.chunk_grid.dimensions[0].extent == 50 + assert z.chunk_grid.dimensions[0].extent == 50 class TestResizeRectilinear: @@ -2246,8 +2244,8 @@ async def test_async_resize_grow(self) -> None: await arr.resize((50, 
60)) assert arr.shape == (50, 60) - assert _edges(arr.metadata.chunk_grid, 0) == (10, 20, 20) - assert _edges(arr.metadata.chunk_grid, 1) == (20, 20, 20) + assert _edges(arr.chunk_grid, 0) == (10, 20, 20) + assert _edges(arr.chunk_grid, 1) == (20, 20, 20) result = await arr.getitem((slice(0, 30), slice(0, 40))) np.testing.assert_array_equal(result, data) @@ -2493,25 +2491,21 @@ class TestMultipleOverflowChunks: """Rectilinear grids where multiple chunks extend past the array extent.""" def test_multiple_chunks_past_extent(self) -> None: - """Chunks 2 is partial, chunk 3 is entirely past the extent.""" + """Edges past extent are structural; nchunks counts active only.""" g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) d = g.dimensions[0] - assert d.nchunks == 4 + assert d.ngridcells == 4 # structural: all edges + assert d.nchunks == 3 # active: chunks overlapping [0, 50) assert d.data_size(0) == 10 # fully within assert d.data_size(1) == 20 # fully within assert d.data_size(2) == 20 # partial: 50 - 30 = 20 - assert d.data_size(3) == 0 # entirely past assert d.chunk_size(2) == 30 # codec buffer: full edge - assert d.chunk_size(3) == 40 # codec buffer: full edge - def test_chunk_spec_entirely_past_extent(self) -> None: - """ChunkSpec for a chunk entirely past the extent has zero-size shape.""" + def test_chunk_spec_past_extent_is_oob(self) -> None: + """Chunk entirely past the extent is out of bounds (not active).""" g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) spec = g[(3,)] - assert spec is not None - assert spec.shape == (0,) - assert spec.codec_shape == (40,) - assert spec.is_boundary is True + assert spec is None def test_chunk_spec_partial_overflow(self) -> None: """ChunkSpec for a partially-overflowing chunk clips correctly.""" @@ -2524,9 +2518,9 @@ def test_chunk_spec_partial_overflow(self) -> None: assert spec.slices == (slice(30, 50, 1),) def test_chunk_sizes_with_overflow(self) -> None: - """chunk_sizes returns 
clipped data sizes including zero for past-extent chunks.""" + """chunk_sizes only includes active chunks.""" g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) - assert g.chunk_sizes == ((10, 20, 20, 0),) + assert g.chunk_sizes == ((10, 20, 20),) def test_multidim_overflow(self) -> None: """Overflow in multiple dimensions simultaneously.""" @@ -2556,8 +2550,9 @@ def test_serialization_roundtrip_overflow(self) -> None: "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30, 40]]}, } g2 = parse_chunk_grid(serialized, (50,)) - assert g2.dimensions[0].nchunks == 4 - assert g2.chunk_sizes == ((10, 20, 20, 0),) + assert g2.dimensions[0].ngridcells == 4 + assert g2.dimensions[0].nchunks == 3 + assert g2.chunk_sizes == ((10, 20, 20),) def test_index_to_chunk_near_extent(self) -> None: """Index lookup near and at the extent boundary.""" @@ -2696,7 +2691,9 @@ def test_shrink_to_exact_boundary(self) -> None: new_grid = grid.update_shape((30,)) dim = new_grid.dimensions[0] assert isinstance(dim, VaryingDimension) - assert dim.edges == (10, 20) # chunk 2 dropped entirely + assert dim.edges == (10, 20, 30) # all edges preserved + assert dim.nchunks == 2 # only first two are active (10+20=30 >= 30) + assert dim.ngridcells == 3 assert dim.extent == 30 assert dim.data_size(1) == 20 # no clipping needed From bfc5d6bd8a29ce687bf65d6c54cf8e4aa82cd816 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 19:04:38 -0400 Subject: [PATCH 073/118] Fixup --- src/zarr/metadata/migrate_v3.py | 4 ++-- tests/conftest.py | 6 +++--- tests/test_group.py | 4 +--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/zarr/metadata/migrate_v3.py b/src/zarr/metadata/migrate_v3.py index a72939100d..80c50585be 100644 --- a/src/zarr/metadata/migrate_v3.py +++ b/src/zarr/metadata/migrate_v3.py @@ -27,7 +27,7 @@ from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from zarr.core.group import 
GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata -from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid from zarr.core.sync import sync from zarr.registry import get_codec_class from zarr.storage import StorePath @@ -211,7 +211,7 @@ def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: return ArrayV3Metadata( shape=metadata_v2.shape, data_type=metadata_v2.dtype, - chunk_grid=metadata_v2.chunk_grid, + chunk_grid=RegularChunkGrid(chunk_shape=metadata_v2.chunks), chunk_key_encoding=chunk_key_encoding, fill_value=metadata_v2.fill_value, codecs=codecs, diff --git a/tests/conftest.py b/tests/conftest.py index f1cdec08e5..55ebb9b594 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,7 @@ _parse_chunk_encoding_v3, _parse_chunk_key_encoding, ) -from zarr.core.chunk_grids import ChunkGrid, _auto_partition +from zarr.core.chunk_grids import _auto_partition from zarr.core.common import ( JSON, DimensionNames, @@ -37,7 +37,7 @@ ) from zarr.core.dtype.common import HasItemSize from zarr.core.metadata.v2 import ArrayV2Metadata -from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid from zarr.core.sync import sync from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore from zarr.testing.store import LatencyStore @@ -379,7 +379,7 @@ def create_array_metadata( sharding_codec.validate( shape=chunk_shape_parsed, dtype=dtype_parsed, - chunk_grid=ChunkGrid.from_regular(chunk_shape_parsed, shard_shape_parsed), + chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), ) codecs_out = (sharding_codec,) chunks_out = shard_shape_parsed diff --git a/tests/test_group.py b/tests/test_group.py index 6f1f4e68fa..89c80709a5 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -1176,9 +1176,7 @@ async def test_asyncgroup_create_array( assert subnode.store_path.store == store 
assert subnode.shape == shape assert subnode.dtype == dtype - # todo: fix the type annotation of array.metadata.chunk_grid so that we get some autocomplete - # here. - assert subnode.metadata.chunk_grid.chunk_shape == chunk_shape + assert subnode.chunk_grid.chunk_shape == chunk_shape assert subnode.metadata.zarr_format == zarr_format From 0f783393ec0e281ae084ce1344241989b389964e Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 21:12:56 -0400 Subject: [PATCH 074/118] Remove duplicated code --- src/zarr/core/chunk_grids.py | 49 +++----------------------------- src/zarr/core/common.py | 43 ++++++++++++++++++++++++++++ src/zarr/core/metadata/v3.py | 49 +++----------------------------- tests/test_unified_chunk_grid.py | 25 ++++++++-------- 4 files changed, 63 insertions(+), 103 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 858167e645..5fdcbee482 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -20,6 +20,8 @@ NamedConfig, ShapeLike, ceildiv, + compress_rle, + expand_rle, parse_named_configuration, parse_shapelike, ) @@ -264,49 +266,6 @@ def is_boundary(self) -> bool: # --------------------------------------------------------------------------- -def _expand_rle(data: Sequence[list[int] | int]) -> list[int]: - """Expand a mixed array of bare integers and RLE pairs. 
- - Per the rectilinear chunk grid spec, each element can be: - - a bare integer (an explicit edge length) - - a two-element array ``[value, count]`` (run-length encoded) - """ - result: list[int] = [] - for item in data: - if isinstance(item, (int, float)) and not isinstance(item, bool): - result.append(int(item)) - elif isinstance(item, list) and len(item) == 2: - size, count = int(item[0]), int(item[1]) - result.extend([size] * count) - else: - raise ValueError(f"RLE entries must be an integer or [size, count], got {item}") - return result - - -def _compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: - """Compress chunk sizes to mixed RLE format per the rectilinear spec. - - Runs of length > 1 are emitted as ``[value, count]`` pairs; runs of - length 1 are emitted as bare integers:: - - [10, 10, 10, 5] -> [[10, 3], 5] - """ - if not sizes: - return [] - result: list[int | list[int]] = [] - current = sizes[0] - count = 1 - for s in sizes[1:]: - if s == current: - count += 1 - else: - result.append([current, count] if count > 1 else current) - current = s - count = 1 - result.append([current, count] if count > 1 else current) - return result - - # A single dimension's rectilinear chunk spec: bare int (uniform shorthand), # list of ints (explicit edges), or mixed RLE (e.g. [[10, 3], 5]). 
RectilinearDimSpec = int | list[int | list[int]] @@ -329,7 +288,7 @@ def _serialize_fixed_dim(dim: FixedDimension) -> RectilinearDimSpec: def _serialize_varying_dim(dim: VaryingDimension) -> RectilinearDimSpec: """RLE-compressed rectilinear representation for a varying dimension.""" edges = list(dim.edges) - rle = _compress_rle(edges) + rle = compress_rle(edges) if len(rle) < len(edges): return rle # mypy: list[int] is invariant, so it won't widen to list[int | list[int]] @@ -380,7 +339,7 @@ def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[in # Check if the list contains any sub-lists (RLE pairs) or is all bare ints has_sublists = any(isinstance(e, list) for e in dim_spec) if has_sublists: - return _expand_rle(dim_spec) + return expand_rle(dim_spec) else: # All bare integers — explicit edge lengths return [int(e) for e in dim_spec] diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 275d062eba..4773e7ac91 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -252,3 +252,46 @@ def _warn_order_kwarg() -> None: def _default_zarr_format() -> ZarrFormat: """Return the default zarr_version""" return cast("ZarrFormat", int(zarr_config.get("default_zarr_format", 3))) + + +def expand_rle(data: Sequence[int | list[int]]) -> list[int]: + """Expand a mixed array of bare integers and RLE pairs. 
+ + Per the rectilinear chunk grid spec, each element can be: + - a bare integer (an explicit edge length) + - a two-element array ``[value, count]`` (run-length encoded) + """ + result: list[int] = [] + for item in data: + if isinstance(item, (int, float)) and not isinstance(item, bool): + result.append(int(item)) + elif isinstance(item, list) and len(item) == 2: + size, count = int(item[0]), int(item[1]) + result.extend([size] * count) + else: + raise ValueError(f"RLE entries must be an integer or [size, count], got {item}") + return result + + +def compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: + """Compress chunk sizes to mixed RLE format per the rectilinear spec. + + Runs of length > 1 are emitted as ``[value, count]`` pairs; runs of + length 1 are emitted as bare integers:: + + [10, 10, 10, 5] -> [[10, 3], 5] + """ + if not sizes: + return [] + result: list[int | list[int]] = [] + current = sizes[0] + count = 1 + for s in sizes[1:]: + if s == current: + count += 1 + else: + result.append([current, count] if count > 1 else current) + current = s + count = 1 + result.append([current, count] if count > 1 else current) + return result diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index a609bda692..e9bed9b092 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -20,6 +20,8 @@ DimensionNames, NamedConfig, NamedRequiredConfig, + compress_rle, + expand_rle, parse_named_configuration, parse_shapelike, ) @@ -223,49 +225,6 @@ def _validate_rectilinear_kind(kind: str | None) -> None: ) -def _expand_rle(data: Sequence[int | list[int]]) -> list[int]: - """Expand a mixed array of bare integers and RLE pairs. 
- - Per the rectilinear chunk grid spec, each element can be: - - a bare integer (an explicit edge length) - - a two-element array ``[value, count]`` (run-length encoded) - """ - result: list[int] = [] - for item in data: - if isinstance(item, (int, float)) and not isinstance(item, bool): - result.append(int(item)) - elif isinstance(item, list) and len(item) == 2: - size, count = int(item[0]), int(item[1]) - result.extend([size] * count) - else: - raise ValueError(f"RLE entries must be an integer or [size, count], got {item}") - return result - - -def _compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: - """Compress chunk sizes to mixed RLE format per the rectilinear spec. - - Runs of length > 1 are emitted as ``[value, count]`` pairs; runs of - length 1 are emitted as bare integers:: - - [10, 10, 10, 5] -> [[10, 3], 5] - """ - if not sizes: - return [] - result: list[int | list[int]] = [] - current = sizes[0] - count = 1 - for s in sizes[1:]: - if s == current: - count += 1 - else: - result.append([current, count] if count > 1 else current) - current = s - count = 1 - result.append([current, count] if count > 1 else current) - return result - - def _validate_chunk_shapes( chunk_shapes: Sequence[Sequence[int]], ) -> tuple[tuple[int, ...], ...]: @@ -342,7 +301,7 @@ def ndim(self) -> int: def to_dict(self) -> RectilinearChunkGridJSON: # type: ignore[override] serialized_dims: list[RectilinearDimSpecJSON] = [] for edges in self.chunk_shapes: - rle = _compress_rle(edges) + rle = compress_rle(edges) # Use RLE only if it's actually shorter if len(rle) < len(edges): serialized_dims.append(rle) @@ -390,7 +349,7 @@ def from_dict(cls, data: RectilinearChunkGridJSON) -> Self: # type: ignore[over raise ValueError(f"Integer chunk edge length must be >= 1, got {dim_spec}") expanded.append((dim_spec,)) elif isinstance(dim_spec, list): - expanded.append(tuple(_expand_rle(dim_spec))) + expanded.append(tuple(expand_rle(dim_spec))) else: raise TypeError( f"Invalid 
chunk_shapes entry: expected int or list, got {type(dim_spec)}" diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 298b3de61d..3a10e52b61 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -20,14 +20,13 @@ ChunkSpec, FixedDimension, VaryingDimension, - _compress_rle, _decode_dim_spec, - _expand_rle, _infer_chunk_grid_name, _is_rectilinear_chunks, parse_chunk_grid, serialize_chunk_grid, ) +from zarr.core.common import compress_rle, expand_rle from zarr.errors import BoundsCheckError from zarr.storage import MemoryStore @@ -477,29 +476,29 @@ def test_iter(self) -> None: class TestRLE: def test_expand(self) -> None: - assert _expand_rle([[10, 3]]) == [10, 10, 10] - assert _expand_rle([[10, 2], [20, 1]]) == [10, 10, 20] + assert expand_rle([[10, 3]]) == [10, 10, 10] + assert expand_rle([[10, 2], [20, 1]]) == [10, 10, 20] def test_compress(self) -> None: - assert _compress_rle([10, 10, 10]) == [[10, 3]] - assert _compress_rle([10, 10, 20]) == [[10, 2], 20] - assert _compress_rle([5]) == [5] - assert _compress_rle([10, 20, 30]) == [10, 20, 30] + assert compress_rle([10, 10, 10]) == [[10, 3]] + assert compress_rle([10, 10, 20]) == [[10, 2], 20] + assert compress_rle([5]) == [5] + assert compress_rle([10, 20, 30]) == [10, 20, 30] def test_roundtrip(self) -> None: original = [10, 10, 10, 20, 20, 30] - compressed = _compress_rle(original) - assert _expand_rle(compressed) == original + compressed = compress_rle(original) + assert expand_rle(compressed) == original class TestExpandRleHandlesJsonFloats: def test_bare_integer_floats_accepted(self) -> None: - """JSON parsers may emit 10.0 for the integer 10; _expand_rle should handle it.""" - result = _expand_rle([10.0, 20.0]) # type: ignore[list-item] + """JSON parsers may emit 10.0 for the integer 10; expand_rle should handle it.""" + result = expand_rle([10.0, 20.0]) # type: ignore[list-item] assert result == [10, 20] def 
test_rle_pair_with_float_count(self) -> None: - result = _expand_rle([[10, 3.0]]) # type: ignore[list-item] + result = expand_rle([[10, 3.0]]) # type: ignore[list-item] assert result == [10, 10, 10] From fa6980d6e19f11cc0a56ca6947eeb0a2cc0bbf0b Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 21:14:01 -0400 Subject: [PATCH 075/118] Add to experimental --- src/zarr/experimental/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/zarr/experimental/__init__.py b/src/zarr/experimental/__init__.py index 3863510c65..f7caaf96a1 100644 --- a/src/zarr/experimental/__init__.py +++ b/src/zarr/experimental/__init__.py @@ -1 +1,5 @@ """The experimental module is a site for exporting new or experimental Zarr features.""" + +from zarr.core.chunk_grids import ChunkGrid, ChunkSpec + +__all__ = ["ChunkGrid", "ChunkSpec"] From 2360392e972a618f60face775ea9ac3646eb7c0b Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 21:22:43 -0400 Subject: [PATCH 076/118] Avoid divide by zero --- src/zarr/core/chunk_grids.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 5fdcbee482..c142b2e0d6 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -484,8 +484,10 @@ def from_rectilinear( # Collapse to FixedDimension when edges are uniform AND either # extent == edge_sum (exact fit) or the number of edges matches # ceildiv(extent, edge) (regular grid with boundary overflow). 
- if all(e == edges_list[0] for e in edges_list) and ( - extent == edge_sum or len(edges_list) == ceildiv(extent, edges_list[0]) + if ( + edges_list[0] > 0 + and all(e == edges_list[0] for e in edges_list) + and (extent == edge_sum or len(edges_list) == ceildiv(extent, edges_list[0])) ): dims.append(FixedDimension(size=edges_list[0], extent=extent)) else: From 21aa18b5be623d15a3033ac2cd2f90d0aa4f42e3 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 21:23:08 -0400 Subject: [PATCH 077/118] Improve RLE validation --- src/zarr/core/common.py | 9 ++++++++- tests/test_unified_chunk_grid.py | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 4773e7ac91..a32a3e8ed8 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -264,9 +264,16 @@ def expand_rle(data: Sequence[int | list[int]]) -> list[int]: result: list[int] = [] for item in data: if isinstance(item, (int, float)) and not isinstance(item, bool): - result.append(int(item)) + val = int(item) + if val < 1: + raise ValueError(f"Chunk edge length must be >= 1, got {val}") + result.append(val) elif isinstance(item, list) and len(item) == 2: size, count = int(item[0]), int(item[1]) + if size < 1: + raise ValueError(f"Chunk edge length must be >= 1, got {size}") + if count < 1: + raise ValueError(f"RLE repeat count must be >= 1, got {count}") result.extend([size] * count) else: raise ValueError(f"RLE entries must be an integer or [size, count], got {item}") diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 3a10e52b61..e627f6d2c7 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -490,6 +490,30 @@ def test_roundtrip(self) -> None: compressed = compress_rle(original) assert expand_rle(compressed) == original + def test_expand_rejects_zero_edge(self) -> None: + with 
pytest.raises(ValueError, match="Chunk edge length must be >= 1"): + expand_rle([0]) + + def test_expand_rejects_negative_edge(self) -> None: + with pytest.raises(ValueError, match="Chunk edge length must be >= 1"): + expand_rle([-5]) + + def test_expand_rejects_zero_rle_size(self) -> None: + with pytest.raises(ValueError, match="Chunk edge length must be >= 1"): + expand_rle([[0, 3]]) + + def test_expand_rejects_negative_rle_size(self) -> None: + with pytest.raises(ValueError, match="Chunk edge length must be >= 1"): + expand_rle([[-10, 2]]) + + def test_expand_rejects_zero_rle_count(self) -> None: + with pytest.raises(ValueError, match="RLE repeat count must be >= 1"): + expand_rle([[5, 0]]) + + def test_expand_rejects_negative_rle_count(self) -> None: + with pytest.raises(ValueError, match="RLE repeat count must be >= 1"): + expand_rle([[5, -1]]) + class TestExpandRleHandlesJsonFloats: def test_bare_integer_floats_accepted(self) -> None: From 7e171f512ea3ee850709819246b45467ad25aa45 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:29:53 -0400 Subject: [PATCH 078/118] Raise error on unknown chunk grid --- src/zarr/codecs/sharding.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 43f48cd87e..632308b806 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -417,6 +417,11 @@ def validate( f"Chunk edge length {edge} in dimension {i} is not " f"divisible by the shard's inner chunk size {inner}." 
) + else: + raise TypeError( + f"Sharding is only compatible with regular and rectilinear chunk grids, " + f"got {type(chunk_grid)}" + ) async def _decode_single( self, From b6b271f8839db9ba431e274ca0da4fa25549e436 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:30:51 -0400 Subject: [PATCH 079/118] Add utility function --- src/zarr/core/common.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 8b8046f802..97bf5cb097 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -303,3 +303,20 @@ def compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: count = 1 result.append([current, count] if count > 1 else current) return result + + +def validate_rectilinear_edges( + chunk_shapes: Sequence[Sequence[int]], array_shape: Sequence[int] +) -> None: + """Validate that rectilinear chunk edges cover the array extent per dimension. + + Raises ValueError if any dimension's edge sum is less than the corresponding + array extent. 
+ """ + for i, (edges, extent) in enumerate(zip(chunk_shapes, array_shape, strict=True)): + edge_sum = sum(edges) + if edge_sum < extent: + raise ValueError( + f"Rectilinear chunk edges for dimension {i} sum to {edge_sum} " + f"but array shape extent is {extent} (edge sum must be >= extent)" + ) From c19e9db49f96c1900c9ea25413f15bce07d076e4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:32:28 -0400 Subject: [PATCH 080/118] Minor improvements --- src/zarr/core/array.py | 2 +- src/zarr/core/chunk_grids.py | 45 ++++++++++++++++---------------- src/zarr/core/metadata/v2.py | 20 ++++++++++++++ src/zarr/core/metadata/v3.py | 3 +++ tests/test_unified_chunk_grid.py | 30 ++++++++++----------- 5 files changed, 62 insertions(+), 38 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index cc88e71784..a22a7e6cdf 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1345,7 +1345,7 @@ def _chunk_grid_shape(self) -> tuple[int, ...]: # When sharding, count inner chunks across the whole array chunk_shape = codecs[0].chunk_shape return tuple(starmap(ceildiv, zip(self.shape, chunk_shape, strict=True))) - return self.chunk_grid.shape + return self.chunk_grid.grid_shape @property def _shard_grid_shape(self) -> tuple[int, ...]: diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 6e5549068e..f3c20a1647 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -24,6 +24,7 @@ expand_rle, parse_named_configuration, parse_shapelike, + validate_rectilinear_edges, ) from zarr.errors import ZarrUserWarning @@ -127,6 +128,8 @@ class VaryingDimension: edges: tuple[int, ...] # per-chunk edge lengths (all > 0) cumulative: tuple[int, ...] 
# prefix sums for O(log n) lookup extent: int # array dimension length (may be < sum(edges) after resize) + nchunks: int = field(init=False, repr=False) # cached at construction + ngridcells: int = field(init=False, repr=False) # cached at construction def __init__(self, edges: Sequence[int], extent: int) -> None: edges_tuple = tuple(edges) @@ -144,34 +147,38 @@ def __init__(self, edges: Sequence[int], extent: int) -> None: object.__setattr__(self, "edges", edges_tuple) object.__setattr__(self, "cumulative", cumulative) object.__setattr__(self, "extent", extent) - - @property - def ngridcells(self) -> int: - """Total grid cells including those past the array extent.""" - return len(self.edges) - - @property - def nchunks(self) -> int: - """Number of chunks that contain data (overlap [0, extent)).""" - if self.extent == 0: - return 0 - return bisect.bisect_left(self.cumulative, self.extent) + 1 + # Cache nchunks: number of chunks that overlap [0, extent) + if extent == 0: + n = 0 + else: + n = bisect.bisect_left(cumulative, extent) + 1 + object.__setattr__(self, "nchunks", n) + object.__setattr__(self, "ngridcells", len(edges_tuple)) def index_to_chunk(self, idx: int) -> int: if idx < 0 or idx >= self.extent: raise IndexError(f"Index {idx} out of bounds for dimension with extent {self.extent}") return bisect.bisect_right(self.cumulative, idx) + def _check_chunk_ix(self, chunk_ix: int) -> None: + if chunk_ix < 0 or chunk_ix >= len(self.edges): + raise IndexError( + f"Chunk index {chunk_ix} out of bounds for dimension with {len(self.edges)} grid cells" + ) + def chunk_offset(self, chunk_ix: int) -> int: + self._check_chunk_ix(chunk_ix) return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 def chunk_size(self, chunk_ix: int) -> int: """Buffer size for codec processing.""" + self._check_chunk_ix(chunk_ix) return self.edges[chunk_ix] def data_size(self, chunk_ix: int) -> int: """Valid data region within the buffer — clipped at extent.""" - offset = 
self.chunk_offset(chunk_ix) + self._check_chunk_ix(chunk_ix) + offset = self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 return max(0, min(self.edges[chunk_ix], self.extent - offset)) @property @@ -505,7 +512,7 @@ def is_regular(self) -> bool: return self._is_regular @property - def shape(self) -> tuple[int, ...]: + def grid_shape(self) -> tuple[int, ...]: """Number of chunks per dimension.""" return tuple(d.nchunks for d in self.dimensions) @@ -589,7 +596,7 @@ def all_chunk_coords( origin_parsed = tuple(origin) if selection_shape is None: selection_shape_parsed = tuple( - g - o for o, g in zip(origin_parsed, self.shape, strict=True) + g - o for o, g in zip(origin_parsed, self.grid_shape, strict=True) ) else: selection_shape_parsed = tuple(selection_shape) @@ -702,13 +709,7 @@ def parse_chunk_grid( decoded: list[list[int]] = [] for dim_spec, extent in zip(chunk_shapes_raw, array_shape, strict=True): decoded.append(_decode_dim_spec(dim_spec, array_extent=extent)) - for i, (edges, extent) in enumerate(zip(decoded, array_shape, strict=True)): - edge_sum = sum(edges) - if edge_sum < extent: - raise ValueError( - f"Rectilinear chunk edges for dimension {i} sum to {edge_sum} " - f"but array shape extent is {extent} (edge sum must be >= extent)" - ) + validate_rectilinear_edges(decoded, array_shape) return ChunkGrid.from_rectilinear(decoded, array_shape=array_shape) raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 36dbd27c3b..cce7d0d385 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -2,6 +2,7 @@ import warnings from collections.abc import Iterable, Sequence +from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast from zarr.abc.metadata import Metadata @@ -17,6 +18,7 @@ import numpy.typing as npt from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.chunk_grids import ChunkGrid from 
zarr.core.dtype.wrapper import ( TBaseDType, TBaseScalar, @@ -115,6 +117,24 @@ def __init__( def ndim(self) -> int: return len(self.shape) + @cached_property + def chunk_grid(self) -> ChunkGrid: + """Backwards-compatible chunk grid property. + + .. deprecated:: + Access the chunk grid via the array layer instead. + This property will be removed in a future release. + """ + from zarr.core.chunk_grids import ChunkGrid + + warnings.warn( + "ArrayV2Metadata.chunk_grid is deprecated. " + "Use ChunkGrid.from_metadata(metadata) instead.", + DeprecationWarning, + stacklevel=2, + ) + return ChunkGrid.from_regular(self.shape, self.chunks) + @property def shards(self) -> tuple[int, ...] | None: return None diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 4f6c3be408..c62458b74e 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -24,6 +24,7 @@ expand_rle, parse_named_configuration, parse_shapelike, + validate_rectilinear_edges, ) from zarr.core.config import config from zarr.core.dtype import VariableLengthUTF8, ZDType, get_data_type_from_json @@ -465,6 +466,8 @@ def __init__( def _validate_metadata(self) -> None: if len(self.shape) != self.chunk_grid.ndim: raise ValueError("`chunk_grid` and `shape` need to have the same number of dimensions.") + if isinstance(self.chunk_grid, RectilinearChunkGrid): + validate_rectilinear_edges(self.chunk_grid.chunk_shapes, self.shape) if self.dimension_names is not None and len(self.shape) != len(self.dimension_names): raise ValueError( "`dimension_names` and `shape` need to have the same number of dimensions." 
diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index e627f6d2c7..a286e993a1 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -365,15 +365,15 @@ def test_all_uniform_becomes_regular(self) -> None: class TestChunkGridQueries: def test_regular_shape(self) -> None: g = ChunkGrid.from_regular((100, 200), (10, 20)) - assert g.shape == (10, 10) + assert g.grid_shape == (10, 10) def test_regular_shape_boundary(self) -> None: g = ChunkGrid.from_regular((95, 200), (10, 20)) - assert g.shape == (10, 10) # ceildiv(95, 10) == 10 + assert g.grid_shape == (10, 10) # ceildiv(95, 10) == 10 def test_rectilinear_shape(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) - assert g.shape == (3, 4) + assert g.grid_shape == (3, 4) def test_regular_getitem(self) -> None: g = ChunkGrid.from_regular((100, 200), (10, 20)) @@ -718,7 +718,7 @@ def test_json_roundtrip(self) -> None: json_str = json.dumps(d) d2 = json.loads(json_str) g2 = parse_chunk_grid(d2, (60, 100)) - assert g2.shape == (3, 2) + assert g2.grid_shape == (3, 2) def test_unknown_name_raises(self) -> None: with pytest.raises(ValueError, match="Unknown chunk grid"): @@ -866,7 +866,7 @@ def test_rectilinear_extent_match_passes(self) -> None: "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [25, 25]]}, } g = parse_chunk_grid(data, (60, 50)) - assert g.shape == (3, 2) + assert g.grid_shape == (3, 2) def test_rectilinear_ndim_mismatch_raises(self) -> None: data: dict[str, Any] = { @@ -884,7 +884,7 @@ def test_rectilinear_rle_extent_validated(self) -> None: } # sum = 50 and 50 — match (50, 50) g = parse_chunk_grid(data, (50, 50)) - assert g.shape == (5, 2) + assert g.grid_shape == (5, 2) # mismatch with pytest.raises(ValueError, match="sum to 50 but array shape extent is 100"): parse_chunk_grid(data, (100, 50)) @@ -1042,7 +1042,7 @@ def test_rectilinear_metadata_serialization(self, tmp_path: 
Path) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) d = serialize_chunk_grid(g, "rectilinear") g2 = parse_chunk_grid(d, (60, 100)) - assert g2.shape == g.shape + assert g2.grid_shape == g.grid_shape for coord in g.all_chunk_coords(): orig_spec = g[coord] new_spec = g2[coord] @@ -1243,7 +1243,7 @@ def test_chunk_grid_boundary_iter(self) -> None: def test_chunk_grid_boundary_shape(self) -> None: """shape property with boundary extent.""" g = ChunkGrid(dimensions=(FixedDimension(10, 95),)) - assert g.shape == (10,) # ceildiv(95, 10) = 10 + assert g.grid_shape == (10,) # ceildiv(95, 10) = 10 # -- Boundary FixedDimension in rectilinear serialization -- @@ -1255,7 +1255,7 @@ def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: FixedDimension(size=10, extent=95), ) ) - assert g.shape == (3, 10) + assert g.grid_shape == (3, 10) d = serialize_chunk_grid(g, "rectilinear") assert d["name"] == "rectilinear" @@ -1269,7 +1269,7 @@ def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: assert chunk_shapes[1] == 10 # bare integer shorthand g2 = parse_chunk_grid(d, (60, 95)) - assert g2.shape == g.shape + assert g2.grid_shape == g.grid_shape # Round-tripped grid should have correct extent for coord in g.all_chunk_coords(): orig = g[coord] @@ -1288,7 +1288,7 @@ def test_exact_extent_fixed_dim_rectilinear_roundtrip(self) -> None: ) d = serialize_chunk_grid(g, "rectilinear") g2 = parse_chunk_grid(d, (30, 100)) - assert g2.shape == g.shape + assert g2.grid_shape == g.grid_shape # All chunks should be uniform for coord in g.all_chunk_coords(): orig = g[coord] @@ -1377,7 +1377,7 @@ def test_parse_chunk_grid_rebinds_fixed_extent(self) -> None: g2 = parse_chunk_grid(g, (50, 100)) assert isinstance(g2.dimensions[0], FixedDimension) assert g2.dimensions[0].extent == 50 # re-bound - assert g2.shape == (5, 5) + assert g2.grid_shape == (5, 5) # -- ChunkGrid.__getitem__ validation -- @@ -1460,7 +1460,7 @@ def 
test_zero_nchunks_fixed_dim_in_rectilinear_serialize_raises(self) -> None: FixedDimension(size=10, extent=0), ) ) - assert g.shape == (2, 0) + assert g.grid_shape == (2, 0) with pytest.raises(ValueError, match="zero-extent"): serialize_chunk_grid(g, "rectilinear") @@ -2010,7 +2010,7 @@ def test_v2_chunk_grid_is_regular(self, tmp_path: Path) -> None: grid = a.chunk_grid assert grid.is_regular assert grid.chunk_shape == (10, 15) - assert grid.shape == (2, 2) + assert grid.grid_shape == (2, 2) assert all(isinstance(d, FixedDimension) for d in grid.dimensions) def test_v2_boundary_chunks(self, tmp_path: Path) -> None: @@ -2444,7 +2444,7 @@ def test_parse_chunk_grid_regular_from_dict(self) -> None: g = parse_chunk_grid(d, (100, 200)) assert g.is_regular assert g.chunk_shape == (10, 20) - assert g.shape == (10, 10) + assert g.grid_shape == (10, 10) assert g.get_nchunks() == 100 From 764eeaf62e4be27de9423e25f8c7e2a352b1ce27 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:36:47 -0400 Subject: [PATCH 081/118] Update shorthand --- src/zarr/core/metadata/v3.py | 14 +++++++++----- tests/test_unified_chunk_grid.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index c62458b74e..214867fcb5 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -302,12 +302,16 @@ def ndim(self) -> int: def to_dict(self) -> RectilinearChunkGridJSON: # type: ignore[override] serialized_dims: list[RectilinearDimSpecJSON] = [] for edges in self.chunk_shapes: - rle = compress_rle(edges) - # Use RLE only if it's actually shorter - if len(rle) < len(edges): - serialized_dims.append(rle) + if len(edges) == 1: + # Bare int shorthand: single edge length repeated until sum >= extent + serialized_dims.append(edges[0]) else: - serialized_dims.append(list(edges)) + rle = compress_rle(edges) + # Use RLE only if it's actually 
shorter + if len(rle) < len(edges): + serialized_dims.append(rle) + else: + serialized_dims.append(list(edges)) return { "name": "rectilinear", "configuration": { diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index a286e993a1..b9852d9767 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -27,6 +27,7 @@ serialize_chunk_grid, ) from zarr.core.common import compress_rle, expand_rle +from zarr.core.metadata.v3 import RectilinearChunkGrid from zarr.errors import BoundsCheckError from zarr.storage import MemoryStore @@ -720,6 +721,19 @@ def test_json_roundtrip(self) -> None: g2 = parse_chunk_grid(d2, (60, 100)) assert g2.grid_shape == (3, 2) + def test_bare_int_roundtrip(self) -> None: + """Bare-integer shorthand in chunk_shapes round-trips as bare int, not [int].""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [10, [20, 30]]}, + } + meta = RectilinearChunkGrid.from_dict(data) # type: ignore[arg-type] + out = meta.to_dict() + # Dim 0 was bare int — should stay bare int + assert out["configuration"]["chunk_shapes"][0] == 10 + # Dim 1 was explicit list — should stay list + assert out["configuration"]["chunk_shapes"][1] == [20, 30] + def test_unknown_name_raises(self) -> None: with pytest.raises(ValueError, match="Unknown chunk grid"): parse_chunk_grid({"name": "hexagonal", "configuration": {}}, (10,)) From 54b399df44d0e725238fb743a3480c93bf3ed1cb Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:41:55 -0400 Subject: [PATCH 082/118] Fix zero chunks --- src/zarr/core/chunk_grids.py | 2 +- tests/test_unified_chunk_grid.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index f3c20a1647..332b9a1d34 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -56,7 +56,7 @@ 
def __post_init__(self) -> None: if self.extent < 0: raise ValueError(f"FixedDimension extent must be >= 0, got {self.extent}") if self.size == 0: - n = 1 if self.extent == 0 else 0 + n = 0 else: n = ceildiv(self.extent, self.size) object.__setattr__(self, "nchunks", n) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index b9852d9767..8b008a4a9e 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -216,7 +216,7 @@ def test_negative_extent_rejected(self) -> None: def test_zero_size_allowed(self) -> None: d = FixedDimension(size=0, extent=0) assert d.size == 0 - assert d.nchunks == 1 # 0-size with 0-extent = 1 chunk + assert d.nchunks == 0 # zero-sized chunks can't hold data def test_chunk_offset_oob_raises(self) -> None: d = FixedDimension(size=10, extent=100) @@ -1314,11 +1314,13 @@ def test_exact_extent_fixed_dim_rectilinear_roundtrip(self) -> None: # -- Zero-size and zero-extent -- def test_zero_size_zero_extent(self) -> None: - """FixedDimension(size=0, extent=0) => 1 chunk of size 0.""" + """FixedDimension(size=0, extent=0) => 0 chunks (consistent with size=0, extent=5).""" d = FixedDimension(size=0, extent=0) - assert d.nchunks == 1 - assert d.chunk_size(0) == 0 - assert d.data_size(0) == 0 + assert d.nchunks == 0 + with pytest.raises(IndexError, match="out of bounds"): + d.chunk_size(0) + with pytest.raises(IndexError, match="out of bounds"): + d.data_size(0) def test_zero_size_nonzero_extent(self) -> None: """FixedDimension(size=0, extent=5) => 0 chunks (can't partition).""" From becd392e333ee1608c6e745df91884d7a2d4a3d6 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:48:05 -0400 Subject: [PATCH 083/118] Remove extraneous validation --- src/zarr/core/chunk_grids.py | 59 +++++++++++++++--------------- tests/test_unified_chunk_grid.py | 61 +++++++------------------------- 2 files changed, 42 insertions(+), 78 deletions(-) diff 
--git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 332b9a1d34..004850fdeb 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -71,31 +71,28 @@ def index_to_chunk(self, idx: int) -> int: return 0 return idx // self.size - # Bounds checking: all callers (ChunkGrid.__getitem__, indexers) validate - # chunk indices before calling these methods, so the checks here are - # redundant on the hot path. They are retained for safety when methods - # are called directly. If profiling shows this overhead matters, the - # checks can be removed — VaryingDimension gets natural IndexError from - # tuple indexing, and FixedDimension would silently return wrong values. - - def _check_chunk_ix(self, chunk_ix: int) -> None: - if chunk_ix < 0 or chunk_ix >= self.nchunks: - raise IndexError( - f"Chunk index {chunk_ix} out of bounds for dimension with {self.nchunks} chunks" - ) - def chunk_offset(self, chunk_ix: int) -> int: - self._check_chunk_ix(chunk_ix) + """Byte-aligned start position of chunk *chunk_ix* in array coordinates. + + Does not validate *chunk_ix* — callers must ensure it is in + ``[0, nchunks)``. Use ``ChunkGrid.__getitem__`` for safe access. + """ return chunk_ix * self.size def chunk_size(self, chunk_ix: int) -> int: - """Buffer size for codec processing — always uniform.""" - self._check_chunk_ix(chunk_ix) + """Buffer size for codec processing — always uniform. + + Does not validate *chunk_ix* — callers must ensure it is in + ``[0, nchunks)``. Use ``ChunkGrid.__getitem__`` for safe access. + """ return self.size def data_size(self, chunk_ix: int) -> int: - """Valid data region within the buffer — clipped at extent.""" - self._check_chunk_ix(chunk_ix) + """Valid data region within the buffer — clipped at extent. + + Does not validate *chunk_ix* — callers must ensure it is in + ``[0, nchunks)``. Use ``ChunkGrid.__getitem__`` for safe access. 
+ """ if self.size == 0: return 0 return max(0, min(self.size, self.extent - chunk_ix * self.size)) @@ -160,24 +157,28 @@ def index_to_chunk(self, idx: int) -> int: raise IndexError(f"Index {idx} out of bounds for dimension with extent {self.extent}") return bisect.bisect_right(self.cumulative, idx) - def _check_chunk_ix(self, chunk_ix: int) -> None: - if chunk_ix < 0 or chunk_ix >= len(self.edges): - raise IndexError( - f"Chunk index {chunk_ix} out of bounds for dimension with {len(self.edges)} grid cells" - ) - def chunk_offset(self, chunk_ix: int) -> int: - self._check_chunk_ix(chunk_ix) + """Start position of chunk *chunk_ix* in array coordinates. + + Does not validate *chunk_ix* — callers must ensure it is in + ``[0, ngridcells)``. Use ``ChunkGrid.__getitem__`` for safe access. + """ return self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 def chunk_size(self, chunk_ix: int) -> int: - """Buffer size for codec processing.""" - self._check_chunk_ix(chunk_ix) + """Buffer size for codec processing. + + Does not validate *chunk_ix* — callers must ensure it is in + ``[0, ngridcells)``. Use ``ChunkGrid.__getitem__`` for safe access. + """ return self.edges[chunk_ix] def data_size(self, chunk_ix: int) -> int: - """Valid data region within the buffer — clipped at extent.""" - self._check_chunk_ix(chunk_ix) + """Valid data region within the buffer — clipped at extent. + + Does not validate *chunk_ix* — callers must ensure it is in + ``[0, ngridcells)``. Use ``ChunkGrid.__getitem__`` for safe access. 
+ """ offset = self.cumulative[chunk_ix - 1] if chunk_ix > 0 else 0 return max(0, min(self.edges[chunk_ix], self.extent - offset)) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 8b008a4a9e..1b916a0fab 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -218,26 +218,9 @@ def test_zero_size_allowed(self) -> None: assert d.size == 0 assert d.nchunks == 0 # zero-sized chunks can't hold data - def test_chunk_offset_oob_raises(self) -> None: - d = FixedDimension(size=10, extent=100) - with pytest.raises(IndexError, match="out of bounds"): - d.chunk_offset(10) - with pytest.raises(IndexError, match="out of bounds"): - d.chunk_offset(-1) - - def test_chunk_size_oob_raises(self) -> None: - d = FixedDimension(size=10, extent=100) - with pytest.raises(IndexError, match="out of bounds"): - d.chunk_size(10) - with pytest.raises(IndexError, match="out of bounds"): - d.chunk_size(-1) - - def test_data_size_oob_raises(self) -> None: - d = FixedDimension(size=10, extent=100) - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(10) - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(-1) + # FixedDimension.chunk_offset/chunk_size/data_size do not bounds-check + # for performance (callers validate). OOB access is tested via + # ChunkGrid.__getitem__ which checks before delegating. 
# --------------------------------------------------------------------------- @@ -1216,23 +1199,8 @@ def test_fixed_dim_boundary_data_size(self) -> None: assert d.data_size(9) == 5 # 95 - 9*10 = 5 assert d.chunk_size(9) == 10 # codec buffer always full - def test_fixed_dim_data_size_out_of_bounds(self) -> None: - """data_size raises IndexError for out-of-bounds chunk indices.""" - d = FixedDimension(size=10, extent=100) - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(10) # exactly at boundary - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(11) # past boundary - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(999) - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(-1) - - def test_fixed_dim_data_size_boundary_oob(self) -> None: - """data_size raises IndexError past last chunk.""" - d = FixedDimension(size=10, extent=95) - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(10) # past nchunks=10 + # FixedDimension.data_size does not bounds-check for performance. + # OOB access is tested via ChunkGrid.__getitem__. 
def test_chunk_grid_boundary_getitem(self) -> None: """ChunkGrid with boundary FixedDimension via direct construction.""" @@ -1317,28 +1285,23 @@ def test_zero_size_zero_extent(self) -> None: """FixedDimension(size=0, extent=0) => 0 chunks (consistent with size=0, extent=5).""" d = FixedDimension(size=0, extent=0) assert d.nchunks == 0 - with pytest.raises(IndexError, match="out of bounds"): - d.chunk_size(0) - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(0) + # OOB access tested via ChunkGrid.__getitem__, not direct method calls + g = ChunkGrid(dimensions=(d,)) + assert g[0] is None def test_zero_size_nonzero_extent(self) -> None: """FixedDimension(size=0, extent=5) => 0 chunks (can't partition).""" d = FixedDimension(size=0, extent=5) assert d.nchunks == 0 - # No valid chunk index exists on a 0-chunk dimension - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(0) - with pytest.raises(IndexError, match="out of bounds"): - d.chunk_size(0) + g = ChunkGrid(dimensions=(d,)) + assert g[0] is None def test_zero_extent_nonzero_size(self) -> None: """FixedDimension(size=10, extent=0) => 0 chunks.""" d = FixedDimension(size=10, extent=0) assert d.nchunks == 0 - # No valid chunk index exists on a 0-chunk dimension - with pytest.raises(IndexError, match="out of bounds"): - d.data_size(0) + g = ChunkGrid(dimensions=(d,)) + assert g[0] is None # -- 0-d grid -- From 6764ba18c3b8d2e336fae0aa9d4d95b4c454a171 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:51:50 -0400 Subject: [PATCH 084/118] Improve tests --- tests/test_unified_chunk_grid.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 1b916a0fab..e20f850c51 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -502,11 +502,11 @@ def 
test_expand_rejects_negative_rle_count(self) -> None: class TestExpandRleHandlesJsonFloats: def test_bare_integer_floats_accepted(self) -> None: """JSON parsers may emit 10.0 for the integer 10; expand_rle should handle it.""" - result = expand_rle([10.0, 20.0]) # type: ignore[list-item] + result = expand_rle([10.0, 20.0]) assert result == [10, 20] def test_rle_pair_with_float_count(self) -> None: - result = expand_rle([[10, 3.0]]) # type: ignore[list-item] + result = expand_rle([[10, 3.0]]) assert result == [10, 10, 10] @@ -710,7 +710,7 @@ def test_bare_int_roundtrip(self) -> None: "name": "rectilinear", "configuration": {"kind": "inline", "chunk_shapes": [10, [20, 30]]}, } - meta = RectilinearChunkGrid.from_dict(data) # type: ignore[arg-type] + meta = RectilinearChunkGrid.from_dict(data) out = meta.to_dict() # Dim 0 was bare int — should stay bare int assert out["configuration"]["chunk_shapes"][0] == 10 @@ -729,7 +729,7 @@ def test_serialize_non_regular_as_regular_raises(self) -> None: def test_serialize_unknown_name_raises(self) -> None: g = ChunkGrid.from_regular((100,), (10,)) with pytest.raises(ValueError, match="Unknown chunk grid name for serialization"): - serialize_chunk_grid(g, "hexagonal") # type: ignore[arg-type] + serialize_chunk_grid(g, "hexagonal") def test_zero_extent_rectilinear_raises(self) -> None: """Zero-extent grids cannot be serialized as rectilinear (spec requires positive edges).""" @@ -1476,9 +1476,11 @@ def test_orthogonal_int_array_selection_rectilinear(self) -> None: assert chunk_coords == [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)] def test_orthogonal_bool_array_selection_rectilinear(self) -> None: - """Boolean array selection with rectilinear grid.""" + """Boolean array selection with rectilinear grid produces correct chunk projections.""" from zarr.core.indexing import OrthogonalIndexer + # chunks: dim0 = [10, 20, 30], dim1 = [50, 50] + # mask selects: idx 5 (chunk 0), idx 15 (chunk 1), idx 35 (chunk 2) g = 
ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) mask = np.zeros(60, dtype=bool) mask[5] = True @@ -1490,7 +1492,15 @@ def test_orthogonal_bool_array_selection_rectilinear(self) -> None: chunk_grid=g, ) projections = list(indexer) - assert len(projections) > 0 + # 3 chunks touched in dim0 x 2 chunks in dim1 = 6 projections + assert len(projections) == 6 + chunk_coords = [p.chunk_coords for p in projections] + assert (0, 0) in chunk_coords + assert (1, 0) in chunk_coords + assert (2, 0) in chunk_coords + assert (0, 1) in chunk_coords + assert (1, 1) in chunk_coords + assert (2, 1) in chunk_coords def test_orthogonal_advanced_indexing_produces_correct_projections(self) -> None: """Verify OrthogonalIndexer produces correct chunk projections From 9b3644879987247b4523ebbc6cd24b93cab70ff7 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:54:44 -0400 Subject: [PATCH 085/118] Improve docstrings --- src/zarr/core/chunk_grids.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 004850fdeb..11823ff838 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -108,11 +108,20 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int return indices // self.size def with_extent(self, new_extent: int) -> FixedDimension: - """Return a copy re-bound to *new_extent*.""" + """Re-bind to *new_extent* without modifying edges. + + Used when constructing a grid from existing metadata where edges + are already correct (e.g. ``parse_chunk_grid``). Raises on + ``VaryingDimension`` if edges don't cover the new extent. 
+ """ return FixedDimension(size=self.size, extent=new_extent) def resize(self, new_extent: int) -> FixedDimension: - """Return a copy adjusted for a new array extent (same as with_extent for fixed).""" + """Adapt for a user-initiated array resize, growing edges if needed. + + For ``FixedDimension`` this is identical to ``with_extent`` since + regular grids don't store explicit edges. + """ return FixedDimension(size=self.size, extent=new_extent) @@ -195,7 +204,12 @@ def indices_to_chunks(self, indices: npt.NDArray[np.intp]) -> npt.NDArray[np.int return np.searchsorted(self.cumulative, indices, side="right") def with_extent(self, new_extent: int) -> VaryingDimension: - """Return a copy re-bound to *new_extent*, validating edge coverage.""" + """Re-bind to *new_extent* without modifying edges. + + Used when constructing a grid from existing metadata where edges + are already correct (e.g. ``parse_chunk_grid``). Raises if the + existing edges don't cover *new_extent*. + """ edge_sum = self.cumulative[-1] if edge_sum < new_extent: raise ValueError( @@ -204,11 +218,12 @@ def with_extent(self, new_extent: int) -> VaryingDimension: return VaryingDimension(self.edges, extent=new_extent) def resize(self, new_extent: int) -> VaryingDimension: - """Return a copy adjusted for a new array extent (grow/shrink). + """Adapt for a user-initiated array resize, growing edges if needed. - Grow past existing edges: appends a chunk for the additional extent. - Shrink or grow within existing edges: preserves all edges and re-binds - the extent. The spec allows trailing edges beyond the array extent. + Unlike ``with_extent``, this never fails — if *new_extent* exceeds + the current edge sum, a new chunk is appended to cover the gap. + Shrinking preserves all edges (the spec allows trailing edges + beyond the array extent). 
""" if new_extent == self.extent: return self From 11a47ff1501deff28481aeee13a3c268abeac13a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:14:47 -0400 Subject: [PATCH 086/118] Update design doc --- docs/design/chunk-grid.md | 89 ++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index eca8025fcb..ab4ffc3898 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -1,6 +1,6 @@ # Unified Chunk Grid -Version: 5 +Version: 6 **Related:** @@ -35,7 +35,7 @@ A registry-based plugin system adds complexity without clear benefit. 4. **Design for future iteration.** The internal architecture should allow refactoring (e.g., metadata/array separation, new dimension types) without breaking the public API. 5. **Minimize downstream changes.** xarray, VirtualiZarr, Icechunk, Cubed, etc. should need minimal updates. 6. **Minimize time to stable release.** Ship behind a feature flag, stabilize through real-world usage, promote to stable API. -7. **The new API should be useful.** `chunk_sizes`, `ChunkGrid.__getitem__`, `is_regular` — these should solve real problems, not just expose internals. +7. **The new API should be useful.** `read_chunk_sizes`/`write_chunk_sizes`, `ChunkGrid.__getitem__`, `is_regular` — these should solve real problems, not just expose internals. 8. **Extensible for other serialization structures.** The per-dimension design should support future encodings (tile, temporal) without changes to indexing or codecs. 
## Design @@ -64,7 +64,7 @@ class FixedDimension: @property def nchunks(self) -> int: if self.size == 0: - return 1 if self.extent == 0 else 0 + return 0 return ceildiv(self.extent, self.size) def index_to_chunk(self, idx: int) -> int: @@ -101,6 +101,13 @@ class VaryingDimension: @property def nchunks(self) -> int: + # number of chunks that overlap [0, extent) + if extent == 0: + return 0 + return bisect.bisect_left(self.cumulative, extent) + 1 + + @property + def ngridcells(self) -> int: return len(self.edges) def index_to_chunk(self, idx: int) -> int: @@ -121,8 +128,8 @@ class VaryingDimension: # validates cumulative[-1] >= new_extent (O(1)), re-binds extent return VaryingDimension(self.edges, extent=new_extent) def resize(self, new_extent: int) -> VaryingDimension: - # grow: append chunk of size (new_extent - old_extent) - # shrink: drop trailing chunks, keep those up to new_extent + # grow past edge sum: append chunk of size (new_extent - sum(edges)) + # shrink or grow within edge sum: preserve all edges, re-bind extent ``` Both types implement the `DimensionGrid` protocol: `nchunks`, `extent`, `index_to_chunk`, `chunk_offset`, `chunk_size`, `data_size`, `indices_to_chunks`, `unique_edge_lengths`, `with_extent`, `resize`. Memory usage scales with the number of *varying* dimensions, not total chunks. @@ -148,20 +155,24 @@ class DimensionGrid(Protocol): @property def nchunks(self) -> int: ... @property - def extent(self) -> int: ... + def ngridcells(self) -> int: ... @property - def unique_edge_lengths(self) -> Iterable[int]: ... + def extent(self) -> int: ... def index_to_chunk(self, idx: int) -> int: ... def chunk_offset(self, chunk_ix: int) -> int: ... # raises IndexError if OOB def chunk_size(self, chunk_ix: int) -> int: ... # raises IndexError if OOB def data_size(self, chunk_ix: int) -> int: ... # raises IndexError if OOB def indices_to_chunks(self, indices: NDArray[np.intp]) -> NDArray[np.intp]: ... 
+ @property + def unique_edge_lengths(self) -> Iterable[int]: ... def with_extent(self, new_extent: int) -> DimensionGrid: ... def resize(self, new_extent: int) -> DimensionGrid: ... ``` The protocol is `@runtime_checkable`, enabling polymorphic handling of both dimension types without `isinstance` checks. +`nchunks` and `ngridcells` differ when `extent < sum(edges)`: `nchunks` counts only chunks that overlap `[0, extent)`, while `ngridcells` counts total defined grid cells (i.e., `len(edges)`). For `FixedDimension`, both are equal. For `VaryingDimension`, they differ after a resize that shrinks the extent below the edge sum. + ### ChunkSpec ```python @@ -189,8 +200,8 @@ arr = zarr.create_array(shape=(100, 200), chunks=(10, 20)) arr = zarr.create_array(shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]]) # rectilinear # ChunkGrid as a collection -grid = arr.metadata.chunk_grid # ChunkGrid instance -grid.shape # (10, 10) — number of chunks per dimension +grid = arr.chunk_grid # behavioral ChunkGrid (bound to array shape) +grid.grid_shape # (10, 10) — number of chunks per dimension grid.ndim # 2 grid.is_regular # True if all dimensions are Fixed @@ -211,8 +222,8 @@ for spec in grid: # iterate all chunks # .chunks property: retained for regular grids, raises NotImplementedError for rectilinear arr.chunks # (10, 20) -# .chunk_sizes property: works for all grids (dask-style) -arr.chunk_sizes # ((10, 10, ..., 10), (20, 20, ..., 20)) +# .read_chunk_sizes / .write_chunk_sizes: works for all grids (dask-style) +arr.write_chunk_sizes # ((10, 10, ..., 10), (20, 20, ..., 20)) ``` `ChunkGrid.__getitem__` constructs `ChunkSpec` using `chunk_size` for `codec_shape` and `data_size` for `slices`: @@ -265,7 +276,7 @@ When `extent < sum(edges)`, the dimension is always stored as `VaryingDimension` Both names deserialize to the same `ChunkGrid` class. 
The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. -**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`, which stores the chunk grid's JSON `name` in the `chunk_grid_name` field. `serialize_chunk_grid(grid, name)` is called by `ArrayV3Metadata.to_dict()`. This gives round-trip fidelity — a store written as rectilinear with uniform edges stays rectilinear. +**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`. The name is inferred from the chunk grid metadata DTO type (`RegularChunkGrid` → `"regular"`, `RectilinearChunkGrid` → `"rectilinear"`) or from `grid.is_regular` when a behavioral `ChunkGrid` is passed directly. For `create_array`, the format is inferred from the `chunks` argument: a flat tuple produces `"regular"`, a nested list produces `"rectilinear"`. The `_is_rectilinear_chunks()` helper detects nested sequences like `[[10, 20], [5, 5]]`. @@ -284,35 +295,40 @@ RLE compression is used when serializing: runs of identical sizes become `[value # _expand_rle([[10, 3], 5]) -> [10, 10, 10, 5] ``` -For `FixedDimension` serialized as rectilinear, `_serialize_fixed_dim()` produces a compact representation: bare integer when evenly divisible, `[size, last_data]` for two chunks, `[[size, n-1], last_data]` for more. +For `FixedDimension` serialized as rectilinear, `_serialize_fixed_dim()` returns the bare integer `dim.size`. Per the rectilinear spec, a bare integer is repeated until the sum >= extent, preserving the full codec buffer size for boundary chunks. **Zero-extent handling:** Regular grids serialize zero-extent dimensions without issue (the format encodes only `chunk_shape`, no edges). 
Rectilinear grids reject zero-extent dimensions because the spec requires at least one positive-integer edge length per axis. This asymmetry is intentional and spec-compliant — documented in `serialize_chunk_grid()`. -#### chunk_sizes +#### read_chunk_sizes / write_chunk_sizes + +The `read_chunk_sizes` and `write_chunk_sizes` properties provide universal access to per-dimension chunk data sizes, matching the dask `Array.chunks` convention. They work for both regular and rectilinear grids: -The `chunk_sizes` property provides universal access to per-dimension chunk data sizes, matching the dask `Array.chunks` convention. It works for both regular and rectilinear grids: +- `write_chunk_sizes`: always returns outer (storage) chunk sizes +- `read_chunk_sizes`: returns inner chunk sizes when sharding is used, otherwise same as `write_chunk_sizes` ```python >>> arr = zarr.create_array(store, shape=(100, 80), chunks=(30, 40)) ->>> arr.chunk_sizes +>>> arr.write_chunk_sizes ((30, 30, 30, 10), (40, 40)) >>> arr = zarr.create_array(store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]]) ->>> arr.chunk_sizes +>>> arr.write_chunk_sizes ((10, 20, 30), (50, 50)) ``` +The underlying `ChunkGrid.chunk_sizes` property (on the grid, not the array) returns the same as `write_chunk_sizes`. 
+ #### Resize ```python arr.resize((80, 100)) # re-binds extent; FixedDimension stays fixed arr.resize((200, 100)) # VaryingDimension grows by appending a new chunk -arr.resize((30, 100)) # VaryingDimension shrinks by dropping trailing chunks +arr.resize((30, 100)) # VaryingDimension shrinks: preserves all edges, re-binds extent ``` Resize uses `ChunkGrid.update_shape(new_shape)`, which delegates to each dimension's `.resize()` method: - `FixedDimension.resize()`: simply re-binds the extent (identical to `with_extent`) -- `VaryingDimension.resize()`: grow appends a chunk of size `new_extent - old_extent`; shrink drops trailing chunks whose cumulative offset lies beyond the new extent +- `VaryingDimension.resize()`: grow past `sum(edges)` appends a chunk covering the gap; shrink or grow within `sum(edges)` preserves all edges and re-binds the extent (the spec allows trailing edges beyond the array extent) **Known limitation (deferred):** When growing a `VaryingDimension`, the current implementation always appends a single chunk covering the new region. For example, `[10, 10, 10]` resized from 30 to 45 produces `[10, 10, 10, 15]` instead of the more natural `[10, 10, 10, 10, 10]`. A future improvement should add an optional `chunks` parameter to `resize()` that controls how the new region is partitioned, with a sane default (e.g., repeating the last chunk size). 
This is safely deferrable because: - `FixedDimension` already handles resize correctly (regular grids stay regular) @@ -332,12 +348,12 @@ The `from_array()` function handles both regular and rectilinear source arrays: ```python src = zarr.create_array(store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]]) new = zarr.from_array(data=src, store=new_store, chunks="keep") -# Preserves rectilinear structure: new.chunk_sizes == ((10, 20, 30), (50, 50)) +# Preserves rectilinear structure: new.write_chunk_sizes == ((10, 20, 30), (50, 50)) ``` -When `chunks="keep"`, the logic checks `data.metadata.chunk_grid.is_regular`: +When `chunks="keep"`, the logic checks `data.chunk_grid.is_regular`: - Regular: extracts `data.chunks` (flat tuple) and preserves shards -- Rectilinear: extracts `data.chunk_sizes` (nested tuples) and forces shards to None +- Rectilinear: extracts `data.write_chunk_sizes` (nested tuples) and forces shards to None ### Indexing @@ -393,7 +409,7 @@ Level 3 — Shard index: ceil(shard_dim / subchunk_dim) entries per dimension | `ChunkGrid` ABC + `RegularChunkGrid` subclass | Single concrete `ChunkGrid` with `is_regular` | | `RectilinearChunkGrid` (#3534) | Same `ChunkGrid` class | | Chunk grid registry + entrypoints (#3735) | Direct name dispatch | -| `arr.chunks` | Retained for regular; `arr.chunk_sizes` for general use | +| `arr.chunks` | Retained for regular; `arr.read_chunk_sizes`/`arr.write_chunk_sizes` for general use | | `get_chunk_shape(shape, coord)` | `grid[coord].codec_shape` or `grid[coord].shape` | ## Design decisions @@ -441,8 +457,8 @@ Note: the *dimension* types (`FixedDimension`, `VaryingDimension`) do use a `Dim The resolution: - `.chunks` is retained for regular grids (returns `tuple[int, ...]` as before) -- `.chunks` raises `NotImplementedError` for rectilinear grids with a message pointing to `.chunk_sizes` -- `.chunk_sizes` returns `tuple[tuple[int, ...], ...]` (dask convention) for all grids +- `.chunks` raises `NotImplementedError` for 
rectilinear grids with a message pointing to `.read_chunk_sizes`/`.write_chunk_sizes` +- `.read_chunk_sizes` and `.write_chunk_sizes` return `tuple[tuple[int, ...], ...]` (dask convention) for all grids @maxrjones noted in review that deprecating `.chunks` for regular grids was not desirable. The current branch does not deprecate it. @@ -450,7 +466,7 @@ The resolution: @d-v-b raised in #3534 that users need a way to say "these chunks are regular, but serialize as rectilinear" (e.g., to allow future append/extend workflows without format changes). @jhamman initially made nested-list input always produce `RectilinearChunkGrid`. -The current branch resolves this via `chunk_grid_name: ChunkGridName` on `ArrayV3Metadata`. The name is stored internally for round-trip fidelity and is not part of the Zarr spec metadata. Current inference behavior: +The current branch resolves this via `_infer_chunk_grid_name()`, which extracts or infers the serialization name from the chunk grid input. When metadata is deserialized, the original name (from `{"name": "regular"}` or `{"name": "rectilinear"}`) flows through to `serialize_chunk_grid()` at write time. When a `ChunkGrid` is passed directly, the name is inferred from `grid.is_regular`. Current inference behavior: - `chunks=(10, 20)` (flat tuple) → infers `"regular"` - `chunks=[[10, 20], [5, 5]]` (nested lists with varying sizes) → infers `"rectilinear"` - `chunks=[[10, 10], [20, 20]]` (nested lists with uniform sizes) → `from_rectilinear` collapses to `FixedDimension`, so `is_regular=True` and infers `"regular"` @@ -471,15 +487,18 @@ A `TiledDimension` prototype was built ([commit 9c0f582](https://github.com/maxr 2. **The per-dimension architecture doesn't preclude it.** A future `TiledDimension` can implement the `DimensionGrid` protocol alongside `FixedDimension` and `VaryingDimension` with no changes to indexing, codecs, or the `ChunkGrid` class. 3. 
**RLE covers the MVP.** Most real-world variable chunk patterns (HPC boundaries, irregular partitions) are efficiently encoded with RLE. Tile encoding is an optimization for a specific (temporal) subset. -### Deferred: Metadata / Array separation +### Metadata / Array separation (partially implemented) -An earlier design doc proposed decoupling `ChunkGrid` (behavioral) from `ArrayV3Metadata` (data), so that metadata would store only a plain dict and the array layer would construct the `ChunkGrid`. This was deferred because: +An earlier design doc proposed decoupling `ChunkGrid` (behavioral) from `ArrayV3Metadata` (data), so that metadata would store only a plain dict and the array layer would construct the `ChunkGrid`. -1. **Scope.** The unified chunk grid is already a large change spanning chunk grids, indexing, codecs, metadata, and the array API. Adding a metadata refactor would increase the review surface and risk without a concrete payoff for this PR. -2. **No blocking issue.** The current coupling — `ArrayV3Metadata` stores a `ChunkGrid` and calls `serialize_chunk_grid()` / `parse_chunk_grid()` — works correctly. The grid is constructed once from metadata + `shape` and round-trips cleanly. -3. **Independent concern.** Separating metadata DTOs from behavioral objects is a general architectural goal that applies beyond chunk grids (e.g., codec pipelines). It's better addressed holistically than piecemeal. +The current implementation partially realizes this separation: -The current design stores `chunk_grid: ChunkGrid` and `chunk_grid_name: str` on `ArrayV3Metadata`. The name controls serialization format; the grid handles all behavioral queries. If a future refactor makes metadata a pure DTO, the `ChunkGrid` construction would move to the array layer and `parse_chunk_grid()` already provides the right entry point. +- **Metadata DTOs** (`RegularChunkGrid`, `RectilinearChunkGrid` in `metadata/v3.py`): Pure data, frozen dataclasses, no array shape. 
These live on `ArrayV3Metadata.chunk_grid` and represent only what goes into `zarr.json`. +- **Behavioral `ChunkGrid`** (`chunk_grids.py`): Shape-bound, supports indexing, iteration, and chunk specs. Lives on `AsyncArray.chunk_grid`, constructed from metadata + `shape` via `ChunkGrid.from_metadata()`. + +This means `ArrayV3Metadata.chunk_grid` is now a `ChunkGridMetadata` (the DTO union type), **not** the behavioral `ChunkGrid`. Code that previously accessed behavioral methods on `metadata.chunk_grid` (e.g., `all_chunk_coords()`, `__getitem__`) must now use the behavioral grid from the array layer instead. + +The name controls serialization format; `serialize_chunk_grid()` is called by `ArrayV3Metadata.to_dict()`. The behavioral grid handles all runtime queries. ## Prior art @@ -505,6 +524,8 @@ isinstance(grid, RegularChunkGrid) # True for any regular ChunkGrid The shim uses `chunk_shape` as extent (matching the old shape-unaware behavior). The deprecation warning directs users to `ChunkGrid.from_regular()`. +**Known limitation:** Because the shim binds `extent=chunk_shape`, `RegularChunkGrid(chunk_shape=(100,)).get_nchunks()` returns `1` (one chunk of size 100 in a dimension of extent 100). This is intentional — the old `RegularChunkGrid` was shape-unaware, and the shim preserves that by using the chunk shape as a stand-in extent. Code that relied on constructing a `RegularChunkGrid` and later querying `nchunks` without binding an array shape must migrate to `ChunkGrid.from_regular(array_shape, chunk_shape)`. + ### Downstream migration | Two-class pattern | Unified pattern | @@ -551,7 +572,7 @@ If the design is accepted, the POC branch can be split into 5 incremental PRs. 
P - `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `__iter__`, `all_chunk_coords`, `is_regular`, `chunk_shape`, `chunk_sizes`, `unique_edge_lengths` - `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()` - `RegularChunkGrid` deprecation shim -- `chunk_grid_name: ChunkGridName` on `ArrayV3Metadata` +- `_infer_chunk_grid_name()` for serialization format inference - Feature flag (`array.rectilinear_chunks`) **PR 3: Indexing generalization** @@ -562,7 +583,7 @@ If the design is accepted, the POC branch can be split into 5 incremental PRs. P - Wire `ChunkGrid` into `create_array` / `init_array` - `get_chunk_spec()` → `grid[chunk_coords].codec_shape` - Sharding validation via `dim.unique_edge_lengths` -- `arr.chunk_sizes`, `from_array` with `chunks="keep"`, resize support +- `arr.read_chunk_sizes`, `arr.write_chunk_sizes`, `from_array` with `chunks="keep"`, resize support - Hypothesis strategies for rectilinear grids **PR 5: End-to-end tests + docs** From 4d7c72423a8bd482c98e924ec3668cb70a0787e8 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:17:24 -0400 Subject: [PATCH 087/118] Update docs --- docs/user-guide/arrays.md | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index 7bc965665b..e230d7f962 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -659,12 +659,13 @@ print(z[5:25, 0:5]) ### Inspecting chunk sizes -The `.chunk_sizes` property returns the actual data size of each chunk along -every dimension. It works for both regular and rectilinear arrays and returns -a tuple of tuples: +The `.write_chunk_sizes` property returns the actual data size of each storage +chunk along every dimension. It works for both regular and rectilinear arrays +and returns a tuple of tuples (matching the dask `Array.chunks` convention). 
+When sharding is used, `.read_chunk_sizes` returns the inner chunk sizes instead: ```python exec="true" session="arrays" source="above" result="ansi" -print(z.chunk_sizes) +print(z.write_chunk_sizes) ``` For regular arrays, this includes the boundary chunk: @@ -676,16 +677,18 @@ z_regular = zarr.create_array( chunks=(30, 40), dtype='int32', ) -print(z_regular.chunk_sizes) +print(z_regular.write_chunk_sizes) ``` Note that the `.chunks` property is only available for regular chunk grids. For -rectilinear arrays, use `.chunk_sizes` instead. +rectilinear arrays, use `.write_chunk_sizes` (or `.read_chunk_sizes`) instead. ### Resizing and appending -Rectilinear arrays can be resized. When growing, a new chunk is appended with -the size of the added region. When shrinking, trailing chunks are dropped: +Rectilinear arrays can be resized. When growing past the current edge sum, a +new chunk is appended covering the additional extent. When shrinking, the chunk +edges are preserved and the extent is re-bound (chunks beyond the new extent +simply become inactive): ```python exec="true" session="arrays" source="above" result="ansi" z = zarr.create_array( @@ -695,16 +698,16 @@ z = zarr.create_array( dtype='float64', ) z[:] = np.arange(30, dtype='float64') -print(f"Before resize: chunk_sizes={z.chunk_sizes}") +print(f"Before resize: chunk_sizes={z.write_chunk_sizes}") z.resize((50,)) -print(f"After resize: chunk_sizes={z.chunk_sizes}") +print(f"After resize: chunk_sizes={z.write_chunk_sizes}") ``` The `append` method also works with rectilinear arrays: ```python exec="true" session="arrays" source="above" result="ansi" z.append(np.arange(10, dtype='float64')) -print(f"After append: shape={z.shape}, chunk_sizes={z.chunk_sizes}") +print(f"After append: shape={z.shape}, chunk_sizes={z.write_chunk_sizes}") ``` ### Compressors and filters From 826e030e8ed659a493d489bff3cdd67fda9cb8e6 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 
Mar 2026 23:20:33 -0400 Subject: [PATCH 088/118] DRY --- src/zarr/core/chunk_grids.py | 21 ++------------------- src/zarr/core/common.py | 17 +++++++++++++++++ src/zarr/core/metadata/v3.py | 17 ++--------------- 3 files changed, 21 insertions(+), 34 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 11823ff838..e365ed518e 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -25,6 +25,7 @@ parse_named_configuration, parse_shapelike, validate_rectilinear_edges, + validate_rectilinear_kind, ) from zarr.errors import ZarrUserWarning @@ -318,24 +319,6 @@ def _serialize_varying_dim(dim: VaryingDimension) -> RectilinearDimSpec: return cast("RectilinearDimSpec", edges) -def _validate_rectilinear_kind(configuration: dict[str, JSON]) -> None: - """Validate the ``kind`` field of a rectilinear chunk grid configuration. - - The spec requires ``kind: "inline"``. - """ - kind = configuration.get("kind") - if kind is None: - raise ValueError( - "Rectilinear chunk grid configuration requires a 'kind' field. " - "Only 'inline' is currently supported." - ) - if kind != "inline": - raise ValueError( - f"Unsupported rectilinear chunk grid kind: {kind!r}. " - f"Only 'inline' is currently supported." - ) - - def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[int]: """Decode a single dimension's chunk edge specification per the rectilinear spec. 
@@ -711,7 +694,7 @@ def parse_chunk_grid( return ChunkGrid.from_regular(array_shape, cast("Sequence[int]", chunk_shape_raw)) if name_parsed == "rectilinear": - _validate_rectilinear_kind(configuration_parsed) + validate_rectilinear_kind(cast("str | None", configuration_parsed.get("kind"))) chunk_shapes_raw = configuration_parsed.get("chunk_shapes") if chunk_shapes_raw is None: raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 97bf5cb097..98fca1faf7 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -305,6 +305,23 @@ def compress_rle(sizes: Sequence[int]) -> list[int | list[int]]: return result +def validate_rectilinear_kind(kind: str | None) -> None: + """Validate the ``kind`` field of a rectilinear chunk grid configuration. + + The rectilinear spec requires ``kind: "inline"``. + """ + if kind is None: + raise ValueError( + "Rectilinear chunk grid configuration requires a 'kind' field. " + "Only 'inline' is currently supported." + ) + if kind != "inline": + raise ValueError( + f"Unsupported rectilinear chunk grid kind: {kind!r}. " + "Only 'inline' is currently supported." 
+ ) + + def validate_rectilinear_edges( chunk_shapes: Sequence[Sequence[int]], array_shape: Sequence[int] ) -> None: diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 214867fcb5..e5afe96e9f 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -25,6 +25,7 @@ parse_named_configuration, parse_shapelike, validate_rectilinear_edges, + validate_rectilinear_kind, ) from zarr.core.config import config from zarr.core.dtype import VariableLengthUTF8, ZDType, get_data_type_from_json @@ -212,20 +213,6 @@ def _parse_chunk_shape(chunk_shape: Iterable[int]) -> tuple[int, ...]: return as_tup -def _validate_rectilinear_kind(kind: str | None) -> None: - """The rectilinear spec requires ``kind: "inline"``.""" - if kind is None: - raise ValueError( - "Rectilinear chunk grid configuration requires a 'kind' field. " - "Only 'inline' is currently supported." - ) - if kind != "inline": - raise ValueError( - f"Unsupported rectilinear chunk grid kind: {kind!r}. " - "Only 'inline' is currently supported." 
- ) - - def _validate_chunk_shapes( chunk_shapes: Sequence[Sequence[int]], ) -> tuple[tuple[int, ...], ...]: @@ -342,7 +329,7 @@ def update_shape( def from_dict(cls, data: RectilinearChunkGridJSON) -> Self: # type: ignore[override] parse_named_configuration(data, "rectilinear") # validate name configuration = data["configuration"] - _validate_rectilinear_kind(configuration.get("kind")) + validate_rectilinear_kind(configuration.get("kind")) raw_shapes = configuration["chunk_shapes"] expanded: list[tuple[int, ...]] = [] for dim_spec in raw_shapes: From 6f51e1ce70d7e7d9a840005ae5611065fa7cd336 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:23:51 -0400 Subject: [PATCH 089/118] Add test --- tests/test_unified_chunk_grid.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index e20f850c51..9ef6103a83 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -1723,6 +1723,23 @@ def test_set_block_selection_2d(self, tmp_path: Path) -> None: a[0:30, 25:75] = val np.testing.assert_array_equal(z[:], a) + def test_block_selection_slice_stop_at_nchunks(self, tmp_path: Path) -> None: + """Block slice with stop == nchunks exercises the dim_len fallback + in BlockIndexer (``chunk_offset(stop) if stop < nchunks else dim_len``). 
+ """ + z, a = self._make_1d(tmp_path) + # nchunks == 3; stop=3 hits the `else dim_len` path + np.testing.assert_array_equal(z.blocks[1:3], a[5:30]) + # stop > nchunks should also produce the full remainder + np.testing.assert_array_equal(z.blocks[0:10], a[:]) + + def test_block_selection_slice_stop_at_nchunks_2d(self, tmp_path: Path) -> None: + """Same fallback test for 2D rectilinear arrays.""" + z, a = self._make_2d(tmp_path) + # dim0 nchunks=3, dim1 nchunks=4 + np.testing.assert_array_equal(z.blocks[2:3, 3:4], a[30:60, 75:100]) + np.testing.assert_array_equal(z.blocks[0:99, 0:99], a[:, :]) + # --- Set coordinate selection --- def test_set_coordinate_selection_1d(self, tmp_path: Path) -> None: From 879f20f73e386b249046c78c95d0b5cf341af450 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:25:18 -0400 Subject: [PATCH 090/118] Simplify --- src/zarr/core/indexing.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 57ccdf13b2..72b19ef0b2 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -340,11 +340,6 @@ def get_chunk_shape(chunk_grid: ChunkGrid) -> tuple[int, ...]: return chunk_grid.chunk_shape -def _get_dim_grids(chunk_grid: ChunkGrid) -> tuple[DimensionGrid, ...]: - """Extract per-dimension grid objects from a ChunkGrid.""" - return chunk_grid.dimensions - - def normalize_integer_selection(dim_sel: int, dim_len: int) -> int: # normalize type to int dim_sel = int(dim_sel) @@ -595,7 +590,7 @@ def __init__( shape: tuple[int, ...], chunk_grid: ChunkGrid, ) -> None: - dim_grids = _get_dim_grids(chunk_grid) + dim_grids = chunk_grid.dimensions # handle ellipsis selection_normalized = replace_ellipsis(selection, shape) @@ -922,7 +917,7 @@ class OrthogonalIndexer(Indexer): drop_axes: tuple[int, ...] 
def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: ChunkGrid) -> None: - dim_grids = _get_dim_grids(chunk_grid) + dim_grids = chunk_grid.dimensions # handle ellipsis selection = replace_ellipsis(selection, shape) @@ -1060,7 +1055,7 @@ class BlockIndexer(Indexer): def __init__( self, selection: BasicSelection, shape: tuple[int, ...], chunk_grid: ChunkGrid ) -> None: - dim_grids = _get_dim_grids(chunk_grid) + dim_grids = chunk_grid.dimensions # handle ellipsis selection_normalized = replace_ellipsis(selection, shape) @@ -1190,7 +1185,7 @@ class CoordinateIndexer(Indexer): def __init__( self, selection: CoordinateSelection, shape: tuple[int, ...], chunk_grid: ChunkGrid ) -> None: - dim_grids = _get_dim_grids(chunk_grid) + dim_grids = chunk_grid.dimensions cdata_shape: tuple[int, ...] if shape == (): From edbdb5d9adfd800bea4e549a1242cc2cbab3e439 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:56:58 -0400 Subject: [PATCH 091/118] Consistent .chunks and .shards --- src/zarr/core/metadata/v3.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index e5afe96e9f..27ea325f6e 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -484,7 +484,7 @@ def chunks(self) -> tuple[int, ...]: if not isinstance(self.chunk_grid, RegularChunkGrid): msg = ( "The `chunks` attribute is only defined for arrays using regular chunk grids. " - "This array has a rectilinear chunk grid. Use `read_chunk_sizes` or `write_chunk_sizes` for general access." + "This array has a rectilinear chunk grid. Use `read_chunk_sizes` for general access." ) raise NotImplementedError(msg) @@ -496,12 +496,15 @@ def chunks(self) -> tuple[int, ...]: @property def shards(self) -> tuple[int, ...] 
| None: - if not isinstance(self.chunk_grid, RegularChunkGrid): - return None - from zarr.codecs.sharding import ShardingCodec if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + if not isinstance(self.chunk_grid, RegularChunkGrid): + msg = ( + "The `shards` attribute is only defined for arrays using regular chunk grids. " + "This array has a rectilinear chunk grid. Use `write_chunk_sizes` for general access." + ) + raise NotImplementedError(msg) return self.chunk_grid.chunk_shape return None From 4a940b1cb30fb54f42453aeba967118e7426f351 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 21 Mar 2026 00:10:37 -0400 Subject: [PATCH 092/118] Remove separators --- src/zarr/core/chunk_grids.py | 29 ------- src/zarr/core/metadata/v3.py | 4 - tests/test_unified_chunk_grid.py | 127 +------------------------------ 3 files changed, 4 insertions(+), 156 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index e365ed518e..d095c430bf 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -36,11 +36,6 @@ from zarr.core.metadata import ArrayMetadata -# --------------------------------------------------------------------------- -# Per-dimension grid types -# --------------------------------------------------------------------------- - - @dataclass(frozen=True) class FixedDimension: """Uniform chunk size. Boundary chunks contain less data but are @@ -256,11 +251,6 @@ def with_extent(self, new_extent: int) -> DimensionGrid: ... def resize(self, new_extent: int) -> DimensionGrid: ... -# --------------------------------------------------------------------------- -# ChunkSpec -# --------------------------------------------------------------------------- - - @dataclass(frozen=True) class ChunkSpec: """Specification of a single chunk's location and size. 
@@ -285,11 +275,6 @@ def is_boundary(self) -> bool: return self.shape != self.codec_shape -# --------------------------------------------------------------------------- -# RLE helpers (ported from #3534) -# --------------------------------------------------------------------------- - - # A single dimension's rectilinear chunk spec: bare int (uniform shorthand), # list of ints (explicit edges), or mixed RLE (e.g. [[10, 3], 5]). RectilinearDimSpec = int | list[int | list[int]] @@ -352,10 +337,6 @@ def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[in raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") -# --------------------------------------------------------------------------- -# Unified ChunkGrid -# --------------------------------------------------------------------------- - # Type alias for what users can pass as chunks to create_array ChunksLike = tuple[int, ...] | list[list[int] | int] | int @@ -771,11 +752,6 @@ def _infer_chunk_grid_name( return "regular" if grid.is_regular else "rectilinear" -# --------------------------------------------------------------------------- -# Chunk guessing / normalization (unchanged) -# --------------------------------------------------------------------------- - - def _guess_chunks( shape: tuple[int, ...] | int, typesize: int, @@ -999,11 +975,6 @@ def _auto_partition( return _shards_out, _chunks_out -# --------------------------------------------------------------------------- -# Backwards-compatibility shim for RegularChunkGrid -# --------------------------------------------------------------------------- - - class _RegularChunkGridMeta(type): """Metaclass that makes ``isinstance(obj, RegularChunkGrid)`` work. 
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 27ea325f6e..6436b9d6a3 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -171,10 +171,6 @@ def parse_extra_fields( return dict(data) -# --------------------------------------------------------------------------- -# Chunk grid metadata types (pure DTOs — no array shape, no behavioral logic) -# --------------------------------------------------------------------------- - # JSON type for a single dimension's rectilinear spec: # bare int (uniform shorthand), or list of ints / [value, count] RLE pairs. RectilinearDimSpecJSON = int | list[int | list[int]] diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 9ef6103a83..20129f4af5 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -53,11 +53,6 @@ def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: raise TypeError(f"Unexpected dimension type: {type(d)}") -# --------------------------------------------------------------------------- -# Index to chunk -# --------------------------------------------------------------------------- - - class TestVaryingDimensionIndexToChunkBounds: def test_index_at_extent_raises(self) -> None: """index_to_chunk(extent) should raise since extent is out of bounds.""" @@ -92,11 +87,6 @@ def test_last_valid_index_works(self) -> None: assert dim.index_to_chunk(94) == 9 -# --------------------------------------------------------------------------- -# Feature flag gating -# --------------------------------------------------------------------------- - - class TestRectilinearFeatureFlag: """Test that rectilinear chunks are gated behind the config flag.""" @@ -132,11 +122,6 @@ def test_parse_chunk_grid_blocked(self) -> None: ) -# --------------------------------------------------------------------------- -# RegularChunkGrid backwards compatibility -# 
--------------------------------------------------------------------------- - - class TestRegularChunkGridCompat: """The deprecated RegularChunkGrid shim should work for common patterns.""" @@ -168,11 +153,6 @@ def test_isinstance_false_for_unrelated_types(self) -> None: assert not isinstance(42, RegularChunkGrid) -# --------------------------------------------------------------------------- -# FixedDimension -# --------------------------------------------------------------------------- - - class TestFixedDimension: def test_basic(self) -> None: d = FixedDimension(size=10, extent=100) @@ -223,11 +203,6 @@ def test_zero_size_allowed(self) -> None: # ChunkGrid.__getitem__ which checks before delegating. -# --------------------------------------------------------------------------- -# VaryingDimension -# --------------------------------------------------------------------------- - - class TestVaryingDimension: def test_basic(self) -> None: d = VaryingDimension([10, 20, 30], extent=60) @@ -279,11 +254,6 @@ def test_zero_edge_rejected(self) -> None: VaryingDimension([10, 0, 5], extent=15) -# --------------------------------------------------------------------------- -# ChunkSpec -# --------------------------------------------------------------------------- - - class TestChunkSpec: def test_basic(self) -> None: spec = ChunkSpec( @@ -302,11 +272,6 @@ def test_boundary(self) -> None: assert spec.is_boundary -# --------------------------------------------------------------------------- -# ChunkGrid construction -# --------------------------------------------------------------------------- - - class TestChunkGridConstruction: def test_from_regular(self) -> None: g = ChunkGrid.from_regular((100, 200), (10, 20)) @@ -341,11 +306,6 @@ def test_all_uniform_becomes_regular(self) -> None: assert g.chunk_shape == (10, 25) -# --------------------------------------------------------------------------- -# ChunkGrid queries -# 
--------------------------------------------------------------------------- - - class TestChunkGridQueries: def test_regular_shape(self) -> None: g = ChunkGrid.from_regular((100, 200), (10, 20)) @@ -453,11 +413,6 @@ def test_iter(self) -> None: assert all(isinstance(s, ChunkSpec) for s in specs) -# --------------------------------------------------------------------------- -# RLE helpers -# --------------------------------------------------------------------------- - - class TestRLE: def test_expand(self) -> None: assert expand_rle([[10, 3]]) == [10, 10, 10] @@ -502,19 +457,14 @@ def test_expand_rejects_negative_rle_count(self) -> None: class TestExpandRleHandlesJsonFloats: def test_bare_integer_floats_accepted(self) -> None: """JSON parsers may emit 10.0 for the integer 10; expand_rle should handle it.""" - result = expand_rle([10.0, 20.0]) + result = expand_rle([10.0, 20.0]) # type: ignore[list-item] assert result == [10, 20] def test_rle_pair_with_float_count(self) -> None: - result = expand_rle([[10, 3.0]]) + result = expand_rle([[10, 3.0]]) # type: ignore[list-item] assert result == [10, 10, 10] -# --------------------------------------------------------------------------- -# _decode_dim_spec edge cases -# --------------------------------------------------------------------------- - - class TestDecodeDimSpec: """Edge cases for _decode_dim_spec: floats, empty lists, negatives, missing extent.""" @@ -560,11 +510,6 @@ def test_none_raises(self) -> None: _decode_dim_spec(None, array_extent=10) -# --------------------------------------------------------------------------- -# _is_rectilinear_chunks edge cases -# --------------------------------------------------------------------------- - - class TestIsRectilinearChunks: """Edge cases for _is_rectilinear_chunks.""" @@ -604,11 +549,6 @@ def test_float(self) -> None: assert _is_rectilinear_chunks(3.14) is False -# --------------------------------------------------------------------------- -# _infer_chunk_grid_name 
edge cases -# --------------------------------------------------------------------------- - - class TestInferChunkGridName: """Edge cases for _infer_chunk_grid_name.""" @@ -639,11 +579,6 @@ def test_dict_with_rectilinear_name(self) -> None: assert _infer_chunk_grid_name(d, g) == "rectilinear" -# --------------------------------------------------------------------------- -# Serialization -# --------------------------------------------------------------------------- - - class TestSerialization: def test_regular_roundtrip(self) -> None: g = ChunkGrid.from_regular((100, 200), (10, 20)) @@ -710,7 +645,7 @@ def test_bare_int_roundtrip(self) -> None: "name": "rectilinear", "configuration": {"kind": "inline", "chunk_shapes": [10, [20, 30]]}, } - meta = RectilinearChunkGrid.from_dict(data) + meta = RectilinearChunkGrid.from_dict(data) # type: ignore[arg-type] out = meta.to_dict() # Dim 0 was bare int — should stay bare int assert out["configuration"]["chunk_shapes"][0] == 10 @@ -729,7 +664,7 @@ def test_serialize_non_regular_as_regular_raises(self) -> None: def test_serialize_unknown_name_raises(self) -> None: g = ChunkGrid.from_regular((100,), (10,)) with pytest.raises(ValueError, match="Unknown chunk grid name for serialization"): - serialize_chunk_grid(g, "hexagonal") + serialize_chunk_grid(g, "hexagonal") # type: ignore[arg-type] def test_zero_extent_rectilinear_raises(self) -> None: """Zero-extent grids cannot be serialized as rectilinear (spec requires positive edges).""" @@ -924,11 +859,6 @@ def test_single_chunk_boundary_codec_size_preserved(self) -> None: assert parsed.dimensions[0].chunk_size(0) == 10 -# --------------------------------------------------------------------------- -# Indexing with rectilinear grids -# --------------------------------------------------------------------------- - - class TestRectilinearIndexing: """Test that the indexing pipeline works with VaryingDimension.""" @@ -999,11 +929,6 @@ def test_oob_block_raises_bounds_check_error(self) -> 
None: a.get_block_selection((2,)) -# --------------------------------------------------------------------------- -# End-to-end: array creation with rectilinear chunks -# --------------------------------------------------------------------------- - - class TestEndToEnd: """Test creating, writing, and reading arrays with rectilinear chunk grids.""" @@ -1158,11 +1083,6 @@ def test_get_chunk_spec_rectilinear(self, tmp_path: Path) -> None: assert spec2.shape == (30, 50) -# --------------------------------------------------------------------------- -# Sharding compatibility -# --------------------------------------------------------------------------- - - class TestShardingCompat: def test_sharding_accepts_rectilinear_outer_grid(self) -> None: """ShardingCodec.validate should not reject rectilinear outer grids.""" @@ -1180,11 +1100,6 @@ def test_sharding_accepts_rectilinear_outer_grid(self) -> None: ) -# --------------------------------------------------------------------------- -# Edge cases -# --------------------------------------------------------------------------- - - class TestEdgeCases: """Edge cases around boundary chunks, zero-size dims, direct construction, and serialization round-trips.""" @@ -1562,11 +1477,6 @@ def test_sharding_accepts_divisible_rectilinear(self) -> None: ) -# --------------------------------------------------------------------------- -# Full-pipeline read/write tests with rectilinear grids -# --------------------------------------------------------------------------- - - class TestFullPipelineRectilinear: """End-to-end read/write tests through the full Array pipeline.""" @@ -1900,10 +1810,6 @@ def test_nchunks(self, tmp_path: Path) -> None: assert z.chunk_grid.get_nchunks() == 12 -# --------------------------------------------------------------------------- -# Hypothesis property-based tests -# --------------------------------------------------------------------------- - pytest.importorskip("hypothesis") import hypothesis.strategies as st 
# noqa: E402 @@ -1978,11 +1884,6 @@ def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: ) -# --------------------------------------------------------------------------- -# V2 regression tests -# --------------------------------------------------------------------------- - - class TestV2Regression: """Verify V2 arrays still work correctly after the ChunkGrid refactor. @@ -2089,11 +1990,6 @@ def test_v2_chunk_spec_via_grid(self, tmp_path: Path) -> None: assert spec.codec_shape == (10, 10) # full buffer -# --------------------------------------------------------------------------- -# .read_chunk_sizes / .write_chunk_sizes properties -# --------------------------------------------------------------------------- - - class TestChunkSizes: """Tests for ChunkGrid.chunk_sizes and Array.read_chunk_sizes / write_chunk_sizes.""" @@ -2145,11 +2041,6 @@ def test_array_sharded_chunk_sizes(self) -> None: assert arr.write_chunk_sizes == ((120,), (80,)) -# --------------------------------------------------------------------------- -# .info display for rectilinear grids -# --------------------------------------------------------------------------- - - def test_info_display_rectilinear() -> None: """Array.info should not crash for rectilinear grids.""" store = zarr.storage.MemoryStore() @@ -2166,11 +2057,6 @@ def test_info_display_rectilinear() -> None: assert "Array" in text -# --------------------------------------------------------------------------- -# Resize / append for rectilinear grids -# --------------------------------------------------------------------------- - - class TestUpdateShape: """Unit tests for ChunkGrid.update_shape().""" @@ -2454,11 +2340,6 @@ def test_parse_chunk_grid_regular_from_dict(self) -> None: assert g.get_nchunks() == 100 -# --------------------------------------------------------------------------- -# Boundary chunk tests -# --------------------------------------------------------------------------- - - class 
TestVaryingDimensionBoundary: """VaryingDimension with extent < sum(edges), mirroring how FixedDimension handles boundary chunks.""" From 475de214c825c4e3f0e47893e3a22ac522eb4f23 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 21 Mar 2026 00:17:45 -0400 Subject: [PATCH 093/118] Polish --- src/zarr/core/chunk_grids.py | 9 +------- tests/test_unified_chunk_grid.py | 39 +++++++++++++++----------------- 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index d095c430bf..42da511e37 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -327,12 +327,10 @@ def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[in n = ceildiv(array_extent, dim_spec) return [dim_spec] * n if isinstance(dim_spec, list): - # Check if the list contains any sub-lists (RLE pairs) or is all bare ints has_sublists = any(isinstance(e, list) for e in dim_spec) if has_sublists: return expand_rle(dim_spec) else: - # All bare integers — explicit edge lengths return [int(e) for e in dim_spec] raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") @@ -504,11 +502,7 @@ def chunk_shape(self) -> tuple[int, ...]: "chunk_shape is only available for regular chunk grids. " "Use grid[coords] for per-chunk sizes." 
) - return tuple( - d.size - for d in self.dimensions - if isinstance(d, FixedDimension) # guaranteed by is_regular - ) + return tuple(d.size for d in self.dimensions if isinstance(d, FixedDimension)) @property def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: @@ -748,7 +742,6 @@ def _infer_chunk_grid_name( if isinstance(data, dict): name, _ = parse_named_configuration(data) return cast("ChunkGridName", name) - # ChunkGrid passed directly — infer from structure return "regular" if grid.is_regular else "rectilinear" diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 20129f4af5..070a13a967 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -168,7 +168,6 @@ def test_basic(self) -> None: # chunk_size is always uniform (codec buffer) assert d.chunk_size(0) == 10 assert d.chunk_size(9) == 10 - # data_size clips at boundary assert d.data_size(0) == 10 assert d.data_size(9) == 10 assert d.nchunks == 10 @@ -176,8 +175,8 @@ def test_basic(self) -> None: def test_boundary_data_size(self) -> None: d = FixedDimension(size=10, extent=95) assert d.nchunks == 10 - assert d.chunk_size(9) == 10 # codec buffer always full - assert d.data_size(9) == 5 # only 5 valid elements at boundary + assert d.chunk_size(9) == 10 + assert d.data_size(9) == 5 def test_vectorized(self) -> None: d = FixedDimension(size=10, extent=100) @@ -196,7 +195,7 @@ def test_negative_extent_rejected(self) -> None: def test_zero_size_allowed(self) -> None: d = FixedDimension(size=0, extent=0) assert d.size == 0 - assert d.nchunks == 0 # zero-sized chunks can't hold data + assert d.nchunks == 0 # FixedDimension.chunk_offset/chunk_size/data_size do not bounds-check # for performance (callers validate). 
OOB access is tested via @@ -234,7 +233,6 @@ def test_chunk_size(self) -> None: def test_data_size(self) -> None: d = VaryingDimension([10, 20, 30], extent=60) - # data_size == chunk_size when extent == sum(edges) (no boundary) assert d.data_size(0) == 10 assert d.data_size(1) == 20 assert d.data_size(2) == 30 @@ -313,7 +311,7 @@ def test_regular_shape(self) -> None: def test_regular_shape_boundary(self) -> None: g = ChunkGrid.from_regular((95, 200), (10, 20)) - assert g.grid_shape == (10, 10) # ceildiv(95, 10) == 10 + assert g.grid_shape == (10, 10) def test_rectilinear_shape(self) -> None: g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) @@ -2347,29 +2345,28 @@ class TestVaryingDimensionBoundary: def test_extent_parameter(self) -> None: d = VaryingDimension([10, 20, 30], extent=50) assert d.extent == 50 - assert d.chunk_size(2) == 30 # codec buffer: full edge - assert d.data_size(2) == 20 # valid data: clipped to extent + assert d.chunk_size(2) == 30 + assert d.data_size(2) == 20 def test_extent_equals_sum_no_clipping(self) -> None: d = VaryingDimension([10, 20, 30], extent=60) assert d.extent == 60 - assert d.data_size(2) == 30 # no clipping when extent == sum(edges) + assert d.data_size(2) == 30 def test_data_size_interior_chunks_unaffected(self) -> None: d = VaryingDimension([10, 20, 30], extent=50) - assert d.data_size(0) == 10 # fully within extent - assert d.data_size(1) == 20 # fully within extent (offset 10, ends at 30) + assert d.data_size(0) == 10 + assert d.data_size(1) == 20 def test_data_size_at_exact_boundary(self) -> None: d = VaryingDimension([10, 20, 30], extent=60) - # extent == sum(edges), so no clipping assert d.data_size(2) == 30 def test_data_size_single_element_boundary(self) -> None: d = VaryingDimension([10, 20, 30], extent=31) assert d.data_size(0) == 10 assert d.data_size(1) == 20 - assert d.data_size(2) == 1 # only 1 element in last chunk + assert d.data_size(2) == 1 def 
test_extent_exceeds_sum_rejected(self) -> None: with pytest.raises(ValueError, match="exceeds sum of edges"): @@ -2384,8 +2381,8 @@ def test_chunk_spec_boundary_varying(self) -> None: g = ChunkGrid(dimensions=(VaryingDimension([10, 20, 30], extent=50),)) spec = g[(2,)] assert spec is not None - assert spec.codec_shape == (30,) # full edge - assert spec.shape == (20,) # clipped to extent + assert spec.codec_shape == (30,) + assert spec.shape == (20,) assert spec.is_boundary is True def test_chunk_spec_interior_varying(self) -> None: @@ -2404,12 +2401,12 @@ def test_multiple_chunks_past_extent(self) -> None: """Edges past extent are structural; nchunks counts active only.""" g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) d = g.dimensions[0] - assert d.ngridcells == 4 # structural: all edges - assert d.nchunks == 3 # active: chunks overlapping [0, 50) - assert d.data_size(0) == 10 # fully within - assert d.data_size(1) == 20 # fully within - assert d.data_size(2) == 20 # partial: 50 - 30 = 20 - assert d.chunk_size(2) == 30 # codec buffer: full edge + assert d.ngridcells == 4 + assert d.nchunks == 3 + assert d.data_size(0) == 10 + assert d.data_size(1) == 20 + assert d.data_size(2) == 20 + assert d.chunk_size(2) == 30 def test_chunk_spec_past_extent_is_oob(self) -> None: """Chunk entirely past the extent is out of bounds (not active).""" From a5715b906875557658a4380f4ff328c7c42a0b03 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 27 Mar 2026 08:23:33 +0100 Subject: [PATCH 094/118] Improve layout of work in progress page (#3841) * Improve layout of work in progress page * Remove out of date entries --- docs/user-guide/v3_migration.md | 39 +++++++++++++++------------------ 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/docs/user-guide/v3_migration.md b/docs/user-guide/v3_migration.md index d5a8067a88..8f835d4440 100644 --- a/docs/user-guide/v3_migration.md +++ b/docs/user-guide/v3_migration.md @@ -198,32 +198,29 @@ after the 
3.0.0 release. If features listed below are important to your use case of Zarr-Python, please open (or comment on) a [GitHub issue](https://github.com/zarr-developers/zarr-python/issues/new). -- The following functions / methods have not been ported to Zarr-Python 3 yet: +The following functions / methods have not been ported to Zarr-Python 3 yet: - * `zarr.copy` ([issue #2407](https://github.com/zarr-developers/zarr-python/issues/2407)) - * `zarr.copy_all` ([issue #2407](https://github.com/zarr-developers/zarr-python/issues/2407)) - * `zarr.copy_store` ([issue #2407](https://github.com/zarr-developers/zarr-python/issues/2407)) - * `zarr.Group.move` ([issue #2108](https://github.com/zarr-developers/zarr-python/issues/2108)) +- `zarr.copy` ([issue #2407](https://github.com/zarr-developers/zarr-python/issues/2407)) +- `zarr.copy_all` ([issue #2407](https://github.com/zarr-developers/zarr-python/issues/2407)) +- `zarr.copy_store` ([issue #2407](https://github.com/zarr-developers/zarr-python/issues/2407)) +- `zarr.Group.move` ([issue #2108](https://github.com/zarr-developers/zarr-python/issues/2108)) -- The following features (corresponding to function arguments to functions in +The following features (corresponding to function arguments to functions in `zarr`) have not been ported to Zarr-Python 3 yet. 
Using these features will raise a warning or a `NotImplementedError`: - * `cache_attrs` - * `cache_metadata` - * `chunk_store` ([issue #2495](https://github.com/zarr-developers/zarr-python/issues/2495)) - * `meta_array` - * `object_codec` ([issue #2617](https://github.com/zarr-developers/zarr-python/issues/2617)) - * `synchronizer` ([issue #1596](https://github.com/zarr-developers/zarr-python/issues/1596)) - * `dimension_separator` +- `cache_attrs` +- `cache_metadata` +- `chunk_store` ([issue #2495](https://github.com/zarr-developers/zarr-python/issues/2495)) +- `meta_array` +- `object_codec` ([issue #2617](https://github.com/zarr-developers/zarr-python/issues/2617)) +- `synchronizer` ([issue #1596](https://github.com/zarr-developers/zarr-python/issues/1596)) +- `dimension_separator` -- The following features that were supported by Zarr-Python 2 have not been ported +The following features that were supported by Zarr-Python 2 have not been ported to Zarr-Python 3 yet: - * Structured arrays / dtypes ([issue #2134](https://github.com/zarr-developers/zarr-python/issues/2134)) - * Fixed-length string dtypes ([issue #2347](https://github.com/zarr-developers/zarr-python/issues/2347)) - * Datetime and timedelta dtypes ([issue #2616](https://github.com/zarr-developers/zarr-python/issues/2616)) - * Object dtypes ([issue #2616](https://github.com/zarr-developers/zarr-python/issues/2616)) - * Ragged arrays ([issue #2618](https://github.com/zarr-developers/zarr-python/issues/2618)) - * Groups and Arrays do not implement `__enter__` and `__exit__` protocols ([issue #2619](https://github.com/zarr-developers/zarr-python/issues/2619)) - * Default filters for object dtypes for Zarr format 2 arrays ([issue #2627](https://github.com/zarr-developers/zarr-python/issues/2627)) +- Object dtypes ([issue #2616](https://github.com/zarr-developers/zarr-python/issues/2616)) +- Ragged arrays ([issue #2618](https://github.com/zarr-developers/zarr-python/issues/2618)) +- Groups and Arrays do not 
implement `__enter__` and `__exit__` protocols ([issue #2619](https://github.com/zarr-developers/zarr-python/issues/2619)) +- Default filters for object dtypes for Zarr format 2 arrays ([issue #2627](https://github.com/zarr-developers/zarr-python/issues/2627)) From 3a9d0420c91af423fdd80ed6435563d6879d8c7a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sun, 29 Mar 2026 17:50:31 -0400 Subject: [PATCH 095/118] perf: oindex optimization (#3830) * oindex single dim optimization * changelog * changelog type * Apply suggestions from code review Co-authored-by: Deepak Cherian * lint --------- Co-authored-by: Deepak Cherian --- changes/3830.misc.md | 1 + src/zarr/core/indexing.py | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) create mode 100644 changes/3830.misc.md diff --git a/changes/3830.misc.md b/changes/3830.misc.md new file mode 100644 index 0000000000..f622038f7e --- /dev/null +++ b/changes/3830.misc.md @@ -0,0 +1 @@ +Optimize the performance of indexing operations when using an array-like indexer on a single dimension. diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 72b19ef0b2..d2e8b67f0e 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -981,19 +981,26 @@ def __iter__(self) -> Iterator[ChunkProjection]: # handle advanced indexing arrays orthogonally if self.is_advanced: - # N.B., numpy doesn't support orthogonal indexing directly as yet, - # so need to work around via np.ix_. Also np.ix_ does not support a - # mixture of arrays and slices or integers, so need to convert slices - # and integers into ranges. - chunk_shape = tuple( - g.chunk_size(p.dim_chunk_ix) - for g, p in zip(self.dim_grids, dim_projections, strict=True) - ) - chunk_selection = ix_(chunk_selection, chunk_shape) + # NumPy can handle a single array-indexed dimension directly, + # which preserves full slices and avoids an + # unnecessary advanced-indexing copy. 
Integer-indexed + # dimensions still need the ix_ path for downstream squeezing. + # Example: we skip `ix_` for array[:, :, [1, 2, 3]] + n_array_dims = sum(isinstance(sel, np.ndarray) for sel in chunk_selection) + + if n_array_dims > 1 or self.drop_axes: + # N.B., numpy doesn't support orthogonal indexing directly + # for multiple array-indexed dimensions, so we need to + # convert the orthogonal selection into coordinate arrays. + chunk_shape = tuple( + g.chunk_size(p.dim_chunk_ix) + for g, p in zip(self.dim_grids, dim_projections, strict=True) + ) + chunk_selection = ix_(chunk_selection, chunk_shape) - # special case for non-monotonic indices - if not is_basic_selection(out_selection): - out_selection = ix_(out_selection, self.shape) + # special case for non-monotonic indices + if not is_basic_selection(out_selection): + out_selection = ix_(out_selection, self.shape) is_complete_chunk = all(p.is_complete_chunk for p in dim_projections) yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk) From 1c2efa6d634f789d8c05dcf182787fd0ee0c53d3 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Fri, 27 Mar 2026 18:15:15 +0100 Subject: [PATCH 096/118] chore: spec0 compat (python 3.14 compat, python 3.12 min) (#3564) * chore: spec0 compat (python 3.14 compat, numpy 2 min) * fix: point oh * fix: `universal_pathlib` min bound * fix: new generic syntax (leave `covariant` untouched) * feat: `uv` in `hatch` * fix: try covariance handling * fix: allow covariance in `ZDType` * fix: remove covariance in `common.py` * fix: remove covariance in `codec.py` * fix: remove unused `TypeVar` * fix: bye byte `covariant`! * fix: merge issue * fix: more merge issues * fix: universal_pathlib * fix: pre-commit * fix: no more needed to pin numpy * chore: relnote * `yaml` not `yml` * fix: some `NDArrayLike` fixes * fix: back to old numpy * Revert "fix: some `NDArrayLike` fixes" This reverts commit 73b72b15629d3249c8c056a8bedd15a8aa383d69. 
* fix: try 3.12 * fix: oops * fix: ok that still produces some wierd output locally? * chore: try printing out full issue * fix: try skipping darwin build * fix: bring back `GetResult` --------- Co-authored-by: Deepak Cherian --- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/workflows/gpu_test.yml | 2 +- .github/workflows/nightly_wheels.yml | 2 +- .github/workflows/releases.yml | 2 +- .github/workflows/test.yml | 16 +++---- .pre-commit-config.yaml | 4 +- changes/3546.misc.md | 1 + docs/contributing.md | 2 +- docs/index.md | 2 +- docs/user-guide/installation.md | 2 +- examples/custom_dtype/custom_dtype.py | 2 +- pyproject.toml | 23 ++++++---- src/zarr/_compat.py | 6 +-- src/zarr/abc/codec.py | 46 +++++++++---------- src/zarr/abc/numcodec.py | 4 +- src/zarr/abc/store.py | 4 +- src/zarr/api/asynchronous.py | 4 +- src/zarr/core/array.py | 17 +++---- src/zarr/core/buffer/core.py | 2 +- src/zarr/core/chunk_key_encodings.py | 4 +- src/zarr/core/codec_pipeline.py | 9 ++-- src/zarr/core/common.py | 22 ++------- src/zarr/core/dtype/__init__.py | 4 +- src/zarr/core/dtype/common.py | 11 ++--- src/zarr/core/dtype/npy/common.py | 15 ------ src/zarr/core/dtype/npy/complex.py | 27 +++++------ src/zarr/core/dtype/npy/float.py | 29 ++++++------ src/zarr/core/dtype/npy/int.py | 25 +++++----- src/zarr/core/dtype/npy/string.py | 6 +-- src/zarr/core/dtype/npy/time.py | 20 +++----- src/zarr/core/dtype/wrapper.py | 26 ++++------- src/zarr/core/group.py | 10 ++-- src/zarr/core/indexing.py | 15 ++---- src/zarr/core/metadata/__init__.py | 7 +-- src/zarr/core/metadata/v2.py | 8 ++-- src/zarr/core/sync.py | 17 ++++--- src/zarr/registry.py | 6 +-- src/zarr/storage/_common.py | 4 +- src/zarr/storage/_logging.py | 6 +-- src/zarr/storage/_obstore.py | 7 +-- src/zarr/storage/_utils.py | 20 +++++--- src/zarr/storage/_wrapper.py | 6 +-- src/zarr/testing/stateful.py | 6 +-- src/zarr/testing/store.py | 8 +--- src/zarr/testing/utils.py | 7 +-- src/zarr/types.py | 14 +++--- 
tests/test_regression/scripts/v2.18.py | 2 +- .../test_v2_dtype_regression.py | 7 ++- tests/test_store/test_core.py | 8 +--- 49 files changed, 212 insertions(+), 287 deletions(-) create mode 100644 changes/3546.misc.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 84bb89d82a..e765e3136e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -61,7 +61,7 @@ body: value: | ```python # /// script - # requires-python = ">=3.11" + # requires-python = ">=3.12" # dependencies = [ # "zarr@git+https://github.com/zarr-developers/zarr-python.git@main", # ] diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml index c474485dc0..4fdffab057 100644 --- a/.github/workflows/gpu_test.yml +++ b/.github/workflows/gpu_test.yml @@ -27,7 +27,7 @@ jobs: runs-on: gpu-runner strategy: matrix: - python-version: ['3.11'] + python-version: ['3.12'] steps: - uses: actions/checkout@v6 diff --git a/.github/workflows/nightly_wheels.yml b/.github/workflows/nightly_wheels.yml index 834d563722..56ffe8f1b4 100644 --- a/.github/workflows/nightly_wheels.yml +++ b/.github/workflows/nightly_wheels.yml @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v6 name: Install Python with: - python-version: '3.13' + python-version: '3.14' - name: Install Hatch uses: pypa/hatch@257e27e51a6a5616ed08a39a408a21c35c9931bc diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index bb9256568c..fde8ff9804 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -31,7 +31,7 @@ jobs: - uses: actions/setup-python@v6 name: Install Python with: - python-version: '3.11' + python-version: '3.12' - name: Install Hatch uses: pypa/hatch@257e27e51a6a5616ed08a39a408a21c35c9931bc diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5af29c960e..8c55c7e93d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,23 +23,23 @@ 
jobs: strategy: matrix: - python-version: ['3.11', '3.12', '3.13'] + python-version: ['3.12', '3.13', '3.14'] dependency-set: ["minimal", "optional"] os: ["ubuntu-latest"] include: - - python-version: '3.11' + - python-version: '3.12' dependency-set: 'optional' os: 'macos-latest' - - python-version: '3.13' + - python-version: '3.14' dependency-set: 'optional' os: 'macos-latest' - - python-version: '3.11' + - python-version: '3.12' dependency-set: 'optional' os: 'windows-latest' - - python-version: '3.13' + - python-version: '3.14' dependency-set: 'optional' os: 'windows-latest' runs-on: ${{ matrix.os }} @@ -79,12 +79,12 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.11', "3.13"] + python-version: ['3.12', "3.14"] dependency-set: ["upstream", "min_deps"] exclude: - - python-version: "3.13" + - python-version: "3.14" dependency-set: min_deps - - python-version: "3.11" + - python-version: "3.12" dependency-set: upstream steps: - uses: actions/checkout@v6 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 37f41b8222..5d2b5de860 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ ci: default_stages: [pre-commit, pre-push] default_language_version: - python: python3.11 + python: python3.12 repos: - repo: https://github.com/astral-sh/ruff-pre-commit @@ -38,7 +38,7 @@ repos: - donfig - numcodecs - google-crc32c>=1.5 - - numpy==2.1 # until https://github.com/numpy/numpy/issues/28034 is resolved + - numpy==2.1 # https://github.com/zarr-developers/zarr-python/issues/3780 + https://github.com/zarr-developers/zarr-python/issues/3688 - typing_extensions - universal-pathlib - obstore>=0.5.1 diff --git a/changes/3546.misc.md b/changes/3546.misc.md new file mode 100644 index 0000000000..77fa0acb5f --- /dev/null +++ b/changes/3546.misc.md @@ -0,0 +1 @@ +Upgrade to spec0 compat (python 3.14 max, python 3.12 min). 
\ No newline at end of file diff --git a/docs/contributing.md b/docs/contributing.md index b2c1ae635c..e62ce54c35 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -205,7 +205,7 @@ When submitting a pull request, coverage will also be collected across all suppo ### Documentation -Docstrings for user-facing classes and functions should follow the [numpydoc](https://numpydoc.readthedocs.io/en/stable/format.html#docstring-standard) standard, including sections for Parameters and Examples. All examples should run and pass as doctests under Python 3.11. +Docstrings for user-facing classes and functions should follow the [numpydoc](https://numpydoc.readthedocs.io/en/stable/format.html#docstring-standard) standard, including sections for Parameters and Examples. All examples should run and pass as doctests under Python 3.12. Zarr uses mkdocs for documentation, hosted on readthedocs.org. Documentation is written in the Markdown markup language (.md files) in the `docs` folder. The documentation consists both of prose and API documentation. All user-facing classes and functions are included in the API documentation, under the `docs/api` folder using the [mkdocstrings](https://mkdocstrings.github.io/) extension. Add any new public functions or classes to the relevant markdown file in `docs/api/*.md`. Any new features or important usage information should be included in the user-guide (`docs/user-guide`). Any changes should also be included as a new file in the `changes` directory. diff --git a/docs/index.md b/docs/index.md index b61646d6a6..b8c2b07ee7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,7 +21,7 @@ Zarr-Python is a Python library for reading and writing Zarr groups and arrays. ## Installation -Zarr requires Python 3.11 or higher. You can install it via `pip`: +Zarr requires Python 3.12 or higher. 
You can install it via `pip`: ```bash pip install zarr diff --git a/docs/user-guide/installation.md b/docs/user-guide/installation.md index 6c1414e81a..c902acf171 100644 --- a/docs/user-guide/installation.md +++ b/docs/user-guide/installation.md @@ -4,7 +4,7 @@ Required dependencies include: -- [Python](https://docs.python.org/3/) (3.11 or later) +- [Python](https://docs.python.org/3/) (3.12 or later) - [packaging](https://packaging.pypa.io) (22.0 or later) - [numpy](https://numpy.org) (2.0 or later) - [numcodecs](https://numcodecs.readthedocs.io) (0.14 or later) diff --git a/examples/custom_dtype/custom_dtype.py b/examples/custom_dtype/custom_dtype.py index ec38d782b6..eee510349b 100644 --- a/examples/custom_dtype/custom_dtype.py +++ b/examples/custom_dtype/custom_dtype.py @@ -1,5 +1,5 @@ # /// script -# requires-python = ">=3.11" +# requires-python = ">=3.12" # dependencies = [ # "zarr @ git+https://github.com/zarr-developers/zarr-python.git@main", # "ml_dtypes==0.5.1", diff --git a/pyproject.toml b/pyproject.toml index b1077e3e5d..8277c3f752 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,11 +29,11 @@ maintainers = [ { name = "Tom Augspurger", email = "tom.w.augspurger@gmail.com" }, { name = "Deepak Cherian" } ] -requires-python = ">=3.11" +requires-python = ">=3.12" # If you add a new dependency here, please also add it to .pre-commit-config.yaml dependencies = [ 'packaging>=22.0', - 'numpy>=2.0', + 'numpy>=2', 'numcodecs>=0.14', 'google-crc32c>=1.5', 'typing_extensions>=4.12', @@ -52,9 +52,9 @@ classifiers = [ 'Topic :: Software Development :: Libraries :: Python Modules', 'Operating System :: Unix', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', + 'Programming Language :: Python :: 3.14', ] license = "MIT" license-files = ["LICENSE.txt"] @@ -161,7 +161,7 @@ COV_CORE_CONFIG = ".coveragerc" COV_CORE_DATAFILE = ".coverage.eager" 
[[tool.hatch.envs.test.matrix]] -python = ["3.11", "3.12", "3.13"] +python = ["3.12", "3.13", "3.14"] deps = ["minimal", "optional"] [tool.hatch.envs.test.overrides] @@ -192,7 +192,7 @@ extra-dependencies = [ features = ["gpu"] [[tool.hatch.envs.gputest.matrix]] -python = ["3.11", "3.12", "3.13"] +python = ["3.12", "3.13"] [tool.hatch.envs.gputest.scripts] run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=src --cov-report xml --junitxml=junit.xml -o junit_family=legacy --ignore tests/benchmarks" @@ -200,7 +200,7 @@ run = "run-coverage --no-cov" [tool.hatch.envs.upstream] template = 'test' -python = "3.13" +python = "3.14" extra-dependencies = [ 'packaging @ git+https://github.com/pypa/packaging', 'numpy', # from scientific-python-nightly-wheels @@ -223,7 +223,7 @@ description = """Test environment for minimum supported dependencies See Spec 0000 for details and drop schedule: https://scientific-python.org/specs/spec-0000/ """ template = "test" -python = "3.11" +python = "3.12" features = ["remote"] dependency-groups = ["remote-tests"] extra-dependencies = [ @@ -232,12 +232,15 @@ extra-dependencies = [ 'numcodecs==0.14.*', # 0.14 needed for zarr3 codecs 'fsspec==2023.10.0', 's3fs==2023.10.0', - 'universal_pathlib==0.0.22', + 'universal_pathlib==0.2.0', 'typing_extensions==4.12.*', 'donfig==0.8.*', 'obstore==0.5.*', ] +[tool.hatch.envs.defaults] +installer = "uv" + [tool.hatch.envs.docs] features = ['remote'] dependency-groups = ['docs'] @@ -343,7 +346,7 @@ ignore = [ "tests/**" = ["ANN001", "ANN201", "RUF029", "SIM117", "SIM300"] [tool.mypy] -python_version = "3.11" +python_version = "3.12" ignore_missing_imports = true namespace_packages = false @@ -409,7 +412,7 @@ filterwarnings = [ # s3fs finalizers can fail during session cleanup when aiobotocore sessions are garbage # collected without being entered. 
This is a known issue in s3fs/aiobotocore, and pytest # per-test filterwarnings markers can't catch it (https://github.com/pytest-dev/pytest/issues/14096). - "ignore:Exception ignored in[\\s\\S]*Session was never entered:pytest.PytestUnraisableExceptionWarning", + "ignore:Exception ignored ((on calling weakref callback)|(in[\\s\\S]*Session was never entered)):pytest.PytestUnraisableExceptionWarning", ] markers = [ "asyncio: mark test as asyncio test", diff --git a/src/zarr/_compat.py b/src/zarr/_compat.py index ae973d6292..061e75dbe4 100644 --- a/src/zarr/_compat.py +++ b/src/zarr/_compat.py @@ -2,7 +2,7 @@ from collections.abc import Callable from functools import wraps from inspect import Parameter, signature -from typing import TYPE_CHECKING, Any, TypeVar +from typing import TYPE_CHECKING, Any import numpy as np from packaging.version import Version @@ -12,12 +12,10 @@ if TYPE_CHECKING: from numpy.typing import NDArray -T = TypeVar("T") - # Based off https://github.com/scikit-learn/scikit-learn/blob/e87b32a81c70abed8f2e97483758eb64df8255e9/sklearn/utils/validation.py#L63 -def _deprecate_positional_args( +def _deprecate_positional_args[T]( func: Callable[..., T] | None = None, *, version: str = "3.1.0" ) -> Callable[..., T]: """Decorator for methods that issues warnings for positional arguments. 
diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 9e0260573f..0408e4769e 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,7 +2,7 @@ from abc import abstractmethod from collections.abc import Mapping -from typing import TYPE_CHECKING, Generic, Literal, Protocol, TypeGuard, TypeVar, runtime_checkable +from typing import TYPE_CHECKING, Literal, Protocol, TypeGuard, runtime_checkable from typing_extensions import ReadOnly, TypedDict @@ -43,13 +43,11 @@ class GetResult(TypedDict): status: Literal["present", "missing"] -CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) -CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer) +type CodecInput = NDBuffer | Buffer +type CodecOutput = NDBuffer | Buffer -TName = TypeVar("TName", bound=str, covariant=True) - -class CodecJSON_V2(TypedDict, Generic[TName]): +class CodecJSON_V2[TName: str](TypedDict): """The JSON representation of a codec for Zarr V2""" id: ReadOnly[TName] @@ -85,7 +83,7 @@ def _encode_sync( ) -> NDBuffer | Buffer | None: ... -class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): +class BaseCodec[CI: CodecInput, CO: CodecOutput](Metadata): """Generic base class for codecs. Codecs can be registered via zarr.codecs.registry. @@ -163,13 +161,13 @@ def validate( The array chunk grid metadata """ - async def _decode_single(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: + async def _decode_single(self, chunk_data: CO, chunk_spec: ArraySpec) -> CI: raise NotImplementedError # pragma: no cover async def decode( self, - chunks_and_specs: Iterable[tuple[CodecOutput | None, ArraySpec]], - ) -> Iterable[CodecInput | None]: + chunks_and_specs: Iterable[tuple[CO | None, ArraySpec]], + ) -> Iterable[CI | None]: """Decodes a batch of chunks. Chunks can be None in which case they are ignored by the codec. 
@@ -180,25 +178,23 @@ async def decode( Returns ------- - Iterable[CodecInput | None] + Iterable[CI | None] """ return await _batching_helper(self._decode_single, chunks_and_specs) - async def _encode_single( - self, chunk_data: CodecInput, chunk_spec: ArraySpec - ) -> CodecOutput | None: + async def _encode_single(self, chunk_data: CI, chunk_spec: ArraySpec) -> CO | None: raise NotImplementedError # pragma: no cover async def encode( self, - chunks_and_specs: Iterable[tuple[CodecInput | None, ArraySpec]], - ) -> Iterable[CodecOutput | None]: + chunks_and_specs: Iterable[tuple[CI | None, ArraySpec]], + ) -> Iterable[CO | None]: """Encodes a batch of chunks. Chunks can be None in which case they are ignored by the codec. Parameters ---------- - chunks_and_specs : Iterable[tuple[CodecInput | None, ArraySpec]] + chunks_and_specs : Iterable[tuple[CI | None, ArraySpec]] Ordered set of to-be-encoded chunks with their accompanying chunk spec. Returns @@ -491,10 +487,10 @@ async def write( ... -async def _batching_helper( - func: Callable[[CodecInput, ArraySpec], Awaitable[CodecOutput | None]], - batch_info: Iterable[tuple[CodecInput | None, ArraySpec]], -) -> list[CodecOutput | None]: +async def _batching_helper[CI: CodecInput, CO: CodecOutput]( + func: Callable[[CI, ArraySpec], Awaitable[CO | None]], + batch_info: Iterable[tuple[CI | None, ArraySpec]], +) -> list[CO | None]: return await concurrent_map( list(batch_info), _noop_for_none(func), @@ -502,10 +498,10 @@ async def _batching_helper( ) -def _noop_for_none( - func: Callable[[CodecInput, ArraySpec], Awaitable[CodecOutput | None]], -) -> Callable[[CodecInput | None, ArraySpec], Awaitable[CodecOutput | None]]: - async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | None: +def _noop_for_none[CI: CodecInput, CO: CodecOutput]( + func: Callable[[CI, ArraySpec], Awaitable[CO | None]], +) -> Callable[[CI | None, ArraySpec], Awaitable[CO | None]]: + async def wrap(chunk: CI | None, chunk_spec: 
ArraySpec) -> CO | None: if chunk is None: return None return await func(chunk, chunk_spec) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index 76eac1d898..d60422209a 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,6 +1,4 @@ -from typing import Any, Self, TypeGuard - -from typing_extensions import Protocol +from typing import Any, Protocol, Self, TypeGuard class Numcodec(Protocol): diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index d2ab353d43..600df17ee5 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -12,7 +12,7 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable from types import TracebackType - from typing import Any, Self, TypeAlias + from typing import Any, Self from zarr.core.buffer import Buffer, BufferPrototype @@ -54,7 +54,7 @@ class SuffixByteRequest: """The number of bytes from the suffix to request.""" -ByteRequest: TypeAlias = RangeByteRequest | OffsetByteRequest | SuffixByteRequest +type ByteRequest = RangeByteRequest | OffsetByteRequest | SuffixByteRequest class Store(ABC): diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 66cf3bad7e..c776176665 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -3,7 +3,7 @@ import asyncio import dataclasses import warnings -from typing import TYPE_CHECKING, Any, Literal, NotRequired, TypeAlias, TypedDict, cast +from typing import TYPE_CHECKING, Any, Literal, NotRequired, TypedDict, cast import numpy as np import numpy.typing as npt @@ -61,7 +61,7 @@ from zarr.types import AnyArray, AnyAsyncArray # TODO: this type could use some more thought - ArrayLike: TypeAlias = AnyAsyncArray | AnyArray | npt.NDArray[Any] + type ArrayLike = AnyAsyncArray | AnyArray | npt.NDArray[Any] PathLike = str __all__ = [ diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a22a7e6cdf..dc12aafaaf 100644 --- a/src/zarr/core/array.py +++ 
b/src/zarr/core/array.py @@ -10,9 +10,7 @@ from typing import ( TYPE_CHECKING, Any, - Generic, Literal, - TypeAlias, TypedDict, cast, overload, @@ -111,7 +109,6 @@ ArrayV2Metadata, ArrayV2MetadataDict, ArrayV3Metadata, - T_ArrayMetadata, ) from zarr.core.metadata.io import save_metadata from zarr.core.metadata.v2 import ( @@ -300,7 +297,7 @@ async def get_array_metadata( @dataclass(frozen=True) -class AsyncArray(Generic[T_ArrayMetadata]): +class AsyncArray[T_ArrayMetadata: (ArrayV2Metadata, ArrayV3Metadata)]: """ An asynchronous array class representing a chunked array stored in a Zarr store. @@ -2042,7 +2039,7 @@ def _info( # TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed @dataclass(frozen=False) -class Array(Generic[T_ArrayMetadata]): +class Array[T_ArrayMetadata: (ArrayV2Metadata, ArrayV3Metadata)]: """ A Zarr array. """ @@ -4359,7 +4356,7 @@ async def _shards_initialized( ) -FiltersLike: TypeAlias = ( +type FiltersLike = ( Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec | Iterable[Numcodec] @@ -4368,9 +4365,9 @@ async def _shards_initialized( | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors -CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None +type CompressorLike = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None -CompressorsLike: TypeAlias = ( +type CompressorsLike = ( Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] | Mapping[str, JSON] | BytesBytesCodec @@ -4378,7 +4375,7 @@ async def _shards_initialized( | Literal["auto"] | None ) -SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] +type SerializerLike = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] class ShardsConfigParam(TypedDict): @@ -4386,7 +4383,7 @@ class ShardsConfigParam(TypedDict): index_location: ShardingCodecIndexLocation | None -ShardsLike: TypeAlias = ( +type ShardsLike = ( 
tuple[int, ...] | Sequence[Sequence[int]] | ShardsConfigParam | Literal["auto"] ) diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 9602a55258..ddfb179213 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -267,7 +267,7 @@ def as_buffer_like(self) -> BytesLike: ------- An object that implements the Python buffer protocol """ - return memoryview(self.as_numpy_array()) # type: ignore[arg-type] + return memoryview(self.as_numpy_array()) def to_bytes(self) -> bytes: """Returns the buffer as `bytes` (host memory). diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 9eef80656d..098f2c8981 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeAlias, TypedDict, cast +from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypedDict, cast if TYPE_CHECKING: from typing import NotRequired, Self @@ -62,7 +62,7 @@ def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: """ -ChunkKeyEncodingLike: TypeAlias = ( +type ChunkKeyEncodingLike = ( dict[str, JSON] | ChunkKeyEncodingParams | ChunkKeyEncoding | NamedConfig[str, Any] ) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index d2d6ad9eac..a9de9b4dbe 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from itertools import islice, pairwise -from typing import TYPE_CHECKING, Any, TypeVar +from typing import TYPE_CHECKING, Any from warnings import warn from zarr.abc.codec import ( @@ -31,11 +31,8 @@ from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from zarr.core.metadata.v3 import ChunkGridMetadata -T = TypeVar("T") -U = TypeVar("U") - -def _unzip2(iterable: Iterable[tuple[T, U]]) -> tuple[list[T], 
list[U]]: +def _unzip2[T, U](iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]: out0: list[T] = [] out1: list[U] = [] for item0, item1 in iterable: @@ -44,7 +41,7 @@ def _unzip2(iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]: return (out0, out1) -def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]: +def batched[T](iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]: if n < 1: raise ValueError("n must be at least one") it = iter(iterable) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 98fca1faf7..318cc67068 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -12,11 +12,9 @@ TYPE_CHECKING, Any, Final, - Generic, Literal, NotRequired, TypedDict, - TypeVar, cast, overload, ) @@ -50,11 +48,8 @@ DimensionNamesLike = Iterable[str | None] | None DimensionNames = DimensionNamesLike # for backwards compatibility -TName = TypeVar("TName", bound=str) -TConfig = TypeVar("TConfig", bound=Mapping[str, object]) - -class NamedConfig(TypedDict, Generic[TName, TConfig]): +class NamedConfig[TName: str, TConfig: Mapping[str, object]](TypedDict): """ A typed dictionary representing an object with a name and configuration, where the configuration is an optional mapping of string keys to values, e.g. another typed dictionary or a JSON object. @@ -70,7 +65,7 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]): """The configuration of the object. Not required.""" -class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]): +class NamedRequiredConfig[TName: str, TConfig: Mapping[str, object]](TypedDict): """ A typed dictionary representing an object with a name and configuration, where the configuration is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. 
@@ -96,11 +91,7 @@ def ceildiv(a: float, b: float) -> int: return math.ceil(a / b) -T = TypeVar("T", bound=tuple[Any, ...]) -V = TypeVar("V") - - -async def concurrent_map( +async def concurrent_map[T: tuple[Any, ...], V]( items: Iterable[T], func: Callable[..., Awaitable[V]], limit: int | None = None, @@ -118,15 +109,12 @@ async def run(item: tuple[Any]) -> V: return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items]) -E = TypeVar("E", bound=Enum) - - -def enum_names(enum: type[E]) -> Iterator[str]: +def enum_names[E: Enum](enum: type[E]) -> Iterator[str]: for item in enum: yield item.name -def parse_enum(data: object, cls: type[E]) -> E: +def parse_enum[E: Enum](data: object, cls: type[E]) -> E: if isinstance(data, cls): return data if not isinstance(data, str): diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 1049a2063f..7c7b0fc5c6 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections.abc import Sequence -from typing import TYPE_CHECKING, Final, TypeAlias +from typing import TYPE_CHECKING, Final from zarr.core.dtype.common import ( DataTypeValidationError, @@ -149,7 +149,7 @@ VLEN_UTF8_ALIAS: Final = ("str", str, "string") # This type models inputs that can be coerced to a ZDType -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str +type ZDTypeLike = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str for dtype in ANY_DTYPE: # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 6b70f595ba..87e46b53d2 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -6,11 +6,9 @@ from typing import ( ClassVar, Final, - Generic, Literal, TypedDict, TypeGuard, - TypeVar, ) from typing_extensions import ReadOnly @@ -53,13 
+51,10 @@ # This models the type of the name a dtype might have in zarr v2 array metadata DTypeName_V2 = StructuredName_V2 | str -TDTypeNameV2_co = TypeVar("TDTypeNameV2_co", bound=DTypeName_V2, covariant=True) -TObjectCodecID_co = TypeVar("TObjectCodecID_co", bound=None | str, covariant=True) - -class DTypeConfig_V2(TypedDict, Generic[TDTypeNameV2_co, TObjectCodecID_co]): - name: ReadOnly[TDTypeNameV2_co] - object_codec_id: ReadOnly[TObjectCodecID_co] +class DTypeConfig_V2[TDTypeNameV2: DTypeName_V2, TObjectCodecID: None | str](TypedDict): + name: ReadOnly[TDTypeNameV2] + object_codec_id: ReadOnly[TObjectCodecID] DTypeSpec_V2 = DTypeConfig_V2[DTypeName_V2, None | str] diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 107b3bd12d..f413f5f678 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -15,7 +15,6 @@ SupportsIndex, SupportsInt, TypeGuard, - TypeVar, ) import numpy as np @@ -67,20 +66,6 @@ NumpyEndiannessStr = Literal[">", "<", "="] NUMPY_ENDIANNESS_STR: Final = ">", "<", "=" -TFloatDType_co = TypeVar( - "TFloatDType_co", - bound=np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, - covariant=True, -) -TFloatScalar_co = TypeVar( - "TFloatScalar_co", bound=np.float16 | np.float32 | np.float64, covariant=True -) - -TComplexDType_co = TypeVar( - "TComplexDType_co", bound=np.dtypes.Complex64DType | np.dtypes.Complex128DType, covariant=True -) -TComplexScalar_co = TypeVar("TComplexScalar_co", bound=np.complex64 | np.complex128, covariant=True) - def endianness_from_numpy_str(endianness: NumpyEndiannessStr) -> EndiannessStr: """ diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 99abee5e24..76a0f05869 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -22,8 +22,6 @@ ) from zarr.core.dtype.npy.common import ( ComplexLike, - TComplexDType_co, - TComplexScalar_co, check_json_complex_float_v2, 
check_json_complex_float_v3, complex_float_from_json_v2, @@ -40,7 +38,10 @@ @dataclass(frozen=True) -class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): +class BaseComplex[ + DType: np.dtypes.Complex64DType | np.dtypes.Complex128DType, + Scalar: np.complex64 | np.complex128, +](ZDType[DType, Scalar], HasEndianness, HasItemSize): """ A base class for Zarr data types that wrap NumPy complex float data types. """ @@ -74,18 +75,18 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> TComplexDType_co: + def to_native_dtype(self) -> DType: """ Convert this class to a NumPy complex dtype with the appropriate byte order. Returns ------- - TComplexDType_co + DType A NumPy data type object representing the complex data type with the specified byte order. """ byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] + return self.dtype_cls().newbyteorder(byte_order) # type: ignore[no-any-return,call-overload] @classmethod def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: @@ -235,7 +236,7 @@ def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: """ return isinstance(data, ComplexLike) - def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: + def _cast_scalar_unchecked(self, data: ComplexLike) -> Scalar: """ Cast the provided scalar data to the native scalar type of this class. @@ -246,7 +247,7 @@ def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: Returns ------- - TComplexScalar_co + Scalar The casted data as a numpy complex scalar. 
Notes @@ -256,7 +257,7 @@ def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: """ return self.to_native_dtype().type(data) # type: ignore[return-value] - def cast_scalar(self, data: object) -> TComplexScalar_co: + def cast_scalar(self, data: object) -> Scalar: """ Attempt to cast a given object to a numpy complex scalar. @@ -267,7 +268,7 @@ def cast_scalar(self, data: object) -> TComplexScalar_co: Returns ------- - TComplexScalar_co + Scalar The data cast as a numpy complex scalar. Raises @@ -283,7 +284,7 @@ def cast_scalar(self, data: object) -> TComplexScalar_co: ) raise TypeError(msg) - def default_scalar(self) -> TComplexScalar_co: + def default_scalar(self) -> Scalar: """ Get the default value, which is 0 cast to this dtype @@ -294,7 +295,7 @@ def default_scalar(self) -> TComplexScalar_co: """ return self._cast_scalar_unchecked(0) - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> Scalar: """ Read a JSON-serializable value as a numpy float. @@ -307,7 +308,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexSc Returns ------- - TScalar_co + Scalar The numpy float. 
""" if zarr_format == 2: diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 2a23cb429d..668e169d8b 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -15,8 +15,6 @@ ) from zarr.core.dtype.npy.common import ( FloatLike, - TFloatDType_co, - TFloatScalar_co, check_json_float_v2, check_json_float_v3, check_json_floatish_str, @@ -34,7 +32,10 @@ @dataclass(frozen=True) -class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): +class BaseFloat[ + DType: np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, + Scalar: np.float16 | np.float32 | np.float64, +](ZDType[DType, Scalar], HasEndianness, HasItemSize): """ A base class for Zarr data types that wrap NumPy float data types. """ @@ -63,17 +64,17 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> TFloatDType_co: + def to_native_dtype(self) -> DType: """ Convert the wrapped data type to a NumPy data type. Returns ------- - TFloatDType_co + DType The NumPy data type. """ byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] + return self.dtype_cls().newbyteorder(byte_order) # type: ignore[no-any-return,call-overload] @classmethod def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: @@ -213,7 +214,7 @@ def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: return True return isinstance(data, FloatLike) - def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: + def _cast_scalar_unchecked(self, data: FloatLike) -> Scalar: """ Cast a scalar value to a NumPy float scalar. @@ -224,12 +225,12 @@ def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: Returns ------- - TFloatScalar_co + Scalar The NumPy float scalar. 
""" return self.to_native_dtype().type(data) # type: ignore[return-value] - def cast_scalar(self, data: object) -> TFloatScalar_co: + def cast_scalar(self, data: object) -> Scalar: """ Cast a scalar value to a NumPy float scalar. @@ -240,7 +241,7 @@ def cast_scalar(self, data: object) -> TFloatScalar_co: Returns ------- - TFloatScalar_co + Scalar The NumPy float scalar. """ if self._check_scalar(data): @@ -251,18 +252,18 @@ def cast_scalar(self, data: object) -> TFloatScalar_co: ) raise TypeError(msg) - def default_scalar(self) -> TFloatScalar_co: + def default_scalar(self) -> Scalar: """ Get the default value, which is 0 cast to this zdtype. Returns ------- - TFloatScalar_co + Scalar The default value. """ return self._cast_scalar_unchecked(0) - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> Scalar: """ Read a JSON-serializable value as a NumPy float scalar. @@ -275,7 +276,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScal Returns ------- - TFloatScalar_co + Scalar The NumPy float scalar. 
""" if zarr_format == 2: diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index e5007b7acb..e5b8fa6aa1 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -9,7 +9,6 @@ SupportsIndex, SupportsInt, TypeGuard, - TypeVar, overload, ) @@ -48,13 +47,15 @@ _NumpyIntScalar = ( np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 ) -TIntDType_co = TypeVar("TIntDType_co", bound=_NumpyIntDType, covariant=True) -TIntScalar_co = TypeVar("TIntScalar_co", bound=_NumpyIntScalar, covariant=True) + IntLike = SupportsInt | SupportsIndex | bytes | str @dataclass(frozen=True) -class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): +class BaseInt[ + DType: _NumpyIntDType, + Scalar: np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64, +](ZDType[DType, Scalar], HasItemSize): """ A base class for integer data types in Zarr. @@ -129,7 +130,7 @@ def _check_scalar(self, data: object) -> TypeGuard[IntLike]: return isinstance(data, IntLike) - def _cast_scalar_unchecked(self, data: IntLike) -> TIntScalar_co: + def _cast_scalar_unchecked(self, data: IntLike) -> Scalar: """ Casts a given scalar value to the native integer scalar type without type checking. @@ -140,13 +141,13 @@ def _cast_scalar_unchecked(self, data: IntLike) -> TIntScalar_co: Returns ------- - TIntScalar_co + Scalar The casted integer scalar of the native dtype. """ return self.to_native_dtype().type(data) # type: ignore[return-value] - def cast_scalar(self, data: object) -> TIntScalar_co: + def cast_scalar(self, data: object) -> Scalar: """ Attempt to cast a given object to a NumPy integer scalar. @@ -157,7 +158,7 @@ def cast_scalar(self, data: object) -> TIntScalar_co: Returns ------- - TIntScalar_co + Scalar The data cast as a NumPy integer scalar. 
Raises @@ -174,18 +175,18 @@ def cast_scalar(self, data: object) -> TIntScalar_co: ) raise TypeError(msg) - def default_scalar(self) -> TIntScalar_co: + def default_scalar(self) -> Scalar: """ Get the default value, which is 0 cast to this dtype. Returns ------- - TIntScalar_co + Scalar The default value. """ return self._cast_scalar_unchecked(0) - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> Scalar: """ Read a JSON-serializable value as a NumPy int scalar. @@ -198,7 +199,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar Returns ------- - TIntScalar_co + Scalar The NumPy int scalar. Raises diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 904280a330..069d0b128d 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -33,7 +33,7 @@ endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TDType_co, ZDType +from zarr.core.dtype.wrapper import ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -453,7 +453,7 @@ class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8 # If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length # string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object # dtype as the native dtype. -class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): +class UTF8Base[DType: TBaseDType](ZDType[DType, str], HasObjectCodec): """ A base class for variable-length UTF-8 string data types. @@ -740,7 +740,7 @@ class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type- The object codec ID for this data type. 
""" - dtype_cls = np.dtypes.StringDType + dtype_cls = np.dtypes.StringDType # type: ignore[assignment] @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 402a140321..1a46f77983 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -9,7 +9,6 @@ Self, TypedDict, TypeGuard, - TypeVar, cast, get_args, overload, @@ -90,16 +89,6 @@ def check_json_time(data: JSON) -> TypeGuard[Literal["NaT"] | int]: return check_json_int(data) or data == "NaT" -BaseTimeDType_co = TypeVar( - "BaseTimeDType_co", - bound=np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, - covariant=True, -) -BaseTimeScalar_co = TypeVar( - "BaseTimeScalar_co", bound=np.timedelta64 | np.datetime64, covariant=True -) - - class TimeConfig(TypedDict): """ The configuration for the numpy.timedelta64 or numpy.datetime64 data type in Zarr V3. @@ -217,7 +206,10 @@ class DateTime64JSON_V2(DTypeConfig_V2[str, None]): @dataclass(frozen=True, kw_only=True, slots=True) -class TimeDTypeBase(ZDType[BaseTimeDType_co, BaseTimeScalar_co], HasEndianness, HasItemSize): +class TimeDTypeBase[ + DType: np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, + Scalar: np.timedelta64 | np.datetime64, +](ZDType[DType, Scalar], HasEndianness, HasItemSize): """ A base class for data types that represent time via the NumPy TimeDelta64 and DateTime64 data types. @@ -275,7 +267,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> BaseTimeDType_co: + def to_native_dtype(self) -> DType: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a formatted string. 
@@ -285,7 +277,7 @@ def to_native_dtype(self) -> BaseTimeDType_co: Returns ------- - BaseTimeDType_co + DType A NumPy data type object representing the time data type with the specified unit, scale factor, and byte order. """ diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index fdc5f747f0..42d5d88473 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -28,11 +28,9 @@ from typing import ( TYPE_CHECKING, ClassVar, - Generic, Literal, Self, TypeGuard, - TypeVar, overload, ) @@ -44,20 +42,14 @@ # This the upper bound for the scalar types we support. It's numpy scalars + str, # because the new variable-length string dtype in numpy does not have a corresponding scalar type -TBaseScalar = np.generic | str | bytes +type TBaseScalar = np.generic | str | bytes # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. -TBaseDType = np.dtype[np.generic] - -# These two type parameters are covariant because we want -# x : ZDType[BaseDType, BaseScalar] = ZDType[SubDType, SubScalar] -# to type check -TScalar_co = TypeVar("TScalar_co", bound=TBaseScalar, covariant=True) -TDType_co = TypeVar("TDType_co", bound=TBaseDType, covariant=True) +type TBaseDType = np.dtype[np.generic] @dataclass(frozen=True, kw_only=True, slots=True) -class ZDType(ABC, Generic[TDType_co, TScalar_co]): +class ZDType[DType: TBaseDType, Scalar: TBaseScalar](ABC): """ Abstract base class for wrapping native array data types, e.g. numpy dtypes @@ -71,11 +63,11 @@ class variable, and it should generally be unique across different data types. 
""" # this class will create a native data type - dtype_cls: ClassVar[type[TDType_co]] + dtype_cls: ClassVar[type[TBaseDType]] _zarr_v3_name: ClassVar[str] @classmethod - def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: + def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[DType]: """ Check that a native data type matches the dtype_cls class attribute. @@ -120,7 +112,7 @@ def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: raise NotImplementedError # pragma: no cover @abstractmethod - def to_native_dtype(self: Self) -> TDType_co: + def to_native_dtype(self: Self) -> DType: """ Return an instance of the wrapped data type. This operation inverts ``from_native_dtype``. @@ -206,7 +198,7 @@ def _check_scalar(self, data: object) -> bool: raise NotImplementedError # pragma: no cover @abstractmethod - def cast_scalar(self, data: object) -> TScalar_co: + def cast_scalar(self, data: object) -> Scalar: """ Cast a python object to the wrapped scalar type. @@ -226,7 +218,7 @@ def cast_scalar(self, data: object) -> TScalar_co: raise NotImplementedError # pragma: no cover @abstractmethod - def default_scalar(self) -> TScalar_co: + def default_scalar(self) -> Scalar: """ Get the default scalar value for the wrapped data type. @@ -242,7 +234,7 @@ def default_scalar(self) -> TScalar_co: raise NotImplementedError # pragma: no cover @abstractmethod - def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: + def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> Scalar: """ Read a JSON-serializable value as a scalar. 
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 203a853a51..7f92d0922a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -9,7 +9,7 @@ from collections import defaultdict from dataclasses import asdict, dataclass, field, fields, replace from itertools import accumulate -from typing import TYPE_CHECKING, Literal, TypeVar, assert_never, cast, overload +from typing import TYPE_CHECKING, Literal, assert_never, cast, overload import numpy as np import numpy.typing as npt @@ -84,8 +84,6 @@ logger = logging.getLogger("zarr.group") -DefaultT = TypeVar("DefaultT") - def parse_zarr_format(data: Any) -> ZarrFormat: """Parse the zarr_format field from metadata.""" @@ -807,7 +805,7 @@ async def delitem(self, key: str) -> None: self.metadata.consolidated_metadata.metadata.pop(key, None) await self._save_metadata() - async def get( + async def get[DefaultT]( self, key: str, default: DefaultT | None = None ) -> AnyAsyncArray | AsyncGroup | DefaultT | None: """Obtain a group member, returning default if not found. @@ -1922,7 +1920,9 @@ def __getitem__(self, path: str) -> AnyArray | Group: else: return Group(obj) - def get(self, path: str, default: DefaultT | None = None) -> AnyArray | Group | DefaultT | None: + def get[DefaultT]( + self, path: str, default: DefaultT | None = None + ) -> AnyArray | Group | DefaultT | None: """Obtain a group member, returning default if not found. 
Parameters diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index d2e8b67f0e..efbe22f291 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -11,13 +11,10 @@ from typing import ( TYPE_CHECKING, Any, - Generic, Literal, NamedTuple, Protocol, - TypeAlias, TypeGuard, - TypeVar, cast, runtime_checkable, ) @@ -26,7 +23,8 @@ import numpy.typing as npt from zarr.core.common import ceildiv, product -from zarr.core.metadata import T_ArrayMetadata +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.errors import ( ArrayIndexError, BoundsCheckError, @@ -78,7 +76,7 @@ class Indexer(Protocol): def __iter__(self) -> Iterator[ChunkProjection]: ... -_ArrayIndexingOrder: TypeAlias = Literal["lexicographic"] +type _ArrayIndexingOrder = Literal["lexicographic"] def _iter_grid( @@ -524,9 +522,6 @@ def replace_lists(selection: SelectionNormalized) -> SelectionNormalized: ) -T = TypeVar("T") - - def ensure_tuple(v: Any) -> SelectionNormalized: if not isinstance(v, tuple): v = (v,) @@ -1035,7 +1030,7 @@ def __setitem__(self, selection: OrthogonalSelection, value: npt.ArrayLike) -> N @dataclass(frozen=True) -class AsyncOIndex(Generic[T_ArrayMetadata]): +class AsyncOIndex[T_ArrayMetadata: (ArrayV2Metadata, ArrayV3Metadata)]: array: AsyncArray[T_ArrayMetadata] async def getitem(self, selection: OrthogonalSelection | AnyArray) -> NDArrayLikeOrScalar: @@ -1378,7 +1373,7 @@ def __setitem__( @dataclass(frozen=True) -class AsyncVIndex(Generic[T_ArrayMetadata]): +class AsyncVIndex[T_ArrayMetadata: (ArrayV2Metadata, ArrayV3Metadata)]: array: AsyncArray[T_ArrayMetadata] # TODO: develop Array generic and move zarr.Array[np.intp] | zarr.Array[np.bool_] to ArrayOfIntOrBool diff --git a/src/zarr/core/metadata/__init__.py b/src/zarr/core/metadata/__init__.py index 57385386b6..cacfc933b5 100644 --- a/src/zarr/core/metadata/__init__.py +++ b/src/zarr/core/metadata/__init__.py @@ -1,11 +1,8 @@ -from typing 
import TypeAlias, TypeVar - from .v2 import ArrayV2Metadata, ArrayV2MetadataDict from .v3 import ArrayMetadataJSON_V3, ArrayV3Metadata -ArrayMetadata: TypeAlias = ArrayV2Metadata | ArrayV3Metadata -ArrayMetadataDict: TypeAlias = ArrayV2MetadataDict | ArrayMetadataJSON_V3 -T_ArrayMetadata = TypeVar("T_ArrayMetadata", ArrayV2Metadata, ArrayV3Metadata, covariant=True) +ArrayMetadata = ArrayV2Metadata | ArrayV3Metadata +type ArrayMetadataDict = ArrayV2MetadataDict | ArrayMetadataJSON_V3 __all__ = [ "ArrayMetadata", diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index cce7d0d385..7357e2365b 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -3,7 +3,7 @@ import warnings from collections.abc import Iterable, Sequence from functools import cached_property -from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast +from typing import TYPE_CHECKING, Any, TypedDict, cast from zarr.abc.metadata import Metadata from zarr.abc.numcodec import Numcodec, _is_numcodec @@ -22,8 +22,6 @@ from zarr.core.dtype.wrapper import ( TBaseDType, TBaseScalar, - TDType_co, - TScalar_co, ZDType, ) @@ -55,7 +53,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None +type CompressorLikev2 = dict[str, JSON] | Numcodec | None @dataclass(frozen=True, kw_only=True) @@ -75,7 +73,7 @@ def __init__( self, *, shape: tuple[int, ...], - dtype: ZDType[TDType_co, TScalar_co], + dtype: ZDType[TBaseDType, TBaseScalar], chunks: tuple[int, ...], fill_value: Any, order: MemoryOrder, diff --git a/src/zarr/core/sync.py b/src/zarr/core/sync.py index fe435cc2b8..7bcb0bf034 100644 --- a/src/zarr/core/sync.py +++ b/src/zarr/core/sync.py @@ -6,7 +6,7 @@ import os import threading from concurrent.futures import ThreadPoolExecutor, wait -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING from typing_extensions import ParamSpec @@ -20,7 
+20,6 @@ P = ParamSpec("P") -T = TypeVar("T") # From https://github.com/fsspec/filesystem_spec/blob/master/fsspec/asyn.py @@ -110,7 +109,7 @@ def reset_resources_after_fork() -> None: os.register_at_fork(after_in_child=reset_resources_after_fork) -async def _runner(coro: Coroutine[Any, Any, T]) -> T | BaseException: +async def _runner[T](coro: Coroutine[Any, Any, T]) -> T | BaseException: """ Await a coroutine and return the result of running it. If awaiting the coroutine raises an exception, the exception will be returned. @@ -121,7 +120,7 @@ async def _runner(coro: Coroutine[Any, Any, T]) -> T | BaseException: return ex -def sync( +def sync[T]( coro: Coroutine[Any, Any, T], loop: asyncio.AbstractEventLoop | None = None, timeout: float | None = None, @@ -182,7 +181,7 @@ def _get_loop() -> asyncio.AbstractEventLoop: return loop[0] -async def _collect_aiterator(data: AsyncIterator[T]) -> tuple[T, ...]: +async def _collect_aiterator[T](data: AsyncIterator[T]) -> tuple[T, ...]: """ Collect an entire async iterator into a tuple """ @@ -190,7 +189,7 @@ async def _collect_aiterator(data: AsyncIterator[T]) -> tuple[T, ...]: return tuple(result) -def collect_aiterator(data: AsyncIterator[T]) -> tuple[T, ...]: +def collect_aiterator[T](data: AsyncIterator[T]) -> tuple[T, ...]: """ Synchronously collect an entire async iterator into a tuple. 
""" @@ -198,7 +197,7 @@ def collect_aiterator(data: AsyncIterator[T]) -> tuple[T, ...]: class SyncMixin: - def _sync(self, coroutine: Coroutine[Any, Any, T]) -> T: + def _sync[T](self, coroutine: Coroutine[Any, Any, T]) -> T: # TODO: refactor this to to take *args and **kwargs and pass those to the method # this should allow us to better type the sync wrapper return sync( @@ -206,14 +205,14 @@ def _sync(self, coroutine: Coroutine[Any, Any, T]) -> T: timeout=config.get("async.timeout"), ) - def _sync_iter(self, async_iterator: AsyncIterator[T]) -> list[T]: + def _sync_iter[T](self, async_iterator: AsyncIterator[T]) -> list[T]: async def iter_to_list() -> list[T]: return [item async for item in async_iterator] return self._sync(iter_to_list()) -async def _with_semaphore( +async def _with_semaphore[T]( func: Callable[[], Awaitable[T]], semaphore: asyncio.Semaphore | None = None ) -> T: """ diff --git a/src/zarr/registry.py b/src/zarr/registry.py index d0850a1387..a1938b575c 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -3,7 +3,7 @@ import warnings from collections import defaultdict from importlib.metadata import entry_points as get_entry_points -from typing import TYPE_CHECKING, Any, Generic, TypeVar +from typing import TYPE_CHECKING, Any from zarr.core.config import BadConfigError, config from zarr.core.dtype import data_type_registry @@ -39,10 +39,8 @@ "register_pipeline", ] -T = TypeVar("T") - -class Registry(dict[str, type[T]], Generic[T]): +class Registry[T](dict[str, type[T]]): def __init__(self) -> None: super().__init__() self.lazy_load_list: list[EntryPoint] = [] diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py index 08c05864aa..7138ad7622 100644 --- a/src/zarr/storage/_common.py +++ b/src/zarr/storage/_common.py @@ -3,7 +3,7 @@ import importlib.util import json from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, Self, TypeAlias +from typing import TYPE_CHECKING, Any, Literal, Self from 
zarr.abc.store import ( ByteRequest, @@ -296,7 +296,7 @@ def __eq__(self, other: object) -> bool: return False -StoreLike: TypeAlias = Store | StorePath | FSMap | Path | str | dict[str, Buffer] +type StoreLike = Store | StorePath | FSMap | Path | str | dict[str, Buffer] async def make_store( diff --git a/src/zarr/storage/_logging.py b/src/zarr/storage/_logging.py index 98dca6b23d..a4c376c332 100644 --- a/src/zarr/storage/_logging.py +++ b/src/zarr/storage/_logging.py @@ -6,7 +6,7 @@ import time from collections import defaultdict from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Self, TypeVar +from typing import TYPE_CHECKING, Any, Self from zarr.abc.store import Store from zarr.storage._wrapper import WrapperStore @@ -19,10 +19,8 @@ counter: defaultdict[str, int] -T_Store = TypeVar("T_Store", bound=Store) - -class LoggingStore(WrapperStore[T_Store]): +class LoggingStore[T_Store: Store](WrapperStore[T_Store]): """ Store that logs all calls to another wrapped store. diff --git a/src/zarr/storage/_obstore.py b/src/zarr/storage/_obstore.py index 6e4011da59..ffea523f9f 100644 --- a/src/zarr/storage/_obstore.py +++ b/src/zarr/storage/_obstore.py @@ -6,7 +6,7 @@ from collections import defaultdict from itertools import chain from operator import itemgetter -from typing import TYPE_CHECKING, Generic, Self, TypedDict, TypeVar +from typing import TYPE_CHECKING, Self, TypedDict from zarr.abc.store import ( ByteRequest, @@ -37,10 +37,7 @@ ) -T_Store = TypeVar("T_Store", bound="_UpstreamObjectStore") - - -class ObjectStore(Store, Generic[T_Store]): +class ObjectStore[T_Store: "_UpstreamObjectStore"](Store): """ Store that uses obstore for fast read/write from AWS, GCP, Azure. 
diff --git a/src/zarr/storage/_utils.py b/src/zarr/storage/_utils.py index 10ac395b36..8939ead30b 100644 --- a/src/zarr/storage/_utils.py +++ b/src/zarr/storage/_utils.py @@ -1,8 +1,18 @@ from __future__ import annotations +import importlib import re from pathlib import Path -from typing import TYPE_CHECKING, TypeVar + +if importlib.util.find_spec("upath"): + from upath.core import UPath +else: + + class UPath: # type: ignore[no-redef] + pass + + +from typing import TYPE_CHECKING from zarr.abc.store import OffsetByteRequest, RangeByteRequest, SuffixByteRequest @@ -20,7 +30,8 @@ def normalize_path(path: str | bytes | Path | None) -> str: result = str(path, "ascii") # handle pathlib.Path - elif isinstance(path, Path): + + elif isinstance(path, Path | UPath): result = str(path) elif isinstance(path, str): @@ -155,10 +166,7 @@ def _normalize_paths(paths: Iterable[str]) -> tuple[str, ...]: return tuple(path_map.keys()) -T = TypeVar("T") - - -def _normalize_path_keys(data: Mapping[str, T]) -> dict[str, T]: +def _normalize_path_keys[T](data: Mapping[str, T]) -> dict[str, T]: """ Normalize the keys of the input dict according to the normalization scheme used for zarr node paths. If any two keys in the input normalize to the same value, raise a ValueError. diff --git a/src/zarr/storage/_wrapper.py b/src/zarr/storage/_wrapper.py index e8a2859abc..50fe5c0e59 100644 --- a/src/zarr/storage/_wrapper.py +++ b/src/zarr/storage/_wrapper.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, TypeVar, cast +from typing import TYPE_CHECKING, cast if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable @@ -13,10 +13,8 @@ from zarr.abc.store import Store -T_Store = TypeVar("T_Store", bound=Store) - -class WrapperStore(Store, Generic[T_Store]): +class WrapperStore[T_Store: Store](Store): """ Store that wraps an existing Store. 
diff --git a/src/zarr/testing/stateful.py b/src/zarr/testing/stateful.py index 382f1467da..4bdc7db491 100644 --- a/src/zarr/testing/stateful.py +++ b/src/zarr/testing/stateful.py @@ -1,7 +1,7 @@ import builtins import functools from collections.abc import Callable -from typing import Any, TypeVar, cast +from typing import Any, cast import hypothesis.extra.numpy as npst import hypothesis.strategies as st @@ -36,10 +36,8 @@ MAX_BINARY_SIZE = 100 -F = TypeVar("F", bound=Callable[..., Any]) - -def with_frequency(frequency: float) -> Callable[[F], F]: +def with_frequency[F: Callable[..., Any]](frequency: float) -> Callable[[F], F]: """This needs to be deterministic for hypothesis replaying""" def decorator(func: F) -> F: diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index ce83715b86..91c174b589 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -4,7 +4,7 @@ import json import pickle from abc import abstractmethod -from typing import TYPE_CHECKING, Generic, Self, TypeVar +from typing import TYPE_CHECKING, Self from zarr.storage import WrapperStore @@ -33,11 +33,7 @@ __all__ = ["StoreTests"] -S = TypeVar("S", bound=Store) -B = TypeVar("B", bound=Buffer) - - -class StoreTests(Generic[S, B]): +class StoreTests[S: Store, B: Buffer]: store_cls: type[S] buffer_cls: type[B] diff --git a/src/zarr/testing/utils.py b/src/zarr/testing/utils.py index 2a4c3e45c5..94f73f6798 100644 --- a/src/zarr/testing/utils.py +++ b/src/zarr/testing/utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeVar, cast +from typing import TYPE_CHECKING, cast import pytest @@ -37,13 +37,10 @@ def has_cupy() -> bool: return False -T = TypeVar("T") - - gpu_mark = pytest.mark.gpu skip_if_no_gpu = pytest.mark.skipif(not has_cupy(), reason="CuPy not installed or no GPU available") # Decorator for GPU tests -def gpu_test(func: T) -> T: +def gpu_test[T](func: T) -> T: return cast(T, gpu_mark(skip_if_no_gpu(func))) diff --git 
a/src/zarr/types.py b/src/zarr/types.py index 38990982f9..c159d5d5f2 100644 --- a/src/zarr/types.py +++ b/src/zarr/types.py @@ -1,23 +1,23 @@ -from typing import Any, TypeAlias +from typing import Any from zarr.core.array import Array, AsyncArray from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata -AnyAsyncArray: TypeAlias = AsyncArray[Any] +type AnyAsyncArray = AsyncArray[Any] """A Zarr format 2 or 3 `AsyncArray`""" -AsyncArrayV2: TypeAlias = AsyncArray[ArrayV2Metadata] +type AsyncArrayV2 = AsyncArray[ArrayV2Metadata] """A Zarr format 2 `AsyncArray`""" -AsyncArrayV3: TypeAlias = AsyncArray[ArrayV3Metadata] +type AsyncArrayV3 = AsyncArray[ArrayV3Metadata] """A Zarr format 3 `AsyncArray`""" -AnyArray: TypeAlias = Array[Any] +type AnyArray = Array[Any] """A Zarr format 2 or 3 `Array`""" -ArrayV2: TypeAlias = Array[ArrayV2Metadata] +type ArrayV2 = Array[ArrayV2Metadata] """A Zarr format 2 `Array`""" -ArrayV3: TypeAlias = Array[ArrayV3Metadata] +type ArrayV3 = Array[ArrayV3Metadata] """A Zarr format 3 `Array`""" diff --git a/tests/test_regression/scripts/v2.18.py b/tests/test_regression/scripts/v2.18.py index 39e1c5210c..4c730b9c79 100644 --- a/tests/test_regression/scripts/v2.18.py +++ b/tests/test_regression/scripts/v2.18.py @@ -1,5 +1,5 @@ # /// script -# requires-python = ">=3.11" +# requires-python = ">=3.12" # dependencies = [ # "zarr==2.18", # "numcodecs==0.15" diff --git a/tests/test_regression/test_v2_dtype_regression.py b/tests/test_regression/test_v2_dtype_regression.py index 4f3329e88c..2607f9aa36 100644 --- a/tests/test_regression/test_v2_dtype_regression.py +++ b/tests/test_regression/test_v2_dtype_regression.py @@ -1,4 +1,5 @@ import subprocess +import sys from dataclasses import dataclass from itertools import product from pathlib import Path @@ -193,6 +194,10 @@ def source_array_v3(tmp_path: Path, request: pytest.FixtureRequest) -> ArrayV3: script_paths = [Path(__file__).resolve().parent / "scripts" / 
"v2.18.py"] +@pytest.mark.skipif( + sys.platform == "darwin" and sys.version_info >= (3, 14), + reason="Numcodecs pinned to 0.15 does not build on newer macos installations with newer python versions: see discussion https://github.com/zarr-developers/zarr-python/pull/3564#issuecomment-4081145034", +) @pytest.mark.skipif(not runner_installed(), reason="no python script runner installed") @pytest.mark.parametrize( "source_array_v2", array_cases_v2_18, indirect=True, ids=tuple(map(str, array_cases_v2_18)) @@ -211,7 +216,7 @@ def test_roundtrip_v2(source_array_v2: ArrayV2, tmp_path: Path, script_path: Pat capture_output=True, text=True, ) - assert copy_op.returncode == 0 + assert copy_op.returncode == 0, "stdout " + copy_op.stdout + "\n stderr" + copy_op.stderr out_array = zarr.open_array(store=out_path, mode="r", zarr_format=2) assert source_array_v2.metadata.to_dict() == out_array.metadata.to_dict() assert np.array_equal(source_array_v2[:], out_array[:]) diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index 6589c68e09..e673bfd40b 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -145,11 +145,7 @@ async def test_store_path_invalid_mode_raises( Test that ValueErrors are raise for invalid mode. 
""" with pytest.raises(ValueError): - await StorePath.open( - LocalStore(str(tmp_path), read_only=modes[0]), - path="", - mode=modes[1], # type:ignore[arg-type] - ) + await StorePath.open(LocalStore(str(tmp_path), read_only=modes[0]), path="", mode=modes[1]) # type: ignore[arg-type] async def test_make_store_path_invalid() -> None: @@ -195,7 +191,7 @@ def test_normalize_path_valid(path: str | bytes | Path) -> None: def test_normalize_path_upath() -> None: upath = pytest.importorskip("upath") - assert normalize_path(upath.UPath("foo/bar")) == "foo/bar" + assert normalize_path(upath.UPath("foo/bar", protocol="memory")) == "memory:/foo/bar" def test_normalize_path_none() -> None: From 1e668460e8bd17f8e7c00ca1360829c693fe4a49 Mon Sep 17 00:00:00 2001 From: Sam Levang <39069044+slevang@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:29:55 -0400 Subject: [PATCH 097/118] fix: remove numcodecs off-spec warning (#3833) * move warning into to_dict * remove warning entirely * changelog * changelog type --------- Co-authored-by: Davis Bennett --- changes/3833.misc.md | 1 + src/zarr/codecs/numcodecs/_codecs.py | 8 -- src/zarr/core/array.py | 4 +- tests/test_array.py | 25 ++-- tests/test_cli/test_migrate_v3.py | 10 +- tests/test_codec_pipeline.py | 5 +- tests/test_codecs/test_numcodecs.py | 195 ++++++++++++--------------- 7 files changed, 104 insertions(+), 144 deletions(-) create mode 100644 changes/3833.misc.md diff --git a/changes/3833.misc.md b/changes/3833.misc.md new file mode 100644 index 0000000000..1f3c87b482 --- /dev/null +++ b/changes/3833.misc.md @@ -0,0 +1 @@ +Remove the warning that is emitted when any Numcodecs codec is instantiated. 
diff --git a/src/zarr/codecs/numcodecs/_codecs.py b/src/zarr/codecs/numcodecs/_codecs.py index 4a3d88a84f..06c085ad2a 100644 --- a/src/zarr/codecs/numcodecs/_codecs.py +++ b/src/zarr/codecs/numcodecs/_codecs.py @@ -32,7 +32,6 @@ from dataclasses import dataclass, replace from functools import cached_property from typing import TYPE_CHECKING, Any, Self -from warnings import warn import numpy as np @@ -41,7 +40,6 @@ from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import JSON, parse_named_configuration, product from zarr.dtype import UInt8, ZDType, parse_dtype -from zarr.errors import ZarrUserWarning from zarr.registry import get_numcodec if TYPE_CHECKING: @@ -102,12 +100,6 @@ def __init__(self, **codec_config: JSON) -> None: ) # pragma: no cover object.__setattr__(self, "codec_config", codec_config) - warn( - "Numcodecs codecs are not in the Zarr version 3 specification and " - "may not be supported by other zarr implementations.", - category=ZarrUserWarning, - stacklevel=2, - ) @cached_property def _codec(self) -> Numcodec: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index dc12aafaaf..1222c82863 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4383,9 +4383,7 @@ class ShardsConfigParam(TypedDict): index_location: ShardingCodecIndexLocation | None -type ShardsLike = ( - tuple[int, ...] | Sequence[Sequence[int]] | ShardsConfigParam | Literal["auto"] -) +type ShardsLike = tuple[int, ...] | Sequence[Sequence[int]] | ShardsConfigParam | Literal["auto"] async def from_array( diff --git a/tests/test_array.py b/tests/test_array.py index e7223b9f43..6fb84a9152 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1853,24 +1853,21 @@ def test_roundtrip_numcodecs() -> None: # Create the array with the correct codecs root = zarr.group(store) - warn_msg = "Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations." 
- with pytest.warns(ZarrUserWarning, match=warn_msg): - root.create_array( - "test", - shape=(720, 1440), - chunks=(720, 1440), - dtype="float64", - compressors=compressors, # type: ignore[arg-type] - filters=filters, # type: ignore[arg-type] - fill_value=-9.99, - dimension_names=["lat", "lon"], - ) + root.create_array( + "test", + shape=(720, 1440), + chunks=(720, 1440), + dtype="float64", + compressors=compressors, # type: ignore[arg-type] + filters=filters, # type: ignore[arg-type] + fill_value=-9.99, + dimension_names=["lat", "lon"], + ) BYTES_CODEC = {"name": "bytes", "configuration": {"endian": "little"}} # Read in the array again and check compressor config root = zarr.open_group(store) - with pytest.warns(ZarrUserWarning, match=warn_msg): - metadata = root["test"].metadata.to_dict() + metadata = root["test"].metadata.to_dict() expected = (*filters, BYTES_CODEC, *compressors) assert metadata["codecs"] == expected diff --git a/tests/test_cli/test_migrate_v3.py b/tests/test_cli/test_migrate_v3.py index dd3ca02549..7213aada12 100644 --- a/tests/test_cli/test_migrate_v3.py +++ b/tests/test_cli/test_migrate_v3.py @@ -31,8 +31,6 @@ runner = typer_testing.CliRunner() -NUMCODECS_USER_WARNING = "Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations." 
- def test_migrate_array(local_store: LocalStore) -> None: shape = (10, 10) @@ -315,7 +313,6 @@ def test_migrate_compressor( assert np.all(zarr_array[:] == 1) -@pytest.mark.filterwarnings(f"ignore:{NUMCODECS_USER_WARNING}:UserWarning") def test_migrate_numcodecs_compressor(local_store: LocalStore) -> None: """Test migration of a numcodecs compressor without a zarr.codecs equivalent.""" @@ -359,7 +356,6 @@ def test_migrate_numcodecs_compressor(local_store: LocalStore) -> None: assert np.all(zarr_array[:] == 1) -@pytest.mark.filterwarnings(f"ignore:{NUMCODECS_USER_WARNING}:UserWarning") def test_migrate_filter(local_store: LocalStore) -> None: filter_v2 = numcodecs.Delta(dtype=" None: fill_value=0, ) - with pytest.warns(UserWarning, match=NUMCODECS_USER_WARNING): - result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) @@ -547,8 +542,7 @@ def test_migrate_incorrect_compressor(local_store: LocalStore) -> None: fill_value=0, ) - with pytest.warns(UserWarning, match=NUMCODECS_USER_WARNING): - result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) diff --git a/tests/test_codec_pipeline.py b/tests/test_codec_pipeline.py index 8d044c10d7..c923555756 100644 --- a/tests/test_codec_pipeline.py +++ b/tests/test_codec_pipeline.py @@ -3,6 +3,7 @@ import pytest import zarr +from zarr.core.array import _get_chunk_spec from zarr.core.buffer.core import default_buffer_prototype from zarr.core.indexing import BasicIndexer from zarr.storage import MemoryStore @@ -42,7 +43,7 @@ async def test_read_returns_get_results( indexer = BasicIndexer( read_slice, shape=metadata.shape, - chunk_grid=metadata.chunk_grid, + chunk_grid=async_arr.chunk_grid, ) 
out_buffer = prototype.nd_buffer.empty( @@ -55,7 +56,7 @@ async def test_read_returns_get_results( [ ( async_arr.store_path / metadata.encode_chunk_key(chunk_coords), - metadata.get_chunk_spec(chunk_coords, config, prototype=prototype), + _get_chunk_spec(metadata, async_arr.chunk_grid, chunk_coords, config, prototype), chunk_selection, out_selection, is_complete_chunk, diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index ddfca71294..eec0ecacae 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -17,7 +17,6 @@ from zarr import config, create_array, open_array from zarr.abc.numcodec import _is_numcodec, _is_numcodec_cls from zarr.codecs import numcodecs as _numcodecs -from zarr.errors import ZarrUserWarning from zarr.registry import get_codec_class, get_numcodec if TYPE_CHECKING: @@ -76,8 +75,6 @@ def test_is_numcodec_cls() -> None: assert _is_numcodec_cls(GZip) -EXPECTED_WARNING_STR = "Numcodecs codecs are not in the Zarr version 3.*" - ALL_CODECS = tuple( filter( lambda v: issubclass(v, _numcodecs._NumcodecsCodec) and hasattr(v, "codec_name"), @@ -115,15 +112,14 @@ def test_docstring(codec_class: type[_numcodecs._NumcodecsCodec]) -> None: def test_generic_compressor(codec_class: type[_numcodecs._NumcodecsBytesBytesCodec]) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - a = create_array( - {}, - shape=data.shape, - chunks=(16, 16), - dtype=data.dtype, - fill_value=0, - compressors=[codec_class()], - ) + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + compressors=[codec_class()], + ) a[:, :] = data.copy() np.testing.assert_array_equal(data, a[:, :]) @@ -150,60 +146,54 @@ def test_generic_filter( ) -> None: data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - a = 
create_array( - {}, - shape=data.shape, - chunks=(16, 16), - dtype=data.dtype, - fill_value=0, - filters=[ - codec_class(**codec_config), - ], - ) + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[ + codec_class(**codec_config), + ], + ) a[:, :] = data.copy() with codec_conf(): - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") np.testing.assert_array_equal(data, b[:, :]) def test_generic_filter_bitround() -> None: data = np.linspace(0, 1, 256, dtype="float32").reshape((16, 16)) - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - a = create_array( - {}, - shape=data.shape, - chunks=(16, 16), - dtype=data.dtype, - fill_value=0, - filters=[_numcodecs.BitRound(keepbits=3)], - ) + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.BitRound(keepbits=3)], + ) a[:, :] = data.copy() - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") assert np.allclose(data, b[:, :], atol=0.1) def test_generic_filter_quantize() -> None: data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - a = create_array( - {}, - shape=data.shape, - chunks=(16, 16), - dtype=data.dtype, - fill_value=0, - filters=[_numcodecs.Quantize(digits=3)], - ) + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.Quantize(digits=3)], + ) a[:, :] = data.copy() - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") assert np.allclose(data, b[:, :], atol=0.001) @@ -211,32 +201,29 @@ def test_generic_filter_packbits() -> None: data = np.zeros((16, 16), dtype="bool") data[0:4, :] 
= True - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - a = create_array( + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.PackBits()], + ) + + a[:, :] = data.copy() + b = open_array(a.store, mode="r") + np.testing.assert_array_equal(data, b[:, :]) + + with pytest.raises(ValueError, match=".*requires bool dtype.*"): + create_array( {}, shape=data.shape, chunks=(16, 16), - dtype=data.dtype, + dtype="uint32", fill_value=0, filters=[_numcodecs.PackBits()], ) - a[:, :] = data.copy() - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") - np.testing.assert_array_equal(data, b[:, :]) - - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - with pytest.raises(ValueError, match=".*requires bool dtype.*"): - create_array( - {}, - shape=data.shape, - chunks=(16, 16), - dtype="uint32", - fill_value=0, - filters=[_numcodecs.PackBits()], - ) - @pytest.mark.parametrize( "codec_class", @@ -251,35 +238,31 @@ def test_generic_filter_packbits() -> None: def test_generic_checksum(codec_class: type[_numcodecs._NumcodecsBytesBytesCodec]) -> None: # Check if the codec is available in numcodecs try: - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - codec_class()._codec # noqa: B018 + codec_class()._codec # noqa: B018 except UnknownCodecError as e: # pragma: no cover pytest.skip(f"{codec_class.codec_name} is not available in numcodecs: {e}") data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - a = create_array( - {}, - shape=data.shape, - chunks=(16, 16), - dtype=data.dtype, - fill_value=0, - compressors=[codec_class()], - ) + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + compressors=[codec_class()], + ) a[:, :] = data.copy() with codec_conf(): - with pytest.warns(ZarrUserWarning, 
match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") np.testing.assert_array_equal(data, b[:, :]) @pytest.mark.parametrize("codec_class", [_numcodecs.PCodec, _numcodecs.ZFPY]) def test_generic_bytes_codec(codec_class: type[_numcodecs._NumcodecsArrayBytesCodec]) -> None: try: - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - codec_class()._codec # noqa: B018 + codec_class()._codec # noqa: B018 except ValueError as e: # pragma: no cover if "codec not available" in str(e): pytest.xfail(f"{codec_class.codec_name} is not available: {e}") @@ -290,15 +273,14 @@ def test_generic_bytes_codec(codec_class: type[_numcodecs._NumcodecsArrayBytesCo data = np.arange(0, 256, dtype="float32").reshape((16, 16)) - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - a = create_array( - {}, - shape=data.shape, - chunks=(16, 16), - dtype=data.dtype, - fill_value=0, - serializer=codec_class(), - ) + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + serializer=codec_class(), + ) a[:, :] = data.copy() np.testing.assert_array_equal(data, a[:, :]) @@ -307,34 +289,30 @@ def test_generic_bytes_codec(codec_class: type[_numcodecs._NumcodecsArrayBytesCo def test_delta_astype() -> None: data = np.linspace(0, 10, 256, dtype="i8").reshape((16, 16)) - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - a = create_array( - {}, - shape=data.shape, - chunks=(16, 16), - dtype=data.dtype, - fill_value=0, - filters=[ - _numcodecs.Delta(dtype="i8", astype="i2"), - ], - ) + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[ + _numcodecs.Delta(dtype="i8", astype="i2"), + ], + ) a[:, :] = data.copy() with codec_conf(): - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") np.testing.assert_array_equal(data, b[:, :]) def 
test_repr() -> None: - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - codec = _numcodecs.LZ4(level=5) + codec = _numcodecs.LZ4(level=5) assert repr(codec) == "LZ4(codec_name='numcodecs.lz4', codec_config={'level': 5})" def test_to_dict() -> None: - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - codec = _numcodecs.LZ4(level=5) + codec = _numcodecs.LZ4(level=5) assert codec.to_dict() == {"name": "numcodecs.lz4", "configuration": {"level": 5}} @@ -367,8 +345,7 @@ def test_to_dict() -> None: def test_codecs_pickleable(codec_cls: type[_numcodecs._NumcodecsCodec]) -> None: # Check if the codec is available in numcodecs try: - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - codec = codec_cls() + codec = codec_cls() except UnknownCodecError as e: # pragma: no cover pytest.skip(f"{codec_cls.codec_name} is not available in numcodecs: {e}") From f2ed1c4830c06ae0510e0ca545ae20471ba151cc Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sun, 29 Mar 2026 18:01:02 -0400 Subject: [PATCH 098/118] Improve design doc --- docs/design/chunk-grid.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index ab4ffc3898..8d77039b58 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -2,6 +2,8 @@ Version: 6 +Design document for adding rectilinear (variable) chunk grid support to **zarr-python**, conforming to the [rectilinear chunk grid extension spec](https://github.com/zarr-developers/zarr-extensions/pull/25). + **Related:** - [#3750](https://github.com/zarr-developers/zarr-python/issues/3750) (single ChunkGrid proposal) @@ -14,9 +16,7 @@ Version: 6 ## Problem -The Zarr V3 spec defines `chunk_grid` as an extension point, but chunk grids are fundamentally different from codecs. Codecs are independent — supporting `zstd` tells you nothing about `gzip`. 
Chunk grids form a hierarchy — the rectilinear grid is strictly more general than the regular grid. Any regular grid is expressible as a rectilinear grid. - -There is no known chunk grid that is both (a) more general than rectilinear and (b) retains the axis-aligned tessellation properties Zarr assumes. All known grids are special cases: +Chunk grids form a hierarchy — the rectilinear grid is strictly more general than the regular grid. Any regular grid is expressible as a rectilinear grid. There is no known chunk grid that is both (a) more general than rectilinear and (b) retains the axis-aligned tessellation properties Zarr assumes. All known grids are special cases: | Grid type | Description | Example | |---|---|---| @@ -25,7 +25,7 @@ There is no known chunk grid that is both (a) more general than rectilinear and | HPC boundary-padded | Regular interior, larger boundary chunks ([VirtualiZarr#217](https://github.com/zarr-developers/VirtualiZarr/issues/217)) | `[10, 8, 8, 8, 10]` | | Fully variable | Arbitrary per-chunk sizes | `[5, 12, 3, 20]` | -A registry-based plugin system adds complexity without clear benefit. +Prior iterations on the chunk grid design were based on the Zarr V3 spec's definition of chunk grids as an extension point alongside codecs, dtypes, etc. Therefore, we started designing the chunk grid implementation following a similar registry based approach. However, in practice chunk grids are fundamentally different than codecs. Codecs are independent; supporting `zstd` tells you nothing about `gzip`. Chunk grids are not: every regular grid is a valid rectilinear grid. A registry-based plugin system makes sense for codecs but adds complexity without clear benefit for chunk grids. Here we start from some basic goals and propose a more fitting design for supporting different chunk grids in zarr-python. ## Goals @@ -42,7 +42,7 @@ A registry-based plugin system adds complexity without clear benefit. ### Design choices -1. 
**A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern — the specific partition of a specific array. The grid stores enough information to answer any question about any chunk without external parameters. +1. **A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern. This means that the chunk grid is bound to specific array dimensions, which enables the chunk grid to answer any question about any chunk (offset, size, count) without external parameters. 2. **One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. The serialization format (`"regular"` vs `"rectilinear"`) is chosen by the metadata layer, not the grid. 3. **No chunk grid registry.** Simple name-based dispatch in `parse_chunk_grid()`. 4. **Fixed vs Varying per dimension.** `FixedDimension(size, extent)` for uniform chunks; `VaryingDimension(edges, extent)` for per-chunk edge lengths with precomputed prefix sums. Avoids expanding regular dimensions into lists of identical values. 
From 731825548c5f1b5c4e0728a42f92121f6e911cb5 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sun, 29 Mar 2026 21:05:00 -0400 Subject: [PATCH 099/118] Add demo notebook --- .../examples/rectilinear_chunks.ipynb | 428 ++++++++++++++++++ mkdocs.yml | 6 + pyproject.toml | 1 + 3 files changed, 435 insertions(+) create mode 100644 docs/user-guide/examples/rectilinear_chunks.ipynb diff --git a/docs/user-guide/examples/rectilinear_chunks.ipynb b/docs/user-guide/examples/rectilinear_chunks.ipynb new file mode 100644 index 0000000000..541750d440 --- /dev/null +++ b/docs/user-guide/examples/rectilinear_chunks.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "da9139cc", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:13.606739Z", + "iopub.status.busy": "2026-03-30T01:03:13.606597Z", + "iopub.status.idle": "2026-03-30T01:03:13.613466Z", + "shell.execute_reply": "2026-03-30T01:03:13.612167Z", + "shell.execute_reply.started": "2026-03-30T01:03:13.606724Z" + } + }, + "outputs": [], + "source": [ + "# /// script\n", + "# requires-python = \">=3.12\"\n", + "# dependencies = [\n", + "# \"dask\",\n", + "# \"healpix-geo\",\n", + "# \"matplotlib\",\n", + "# \"numpy\",\n", + "# \"obstore\",\n", + "# \"xarray\",\n", + "# \"zarr\",\n", + "# ]\n", + "#\n", + "# [tool.uv.sources]\n", + "# zarr = { git = \"https://github.com/maxrjones/zarr-python\", branch = \"poc/unified-chunk-grid\" }\n", + "# xarray = { git = \"https://github.com/maxrjones/xarray\", branch = \"poc/unified-zarr-chunk-grid\" }\n", + "# ///" + ] + }, + { + "cell_type": "markdown", + "id": "71gnhfq4pfe", + "metadata": {}, + "source": [ + "# Rectilinear Chunk Grids\n", + "\n", + "This notebook demonstrates the unified chunk grid implementation from [#3802](https://github.com/zarr-developers/zarr-python/pull/3802), which adds support for rectilinear (variable) chunk grids.\n", + "\n", + "Rectilinear grids 
allow different chunk sizes along each dimension, which is useful for data that doesn't partition evenly. For example, sparse HEALPix cells grouped by parent tile, boundary-padded HPC arrays, or ingesting existing variable-chunked datasets via VirtualiZarr." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9e9nyjdx06f", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:13.614308Z", + "iopub.status.busy": "2026-03-30T01:03:13.614115Z", + "iopub.status.idle": "2026-03-30T01:03:27.161046Z", + "shell.execute_reply": "2026-03-30T01:03:27.160587Z", + "shell.execute_reply.started": "2026-03-30T01:03:13.614287Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import tempfile\n", + "from pathlib import Path\n", + "import json\n", + "\n", + "import numpy as np\n", + "import xarray as xr\n", + "from healpix_geo import nested\n", + "from obstore.store import HTTPStore\n", + "\n", + "import zarr\n", + "from zarr.storage import ObjectStore\n", + "\n", + "zarr.config.set({'async.concurrency': 128}) # Increase concurrency for better performance with obstore\n", + "zarr.config.set({\"array.rectilinear_chunks\": True}) # Opt-in to rectilinear chunks\n" + ] + }, + { + "cell_type": "markdown", + "id": "kj1o9xik9l", + "metadata": {}, + "source": [ + "## 1. Inspect HEALPix dataset\n", + "\n", + "Load the remote Zarr store to understand the data structure before chunking it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "v6cot74r1gq", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:27.161531Z", + "iopub.status.busy": "2026-03-30T01:03:27.161384Z", + "iopub.status.idle": "2026-03-30T01:03:46.210431Z", + "shell.execute_reply": "2026-03-30T01:03:46.209459Z", + "shell.execute_reply.started": "2026-03-30T01:03:27.161522Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Members: [('cell_ids', ), ('da', )]\n", + "Attrs: {}\n", + "Chunk grid: ChunkGrid(dimensions=(FixedDimension(size=55611, extent=222442),), _is_regular=True)\n" + ] + } + ], + "source": [ + "ob_store = HTTPStore.from_url(\"https://data-taos.ifremer.fr/GRID4EARTH/no_chunk_healpix.zarr\")\n", + "store = ObjectStore(ob_store)\n", + "g = zarr.open_group(store, mode=\"r\", zarr_format=2, use_consolidated=True)\n", + "arr = g['da']\n", + "\n", + "print(\"Members:\", list(g.members()))\n", + "print(\"Attrs:\", dict(g.attrs))\n", + "print(\"Chunk grid:\", arr.chunk_grid)" + ] + }, + { + "cell_type": "markdown", + "id": "wmuqi66d46", + "metadata": {}, + "source": [ + "## 2. HEALPix-style variable chunking\n", + "\n", + "Inspired by [this use case](https://github.com/zarr-developers/zarr-python/pull/3534#issuecomment-3848669859): HEALPix grids where cells are grouped by parent tile at a coarser resolution level, producing variable-sized chunks along the cell dimension when accounting for sparsity." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "90bc91b9", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:46.211439Z", + "iopub.status.busy": "2026-03-30T01:03:46.211138Z", + "iopub.status.idle": "2026-03-30T01:03:47.632389Z", + "shell.execute_reply": "2026-03-30T01:03:47.631887Z", + "shell.execute_reply.started": "2026-03-30T01:03:46.211416Z" + } + }, + "outputs": [], + "source": [ + "da = xr.open_zarr(\n", + " store,\n", + " zarr_format=2,\n", + " consolidated=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0d7785b0-d72f-4ef8-8a57-91d61f07be96", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:47.633034Z", + "iopub.status.busy": "2026-03-30T01:03:47.632792Z", + "iopub.status.idle": "2026-03-30T01:03:47.635838Z", + "shell.execute_reply": "2026-03-30T01:03:47.635478Z", + "shell.execute_reply.started": "2026-03-30T01:03:47.633024Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "depth = da.cell_ids.attrs['level']\n", + "depth" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "72c80224-dcac-4724-8caf-5717b29a25d5", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:47.636213Z", + "iopub.status.busy": "2026-03-30T01:03:47.636138Z", + "iopub.status.idle": "2026-03-30T01:03:47.643387Z", + "shell.execute_reply": "2026-03-30T01:03:47.642743Z", + "shell.execute_reply.started": "2026-03-30T01:03:47.636205Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 25, 645, 1510, 2363, 3203, 74, 769, 3963, 4096, 233, 1603,\n", + " 2450, 4096, 4096, 3327, 4047, 4096, 4096, 1278, 2113, 4096, 3879,\n", + " 4096, 3842, 2173, 983, 4046, 2187, 4095, 1369, 4096, 4096, 4096,\n", + " 4096, 3515, 1395, 4096, 3622, 4096, 4096, 3875, 4096, 4096, 4096,\n", + " 4096, 4096, 2034, 4096, 358, 3991, 
4096, 4096, 4096, 4096, 2714,\n", + " 1210, 4096, 4096, 4096, 4096, 92, 3826, 4096, 2629, 4096, 1438,\n", + " 4096, 353, 4078, 3410, 2407, 226, 132, 2738, 1223, 23])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_depth = depth-6\n", + "parents = nested.zoom_to(da.cell_ids, depth=depth, new_depth=new_depth)\n", + "_, chunk_sizes =np.unique(parents, return_counts=True)\n", + "chunk_sizes" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a79a281b-ca74-49c3-a467-60490a4ad63e", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:47.643801Z", + "iopub.status.busy": "2026-03-30T01:03:47.643715Z", + "iopub.status.idle": "2026-03-30T01:03:47.648805Z", + "shell.execute_reply": "2026-03-30T01:03:47.648326Z", + "shell.execute_reply.started": "2026-03-30T01:03:47.643792Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Frozen({'cell_ids': (25, 645, 1510, 2363, 3203, 74, 769, 3963, 4096, 233, 1603, 2450, 4096, 4096, 3327, 4047, 4096, 4096, 1278, 2113, 4096, 3879, 4096, 3842, 2173, 983, 4046, 2187, 4095, 1369, 4096, 4096, 4096, 4096, 3515, 1395, 4096, 3622, 4096, 4096, 3875, 4096, 4096, 4096, 4096, 4096, 2034, 4096, 358, 3991, 4096, 4096, 4096, 4096, 2714, 1210, 4096, 4096, 4096, 4096, 92, 3826, 4096, 2629, 4096, 1438, 4096, 353, 4078, 3410, 2407, 226, 132, 2738, 1223, 23)})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "da = da.chunk({\"cell_ids\": tuple(chunk_sizes.tolist())})\n", + "da.chunks" + ] + }, + { + "cell_type": "markdown", + "id": "bsp6y7otkzb", + "metadata": {}, + "source": [ + "## 3. Write as rectilinear Zarr V3\n", + "\n", + "Write the variable-chunked dataset to a local Zarr V3 store with rectilinear chunk grids enabled." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ribguojdr0s", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:47.649560Z", + "iopub.status.busy": "2026-03-30T01:03:47.649480Z", + "iopub.status.idle": "2026-03-30T01:03:47.943997Z", + "shell.execute_reply": "2026-03-30T01:03:47.943488Z", + "shell.execute_reply.started": "2026-03-30T01:03:47.649552Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Written to: /var/folders/70/hc_nynms54d8lp67z4rsfctc0000gp/T/tmpgjlmnfff/healpix_rectilinear.zarr\n" + ] + } + ], + "source": [ + "output_path = Path(tempfile.mkdtemp()) / \"healpix_rectilinear.zarr\"\n", + "\n", + "encoding = {\n", + " \"da\": {\"chunks\": [chunk_sizes.tolist()]},\n", + " \"cell_ids\": {\"chunks\": [chunk_sizes.tolist()]},\n", + "}\n", + "\n", + "da.to_zarr(output_path, zarr_format=3, mode=\"w\", encoding=encoding, consolidated=False)\n", + "\n", + "print(f\"Written to: {output_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "rbfm1hn63g9", + "metadata": {}, + "source": [ + "## 4. Verify rectilinear metadata\n", + "\n", + "Inspect the output store to confirm the chunk grid is serialized as `\"rectilinear\"` in `zarr.json`,\n", + "following the [rectilinear chunk grid extension spec](https://github.com/zarr-developers/zarr-extensions/tree/main/chunk-grids/rectilinear).\n", + "\n", + "Key things to look for in `chunk_grid`:\n", + "- **`name`**: `\"rectilinear\"` (the extension identifier)\n", + "- **`configuration.kind`**: `\"inline\"` (edge lengths stored directly in metadata)\n", + "- **`configuration.chunk_shapes`**: one entry per dimension — here a single list for the 1D `cell_ids` axis. 
Each element is either:\n", + " - a **bare integer** for a unique edge length (e.g., `25`, `645`)\n", + " - a **`[value, count]` array** using [run-length encoding](https://github.com/zarr-developers/zarr-extensions/tree/main/chunk-grids/rectilinear#run-length-encoding) for consecutive repeated sizes (e.g., `[4096, 4]` means four consecutive chunks of size 4096)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "mpdn5hxp7lp", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:47.944360Z", + "iopub.status.busy": "2026-03-30T01:03:47.944287Z", + "iopub.status.idle": "2026-03-30T01:03:47.946825Z", + "shell.execute_reply": "2026-03-30T01:03:47.946372Z", + "shell.execute_reply.started": "2026-03-30T01:03:47.944352Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'rectilinear', 'configuration': {'kind': 'inline', 'chunk_shapes': [[25, 645, 1510, 2363, 3203, 74, 769, 3963, 4096, 233, 1603, 2450, [4096, 2], 3327, 4047, [4096, 2], 1278, 2113, 4096, 3879, 4096, 3842, 2173, 983, 4046, 2187, 4095, 1369, [4096, 4], 3515, 1395, 4096, 3622, [4096, 2], 3875, [4096, 5], 2034, 4096, 358, 3991, [4096, 4], 2714, 1210, [4096, 4], 92, 3826, 4096, 2629, 4096, 1438, 4096, 353, 4078, 3410, 2407, 226, 132, 2738, 1223, 23]]}}\n" + ] + } + ], + "source": [ + "\n", + "# Read the zarr.json for the 'da' array\n", + "da_meta_path = output_path / \"da\" / \"zarr.json\"\n", + "meta = json.loads(da_meta_path.read_text())\n", + "print(meta['chunk_grid'])" + ] + }, + { + "cell_type": "markdown", + "id": "inz7s8ugu2c", + "metadata": {}, + "source": [ + "## 5. Round-trip verification\n", + "\n", + "Read the rectilinear store back and confirm the chunk sizes are preserved." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "308gxly6r3j", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-30T01:03:47.947072Z", + "iopub.status.busy": "2026-03-30T01:03:47.946996Z", + "iopub.status.idle": "2026-03-30T01:03:47.967534Z", + "shell.execute_reply": "2026-03-30T01:03:47.967022Z", + "shell.execute_reply.started": "2026-03-30T01:03:47.947061Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Round-trip chunk sizes: Frozen({'cell_ids': (25, 645, 1510, 2363, 3203, 74, 769, 3963, 4096, 233, 1603, 2450, 4096, 4096, 3327, 4047, 4096, 4096, 1278, 2113, 4096, 3879, 4096, 3842, 2173, 983, 4046, 2187, 4095, 1369, 4096, 4096, 4096, 4096, 3515, 1395, 4096, 3622, 4096, 4096, 3875, 4096, 4096, 4096, 4096, 4096, 2034, 4096, 358, 3991, 4096, 4096, 4096, 4096, 2714, 1210, 4096, 4096, 4096, 4096, 92, 3826, 4096, 2629, 4096, 1438, 4096, 353, 4078, 3410, 2407, 226, 132, 2738, 1223, 23)})\n", + "Regular grid: False\n" + ] + } + ], + "source": [ + "roundtrip = xr.open_zarr(output_path, zarr_format=3, consolidated=False)\n", + "\n", + "print(\"Round-trip chunk sizes:\", roundtrip.chunks)\n", + "print(\"Regular grid:\", roundtrip[\"da\"].variable.encoding.get(\"chunks\") == tuple(chunk_sizes.tolist()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3c32f1c-2c4f-403f-a4e8-98c6582eef96", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/mkdocs.yml b/mkdocs.yml index fb4dabd786..1d86034d6b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml 
@@ -30,6 +30,7 @@ nav: - user-guide/glossary.md - Examples: - user-guide/examples/custom_dtype.md + - user-guide/examples/rectilinear_chunks.ipynb - API Reference: - api/zarr/index.md - api/zarr/array.md @@ -134,6 +135,11 @@ extra_css: plugins: - autorefs - search + - mkdocs-jupyter: + include: ["docs/user-guide/examples/*.ipynb"] + execute: false + ignore_h1_titles: true + show_input: true - markdown-exec - mkdocstrings: enable_inventory: true diff --git a/pyproject.toml b/pyproject.toml index 8277c3f752..442a70fbce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,6 +114,7 @@ docs = [ "mkdocstrings>=0.29.1", "mkdocstrings-python>=1.16.10", "mike>=2.1.3", + "mkdocs-jupyter>=0.25.1", "mkdocs-redirects>=1.2.0", "markdown-exec[ansi]", "griffe-inherited-docstrings", From 662ceefb53775577b756f4db97935e18206dbe46 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sun, 29 Mar 2026 21:56:22 -0400 Subject: [PATCH 100/118] Add release notes --- changes/3802.feature.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changes/3802.feature.md diff --git a/changes/3802.feature.md b/changes/3802.feature.md new file mode 100644 index 0000000000..888cf1a7fa --- /dev/null +++ b/changes/3802.feature.md @@ -0,0 +1,3 @@ +Add support for creating arrays with rectilinear (variable-sized) chunk grids by passing +dask-style nested sequences to the ``chunk_shape`` parameter in ``zarr.create``. This feature +is experimental and must be explicitly enabled via ``zarr.config.set({'array.rectilinear_chunks': True})``. 
From 994e329e44a0e34ee58b1e1cb9a502dd7c223da6 Mon Sep 17 00:00:00 2001
From: Max Jones <14077947+maxrjones@users.noreply.github.com>
Date: Sun, 29 Mar 2026 22:08:31 -0400
Subject: [PATCH 101/118] Fix indexing of empty slices

---
 src/zarr/core/indexing.py        |  2 ++
 tests/test_unified_chunk_grid.py | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index efbe22f291..d31e7c628e 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -428,6 +428,8 @@ def __init__(
 
     def __iter__(self) -> Iterator[ChunkDimProjection]:
         # figure out the range of chunks we need to visit
+        if self.start >= self.stop:
+            return  # empty slice
         g = self.dim_grid
         dim_chunk_ix_from = g.index_to_chunk(self.start) if self.start > 0 else 0
         dim_chunk_ix_to = g.index_to_chunk(self.stop - 1) + 1 if self.stop > 0 else 0
diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py
index 070a13a967..aca6ff79ab 100644
--- a/tests/test_unified_chunk_grid.py
+++ b/tests/test_unified_chunk_grid.py
@@ -2556,6 +2556,21 @@ def test_int_array_indexer_varying_boundary(self) -> None:
         assert isinstance(sel, np.ndarray)
         np.testing.assert_array_equal(sel, [1])
 
+    def test_slice_indexer_empty_slice_at_boundary(self) -> None:
+        """SliceDimIndexer yields no projections for an empty slice at the dimension boundary."""
+        from zarr.core.indexing import SliceDimIndexer
+
+        dim = FixedDimension(size=2, extent=10)
+        # slice(10, 10) is empty — start equals extent
+        indexer = SliceDimIndexer(slice(10, 10), 10, dim)
+        projections = list(indexer)
+        assert len(projections) == 0
+
+        # also works for VaryingDimension
+        dim_v = VaryingDimension([5, 5], extent=10)
+        indexer_v = SliceDimIndexer(slice(10, 10), 10, dim_v)
+        assert list(indexer_v) == []
+
     def test_orthogonal_indexer_varying_boundary_advanced(self) -> None:
         """OrthogonalIndexer with advanced indexing uses per-chunk chunk_size for
         ix_() conversion, not a precomputed max."""
From f1c518257f93405e271e03591732bd406eb198f3 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sun, 29 Mar 2026 22:16:45 -0400 Subject: [PATCH 102/118] Add open support --- src/zarr/core/array.py | 62 ++++++++++++++++++++++++++++++++---------- tests/test_api.py | 13 +++++++++ 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1222c82863..888257f8f9 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -675,13 +675,30 @@ async def _create( if chunks is not None and chunk_shape is not None: raise ValueError("Only one of chunk_shape or chunks can be provided.") - item_size = 1 - if isinstance(dtype_parsed, HasItemSize): - item_size = dtype_parsed.item_size - if chunks: - _chunks = normalize_chunks(chunks, shape, item_size) + + # detect rectilinear (dask-style) chunks before normalize_chunks flattens them + from zarr.core.chunk_grids import _is_rectilinear_chunks + + _raw_chunks = chunks if chunks is not None else chunk_shape + _rectilinear_chunk_grid: ChunkGridMetadata | None = None + _chunks: tuple[int, ...] 
| None = None + if _is_rectilinear_chunks(_raw_chunks): + from zarr.core.metadata.v3 import ( + RectilinearChunkGrid as RectilinearChunkGridMeta, + ) + + _rectilinear_chunk_grid = RectilinearChunkGridMeta( + chunk_shapes=tuple(tuple(c) for c in _raw_chunks) + ) else: - _chunks = normalize_chunks(chunk_shape, shape, item_size) + item_size = 1 + if isinstance(dtype_parsed, HasItemSize): + item_size = dtype_parsed.item_size + if chunks: + _chunks = normalize_chunks(chunks, shape, item_size) + else: + _chunks = normalize_chunks(chunk_shape, shape, item_size) + config_parsed = parse_array_config(config) result: AnyAsyncArray @@ -714,6 +731,7 @@ async def _create( attributes=attributes, overwrite=overwrite, config=config_parsed, + chunk_grid=_rectilinear_chunk_grid, ) elif zarr_format == 2: if codecs is not None: @@ -726,6 +744,9 @@ async def _create( ) if dimension_names is not None: raise ValueError("dimension_names cannot be used for arrays with zarr_format 2.") + if _rectilinear_chunk_grid is not None: + raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") + assert _chunks is not None if order is None: order_parsed = config_parsed.order @@ -760,7 +781,7 @@ async def _create( def _create_metadata_v3( shape: ShapeLike, dtype: ZDType[TBaseDType, TBaseScalar], - chunk_shape: tuple[int, ...], + chunk_shape: tuple[int, ...] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, chunk_key_encoding: ChunkKeyEncodingLike | None = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, @@ -771,8 +792,12 @@ def _create_metadata_v3( """ Create an instance of ArrayV3Metadata. - If `chunk_grid` is provided, it takes precedence over `chunk_shape`. + Exactly one of ``chunk_shape`` or ``chunk_grid`` must be provided. 
""" + if chunk_shape is not None and chunk_grid is not None: + raise ValueError("Only one of chunk_shape or chunk_grid can be provided.") + if chunk_shape is None and chunk_grid is None: + raise ValueError("One of chunk_shape or chunk_grid must be provided.") filters: tuple[ArrayArrayCodec, ...] compressors: tuple[BytesBytesCodec, ...] @@ -799,11 +824,12 @@ def _create_metadata_v3( else: fill_value_parsed = fill_value - chunk_grid_meta = ( - chunk_grid - if chunk_grid is not None - else RegularChunkGrid(chunk_shape=parse_shapelike(chunk_shape)) - ) + chunk_grid_meta: ChunkGridMetadata + if chunk_grid is not None: + chunk_grid_meta = chunk_grid + else: + assert chunk_shape is not None # validated above + chunk_grid_meta = RegularChunkGrid(chunk_shape=parse_shapelike(chunk_shape)) return ArrayV3Metadata( shape=shape, data_type=dtype, @@ -822,7 +848,7 @@ async def _create_v3( *, shape: ShapeLike, dtype: ZDType[TBaseDType, TBaseScalar], - chunk_shape: tuple[int, ...], + chunk_shape: tuple[int, ...] 
| None = None, config: ArrayConfig, fill_value: Any | None = DEFAULT_FILL_VALUE, chunk_key_encoding: ( @@ -835,7 +861,12 @@ async def _create_v3( dimension_names: DimensionNamesLike = None, attributes: dict[str, JSON] | None = None, overwrite: bool = False, + chunk_grid: ChunkGridMetadata | None = None, ) -> AsyncArrayV3: + if chunk_shape is not None and chunk_grid is not None: + raise ValueError("Only one of chunk_shape or chunk_grid can be provided.") + if chunk_shape is None and chunk_grid is None: + raise ValueError("One of chunk_shape or chunk_grid must be provided.") if overwrite: if store_path.store.supports_deletes: await store_path.delete_dir() @@ -860,6 +891,7 @@ async def _create_v3( codecs=codecs, dimension_names=dimension_names, attributes=attributes, + chunk_grid=chunk_grid, ) array = cls(metadata=metadata, store_path=store_path, config=config) @@ -4902,7 +4934,7 @@ async def init_array( shape=shape_parsed, dtype=zdtype, fill_value=fill_value, - chunk_shape=chunks_out, + chunk_shape=chunks_out if rectilinear_meta is None else None, chunk_key_encoding=chunk_key_encoding_parsed, codecs=codecs_out, dimension_names=dimension_names, diff --git a/tests/test_api.py b/tests/test_api.py index a306ff3dc3..33cd3bd301 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -280,6 +280,19 @@ async def test_open_array(memory_store: MemoryStore, zarr_format: ZarrFormat) -> zarr.api.synchronous.open(store="doesnotexist", mode="r", zarr_format=zarr_format) +def test_open_array_rectilinear_chunks(tmp_path: Path) -> None: + """zarr.open with rectilinear (dask-style) chunks preserves the chunk grid.""" + from zarr.core.metadata.v3 import RectilinearChunkGrid + + chunks = ((3, 3, 4), (5, 5)) + with zarr.config.set({"array.rectilinear_chunks": True}): + z = zarr.open(store=tmp_path, shape=(10, 10), dtype="float64", chunks=chunks, mode="w") + assert isinstance(z, Array) + assert z.shape == (10, 10) + assert isinstance(z.metadata.chunk_grid, RectilinearChunkGrid) + assert 
z.read_chunk_sizes == ((3, 3, 4), (5, 5)) + + @pytest.mark.asyncio async def test_async_array_open_array_not_found() -> None: """Test that AsyncArray.open raises ArrayNotFoundError when array doesn't exist""" From c4f7cf4fcaaf0b077966957cbc1a2f101a47c25a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sun, 29 Mar 2026 22:36:30 -0400 Subject: [PATCH 103/118] Improve release note --- changes/3802.feature.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/changes/3802.feature.md b/changes/3802.feature.md index 888cf1a7fa..8199b5b718 100644 --- a/changes/3802.feature.md +++ b/changes/3802.feature.md @@ -1,3 +1,11 @@ -Add support for creating arrays with rectilinear (variable-sized) chunk grids by passing -dask-style nested sequences to the ``chunk_shape`` parameter in ``zarr.create``. This feature -is experimental and must be explicitly enabled via ``zarr.config.set({'array.rectilinear_chunks': True})``. +Add support for rectilinear (variable-sized) chunk grids. This feature is experimental and +must be explicitly enabled via ``zarr.config.set({'array.rectilinear_chunks': True})``. + +Rectilinear chunks can be used through: + +- **Creating arrays**: Pass nested sequences (e.g., ``[[10, 20, 30], [50, 50]]``) to ``chunks`` + in ``zarr.create_array``, ``zarr.from_array``, ``zarr.zeros``, ``zarr.ones``, ``zarr.full``, + ``zarr.open``, and related functions, or to ``chunk_shape`` in ``zarr.create``. +- **Opening existing arrays**: Arrays stored with the ``rectilinear`` chunk grid are read + transparently via ``zarr.open`` and ``zarr.open_array``. +- **Rectilinear sharding**: Shard boundaries can be rectilinear while inner chunks remain regular. 
From 6436db675226c7288a51b8b9f51159eb87a801e9 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 08:55:00 -0400 Subject: [PATCH 104/118] Create and use resolve_chunks functions --- src/zarr/core/array.py | 77 ++++++++++++------------------------ src/zarr/core/chunk_grids.py | 4 -- src/zarr/core/common.py | 1 + src/zarr/core/metadata/v3.py | 31 ++++++++++++++- 4 files changed, 57 insertions(+), 56 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 888257f8f9..8bb056968b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -122,6 +122,7 @@ RectilinearChunkGrid, RegularChunkGrid, parse_node_type_array, + resolve_chunks, ) from zarr.core.sync import sync from zarr.errors import ( @@ -676,28 +677,9 @@ async def _create( if chunks is not None and chunk_shape is not None: raise ValueError("Only one of chunk_shape or chunks can be provided.") - # detect rectilinear (dask-style) chunks before normalize_chunks flattens them from zarr.core.chunk_grids import _is_rectilinear_chunks _raw_chunks = chunks if chunks is not None else chunk_shape - _rectilinear_chunk_grid: ChunkGridMetadata | None = None - _chunks: tuple[int, ...] 
| None = None - if _is_rectilinear_chunks(_raw_chunks): - from zarr.core.metadata.v3 import ( - RectilinearChunkGrid as RectilinearChunkGridMeta, - ) - - _rectilinear_chunk_grid = RectilinearChunkGridMeta( - chunk_shapes=tuple(tuple(c) for c in _raw_chunks) - ) - else: - item_size = 1 - if isinstance(dtype_parsed, HasItemSize): - item_size = dtype_parsed.item_size - if chunks: - _chunks = normalize_chunks(chunks, shape, item_size) - else: - _chunks = normalize_chunks(chunk_shape, shape, item_size) config_parsed = parse_array_config(config) @@ -719,11 +701,14 @@ async def _create( if order is not None: _warn_order_kwarg() + item_size = 1 + if isinstance(dtype_parsed, HasItemSize): + item_size = dtype_parsed.item_size + chunk_grid = resolve_chunks(_raw_chunks, shape, item_size) result = await cls._create_v3( store_path, shape=shape, dtype=dtype_parsed, - chunk_shape=_chunks, fill_value=fill_value, chunk_key_encoding=chunk_key_encoding, codecs=codecs, @@ -731,7 +716,7 @@ async def _create( attributes=attributes, overwrite=overwrite, config=config_parsed, - chunk_grid=_rectilinear_chunk_grid, + chunk_grid=chunk_grid, ) elif zarr_format == 2: if codecs is not None: @@ -744,9 +729,16 @@ async def _create( ) if dimension_names is not None: raise ValueError("dimension_names cannot be used for arrays with zarr_format 2.") - if _rectilinear_chunk_grid is not None: + if _is_rectilinear_chunks(_raw_chunks): raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") - assert _chunks is not None + + item_size = 1 + if isinstance(dtype_parsed, HasItemSize): + item_size = dtype_parsed.item_size + if chunks: + _chunks = normalize_chunks(chunks, shape, item_size) + else: + _chunks = normalize_chunks(chunk_shape, shape, item_size) if order is None: order_parsed = config_parsed.order @@ -781,23 +773,14 @@ async def _create( def _create_metadata_v3( shape: ShapeLike, dtype: ZDType[TBaseDType, TBaseScalar], - chunk_shape: tuple[int, ...] 
| None = None, + chunk_grid: ChunkGridMetadata, fill_value: Any | None = DEFAULT_FILL_VALUE, chunk_key_encoding: ChunkKeyEncodingLike | None = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: DimensionNamesLike = None, attributes: dict[str, JSON] | None = None, - chunk_grid: ChunkGridMetadata | None = None, ) -> ArrayV3Metadata: - """ - Create an instance of ArrayV3Metadata. - - Exactly one of ``chunk_shape`` or ``chunk_grid`` must be provided. - """ - if chunk_shape is not None and chunk_grid is not None: - raise ValueError("Only one of chunk_shape or chunk_grid can be provided.") - if chunk_shape is None and chunk_grid is None: - raise ValueError("One of chunk_shape or chunk_grid must be provided.") + """Create an instance of ArrayV3Metadata.""" filters: tuple[ArrayArrayCodec, ...] compressors: tuple[BytesBytesCodec, ...] @@ -824,16 +807,10 @@ def _create_metadata_v3( else: fill_value_parsed = fill_value - chunk_grid_meta: ChunkGridMetadata - if chunk_grid is not None: - chunk_grid_meta = chunk_grid - else: - assert chunk_shape is not None # validated above - chunk_grid_meta = RegularChunkGrid(chunk_shape=parse_shapelike(chunk_shape)) return ArrayV3Metadata( shape=shape, data_type=dtype, - chunk_grid=chunk_grid_meta, + chunk_grid=chunk_grid, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, codecs=codecs_parsed, # type: ignore[arg-type] @@ -848,7 +825,7 @@ async def _create_v3( *, shape: ShapeLike, dtype: ZDType[TBaseDType, TBaseScalar], - chunk_shape: tuple[int, ...] 
| None = None, + chunk_grid: ChunkGridMetadata, config: ArrayConfig, fill_value: Any | None = DEFAULT_FILL_VALUE, chunk_key_encoding: ( @@ -861,12 +838,7 @@ async def _create_v3( dimension_names: DimensionNamesLike = None, attributes: dict[str, JSON] | None = None, overwrite: bool = False, - chunk_grid: ChunkGridMetadata | None = None, ) -> AsyncArrayV3: - if chunk_shape is not None and chunk_grid is not None: - raise ValueError("Only one of chunk_shape or chunk_grid can be provided.") - if chunk_shape is None and chunk_grid is None: - raise ValueError("One of chunk_shape or chunk_grid must be provided.") if overwrite: if store_path.store.supports_deletes: await store_path.delete_dir() @@ -885,13 +857,12 @@ async def _create_v3( metadata = cls._create_metadata_v3( shape=shape, dtype=dtype, - chunk_shape=chunk_shape, + chunk_grid=chunk_grid, fill_value=fill_value, chunk_key_encoding=chunk_key_encoding, codecs=codecs, dimension_names=dimension_names, attributes=attributes, - chunk_grid=chunk_grid, ) array = cls(metadata=metadata, store_path=store_path, config=config) @@ -4930,16 +4901,20 @@ async def init_array( if order is not None: _warn_order_kwarg() + grid: ChunkGridMetadata + if rectilinear_meta is not None: + grid = rectilinear_meta + else: + grid = RegularChunkGrid(chunk_shape=chunks_out) meta = AsyncArray._create_metadata_v3( shape=shape_parsed, dtype=zdtype, fill_value=fill_value, - chunk_shape=chunks_out if rectilinear_meta is None else None, + chunk_grid=grid, chunk_key_encoding=chunk_key_encoding_parsed, codecs=codecs_out, dimension_names=dimension_names, attributes=attributes, - chunk_grid=rectilinear_meta, ) arr = AsyncArray(metadata=meta, store_path=store_path, config=config) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 42da511e37..8f7db8f2d2 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -335,10 +335,6 @@ def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[in 
raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") -# Type alias for what users can pass as chunks to create_array -ChunksLike = tuple[int, ...] | list[list[int] | int] | int - - def _is_rectilinear_chunks(chunks: Any) -> TypeGuard[Sequence[Sequence[int]]]: """Check if chunks is a nested sequence (e.g. [[10, 20], [5, 5]]). diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 318cc67068..cbc2bb2d37 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -37,6 +37,7 @@ BytesLike = bytes | bytearray | memoryview ShapeLike = Iterable[int | np.integer[Any]] | int | np.integer[Any] +ChunksLike = ShapeLike | Sequence[Sequence[int]] | None # For backwards compatibility ChunkCoords = tuple[int, ...] ZarrFormat = Literal[2, 3] diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 6436b9d6a3..db38541c70 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -17,6 +17,7 @@ from zarr.core.common import ( JSON, ZARR_JSON, + ChunksLike, DimensionNamesLike, NamedConfig, NamedRequiredConfig, @@ -348,10 +349,38 @@ def from_dict(cls, data: RectilinearChunkGridJSON) -> Self: # type: ignore[over ChunkGridMetadata = RegularChunkGrid | RectilinearChunkGrid +def resolve_chunks( + chunks: ChunksLike, + shape: tuple[int, ...], + typesize: int, +) -> ChunkGridMetadata: + """Construct a chunk grid from user-facing input (e.g. ``create_array(chunks=...)``). + + Nested sequences like ``[[10, 20], [5, 5]]`` produce a ``RectilinearChunkGrid``. + Flat inputs like ``(10, 10)`` or a scalar ``int`` produce a ``RegularChunkGrid`` + after normalization via :func:`~zarr.core.chunk_grids.normalize_chunks`. + + See Also + -------- + parse_chunk_grid : Deserialize a chunk grid from stored JSON metadata. 
+ """ + from zarr.core.chunk_grids import _is_rectilinear_chunks, normalize_chunks + + if _is_rectilinear_chunks(chunks): + return RectilinearChunkGrid(chunk_shapes=tuple(tuple(c) for c in chunks)) + + return RegularChunkGrid(chunk_shape=normalize_chunks(chunks, shape, typesize)) + + def parse_chunk_grid( data: dict[str, JSON] | ChunkGridMetadata | NamedConfig[str, Any], ) -> ChunkGridMetadata: - """Parse a chunk grid from a metadata dict or pass through an existing instance.""" + """Deserialize a chunk grid from stored JSON metadata or pass through an existing instance. + + See Also + -------- + resolve_chunks : Construct a chunk grid from user-facing input. + """ if isinstance(data, (RegularChunkGrid, RectilinearChunkGrid)): return data From 26b47606f17efd325888598c126a2b42aacec687 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 09:09:10 -0400 Subject: [PATCH 105/118] Make chunk_grid property private --- docs/design/chunk-grid.md | 6 +- .../examples/rectilinear_chunks.ipynb | 118 +++++++++--------- src/zarr/core/array.py | 64 +++++----- tests/test_array.py | 10 +- tests/test_codec_pipeline.py | 4 +- tests/test_group.py | 2 +- tests/test_indexing.py | 4 +- tests/test_unified_chunk_grid.py | 26 ++-- 8 files changed, 115 insertions(+), 119 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 8d77039b58..c74a654e69 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -200,7 +200,7 @@ arr = zarr.create_array(shape=(100, 200), chunks=(10, 20)) arr = zarr.create_array(shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]]) # rectilinear # ChunkGrid as a collection -grid = arr.chunk_grid # behavioral ChunkGrid (bound to array shape) +grid = arr._chunk_grid # behavioral ChunkGrid (bound to array shape) grid.grid_shape # (10, 10) — number of chunks per dimension grid.ndim # 2 grid.is_regular # True if all dimensions are Fixed @@ -351,7 +351,7 @@ new = 
zarr.from_array(data=src, store=new_store, chunks="keep") # Preserves rectilinear structure: new.write_chunk_sizes == ((10, 20, 30), (50, 50)) ``` -When `chunks="keep"`, the logic checks `data.chunk_grid.is_regular`: +When `chunks="keep"`, the logic checks `data._chunk_grid.is_regular`: - Regular: extracts `data.chunks` (flat tuple) and preserves shards - Rectilinear: extracts `data.write_chunk_sizes` (nested tuples) and forces shards to None @@ -372,7 +372,7 @@ Today, `get_chunk_spec()` returns the same `ArraySpec(shape=chunk_grid.chunk_sha ```python def get_chunk_spec(self, chunk_coords, array_config, prototype) -> ArraySpec: - spec = self.chunk_grid[chunk_coords] + spec = self._chunk_grid[chunk_coords] return ArraySpec(shape=spec.codec_shape, ...) ``` diff --git a/docs/user-guide/examples/rectilinear_chunks.ipynb b/docs/user-guide/examples/rectilinear_chunks.ipynb index 541750d440..4d89142b67 100644 --- a/docs/user-guide/examples/rectilinear_chunks.ipynb +++ b/docs/user-guide/examples/rectilinear_chunks.ipynb @@ -6,11 +6,11 @@ "id": "da9139cc", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:13.606739Z", - "iopub.status.busy": "2026-03-30T01:03:13.606597Z", - "iopub.status.idle": "2026-03-30T01:03:13.613466Z", - "shell.execute_reply": "2026-03-30T01:03:13.612167Z", - "shell.execute_reply.started": "2026-03-30T01:03:13.606724Z" + "iopub.execute_input": "2026-03-30T13:09:30.606395Z", + "iopub.status.busy": "2026-03-30T13:09:30.606270Z", + "iopub.status.idle": "2026-03-30T13:09:30.611784Z", + "shell.execute_reply": "2026-03-30T13:09:30.608914Z", + "shell.execute_reply.started": "2026-03-30T13:09:30.606380Z" } }, "outputs": [], @@ -51,18 +51,18 @@ "id": "9e9nyjdx06f", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:13.614308Z", - "iopub.status.busy": "2026-03-30T01:03:13.614115Z", - "iopub.status.idle": "2026-03-30T01:03:27.161046Z", - "shell.execute_reply": "2026-03-30T01:03:27.160587Z", - 
"shell.execute_reply.started": "2026-03-30T01:03:13.614287Z" + "iopub.execute_input": "2026-03-30T13:09:30.613147Z", + "iopub.status.busy": "2026-03-30T13:09:30.612776Z", + "iopub.status.idle": "2026-03-30T13:09:44.301116Z", + "shell.execute_reply": "2026-03-30T13:09:44.300677Z", + "shell.execute_reply.started": "2026-03-30T13:09:30.613126Z" } }, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -103,11 +103,11 @@ "id": "v6cot74r1gq", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:27.161531Z", - "iopub.status.busy": "2026-03-30T01:03:27.161384Z", - "iopub.status.idle": "2026-03-30T01:03:46.210431Z", - "shell.execute_reply": "2026-03-30T01:03:46.209459Z", - "shell.execute_reply.started": "2026-03-30T01:03:27.161522Z" + "iopub.execute_input": "2026-03-30T13:09:44.301615Z", + "iopub.status.busy": "2026-03-30T13:09:44.301466Z", + "iopub.status.idle": "2026-03-30T13:09:45.297053Z", + "shell.execute_reply": "2026-03-30T13:09:45.292702Z", + "shell.execute_reply.started": "2026-03-30T13:09:44.301605Z" } }, "outputs": [ @@ -117,7 +117,7 @@ "text": [ "Members: [('cell_ids', ), ('da', )]\n", "Attrs: {}\n", - "Chunk grid: ChunkGrid(dimensions=(FixedDimension(size=55611, extent=222442),), _is_regular=True)\n" + "Write chunk sizes: ((55611, 55611, 55611, 55609),)\n" ] } ], @@ -129,7 +129,7 @@ "\n", "print(\"Members:\", list(g.members()))\n", "print(\"Attrs:\", dict(g.attrs))\n", - "print(\"Chunk grid:\", arr.chunk_grid)" + "print(\"Write chunk sizes:\", arr.write_chunk_sizes)" ] }, { @@ -148,11 +148,11 @@ "id": "90bc91b9", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:46.211439Z", - "iopub.status.busy": "2026-03-30T01:03:46.211138Z", - "iopub.status.idle": "2026-03-30T01:03:47.632389Z", - "shell.execute_reply": "2026-03-30T01:03:47.631887Z", - "shell.execute_reply.started": "2026-03-30T01:03:46.211416Z" + "iopub.execute_input": "2026-03-30T13:09:45.301060Z", + "iopub.status.busy": 
"2026-03-30T13:09:45.299959Z", + "iopub.status.idle": "2026-03-30T13:09:46.662995Z", + "shell.execute_reply": "2026-03-30T13:09:46.662527Z", + "shell.execute_reply.started": "2026-03-30T13:09:45.301022Z" } }, "outputs": [], @@ -170,11 +170,11 @@ "id": "0d7785b0-d72f-4ef8-8a57-91d61f07be96", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:47.633034Z", - "iopub.status.busy": "2026-03-30T01:03:47.632792Z", - "iopub.status.idle": "2026-03-30T01:03:47.635838Z", - "shell.execute_reply": "2026-03-30T01:03:47.635478Z", - "shell.execute_reply.started": "2026-03-30T01:03:47.633024Z" + "iopub.execute_input": "2026-03-30T13:09:46.663527Z", + "iopub.status.busy": "2026-03-30T13:09:46.663301Z", + "iopub.status.idle": "2026-03-30T13:09:46.666266Z", + "shell.execute_reply": "2026-03-30T13:09:46.665875Z", + "shell.execute_reply.started": "2026-03-30T13:09:46.663516Z" } }, "outputs": [ @@ -200,11 +200,11 @@ "id": "72c80224-dcac-4724-8caf-5717b29a25d5", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:47.636213Z", - "iopub.status.busy": "2026-03-30T01:03:47.636138Z", - "iopub.status.idle": "2026-03-30T01:03:47.643387Z", - "shell.execute_reply": "2026-03-30T01:03:47.642743Z", - "shell.execute_reply.started": "2026-03-30T01:03:47.636205Z" + "iopub.execute_input": "2026-03-30T13:09:46.666584Z", + "iopub.status.busy": "2026-03-30T13:09:46.666509Z", + "iopub.status.idle": "2026-03-30T13:09:46.673617Z", + "shell.execute_reply": "2026-03-30T13:09:46.672827Z", + "shell.execute_reply.started": "2026-03-30T13:09:46.666576Z" } }, "outputs": [ @@ -238,11 +238,11 @@ "id": "a79a281b-ca74-49c3-a467-60490a4ad63e", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:47.643801Z", - "iopub.status.busy": "2026-03-30T01:03:47.643715Z", - "iopub.status.idle": "2026-03-30T01:03:47.648805Z", - "shell.execute_reply": "2026-03-30T01:03:47.648326Z", - "shell.execute_reply.started": "2026-03-30T01:03:47.643792Z" + "iopub.execute_input": 
"2026-03-30T13:09:46.674098Z", + "iopub.status.busy": "2026-03-30T13:09:46.673995Z", + "iopub.status.idle": "2026-03-30T13:09:46.690488Z", + "shell.execute_reply": "2026-03-30T13:09:46.686144Z", + "shell.execute_reply.started": "2026-03-30T13:09:46.674089Z" } }, "outputs": [ @@ -278,11 +278,11 @@ "id": "ribguojdr0s", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:47.649560Z", - "iopub.status.busy": "2026-03-30T01:03:47.649480Z", - "iopub.status.idle": "2026-03-30T01:03:47.943997Z", - "shell.execute_reply": "2026-03-30T01:03:47.943488Z", - "shell.execute_reply.started": "2026-03-30T01:03:47.649552Z" + "iopub.execute_input": "2026-03-30T13:09:46.691041Z", + "iopub.status.busy": "2026-03-30T13:09:46.690923Z", + "iopub.status.idle": "2026-03-30T13:09:47.241836Z", + "shell.execute_reply": "2026-03-30T13:09:47.241361Z", + "shell.execute_reply.started": "2026-03-30T13:09:46.691030Z" } }, "outputs": [ @@ -290,7 +290,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Written to: /var/folders/70/hc_nynms54d8lp67z4rsfctc0000gp/T/tmpgjlmnfff/healpix_rectilinear.zarr\n" + "Written to: /var/folders/70/hc_nynms54d8lp67z4rsfctc0000gp/T/tmp30rjewd0/healpix_rectilinear.zarr\n" ] } ], @@ -331,11 +331,11 @@ "id": "mpdn5hxp7lp", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:47.944360Z", - "iopub.status.busy": "2026-03-30T01:03:47.944287Z", - "iopub.status.idle": "2026-03-30T01:03:47.946825Z", - "shell.execute_reply": "2026-03-30T01:03:47.946372Z", - "shell.execute_reply.started": "2026-03-30T01:03:47.944352Z" + "iopub.execute_input": "2026-03-30T13:09:47.242174Z", + "iopub.status.busy": "2026-03-30T13:09:47.242097Z", + "iopub.status.idle": "2026-03-30T13:09:47.244629Z", + "shell.execute_reply": "2026-03-30T13:09:47.244126Z", + "shell.execute_reply.started": "2026-03-30T13:09:47.242166Z" } }, "outputs": [ @@ -367,15 +367,15 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "308gxly6r3j", 
"metadata": { "execution": { - "iopub.execute_input": "2026-03-30T01:03:47.947072Z", - "iopub.status.busy": "2026-03-30T01:03:47.946996Z", - "iopub.status.idle": "2026-03-30T01:03:47.967534Z", - "shell.execute_reply": "2026-03-30T01:03:47.967022Z", - "shell.execute_reply.started": "2026-03-30T01:03:47.947061Z" + "iopub.execute_input": "2026-03-30T13:11:17.611493Z", + "iopub.status.busy": "2026-03-30T13:11:17.610692Z", + "iopub.status.idle": "2026-03-30T13:11:17.655199Z", + "shell.execute_reply": "2026-03-30T13:11:17.654529Z", + "shell.execute_reply.started": "2026-03-30T13:11:17.611441Z" } }, "outputs": [ @@ -383,22 +383,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Round-trip chunk sizes: Frozen({'cell_ids': (25, 645, 1510, 2363, 3203, 74, 769, 3963, 4096, 233, 1603, 2450, 4096, 4096, 3327, 4047, 4096, 4096, 1278, 2113, 4096, 3879, 4096, 3842, 2173, 983, 4046, 2187, 4095, 1369, 4096, 4096, 4096, 4096, 3515, 1395, 4096, 3622, 4096, 4096, 3875, 4096, 4096, 4096, 4096, 4096, 2034, 4096, 358, 3991, 4096, 4096, 4096, 4096, 2714, 1210, 4096, 4096, 4096, 4096, 92, 3826, 4096, 2629, 4096, 1438, 4096, 353, 4078, 3410, 2407, 226, 132, 2738, 1223, 23)})\n", - "Regular grid: False\n" + "Round-trip chunk sizes: Frozen({'cell_ids': (25, 645, 1510, 2363, 3203, 74, 769, 3963, 4096, 233, 1603, 2450, 4096, 4096, 3327, 4047, 4096, 4096, 1278, 2113, 4096, 3879, 4096, 3842, 2173, 983, 4046, 2187, 4095, 1369, 4096, 4096, 4096, 4096, 3515, 1395, 4096, 3622, 4096, 4096, 3875, 4096, 4096, 4096, 4096, 4096, 2034, 4096, 358, 3991, 4096, 4096, 4096, 4096, 2714, 1210, 4096, 4096, 4096, 4096, 92, 3826, 4096, 2629, 4096, 1438, 4096, 353, 4078, 3410, 2407, 226, 132, 2738, 1223, 23)})\n" ] } ], "source": [ "roundtrip = xr.open_zarr(output_path, zarr_format=3, consolidated=False)\n", "\n", - "print(\"Round-trip chunk sizes:\", roundtrip.chunks)\n", - "print(\"Regular grid:\", roundtrip[\"da\"].variable.encoding.get(\"chunks\") == tuple(chunk_sizes.tolist()))" + "print(\"Round-trip 
chunk sizes:\", roundtrip.chunks)" ] }, { "cell_type": "code", "execution_count": null, - "id": "d3c32f1c-2c4f-403f-a4e8-98c6582eef96", + "id": "e8d42341-c242-44f5-ad6a-491370e3ffab", "metadata": {}, "outputs": [], "source": [] diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8bb056968b..eec7acd47f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -321,14 +321,12 @@ class AsyncArray[T_ArrayMetadata: (ArrayV2Metadata, ArrayV3Metadata)]: The codec pipeline used for encoding and decoding chunks. config : ArrayConfig The runtime configuration of the array. - chunk_grid : ChunkGrid - The behavioral chunk grid bound to this array's shape. """ metadata: T_ArrayMetadata store_path: StorePath codec_pipeline: CodecPipeline = field(init=False) - chunk_grid: ChunkGrid = field(init=False) + _chunk_grid: ChunkGrid = field(init=False) config: ArrayConfig @overload @@ -359,7 +357,7 @@ def __init__( object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) object.__setattr__(self, "config", config_parsed) - object.__setattr__(self, "chunk_grid", ChunkGrid.from_metadata(metadata_parsed)) + object.__setattr__(self, "_chunk_grid", ChunkGrid.from_metadata(metadata_parsed)) object.__setattr__( self, "codec_pipeline", @@ -1111,7 +1109,7 @@ def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: if len(codecs) == 1 and isinstance(codecs[0], ShardingCodec): inner_chunk_shape = codecs[0].chunk_shape return _chunk_sizes_from_shape(self.shape, inner_chunk_shape) - return self.chunk_grid.chunk_sizes + return self._chunk_grid.chunk_sizes @property def write_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: @@ -1130,7 +1128,7 @@ def write_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: >>> arr.write_chunk_sizes ((30, 30, 30, 10), (40, 40)) """ - return self.chunk_grid.chunk_sizes + return self._chunk_grid.chunk_sizes @property def shards(self) -> tuple[int, ...] 
| None: @@ -1345,7 +1343,7 @@ def _chunk_grid_shape(self) -> tuple[int, ...]: # When sharding, count inner chunks across the whole array chunk_shape = codecs[0].chunk_shape return tuple(starmap(ceildiv, zip(self.shape, chunk_shape, strict=True))) - return self.chunk_grid.grid_shape + return self._chunk_grid.grid_shape @property def _shard_grid_shape(self) -> tuple[int, ...]: @@ -1673,7 +1671,7 @@ async def _get_selection( self.metadata, self.codec_pipeline, self.config, - self.chunk_grid, + self._chunk_grid, indexer, prototype=prototype, out=out, @@ -1728,7 +1726,7 @@ async def example(): self.metadata, self.codec_pipeline, self.config, - self.chunk_grid, + self._chunk_grid, selection, prototype=prototype, ) @@ -1746,7 +1744,7 @@ async def get_orthogonal_selection( self.metadata, self.codec_pipeline, self.config, - self.chunk_grid, + self._chunk_grid, selection, out=out, fields=fields, @@ -1766,7 +1764,7 @@ async def get_mask_selection( self.metadata, self.codec_pipeline, self.config, - self.chunk_grid, + self._chunk_grid, mask, out=out, fields=fields, @@ -1786,7 +1784,7 @@ async def get_coordinate_selection( self.metadata, self.codec_pipeline, self.config, - self.chunk_grid, + self._chunk_grid, selection, out=out, fields=fields, @@ -1812,7 +1810,7 @@ async def _set_selection( self.metadata, self.codec_pipeline, self.config, - self.chunk_grid, + self._chunk_grid, indexer, value, prototype=prototype, @@ -1863,7 +1861,7 @@ async def setitem( self.metadata, self.codec_pipeline, self.config, - self.chunk_grid, + self._chunk_grid, selection, value, prototype=prototype, @@ -2020,7 +2018,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - chunk_shape = self.chunks if self.chunk_grid.is_regular else None + chunk_shape = self.chunks if self._chunk_grid.is_regular else None return ArrayInfo( _zarr_format=self.metadata.zarr_format, _data_type=self._zdtype, @@ -2073,9 +2071,9 
@@ def config(self) -> ArrayConfig: return self.async_array.config @property - def chunk_grid(self) -> ChunkGrid: + def _chunk_grid(self) -> ChunkGrid: """The behavioral chunk grid for this array, bound to the array's shape.""" - return self.async_array.chunk_grid + return self.async_array._chunk_grid @classmethod @deprecated("Use zarr.create_array instead.", category=ZarrDeprecationWarning) @@ -3203,7 +3201,7 @@ def get_basic_selection( prototype = default_buffer_prototype() return sync( self.async_array._get_selection( - BasicIndexer(selection, self.shape, self.chunk_grid), + BasicIndexer(selection, self.shape, self._chunk_grid), out=out, fields=fields, prototype=prototype, @@ -3310,7 +3308,7 @@ def set_basic_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BasicIndexer(selection, self.shape, self.chunk_grid) + indexer = BasicIndexer(selection, self.shape, self._chunk_grid) sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_orthogonal_selection( @@ -3438,7 +3436,7 @@ def get_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self._chunk_grid) return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3557,7 +3555,7 @@ def set_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self._chunk_grid) return sync( self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype) ) @@ -3645,7 +3643,7 @@ def get_mask_selection( if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self._chunk_grid) 
return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3735,7 +3733,7 @@ def set_mask_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self._chunk_grid) sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_coordinate_selection( @@ -3823,7 +3821,7 @@ def get_coordinate_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self._chunk_grid) out_array = sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3916,7 +3914,7 @@ def set_coordinate_selection( if prototype is None: prototype = default_buffer_prototype() # setup indexer - indexer = CoordinateIndexer(selection, self.shape, self.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self._chunk_grid) # handle value - need ndarray-like flatten value if not is_scalar(value, self.dtype): @@ -4038,7 +4036,7 @@ def get_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self._chunk_grid) return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -4139,7 +4137,7 @@ def set_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self._chunk_grid) sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @property @@ -5142,12 +5140,12 @@ def _parse_keep_array_attr( ]: if isinstance(data, Array): if chunks == "keep": - if 
data.chunk_grid.is_regular: + if data._chunk_grid.is_regular: chunks = data.chunks else: chunks = data.write_chunk_sizes if shards == "keep": - shards = data.shards if data.chunk_grid.is_regular else None + shards = data.shards if data._chunk_grid.is_regular else None if zarr_format is None: zarr_format = data.metadata.zarr_format if filters == "keep": @@ -5656,7 +5654,7 @@ def _iter_chunk_regions( A tuple of slice objects representing the region spanned by each shard in the selection. """ - return array.chunk_grid.iter_chunk_regions(origin=origin, selection_shape=selection_shape) + return array._chunk_grid.iter_chunk_regions(origin=origin, selection_shape=selection_shape) async def _nchunks_initialized( @@ -6252,7 +6250,7 @@ async def _resize( if delete_outside_chunks and not only_growing: # Remove all chunks outside of the new shape - old_chunk_coords = set(array.chunk_grid.all_chunk_coords()) + old_chunk_coords = set(array._chunk_grid.all_chunk_coords()) new_chunk_coords = set(new_chunk_grid.all_chunk_coords()) async def _delete_key(key: str) -> None: @@ -6272,7 +6270,7 @@ async def _delete_key(key: str) -> None: # Update metadata and chunk_grid (in place) object.__setattr__(array, "metadata", new_metadata) - object.__setattr__(array, "chunk_grid", new_chunk_grid) + object.__setattr__(array, "_chunk_grid", new_chunk_grid) async def _append( @@ -6338,7 +6336,7 @@ async def _append( array.metadata, array.codec_pipeline, array.config, - array.chunk_grid, + array._chunk_grid, append_selection, data, ) diff --git a/tests/test_array.py b/tests/test_array.py index 6fb84a9152..0e9d750608 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -796,13 +796,13 @@ def test_resize_growing_skips_chunk_enumeration( ) z[:] = np.ones((10, 10), dtype="i4") - grid_cls = type(z.chunk_grid) + grid_cls = type(z._chunk_grid) # growth only - ensure no chunk coords are enumerated with mock.patch.object( grid_cls, "all_chunk_coords", - wraps=z.chunk_grid.all_chunk_coords, + 
wraps=z._chunk_grid.all_chunk_coords, ) as mock_coords: z.resize((20, 20)) mock_coords.assert_not_called() @@ -815,7 +815,7 @@ def test_resize_growing_skips_chunk_enumeration( with mock.patch.object( grid_cls, "all_chunk_coords", - wraps=z.chunk_grid.all_chunk_coords, + wraps=z._chunk_grid.all_chunk_coords, ) as mock_coords: z.resize((5, 5)) assert mock_coords.call_count > 0 @@ -838,7 +838,7 @@ def test_resize_growing_skips_chunk_enumeration( with mock.patch.object( grid_cls, "all_chunk_coords", - wraps=z2.chunk_grid.all_chunk_coords, + wraps=z2._chunk_grid.all_chunk_coords, ) as mock_coords: z2.resize((20, 5)) assert mock_coords.call_count > 0 @@ -1576,7 +1576,7 @@ async def test_with_data(impl: Literal["sync", "async"], store: Store) -> None: elif impl == "async": arr = await create_array(store, name=name, data=data, zarr_format=3) stored = await arr._get_selection( - BasicIndexer(..., shape=arr.shape, chunk_grid=arr.chunk_grid), + BasicIndexer(..., shape=arr.shape, chunk_grid=arr._chunk_grid), prototype=default_buffer_prototype(), ) else: diff --git a/tests/test_codec_pipeline.py b/tests/test_codec_pipeline.py index c923555756..48e15b0643 100644 --- a/tests/test_codec_pipeline.py +++ b/tests/test_codec_pipeline.py @@ -43,7 +43,7 @@ async def test_read_returns_get_results( indexer = BasicIndexer( read_slice, shape=metadata.shape, - chunk_grid=async_arr.chunk_grid, + chunk_grid=async_arr._chunk_grid, ) out_buffer = prototype.nd_buffer.empty( @@ -56,7 +56,7 @@ async def test_read_returns_get_results( [ ( async_arr.store_path / metadata.encode_chunk_key(chunk_coords), - _get_chunk_spec(metadata, async_arr.chunk_grid, chunk_coords, config, prototype), + _get_chunk_spec(metadata, async_arr._chunk_grid, chunk_coords, config, prototype), chunk_selection, out_selection, is_complete_chunk, diff --git a/tests/test_group.py b/tests/test_group.py index 89c80709a5..e53b0b9ea0 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -1176,7 +1176,7 @@ async def 
test_asyncgroup_create_array( assert subnode.store_path.store == store assert subnode.shape == shape assert subnode.dtype == dtype - assert subnode.chunk_grid.chunk_shape == chunk_shape + assert subnode._chunk_grid.chunk_shape == chunk_shape assert subnode.metadata.zarr_format == zarr_format diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 2031a4bf01..ef98cf3345 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1236,8 +1236,8 @@ def test_get_block_selection_1d(store: StorePath) -> None: _test_get_block_selection(a, z, selection, expected_idx) bad_selections = block_selections_1d_bad + [ - z.chunk_grid.get_nchunks() + 1, # out of bounds - -(z.chunk_grid.get_nchunks() + 1), # out of bounds + z._chunk_grid.get_nchunks() + 1, # out of bounds + -(z._chunk_grid.get_nchunks() + 1), # out of bounds ] for selection_bad in bad_selections: diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index aca6ff79ab..9fdc7ae847 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -939,7 +939,7 @@ def test_create_regular_array(self, tmp_path: Path) -> None: chunks=(10, 20), dtype="float32", ) - assert arr.chunk_grid.is_regular + assert arr._chunk_grid.is_regular assert arr.chunks == (10, 20) def test_create_rectilinear_array(self, tmp_path: Path) -> None: @@ -954,8 +954,8 @@ def test_create_rectilinear_array(self, tmp_path: Path) -> None: ) assert isinstance(arr.metadata, ArrayV3Metadata) assert isinstance(arr.metadata.chunk_grid, RectilinearChunkGrid) - assert not arr.chunk_grid.is_regular - assert arr.chunk_grid.ndim == 2 + assert not arr._chunk_grid.is_regular + assert arr._chunk_grid.ndim == 2 def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: """Verify metadata round-trips through JSON.""" @@ -1717,7 +1717,7 @@ def test_1d_single_chunk(self, tmp_path: Path) -> None: def test_persistence_roundtrip(self, tmp_path: Path) -> None: _, a = self._make_2d(tmp_path) 
z2 = zarr.open_array(store=tmp_path / "arr2d.zarr", mode="r") - assert not z2.chunk_grid.is_regular + assert not z2._chunk_grid.is_regular np.testing.assert_array_equal(z2[:], a) # --- Highly irregular chunks --- @@ -1805,7 +1805,7 @@ def test_rectilinear_shards_validates_divisibility(self, tmp_path: Path) -> None def test_nchunks(self, tmp_path: Path) -> None: z, _ = self._make_2d(tmp_path) - assert z.chunk_grid.get_nchunks() == 12 + assert z._chunk_grid.get_nchunks() == 12 pytest.importorskip("hypothesis") @@ -1863,7 +1863,7 @@ def rectilinear_arrays_st(draw: st.DrawFn) -> tuple[zarr.Array[Any], np.ndarray[ def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: """Property test: block indexing on rectilinear arrays matches numpy.""" z, a = data.draw(rectilinear_arrays_st()) - grid = z.chunk_grid + grid = z._chunk_grid # Pick a random block per dimension and verify it matches the expected slice for dim in range(a.ndim): @@ -1912,7 +1912,7 @@ def test_v2_chunk_grid_is_regular(self, tmp_path: Path) -> None: dtype="int32", zarr_format=2, ) - grid = a.chunk_grid + grid = a._chunk_grid assert grid.is_regular assert grid.chunk_shape == (10, 15) assert grid.grid_shape == (2, 2) @@ -1927,7 +1927,7 @@ def test_v2_boundary_chunks(self, tmp_path: Path) -> None: dtype="int32", zarr_format=2, ) - grid = a.chunk_grid + grid = a._chunk_grid assert grid.dimensions[0].nchunks == 3 assert grid.dimensions[0].chunk_size(2) == 10 # full codec buffer assert grid.dimensions[0].data_size(2) == 5 # clipped to extent @@ -1963,7 +1963,7 @@ def test_v2_metadata_roundtrip(self, tmp_path: Path) -> None: b = zarr.open_array(store=store_path, mode="r") assert b.metadata.zarr_format == 2 assert b.chunks == (2, 2) - assert b.chunk_grid.chunk_shape == (2, 2) + assert b._chunk_grid.chunk_shape == (2, 2) np.testing.assert_array_equal(b[:], data) def test_v2_chunk_spec_via_grid(self, tmp_path: Path) -> None: @@ -1975,7 +1975,7 @@ def test_v2_chunk_spec_via_grid(self, tmp_path: 
Path) -> None: dtype="int32", zarr_format=2, ) - grid = a.chunk_grid + grid = a._chunk_grid # Interior chunk spec = grid[(0, 0)] assert spec is not None @@ -2137,7 +2137,7 @@ def test_regular_preserves_extents(self, tmp_path: Path) -> None: z[:] = np.arange(100, dtype="int32") z.resize(50) assert z.shape == (50,) - assert z.chunk_grid.dimensions[0].extent == 50 + assert z._chunk_grid.dimensions[0].extent == 50 class TestResizeRectilinear: @@ -2157,8 +2157,8 @@ async def test_async_resize_grow(self) -> None: await arr.resize((50, 60)) assert arr.shape == (50, 60) - assert _edges(arr.chunk_grid, 0) == (10, 20, 20) - assert _edges(arr.chunk_grid, 1) == (20, 20, 20) + assert _edges(arr._chunk_grid, 0) == (10, 20, 20) + assert _edges(arr._chunk_grid, 1) == (20, 20, 20) result = await arr.getitem((slice(0, 30), slice(0, 40))) np.testing.assert_array_equal(result, data) From 5a7280b1bf0e131ace46f2c0718fb88380c4a584 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 09:18:59 -0400 Subject: [PATCH 106/118] Improve docstrings --- src/zarr/core/array.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index eec7acd47f..fc8eaa76b6 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1087,7 +1087,11 @@ def chunks(self) -> tuple[int, ...]: @property def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: - """Per-dimension sizes of chunks used for reading. + """Per-dimension data sizes of chunks used for reading, clipped to the array extent. + + Boundary chunks that extend past the array shape are clipped, so + the last size along a dimension may be smaller than the declared + chunk size. This matches the dask ``Array.chunks`` convention. When sharding is used, returns the inner chunk sizes. Otherwise, returns the outer chunk sizes (same as ``write_chunk_sizes``). 
@@ -1095,7 +1099,8 @@ def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: Returns ------- tuple[tuple[int, ...], ...] - One inner tuple per dimension containing chunk sizes. + One inner tuple per dimension containing the data size of each + chunk (not the encoded buffer size). Examples -------- @@ -1113,14 +1118,18 @@ def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: @property def write_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: - """Per-dimension sizes of chunks used for writing (storage chunks). + """Per-dimension data sizes of storage chunks, clipped to the array extent. Always returns the outer chunk sizes, regardless of sharding. + Boundary chunks that extend past the array shape are clipped, so + the last size along a dimension may be smaller than the declared + chunk size. This matches the dask ``Array.chunks`` convention. Returns ------- tuple[tuple[int, ...], ...] - One inner tuple per dimension containing chunk sizes. + One inner tuple per dimension containing the data size of each + chunk (not the encoded buffer size). Examples -------- @@ -2378,7 +2387,11 @@ def chunks(self) -> tuple[int, ...]: @property def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: - """Per-dimension sizes of chunks used for reading. + """Per-dimension data sizes of chunks used for reading, clipped to the array extent. + + Boundary chunks that extend past the array shape are clipped, so + the last size along a dimension may be smaller than the declared + chunk size. This matches the dask ``Array.chunks`` convention. When sharding is used, returns the inner chunk sizes. Otherwise, returns the outer chunk sizes (same as ``write_chunk_sizes``). @@ -2386,7 +2399,8 @@ def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: Returns ------- tuple[tuple[int, ...], ...] - One inner tuple per dimension containing chunk sizes. + One inner tuple per dimension containing the data size of each + chunk (not the encoded buffer size). 
Examples -------- @@ -2398,14 +2412,18 @@ def read_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: @property def write_chunk_sizes(self) -> tuple[tuple[int, ...], ...]: - """Per-dimension sizes of chunks used for writing (storage chunks). + """Per-dimension data sizes of storage chunks, clipped to the array extent. Always returns the outer chunk sizes, regardless of sharding. + Boundary chunks that extend past the array shape are clipped, so + the last size along a dimension may be smaller than the declared + chunk size. This matches the dask ``Array.chunks`` convention. Returns ------- tuple[tuple[int, ...], ...] - One inner tuple per dimension containing chunk sizes. + One inner tuple per dimension containing the data size of each + chunk (not the encoded buffer size). Examples -------- From 60ad5cb4625d978080543d59580bac3f0434f73e Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 09:21:12 -0400 Subject: [PATCH 107/118] Update notebook --- .../examples/rectilinear_chunks.ipynb | 106 +++++++++--------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/docs/user-guide/examples/rectilinear_chunks.ipynb b/docs/user-guide/examples/rectilinear_chunks.ipynb index 4d89142b67..376cd9ad88 100644 --- a/docs/user-guide/examples/rectilinear_chunks.ipynb +++ b/docs/user-guide/examples/rectilinear_chunks.ipynb @@ -6,11 +6,11 @@ "id": "da9139cc", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:30.606395Z", - "iopub.status.busy": "2026-03-30T13:09:30.606270Z", - "iopub.status.idle": "2026-03-30T13:09:30.611784Z", - "shell.execute_reply": "2026-03-30T13:09:30.608914Z", - "shell.execute_reply.started": "2026-03-30T13:09:30.606380Z" + "iopub.execute_input": "2026-03-30T13:18:20.792275Z", + "iopub.status.busy": "2026-03-30T13:18:20.792050Z", + "iopub.status.idle": "2026-03-30T13:18:20.801655Z", + "shell.execute_reply": "2026-03-30T13:18:20.797952Z", + "shell.execute_reply.started": 
"2026-03-30T13:18:20.792253Z" } }, "outputs": [], @@ -51,18 +51,18 @@ "id": "9e9nyjdx06f", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:30.613147Z", - "iopub.status.busy": "2026-03-30T13:09:30.612776Z", - "iopub.status.idle": "2026-03-30T13:09:44.301116Z", - "shell.execute_reply": "2026-03-30T13:09:44.300677Z", - "shell.execute_reply.started": "2026-03-30T13:09:30.613126Z" + "iopub.execute_input": "2026-03-30T13:18:20.802629Z", + "iopub.status.busy": "2026-03-30T13:18:20.802471Z", + "iopub.status.idle": "2026-03-30T13:18:21.183147Z", + "shell.execute_reply": "2026-03-30T13:18:21.182751Z", + "shell.execute_reply.started": "2026-03-30T13:18:20.802615Z" } }, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -103,11 +103,11 @@ "id": "v6cot74r1gq", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:44.301615Z", - "iopub.status.busy": "2026-03-30T13:09:44.301466Z", - "iopub.status.idle": "2026-03-30T13:09:45.297053Z", - "shell.execute_reply": "2026-03-30T13:09:45.292702Z", - "shell.execute_reply.started": "2026-03-30T13:09:44.301605Z" + "iopub.execute_input": "2026-03-30T13:18:21.183653Z", + "iopub.status.busy": "2026-03-30T13:18:21.183505Z", + "iopub.status.idle": "2026-03-30T13:18:22.028419Z", + "shell.execute_reply": "2026-03-30T13:18:22.027356Z", + "shell.execute_reply.started": "2026-03-30T13:18:21.183644Z" } }, "outputs": [ @@ -148,11 +148,11 @@ "id": "90bc91b9", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:45.301060Z", - "iopub.status.busy": "2026-03-30T13:09:45.299959Z", - "iopub.status.idle": "2026-03-30T13:09:46.662995Z", - "shell.execute_reply": "2026-03-30T13:09:46.662527Z", - "shell.execute_reply.started": "2026-03-30T13:09:45.301022Z" + "iopub.execute_input": "2026-03-30T13:18:22.029842Z", + "iopub.status.busy": "2026-03-30T13:18:22.029258Z", + "iopub.status.idle": "2026-03-30T13:18:23.629597Z", + "shell.execute_reply": "2026-03-30T13:18:23.628896Z", + 
"shell.execute_reply.started": "2026-03-30T13:18:22.029824Z" } }, "outputs": [], @@ -170,11 +170,11 @@ "id": "0d7785b0-d72f-4ef8-8a57-91d61f07be96", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:46.663527Z", - "iopub.status.busy": "2026-03-30T13:09:46.663301Z", - "iopub.status.idle": "2026-03-30T13:09:46.666266Z", - "shell.execute_reply": "2026-03-30T13:09:46.665875Z", - "shell.execute_reply.started": "2026-03-30T13:09:46.663516Z" + "iopub.execute_input": "2026-03-30T13:18:23.630244Z", + "iopub.status.busy": "2026-03-30T13:18:23.629978Z", + "iopub.status.idle": "2026-03-30T13:18:23.633850Z", + "shell.execute_reply": "2026-03-30T13:18:23.632930Z", + "shell.execute_reply.started": "2026-03-30T13:18:23.630232Z" } }, "outputs": [ @@ -200,11 +200,11 @@ "id": "72c80224-dcac-4724-8caf-5717b29a25d5", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:46.666584Z", - "iopub.status.busy": "2026-03-30T13:09:46.666509Z", - "iopub.status.idle": "2026-03-30T13:09:46.673617Z", - "shell.execute_reply": "2026-03-30T13:09:46.672827Z", - "shell.execute_reply.started": "2026-03-30T13:09:46.666576Z" + "iopub.execute_input": "2026-03-30T13:18:23.634211Z", + "iopub.status.busy": "2026-03-30T13:18:23.634119Z", + "iopub.status.idle": "2026-03-30T13:18:23.642291Z", + "shell.execute_reply": "2026-03-30T13:18:23.641668Z", + "shell.execute_reply.started": "2026-03-30T13:18:23.634203Z" } }, "outputs": [ @@ -238,11 +238,11 @@ "id": "a79a281b-ca74-49c3-a467-60490a4ad63e", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:46.674098Z", - "iopub.status.busy": "2026-03-30T13:09:46.673995Z", - "iopub.status.idle": "2026-03-30T13:09:46.690488Z", - "shell.execute_reply": "2026-03-30T13:09:46.686144Z", - "shell.execute_reply.started": "2026-03-30T13:09:46.674089Z" + "iopub.execute_input": "2026-03-30T13:18:23.642721Z", + "iopub.status.busy": "2026-03-30T13:18:23.642622Z", + "iopub.status.idle": "2026-03-30T13:18:23.649165Z", + 
"shell.execute_reply": "2026-03-30T13:18:23.648723Z", + "shell.execute_reply.started": "2026-03-30T13:18:23.642712Z" } }, "outputs": [ @@ -278,11 +278,11 @@ "id": "ribguojdr0s", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:46.691041Z", - "iopub.status.busy": "2026-03-30T13:09:46.690923Z", - "iopub.status.idle": "2026-03-30T13:09:47.241836Z", - "shell.execute_reply": "2026-03-30T13:09:47.241361Z", - "shell.execute_reply.started": "2026-03-30T13:09:46.691030Z" + "iopub.execute_input": "2026-03-30T13:18:23.649823Z", + "iopub.status.busy": "2026-03-30T13:18:23.649737Z", + "iopub.status.idle": "2026-03-30T13:18:24.089390Z", + "shell.execute_reply": "2026-03-30T13:18:24.088640Z", + "shell.execute_reply.started": "2026-03-30T13:18:23.649815Z" } }, "outputs": [ @@ -290,7 +290,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Written to: /var/folders/70/hc_nynms54d8lp67z4rsfctc0000gp/T/tmp30rjewd0/healpix_rectilinear.zarr\n" + "Written to: /var/folders/70/hc_nynms54d8lp67z4rsfctc0000gp/T/tmp6dibcrho/healpix_rectilinear.zarr\n" ] } ], @@ -331,11 +331,11 @@ "id": "mpdn5hxp7lp", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:09:47.242174Z", - "iopub.status.busy": "2026-03-30T13:09:47.242097Z", - "iopub.status.idle": "2026-03-30T13:09:47.244629Z", - "shell.execute_reply": "2026-03-30T13:09:47.244126Z", - "shell.execute_reply.started": "2026-03-30T13:09:47.242166Z" + "iopub.execute_input": "2026-03-30T13:18:24.090312Z", + "iopub.status.busy": "2026-03-30T13:18:24.090192Z", + "iopub.status.idle": "2026-03-30T13:18:24.093595Z", + "shell.execute_reply": "2026-03-30T13:18:24.092908Z", + "shell.execute_reply.started": "2026-03-30T13:18:24.090303Z" } }, "outputs": [ @@ -367,15 +367,15 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "308gxly6r3j", "metadata": { "execution": { - "iopub.execute_input": "2026-03-30T13:11:17.611493Z", - "iopub.status.busy": "2026-03-30T13:11:17.610692Z", - 
"iopub.status.idle": "2026-03-30T13:11:17.655199Z", - "shell.execute_reply": "2026-03-30T13:11:17.654529Z", - "shell.execute_reply.started": "2026-03-30T13:11:17.611441Z" + "iopub.execute_input": "2026-03-30T13:18:24.094252Z", + "iopub.status.busy": "2026-03-30T13:18:24.094013Z", + "iopub.status.idle": "2026-03-30T13:18:24.117313Z", + "shell.execute_reply": "2026-03-30T13:18:24.116670Z", + "shell.execute_reply.started": "2026-03-30T13:18:24.094242Z" } }, "outputs": [ From f2ec718a0feec4f28378b58eefd58b4dc92dc7af Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 09:47:58 -0400 Subject: [PATCH 108/118] Remove dead code --- docs/design/chunk-grid.md | 27 +- src/zarr/core/chunk_grids.py | 186 +---------- src/zarr/core/metadata/v3.py | 3 - tests/test_unified_chunk_grid.py | 524 +++---------------------------- 4 files changed, 67 insertions(+), 673 deletions(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index c74a654e69..7a180859cd 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -44,7 +44,7 @@ Prior iterations on the chunk grid design were based on the Zarr V3 spec's defin 1. **A chunk grid is a concrete arrangement of chunks.** Not an abstract tiling pattern. This means that the chunk grid is bound to specific array dimensions, which enables the chunk grid to answer any question about any chunk (offset, size, count) without external parameters. 2. **One implementation, multiple serialization forms.** A single `ChunkGrid` class handles all chunking logic. The serialization format (`"regular"` vs `"rectilinear"`) is chosen by the metadata layer, not the grid. -3. **No chunk grid registry.** Simple name-based dispatch in `parse_chunk_grid()`. +3. **No chunk grid registry.** Simple name-based dispatch in the metadata layer's `parse_chunk_grid()`. 4. 
**Fixed vs Varying per dimension.** `FixedDimension(size, extent)` for uniform chunks; `VaryingDimension(edges, extent)` for per-chunk edge lengths with precomputed prefix sums. Avoids expanding regular dimensions into lists of identical values. 5. **Transparent transitions.** Operations like `resize()` can move an array from regular to rectilinear chunking. @@ -274,15 +274,15 @@ When `extent < sum(edges)`, the dimension is always stored as `VaryingDimension` {"name": "rectilinear", "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [[25, 4]]]}} ``` -Both names deserialize to the same `ChunkGrid` class. The serialized form does not include the array extent — that comes from `shape` in array metadata and is passed to `parse_chunk_grid()` at construction time. +Both names deserialize to the same `ChunkGrid` class. The serialized form does not include the array extent — that comes from `shape` in array metadata and is combined with the chunk grid when constructing a behavioral `ChunkGrid` via `ChunkGrid.from_metadata()`. -**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`. The name is inferred from the chunk grid metadata DTO type (`RegularChunkGrid` → `"regular"`, `RectilinearChunkGrid` → `"rectilinear"`) or from `grid.is_regular` when a behavioral `ChunkGrid` is passed directly. +**The `ChunkGrid` does not serialize itself.** The format choice (`"regular"` vs `"rectilinear"`) belongs to `ArrayV3Metadata`. Serialization and deserialization are handled by the metadata-layer chunk grid classes (`RegularChunkGrid` and `RectilinearChunkGrid` in `metadata/v3.py`), which provide `to_dict()` and `from_dict()` methods. For `create_array`, the format is inferred from the `chunks` argument: a flat tuple produces `"regular"`, a nested list produces `"rectilinear"`. The `_is_rectilinear_chunks()` helper detects nested sequences like `[[10, 20], [5, 5]]`. 
##### Rectilinear spec compliance -The rectilinear format requires `"kind": "inline"` (validated by `_validate_rectilinear_kind()`). Per the spec, each element of `chunk_shapes` can be: +The rectilinear format requires `"kind": "inline"` (validated by `validate_rectilinear_kind()`). Per the spec, each element of `chunk_shapes` can be: - A bare integer `m`: repeated until `sum >= array_extent` - A list of bare integers: explicit per-chunk sizes @@ -291,13 +291,13 @@ The rectilinear format requires `"kind": "inline"` (validated by `_validate_rect RLE compression is used when serializing: runs of identical sizes become `[value, count]` pairs, singletons stay as bare integers. ```python -# _compress_rle([10, 10, 10, 5]) -> [[10, 3], 5] -# _expand_rle([[10, 3], 5]) -> [10, 10, 10, 5] +# compress_rle([10, 10, 10, 5]) -> [[10, 3], 5] +# expand_rle([[10, 3], 5]) -> [10, 10, 10, 5] ``` -For `FixedDimension` serialized as rectilinear, `_serialize_fixed_dim()` returns the bare integer `dim.size`. Per the rectilinear spec, a bare integer is repeated until the sum >= extent, preserving the full codec buffer size for boundary chunks. +For a single-element `chunk_shapes` tuple like `(10,)`, `RectilinearChunkGrid.to_dict()` serializes it as a bare integer `10`. Per the rectilinear spec, a bare integer is repeated until the sum >= extent, preserving the full codec buffer size for boundary chunks. -**Zero-extent handling:** Regular grids serialize zero-extent dimensions without issue (the format encodes only `chunk_shape`, no edges). Rectilinear grids reject zero-extent dimensions because the spec requires at least one positive-integer edge length per axis. This asymmetry is intentional and spec-compliant — documented in `serialize_chunk_grid()`. +**Zero-extent handling:** Regular grids serialize zero-extent dimensions without issue (the format encodes only `chunk_shape`, no edges). 
Rectilinear grids cannot represent zero-extent dimensions because the spec requires at least one positive-integer edge length per axis. #### read_chunk_sizes / write_chunk_sizes @@ -418,7 +418,7 @@ Level 3 — Shard index: ceil(shard_dim / subchunk_dim) entries per dimension The chunk grid is a concrete arrangement, not an abstract tiling pattern. A finite collection naturally has an extent. Storing it enables `__getitem__`, eliminates `dim_len` parameters from every method, and makes the grid self-describing. -This does *not* mean `ArrayV3Metadata.shape` should delegate to the grid. The array shape remains an independent field in metadata. The extent is passed into the grid at construction time so it can answer boundary questions without external parameters. It is **not** serialized as part of the chunk grid JSON — it comes from the `shape` field in array metadata and is passed to `parse_chunk_grid()`. +This does *not* mean `ArrayV3Metadata.shape` should delegate to the grid. The array shape remains an independent field in metadata. The extent is passed into the grid at construction time so it can answer boundary questions without external parameters. It is **not** serialized as part of the chunk grid JSON — it comes from the `shape` field in array metadata and is combined with the chunk grid configuration in `ChunkGrid.from_metadata()`. ### Why distinguish chunk_size from data_size? @@ -466,7 +466,7 @@ The resolution: @d-v-b raised in #3534 that users need a way to say "these chunks are regular, but serialize as rectilinear" (e.g., to allow future append/extend workflows without format changes). @jhamman initially made nested-list input always produce `RectilinearChunkGrid`. -The current branch resolves this via `_infer_chunk_grid_name()`, which extracts or infers the serialization name from the chunk grid input. 
When metadata is deserialized, the original name (from `{"name": "regular"}` or `{"name": "rectilinear"}`) flows through to `serialize_chunk_grid()` at write time. When a `ChunkGrid` is passed directly, the name is inferred from `grid.is_regular`. Current inference behavior: +The current branch resolves this via the metadata-layer chunk grid classes. When metadata is deserialized, the original name (from `{"name": "regular"}` or `{"name": "rectilinear"}`) determines which metadata class is instantiated (`RegularChunkGrid` or `RectilinearChunkGrid`), and that class handles serialization via `to_dict()`. Current inference behavior for `create_array`: - `chunks=(10, 20)` (flat tuple) → infers `"regular"` - `chunks=[[10, 20], [5, 5]]` (nested lists with varying sizes) → infers `"rectilinear"` - `chunks=[[10, 10], [20, 20]]` (nested lists with uniform sizes) → `from_rectilinear` collapses to `FixedDimension`, so `is_regular=True` and infers `"regular"` @@ -498,7 +498,7 @@ The current implementation partially realizes this separation: This means `ArrayV3Metadata.chunk_grid` is now a `ChunkGridMetadata` (the DTO union type), **not** the behavioral `ChunkGrid`. Code that previously accessed behavioral methods on `metadata.chunk_grid` (e.g., `all_chunk_coords()`, `__getitem__`) must now use the behavioral grid from the array layer instead. -The name controls serialization format; `serialize_chunk_grid()` is called by `ArrayV3Metadata.to_dict()`. The behavioral grid handles all runtime queries. +The name controls serialization format; each metadata DTO class provides its own `to_dict()` method for serialization. The behavioral grid handles all runtime queries. ## Prior art @@ -569,10 +569,9 @@ If the design is accepted, the POC branch can be split into 5 incremental PRs. 
P - Zero changes to existing code **PR 2: Unified ChunkGrid class + serialization** (replaces hierarchy) -- `ChunkGrid` with `from_regular`, `from_rectilinear`, `__getitem__`, `__iter__`, `all_chunk_coords`, `is_regular`, `chunk_shape`, `chunk_sizes`, `unique_edge_lengths` -- `parse_chunk_grid()`, `serialize_chunk_grid()`, `_infer_chunk_grid_name()` +- `ChunkGrid` with `from_regular`, `from_rectilinear`, `from_metadata`, `__getitem__`, `__iter__`, `all_chunk_coords`, `is_regular`, `chunk_shape`, `chunk_sizes`, `unique_edge_lengths` - `RegularChunkGrid` deprecation shim -- `_infer_chunk_grid_name()` for serialization format inference +- Metadata-layer serialization via `RegularChunkGrid.to_dict()`/`RectilinearChunkGrid.to_dict()` - Feature flag (`array.rectilinear_chunks`) **PR 3: Indexing generalization** diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 8f7db8f2d2..03323fdd6d 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -6,7 +6,6 @@ import numbers import operator import warnings -from collections.abc import Iterable, Sequence from dataclasses import dataclass, field from functools import reduce from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeGuard, cast, runtime_checkable @@ -16,21 +15,14 @@ import zarr from zarr.core.common import ( - JSON, - NamedConfig, ShapeLike, ceildiv, - compress_rle, - expand_rle, - parse_named_configuration, parse_shapelike, - validate_rectilinear_edges, - validate_rectilinear_kind, ) from zarr.errors import ZarrUserWarning if TYPE_CHECKING: - from collections.abc import Iterator + from collections.abc import Iterable, Iterator, Sequence from zarr.core.array import ShardsLike from zarr.core.metadata import ArrayMetadata @@ -107,7 +99,7 @@ def with_extent(self, new_extent: int) -> FixedDimension: """Re-bind to *new_extent* without modifying edges. Used when constructing a grid from existing metadata where edges - are already correct (e.g. ``parse_chunk_grid``). 
Raises on + are already correct. Raises on ``VaryingDimension`` if edges don't cover the new extent. """ return FixedDimension(size=self.size, extent=new_extent) @@ -203,7 +195,7 @@ def with_extent(self, new_extent: int) -> VaryingDimension: """Re-bind to *new_extent* without modifying edges. Used when constructing a grid from existing metadata where edges - are already correct (e.g. ``parse_chunk_grid``). Raises if the + are already correct. Raises if the existing edges don't cover *new_extent*. """ edge_sum = self.cumulative[-1] @@ -275,66 +267,6 @@ def is_boundary(self) -> bool: return self.shape != self.codec_shape -# A single dimension's rectilinear chunk spec: bare int (uniform shorthand), -# list of ints (explicit edges), or mixed RLE (e.g. [[10, 3], 5]). -RectilinearDimSpec = int | list[int | list[int]] - -# The serialization format name for a chunk grid. -ChunkGridName = Literal["regular", "rectilinear"] - - -def _serialize_fixed_dim(dim: FixedDimension) -> RectilinearDimSpec: - """Compact rectilinear representation for a fixed-size dimension. - - Per the rectilinear spec, a bare integer is repeated until the sum - >= extent. This preserves the full codec buffer size for boundary - chunks, matching the regular grid spec ("chunks at the border always - have the full chunk size"). - """ - return dim.size - - -def _serialize_varying_dim(dim: VaryingDimension) -> RectilinearDimSpec: - """RLE-compressed rectilinear representation for a varying dimension.""" - edges = list(dim.edges) - rle = compress_rle(edges) - if len(rle) < len(edges): - return rle - # mypy: list[int] is invariant, so it won't widen to list[int | list[int]] - return cast("RectilinearDimSpec", edges) - - -def _decode_dim_spec(dim_spec: JSON, array_extent: int | None = None) -> list[int]: - """Decode a single dimension's chunk edge specification per the rectilinear spec. 
- - Per the spec, each element of ``chunk_shapes`` can be: - - a bare integer ``m``: repeat ``m`` until the sum >= array extent - - an array of bare integers and/or ``[value, count]`` RLE pairs - - Parameters - ---------- - dim_spec - The raw JSON value for one dimension's chunk edges. - array_extent - Array length along this dimension. Required when *dim_spec* is a bare - integer (to know how many repetitions). - """ - if isinstance(dim_spec, int): - if array_extent is None: - raise ValueError("Integer chunk_shapes shorthand requires array shape to expand.") - if dim_spec <= 0: - raise ValueError(f"Integer chunk edge length must be > 0, got {dim_spec}") - n = ceildiv(array_extent, dim_spec) - return [dim_spec] * n - if isinstance(dim_spec, list): - has_sublists = any(isinstance(e, list) for e in dim_spec) - if has_sublists: - return expand_rle(dim_spec) - else: - return [int(e) for e in dim_spec] - raise ValueError(f"Invalid chunk_shapes entry: {dim_spec}") - - def _is_rectilinear_chunks(chunks: Any) -> TypeGuard[Sequence[Sequence[int]]]: """Check if chunks is a nested sequence (e.g. [[10, 20], [5, 5]]). @@ -628,118 +560,6 @@ def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: ) return ChunkGrid(dimensions=dims) - # ChunkGrid does not serialize itself. The format choice ("regular" vs - # "rectilinear") belongs to the metadata layer. Use serialize_chunk_grid() - # for output and parse_chunk_grid() for input. - - -def parse_chunk_grid( - data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], - array_shape: tuple[int, ...], -) -> ChunkGrid: - """Create a ChunkGrid from a metadata dict or existing grid, binding array shape. - - This is the primary entry point for constructing a ChunkGrid from serialized - metadata. It always produces a grid with correct extent values. - - Both ``"regular"`` and ``"rectilinear"`` grid names are supported. 
Rectilinear - grids are experimental and require the ``array.rectilinear_chunks`` config - option to be enabled; a ``ValueError`` is raised otherwise. - """ - if isinstance(data, ChunkGrid): - # Re-bind extent if array_shape differs from what's stored - dims = tuple( - dim.with_extent(extent) - for dim, extent in zip(data.dimensions, array_shape, strict=True) - ) - return ChunkGrid(dimensions=dims) - - name_parsed, configuration_parsed = parse_named_configuration(data) - - if name_parsed == "regular": - chunk_shape_raw = configuration_parsed.get("chunk_shape") - if chunk_shape_raw is None: - raise ValueError("Regular chunk grid requires 'chunk_shape' configuration") - if not isinstance(chunk_shape_raw, Sequence): - raise TypeError(f"chunk_shape must be a sequence, got {type(chunk_shape_raw)}") - return ChunkGrid.from_regular(array_shape, cast("Sequence[int]", chunk_shape_raw)) - - if name_parsed == "rectilinear": - validate_rectilinear_kind(cast("str | None", configuration_parsed.get("kind"))) - chunk_shapes_raw = configuration_parsed.get("chunk_shapes") - if chunk_shapes_raw is None: - raise ValueError("Rectilinear chunk grid requires 'chunk_shapes' configuration") - if not isinstance(chunk_shapes_raw, Sequence): - raise TypeError(f"chunk_shapes must be a sequence, got {type(chunk_shapes_raw)}") - if len(chunk_shapes_raw) != len(array_shape): - raise ValueError( - f"chunk_shapes has {len(chunk_shapes_raw)} dimensions but array shape " - f"has {len(array_shape)} dimensions" - ) - decoded: list[list[int]] = [] - for dim_spec, extent in zip(chunk_shapes_raw, array_shape, strict=True): - decoded.append(_decode_dim_spec(dim_spec, array_extent=extent)) - validate_rectilinear_edges(decoded, array_shape) - return ChunkGrid.from_rectilinear(decoded, array_shape=array_shape) - - raise ValueError(f"Unknown chunk grid name: {name_parsed!r}") - - -def serialize_chunk_grid(grid: ChunkGrid, name: ChunkGridName) -> dict[str, JSON]: - """Serialize a ChunkGrid to a metadata dict 
using the given format name. - - The format choice ("regular" vs "rectilinear") belongs to the metadata layer, - not the grid itself. This function is called by ArrayV3Metadata.to_dict(). - """ - if name == "regular": - if not grid.is_regular: - raise ValueError( - "Cannot serialize a non-regular chunk grid as 'regular'. Use 'rectilinear' instead." - ) - # The regular grid spec encodes only chunk_shape, not per-axis edges, - # so zero-extent dimensions are valid (they simply produce zero chunks). - return { - "name": "regular", - "configuration": {"chunk_shape": tuple(grid.chunk_shape)}, - } - - if name == "rectilinear": - # Zero-extent dimensions cannot be represented as rectilinear because - # the spec requires at least one positive-integer edge length per axis. - # This is intentionally asymmetric with the regular grid, which encodes - # only chunk_shape (no per-axis edges) and thus handles zero-extent - # arrays without issue. - if any(d.extent == 0 for d in grid.dimensions): - raise ValueError( - "Cannot serialize a zero-extent grid as 'rectilinear': " - "the spec requires all edge lengths to be positive integers." 
- ) - chunk_shapes: list[RectilinearDimSpec] = [] - for dim in grid.dimensions: - if isinstance(dim, FixedDimension): - chunk_shapes.append(_serialize_fixed_dim(dim)) - elif isinstance(dim, VaryingDimension): - chunk_shapes.append(_serialize_varying_dim(dim)) - else: - raise TypeError(f"Unexpected dimension type: {type(dim)}") - return { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": chunk_shapes}, - } - - raise ValueError(f"Unknown chunk grid name for serialization: {name!r}") - - -def _infer_chunk_grid_name( - data: dict[str, JSON] | ChunkGrid | NamedConfig[str, Any], - grid: ChunkGrid, -) -> ChunkGridName: - """Extract or infer the chunk grid serialization name from the input.""" - if isinstance(data, dict): - name, _ = parse_named_configuration(data) - return cast("ChunkGridName", name) - return "regular" if grid.is_regular else "rectilinear" - def _guess_chunks( shape: tuple[int, ...] | int, diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index db38541c70..4391ea40f3 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -360,9 +360,6 @@ def resolve_chunks( Flat inputs like ``(10, 10)`` or a scalar ``int`` produce a ``RegularChunkGrid`` after normalization via :func:`~zarr.core.chunk_grids.normalize_chunks`. - See Also - -------- - parse_chunk_grid : Deserialize a chunk grid from stored JSON metadata. 
""" from zarr.core.chunk_grids import _is_rectilinear_chunks, normalize_chunks diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 9fdc7ae847..564677a41b 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -8,7 +8,6 @@ from __future__ import annotations -import json from typing import TYPE_CHECKING, Any import numpy as np @@ -20,11 +19,7 @@ ChunkSpec, FixedDimension, VaryingDimension, - _decode_dim_spec, - _infer_chunk_grid_name, _is_rectilinear_chunks, - parse_chunk_grid, - serialize_chunk_grid, ) from zarr.core.common import compress_rle, expand_rle from zarr.core.metadata.v3 import RectilinearChunkGrid @@ -106,21 +101,6 @@ def test_create_array_blocked(self) -> None: with pytest.raises(ValueError, match="experimental and disabled by default"): zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") - def test_parse_chunk_grid_blocked(self) -> None: - """Opening a rectilinear array from metadata is also gated.""" - with zarr.config.set({"array.rectilinear_chunks": False}): - with pytest.raises(ValueError, match="experimental and disabled by default"): - parse_chunk_grid( - { - "name": "rectilinear", - "configuration": { - "kind": "inline", - "chunk_shapes": [[10, 20, 30], [50, 50]], - }, - }, - array_shape=(60, 100), - ) - class TestRegularChunkGridCompat: """The deprecated RegularChunkGrid shim should work for common patterns.""" @@ -463,51 +443,6 @@ def test_rle_pair_with_float_count(self) -> None: assert result == [10, 10, 10] -class TestDecodeDimSpec: - """Edge cases for _decode_dim_spec: floats, empty lists, negatives, missing extent.""" - - def test_bare_integer(self) -> None: - assert _decode_dim_spec(10, array_extent=25) == [10, 10, 10] - - def test_bare_integer_exact_fit(self) -> None: - assert _decode_dim_spec(5, array_extent=10) == [5, 5] - - def test_bare_integer_no_extent_raises(self) -> None: - with pytest.raises(ValueError, match="requires array shape"): - 
_decode_dim_spec(10, array_extent=None) - - def test_bare_integer_zero_raises(self) -> None: - with pytest.raises(ValueError, match="must be > 0"): - _decode_dim_spec(0, array_extent=10) - - def test_bare_integer_negative_raises(self) -> None: - with pytest.raises(ValueError, match="must be > 0"): - _decode_dim_spec(-5, array_extent=10) - - def test_bare_float_raises(self) -> None: - """A bare float (not in a list) is not int or list — should raise.""" - with pytest.raises(ValueError, match="Invalid chunk_shapes entry"): - _decode_dim_spec(10.0, array_extent=10) - - def test_explicit_integer_list(self) -> None: - assert _decode_dim_spec([10, 20, 30]) == [10, 20, 30] - - def test_empty_list(self) -> None: - """An empty list has no sub-lists, so it returns an empty explicit list.""" - assert _decode_dim_spec([]) == [] - - def test_list_with_rle(self) -> None: - assert _decode_dim_spec([[5, 3], 10]) == [5, 5, 5, 10] - - def test_string_raises(self) -> None: - with pytest.raises(ValueError, match="Invalid chunk_shapes entry"): - _decode_dim_spec("auto", array_extent=10) - - def test_none_raises(self) -> None: - with pytest.raises(ValueError, match="Invalid chunk_shapes entry"): - _decode_dim_spec(None, array_extent=10) - - class TestIsRectilinearChunks: """Edge cases for _is_rectilinear_chunks.""" @@ -547,316 +482,6 @@ def test_float(self) -> None: assert _is_rectilinear_chunks(3.14) is False -class TestInferChunkGridName: - """Edge cases for _infer_chunk_grid_name.""" - - def test_regular_grid(self) -> None: - g = ChunkGrid.from_regular((100,), (10,)) - assert _infer_chunk_grid_name(g, g) == "regular" - - @pytest.fixture(autouse=True) - def _enable_rectilinear(self) -> Any: - with zarr.config.set({"array.rectilinear_chunks": True}): - yield - - def test_rectilinear_grid(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) - assert _infer_chunk_grid_name(g, g) == "rectilinear" - - def test_dict_with_regular_name(self) -> None: - g = 
ChunkGrid.from_regular((100,), (10,)) - d: dict[str, Any] = {"name": "regular", "configuration": {"chunk_shape": [10]}} - assert _infer_chunk_grid_name(d, g) == "regular" - - def test_dict_with_rectilinear_name(self) -> None: - g = ChunkGrid.from_regular((100,), (10,)) - d: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [10]}, - } - assert _infer_chunk_grid_name(d, g) == "rectilinear" - - -class TestSerialization: - def test_regular_roundtrip(self) -> None: - g = ChunkGrid.from_regular((100, 200), (10, 20)) - d = serialize_chunk_grid(g, "regular") - assert d["name"] == "regular" - config = d["configuration"] - assert isinstance(config, dict) - assert tuple(config["chunk_shape"]) == (10, 20) - g2 = parse_chunk_grid(d, (100, 200)) - assert g2.is_regular - assert g2.chunk_shape == (10, 20) - - def test_rectilinear_roundtrip(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) - d = serialize_chunk_grid(g, "rectilinear") - assert d["name"] == "rectilinear" - g2 = parse_chunk_grid(d, (60, 100)) - assert not g2.is_regular - # Verify the reconstructed grid has same dimensions - spec0 = g2[(0, 0)] - assert spec0 is not None - assert spec0.shape == (10, 25) - spec1 = g2[(1, 0)] - assert spec1 is not None - assert spec1.shape == (20, 25) - - def test_rectilinear_rle_serialization(self) -> None: - """RLE should be used when it actually compresses.""" - g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]], array_shape=(1000, 100)) - # All uniform, but name is chosen by the metadata layer, not the grid. - # Serializing as "regular" works because is_regular is True. 
- d = serialize_chunk_grid(g, "regular") - assert d["name"] == "regular" - - def test_rectilinear_uniform_stays_rectilinear(self) -> None: - """A rectilinear grid with uniform edges stays rectilinear if the name says so.""" - g = ChunkGrid.from_rectilinear([[100] * 10, [25, 25, 25, 25]], array_shape=(1000, 100)) - d = serialize_chunk_grid(g, "rectilinear") - assert d["name"] == "rectilinear" - - def test_rectilinear_rle_with_varying(self) -> None: - g = ChunkGrid.from_rectilinear( - [[100, 100, 100, 50], [25, 25, 25, 25]], array_shape=(350, 100) - ) - d = serialize_chunk_grid(g, "rectilinear") - assert d["name"] == "rectilinear" - config = d["configuration"] - assert isinstance(config, dict) - chunk_shapes = config["chunk_shapes"] - assert isinstance(chunk_shapes, list) - assert chunk_shapes[0] == [[100, 3], 50] - - def test_json_roundtrip(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - d = serialize_chunk_grid(g, "rectilinear") - json_str = json.dumps(d) - d2 = json.loads(json_str) - g2 = parse_chunk_grid(d2, (60, 100)) - assert g2.grid_shape == (3, 2) - - def test_bare_int_roundtrip(self) -> None: - """Bare-integer shorthand in chunk_shapes round-trips as bare int, not [int].""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [10, [20, 30]]}, - } - meta = RectilinearChunkGrid.from_dict(data) # type: ignore[arg-type] - out = meta.to_dict() - # Dim 0 was bare int — should stay bare int - assert out["configuration"]["chunk_shapes"][0] == 10 - # Dim 1 was explicit list — should stay list - assert out["configuration"]["chunk_shapes"][1] == [20, 30] - - def test_unknown_name_raises(self) -> None: - with pytest.raises(ValueError, match="Unknown chunk grid"): - parse_chunk_grid({"name": "hexagonal", "configuration": {}}, (10,)) - - def test_serialize_non_regular_as_regular_raises(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], 
array_shape=(60, 100)) - with pytest.raises(ValueError, match="Cannot serialize a non-regular chunk grid"): - serialize_chunk_grid(g, "regular") - - def test_serialize_unknown_name_raises(self) -> None: - g = ChunkGrid.from_regular((100,), (10,)) - with pytest.raises(ValueError, match="Unknown chunk grid name for serialization"): - serialize_chunk_grid(g, "hexagonal") # type: ignore[arg-type] - - def test_zero_extent_rectilinear_raises(self) -> None: - """Zero-extent grids cannot be serialized as rectilinear (spec requires positive edges).""" - grid = ChunkGrid.from_regular((0,), (10,)) - with pytest.raises(ValueError, match="zero-extent"): - serialize_chunk_grid(grid, "rectilinear") - - -class TestSpecCompliance: - """Tests for compliance with the rectilinear chunk grid extension spec - (zarr-extensions PR #25).""" - - def test_kind_inline_required_on_deserialize(self) -> None: - """Deserialization requires kind: 'inline'.""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"chunk_shapes": [[10, 20], [15, 15]]}, - } - with pytest.raises(ValueError, match="requires a 'kind' field"): - parse_chunk_grid(data, (30, 30)) - - def test_kind_unknown_rejected(self) -> None: - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "reference", "chunk_shapes": [[10, 20], [15, 15]]}, - } - with pytest.raises(ValueError, match="Unsupported rectilinear chunk grid kind"): - parse_chunk_grid(data, (30, 30)) - - def test_kind_inline_in_serialized_output(self) -> None: - """Serialization includes kind: 'inline'.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) - d = serialize_chunk_grid(g, "rectilinear") - config = d["configuration"] - assert isinstance(config, dict) - assert config["kind"] == "inline" - - def test_integer_shorthand_per_dimension(self) -> None: - """A bare integer in chunk_shapes means repeat until >= extent.""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": 
{"kind": "inline", "chunk_shapes": [4, [1, 2, 3]]}, - } - g = parse_chunk_grid(data, (6, 6)) - # 4 repeated: ceildiv(6, 4) = 2 → [4, 4] - assert _edges(g, 0) == (4, 4) - assert _edges(g, 1) == (1, 2, 3) - - def test_mixed_rle_and_bare_integers(self) -> None: - """An array can mix bare integers and [value, count] RLE pairs.""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[[1, 3], 3]]}, - } - # [[1, 3], 3] → [1, 1, 1, 3] → sum = 6 - g = parse_chunk_grid(data, (6,)) - assert _edges(g, 0) == (1, 1, 1, 3) - - def test_overflow_chunks_allowed(self) -> None: - """Edge sum >= extent is valid (overflow chunks permitted).""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[4, 4, 4]]}, - } - # sum = 12 > extent = 6 — allowed per spec - g = parse_chunk_grid(data, (6,)) - assert _edges(g, 0) == (4, 4, 4) - - def test_spec_example(self) -> None: - """The full example from the spec README.""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": { - "kind": "inline", - "chunk_shapes": [ - 4, # integer shorthand → [4, 4] - [1, 2, 3], # explicit list - [[4, 2]], # pure RLE → [4, 4] - [[1, 3], 3], # mixed RLE + bare → [1, 1, 1, 3] - [4, 4, 4], # explicit list with overflow - ], - }, - } - g = parse_chunk_grid(data, (6, 6, 6, 6, 6)) - assert _edges(g, 0) == (4, 4) - assert _edges(g, 1) == (1, 2, 3) - assert _edges(g, 2) == (4, 4) - assert _edges(g, 3) == (1, 1, 1, 3) - assert _edges(g, 4) == (4, 4, 4) - - -class TestParseChunkGridValidation: - def test_varying_extent_mismatch_raises(self) -> None: - from zarr.core.chunk_grids import parse_chunk_grid - - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - # VaryingDimension extent is 60, but array_shape says 100 - with pytest.raises(ValueError, match="extent"): - parse_chunk_grid(g, (100, 100)) - - def test_varying_extent_match_ok(self) -> None: - from 
zarr.core.chunk_grids import parse_chunk_grid - - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - # Matching extents should work fine - g2 = parse_chunk_grid(g, (60, 100)) - assert g2.dimensions[0].extent == 60 - - def test_rectilinear_extent_mismatch_raises(self) -> None: - """sum(edges) must match the array shape for each dimension.""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [25, 25]]}, - } - # sum([10,20,30])=60, sum([25,25])=50 — array shape (100, 50) mismatches dim 0 - with pytest.raises(ValueError, match="sum to 60 but array shape extent is 100"): - parse_chunk_grid(data, (100, 50)) - - def test_rectilinear_extent_mismatch_second_dim(self) -> None: - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[50, 50], [10, 20]]}, - } - # dim 0 OK (100), dim 1: sum([10,20])=30 != 50 - with pytest.raises(ValueError, match="dimension 1 sum to 30 but array shape extent is 50"): - parse_chunk_grid(data, (100, 50)) - - def test_rectilinear_extent_match_passes(self) -> None: - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [25, 25]]}, - } - g = parse_chunk_grid(data, (60, 50)) - assert g.grid_shape == (3, 2) - - def test_rectilinear_ndim_mismatch_raises(self) -> None: - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[10, 20], [25, 25]]}, - } - with pytest.raises(ValueError, match="2 dimensions but array shape has 3"): - parse_chunk_grid(data, (30, 50, 100)) - - def test_rectilinear_rle_extent_validated(self) -> None: - """RLE-encoded edges are expanded before validation.""" - data: dict[str, Any] = { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[[10, 5]], [[25, 2]]]}, - } - # sum = 50 and 50 — match (50, 50) - g = parse_chunk_grid(data, (50, 50)) 
- assert g.grid_shape == (5, 2) - # mismatch - with pytest.raises(ValueError, match="sum to 50 but array shape extent is 100"): - parse_chunk_grid(data, (100, 50)) - - def test_varying_dimension_extent_mismatch_on_chunkgrid_input(self) -> None: - """When passing a ChunkGrid directly, VaryingDimension extent is validated. - - After resize, extent >= array_shape is allowed (last chunk extends past - boundary). But extent < array_shape is still an error. - """ - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) - with pytest.raises(ValueError, match="less than"): - parse_chunk_grid(g, (100, 50)) - - -class TestRectilinearRoundTripPreservesCodecShape: - def test_boundary_chunk_codec_size_preserved(self) -> None: - """Round-tripping through rectilinear should not change codec buffer sizes.""" - grid = ChunkGrid.from_regular((95,), (10,)) - original_codec_size = grid.dimensions[0].chunk_size(9) - assert original_codec_size == 10 - - serialized = serialize_chunk_grid(grid, "rectilinear") - parsed = parse_chunk_grid(serialized, (95,)) - - roundtripped_codec_size = parsed.dimensions[0].chunk_size(9) - assert roundtripped_codec_size == original_codec_size, ( - f"codec buffer changed from {original_codec_size} to " - f"{roundtripped_codec_size} after round-trip" - ) - - def test_single_chunk_boundary_codec_size_preserved(self) -> None: - """shape=7, chunk_size=10: single chunk's codec buffer should stay 10.""" - grid = ChunkGrid.from_regular((7,), (10,)) - assert grid.dimensions[0].chunk_size(0) == 10 - - serialized = serialize_chunk_grid(grid, "rectilinear") - parsed = parse_chunk_grid(serialized, (7,)) - - assert parsed.dimensions[0].chunk_size(0) == 10 - - class TestRectilinearIndexing: """Test that the indexing pipeline works with VaryingDimension.""" @@ -944,7 +569,7 @@ def test_create_regular_array(self, tmp_path: Path) -> None: def test_create_rectilinear_array(self, tmp_path: Path) -> None: """Create an array with a rectilinear chunk grid 
via metadata.""" - from zarr.core.metadata.v3 import ArrayV3Metadata, RectilinearChunkGrid + from zarr.core.metadata.v3 import ArrayV3Metadata arr = zarr.create_array( store=tmp_path / "rect.zarr", @@ -959,9 +584,13 @@ def test_create_rectilinear_array(self, tmp_path: Path) -> None: def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: """Verify metadata round-trips through JSON.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - d = serialize_chunk_grid(g, "rectilinear") - g2 = parse_chunk_grid(d, (60, 100)) + from zarr.core.metadata.v3 import RectilinearChunkGrid as RectilinearChunkGridMeta + + meta = RectilinearChunkGridMeta(chunk_shapes=((10, 20, 30), (50, 50))) + d = meta.to_dict() + meta2 = RectilinearChunkGridMeta.from_dict(d) + g = ChunkGrid.from_rectilinear(list(meta.chunk_shapes), array_shape=(60, 100)) + g2 = ChunkGrid.from_rectilinear(list(meta2.chunk_shapes), array_shape=(60, 100)) assert g2.grid_shape == g.grid_shape for coord in g.all_chunk_coords(): orig_spec = g[coord] @@ -989,7 +618,7 @@ def test_chunk_grid_serializes_regular(self, tmp_path: Path) -> None: def test_chunk_grid_serializes_rectilinear(self, tmp_path: Path) -> None: """Rectilinear arrays serialize with name='rectilinear'.""" - from zarr.core.metadata.v3 import ArrayV3Metadata, RectilinearChunkGrid + from zarr.core.metadata.v3 import ArrayV3Metadata arr = zarr.create_array( store=tmp_path / "rect.zarr", @@ -1006,7 +635,7 @@ def test_chunk_grid_serializes_rectilinear(self, tmp_path: Path) -> None: def test_chunk_grid_name_roundtrip_preserves_rectilinear(self, tmp_path: Path) -> None: """A rectilinear grid with uniform edges stays 'rectilinear' through to_dict/from_dict.""" - from zarr.core.metadata.v3 import ArrayV3Metadata, RectilinearChunkGrid + from zarr.core.metadata.v3 import ArrayV3Metadata meta_dict: dict[str, Any] = { "zarr_format": 3, @@ -1086,7 +715,6 @@ def test_sharding_accepts_rectilinear_outer_grid(self) -> None: 
"""ShardingCodec.validate should not reject rectilinear outer grids.""" from zarr.codecs.sharding import ShardingCodec from zarr.core.dtype import Float32 - from zarr.core.metadata.v3 import RectilinearChunkGrid codec = ShardingCodec(chunk_shape=(5, 5)) grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 30), (50, 50))) @@ -1142,8 +770,8 @@ def test_chunk_grid_boundary_shape(self) -> None: # -- Boundary FixedDimension in rectilinear serialization -- - def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: - """A rectilinear grid with a boundary FixedDimension preserves extent.""" + def test_boundary_fixed_dim_mixed_grid(self) -> None: + """A grid mixing VaryingDimension and boundary FixedDimension works correctly.""" g = ChunkGrid( dimensions=( VaryingDimension([10, 20, 30], extent=60), @@ -1151,46 +779,26 @@ def test_boundary_fixed_dim_rectilinear_roundtrip(self) -> None: ) ) assert g.grid_shape == (3, 10) + # Boundary chunk along dim 1 has clipped data size + spec = g[(0, 9)] + assert spec is not None + assert spec.shape == (10, 5) + assert spec.codec_shape == (10, 10) - d = serialize_chunk_grid(g, "rectilinear") - assert d["name"] == "rectilinear" - # Second dim serializes as bare integer (per rectilinear spec, - # a bare integer repeats until sum >= extent, preserving full - # codec buffer size for boundary chunks). 
- config = d["configuration"] - assert isinstance(config, dict) - chunk_shapes = config["chunk_shapes"] - assert isinstance(chunk_shapes, list) - assert chunk_shapes[1] == 10 # bare integer shorthand - - g2 = parse_chunk_grid(d, (60, 95)) - assert g2.grid_shape == g.grid_shape - # Round-tripped grid should have correct extent - for coord in g.all_chunk_coords(): - orig = g[coord] - new = g2[coord] - assert orig is not None - assert new is not None - assert orig.shape == new.shape - - def test_exact_extent_fixed_dim_rectilinear_roundtrip(self) -> None: - """No boundary: extent == size * nchunks round-trips cleanly.""" + def test_exact_extent_fixed_dim_mixed_grid(self) -> None: + """No boundary: extent == size * nchunks.""" g = ChunkGrid( dimensions=( VaryingDimension([10, 20], extent=30), FixedDimension(size=25, extent=100), ) ) - d = serialize_chunk_grid(g, "rectilinear") - g2 = parse_chunk_grid(d, (30, 100)) - assert g2.grid_shape == g.grid_shape - # All chunks should be uniform - for coord in g.all_chunk_coords(): - orig = g[coord] - new = g2[coord] - assert orig is not None - assert new is not None - assert orig.shape == new.shape + assert g.grid_shape == (2, 4) + # All chunks along dim 1 have full size + for i in range(4): + spec = g[(0, i)] + assert spec is not None + assert spec.shape[1] == 25 # -- Zero-size and zero-extent -- @@ -1243,33 +851,26 @@ def test_0d_grid_nchunks(self) -> None: g = ChunkGrid.from_regular((), ()) assert g.get_nchunks() == 1 - # -- parse_chunk_grid edge cases -- - - def test_parse_chunk_grid_preserves_varying_extent(self) -> None: - """parse_chunk_grid does not overwrite VaryingDimension extent.""" - from zarr.core.chunk_grids import parse_chunk_grid + # -- with_extent edge cases -- + def test_with_extent_preserves_varying_extent(self) -> None: + """with_extent on VaryingDimension preserves extent when unchanged.""" g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - # VaryingDimension extent is 60 (sum 
of edges) assert isinstance(g.dimensions[0], VaryingDimension) assert g.dimensions[0].extent == 60 - # Re-binding with a different array shape should not change VaryingDimension - g2 = parse_chunk_grid(g, (60, 100)) - assert isinstance(g2.dimensions[0], VaryingDimension) - assert g2.dimensions[0].extent == 60 # unchanged - - def test_parse_chunk_grid_rebinds_fixed_extent(self) -> None: - """parse_chunk_grid updates FixedDimension extent from array shape.""" - from zarr.core.chunk_grids import parse_chunk_grid + d2 = g.dimensions[0].with_extent(60) + assert isinstance(d2, VaryingDimension) + assert d2.extent == 60 + def test_with_extent_rebinds_fixed_extent(self) -> None: + """with_extent on FixedDimension updates the extent.""" g = ChunkGrid.from_regular((100, 200), (10, 20)) assert g.dimensions[0].extent == 100 - g2 = parse_chunk_grid(g, (50, 100)) - assert isinstance(g2.dimensions[0], FixedDimension) - assert g2.dimensions[0].extent == 50 # re-bound - assert g2.grid_shape == (5, 5) + d2 = g.dimensions[0].with_extent(50) + assert isinstance(d2, FixedDimension) + assert d2.extent == 50 # -- ChunkGrid.__getitem__ validation -- @@ -1342,20 +943,6 @@ def test_chunk_spec_multidim_boundary(self) -> None: assert spec.shape == (10, 5) assert spec.is_boundary # second dim differs - # -- Rectilinear with zero-nchunks FixedDimension in serialize_chunk_grid -- - - def test_zero_nchunks_fixed_dim_in_rectilinear_serialize_raises(self) -> None: - """A rectilinear grid with a 0-extent dimension cannot be serialized.""" - g = ChunkGrid( - dimensions=( - VaryingDimension([10, 20], extent=30), - FixedDimension(size=10, extent=0), - ) - ) - assert g.grid_shape == (2, 0) - with pytest.raises(ValueError, match="zero-extent"): - serialize_chunk_grid(g, "rectilinear") - # -- VaryingDimension data_size -- def test_varying_dim_data_size_equals_chunk_size(self) -> None: @@ -1445,7 +1032,6 @@ def test_sharding_rejects_non_divisible_rectilinear(self) -> None: """Rectilinear shard sizes not 
divisible by inner chunk_shape should raise.""" from zarr.codecs.sharding import ShardingCodec from zarr.core.dtype import Float32 - from zarr.core.metadata.v3 import RectilinearChunkGrid codec = ShardingCodec(chunk_shape=(5, 5)) # 17 is not divisible by 5 @@ -1462,7 +1048,6 @@ def test_sharding_accepts_divisible_rectilinear(self) -> None: """Rectilinear shard sizes all divisible by inner chunk_shape should pass.""" from zarr.codecs.sharding import ShardingCodec from zarr.core.dtype import Float32 - from zarr.core.metadata.v3 import RectilinearChunkGrid codec = ShardingCodec(chunk_shape=(5, 5)) grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 30), (50, 50))) @@ -2328,15 +1913,6 @@ async def test_append_small_data(self) -> None: result = await arr.getitem((slice(20, 23), slice(None))) np.testing.assert_array_equal(result, small) - def test_parse_chunk_grid_regular_from_dict(self) -> None: - """parse_chunk_grid constructs a regular grid from a metadata dict.""" - d: dict[str, Any] = {"name": "regular", "configuration": {"chunk_shape": [10, 20]}} - g = parse_chunk_grid(d, (100, 200)) - assert g.is_regular - assert g.chunk_shape == (10, 20) - assert g.grid_shape == (10, 10) - assert g.get_nchunks() == 100 - class TestVaryingDimensionBoundary: """VaryingDimension with extent < sum(edges), mirroring how FixedDimension @@ -2449,14 +2025,17 @@ def test_uniform_edges_with_overflow_collapses_to_fixed(self) -> None: assert g.dimensions[0].nchunks == 4 def test_serialization_roundtrip_overflow(self) -> None: - """Overflow chunks survive serialization round-trip.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) - serialized = serialize_chunk_grid(g, "rectilinear") - assert serialized == { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30, 40]]}, - } - g2 = parse_chunk_grid(serialized, (50,)) + """Overflow chunks survive metadata serialization round-trip.""" + from zarr.core.metadata.v3 import 
RectilinearChunkGrid as RectilinearChunkGridMeta + + meta = RectilinearChunkGridMeta(chunk_shapes=((10, 20, 30, 40),)) + d = meta.to_dict() + assert d["name"] == "rectilinear" + dim0 = d["configuration"]["chunk_shapes"][0] + assert isinstance(dim0, (list, tuple)) + assert list(dim0) == [10, 20, 30, 40] + meta2 = RectilinearChunkGridMeta.from_dict(d) + g2 = ChunkGrid.from_rectilinear(list(meta2.chunk_shapes), array_shape=(50,)) assert g2.dimensions[0].ngridcells == 4 assert g2.dimensions[0].nchunks == 3 assert g2.chunk_sizes == ((10, 20, 20),) @@ -2629,12 +2208,11 @@ def test_shrink_chunk_spec(self) -> None: assert spec.shape == (15,) assert spec.is_boundary is True - def test_parse_chunk_grid_rebinds_extent(self) -> None: - """parse_chunk_grid re-binds VaryingDimension extent to array shape.""" + def test_with_extent_rebinds_varying_extent(self) -> None: + """with_extent re-binds VaryingDimension extent.""" g = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) - # sum(edges)=60, array_shape=50 → re-bind extent - g2 = parse_chunk_grid(g, (50,)) - dim = g2.dimensions[0] + # sum(edges)=60, new extent=50 → re-bind + dim = g.dimensions[0].with_extent(50) assert isinstance(dim, VaryingDimension) assert dim.extent == 50 assert dim.data_size(2) == 20 # 50 - 30 = 20 From 2c06fb2d7cf240da19196594d3a7d4a3d06a0530 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 30 Mar 2026 16:40:35 +0200 Subject: [PATCH 109/118] chore: simplify sharding codec validation against varying chunk grid metadata (#7) * chore: simplify sharding codec validation against varying chunk grid metadata * test: restore test strength --- src/zarr/codecs/sharding.py | 32 +++++++++--------------------- tests/test_codecs/test_sharding.py | 7 +++---- 2 files changed, 12 insertions(+), 27 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 6b8ac6de87..64ed789c59 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -394,35 +394,21 @@ 
def validate( "The shard's `chunk_shape` and array's `shape` need to have the same number of dimensions." ) if isinstance(chunk_grid, RegularChunkGrid): - if not all( - s % c == 0 - for s, c in zip( - chunk_grid.chunk_shape, - self.chunk_shape, - strict=False, - ) - ): - raise ValueError( - f"The array's `chunk_shape` (got {chunk_grid.chunk_shape}) " - f"needs to be divisible by the shard's inner `chunk_shape` (got {self.chunk_shape})." - ) + edges_per_dim: tuple[tuple[int, ...], ...] = tuple((s,) for s in chunk_grid.chunk_shape) elif isinstance(chunk_grid, RectilinearChunkGrid): - # For rectilinear grids, every unique edge length per dimension - # must be divisible by the corresponding inner chunk size. - for i, (edges, inner) in enumerate( - zip(chunk_grid.chunk_shapes, self.chunk_shape, strict=False) - ): - for edge in set(edges): - if edge % inner != 0: - raise ValueError( - f"Chunk edge length {edge} in dimension {i} is not " - f"divisible by the shard's inner chunk size {inner}." - ) + edges_per_dim = chunk_grid.chunk_shapes else: raise TypeError( f"Sharding is only compatible with regular and rectilinear chunk grids, " f"got {type(chunk_grid)}" ) + for i, (edges, inner) in enumerate(zip(edges_per_dim, self.chunk_shape, strict=False)): + for edge in set(edges): + if edge % inner != 0: + raise ValueError( + f"Chunk edge length {edge} in dimension {i} is not " + f"divisible by the shard's inner chunk size {inner}." 
+ ) async def _decode_single( self, diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index d7cbeb5bdb..43d03caf11 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,5 +1,4 @@ import pickle -import re from typing import Any import numpy as np @@ -489,9 +488,9 @@ def test_invalid_metadata(store: Store) -> None: def test_invalid_shard_shape() -> None: with pytest.raises( ValueError, - match=re.escape( - "The array's `chunk_shape` (got (16, 16)) needs to be divisible " - "by the shard's inner `chunk_shape` (got (9,))." + match=( + f"Chunk edge length {16} in dimension {0} is not " + f"divisible by the shard's inner chunk size {9}\\." ), ): zarr.create_array( From 8965d09c5abaa3580a5ceba9dc8cf6d1f88f9a97 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 30 Mar 2026 16:45:05 +0200 Subject: [PATCH 110/118] refactor: allow regular-style chunk grid declaration for rectilinear chunk grid (#8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: allow regular-style chunk grid declaration for rectilinear chunk grid The rectilinear chunk grid spec allows bare integers per dimension (meaning "regular step size"), distinct from explicit single-element edge lists. This commit widens `RectilinearChunkGrid.chunk_shapes` to `tuple[int | tuple[int, ...], ...]` so bare ints are preserved for faithful JSON round-tripping. 
Additionally: - unifies `_validate_chunk_shapes` to handle both regular and rectilinear validation; `_parse_chunk_shape` now delegates to it - adds `from_sizes` method to `ChunkGrid`, accepting `int | Sequence[int]` per dimension - removes `from_regular` and `from_rectilinear` methods from `ChunkGrid` - removes `parse_chunk_grid` from `chunk_grids.py` (JSON → ChunkGrid shortcut that bypassed the metadata layer) - removes `serialize_chunk_grid`, `_infer_chunk_grid_name`, and serialization helpers from `chunk_grids.py` (ChunkGrid never needs to be serialized; metadata DTOs handle it) - renames `parse_chunk_grid` in `v3.py` to `parse_chunk_grid_metadata` to disambiguate - moves the rectilinear feature flag to `RectilinearChunkGrid.__post_init__` - simplifies sharding codec validation into a single divisibility check for both regular and rectilinear grids - updates `validate_rectilinear_edges` to skip bare-int dimensions - refactors chunk grid tests to functional style with parametrization - adds docstrings to all test functions * chore: remove .claude * refactor: rename chunk_grid parsing function --------- Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com> --- src/zarr/codecs/sharding.py | 8 +- src/zarr/core/chunk_grids.py | 108 +- src/zarr/core/common.py | 12 +- src/zarr/core/metadata/v2.py | 2 +- src/zarr/core/metadata/v3.py | 138 +- tests/test_unified_chunk_grid.py | 4604 +++++++++++++++++------------- 6 files changed, 2694 insertions(+), 2178 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 64ed789c59..deb25a5bf6 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -423,7 +423,7 @@ async def _decode_single( indexer = BasicIndexer( tuple(slice(0, s) for s in shard_shape), shape=shard_shape, - chunk_grid=ChunkGrid.from_regular(shard_shape, chunk_shape), + chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape), ) # setup output array @@ -469,7 +469,7 @@ async def 
_decode_partial_single( indexer = get_indexer( selection, shape=shard_shape, - chunk_grid=ChunkGrid.from_regular(shard_shape, chunk_shape), + chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape), ) # setup output array @@ -544,7 +544,7 @@ async def _encode_single( BasicIndexer( tuple(slice(0, s) for s in shard_shape), shape=shard_shape, - chunk_grid=ChunkGrid.from_regular(shard_shape, chunk_shape), + chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape), ) ) @@ -586,7 +586,7 @@ async def _encode_partial_single( get_indexer( selection, shape=shard_shape, - chunk_grid=ChunkGrid.from_regular(shard_shape, chunk_shape), + chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape), ) ) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 03323fdd6d..ed91e37461 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -267,6 +267,10 @@ def is_boundary(self) -> bool: return self.shape != self.codec_shape +# A single dimension's rectilinear chunk spec: bare int (uniform shorthand), +# list of ints (explicit edges), or mixed RLE (e.g. [[10, 3], 5]). + + def _is_rectilinear_chunks(chunks: Any) -> TypeGuard[Sequence[Sequence[int]]]: """Check if chunks is a nested sequence (e.g. [[10, 20], [5, 5]]). 
@@ -319,92 +323,58 @@ def from_metadata(cls, metadata: ArrayMetadata) -> ChunkGrid: from zarr.core.metadata.v3 import RectilinearChunkGrid, RegularChunkGrid if isinstance(metadata, ArrayV2Metadata): - return cls.from_regular(metadata.shape, metadata.chunks) + return cls.from_sizes(metadata.shape, tuple(metadata.chunks)) chunk_grid_meta = metadata.chunk_grid if isinstance(chunk_grid_meta, RegularChunkGrid): - return cls.from_regular(metadata.shape, chunk_grid_meta.chunk_shape) + return cls.from_sizes(metadata.shape, tuple(chunk_grid_meta.chunk_shape)) elif isinstance(chunk_grid_meta, RectilinearChunkGrid): - return cls.from_rectilinear(chunk_grid_meta.chunk_shapes, metadata.shape) + return cls.from_sizes(metadata.shape, chunk_grid_meta.chunk_shapes) else: raise TypeError(f"Unknown chunk grid metadata type: {type(chunk_grid_meta)}") @classmethod - def from_regular(cls, array_shape: ShapeLike, chunk_shape: ShapeLike) -> ChunkGrid: - """Create a ChunkGrid where all dimensions are fixed (regular).""" - shape_parsed = parse_shapelike(array_shape) - chunks_parsed = parse_shapelike(chunk_shape) - if len(shape_parsed) != len(chunks_parsed): - raise ValueError( - f"array_shape and chunk_shape must have same ndim, " - f"got {len(shape_parsed)} vs {len(chunks_parsed)}" - ) - dims = tuple( - FixedDimension(size=c, extent=s) - for s, c in zip(shape_parsed, chunks_parsed, strict=True) - ) - return cls(dimensions=dims) - - @classmethod - def from_rectilinear( + def from_sizes( cls, - chunk_shapes: Sequence[Sequence[int]], array_shape: ShapeLike, + chunk_sizes: Sequence[int | Sequence[int]], ) -> ChunkGrid: - """Create a ChunkGrid with per-dimension edge lists. - - Each element of chunk_shapes is a sequence of chunk sizes for that dimension. - If all sizes in a dimension are identical *and* the extent equals - ``sum(edges)``, the dimension is stored as ``FixedDimension``. 
- Otherwise it is stored as ``VaryingDimension``, preserving the - explicit edge count (important when the last chunk extends past - the array boundary). + """Create a ChunkGrid from per-dimension chunk size specifications. Parameters ---------- - chunk_shapes - Per-dimension sequences of chunk edge lengths. array_shape - The array shape to bind as the extent per dimension. The last - chunk along each dimension may extend past the array boundary - (the edge is the codec buffer size; ``data_size`` clips to the - extent). - - Raises - ------ - ValueError - If the ``array.rectilinear_chunks`` config option is not enabled. + The array shape (one extent per dimension). + chunk_sizes + Per-dimension chunk sizes. Each element is either: + + - An ``int`` — regular (fixed) chunk size for that dimension. + - A ``Sequence[int]`` — explicit per-chunk edge lengths. If all + edges are identical and cover the extent, the dimension is + stored as ``FixedDimension``; otherwise as ``VaryingDimension``. """ - from zarr.core.config import config - - if not config.get("array.rectilinear_chunks"): - raise ValueError( - "Rectilinear chunk grids are experimental and disabled by default. 
" - "Enable them with: zarr.config.set({'array.rectilinear_chunks': True}) " - "or set the environment variable ZARR_ARRAY__RECTILINEAR_CHUNKS=True" - ) extents = parse_shapelike(array_shape) - if len(extents) != len(chunk_shapes): + if len(extents) != len(chunk_sizes): raise ValueError( - f"array_shape has {len(extents)} dimensions but chunk_shapes " - f"has {len(chunk_shapes)} dimensions" + f"array_shape has {len(extents)} dimensions but chunk_sizes " + f"has {len(chunk_sizes)} dimensions" ) dims: list[DimensionGrid] = [] - for edges, extent in zip(chunk_shapes, extents, strict=True): - edges_list = list(edges) - if not edges_list: - raise ValueError("Each dimension must have at least one chunk") - edge_sum = sum(edges_list) - # Collapse to FixedDimension when edges are uniform AND either - # extent == edge_sum (exact fit) or the number of edges matches - # ceildiv(extent, edge) (regular grid with boundary overflow). - if ( - edges_list[0] > 0 - and all(e == edges_list[0] for e in edges_list) - and (extent == edge_sum or len(edges_list) == ceildiv(extent, edges_list[0])) - ): - dims.append(FixedDimension(size=edges_list[0], extent=extent)) + for dim_spec, extent in zip(chunk_sizes, extents, strict=True): + if isinstance(dim_spec, int): + dims.append(FixedDimension(size=dim_spec, extent=extent)) else: - dims.append(VaryingDimension(edges_list, extent=extent)) + edges_list = list(dim_spec) + if not edges_list: + raise ValueError("Each dimension must have at least one chunk") + edge_sum = sum(edges_list) + if ( + edges_list[0] > 0 + and all(e == edges_list[0] for e in edges_list) + and (extent == edge_sum or len(edges_list) == ceildiv(extent, edges_list[0])) + ): + dims.append(FixedDimension(size=edges_list[0], extent=extent)) + else: + dims.append(VaryingDimension(edges_list, extent=extent)) return cls(dimensions=tuple(dims)) # -- Properties -- @@ -798,18 +768,18 @@ class RegularChunkGrid(metaclass=_RegularChunkGridMeta): """Deprecated compatibility shim. .. 
deprecated:: 3.1 - Use ``ChunkGrid.from_regular(array_shape, chunk_shape)`` instead. + Use ``ChunkGrid.from_sizes(array_shape, chunk_sizes)`` instead. Use ``grid.is_regular`` instead of ``isinstance(grid, RegularChunkGrid)``. """ def __new__(cls, *, chunk_shape: ShapeLike) -> ChunkGrid: # type: ignore[misc] warnings.warn( "RegularChunkGrid is deprecated. " - "Use ChunkGrid.from_regular(array_shape, chunk_shape) instead.", + "Use ChunkGrid.from_sizes(array_shape, chunk_sizes) instead.", DeprecationWarning, stacklevel=2, ) # Without array_shape we cannot bind extents, so use chunk_shape as extent. # This matches the old behavior where RegularChunkGrid was shape-unaware. parsed = parse_shapelike(chunk_shape) - return ChunkGrid.from_regular(array_shape=parsed, chunk_shape=parsed) + return ChunkGrid.from_sizes(array_shape=parsed, chunk_sizes=tuple(parsed)) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index cbc2bb2d37..a16257df7c 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -312,15 +312,17 @@ def validate_rectilinear_kind(kind: str | None) -> None: def validate_rectilinear_edges( - chunk_shapes: Sequence[Sequence[int]], array_shape: Sequence[int] + chunk_shapes: Sequence[int | Sequence[int]], array_shape: Sequence[int] ) -> None: """Validate that rectilinear chunk edges cover the array extent per dimension. - Raises ValueError if any dimension's edge sum is less than the corresponding - array extent. + Bare-int dimensions (regular step) always cover any extent, so they are + skipped. Explicit edge lists must sum to at least the array extent. 
""" - for i, (edges, extent) in enumerate(zip(chunk_shapes, array_shape, strict=True)): - edge_sum = sum(edges) + for i, (dim_spec, extent) in enumerate(zip(chunk_shapes, array_shape, strict=True)): + if isinstance(dim_spec, int): + continue + edge_sum = sum(dim_spec) if edge_sum < extent: raise ValueError( f"Rectilinear chunk edges for dimension {i} sum to {edge_sum} " diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 7357e2365b..8626d480a7 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -131,7 +131,7 @@ def chunk_grid(self) -> ChunkGrid: DeprecationWarning, stacklevel=2, ) - return ChunkGrid.from_regular(self.shape, self.chunks) + return ChunkGrid.from_sizes(self.shape, tuple(self.chunks)) @property def shards(self) -> tuple[int, ...] | None: diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 4391ea40f3..1d9018c856 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -193,43 +193,43 @@ class RectilinearChunkGridConfig(TypedDict): def _parse_chunk_shape(chunk_shape: Iterable[int]) -> tuple[int, ...]: - """Validate and normalize a regular chunk shape. All elements must be >= 1. + """Validate and normalize a regular chunk shape. - The spec defines chunk indexing via modular arithmetic with the chunk - edge length, so zero is not a valid edge length. + Delegates to ``_validate_chunk_shapes`` — a regular chunk shape is just + a sequence of bare ints (one per dimension), each of which must be >= 1. """ - as_tup = tuple(chunk_shape) - problems = [idx for idx, val in enumerate(as_tup) if val < 1] - if len(problems) == 1: - idx = problems[0] - raise ValueError(f"Invalid chunk shape {as_tup[idx]} at index {idx}.") - elif len(problems) > 1: - raise ValueError( - f"Invalid chunk shapes {[as_tup[idx] for idx in problems]} at indices {problems}." 
- ) - return as_tup + result = _validate_chunk_shapes(tuple(chunk_shape)) + # Regular grids only have bare ints — cast is safe after validation + return cast(tuple[int, ...], result) def _validate_chunk_shapes( - chunk_shapes: Sequence[Sequence[int]], -) -> tuple[tuple[int, ...], ...]: - """Validate expanded per-dimension edge lists. All edges must be >= 1. + chunk_shapes: Sequence[int | Sequence[int]], +) -> tuple[int | tuple[int, ...], ...]: + """Validate per-dimension chunk specifications. - Unlike regular grids, rectilinear grids list explicit per-chunk edges, - so zero-sized edges are not meaningful. + Each element is either a bare ``int`` (regular step size, must be >= 1) + or a sequence of explicit edge lengths (all must be >= 1, non-empty). """ - result: list[tuple[int, ...]] = [] - for dim_idx, edges in enumerate(chunk_shapes): - edges_tup = tuple(edges) - if not edges_tup: - raise ValueError(f"Dimension {dim_idx} has no chunk edges.") - bad = [i for i, e in enumerate(edges_tup) if e < 1] - if bad: - raise ValueError( - f"Dimension {dim_idx} has invalid edge lengths at indices {bad}: " - f"{[edges_tup[i] for i in bad]}" - ) - result.append(edges_tup) + result: list[int | tuple[int, ...]] = [] + for dim_idx, dim_spec in enumerate(chunk_shapes): + if isinstance(dim_spec, int): + if dim_spec < 1: + raise ValueError( + f"Dimension {dim_idx}: integer chunk edge length must be >= 1, got {dim_spec}" + ) + result.append(dim_spec) + else: + edges = tuple(dim_spec) + if not edges: + raise ValueError(f"Dimension {dim_idx} has no chunk edges.") + bad = [i for i, e in enumerate(edges) if e < 1] + if bad: + raise ValueError( + f"Dimension {dim_idx} has invalid edge lengths at indices {bad}: " + f"{[edges[i] for i in bad]}" + ) + result.append(edges) return tuple(result) @@ -268,16 +268,30 @@ def from_dict(cls, data: RegularChunkGridJSON) -> Self: # type: ignore[override class RectilinearChunkGrid(Metadata): """Metadata-only description of a rectilinear chunk grid. 
- Stores the per-dimension chunk edge lengths as expanded integer tuples - (no RLE). Serialization re-compresses to RLE via ``to_dict``. - This is what lives on ``ArrayV3Metadata.chunk_grid``. + Each element of ``chunk_shapes`` is either: + + - A bare ``int`` — a regular step size that repeats to cover the axis + (the spec's single-integer shorthand). + - A ``tuple[int, ...]`` — explicit per-chunk edge lengths (already + expanded from any RLE encoding). + + This distinction matters for faithful round-tripping: a bare int + serializes back as a bare int, while a single-element tuple serializes + as a list. """ - chunk_shapes: tuple[tuple[int, ...], ...] + chunk_shapes: tuple[int | tuple[int, ...], ...] def __post_init__(self) -> None: - chunk_shapes_parsed = _validate_chunk_shapes(self.chunk_shapes) - object.__setattr__(self, "chunk_shapes", chunk_shapes_parsed) + from zarr.core.config import config + + if not config.get("array.rectilinear_chunks"): + raise ValueError( + "Rectilinear chunk grids are experimental and disabled by default. 
" + "Enable them with: zarr.config.set({'array.rectilinear_chunks': True}) " + "or set the environment variable ZARR_ARRAY__RECTILINEAR_CHUNKS=True" + ) + object.__setattr__(self, "chunk_shapes", _validate_chunk_shapes(self.chunk_shapes)) @property def ndim(self) -> int: @@ -285,17 +299,17 @@ def ndim(self) -> int: def to_dict(self) -> RectilinearChunkGridJSON: # type: ignore[override] serialized_dims: list[RectilinearDimSpecJSON] = [] - for edges in self.chunk_shapes: - if len(edges) == 1: - # Bare int shorthand: single edge length repeated until sum >= extent - serialized_dims.append(edges[0]) + for dim_spec in self.chunk_shapes: + if isinstance(dim_spec, int): + # Bare int shorthand — serialize as-is + serialized_dims.append(dim_spec) else: - rle = compress_rle(edges) + rle = compress_rle(dim_spec) # Use RLE only if it's actually shorter - if len(rle) < len(edges): + if len(rle) < len(dim_spec): serialized_dims.append(rle) else: - serialized_dims.append(list(edges)) + serialized_dims.append(list(dim_spec)) return { "name": "rectilinear", "configuration": { @@ -309,17 +323,23 @@ def update_shape( ) -> RectilinearChunkGrid: """Return a new RectilinearChunkGrid with edges adjusted for *new_shape*. - Grow past existing edges: appends a chunk covering the additional extent. - Shrink or grow within existing edges: edges are kept as-is (the spec - allows trailing edges beyond the array extent). + - Bare-int dimensions stay as bare ints (they cover any extent). + - Explicit-edge dimensions: if the new extent exceeds the sum of + edges, a new chunk is appended to cover the additional extent. + Otherwise edges are kept as-is (the spec allows trailing edges + beyond the array extent). 
""" - new_chunk_shapes: list[tuple[int, ...]] = [] - for edges, new_ext in zip(self.chunk_shapes, new_shape, strict=True): - edge_sum = sum(edges) - if new_ext > edge_sum: - new_chunk_shapes.append((*edges, new_ext - edge_sum)) + new_chunk_shapes: list[int | tuple[int, ...]] = [] + for dim_spec, new_ext in zip(self.chunk_shapes, new_shape, strict=True): + if isinstance(dim_spec, int): + # Bare int covers any extent — no change needed + new_chunk_shapes.append(dim_spec) else: - new_chunk_shapes.append(edges) + edge_sum = sum(dim_spec) + if new_ext > edge_sum: + new_chunk_shapes.append((*dim_spec, new_ext - edge_sum)) + else: + new_chunk_shapes.append(dim_spec) return RectilinearChunkGrid(chunk_shapes=tuple(new_chunk_shapes)) @classmethod @@ -328,22 +348,19 @@ def from_dict(cls, data: RectilinearChunkGridJSON) -> Self: # type: ignore[over configuration = data["configuration"] validate_rectilinear_kind(configuration.get("kind")) raw_shapes = configuration["chunk_shapes"] - expanded: list[tuple[int, ...]] = [] + parsed: list[int | tuple[int, ...]] = [] for dim_spec in raw_shapes: if isinstance(dim_spec, int): - # Bare int shorthand — uniform edge length for this dimension. - # The DTO stores the single edge length; the behavioral ChunkGrid - # will repeat it to match the array extent when constructed. 
if dim_spec < 1: raise ValueError(f"Integer chunk edge length must be >= 1, got {dim_spec}") - expanded.append((dim_spec,)) + parsed.append(dim_spec) elif isinstance(dim_spec, list): - expanded.append(tuple(expand_rle(dim_spec))) + parsed.append(tuple(expand_rle(dim_spec))) else: raise TypeError( f"Invalid chunk_shapes entry: expected int or list, got {type(dim_spec)}" ) - return cls(chunk_shapes=tuple(expanded)) + return cls(chunk_shapes=tuple(parsed)) ChunkGridMetadata = RegularChunkGrid | RectilinearChunkGrid @@ -360,6 +377,9 @@ def resolve_chunks( Flat inputs like ``(10, 10)`` or a scalar ``int`` produce a ``RegularChunkGrid`` after normalization via :func:`~zarr.core.chunk_grids.normalize_chunks`. + See Also + -------- + parse_chunk_grid : Deserialize a chunk grid from stored JSON metadata. """ from zarr.core.chunk_grids import _is_rectilinear_chunks, normalize_chunks diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 564677a41b..dc7d967745 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -22,7 +22,13 @@ _is_rectilinear_chunks, ) from zarr.core.common import compress_rle, expand_rle -from zarr.core.metadata.v3 import RectilinearChunkGrid +from zarr.core.metadata.v3 import ( + RectilinearChunkGrid, + parse_chunk_grid, +) +from zarr.core.metadata.v3 import ( + RegularChunkGrid as RegularChunkGridMeta, +) from zarr.errors import BoundsCheckError from zarr.storage import MemoryStore @@ -48,1349 +54,2645 @@ def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: raise TypeError(f"Unexpected dimension type: {type(d)}") -class TestVaryingDimensionIndexToChunkBounds: - def test_index_at_extent_raises(self) -> None: - """index_to_chunk(extent) should raise since extent is out of bounds.""" - dim = VaryingDimension([10, 20, 30], extent=60) - with pytest.raises(IndexError, match="out of bounds"): - dim.index_to_chunk(60) - - def test_index_past_extent_raises(self) -> None: - dim = 
VaryingDimension([10, 20, 30], extent=60) - with pytest.raises(IndexError, match="out of bounds"): - dim.index_to_chunk(100) - - def test_last_valid_index_works(self) -> None: - dim = VaryingDimension([10, 20, 30], extent=60) - assert dim.index_to_chunk(59) == 2 - - -class TestFixedDimensionIndexToChunkBounds: - def test_negative_index_raises(self) -> None: - """index_to_chunk(-1) should raise, not silently return -1.""" - dim = FixedDimension(size=10, extent=95) - with pytest.raises(IndexError, match="Negative"): - dim.index_to_chunk(-1) - - def test_index_at_extent_raises(self) -> None: - dim = FixedDimension(size=10, extent=95) - with pytest.raises(IndexError, match="out of bounds"): - dim.index_to_chunk(95) - - def test_last_valid_index_works(self) -> None: - dim = FixedDimension(size=10, extent=95) - assert dim.index_to_chunk(94) == 9 - - -class TestRectilinearFeatureFlag: - """Test that rectilinear chunks are gated behind the config flag.""" - - def test_disabled_by_default(self) -> None: - with zarr.config.set({"array.rectilinear_chunks": False}): - with pytest.raises(ValueError, match="experimental and disabled by default"): - ChunkGrid.from_rectilinear([[10, 20], [25, 25]], array_shape=(30, 50)) - - def test_enabled_via_config(self) -> None: - with zarr.config.set({"array.rectilinear_chunks": True}): - g = ChunkGrid.from_rectilinear([[10, 20], [25, 25]], array_shape=(30, 50)) - assert g.ndim == 2 - - def test_create_array_blocked(self) -> None: - with zarr.config.set({"array.rectilinear_chunks": False}): - store = MemoryStore() - with pytest.raises(ValueError, match="experimental and disabled by default"): - zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") - - -class TestRegularChunkGridCompat: - """The deprecated RegularChunkGrid shim should work for common patterns.""" - - def test_construction_emits_deprecation_warning(self) -> None: - from zarr.core.chunk_grids import RegularChunkGrid - - with pytest.warns(DeprecationWarning, 
match="RegularChunkGrid is deprecated"): - grid = RegularChunkGrid(chunk_shape=(10, 20)) - assert isinstance(grid, ChunkGrid) - assert grid.is_regular - assert grid.chunk_shape == (10, 20) - - def test_isinstance_check(self) -> None: - from zarr.core.chunk_grids import RegularChunkGrid - - grid = ChunkGrid.from_regular((100, 200), (10, 20)) - assert isinstance(grid, RegularChunkGrid) - - def test_isinstance_false_for_rectilinear(self) -> None: - from zarr.core.chunk_grids import RegularChunkGrid - - grid = ChunkGrid.from_rectilinear([[10, 20], [25, 25]], array_shape=(30, 50)) - assert not isinstance(grid, RegularChunkGrid) - - def test_isinstance_false_for_unrelated_types(self) -> None: - from zarr.core.chunk_grids import RegularChunkGrid - - assert not isinstance("hello", RegularChunkGrid) - assert not isinstance(42, RegularChunkGrid) - - -class TestFixedDimension: - def test_basic(self) -> None: - d = FixedDimension(size=10, extent=100) - assert d.size == 10 - assert d.extent == 100 - assert d.index_to_chunk(0) == 0 - assert d.index_to_chunk(9) == 0 - assert d.index_to_chunk(10) == 1 - assert d.index_to_chunk(25) == 2 - assert d.chunk_offset(0) == 0 - assert d.chunk_offset(1) == 10 - assert d.chunk_offset(3) == 30 - # chunk_size is always uniform (codec buffer) - assert d.chunk_size(0) == 10 - assert d.chunk_size(9) == 10 - assert d.data_size(0) == 10 - assert d.data_size(9) == 10 - assert d.nchunks == 10 - - def test_boundary_data_size(self) -> None: - d = FixedDimension(size=10, extent=95) - assert d.nchunks == 10 - assert d.chunk_size(9) == 10 - assert d.data_size(9) == 5 - - def test_vectorized(self) -> None: - d = FixedDimension(size=10, extent=100) - indices = np.array([0, 5, 10, 15, 99]) - chunks = d.indices_to_chunks(indices) - np.testing.assert_array_equal(chunks, [0, 0, 1, 1, 9]) - - def test_negative_size_rejected(self) -> None: - with pytest.raises(ValueError, match="must be >= 0"): - FixedDimension(size=-1, extent=100) - - def 
test_negative_extent_rejected(self) -> None: - with pytest.raises(ValueError, match="must be >= 0"): - FixedDimension(size=10, extent=-1) - - def test_zero_size_allowed(self) -> None: - d = FixedDimension(size=0, extent=0) - assert d.size == 0 - assert d.nchunks == 0 - - # FixedDimension.chunk_offset/chunk_size/data_size do not bounds-check - # for performance (callers validate). OOB access is tested via - # ChunkGrid.__getitem__ which checks before delegating. - - -class TestVaryingDimension: - def test_basic(self) -> None: - d = VaryingDimension([10, 20, 30], extent=60) - assert d.edges == (10, 20, 30) - assert d.cumulative == (10, 30, 60) - assert d.nchunks == 3 - assert d.extent == 60 - - def test_index_to_chunk(self) -> None: - d = VaryingDimension([10, 20, 30], extent=60) - assert d.index_to_chunk(0) == 0 - assert d.index_to_chunk(9) == 0 - assert d.index_to_chunk(10) == 1 - assert d.index_to_chunk(29) == 1 - assert d.index_to_chunk(30) == 2 - assert d.index_to_chunk(59) == 2 - - def test_chunk_offset(self) -> None: - d = VaryingDimension([10, 20, 30], extent=60) - assert d.chunk_offset(0) == 0 - assert d.chunk_offset(1) == 10 - assert d.chunk_offset(2) == 30 - - def test_chunk_size(self) -> None: - d = VaryingDimension([10, 20, 30], extent=60) - assert d.chunk_size(0) == 10 - assert d.chunk_size(1) == 20 - assert d.chunk_size(2) == 30 - - def test_data_size(self) -> None: - d = VaryingDimension([10, 20, 30], extent=60) - assert d.data_size(0) == 10 - assert d.data_size(1) == 20 - assert d.data_size(2) == 30 - - def test_vectorized(self) -> None: - d = VaryingDimension([10, 20, 30], extent=60) - indices = np.array([0, 9, 10, 29, 30, 59]) - chunks = d.indices_to_chunks(indices) - np.testing.assert_array_equal(chunks, [0, 0, 1, 1, 2, 2]) - - def test_empty_rejected(self) -> None: - with pytest.raises(ValueError, match="must not be empty"): - VaryingDimension([], extent=0) - - def test_zero_edge_rejected(self) -> None: - with pytest.raises(ValueError, 
match="must be > 0"): - VaryingDimension([10, 0, 5], extent=15) - - -class TestChunkSpec: - def test_basic(self) -> None: - spec = ChunkSpec( - slices=(slice(0, 10), slice(0, 20)), - codec_shape=(10, 20), - ) - assert spec.shape == (10, 20) - assert not spec.is_boundary +# --------------------------------------------------------------------------- +# Dimension index_to_chunk bounds tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("dim", "index", "match"), + [ + (VaryingDimension([10, 20, 30], extent=60), 60, "out of bounds"), + (VaryingDimension([10, 20, 30], extent=60), 100, "out of bounds"), + (FixedDimension(size=10, extent=95), 95, "out of bounds"), + (FixedDimension(size=10, extent=95), -1, "Negative"), + ], + ids=[ + "varying-at-extent", + "varying-past-extent", + "fixed-at-extent", + "fixed-negative", + ], +) +def test_dimension_index_to_chunk_bounds( + dim: FixedDimension | VaryingDimension, index: int, match: str +) -> None: + """Out-of-bounds or negative indices raise IndexError for both dimension types""" + with pytest.raises(IndexError, match=match): + dim.index_to_chunk(index) + + +@pytest.mark.parametrize( + ("dim", "index", "expected"), + [ + (VaryingDimension([10, 20, 30], extent=60), 59, 2), + (FixedDimension(size=10, extent=95), 94, 9), + ], + ids=["varying-last-valid", "fixed-last-valid"], +) +def test_dimension_index_to_chunk_last_valid( + dim: FixedDimension | VaryingDimension, index: int, expected: int +) -> None: + """Last valid index maps to the correct chunk for both dimension types""" + assert dim.index_to_chunk(index) == expected - def test_boundary(self) -> None: - spec = ChunkSpec( - slices=(slice(90, 95), slice(0, 20)), - codec_shape=(10, 20), - ) - assert spec.shape == (5, 20) - assert spec.is_boundary - - -class TestChunkGridConstruction: - def test_from_regular(self) -> None: - g = ChunkGrid.from_regular((100, 200), (10, 20)) - assert g.is_regular - assert 
g.chunk_shape == (10, 20) - assert g.ndim == 2 - - def test_zero_dim(self) -> None: - """0-d arrays produce a ChunkGrid with no dimensions.""" - g = ChunkGrid.from_regular((), ()) - assert g.is_regular - assert g.chunk_shape == () - assert g.ndim == 0 - - def test_from_rectilinear(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) - assert not g.is_regular - assert g.ndim == 2 - with pytest.raises(ValueError, match="only available for regular"): - _ = g.chunk_shape - def test_rectilinear_with_uniform_dim(self) -> None: - """A rectilinear grid with all-same sizes in one dim stores it as Fixed.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) - assert isinstance(g.dimensions[0], VaryingDimension) - assert isinstance(g.dimensions[1], FixedDimension) - - def test_all_uniform_becomes_regular(self) -> None: - """If all dimensions have uniform sizes, the grid is regular.""" - g = ChunkGrid.from_rectilinear([[10, 10, 10], [25, 25]], array_shape=(30, 50)) - assert g.is_regular - assert g.chunk_shape == (10, 25) - - -class TestChunkGridQueries: - def test_regular_shape(self) -> None: - g = ChunkGrid.from_regular((100, 200), (10, 20)) - assert g.grid_shape == (10, 10) - - def test_regular_shape_boundary(self) -> None: - g = ChunkGrid.from_regular((95, 200), (10, 20)) - assert g.grid_shape == (10, 10) - - def test_rectilinear_shape(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) - assert g.grid_shape == (3, 4) - - def test_regular_getitem(self) -> None: - g = ChunkGrid.from_regular((100, 200), (10, 20)) - spec = g[(0, 0)] - assert spec is not None - assert spec.shape == (10, 20) - assert spec.codec_shape == (10, 20) - assert not spec.is_boundary - - def test_regular_getitem_boundary(self) -> None: - g = ChunkGrid.from_regular((95, 200), (10, 20)) - spec = g[(9, 0)] - assert spec is not None - assert spec.shape == (5, 20) # 
data_size clipped - assert spec.codec_shape == (10, 20) # codec always full - assert spec.is_boundary - - def test_regular_getitem_oob(self) -> None: - g = ChunkGrid.from_regular((100, 200), (10, 20)) - assert g[(99, 0)] is None - - def test_rectilinear_getitem(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) - spec0 = g[(0, 0)] - assert spec0 is not None - assert spec0.shape == (10, 25) - - spec1 = g[(1, 0)] - assert spec1 is not None - assert spec1.shape == (20, 25) - - spec2 = g[(2, 3)] - assert spec2 is not None - assert spec2.shape == (30, 25) - - assert g[(3, 0)] is None # OOB - - def test_getitem_slices(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25, 25, 25]], array_shape=(60, 100)) - spec = g[(1, 2)] - assert spec is not None - assert spec.slices == (slice(10, 30, 1), slice(50, 75, 1)) - - def test_all_chunk_coords(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - coords = list(g.all_chunk_coords()) - assert len(coords) == 6 - assert coords[0] == (0, 0) - assert coords[-1] == (2, 1) - - def test_all_chunk_coords_with_origin(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - coords = list(g.all_chunk_coords(origin=(1, 0))) - assert len(coords) == 4 # 2 remaining in dim0 * 2 in dim1 - assert coords[0] == (1, 0) - assert coords[-1] == (2, 1) - - def test_all_chunk_coords_with_selection_shape(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - coords = list(g.all_chunk_coords(selection_shape=(2, 1))) - assert len(coords) == 2 - assert coords == [(0, 0), (1, 0)] - - def test_all_chunk_coords_with_origin_and_selection_shape(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - coords = list(g.all_chunk_coords(origin=(1, 1), selection_shape=(2, 1))) - assert coords == [(1, 1), (2, 1)] - - def 
test_all_chunk_coords_origin_at_last_chunk(self) -> None: - g = ChunkGrid.from_regular((30, 40), (10, 20)) - coords = list(g.all_chunk_coords(origin=(2, 1))) - assert coords == [(2, 1)] - - def test_all_chunk_coords_selection_shape_zero(self) -> None: - g = ChunkGrid.from_regular((30, 40), (10, 20)) - coords = list(g.all_chunk_coords(selection_shape=(0, 0))) - assert coords == [] - - def test_all_chunk_coords_single_dim_slice(self) -> None: - """Origin shifts one dim, selection_shape restricts the other.""" - g = ChunkGrid.from_regular((60, 80), (20, 20)) # 3x4 - coords = list(g.all_chunk_coords(origin=(0, 2), selection_shape=(3, 1))) - assert coords == [(0, 2), (1, 2), (2, 2)] - - def test_get_nchunks(self) -> None: - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - assert g.get_nchunks() == 6 - - def test_iter(self) -> None: - g = ChunkGrid.from_regular((30, 40), (10, 20)) - specs = list(g) - assert len(specs) == 6 # 3 * 2 - assert all(isinstance(s, ChunkSpec) for s in specs) - - -class TestRLE: - def test_expand(self) -> None: - assert expand_rle([[10, 3]]) == [10, 10, 10] - assert expand_rle([[10, 2], [20, 1]]) == [10, 10, 20] - - def test_compress(self) -> None: - assert compress_rle([10, 10, 10]) == [[10, 3]] - assert compress_rle([10, 10, 20]) == [[10, 2], 20] - assert compress_rle([5]) == [5] - assert compress_rle([10, 20, 30]) == [10, 20, 30] - - def test_roundtrip(self) -> None: - original = [10, 10, 10, 20, 20, 30] - compressed = compress_rle(original) - assert expand_rle(compressed) == original - - def test_expand_rejects_zero_edge(self) -> None: - with pytest.raises(ValueError, match="Chunk edge length must be >= 1"): - expand_rle([0]) +# --------------------------------------------------------------------------- +# Rectilinear feature flag tests +# --------------------------------------------------------------------------- - def test_expand_rejects_negative_edge(self) -> None: - with pytest.raises(ValueError, 
match="Chunk edge length must be >= 1"): - expand_rle([-5]) - def test_expand_rejects_zero_rle_size(self) -> None: - with pytest.raises(ValueError, match="Chunk edge length must be >= 1"): - expand_rle([[0, 3]]) +@pytest.mark.parametrize( + "action", + [ + lambda: RectilinearChunkGrid(chunk_shapes=((10, 20), (25, 25))), + lambda: RectilinearChunkGrid.from_dict( + { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [50, 50]]}, # type: ignore[typeddict-item] + } + ), + lambda: zarr.create_array(MemoryStore(), shape=(30,), chunks=[[10, 20]], dtype="int32"), + ], + ids=["constructor", "from_dict", "create_array"], +) +def test_rectilinear_feature_flag_blocked(action: Any) -> None: + """Rectilinear chunk operations raise ValueError when the feature flag is disabled""" + with zarr.config.set({"array.rectilinear_chunks": False}): + with pytest.raises(ValueError, match="experimental and disabled by default"): + action() - def test_expand_rejects_negative_rle_size(self) -> None: - with pytest.raises(ValueError, match="Chunk edge length must be >= 1"): - expand_rle([[-10, 2]]) - def test_expand_rejects_zero_rle_count(self) -> None: - with pytest.raises(ValueError, match="RLE repeat count must be >= 1"): - expand_rle([[5, 0]]) +def test_rectilinear_feature_flag_enabled() -> None: + """Rectilinear chunk grid construction succeeds when the feature flag is enabled""" + with zarr.config.set({"array.rectilinear_chunks": True}): + grid = RectilinearChunkGrid(chunk_shapes=((10, 20), (25, 25))) + assert grid.ndim == 2 - def test_expand_rejects_negative_rle_count(self) -> None: - with pytest.raises(ValueError, match="RLE repeat count must be >= 1"): - expand_rle([[5, -1]]) +# --------------------------------------------------------------------------- +# RegularChunkGrid compatibility tests +# --------------------------------------------------------------------------- -class TestExpandRleHandlesJsonFloats: - def 
test_bare_integer_floats_accepted(self) -> None: - """JSON parsers may emit 10.0 for the integer 10; expand_rle should handle it.""" - result = expand_rle([10.0, 20.0]) # type: ignore[list-item] - assert result == [10, 20] - def test_rle_pair_with_float_count(self) -> None: - result = expand_rle([[10, 3.0]]) # type: ignore[list-item] - assert result == [10, 10, 10] +def test_regular_chunk_grid_compat_construction_emits_deprecation_warning() -> None: + """Constructing RegularChunkGrid emits a DeprecationWarning and returns a ChunkGrid""" + from zarr.core.chunk_grids import RegularChunkGrid + with pytest.warns(DeprecationWarning, match="RegularChunkGrid is deprecated"): + grid = RegularChunkGrid(chunk_shape=(10, 20)) + assert isinstance(grid, ChunkGrid) + assert grid.is_regular + assert grid.chunk_shape == (10, 20) -class TestIsRectilinearChunks: - """Edge cases for _is_rectilinear_chunks.""" - def test_nested_lists(self) -> None: - assert _is_rectilinear_chunks([[10, 20], [5, 5]]) is True +@pytest.mark.parametrize( + ("grid", "expected"), + [ + (ChunkGrid.from_sizes((100, 200), (10, 20)), True), + (ChunkGrid.from_sizes((30, 50), [[10, 20], [25, 25]]), False), + ], + ids=["regular-is-instance", "rectilinear-is-not-instance"], +) +def test_regular_chunk_grid_isinstance(grid: ChunkGrid, expected: bool) -> None: + """isinstance check against RegularChunkGrid matches only regular grids""" + from zarr.core.chunk_grids import RegularChunkGrid + + assert isinstance(grid, RegularChunkGrid) == expected + + +@pytest.mark.parametrize("obj", ["hello", 42], ids=["string", "int"]) +def test_regular_chunk_grid_isinstance_false_for_unrelated_types(obj: Any) -> None: + """Unrelated types are not instances of RegularChunkGrid""" + from zarr.core.chunk_grids import RegularChunkGrid + + assert not isinstance(obj, RegularChunkGrid) + + +# --------------------------------------------------------------------------- +# FixedDimension tests +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ( + "size", + "extent", + "chunk_ix", + "expected_nchunks", + "expected_chunk_size", + "expected_data_size", + "expected_offset", + ), + [ + (10, 100, 0, 10, 10, 10, 0), + (10, 100, 1, 10, 10, 10, 10), + (10, 100, 9, 10, 10, 10, 90), + (10, 95, 9, 10, 10, 5, 90), # boundary chunk + (0, 0, None, 0, None, None, None), # zero-size + ], + ids=["start", "middle", "end", "boundary", "zero-size"], +) +def test_fixed_dimension( + size: int, + extent: int, + chunk_ix: int | None, + expected_nchunks: int, + expected_chunk_size: int | None, + expected_data_size: int | None, + expected_offset: int | None, +) -> None: + """FixedDimension properties match expected values for various chunk/extent combinations""" + d = FixedDimension(size=size, extent=extent) + assert d.nchunks == expected_nchunks + if chunk_ix is not None: + assert d.chunk_size(chunk_ix) == expected_chunk_size + assert d.data_size(chunk_ix) == expected_data_size + assert d.chunk_offset(chunk_ix) == expected_offset + + +@pytest.mark.parametrize( + ("idx", "expected"), + [(0, 0), (9, 0), (10, 1), (25, 2)], +) +def test_fixed_dimension_index_to_chunk(idx: int, expected: int) -> None: + """FixedDimension.index_to_chunk maps element indices to correct chunk indices""" + d = FixedDimension(size=10, extent=100) + assert d.index_to_chunk(idx) == expected - def test_nested_tuples(self) -> None: - assert _is_rectilinear_chunks(((10, 20), (5, 5))) is True - def test_flat_tuple(self) -> None: - assert _is_rectilinear_chunks((10, 20)) is False +def test_fixed_dimension_indices_to_chunks() -> None: + """FixedDimension.indices_to_chunks vectorizes index-to-chunk mapping over an array""" + d = FixedDimension(size=10, extent=100) + indices = np.array([0, 5, 10, 15, 99]) + np.testing.assert_array_equal(d.indices_to_chunks(indices), [0, 0, 1, 1, 9]) - def test_flat_list(self) -> None: - assert _is_rectilinear_chunks([10, 
20]) is False - def test_single_int(self) -> None: - assert _is_rectilinear_chunks(10) is False +@pytest.mark.parametrize( + ("size", "extent", "match"), + [(-1, 100, "must be >= 0"), (10, -1, "must be >= 0")], + ids=["negative-size", "negative-extent"], +) +def test_fixed_dimension_rejects_negative(size: int, extent: int, match: str) -> None: + """FixedDimension raises ValueError for negative size or extent""" + with pytest.raises(ValueError, match=match): + FixedDimension(size=size, extent=extent) + + +# --------------------------------------------------------------------------- +# VaryingDimension tests +# --------------------------------------------------------------------------- + + +def test_varying_dimension_construction() -> None: + """VaryingDimension stores edges, cumulative sums, nchunks, and extent correctly""" + d = VaryingDimension([10, 20, 30], extent=60) + assert d.edges == (10, 20, 30) + assert d.cumulative == (10, 30, 60) + assert d.nchunks == 3 + assert d.extent == 60 + + +@pytest.mark.parametrize( + ( + "chunk_idx", + "expected_offset", + "expected_size", + "expected_data", + "expected_chunk_for_first_idx", + ), + [ + (0, 0, 10, 10, 0), + (1, 10, 20, 20, 1), + (2, 30, 30, 30, 2), + ], +) +def test_varying_dimension( + chunk_idx: int, + expected_offset: int, + expected_size: int, + expected_data: int, + expected_chunk_for_first_idx: int, +) -> None: + """VaryingDimension chunk_offset, chunk_size, data_size, and index_to_chunk return correct values""" + d = VaryingDimension([10, 20, 30], extent=60) + assert d.chunk_offset(chunk_idx) == expected_offset + assert d.chunk_size(chunk_idx) == expected_size + assert d.data_size(chunk_idx) == expected_data + assert d.index_to_chunk(expected_offset) == expected_chunk_for_first_idx + + +def test_varying_dimension_indices_to_chunks() -> None: + """VaryingDimension.indices_to_chunks vectorizes index-to-chunk mapping over an array""" + d = VaryingDimension([10, 20, 30], extent=60) + indices = np.array([0, 9, 
10, 29, 30, 59]) + np.testing.assert_array_equal(d.indices_to_chunks(indices), [0, 0, 1, 1, 2, 2]) + + +@pytest.mark.parametrize( + ("edges", "extent", "match"), + [ + ([], 0, "must not be empty"), + ([10, 0, 5], 15, "must be > 0"), + ], + ids=["empty", "zero-edge"], +) +def test_varying_dimension_rejects_invalid(edges: list[int], extent: int, match: str) -> None: + """VaryingDimension raises ValueError for empty edges or zero-length edges""" + with pytest.raises(ValueError, match=match): + VaryingDimension(edges, extent=extent) + + +# --------------------------------------------------------------------------- +# ChunkSpec tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("slices", "codec_shape", "expected_shape", "expected_boundary"), + [ + ((slice(0, 10), slice(0, 20)), (10, 20), (10, 20), False), + ((slice(90, 95), slice(0, 20)), (10, 20), (5, 20), True), + ((slice(10, 10),), (0,), (0,), False), + ((slice(0, 10), slice(0, 5)), (10, 10), (10, 5), True), + ], + ids=["basic", "boundary", "empty-slices", "multidim-boundary"], +) +def test_chunk_spec( + slices: tuple[slice, ...], + codec_shape: tuple[int, ...], + expected_shape: tuple[int, ...], + expected_boundary: bool, +) -> None: + """ChunkSpec reports correct shape and boundary status from slices and codec_shape""" + spec = ChunkSpec(slices=slices, codec_shape=codec_shape) + assert spec.shape == expected_shape + assert spec.is_boundary == expected_boundary + + +# --------------------------------------------------------------------------- +# ChunkGrid construction tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("array_shape", "chunk_sizes", "expected_regular", "expected_ndim", "expected_chunk_shape"), + [ + ((100, 200), (10, 20), True, 2, (10, 20)), + ((), (), True, 0, ()), + ((60, 100), [[10, 20, 30], [25, 25, 25, 25]], False, 2, None), + ((30, 50), [[10, 10, 10], [25, 25]], 
True, 2, (10, 25)), # uniform edges → regular + ], + ids=["regular", "zero-dim", "rectilinear", "uniform-becomes-regular"], +) +def test_chunk_grid_construction( + array_shape: tuple[int, ...], + chunk_sizes: Any, + expected_regular: bool, + expected_ndim: int, + expected_chunk_shape: tuple[int, ...] | None, +) -> None: + """ChunkGrid.from_sizes produces grids with correct regularity, ndim, and chunk_shape""" + g = ChunkGrid.from_sizes(array_shape, chunk_sizes) + assert g.is_regular == expected_regular + assert g.ndim == expected_ndim + if expected_chunk_shape is not None: + assert g.chunk_shape == expected_chunk_shape + else: + with pytest.raises(ValueError, match="only available for regular"): + _ = g.chunk_shape - def test_string(self) -> None: - assert _is_rectilinear_chunks("auto") is False - def test_empty_list(self) -> None: - assert _is_rectilinear_chunks([]) is False +def test_chunk_grid_rectilinear_uniform_dim_is_fixed() -> None: + """A rectilinear grid with all-same sizes in one dim stores it as Fixed.""" + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [25, 25, 25, 25]]) + assert isinstance(g.dimensions[0], VaryingDimension) + assert isinstance(g.dimensions[1], FixedDimension) - def test_empty_nested_list(self) -> None: - """First element is an empty list — it's iterable and not str/int.""" - assert _is_rectilinear_chunks([[]]) is True - def test_chunk_grid_instance(self) -> None: - g = ChunkGrid.from_regular((10,), (5,)) - assert _is_rectilinear_chunks(g) is False +# --------------------------------------------------------------------------- +# ChunkGrid query tests +# --------------------------------------------------------------------------- - def test_none(self) -> None: - assert _is_rectilinear_chunks(None) is False - def test_float(self) -> None: - assert _is_rectilinear_chunks(3.14) is False +@pytest.mark.parametrize( + ("shape", "chunks", "expected_grid_shape"), + [ + ((100, 200), (10, 20), (10, 10)), + ((95, 200), (10, 20), (10, 10)), + 
((60, 100), [[10, 20, 30], [25, 25, 25, 25]], (3, 4)), + ], + ids=["regular", "regular-boundary", "rectilinear"], +) +def test_chunk_grid_shape( + shape: tuple[int, ...], + chunks: Any, + expected_grid_shape: tuple[int, ...], +) -> None: + """ChunkGrid.grid_shape returns the expected number of chunks per dimension""" + g = ChunkGrid.from_sizes(shape, chunks) + assert g.grid_shape == expected_grid_shape + + +@pytest.mark.parametrize( + ( + "array_shape", + "chunk_sizes", + "coords", + "expected_shape", + "expected_codec_shape", + "expected_boundary", + ), + [ + # regular interior + ((100, 200), (10, 20), (0, 0), (10, 20), (10, 20), False), + # regular boundary + ((95, 200), (10, 20), (9, 0), (5, 20), (10, 20), True), + # rectilinear + ((60, 100), [[10, 20, 30], [25, 25, 25, 25]], (0, 0), (10, 25), (10, 25), False), + ((60, 100), [[10, 20, 30], [25, 25, 25, 25]], (1, 0), (20, 25), (20, 25), False), + ((60, 100), [[10, 20, 30], [25, 25, 25, 25]], (2, 3), (30, 25), (30, 25), False), + ], + ids=["regular", "regular-boundary", "rectilinear-0,0", "rectilinear-1,0", "rectilinear-2,3"], +) +def test_chunk_grid_getitem( + array_shape: tuple[int, ...], + chunk_sizes: Any, + coords: tuple[int, ...], + expected_shape: tuple[int, ...], + expected_codec_shape: tuple[int, ...], + expected_boundary: bool, +) -> None: + """ChunkGrid.__getitem__ returns a ChunkSpec with correct shape, codec_shape, and boundary flag""" + g = ChunkGrid.from_sizes(array_shape, chunk_sizes) + spec = g[coords] + assert spec is not None + assert spec.shape == expected_shape + assert spec.codec_shape == expected_codec_shape + assert spec.is_boundary == expected_boundary + + +@pytest.mark.parametrize( + ("array_shape", "chunk_sizes", "coords"), + [ + ((100, 200), (10, 20), (99, 0)), + ((60, 100), [[10, 20, 30], [25, 25, 25, 25]], (3, 0)), + ], + ids=["regular-oob", "rectilinear-oob"], +) +def test_chunk_grid_getitem_oob( + array_shape: tuple[int, ...], chunk_sizes: Any, coords: tuple[int, ...] 
+) -> None: + """Out-of-bounds chunk coordinates return None""" + g = ChunkGrid.from_sizes(array_shape, chunk_sizes) + assert g[coords] is None + + +def test_chunk_grid_getitem_slices() -> None: + """ChunkSpec.slices reflect the correct start/stop for a rectilinear chunk""" + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [25, 25, 25, 25]]) + spec = g[(1, 2)] + assert spec is not None + assert spec.slices == (slice(10, 30, 1), slice(50, 75, 1)) + + +# -- all_chunk_coords tests -- + + +@pytest.mark.parametrize( + ("array_shape", "chunk_sizes", "origin", "selection_shape", "expected_coords"), + [ + # rectilinear grid + ( + (60, 100), + [[10, 20, 30], [50, 50]], + None, + None, + [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)], + ), + ((60, 100), [[10, 20, 30], [50, 50]], (1, 0), None, [(1, 0), (1, 1), (2, 0), (2, 1)]), + ((60, 100), [[10, 20, 30], [50, 50]], None, (2, 1), [(0, 0), (1, 0)]), + ((60, 100), [[10, 20, 30], [50, 50]], (1, 1), (2, 1), [(1, 1), (2, 1)]), + # regular grid + ((30, 40), (10, 20), (2, 1), None, [(2, 1)]), + ((30, 40), (10, 20), None, (0, 0), []), + ((60, 80), (20, 20), (0, 2), (3, 1), [(0, 2), (1, 2), (2, 2)]), + ], + ids=[ + "all", + "with-origin", + "with-sel-shape", + "origin+sel", + "last-chunk", + "zero-sel", + "single-dim", + ], +) +def test_all_chunk_coords( + array_shape: tuple[int, ...], + chunk_sizes: Any, + origin: tuple[int, ...] | None, + selection_shape: tuple[int, ...] 
| None, + expected_coords: list[tuple[int, ...]], +) -> None: + """all_chunk_coords yields the expected coordinates with optional origin and selection_shape""" + g = ChunkGrid.from_sizes(array_shape, chunk_sizes) + kwargs: dict[str, Any] = {} + if origin is not None: + kwargs["origin"] = origin + if selection_shape is not None: + kwargs["selection_shape"] = selection_shape + assert list(g.all_chunk_coords(**kwargs)) == expected_coords + + +def test_chunk_grid_get_nchunks() -> None: + """get_nchunks returns the total number of chunks across all dimensions""" + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + assert g.get_nchunks() == 6 + + +def test_chunk_grid_iter() -> None: + """Iterating a ChunkGrid yields the correct number of ChunkSpec objects""" + g = ChunkGrid.from_sizes((30, 40), (10, 20)) + specs = list(g) + assert len(specs) == 6 + assert all(isinstance(s, ChunkSpec) for s in specs) + + +# --------------------------------------------------------------------------- +# RLE tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("compressed", "expected"), + [ + ([[10, 3]], [10, 10, 10]), + ([[10, 2], [20, 1]], [10, 10, 20]), + ], +) +def test_rle_expand(compressed: list[Any], expected: list[int]) -> None: + """RLE-encoded edges expand correctly""" + assert expand_rle(compressed) == expected + + +@pytest.mark.parametrize( + ("original", "expected"), + [ + ([10, 10, 10], [[10, 3]]), + ([10, 10, 20], [[10, 2], 20]), + ([5], [5]), + ([10, 20, 30], [10, 20, 30]), + ], +) +def test_rle_compress(original: list[int], expected: list[Any]) -> None: + """compress_rle produces the expected RLE encoding for various input sequences""" + assert compress_rle(original) == expected + + +def test_rle_roundtrip() -> None: + """compress_rle followed by expand_rle recovers the original sequence""" + original = [10, 10, 10, 20, 20, 30] + compressed = compress_rle(original) + assert expand_rle(compressed) 
== original + + +@pytest.mark.parametrize( + ("rle_input", "match"), + [ + ([0], "Chunk edge length must be >= 1"), + ([-5], "Chunk edge length must be >= 1"), + ([[0, 3]], "Chunk edge length must be >= 1"), + ([[-10, 2]], "Chunk edge length must be >= 1"), + ([[5, 0]], "RLE repeat count must be >= 1"), + ([[5, -1]], "RLE repeat count must be >= 1"), + ], + ids=[ + "zero-edge", + "negative-edge", + "zero-rle-size", + "negative-rle-size", + "zero-rle-count", + "negative-rle-count", + ], +) +def test_rle_expand_rejects_invalid(rle_input: list[Any], match: str) -> None: + """expand_rle raises ValueError for zero/negative edge lengths or repeat counts""" + with pytest.raises(ValueError, match=match): + expand_rle(rle_input) + + +# -- expand_rle handles JSON floats -- + + +def test_expand_rle_bare_integer_floats_accepted() -> None: + """JSON parsers may emit 10.0 for the integer 10; expand_rle should handle it.""" + result = expand_rle([10.0, 20.0]) # type: ignore[list-item] + assert result == [10, 20] + + +def test_expand_rle_pair_with_float_count() -> None: + """expand_rle accepts float repeat counts that are integer-valued""" + result = expand_rle([[10, 3.0]]) # type: ignore[list-item] + assert result == [10, 10, 10] + + +# --------------------------------------------------------------------------- +# _is_rectilinear_chunks tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + ([[10, 20], [5, 5]], True), + (((10, 20), (5, 5)), True), + ((10, 20), False), + ([10, 20], False), + (10, False), + ("auto", False), + ([], False), + ([[]], True), + (ChunkGrid.from_sizes((10,), (5,)), False), + (None, False), + (3.14, False), + ], + ids=[ + "nested-lists", + "nested-tuples", + "flat-tuple", + "flat-list", + "single-int", + "string", + "empty-list", + "empty-nested-list", + "chunk-grid-instance", + "none", + "float", + ], +) +def test_is_rectilinear_chunks(value: Any, expected: bool) 
-> None: + """_is_rectilinear_chunks correctly identifies nested sequences as rectilinear""" + assert _is_rectilinear_chunks(value) is expected + + +# --------------------------------------------------------------------------- +# Serialization tests +# --------------------------------------------------------------------------- + + +def test_serialization_error_non_regular_chunk_shape() -> None: + """Accessing chunk_shape on a non-regular grid raises ValueError.""" + grid = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [25, 25, 25, 25]]) + with pytest.raises(ValueError, match="only available for regular"): + grid.chunk_shape # noqa: B018 + + +def test_serialization_error_zero_extent_rectilinear() -> None: + """RectilinearChunkGrid rejects empty edge tuples.""" + with pytest.raises(ValueError, match="has no chunk edges"): + RectilinearChunkGrid(chunk_shapes=((),)) + + +def test_serialization_unknown_name_parse() -> None: + """Parsing metadata with an unknown chunk grid name raises ValueError""" + with pytest.raises(ValueError, match="Unknown chunk grid"): + parse_chunk_grid({"name": "hexagonal", "configuration": {}}) + + +# --------------------------------------------------------------------------- +# Spec compliance tests +# --------------------------------------------------------------------------- + + +def test_spec_kind_inline_required_on_deserialize() -> None: + """Deserialization requires kind: 'inline'.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"chunk_shapes": [[10, 20], [15, 15]]}, + } + with pytest.raises(ValueError, match="requires a 'kind' field"): + parse_chunk_grid(data) + + +def test_spec_kind_unknown_rejected() -> None: + """Unsupported rectilinear chunk grid kind raises ValueError on parse""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "reference", "chunk_shapes": [[10, 20], [15, 15]]}, + } + with pytest.raises(ValueError, match="Unsupported rectilinear chunk grid kind"): + 
parse_chunk_grid(data) + + +def test_spec_integer_shorthand_per_dimension() -> None: + """A bare integer in chunk_shapes means repeat until >= extent.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [4, [1, 2, 3]]}, + } + meta = parse_chunk_grid(data) + g = ChunkGrid.from_sizes((6, 6), meta.chunk_shapes) # type: ignore[union-attr] + assert _edges(g, 0) == (4, 4) + assert _edges(g, 1) == (1, 2, 3) + + +def test_spec_mixed_rle_and_bare_integers() -> None: + """An array can mix bare integers and [value, count] RLE pairs.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[[1, 3], 3]]}, + } + meta = parse_chunk_grid(data) + g = ChunkGrid.from_sizes((6,), meta.chunk_shapes) # type: ignore[union-attr] + assert _edges(g, 0) == (1, 1, 1, 3) + + +def test_spec_overflow_chunks_allowed() -> None: + """Edge sum >= extent is valid (overflow chunks permitted).""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[4, 4, 4]]}, + } + meta = parse_chunk_grid(data) + g = ChunkGrid.from_sizes((6,), meta.chunk_shapes) # type: ignore[union-attr] + assert _edges(g, 0) == (4, 4, 4) + + +def test_spec_example() -> None: + """The full example from the spec README.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": { + "kind": "inline", + "chunk_shapes": [ + 4, + [1, 2, 3], + [[4, 2]], + [[1, 3], 3], + [4, 4, 4], + ], + }, + } + meta = parse_chunk_grid(data) + g = ChunkGrid.from_sizes((6, 6, 6, 6, 6), meta.chunk_shapes) # type: ignore[union-attr] + assert _edges(g, 0) == (4, 4) + assert _edges(g, 1) == (1, 2, 3) + assert _edges(g, 2) == (4, 4) + assert _edges(g, 3) == (1, 1, 1, 3) + assert _edges(g, 4) == (4, 4, 4) + + +# --------------------------------------------------------------------------- +# parse_chunk_grid validation tests +# 
--------------------------------------------------------------------------- + + +def test_parse_chunk_grid_varying_extent_mismatch_raises() -> None: + """Reconstructing a ChunkGrid with mismatched extents raises ValueError""" + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + with pytest.raises(ValueError, match="extent"): + ChunkGrid( + dimensions=tuple( + dim.with_extent(ext) for dim, ext in zip(g.dimensions, (100, 100), strict=True) + ) + ) -class TestRectilinearIndexing: - """Test that the indexing pipeline works with VaryingDimension.""" +def test_parse_chunk_grid_varying_extent_match_ok() -> None: + """Reconstructing a ChunkGrid with matching extents succeeds""" + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + g2 = ChunkGrid( + dimensions=tuple( + dim.with_extent(ext) for dim, ext in zip(g.dimensions, (60, 100), strict=True) + ) + ) + assert g2.dimensions[0].extent == 60 - def test_basic_indexer_rectilinear(self) -> None: - from zarr.core.indexing import BasicIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - indexer = BasicIndexer( - selection=(slice(None), slice(None)), - shape=(60, 100), - chunk_grid=g, +@pytest.mark.parametrize( + ("chunk_shapes", "array_shape", "match"), + [ + ([[10, 20, 30], [25, 25]], (100, 50), "extent 100 exceeds sum of edges 60"), + ([[50, 50], [10, 20]], (100, 50), "extent 50 exceeds sum of edges 30"), + ], + ids=["first-dim-mismatch", "second-dim-mismatch"], +) +def test_parse_chunk_grid_rectilinear_extent_mismatch_raises( + chunk_shapes: list[list[int]], array_shape: tuple[int, ...], match: str +) -> None: + """Rectilinear grid raises ValueError when array extent exceeds sum of edges""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": chunk_shapes}, + } + meta = parse_chunk_grid(data) + with pytest.raises(ValueError, match=match): + ChunkGrid.from_sizes(array_shape, meta.chunk_shapes) # type: 
ignore[union-attr] + + +def test_parse_chunk_grid_rectilinear_extent_match_passes() -> None: + """Rectilinear grid with matching extents parses and builds successfully""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[10, 20, 30], [25, 25]]}, + } + meta = parse_chunk_grid(data) + g = ChunkGrid.from_sizes((60, 50), meta.chunk_shapes) # type: ignore[union-attr] + assert g.grid_shape == (3, 2) + + +def test_parse_chunk_grid_rectilinear_ndim_mismatch_raises() -> None: + """Mismatched ndim between array shape and chunk_sizes raises ValueError""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[10, 20], [25, 25]]}, + } + meta = parse_chunk_grid(data) + with pytest.raises(ValueError, match="3 dimensions but chunk_sizes has 2"): + ChunkGrid.from_sizes((30, 50, 100), meta.chunk_shapes) # type: ignore[union-attr] + + +def test_parse_chunk_grid_rectilinear_rle_extent_validated() -> None: + """RLE-encoded edges are expanded before validation.""" + data: dict[str, Any] = { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[[10, 5]], [[25, 2]]]}, + } + meta = parse_chunk_grid(data) + g = ChunkGrid.from_sizes((50, 50), meta.chunk_shapes) # type: ignore[union-attr] + assert g.grid_shape == (5, 2) + with pytest.raises(ValueError, match="extent 100 exceeds sum of edges 50"): + ChunkGrid.from_sizes((100, 50), meta.chunk_shapes) # type: ignore[union-attr] + + +def test_parse_chunk_grid_varying_dimension_extent_mismatch_on_chunkgrid_input() -> None: + """ChunkGrid constructor rejects VaryingDimension with extent exceeding sum of edges""" + g = ChunkGrid.from_sizes((60, 50), [[10, 20, 30], [25, 25]]) + with pytest.raises(ValueError, match="less than"): + ChunkGrid( + dimensions=tuple( + dim.with_extent(ext) for dim, ext in zip(g.dimensions, (100, 50), strict=True) + ) ) - projections = list(indexer) - assert len(projections) == 6 - p0 = 
projections[0] - assert p0.chunk_coords == (0, 0) - assert p0.chunk_selection == (slice(0, 10, 1), slice(0, 50, 1)) - p1 = projections[2] - assert p1.chunk_coords == (1, 0) - assert p1.chunk_selection == (slice(0, 20, 1), slice(0, 50, 1)) +# --------------------------------------------------------------------------- +# Rectilinear indexing tests +# --------------------------------------------------------------------------- - def test_basic_indexer_int_selection(self) -> None: - from zarr.core.indexing import BasicIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - indexer = BasicIndexer( - selection=(15, slice(None)), - shape=(60, 100), - chunk_grid=g, - ) - projections = list(indexer) - assert len(projections) == 2 - assert projections[0].chunk_coords == (1, 0) - assert projections[0].chunk_selection == (5, slice(0, 50, 1)) +def test_basic_indexer_rectilinear() -> None: + """BasicIndexer produces correct projections for a full-slice rectilinear selection""" + from zarr.core.indexing import BasicIndexer - def test_basic_indexer_slice_subset(self) -> None: - from zarr.core.indexing import BasicIndexer + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + indexer = BasicIndexer( + selection=(slice(None), slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) == 6 - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - indexer = BasicIndexer( - selection=(slice(5, 35), slice(0, 50)), - shape=(60, 100), - chunk_grid=g, - ) - projections = list(indexer) - chunk_coords_dim0 = sorted({p.chunk_coords[0] for p in projections}) - assert chunk_coords_dim0 == [0, 1, 2] + p0 = projections[0] + assert p0.chunk_coords == (0, 0) + assert p0.chunk_selection == (slice(0, 10, 1), slice(0, 50, 1)) - def test_orthogonal_indexer_rectilinear(self) -> None: - from zarr.core.indexing import OrthogonalIndexer + p1 = projections[2] + assert p1.chunk_coords == 
(1, 0) + assert p1.chunk_selection == (slice(0, 20, 1), slice(0, 50, 1)) - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - indexer = OrthogonalIndexer( - selection=(slice(None), slice(None)), - shape=(60, 100), - chunk_grid=g, - ) - projections = list(indexer) - assert len(projections) == 6 - def test_oob_block_raises_bounds_check_error(self) -> None: - """Out-of-bounds block index should raise BoundsCheckError, not IndexError.""" - store = MemoryStore() - a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") - with pytest.raises(BoundsCheckError): - a.get_block_selection((2,)) +def test_basic_indexer_int_selection() -> None: + """BasicIndexer with integer selection maps to the correct chunk and local offset""" + from zarr.core.indexing import BasicIndexer + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + indexer = BasicIndexer( + selection=(15, slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) == 2 + assert projections[0].chunk_coords == (1, 0) + assert projections[0].chunk_selection == (5, slice(0, 50, 1)) -class TestEndToEnd: - """Test creating, writing, and reading arrays with rectilinear chunk grids.""" - def test_create_regular_array(self, tmp_path: Path) -> None: - import zarr +def test_basic_indexer_slice_subset() -> None: + """BasicIndexer with partial slices spans the expected chunk dimensions""" + from zarr.core.indexing import BasicIndexer - arr = zarr.create_array( - store=tmp_path / "regular.zarr", - shape=(100, 200), - chunks=(10, 20), - dtype="float32", - ) - assert arr._chunk_grid.is_regular - assert arr.chunks == (10, 20) + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + indexer = BasicIndexer( + selection=(slice(5, 35), slice(0, 50)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + chunk_coords_dim0 = sorted({p.chunk_coords[0] for p in projections}) + assert chunk_coords_dim0 == 
[0, 1, 2] - def test_create_rectilinear_array(self, tmp_path: Path) -> None: - """Create an array with a rectilinear chunk grid via metadata.""" - from zarr.core.metadata.v3 import ArrayV3Metadata - arr = zarr.create_array( - store=tmp_path / "rect.zarr", - shape=(60, 100), - chunks=[[10, 20, 30], [50, 50]], - dtype="float32", - ) - assert isinstance(arr.metadata, ArrayV3Metadata) - assert isinstance(arr.metadata.chunk_grid, RectilinearChunkGrid) - assert not arr._chunk_grid.is_regular - assert arr._chunk_grid.ndim == 2 - - def test_rectilinear_metadata_serialization(self, tmp_path: Path) -> None: - """Verify metadata round-trips through JSON.""" - from zarr.core.metadata.v3 import RectilinearChunkGrid as RectilinearChunkGridMeta - - meta = RectilinearChunkGridMeta(chunk_shapes=((10, 20, 30), (50, 50))) - d = meta.to_dict() - meta2 = RectilinearChunkGridMeta.from_dict(d) - g = ChunkGrid.from_rectilinear(list(meta.chunk_shapes), array_shape=(60, 100)) - g2 = ChunkGrid.from_rectilinear(list(meta2.chunk_shapes), array_shape=(60, 100)) - assert g2.grid_shape == g.grid_shape - for coord in g.all_chunk_coords(): - orig_spec = g[coord] - new_spec = g2[coord] - assert orig_spec is not None - assert new_spec is not None - assert orig_spec.shape == new_spec.shape - - def test_chunk_grid_serializes_regular(self, tmp_path: Path) -> None: - """Regular arrays serialize with name='regular'.""" - from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid - - arr = zarr.create_array( - store=tmp_path / "regular.zarr", - shape=(100, 200), - chunks=(10, 20), - dtype="float32", - ) - assert isinstance(arr.metadata, ArrayV3Metadata) - assert isinstance(arr.metadata.chunk_grid, RegularChunkGrid) - d = arr.metadata.to_dict() - chunk_grid_dict = d["chunk_grid"] - assert isinstance(chunk_grid_dict, dict) - assert chunk_grid_dict["name"] == "regular" - - def test_chunk_grid_serializes_rectilinear(self, tmp_path: Path) -> None: - """Rectilinear arrays serialize with 
name='rectilinear'.""" - from zarr.core.metadata.v3 import ArrayV3Metadata - - arr = zarr.create_array( - store=tmp_path / "rect.zarr", - shape=(60, 100), - chunks=[[10, 20, 30], [50, 50]], - dtype="float32", - ) - assert isinstance(arr.metadata, ArrayV3Metadata) - assert isinstance(arr.metadata.chunk_grid, RectilinearChunkGrid) - d = arr.metadata.to_dict() - chunk_grid_dict = d["chunk_grid"] - assert isinstance(chunk_grid_dict, dict) - assert chunk_grid_dict["name"] == "rectilinear" - - def test_chunk_grid_name_roundtrip_preserves_rectilinear(self, tmp_path: Path) -> None: - """A rectilinear grid with uniform edges stays 'rectilinear' through to_dict/from_dict.""" - from zarr.core.metadata.v3 import ArrayV3Metadata - - meta_dict: dict[str, Any] = { - "zarr_format": 3, - "node_type": "array", - "shape": [100, 100], - "chunk_grid": { - "name": "rectilinear", - "configuration": {"kind": "inline", "chunk_shapes": [[[50, 2]], [[25, 4]]]}, - }, - "chunk_key_encoding": {"name": "default"}, - "data_type": "float32", - "fill_value": 0.0, - "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], - } - meta = ArrayV3Metadata.from_dict(meta_dict) - assert isinstance(meta.chunk_grid, RectilinearChunkGrid) - d = meta.to_dict() - chunk_grid_dict = d["chunk_grid"] - assert isinstance(chunk_grid_dict, dict) - assert chunk_grid_dict["name"] == "rectilinear" - - def test_chunk_grid_name_regular_from_dict(self, tmp_path: Path) -> None: - """A 'regular' chunk grid name is preserved through from_dict.""" - from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid - - meta_dict: dict[str, Any] = { - "zarr_format": 3, - "node_type": "array", - "shape": [100, 100], - "chunk_grid": { - "name": "regular", - "configuration": {"chunk_shape": [50, 25]}, - }, - "chunk_key_encoding": {"name": "default"}, - "data_type": "float32", - "fill_value": 0.0, - "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], - } - meta = ArrayV3Metadata.from_dict(meta_dict) - 
assert isinstance(meta.chunk_grid, RegularChunkGrid) - d = meta.to_dict() - chunk_grid_dict = d["chunk_grid"] - assert isinstance(chunk_grid_dict, dict) - assert chunk_grid_dict["name"] == "regular" - - def test_get_chunk_spec_regular(self, tmp_path: Path) -> None: - """ChunkGrid indexing works for regular grids.""" - grid = ChunkGrid.from_regular((100, 200), (10, 20)) - - spec = grid[(0, 0)] - assert spec is not None - assert spec.shape == (10, 20) - - spec_boundary = grid[(9, 9)] - assert spec_boundary is not None - assert spec_boundary.shape == (10, 20) - - def test_get_chunk_spec_rectilinear(self, tmp_path: Path) -> None: - """ChunkGrid indexing returns per-chunk shapes for rectilinear grids.""" - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - - spec0 = grid[(0, 0)] - assert spec0 is not None - assert spec0.shape == (10, 50) - - spec1 = grid[(1, 0)] - assert spec1 is not None - assert spec1.shape == (20, 50) - - spec2 = grid[(2, 1)] - assert spec2 is not None - assert spec2.shape == (30, 50) - - -class TestShardingCompat: - def test_sharding_accepts_rectilinear_outer_grid(self) -> None: - """ShardingCodec.validate should not reject rectilinear outer grids.""" - from zarr.codecs.sharding import ShardingCodec - from zarr.core.dtype import Float32 - - codec = ShardingCodec(chunk_shape=(5, 5)) - grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 30), (50, 50))) +def test_orthogonal_indexer_rectilinear() -> None: + """OrthogonalIndexer produces the expected number of projections for a rectilinear grid""" + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + indexer = OrthogonalIndexer( + selection=(slice(None), slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) == 6 + + +def test_oob_block_raises_bounds_check_error() -> None: + """Out-of-bounds block index should raise BoundsCheckError, not 
IndexError.""" + store = MemoryStore() + a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") + with pytest.raises(BoundsCheckError): + a.get_block_selection((2,)) + + +# --------------------------------------------------------------------------- +# End-to-end tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("shape", "chunks", "expected_regular"), + [ + ((100, 200), (10, 20), True), + ((60, 100), [[10, 20, 30], [50, 50]], False), + ], + ids=["regular", "rectilinear"], +) +def test_e2e_create_array( + tmp_path: Path, shape: tuple[int, ...], chunks: Any, expected_regular: bool +) -> None: + """End-to-end array creation sets correct regularity and ndim on chunk_grid""" + arr = zarr.create_array( + store=tmp_path / "arr.zarr", + shape=shape, + chunks=chunks, + dtype="float32", + ) + assert ChunkGrid.from_metadata(arr.metadata).is_regular == expected_regular + assert ChunkGrid.from_metadata(arr.metadata).ndim == len(shape) + + +@pytest.mark.parametrize( + ("shape", "chunks", "grid_type_name", "grid_name"), + [ + ((100, 200), (10, 20), "RegularChunkGrid", "regular"), + ((60, 100), [[10, 20, 30], [50, 50]], "RectilinearChunkGrid", "rectilinear"), + ], + ids=["regular", "rectilinear"], +) +def test_e2e_chunk_grid_serializes( + tmp_path: Path, shape: tuple[int, ...], chunks: Any, grid_type_name: str, grid_name: str +) -> None: + """Array metadata serializes chunk_grid with the correct type and name""" + from zarr.core.metadata.v3 import ArrayV3Metadata, RectilinearChunkGrid, RegularChunkGrid + + grid_type = RegularChunkGrid if grid_type_name == "RegularChunkGrid" else RectilinearChunkGrid + arr = zarr.create_array( + store=tmp_path / "arr.zarr", + shape=shape, + chunks=chunks, + dtype="float32", + ) + assert isinstance(arr.metadata, ArrayV3Metadata) + assert isinstance(arr.metadata.chunk_grid, grid_type) + d = arr.metadata.to_dict() + chunk_grid_dict = d["chunk_grid"] + assert 
isinstance(chunk_grid_dict, dict) + assert chunk_grid_dict["name"] == grid_name + + +def test_e2e_chunk_grid_name_roundtrip_preserves_rectilinear(tmp_path: Path) -> None: + """A rectilinear grid with uniform edges stays 'rectilinear' through to_dict/from_dict.""" + from zarr.core.metadata.v3 import ArrayV3Metadata, RectilinearChunkGrid + + meta_dict: dict[str, Any] = { + "zarr_format": 3, + "node_type": "array", + "shape": [100, 100], + "chunk_grid": { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[[50, 2]], [[25, 4]]]}, + }, + "chunk_key_encoding": {"name": "default"}, + "data_type": "float32", + "fill_value": 0.0, + "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], + } + meta = ArrayV3Metadata.from_dict(meta_dict) + assert isinstance(meta.chunk_grid, RectilinearChunkGrid) + d = meta.to_dict() + chunk_grid_dict = d["chunk_grid"] + assert isinstance(chunk_grid_dict, dict) + assert chunk_grid_dict["name"] == "rectilinear" + + +def test_e2e_chunk_grid_name_regular_from_dict(tmp_path: Path) -> None: + """A 'regular' chunk grid name is preserved through from_dict.""" + from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid + + meta_dict: dict[str, Any] = { + "zarr_format": 3, + "node_type": "array", + "shape": [100, 100], + "chunk_grid": { + "name": "regular", + "configuration": {"chunk_shape": [50, 25]}, + }, + "chunk_key_encoding": {"name": "default"}, + "data_type": "float32", + "fill_value": 0.0, + "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], + } + meta = ArrayV3Metadata.from_dict(meta_dict) + assert isinstance(meta.chunk_grid, RegularChunkGrid) + d = meta.to_dict() + chunk_grid_dict = d["chunk_grid"] + assert isinstance(chunk_grid_dict, dict) + assert chunk_grid_dict["name"] == "regular" + + +# --------------------------------------------------------------------------- +# Sharding compatibility tests +# 
--------------------------------------------------------------------------- + + +def test_sharding_accepts_rectilinear_outer_grid() -> None: + """ShardingCodec.validate should not reject rectilinear outer grids.""" + from zarr.codecs.sharding import ShardingCodec + from zarr.core.dtype import Float32 + from zarr.core.metadata.v3 import RectilinearChunkGrid + + codec = ShardingCodec(chunk_shape=(5, 5)) + grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 30), (50, 50))) + + codec.validate( + shape=(60, 100), + dtype=Float32(), + chunk_grid=grid_meta, + ) + +def test_sharding_rejects_non_divisible_rectilinear() -> None: + """Rectilinear shard sizes not divisible by inner chunk_shape should raise.""" + from zarr.codecs.sharding import ShardingCodec + from zarr.core.dtype import Float32 + from zarr.core.metadata.v3 import RectilinearChunkGrid + + codec = ShardingCodec(chunk_shape=(5, 5)) + grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 17), (50, 50))) + + with pytest.raises(ValueError, match="divisible"): codec.validate( - shape=(60, 100), + shape=(47, 100), dtype=Float32(), chunk_grid=grid_meta, ) -class TestEdgeCases: - """Edge cases around boundary chunks, zero-size dims, direct construction, - and serialization round-trips.""" - - # -- FixedDimension boundary (extent != size * nchunks) -- - - def test_fixed_dim_boundary_data_size(self) -> None: - """Boundary chunk's data_size is clipped to the remainder.""" - d = FixedDimension(size=10, extent=95) - assert d.nchunks == 10 - assert d.data_size(0) == 10 - assert d.data_size(9) == 5 # 95 - 9*10 = 5 - assert d.chunk_size(9) == 10 # codec buffer always full - - # FixedDimension.data_size does not bounds-check for performance. - # OOB access is tested via ChunkGrid.__getitem__. 
- - def test_chunk_grid_boundary_getitem(self) -> None: - """ChunkGrid with boundary FixedDimension via direct construction.""" - g = ChunkGrid(dimensions=(FixedDimension(10, 95), FixedDimension(20, 40))) - spec = g[(9, 1)] - assert spec is not None - assert spec.shape == (5, 20) # data: (95-90, 40-20) - assert spec.codec_shape == (10, 20) # codec buffers are full - assert spec.is_boundary - - def test_chunk_grid_boundary_iter(self) -> None: - """Iterating a boundary grid yields correct boundary ChunkSpecs.""" - g = ChunkGrid(dimensions=(FixedDimension(10, 25),)) - specs = list(g) - assert len(specs) == 3 - assert specs[0].shape == (10,) - assert specs[1].shape == (10,) - assert specs[2].shape == (5,) - assert specs[2].is_boundary - assert not specs[0].is_boundary - - def test_chunk_grid_boundary_shape(self) -> None: - """shape property with boundary extent.""" - g = ChunkGrid(dimensions=(FixedDimension(10, 95),)) - assert g.grid_shape == (10,) # ceildiv(95, 10) = 10 - - # -- Boundary FixedDimension in rectilinear serialization -- - - def test_boundary_fixed_dim_mixed_grid(self) -> None: - """A grid mixing VaryingDimension and boundary FixedDimension works correctly.""" - g = ChunkGrid( - dimensions=( - VaryingDimension([10, 20, 30], extent=60), - FixedDimension(size=10, extent=95), - ) - ) - assert g.grid_shape == (3, 10) - # Boundary chunk along dim 1 has clipped data size - spec = g[(0, 9)] - assert spec is not None - assert spec.shape == (10, 5) - assert spec.codec_shape == (10, 10) - - def test_exact_extent_fixed_dim_mixed_grid(self) -> None: - """No boundary: extent == size * nchunks.""" - g = ChunkGrid( - dimensions=( - VaryingDimension([10, 20], extent=30), - FixedDimension(size=25, extent=100), - ) - ) - assert g.grid_shape == (2, 4) - # All chunks along dim 1 have full size - for i in range(4): - spec = g[(0, i)] - assert spec is not None - assert spec.shape[1] == 25 - - # -- Zero-size and zero-extent -- - - def test_zero_size_zero_extent(self) -> None: - 
"""FixedDimension(size=0, extent=0) => 0 chunks (consistent with size=0, extent=5).""" - d = FixedDimension(size=0, extent=0) - assert d.nchunks == 0 - # OOB access tested via ChunkGrid.__getitem__, not direct method calls - g = ChunkGrid(dimensions=(d,)) - assert g[0] is None - - def test_zero_size_nonzero_extent(self) -> None: - """FixedDimension(size=0, extent=5) => 0 chunks (can't partition).""" - d = FixedDimension(size=0, extent=5) - assert d.nchunks == 0 - g = ChunkGrid(dimensions=(d,)) - assert g[0] is None - - def test_zero_extent_nonzero_size(self) -> None: - """FixedDimension(size=10, extent=0) => 0 chunks.""" - d = FixedDimension(size=10, extent=0) - assert d.nchunks == 0 - g = ChunkGrid(dimensions=(d,)) - assert g[0] is None - - # -- 0-d grid -- - - def test_0d_grid_getitem(self) -> None: - """0-d grid has exactly one chunk at coords ().""" - g = ChunkGrid.from_regular((), ()) - spec = g[()] - assert spec is not None - assert spec.shape == () - assert spec.codec_shape == () - assert not spec.is_boundary - - def test_0d_grid_iter(self) -> None: - """0-d grid iteration yields a single ChunkSpec.""" - g = ChunkGrid.from_regular((), ()) - specs = list(g) - assert len(specs) == 1 - - def test_0d_grid_all_chunk_coords(self) -> None: - """0-d grid has one chunk coord: the empty tuple.""" - g = ChunkGrid.from_regular((), ()) - coords = list(g.all_chunk_coords()) - assert coords == [()] - - def test_0d_grid_nchunks(self) -> None: - g = ChunkGrid.from_regular((), ()) - assert g.get_nchunks() == 1 - - # -- with_extent edge cases -- - - def test_with_extent_preserves_varying_extent(self) -> None: - """with_extent on VaryingDimension preserves extent when unchanged.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - assert isinstance(g.dimensions[0], VaryingDimension) - assert g.dimensions[0].extent == 60 - - d2 = g.dimensions[0].with_extent(60) - assert isinstance(d2, VaryingDimension) - assert d2.extent == 60 - - def 
test_with_extent_rebinds_fixed_extent(self) -> None: - """with_extent on FixedDimension updates the extent.""" - g = ChunkGrid.from_regular((100, 200), (10, 20)) - assert g.dimensions[0].extent == 100 - - d2 = g.dimensions[0].with_extent(50) - assert isinstance(d2, FixedDimension) - assert d2.extent == 50 - - # -- ChunkGrid.__getitem__ validation -- - - def test_getitem_int_1d_regular(self) -> None: - """Integer indexing works for 1-d regular grids.""" - g = ChunkGrid.from_regular((100,), (10,)) - spec = g[0] - assert spec is not None - assert spec.shape == (10,) - assert spec.slices == (slice(0, 10, 1),) - # Boundary chunk - spec = g[9] - assert spec is not None - assert spec.shape == (10,) - - def test_getitem_int_1d_rectilinear(self) -> None: - """Integer indexing works for 1-d rectilinear grids.""" - g = ChunkGrid.from_rectilinear([[20, 30, 50]], array_shape=(100,)) - spec = g[0] - assert spec is not None - assert spec.shape == (20,) - spec = g[1] - assert spec is not None - assert spec.shape == (30,) - spec = g[2] - assert spec is not None - assert spec.shape == (50,) - - def test_getitem_int_0d_raises(self) -> None: - """Integer indexing raises ValueError for 0-d grids (ndim mismatch).""" - g = ChunkGrid.from_regular((), ()) - with pytest.raises(ValueError, match="Expected 0 coordinate.*got 1"): - g[0] - - def test_getitem_int_2d_raises(self) -> None: - """Integer indexing raises ValueError for 2-d grids (ndim mismatch).""" - g = ChunkGrid.from_regular((100, 200), (10, 20)) - with pytest.raises(ValueError, match="Expected 2 coordinate.*got 1"): - g[0] - - def test_getitem_int_oob_returns_none(self) -> None: - """Integer OOB returns None for 1-d grid.""" - g = ChunkGrid.from_regular((100,), (10,)) - assert g[10] is None - assert g[99] is None - - def test_getitem_negative_index_returns_none(self) -> None: - g = ChunkGrid.from_regular((100,), (10,)) - assert g[(-1,)] is None - - def test_getitem_oob_returns_none(self) -> None: - g = 
ChunkGrid.from_regular((100,), (10,)) - assert g[(10,)] is None - assert g[(99,)] is None - - # -- ChunkSpec properties -- - - def test_chunk_spec_empty_slices(self) -> None: - """ChunkSpec with zero-width slice.""" - spec = ChunkSpec(slices=(slice(10, 10),), codec_shape=(0,)) - assert spec.shape == (0,) - assert not spec.is_boundary - - def test_chunk_spec_multidim_boundary(self) -> None: - """is_boundary only when shape != codec_shape.""" - spec = ChunkSpec( - slices=(slice(0, 10), slice(0, 5)), - codec_shape=(10, 10), - ) - assert spec.shape == (10, 5) - assert spec.is_boundary # second dim differs +def test_sharding_accepts_divisible_rectilinear() -> None: + """Rectilinear shard sizes all divisible by inner chunk_shape should pass.""" + from zarr.codecs.sharding import ShardingCodec + from zarr.core.dtype import Float32 + from zarr.core.metadata.v3 import RectilinearChunkGrid - # -- VaryingDimension data_size -- + codec = ShardingCodec(chunk_shape=(5, 5)) + grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 30), (50, 50))) - def test_varying_dim_data_size_equals_chunk_size(self) -> None: - """For VaryingDimension, data_size == chunk_size (no padding).""" - d = VaryingDimension([10, 20, 5], extent=35) - for i in range(3): - assert d.data_size(i) == d.chunk_size(i) + codec.validate( + shape=(60, 100), + dtype=Float32(), + chunk_grid=grid_meta, + ) -class TestOrthogonalIndexerRectilinear: - """OrthogonalIndexer must use correct per-chunk sizes for VaryingDimension, - not a hardcoded 1. 
The chunk_shape field is used by ix_() to convert slices - to ranges for advanced indexing.""" +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- - def test_orthogonal_int_array_selection_rectilinear(self) -> None: - """Integer array selection with rectilinear grid must produce correct - chunk-local selections.""" - from zarr.core.indexing import OrthogonalIndexer - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - indexer = OrthogonalIndexer( - selection=(np.array([5, 15, 35]), slice(None)), - shape=(60, 100), - chunk_grid=g, - ) - projections = list(indexer) - # Grid: dim0 chunks [0..10), [10..30), [30..60); dim1 chunks [0..50), [50..100) - # Indices 5, 15, 35 land in chunks 0, 1, 2 respectively. - # Combined with slice(None) over 2 dim1 chunks, we get 6 projections. - chunk_coords = [p.chunk_coords for p in projections] - assert chunk_coords == [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)] - - def test_orthogonal_bool_array_selection_rectilinear(self) -> None: - """Boolean array selection with rectilinear grid produces correct chunk projections.""" - from zarr.core.indexing import OrthogonalIndexer - - # chunks: dim0 = [10, 20, 30], dim1 = [50, 50] - # mask selects: idx 5 (chunk 0), idx 15 (chunk 1), idx 35 (chunk 2) - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - mask = np.zeros(60, dtype=bool) - mask[5] = True - mask[15] = True - mask[35] = True - indexer = OrthogonalIndexer( - selection=(mask, slice(None)), - shape=(60, 100), - chunk_grid=g, +def test_edge_case_chunk_grid_boundary_getitem() -> None: + """ChunkGrid with boundary FixedDimension via direct construction.""" + g = ChunkGrid(dimensions=(FixedDimension(10, 95), FixedDimension(20, 40))) + spec = g[(9, 1)] + assert spec is not None + assert spec.shape == (5, 20) + assert spec.codec_shape == (10, 20) + assert 
spec.is_boundary + + +def test_edge_case_chunk_grid_boundary_iter() -> None: + """Iterating a boundary grid yields correct boundary ChunkSpecs.""" + g = ChunkGrid(dimensions=(FixedDimension(10, 25),)) + specs = list(g) + assert len(specs) == 3 + assert specs[0].shape == (10,) + assert specs[1].shape == (10,) + assert specs[2].shape == (5,) + assert specs[2].is_boundary + assert not specs[0].is_boundary + + +def test_edge_case_chunk_grid_boundary_shape() -> None: + """shape property with boundary extent.""" + g = ChunkGrid(dimensions=(FixedDimension(10, 95),)) + assert g.grid_shape == (10,) + + +# -- Zero-size and zero-extent -- + + +@pytest.mark.parametrize( + ("size", "extent"), + [(0, 0), (0, 5), (10, 0)], + ids=["zero-size-zero-extent", "zero-size-nonzero-extent", "zero-extent-nonzero-size"], +) +def test_edge_case_zero_size_or_extent(size: int, extent: int) -> None: + """FixedDimension with zero size or extent has zero chunks and getitem returns None""" + d = FixedDimension(size=size, extent=extent) + assert d.nchunks == 0 + g = ChunkGrid(dimensions=(d,)) + assert g[0] is None + + +# -- 0-d grid -- + + +def test_0d_grid_getitem() -> None: + """0-d grid has exactly one chunk at coords ().""" + g = ChunkGrid.from_sizes((), ()) + spec = g[()] + assert spec is not None + assert spec.shape == () + assert spec.codec_shape == () + assert not spec.is_boundary + + +def test_0d_grid_iter() -> None: + """0-d grid iteration yields a single ChunkSpec.""" + g = ChunkGrid.from_sizes((), ()) + specs = list(g) + assert len(specs) == 1 + + +def test_0d_grid_all_chunk_coords() -> None: + """0-d grid has one chunk coord: the empty tuple.""" + g = ChunkGrid.from_sizes((), ()) + coords = list(g.all_chunk_coords()) + assert coords == [()] + + +def test_0d_grid_nchunks() -> None: + """0-d grid reports exactly one chunk""" + g = ChunkGrid.from_sizes((), ()) + assert g.get_nchunks() == 1 + + +# -- parse_chunk_grid edge cases -- + + +def test_parse_chunk_grid_preserves_varying_extent() 
-> None: + """parse_chunk_grid does not overwrite VaryingDimension extent.""" + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + assert isinstance(g.dimensions[0], VaryingDimension) + assert g.dimensions[0].extent == 60 + + g2 = ChunkGrid( + dimensions=tuple( + dim.with_extent(ext) for dim, ext in zip(g.dimensions, (60, 100), strict=True) ) - projections = list(indexer) - # 3 chunks touched in dim0 x 2 chunks in dim1 = 6 projections - assert len(projections) == 6 - chunk_coords = [p.chunk_coords for p in projections] - assert (0, 0) in chunk_coords - assert (1, 0) in chunk_coords - assert (2, 0) in chunk_coords - assert (0, 1) in chunk_coords - assert (1, 1) in chunk_coords - assert (2, 1) in chunk_coords - - def test_orthogonal_advanced_indexing_produces_correct_projections(self) -> None: - """Verify OrthogonalIndexer produces correct chunk projections - for advanced indexing with VaryingDimension.""" - from zarr.core.indexing import OrthogonalIndexer - - g = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - indexer = OrthogonalIndexer( - selection=(np.array([5, 15]), slice(None)), - shape=(60, 100), - chunk_grid=g, + ) + assert isinstance(g2.dimensions[0], VaryingDimension) + assert g2.dimensions[0].extent == 60 + + +def test_parse_chunk_grid_rebinds_fixed_extent() -> None: + """parse_chunk_grid updates FixedDimension extent from array shape.""" + g = ChunkGrid.from_sizes((100, 200), (10, 20)) + assert g.dimensions[0].extent == 100 + + g2 = ChunkGrid( + dimensions=tuple( + dim.with_extent(ext) for dim, ext in zip(g.dimensions, (50, 100), strict=True) ) - projections = list(indexer) - # index 5 is in chunk 0 (edges [10,...]), index 15 is in chunk 1 (edges [...,20,...]) - # dim 1 slice(None) covers both chunks [50, 50] - # cartesian product: 2 chunks in dim 0 x 2 chunks in dim 1 = 4 projections - assert len(projections) == 4 - coords = [p.chunk_coords for p in projections] - assert (0, 0) in coords - assert (0, 1) in 
coords - assert (1, 0) in coords - assert (1, 1) in coords - - -class TestShardingValidationRectilinear: - """ShardingCodec.validate must check divisibility for rectilinear grids too.""" - - def test_sharding_rejects_non_divisible_rectilinear(self) -> None: - """Rectilinear shard sizes not divisible by inner chunk_shape should raise.""" - from zarr.codecs.sharding import ShardingCodec - from zarr.core.dtype import Float32 - - codec = ShardingCodec(chunk_shape=(5, 5)) - # 17 is not divisible by 5 - grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 17), (50, 50))) - - with pytest.raises(ValueError, match="divisible"): - codec.validate( - shape=(47, 100), - dtype=Float32(), - chunk_grid=grid_meta, - ) + ) + assert isinstance(g2.dimensions[0], FixedDimension) + assert g2.dimensions[0].extent == 50 + assert g2.grid_shape == (5, 5) + + +# -- ChunkGrid.__getitem__ validation -- + + +def test_getitem_int_1d_regular() -> None: + """Integer indexing works for 1-d regular grids.""" + g = ChunkGrid.from_sizes((100,), (10,)) + spec = g[0] + assert spec is not None + assert spec.shape == (10,) + assert spec.slices == (slice(0, 10, 1),) + spec = g[9] + assert spec is not None + assert spec.shape == (10,) + + +def test_getitem_int_1d_rectilinear() -> None: + """Integer indexing works for 1-d rectilinear grids.""" + g = ChunkGrid.from_sizes((100,), [[20, 30, 50]]) + spec = g[0] + assert spec is not None + assert spec.shape == (20,) + spec = g[1] + assert spec is not None + assert spec.shape == (30,) + spec = g[2] + assert spec is not None + assert spec.shape == (50,) + + +@pytest.mark.parametrize( + ("shape", "chunks", "match"), + [ + ((), (), "Expected 0 coordinate.*got 1"), + ((100, 200), (10, 20), "Expected 2 coordinate.*got 1"), + ], + ids=["0d", "2d"], +) +def test_getitem_int_ndim_mismatch_raises( + shape: tuple[int, ...], chunks: tuple[int, ...], match: str +) -> None: + """Integer indexing on a multi-dim or 0-d grid raises ValueError for ndim mismatch""" + g = 
ChunkGrid.from_sizes(shape, chunks) + with pytest.raises(ValueError, match=match): + g[0] + + +@pytest.mark.parametrize( + "index", + [(10,), (99,), (-1,)], + ids=["oob-10", "oob-99", "negative"], +) +def test_getitem_oob_returns_none(index: tuple[int, ...]) -> None: + """Out-of-bounds or negative chunk indices return None""" + g = ChunkGrid.from_sizes((100,), (10,)) + assert g[index] is None - def test_sharding_accepts_divisible_rectilinear(self) -> None: - """Rectilinear shard sizes all divisible by inner chunk_shape should pass.""" - from zarr.codecs.sharding import ShardingCodec - from zarr.core.dtype import Float32 - codec = ShardingCodec(chunk_shape=(5, 5)) - grid_meta = RectilinearChunkGrid(chunk_shapes=((10, 20, 30), (50, 50))) +# -- Rectilinear with zero-nchunks FixedDimension -- - # Should not raise - codec.validate( - shape=(60, 100), - dtype=Float32(), - chunk_grid=grid_meta, + +def test_zero_nchunks_fixed_dim_in_rectilinear() -> None: + """A rectilinear grid with a 0-extent FixedDimension still has valid size.""" + g = ChunkGrid( + dimensions=( + VaryingDimension([10, 20], extent=30), + FixedDimension(size=10, extent=0), ) + ) + assert g.grid_shape == (2, 0) + + +# -- VaryingDimension data_size -- + + +def test_varying_dim_data_size_equals_chunk_size() -> None: + """For VaryingDimension, data_size == chunk_size (no padding).""" + d = VaryingDimension([10, 20, 5], extent=35) + for i in range(3): + assert d.data_size(i) == d.chunk_size(i) + + +# --------------------------------------------------------------------------- +# OrthogonalIndexer rectilinear tests +# --------------------------------------------------------------------------- + + +def test_orthogonal_int_array_selection_rectilinear() -> None: + """Integer array selection with rectilinear grid must produce correct + chunk-local selections.""" + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + indexer = OrthogonalIndexer( + 
selection=(np.array([5, 15, 35]), slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + chunk_coords = [p.chunk_coords for p in projections] + assert chunk_coords == [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)] + + +def test_orthogonal_bool_array_selection_rectilinear() -> None: + """Boolean array selection with rectilinear grid produces correct chunk projections.""" + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + mask = np.zeros(60, dtype=bool) + mask[5] = True + mask[15] = True + mask[35] = True + indexer = OrthogonalIndexer( + selection=(mask, slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) == 6 + chunk_coords = [p.chunk_coords for p in projections] + assert (0, 0) in chunk_coords + assert (1, 0) in chunk_coords + assert (2, 0) in chunk_coords + assert (0, 1) in chunk_coords + assert (1, 1) in chunk_coords + assert (2, 1) in chunk_coords + + +def test_orthogonal_advanced_indexing_produces_correct_projections() -> None: + """Verify OrthogonalIndexer produces correct chunk projections + for advanced indexing with VaryingDimension.""" + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) + indexer = OrthogonalIndexer( + selection=(np.array([5, 15]), slice(None)), + shape=(60, 100), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) == 4 + coords = [p.chunk_coords for p in projections] + assert (0, 0) in coords + assert (0, 1) in coords + assert (1, 0) in coords + assert (1, 1) in coords + + +# --------------------------------------------------------------------------- +# Full pipeline rectilinear tests (helpers) +# --------------------------------------------------------------------------- + + +def _make_1d(tmp_path: Path) -> tuple[zarr.Array[Any], np.ndarray[Any, Any]]: + a = np.arange(30, dtype="int32") + z = 
zarr.create_array( + store=tmp_path / "arr1d.zarr", + shape=(30,), + chunks=[[5, 10, 15]], + dtype="int32", + ) + z[:] = a + return z, a + + +def _make_2d(tmp_path: Path) -> tuple[zarr.Array[Any], np.ndarray[Any, Any]]: + a = np.arange(6000, dtype="int32").reshape(60, 100) + z = zarr.create_array( + store=tmp_path / "arr2d.zarr", + shape=(60, 100), + chunks=[[10, 20, 30], [25, 25, 25, 25]], + dtype="int32", + ) + z[:] = a + return z, a + + +# --- Basic selection --- + + +def test_pipeline_basic_selection_1d(tmp_path: Path) -> None: + """1D rectilinear basic selections match numpy for ints, slices, and full-array reads""" + z, a = _make_1d(tmp_path) + sels: list[Any] = [0, 4, 5, 14, 15, 29, -1, slice(None), slice(3, 18), slice(0, 0)] + for sel in sels: + np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") + + +def test_pipeline_basic_selection_1d_strided(tmp_path: Path) -> None: + """1D rectilinear strided slice selections match numpy""" + z, a = _make_1d(tmp_path) + for sel in [slice(None, None, 2), slice(1, 25, 3), slice(0, 30, 7)]: + np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") + + +def test_pipeline_basic_selection_2d(tmp_path: Path) -> None: + """2D rectilinear basic selections match numpy across chunk boundaries""" + z, a = _make_2d(tmp_path) + selections: list[Any] = [ + 42, + -1, + (9, 24), + (10, 25), + (30, 50), + (59, 99), + slice(None), + (slice(5, 35), slice(20, 80)), + (slice(0, 10), slice(0, 25)), + (slice(10, 10), slice(None)), + (slice(None, None, 3), slice(None, None, 7)), + ] + for sel in selections: + np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") + + +# --- Orthogonal selection --- + + +def test_pipeline_orthogonal_selection_1d_bool(tmp_path: Path) -> None: + """1D boolean orthogonal indexing on rectilinear arrays matches numpy""" + z, a = _make_1d(tmp_path) + ix = np.zeros(30, dtype=bool) + ix[[0, 4, 5, 14, 15, 29]] = True + np.testing.assert_array_equal(z.oindex[ix], a[ix]) + + +def 
test_pipeline_orthogonal_selection_1d_int(tmp_path: Path) -> None: + """1D integer and negative-index orthogonal selection on rectilinear arrays matches numpy""" + z, a = _make_1d(tmp_path) + ix = np.array([0, 4, 5, 14, 15, 29]) + np.testing.assert_array_equal(z.oindex[ix], a[ix]) + ix_neg = np.array([0, -1, -15, -25]) + np.testing.assert_array_equal(z.oindex[ix_neg], a[ix_neg]) + + +def test_pipeline_orthogonal_selection_2d_bool(tmp_path: Path) -> None: + """2D boolean orthogonal selection on rectilinear arrays matches numpy""" + z, a = _make_2d(tmp_path) + ix0 = np.zeros(60, dtype=bool) + ix0[[0, 9, 10, 29, 30, 59]] = True + ix1 = np.zeros(100, dtype=bool) + ix1[[0, 24, 25, 49, 50, 99]] = True + np.testing.assert_array_equal(z.oindex[ix0, ix1], a[np.ix_(ix0, ix1)]) + + +def test_pipeline_orthogonal_selection_2d_int(tmp_path: Path) -> None: + """2D integer orthogonal selection on rectilinear arrays matches numpy""" + z, a = _make_2d(tmp_path) + ix0 = np.array([0, 9, 10, 29, 30, 59]) + ix1 = np.array([0, 24, 25, 49, 50, 99]) + np.testing.assert_array_equal(z.oindex[ix0, ix1], a[np.ix_(ix0, ix1)]) + + +def test_pipeline_orthogonal_selection_2d_mixed(tmp_path: Path) -> None: + """2D mixed int-array and slice orthogonal selection on rectilinear arrays matches numpy""" + z, a = _make_2d(tmp_path) + ix = np.array([0, 9, 10, 29, 30, 59]) + np.testing.assert_array_equal(z.oindex[ix, slice(25, 75)], a[np.ix_(ix, np.arange(25, 75))]) + np.testing.assert_array_equal( + z.oindex[slice(10, 30), ix[:4]], a[np.ix_(np.arange(10, 30), ix[:4])] + ) + + +# --- Coordinate (vindex) selection --- + + +def test_pipeline_coordinate_selection_1d(tmp_path: Path) -> None: + """1D coordinate (vindex) selection on rectilinear arrays matches numpy""" + z, a = _make_1d(tmp_path) + ix = np.array([0, 4, 5, 14, 15, 29]) + np.testing.assert_array_equal(z.vindex[ix], a[ix]) + + +def test_pipeline_coordinate_selection_2d(tmp_path: Path) -> None: + """2D coordinate (vindex) selection on rectilinear 
arrays matches numpy""" + z, a = _make_2d(tmp_path) + r = np.array([0, 9, 10, 29, 30, 59]) + c = np.array([0, 24, 25, 49, 50, 99]) + np.testing.assert_array_equal(z.vindex[r, c], a[r, c]) + + +def test_pipeline_coordinate_selection_2d_bool_mask(tmp_path: Path) -> None: + """2D boolean mask vindex selection on rectilinear arrays matches numpy""" + z, a = _make_2d(tmp_path) + mask = a > 3000 + np.testing.assert_array_equal(z.vindex[mask], a[mask]) + + +# --- Block selection --- + + +def test_pipeline_block_selection_1d(tmp_path: Path) -> None: + """1D block selection on rectilinear arrays returns correct chunk data""" + z, a = _make_1d(tmp_path) + np.testing.assert_array_equal(z.blocks[0], a[0:5]) + np.testing.assert_array_equal(z.blocks[1], a[5:15]) + np.testing.assert_array_equal(z.blocks[2], a[15:30]) + np.testing.assert_array_equal(z.blocks[-1], a[15:30]) + np.testing.assert_array_equal(z.blocks[0:2], a[0:15]) + np.testing.assert_array_equal(z.blocks[1:3], a[5:30]) + np.testing.assert_array_equal(z.blocks[:], a[:]) + + +def test_pipeline_block_selection_2d(tmp_path: Path) -> None: + """2D block selection on rectilinear arrays returns correct chunk data""" + z, a = _make_2d(tmp_path) + np.testing.assert_array_equal(z.blocks[0, 0], a[0:10, 0:25]) + np.testing.assert_array_equal(z.blocks[1, 2], a[10:30, 50:75]) + np.testing.assert_array_equal(z.blocks[2, 3], a[30:60, 75:100]) + np.testing.assert_array_equal(z.blocks[-1, -1], a[30:60, 75:100]) + np.testing.assert_array_equal(z.blocks[0:2, 1:3], a[0:30, 25:75]) + np.testing.assert_array_equal(z.blocks[:, :], a[:, :]) + + +def test_pipeline_set_block_selection_1d(tmp_path: Path) -> None: + """Writing via 1D block selection on rectilinear arrays persists correctly""" + z, a = _make_1d(tmp_path) + val = np.full(10, -1, dtype="int32") + z.blocks[1] = val + a[5:15] = val + np.testing.assert_array_equal(z[:], a) + + +def test_pipeline_set_block_selection_2d(tmp_path: Path) -> None: + """Writing via 2D block selection on 
rectilinear arrays persists correctly""" + z, a = _make_2d(tmp_path) + val = np.full((30, 50), -99, dtype="int32") + z.blocks[0:2, 1:3] = val + a[0:30, 25:75] = val + np.testing.assert_array_equal(z[:], a) + + +def test_pipeline_block_selection_slice_stop_at_nchunks(tmp_path: Path) -> None: + """Block slice with stop == nchunks exercises the dim_len fallback.""" + z, a = _make_1d(tmp_path) + np.testing.assert_array_equal(z.blocks[1:3], a[5:30]) + np.testing.assert_array_equal(z.blocks[0:10], a[:]) + +def test_pipeline_block_selection_slice_stop_at_nchunks_2d(tmp_path: Path) -> None: + """Same fallback test for 2D rectilinear arrays.""" + z, a = _make_2d(tmp_path) + np.testing.assert_array_equal(z.blocks[2:3, 3:4], a[30:60, 75:100]) + np.testing.assert_array_equal(z.blocks[0:99, 0:99], a[:, :]) -class TestFullPipelineRectilinear: - """End-to-end read/write tests through the full Array pipeline.""" - @staticmethod - def _make_1d(tmp_path: Path) -> tuple[zarr.Array[Any], np.ndarray[Any, Any]]: - a = np.arange(30, dtype="int32") - z = zarr.create_array( - store=tmp_path / "arr1d.zarr", +# --- Set coordinate selection --- + + +def test_pipeline_set_coordinate_selection_1d(tmp_path: Path) -> None: + """Writing via 1D coordinate selection on rectilinear arrays persists correctly""" + z, a = _make_1d(tmp_path) + ix = np.array([0, 4, 5, 14, 15, 29]) + val = np.full(len(ix), -7, dtype="int32") + z.vindex[ix] = val + a[ix] = val + np.testing.assert_array_equal(z[:], a) + + +def test_pipeline_set_coordinate_selection_2d(tmp_path: Path) -> None: + """Writing via 2D coordinate selection on rectilinear arrays persists correctly""" + z, a = _make_2d(tmp_path) + r = np.array([0, 9, 10, 29, 30, 59]) + c = np.array([0, 24, 25, 49, 50, 99]) + val = np.full(len(r), -42, dtype="int32") + z.vindex[r, c] = val + a[r, c] = val + np.testing.assert_array_equal(z[:], a) + + +# --- Set selection --- + + +def test_pipeline_set_basic_selection(tmp_path: Path) -> None: + """Writing via basic 
slice selection on rectilinear arrays persists correctly""" + z, a = _make_2d(tmp_path) + new_data = np.full((20, 50), -1, dtype="int32") + z[5:25, 10:60] = new_data + a[5:25, 10:60] = new_data + np.testing.assert_array_equal(z[:], a) + + +def test_pipeline_set_orthogonal_selection(tmp_path: Path) -> None: + """Writing via orthogonal selection on rectilinear arrays persists correctly""" + z, a = _make_2d(tmp_path) + rows = np.array([0, 10, 30]) + cols = np.array([0, 25, 50, 75]) + val = np.full((3, 4), -99, dtype="int32") + z.oindex[rows, cols] = val + a[np.ix_(rows, cols)] = val + np.testing.assert_array_equal(z[:], a) + + +# --- Higher dimensions --- + + +def test_pipeline_3d_array(tmp_path: Path) -> None: + """3D rectilinear array write and read-back match numpy""" + shape = (12, 20, 15) + chunk_shapes = [[4, 8], [5, 5, 10], [5, 10]] + a = np.arange(int(np.prod(shape)), dtype="int32").reshape(shape) + z = zarr.create_array( + store=tmp_path / "arr3d.zarr", + shape=shape, + chunks=chunk_shapes, + dtype="int32", + ) + z[:] = a + np.testing.assert_array_equal(z[:], a) + np.testing.assert_array_equal(z[2:10, 3:18, 4:14], a[2:10, 3:18, 4:14]) + + +def test_pipeline_1d_single_chunk(tmp_path: Path) -> None: + """Single-chunk rectilinear array write and read-back match numpy""" + a = np.arange(20, dtype="int32") + z = zarr.create_array( + store=tmp_path / "arr1c.zarr", + shape=(20,), + chunks=[[20]], + dtype="int32", + ) + z[:] = a + np.testing.assert_array_equal(z[:], a) + + +# --- Persistence roundtrip --- + + +def test_pipeline_persistence_roundtrip(tmp_path: Path) -> None: + """Rectilinear array survives close and reopen with correct data""" + _, a = _make_2d(tmp_path) + z2 = zarr.open_array(store=tmp_path / "arr2d.zarr", mode="r") + assert not ChunkGrid.from_metadata(z2.metadata).is_regular + np.testing.assert_array_equal(z2[:], a) + + +# --- Highly irregular chunks --- + + +def test_pipeline_highly_irregular_chunks(tmp_path: Path) -> None: + """Highly irregular 
chunk sizes produce correct write and partial-read results""" + shape = (100, 100) + chunk_shapes = [[5, 10, 15, 20, 50], [100]] + a = np.arange(10000, dtype="int32").reshape(shape) + z = zarr.create_array( + store=tmp_path / "irreg.zarr", + shape=shape, + chunks=chunk_shapes, + dtype="int32", + ) + z[:] = a + np.testing.assert_array_equal(z[:], a) + np.testing.assert_array_equal(z[3:97, 10:90], a[3:97, 10:90]) + + +# --- API validation --- + + +def test_pipeline_v2_rejects_rectilinear(tmp_path: Path) -> None: + """Creating a rectilinear array with zarr_format=2 raises ValueError""" + with pytest.raises(ValueError, match="Zarr format 2"): + zarr.create_array( + store=tmp_path / "v2.zarr", shape=(30,), - chunks=[[5, 10, 15]], + chunks=[[10, 20]], dtype="int32", + zarr_format=2, ) - z[:] = a - return z, a - - @staticmethod - def _make_2d(tmp_path: Path) -> tuple[zarr.Array[Any], np.ndarray[Any, Any]]: - a = np.arange(6000, dtype="int32").reshape(60, 100) - z = zarr.create_array( - store=tmp_path / "arr2d.zarr", + + +def test_pipeline_sharding_rejects_rectilinear_chunks_with_shards(tmp_path: Path) -> None: + """Rectilinear chunks (inner) with sharding is not supported.""" + with pytest.raises(ValueError, match="Rectilinear chunks with sharding"): + zarr.create_array( + store=tmp_path / "shard.zarr", shape=(60, 100), chunks=[[10, 20, 30], [25, 25, 25, 25]], + shards=(30, 50), dtype="int32", ) - z[:] = a - return z, a - - # --- Basic selection --- - - def test_basic_selection_1d(self, tmp_path: Path) -> None: - z, a = self._make_1d(tmp_path) - sels: list[Any] = [0, 4, 5, 14, 15, 29, -1, slice(None), slice(3, 18), slice(0, 0)] - for sel in sels: - np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") - - def test_basic_selection_1d_strided(self, tmp_path: Path) -> None: - z, a = self._make_1d(tmp_path) - for sel in [slice(None, None, 2), slice(1, 25, 3), slice(0, 30, 7)]: - np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") - - def 
test_basic_selection_2d(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - selections: list[Any] = [ - 42, - -1, - (9, 24), - (10, 25), - (30, 50), - (59, 99), - slice(None), - (slice(5, 35), slice(20, 80)), - (slice(0, 10), slice(0, 25)), # within one chunk - (slice(10, 10), slice(None)), # empty - (slice(None, None, 3), slice(None, None, 7)), # strided - ] - for sel in selections: - np.testing.assert_array_equal(z[sel], a[sel], err_msg=f"sel={sel}") - - # --- Orthogonal selection --- - - def test_orthogonal_selection_1d_bool(self, tmp_path: Path) -> None: - z, a = self._make_1d(tmp_path) - ix = np.zeros(30, dtype=bool) - ix[[0, 4, 5, 14, 15, 29]] = True - np.testing.assert_array_equal(z.oindex[ix], a[ix]) - - def test_orthogonal_selection_1d_int(self, tmp_path: Path) -> None: - z, a = self._make_1d(tmp_path) - ix = np.array([0, 4, 5, 14, 15, 29]) - np.testing.assert_array_equal(z.oindex[ix], a[ix]) - ix_neg = np.array([0, -1, -15, -25]) - np.testing.assert_array_equal(z.oindex[ix_neg], a[ix_neg]) - - def test_orthogonal_selection_2d_bool(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - ix0 = np.zeros(60, dtype=bool) - ix0[[0, 9, 10, 29, 30, 59]] = True - ix1 = np.zeros(100, dtype=bool) - ix1[[0, 24, 25, 49, 50, 99]] = True - np.testing.assert_array_equal(z.oindex[ix0, ix1], a[np.ix_(ix0, ix1)]) - - def test_orthogonal_selection_2d_int(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - ix0 = np.array([0, 9, 10, 29, 30, 59]) - ix1 = np.array([0, 24, 25, 49, 50, 99]) - np.testing.assert_array_equal(z.oindex[ix0, ix1], a[np.ix_(ix0, ix1)]) - - def test_orthogonal_selection_2d_mixed(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - ix = np.array([0, 9, 10, 29, 30, 59]) - np.testing.assert_array_equal(z.oindex[ix, slice(25, 75)], a[np.ix_(ix, np.arange(25, 75))]) - np.testing.assert_array_equal( - z.oindex[slice(10, 30), ix[:4]], a[np.ix_(np.arange(10, 30), ix[:4])] - ) - # --- Coordinate (vindex) selection 
--- - - def test_coordinate_selection_1d(self, tmp_path: Path) -> None: - z, a = self._make_1d(tmp_path) - ix = np.array([0, 4, 5, 14, 15, 29]) - np.testing.assert_array_equal(z.vindex[ix], a[ix]) - - def test_coordinate_selection_2d(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - r = np.array([0, 9, 10, 29, 30, 59]) - c = np.array([0, 24, 25, 49, 50, 99]) - np.testing.assert_array_equal(z.vindex[r, c], a[r, c]) - - def test_coordinate_selection_2d_bool_mask(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - mask = a > 3000 - np.testing.assert_array_equal(z.vindex[mask], a[mask]) - - # --- Block selection --- - - def test_block_selection_1d(self, tmp_path: Path) -> None: - z, a = self._make_1d(tmp_path) - # chunks: [5, 10, 15] -> offsets 0, 5, 15 - # block 0: a[0:5], block 1: a[5:15], block 2: a[15:30] - np.testing.assert_array_equal(z.blocks[0], a[0:5]) - np.testing.assert_array_equal(z.blocks[1], a[5:15]) - np.testing.assert_array_equal(z.blocks[2], a[15:30]) - np.testing.assert_array_equal(z.blocks[-1], a[15:30]) - # slice of blocks - np.testing.assert_array_equal(z.blocks[0:2], a[0:15]) - np.testing.assert_array_equal(z.blocks[1:3], a[5:30]) - np.testing.assert_array_equal(z.blocks[:], a[:]) - - def test_block_selection_2d(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - # dim0 chunks: [10, 20, 30] -> offsets 0, 10, 30 - # dim1 chunks: [25, 25, 25, 25] -> offsets 0, 25, 50, 75 - np.testing.assert_array_equal(z.blocks[0, 0], a[0:10, 0:25]) - np.testing.assert_array_equal(z.blocks[1, 2], a[10:30, 50:75]) - np.testing.assert_array_equal(z.blocks[2, 3], a[30:60, 75:100]) - np.testing.assert_array_equal(z.blocks[-1, -1], a[30:60, 75:100]) - # slice of blocks - np.testing.assert_array_equal(z.blocks[0:2, 1:3], a[0:30, 25:75]) - np.testing.assert_array_equal(z.blocks[:, :], a[:, :]) - - def test_set_block_selection_1d(self, tmp_path: Path) -> None: - z, a = self._make_1d(tmp_path) - # overwrite block 1 (a[5:15]) - val 
= np.full(10, -1, dtype="int32") - z.blocks[1] = val - a[5:15] = val - np.testing.assert_array_equal(z[:], a) - - def test_set_block_selection_2d(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - # overwrite blocks [0:2, 1:3] -> a[0:30, 25:75] - val = np.full((30, 50), -99, dtype="int32") - z.blocks[0:2, 1:3] = val - a[0:30, 25:75] = val - np.testing.assert_array_equal(z[:], a) - - def test_block_selection_slice_stop_at_nchunks(self, tmp_path: Path) -> None: - """Block slice with stop == nchunks exercises the dim_len fallback - in BlockIndexer (``chunk_offset(stop) if stop < nchunks else dim_len``). - """ - z, a = self._make_1d(tmp_path) - # nchunks == 3; stop=3 hits the `else dim_len` path - np.testing.assert_array_equal(z.blocks[1:3], a[5:30]) - # stop > nchunks should also produce the full remainder - np.testing.assert_array_equal(z.blocks[0:10], a[:]) - - def test_block_selection_slice_stop_at_nchunks_2d(self, tmp_path: Path) -> None: - """Same fallback test for 2D rectilinear arrays.""" - z, a = self._make_2d(tmp_path) - # dim0 nchunks=3, dim1 nchunks=4 - np.testing.assert_array_equal(z.blocks[2:3, 3:4], a[30:60, 75:100]) - np.testing.assert_array_equal(z.blocks[0:99, 0:99], a[:, :]) - - # --- Set coordinate selection --- - - def test_set_coordinate_selection_1d(self, tmp_path: Path) -> None: - z, a = self._make_1d(tmp_path) - ix = np.array([0, 4, 5, 14, 15, 29]) - val = np.full(len(ix), -7, dtype="int32") - z.vindex[ix] = val - a[ix] = val - np.testing.assert_array_equal(z[:], a) - - def test_set_coordinate_selection_2d(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - r = np.array([0, 9, 10, 29, 30, 59]) - c = np.array([0, 24, 25, 49, 50, 99]) - val = np.full(len(r), -42, dtype="int32") - z.vindex[r, c] = val - a[r, c] = val - np.testing.assert_array_equal(z[:], a) - - # --- Set selection --- - - def test_set_basic_selection(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - new_data = np.full((20, 50), -1, 
dtype="int32") - z[5:25, 10:60] = new_data - a[5:25, 10:60] = new_data - np.testing.assert_array_equal(z[:], a) - - def test_set_orthogonal_selection(self, tmp_path: Path) -> None: - z, a = self._make_2d(tmp_path) - rows = np.array([0, 10, 30]) - cols = np.array([0, 25, 50, 75]) - val = np.full((3, 4), -99, dtype="int32") - z.oindex[rows, cols] = val - a[np.ix_(rows, cols)] = val - np.testing.assert_array_equal(z[:], a) - - # --- Higher dimensions --- - - def test_3d_array(self, tmp_path: Path) -> None: - shape = (12, 20, 15) - chunk_shapes = [[4, 8], [5, 5, 10], [5, 10]] - a = np.arange(int(np.prod(shape)), dtype="int32").reshape(shape) - z = zarr.create_array( - store=tmp_path / "arr3d.zarr", - shape=shape, - chunks=chunk_shapes, - dtype="int32", - ) - z[:] = a - np.testing.assert_array_equal(z[:], a) - np.testing.assert_array_equal(z[2:10, 3:18, 4:14], a[2:10, 3:18, 4:14]) - - def test_1d_single_chunk(self, tmp_path: Path) -> None: - a = np.arange(20, dtype="int32") - z = zarr.create_array( - store=tmp_path / "arr1c.zarr", - shape=(20,), - chunks=[[20]], - dtype="int32", - ) - z[:] = a - np.testing.assert_array_equal(z[:], a) - - # --- Persistence roundtrip --- - - def test_persistence_roundtrip(self, tmp_path: Path) -> None: - _, a = self._make_2d(tmp_path) - z2 = zarr.open_array(store=tmp_path / "arr2d.zarr", mode="r") - assert not z2._chunk_grid.is_regular - np.testing.assert_array_equal(z2[:], a) - - # --- Highly irregular chunks --- - - def test_highly_irregular_chunks(self, tmp_path: Path) -> None: - shape = (100, 100) - chunk_shapes = [[5, 10, 15, 20, 50], [100]] - a = np.arange(10000, dtype="int32").reshape(shape) - z = zarr.create_array( - store=tmp_path / "irreg.zarr", - shape=shape, - chunks=chunk_shapes, - dtype="int32", - ) - z[:] = a - np.testing.assert_array_equal(z[:], a) - np.testing.assert_array_equal(z[3:97, 10:90], a[3:97, 10:90]) - - # --- API validation --- - - def test_v2_rejects_rectilinear(self, tmp_path: Path) -> None: - with 
pytest.raises(ValueError, match="Zarr format 2"): - zarr.create_array( - store=tmp_path / "v2.zarr", - shape=(30,), - chunks=[[10, 20]], - dtype="int32", - zarr_format=2, - ) - def test_sharding_rejects_rectilinear_chunks_with_shards(self, tmp_path: Path) -> None: - """Rectilinear chunks (inner) with sharding is not supported.""" - with pytest.raises(ValueError, match="Rectilinear chunks with sharding"): - zarr.create_array( - store=tmp_path / "shard.zarr", - shape=(60, 100), - chunks=[[10, 20, 30], [25, 25, 25, 25]], - shards=(30, 50), - dtype="int32", - ) +def test_pipeline_rectilinear_shards_roundtrip(tmp_path: Path) -> None: + """Rectilinear shards with uniform inner chunks: full write/read roundtrip.""" + data = np.arange(120 * 100, dtype="int32").reshape(120, 100) + arr = zarr.create_array( + store=tmp_path / "rect_shards.zarr", + shape=(120, 100), + chunks=(10, 10), + shards=[[60, 40, 20], [50, 50]], + dtype="int32", + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, data) + + +def test_pipeline_rectilinear_shards_partial_read(tmp_path: Path) -> None: + """Partial reads across rectilinear shard boundaries.""" + data = np.arange(120 * 100, dtype="float64").reshape(120, 100) + arr = zarr.create_array( + store=tmp_path / "rect_shards.zarr", + shape=(120, 100), + chunks=(10, 10), + shards=[[60, 40, 20], [50, 50]], + dtype="float64", + ) + arr[:] = data + result = arr[50:70, 40:60] + np.testing.assert_array_equal(result, data[50:70, 40:60]) - def test_rectilinear_shards_roundtrip(self, tmp_path: Path) -> None: - """Rectilinear shards with uniform inner chunks: full write/read roundtrip.""" - import numpy as np - data = np.arange(120 * 100, dtype="int32").reshape(120, 100) - arr = zarr.create_array( - store=tmp_path / "rect_shards.zarr", +def test_pipeline_rectilinear_shards_validates_divisibility(tmp_path: Path) -> None: + """Inner chunk_shape must divide every shard's dimensions.""" + with pytest.raises(ValueError, match="divisible"): 
+ zarr.create_array( + store=tmp_path / "bad.zarr", shape=(120, 100), - chunks=(10, 10), # uniform inner chunks - shards=[[60, 40, 20], [50, 50]], # rectilinear shard boundaries + chunks=(10, 10), + shards=[[60, 45, 15], [50, 50]], dtype="int32", ) - arr[:] = data - result = arr[:] - np.testing.assert_array_equal(result, data) - def test_rectilinear_shards_partial_read(self, tmp_path: Path) -> None: - """Partial reads across rectilinear shard boundaries.""" - import numpy as np - data = np.arange(120 * 100, dtype="float64").reshape(120, 100) - arr = zarr.create_array( - store=tmp_path / "rect_shards.zarr", - shape=(120, 100), - chunks=(10, 10), - shards=[[60, 40, 20], [50, 50]], - dtype="float64", +def test_pipeline_nchunks(tmp_path: Path) -> None: + """Rectilinear array reports the correct total number of chunks""" + z, _ = _make_2d(tmp_path) + assert ChunkGrid.from_metadata(z.metadata).get_nchunks() == 12 + + +def test_pipeline_parse_chunk_grid_regular_from_dict() -> None: + """parse_chunk_grid constructs a regular grid from a metadata dict.""" + d: dict[str, Any] = {"name": "regular", "configuration": {"chunk_shape": [10, 20]}} + meta = parse_chunk_grid(d) + assert isinstance(meta, RegularChunkGridMeta) + g = ChunkGrid.from_sizes((100, 200), tuple(meta.chunk_shape)) + assert g.is_regular + assert g.chunk_shape == (10, 20) + assert g.grid_shape == (10, 10) + assert g.get_nchunks() == 100 + + +# --------------------------------------------------------------------------- +# VaryingDimension boundary tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("edges", "extent", "chunk_idx", "expected_data_size"), + [ + ([10, 20, 30], 50, 0, 10), + ([10, 20, 30], 50, 1, 20), + ([10, 20, 30], 50, 2, 20), + ([10, 20, 30], 60, 2, 30), + ([10, 20, 30], 31, 0, 10), + ([10, 20, 30], 31, 1, 20), + ([10, 20, 30], 31, 2, 1), + ], + ids=[ + "interior-0", + "interior-1", + "boundary-clipped", + "exact-no-clip", + 
"single-element-boundary-0", + "single-element-boundary-1", + "single-element-boundary-2", + ], +) +def test_varying_dimension_boundary_data_size( + edges: list[int], extent: int, chunk_idx: int, expected_data_size: int +) -> None: + """VaryingDimension.data_size clips correctly at boundary chunks""" + d = VaryingDimension(edges, extent=extent) + assert d.data_size(chunk_idx) == expected_data_size + + +def test_varying_dimension_boundary_extent_parameter() -> None: + """VaryingDimension preserves extent and full chunk_size even when extent < sum of edges""" + d = VaryingDimension([10, 20, 30], extent=50) + assert d.extent == 50 + assert d.chunk_size(2) == 30 + + +def test_varying_dimension_extent_exceeds_sum_rejected() -> None: + """VaryingDimension rejects extent greater than sum of edges""" + with pytest.raises(ValueError, match="exceeds sum of edges"): + VaryingDimension([10, 20], extent=50) + + +def test_varying_dimension_negative_extent_rejected() -> None: + """VaryingDimension rejects negative extent""" + with pytest.raises(ValueError, match="must be >= 0"): + VaryingDimension([10, 20], extent=-1) + + +def test_varying_dimension_boundary_chunk_spec() -> None: + """ChunkGrid with a boundary VaryingDimension produces correct ChunkSpec.""" + g = ChunkGrid(dimensions=(VaryingDimension([10, 20, 30], extent=50),)) + spec = g[(2,)] + assert spec is not None + assert spec.codec_shape == (30,) + assert spec.shape == (20,) + assert spec.is_boundary is True + + +def test_varying_dimension_interior_chunk_spec() -> None: + """Interior VaryingDimension chunk has matching codec_shape and shape with no boundary""" + g = ChunkGrid(dimensions=(VaryingDimension([10, 20, 30], extent=50),)) + spec = g[(0,)] + assert spec is not None + assert spec.codec_shape == (10,) + assert spec.shape == (10,) + assert spec.is_boundary is False + + +# --------------------------------------------------------------------------- +# Multiple overflow chunks tests +# 
--------------------------------------------------------------------------- + + +def test_overflow_multiple_chunks_past_extent() -> None: + """Edges past extent are structural; nchunks counts active only.""" + g = ChunkGrid.from_sizes((50,), [[10, 20, 30, 40]]) + d = g.dimensions[0] + assert d.ngridcells == 4 + assert d.nchunks == 3 + assert d.data_size(0) == 10 + assert d.data_size(1) == 20 + assert d.data_size(2) == 20 + assert d.chunk_size(2) == 30 + + +def test_overflow_chunk_spec_past_extent_is_oob() -> None: + """Chunk entirely past the extent is out of bounds (not active).""" + g = ChunkGrid.from_sizes((50,), [[10, 20, 30, 40]]) + spec = g[(3,)] + assert spec is None + + +def test_overflow_chunk_spec_partial() -> None: + """ChunkSpec for a partially-overflowing chunk clips correctly.""" + g = ChunkGrid.from_sizes((50,), [[10, 20, 30, 40]]) + spec = g[(2,)] + assert spec is not None + assert spec.shape == (20,) + assert spec.codec_shape == (30,) + assert spec.is_boundary is True + assert spec.slices == (slice(30, 50, 1),) + + +def test_overflow_chunk_sizes() -> None: + """chunk_sizes only includes active chunks.""" + g = ChunkGrid.from_sizes((50,), [[10, 20, 30, 40]]) + assert g.chunk_sizes == ((10, 20, 20),) + + +def test_overflow_multidim() -> None: + """Overflow in multiple dimensions simultaneously.""" + g = ChunkGrid.from_sizes((45, 100), [[10, 20, 30], [40, 40, 40]]) + assert g.chunk_sizes == ((10, 20, 15), (40, 40, 20)) + spec = g[(2, 2)] + assert spec is not None + assert spec.shape == (15, 20) + assert spec.codec_shape == (30, 40) + + +def test_overflow_uniform_edges_collapses_to_fixed() -> None: + """Uniform edges where len == ceildiv(extent, edge) collapse to FixedDimension.""" + g = ChunkGrid.from_sizes((35,), [[10, 10, 10, 10]]) + assert isinstance(g.dimensions[0], FixedDimension) + assert g.is_regular + assert g.chunk_sizes == ((10, 10, 10, 5),) + assert g.dimensions[0].nchunks == 4 + + +def test_overflow_index_to_chunk_near_extent() -> None: + 
"""Index lookup near and at the extent boundary.""" + d = VaryingDimension([10, 20, 30, 40], extent=50) + assert d.index_to_chunk(29) == 1 + assert d.index_to_chunk(30) == 2 + assert d.index_to_chunk(49) == 2 + + +# --------------------------------------------------------------------------- +# Boundary indexing tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ( + "dim", + "mask", + "dim_len", + "expected_chunk_ix", + "expected_sel_len", + "expected_first_two", + "expected_third", + ), + [ + ( + FixedDimension(size=5, extent=7), + np.array([False, False, False, False, False, True, True]), + 7, + 1, + 5, + (np.True_, np.True_), + np.False_, + ), + ( + VaryingDimension([5, 10], extent=7), + np.array([False, False, False, False, False, True, True]), + 7, + 1, + 10, + (np.True_, np.True_), + np.False_, + ), + ], + ids=["fixed-boundary", "varying-boundary"], +) +def test_bool_indexer_boundary( + dim: FixedDimension | VaryingDimension, + mask: np.ndarray[Any, Any], + dim_len: int, + expected_chunk_ix: int, + expected_sel_len: int, + expected_first_two: tuple[Any, Any], + expected_third: Any, +) -> None: + """BoolArrayDimIndexer pads to codec size for boundary chunks.""" + from zarr.core.indexing import BoolArrayDimIndexer + + indexer = BoolArrayDimIndexer(mask, dim_len, dim) + projections = list(indexer) + assert len(projections) == 1 + p = projections[0] + assert p.dim_chunk_ix == expected_chunk_ix + sel = p.dim_chunk_sel + assert isinstance(sel, np.ndarray) + assert sel.shape[0] == expected_sel_len + assert sel[0] is expected_first_two[0] + assert sel[1] is expected_first_two[1] + assert sel[2] is expected_third + + +def test_bool_indexer_no_padding_interior() -> None: + """No padding needed for interior chunks.""" + from zarr.core.indexing import BoolArrayDimIndexer + + dim = FixedDimension(size=5, extent=10) + mask = np.array([True, False, False, False, False, False, False, False, False, False]) + 
indexer = BoolArrayDimIndexer(mask, 10, dim) + projections = list(indexer) + assert len(projections) == 1 + p = projections[0] + assert p.dim_chunk_ix == 0 + sel = p.dim_chunk_sel + assert isinstance(sel, np.ndarray) + assert sel.shape[0] == 5 + + +def test_slice_indexer_varying_boundary() -> None: + """SliceDimIndexer clips to data_size at boundary for VaryingDimension.""" + from zarr.core.indexing import SliceDimIndexer + + dim = VaryingDimension([5, 10], extent=7) + indexer = SliceDimIndexer(slice(None), 7, dim) + projections = list(indexer) + assert len(projections) == 2 + assert projections[0].dim_chunk_sel == slice(0, 5, 1) + assert projections[1].dim_chunk_sel == slice(0, 2, 1) + + +def test_int_array_indexer_varying_boundary() -> None: + """IntArrayDimIndexer handles indices near boundary correctly.""" + from zarr.core.indexing import IntArrayDimIndexer + + dim = VaryingDimension([5, 10], extent=7) + indices = np.array([6]) + indexer = IntArrayDimIndexer(indices, 7, dim) + projections = list(indexer) + assert len(projections) == 1 + assert projections[0].dim_chunk_ix == 1 + sel = projections[0].dim_chunk_sel + assert isinstance(sel, np.ndarray) + np.testing.assert_array_equal(sel, [1]) + + +@pytest.mark.parametrize( + "dim", + [FixedDimension(size=2, extent=10), VaryingDimension([5, 5], extent=10)], + ids=["fixed", "varying"], +) +def test_slice_indexer_empty_slice_at_boundary(dim: FixedDimension | VaryingDimension) -> None: + """SliceDimIndexer yields no projections for an empty slice at the dimension boundary.""" + from zarr.core.indexing import SliceDimIndexer + + indexer = SliceDimIndexer(slice(10, 10), 10, dim) + projections = list(indexer) + assert len(projections) == 0 + + +def test_orthogonal_indexer_varying_boundary_advanced() -> None: + """OrthogonalIndexer with advanced indexing uses per-chunk chunk_size.""" + from zarr.core.indexing import OrthogonalIndexer + + g = ChunkGrid( + dimensions=( + VaryingDimension([5, 10], extent=7), + 
FixedDimension(size=4, extent=8), ) - arr[:] = data - # Read a slice crossing shard boundaries - result = arr[50:70, 40:60] - np.testing.assert_array_equal(result, data[50:70, 40:60]) - - def test_rectilinear_shards_validates_divisibility(self, tmp_path: Path) -> None: - """Inner chunk_shape must divide every shard's dimensions.""" - with pytest.raises(ValueError, match="divisible"): - zarr.create_array( - store=tmp_path / "bad.zarr", - shape=(120, 100), - chunks=(10, 10), - shards=[[60, 45, 15], [50, 50]], # 45 not divisible by 10 - dtype="int32", - ) + ) + indexer = OrthogonalIndexer( + selection=(np.array([0, 6]), slice(None)), + shape=(7, 8), + chunk_grid=g, + ) + projections = list(indexer) + assert len(projections) == 4 + coords = {p.chunk_coords for p in projections} + assert coords == {(0, 0), (0, 1), (1, 0), (1, 1)} + + +# --------------------------------------------------------------------------- +# update_shape tests +# --------------------------------------------------------------------------- + + +def test_update_shape_no_change() -> None: + """update_shape with the same shape preserves edges unchanged""" + grid = ChunkGrid.from_sizes((60, 50), [[10, 20, 30], [25, 25]]) + new_grid = grid.update_shape((60, 50)) + assert _edges(new_grid, 0) == (10, 20, 30) + assert _edges(new_grid, 1) == (25, 25) + + +def test_update_shape_grow_single_dim() -> None: + """Growing a single dimension appends a new edge chunk""" + grid = ChunkGrid.from_sizes((60, 50), [[10, 20, 30], [25, 25]]) + new_grid = grid.update_shape((80, 50)) + assert _edges(new_grid, 0) == (10, 20, 30, 20) + assert _edges(new_grid, 1) == (25, 25) + + +def test_update_shape_grow_multiple_dims() -> None: + """Growing multiple dimensions appends correctly sized edge chunks""" + grid = ChunkGrid.from_sizes((30, 50), [[10, 20], [20, 30]]) + new_grid = grid.update_shape((45, 65)) + assert _edges(new_grid, 0) == (10, 20, 15) + assert _edges(new_grid, 1) == (20, 30, 15) + + +def 
test_update_shape_shrink_single_dim() -> None: + """Shrinking a single dimension reduces nchunks while preserving edges""" + grid = ChunkGrid.from_sizes((100, 50), [[10, 20, 30, 40], [25, 25]]) + new_grid = grid.update_shape((35, 50)) + assert _edges(new_grid, 0) == (10, 20, 30, 40) + assert new_grid.dimensions[0].nchunks == 3 + assert _edges(new_grid, 1) == (25, 25) + + +def test_update_shape_shrink_to_single_chunk() -> None: + """Shrinking to fit within the first chunk reduces nchunks to 1""" + grid = ChunkGrid.from_sizes((60, 50), [[10, 20, 30], [25, 25]]) + new_grid = grid.update_shape((5, 50)) + assert _edges(new_grid, 0) == (10, 20, 30) + assert new_grid.dimensions[0].nchunks == 1 + assert _edges(new_grid, 1) == (25, 25) + + +def test_update_shape_shrink_multiple_dims() -> None: + """Shrinking multiple dimensions reduces nchunks in each dimension""" + grid = ChunkGrid.from_sizes((40, 60), [[10, 10, 15, 5], [20, 25, 15]]) + new_grid = grid.update_shape((25, 35)) + assert _edges(new_grid, 0) == (10, 10, 15, 5) + assert new_grid.dimensions[0].nchunks == 3 + assert _edges(new_grid, 1) == (20, 25, 15) + assert new_grid.dimensions[1].nchunks == 2 + + +def test_update_shape_dimension_mismatch_error() -> None: + """update_shape raises ValueError when new shape has different ndim""" + grid = ChunkGrid.from_sizes((30, 70), [[10, 20], [30, 40]]) + with pytest.raises(ValueError, match="dimensions"): + grid.update_shape((30, 70, 100)) + + +def test_update_shape_boundary_cases() -> None: + """update_shape handles grow-one-dim and shrink-both-dims edge cases correctly""" + grid = ChunkGrid.from_sizes((60, 40), [[10, 20, 30], [15, 25]]) + new_grid = grid.update_shape((60, 65)) + assert _edges(new_grid, 0) == (10, 20, 30) + assert _edges(new_grid, 1) == (15, 25, 25) + + grid2 = ChunkGrid.from_sizes((60, 50), [[10, 20, 30], [15, 25, 10]]) + new_grid2 = grid2.update_shape((30, 40)) + assert _edges(new_grid2, 0) == (10, 20, 30) + assert new_grid2.dimensions[0].nchunks == 2 + 
assert _edges(new_grid2, 1) == (15, 25, 10) + assert new_grid2.dimensions[1].nchunks == 2 + + +def test_update_shape_regular_preserves_extents(tmp_path: Path) -> None: + """Resize a regular array -- chunk_grid extents must match new shape.""" + z = zarr.create_array( + store=tmp_path / "regular.zarr", + shape=(100,), + chunks=(10,), + dtype="int32", + ) + z[:] = np.arange(100, dtype="int32") + z.resize(50) + assert z.shape == (50,) + assert ChunkGrid.from_metadata(z.metadata).dimensions[0].extent == 50 + + +# --------------------------------------------------------------------------- +# update_shape boundary tests +# --------------------------------------------------------------------------- + + +def test_update_shape_shrink_creates_boundary() -> None: + """Shrinking extent into a chunk creates a boundary with clipped data_size""" + grid = ChunkGrid.from_sizes((60,), [[10, 20, 30]]) + new_grid = grid.update_shape((45,)) + dim = new_grid.dimensions[0] + assert isinstance(dim, VaryingDimension) + assert dim.edges == (10, 20, 30) + assert dim.extent == 45 + assert dim.chunk_size(2) == 30 + assert dim.data_size(2) == 15 + + +def test_update_shape_shrink_to_exact_boundary() -> None: + """Shrinking to an exact chunk boundary reduces nchunks without partial data""" + grid = ChunkGrid.from_sizes((60,), [[10, 20, 30]]) + new_grid = grid.update_shape((30,)) + dim = new_grid.dimensions[0] + assert isinstance(dim, VaryingDimension) + assert dim.edges == (10, 20, 30) + assert dim.nchunks == 2 + assert dim.ngridcells == 3 + assert dim.extent == 30 + assert dim.data_size(1) == 20 + + +def test_update_shape_shrink_chunk_spec() -> None: + """After shrink, ChunkSpec reflects boundary correctly.""" + grid = ChunkGrid.from_sizes((60,), [[10, 20, 30]]) + new_grid = grid.update_shape((45,)) + spec = new_grid[(2,)] + assert spec is not None + assert spec.codec_shape == (30,) + assert spec.shape == (15,) + assert spec.is_boundary is True + + +def 
test_update_shape_parse_chunk_grid_rebinds_extent() -> None: + """parse_chunk_grid re-binds VaryingDimension extent to array shape.""" + g = ChunkGrid.from_sizes((60,), [[10, 20, 30]]) + g2 = ChunkGrid( + dimensions=tuple(dim.with_extent(ext) for dim, ext in zip(g.dimensions, (50,), strict=True)) + ) + dim = g2.dimensions[0] + assert isinstance(dim, VaryingDimension) + assert dim.extent == 50 + assert dim.data_size(2) == 20 + + +# --------------------------------------------------------------------------- +# Resize rectilinear tests +# --------------------------------------------------------------------------- + + +async def test_async_resize_grow() -> None: + """Async resize grow appends new edge chunks and preserves existing data""" + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(30, 40), + chunks=[[10, 20], [20, 20]], + dtype="i4", + zarr_format=3, + ) + data = np.arange(30 * 40, dtype="i4").reshape(30, 40) + await arr.setitem(slice(None), data) + + await arr.resize((50, 60)) + assert arr.shape == (50, 60) + assert _edges(ChunkGrid.from_metadata(arr.metadata), 0) == (10, 20, 20) + assert _edges(ChunkGrid.from_metadata(arr.metadata), 1) == (20, 20, 20) + result = await arr.getitem((slice(0, 30), slice(0, 40))) + np.testing.assert_array_equal(result, data) + + +async def test_async_resize_shrink() -> None: + """Async resize shrink truncates data to the new shape""" + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(60, 50), + chunks=[[10, 20, 30], [25, 25]], + dtype="f4", + zarr_format=3, + ) + data = np.arange(60 * 50, dtype="f4").reshape(60, 50) + await arr.setitem(slice(None), data) + + await arr.resize((25, 30)) + assert arr.shape == (25, 30) + result = await arr.getitem(slice(None)) + np.testing.assert_array_equal(result, data[:25, :30]) + + +def test_sync_resize_grow() -> None: + """Sync resize grow expands the array and preserves 
existing data""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(20, 30), + chunks=[[8, 12], [10, 20]], + dtype="u1", + zarr_format=3, + ) + data = np.arange(20 * 30, dtype="u1").reshape(20, 30) + arr[:] = data + arr.resize((35, 45)) + assert arr.shape == (35, 45) + np.testing.assert_array_equal(arr[:20, :30], data) + + +def test_sync_resize_shrink() -> None: + """Sync resize shrink truncates the array and returns correct data""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(40, 50), + chunks=[[10, 15, 15], [20, 30]], + dtype="i2", + zarr_format=3, + ) + data = np.arange(40 * 50, dtype="i2").reshape(40, 50) + arr[:] = data + arr.resize((15, 30)) + assert arr.shape == (15, 30) + np.testing.assert_array_equal(arr[:], data[:15, :30]) + + +# --------------------------------------------------------------------------- +# Append rectilinear tests +# --------------------------------------------------------------------------- + + +async def test_append_first_axis() -> None: + """Appending along axis 0 grows the array and concatenates data correctly""" + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(30, 20), + chunks=[[10, 20], [10, 10]], + dtype="i4", + zarr_format=3, + ) + initial = np.arange(30 * 20, dtype="i4").reshape(30, 20) + await arr.setitem(slice(None), initial) + + append_data = np.arange(30 * 20, 45 * 20, dtype="i4").reshape(15, 20) + await arr.append(append_data, axis=0) + assert arr.shape == (45, 20) + + result = await arr.getitem(slice(None)) + np.testing.assert_array_equal(result, np.vstack([initial, append_data])) + + +async def test_append_second_axis() -> None: + """Appending along axis 1 grows the array and concatenates data correctly""" + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(20, 30), + chunks=[[10, 10], [10, 20]], + dtype="f4", + 
zarr_format=3, + ) + initial = np.arange(20 * 30, dtype="f4").reshape(20, 30) + await arr.setitem(slice(None), initial) + + append_data = np.arange(20 * 30, 20 * 45, dtype="f4").reshape(20, 15) + await arr.append(append_data, axis=1) + assert arr.shape == (20, 45) + + result = await arr.getitem(slice(None)) + np.testing.assert_array_equal(result, np.hstack([initial, append_data])) + + +def test_sync_append() -> None: + """Sync append grows the array and preserves both initial and appended data""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(20, 20), + chunks=[[8, 12], [7, 13]], + dtype="u2", + zarr_format=3, + ) + initial = np.arange(20 * 20, dtype="u2").reshape(20, 20) + arr[:] = initial + + append_data = np.arange(20 * 20, 25 * 20, dtype="u2").reshape(5, 20) + arr.append(append_data, axis=0) + assert arr.shape == (25, 20) + np.testing.assert_array_equal(arr[:20, :], initial) + np.testing.assert_array_equal(arr[20:, :], append_data) + + +async def test_multiple_appends() -> None: + """Multiple sequential appends accumulate data correctly""" + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(10, 10), + chunks=[[3, 7], [4, 6]], + dtype="i4", + zarr_format=3, + ) + initial = np.arange(10 * 10, dtype="i4").reshape(10, 10) + await arr.setitem(slice(None), initial) + + all_data = [initial] + for i in range(3): + chunk = np.full((5, 10), i + 100, dtype="i4") + await arr.append(chunk, axis=0) + all_data.append(chunk) + + assert arr.shape == (25, 10) + result = await arr.getitem(slice(None)) + np.testing.assert_array_equal(result, np.vstack(all_data)) + + +async def test_append_with_partial_edge_chunks() -> None: + """Appending data that creates partial edge chunks preserves all data""" + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(25, 30), + chunks=[[10, 15], [12, 18]], + dtype="f8", + zarr_format=3, + ) + 
initial = np.random.default_rng(42).random((25, 30)) + await arr.setitem(slice(None), initial) + + append_data = np.random.default_rng(43).random((10, 30)) + await arr.append(append_data, axis=0) + assert arr.shape == (35, 30) + + result = np.asarray(await arr.getitem(slice(None))) + np.testing.assert_array_almost_equal(result, np.vstack([initial, append_data])) - def test_nchunks(self, tmp_path: Path) -> None: - z, _ = self._make_2d(tmp_path) - assert z._chunk_grid.get_nchunks() == 12 + +async def test_append_small_data() -> None: + """Appending a small amount of data smaller than a chunk works correctly""" + store = zarr.storage.MemoryStore() + arr = await zarr.api.asynchronous.create_array( + store=store, + shape=(20, 20), + chunks=[[8, 12], [7, 13]], + dtype="i4", + zarr_format=3, + ) + data = np.arange(20 * 20, dtype="i4").reshape(20, 20) + await arr.setitem(slice(None), data) + + small = np.full((3, 20), 999, dtype="i4") + await arr.append(small, axis=0) + assert arr.shape == (23, 20) + result = await arr.getitem((slice(20, 23), slice(None))) + np.testing.assert_array_equal(result, small) + + +# --------------------------------------------------------------------------- +# V2 regression tests +# --------------------------------------------------------------------------- + + +def test_v2_create_and_readback(tmp_path: Path) -> None: + """Basic V2 array: create, write, read back.""" + data = np.arange(60, dtype="float64").reshape(6, 10) + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=data.shape, + chunks=(3, 5), + dtype=data.dtype, + zarr_format=2, + ) + a[:] = data + np.testing.assert_array_equal(a[:], data) + + +def test_v2_chunk_grid_is_regular(tmp_path: Path) -> None: + """V2 chunk_grid produces a regular ChunkGrid with FixedDimensions.""" + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(20, 30), + chunks=(10, 15), + dtype="int32", + zarr_format=2, + ) + grid = ChunkGrid.from_metadata(a.metadata) + assert grid.is_regular + 
assert grid.chunk_shape == (10, 15) + assert grid.grid_shape == (2, 2) + assert all(isinstance(d, FixedDimension) for d in grid.dimensions) + + +def test_v2_boundary_chunks(tmp_path: Path) -> None: + """V2 boundary chunks: codec buffer size stays full, data is clipped.""" + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(25,), + chunks=(10,), + dtype="int32", + zarr_format=2, + ) + grid = ChunkGrid.from_metadata(a.metadata) + assert grid.dimensions[0].nchunks == 3 + assert grid.dimensions[0].chunk_size(2) == 10 + assert grid.dimensions[0].data_size(2) == 5 + + +def test_v2_slicing_with_boundary(tmp_path: Path) -> None: + """V2 array slicing across boundary chunks returns correct data.""" + data = np.arange(25, dtype="int32") + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(25,), + chunks=(10,), + dtype="int32", + zarr_format=2, + ) + a[:] = data + np.testing.assert_array_equal(a[18:25], data[18:25]) + np.testing.assert_array_equal(a[:], data) + + +def test_v2_metadata_roundtrip(tmp_path: Path) -> None: + """V2 metadata survives store close and reopen.""" + store_path = tmp_path / "v2.zarr" + data = np.arange(12, dtype="float32").reshape(3, 4) + a = zarr.create_array( + store=store_path, + shape=data.shape, + chunks=(2, 2), + dtype=data.dtype, + zarr_format=2, + ) + a[:] = data + + b = zarr.open_array(store=store_path, mode="r") + assert b.metadata.zarr_format == 2 + assert b.chunks == (2, 2) + assert ChunkGrid.from_metadata(b.metadata).chunk_shape == (2, 2) + np.testing.assert_array_equal(b[:], data) + + +def test_v2_chunk_spec_via_grid(tmp_path: Path) -> None: + """ChunkSpec from V2 grid has correct slices and codec_shape.""" + a = zarr.create_array( + store=tmp_path / "v2.zarr", + shape=(15, 20), + chunks=(10, 10), + dtype="int32", + zarr_format=2, + ) + grid = ChunkGrid.from_metadata(a.metadata) + spec = grid[(0, 0)] + assert spec is not None + assert spec.shape == (10, 10) + assert spec.codec_shape == (10, 10) + spec = grid[(1, 1)] 
+ assert spec is not None + assert spec.shape == (5, 10) + assert spec.codec_shape == (10, 10) + + +# --------------------------------------------------------------------------- +# ChunkSizes tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("shape", "chunks", "expected"), + [ + ((100, 80), (30, 40), ((30, 30, 30, 10), (40, 40))), + ((90, 80), (30, 40), ((30, 30, 30), (40, 40))), + ((60, 100), [[10, 20, 30], [50, 50]], ((10, 20, 30), (50, 50))), + ((10,), (10,), ((10,),)), + ], + ids=["regular", "regular-exact", "rectilinear", "single-chunk"], +) +def test_chunk_sizes( + shape: tuple[int, ...], chunks: Any, expected: tuple[tuple[int, ...], ...] +) -> None: + """chunk_sizes returns the per-dimension tuple of actual data sizes""" + grid = ChunkGrid.from_sizes(shape, chunks) + assert grid.chunk_sizes == expected + + +def test_array_read_chunk_sizes_regular() -> None: + """Regular array exposes correct read_chunk_sizes and write_chunk_sizes""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, shape=(100, 80), chunks=(30, 40), dtype="i4", zarr_format=3 + ) + assert arr.read_chunk_sizes == ((30, 30, 30, 10), (40, 40)) + assert arr.write_chunk_sizes == ((30, 30, 30, 10), (40, 40)) + + +def test_array_read_chunk_sizes_rectilinear() -> None: + """Rectilinear array exposes correct read_chunk_sizes and write_chunk_sizes""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]], dtype="i4", zarr_format=3 + ) + assert arr.read_chunk_sizes == ((10, 20, 30), (50, 50)) + assert arr.write_chunk_sizes == ((10, 20, 30), (50, 50)) + + +def test_array_sharded_chunk_sizes() -> None: + """Sharded array read_chunk_sizes reflects inner chunks and write_chunk_sizes reflects shards""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(120, 80), + chunks=(60, 40), + shards=(120, 80), + 
dtype="i4", + zarr_format=3, + ) + assert arr.read_chunk_sizes == ((60, 60), (40, 40)) + assert arr.write_chunk_sizes == ((120,), (80,)) + + +# --------------------------------------------------------------------------- +# Info display test +# --------------------------------------------------------------------------- + + +def test_info_display_rectilinear() -> None: + """Array.info should not crash for rectilinear grids.""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(30,), + chunks=[[10, 20]], + dtype="i4", + zarr_format=3, + ) + info = arr.info + text = repr(info) + assert "" in text + assert "Array" in text + + +# --------------------------------------------------------------------------- +# nchunks tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("shape", "chunks", "expected"), + [ + ((30,), [[10, 20]], 2), + ((30, 40), [[10, 20], [15, 25]], 4), + ], + ids=["1d", "2d"], +) +def test_nchunks_rectilinear( + shape: tuple[int, ...], chunks: list[list[int]], expected: int +) -> None: + """Array.nchunks reports correct total chunk count for rectilinear arrays""" + store = MemoryStore() + a = zarr.create_array(store, shape=shape, chunks=chunks, dtype="int32") + assert a.nchunks == expected + + +# --------------------------------------------------------------------------- +# iter_chunk_regions test +# --------------------------------------------------------------------------- + + +def test_iter_chunk_regions_rectilinear() -> None: + """_iter_chunk_regions should work for rectilinear arrays.""" + from zarr.core.array import _iter_chunk_regions + + store = MemoryStore() + a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") + regions = list(_iter_chunk_regions(a)) + assert len(regions) == 2 + assert regions[0] == (slice(0, 10, 1),) + assert regions[1] == (slice(10, 30, 1),) + + +# 
--------------------------------------------------------------------------- +# RectilinearChunkGrid metadata object tests (already parametrized) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("json_input", "expected_chunk_shapes"), + [ + ( + { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [4, 8]}, + }, + (4, 8), + ), + ( + { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[1, 2, 3], [10, 20]]}, + }, + ((1, 2, 3), (10, 20)), + ), + ( + { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[[4, 3]], [10, 20]]}, + }, + ((4, 4, 4), (10, 20)), + ), + ( + { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[[1, 3], 3], [5]]}, + }, + ((1, 1, 1, 3), (5,)), + ), + ( + { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [4, [10, 20]]}, + }, + (4, (10, 20)), + ), + ], +) +def test_rectilinear_from_dict( + json_input: dict[str, Any], expected_chunk_shapes: tuple[int | tuple[int, ...], ...] 
+) -> None: + """RectilinearChunkGrid.from_dict correctly parses all spec forms.""" + grid = RectilinearChunkGrid.from_dict(json_input) # type: ignore[arg-type] + assert grid.chunk_shapes == expected_chunk_shapes + + +@pytest.mark.parametrize( + ("chunk_shapes", "expected_json_shapes"), + [ + ((4, 8), [4, 8]), + (((4,), (8,)), [[4], [8]]), + (((10, 20), (5, 5)), [[10, 20], [[5, 2]]]), + (((4, 4, 4), (10, 20)), [[[4, 3]], [10, 20]]), + ((4, (10, 20)), [4, [10, 20]]), + ], +) +def test_rectilinear_to_dict( + chunk_shapes: tuple[int | tuple[int, ...], ...], + expected_json_shapes: list[Any], +) -> None: + """RectilinearChunkGrid.to_dict serializes back to spec-compliant JSON.""" + grid = RectilinearChunkGrid(chunk_shapes=chunk_shapes) + result = grid.to_dict() + assert result["name"] == "rectilinear" + assert result["configuration"]["kind"] == "inline" + assert list(result["configuration"]["chunk_shapes"]) == expected_json_shapes + + +@pytest.mark.parametrize( + "json_input", + [ + {"name": "rectilinear", "configuration": {"kind": "inline", "chunk_shapes": [4, 8]}}, + { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[1, 2, 3], [10, 20]]}, + }, + { + "name": "rectilinear", + "configuration": {"kind": "inline", "chunk_shapes": [[[4, 3]], [[5, 2]]]}, + }, + ], +) +def test_rectilinear_roundtrip(json_input: dict[str, Any]) -> None: + """from_dict -> to_dict -> from_dict produces the same grid.""" + grid1 = RectilinearChunkGrid.from_dict(json_input) # type: ignore[arg-type] + grid2 = RectilinearChunkGrid.from_dict(grid1.to_dict()) + assert grid1.chunk_shapes == grid2.chunk_shapes + + +# --------------------------------------------------------------------------- +# Hypothesis property tests +# --------------------------------------------------------------------------- pytest.importorskip("hypothesis") @@ -1448,9 +2750,8 @@ def rectilinear_arrays_st(draw: st.DrawFn) -> tuple[zarr.Array[Any], np.ndarray[ def 
test_property_block_indexing_rectilinear(data: st.DataObject) -> None: """Property test: block indexing on rectilinear arrays matches numpy.""" z, a = data.draw(rectilinear_arrays_st()) - grid = z._chunk_grid + grid = ChunkGrid.from_metadata(z.metadata) - # Pick a random block per dimension and verify it matches the expected slice for dim in range(a.ndim): dim_grid = grid.dimensions[dim] block_ix = data.draw(st.integers(min_value=0, max_value=dim_grid.nchunks - 1)) @@ -1465,780 +2766,3 @@ def test_property_block_indexing_rectilinear(data: st.DataObject) -> None: a[tuple(sel)], err_msg=f"dim={dim}, block={block_ix}", ) - - -class TestV2Regression: - """Verify V2 arrays still work correctly after the ChunkGrid refactor. - - V2 only supports regular chunks. These tests ensure the V2 metadata - round-trip (create → write → read) and chunk_grid property work as - expected with the unified ChunkGrid infrastructure. - """ - - def test_v2_create_and_readback(self, tmp_path: Path) -> None: - """Basic V2 array: create, write, read back.""" - data = np.arange(60, dtype="float64").reshape(6, 10) - a = zarr.create_array( - store=tmp_path / "v2.zarr", - shape=data.shape, - chunks=(3, 5), - dtype=data.dtype, - zarr_format=2, - ) - a[:] = data - np.testing.assert_array_equal(a[:], data) - - def test_v2_chunk_grid_is_regular(self, tmp_path: Path) -> None: - """V2 chunk_grid produces a regular ChunkGrid with FixedDimensions.""" - a = zarr.create_array( - store=tmp_path / "v2.zarr", - shape=(20, 30), - chunks=(10, 15), - dtype="int32", - zarr_format=2, - ) - grid = a._chunk_grid - assert grid.is_regular - assert grid.chunk_shape == (10, 15) - assert grid.grid_shape == (2, 2) - assert all(isinstance(d, FixedDimension) for d in grid.dimensions) - - def test_v2_boundary_chunks(self, tmp_path: Path) -> None: - """V2 boundary chunks: codec buffer size stays full, data is clipped.""" - a = zarr.create_array( - store=tmp_path / "v2.zarr", - shape=(25,), - chunks=(10,), - dtype="int32", - 
zarr_format=2, - ) - grid = a._chunk_grid - assert grid.dimensions[0].nchunks == 3 - assert grid.dimensions[0].chunk_size(2) == 10 # full codec buffer - assert grid.dimensions[0].data_size(2) == 5 # clipped to extent - - def test_v2_slicing_with_boundary(self, tmp_path: Path) -> None: - """V2 array slicing across boundary chunks returns correct data.""" - data = np.arange(25, dtype="int32") - a = zarr.create_array( - store=tmp_path / "v2.zarr", - shape=(25,), - chunks=(10,), - dtype="int32", - zarr_format=2, - ) - a[:] = data - np.testing.assert_array_equal(a[18:25], data[18:25]) - np.testing.assert_array_equal(a[:], data) - - def test_v2_metadata_roundtrip(self, tmp_path: Path) -> None: - """V2 metadata survives store close and reopen.""" - store_path = tmp_path / "v2.zarr" - data = np.arange(12, dtype="float32").reshape(3, 4) - a = zarr.create_array( - store=store_path, - shape=data.shape, - chunks=(2, 2), - dtype=data.dtype, - zarr_format=2, - ) - a[:] = data - - # Reopen from store - b = zarr.open_array(store=store_path, mode="r") - assert b.metadata.zarr_format == 2 - assert b.chunks == (2, 2) - assert b._chunk_grid.chunk_shape == (2, 2) - np.testing.assert_array_equal(b[:], data) - - def test_v2_chunk_spec_via_grid(self, tmp_path: Path) -> None: - """ChunkSpec from V2 grid has correct slices and codec_shape.""" - a = zarr.create_array( - store=tmp_path / "v2.zarr", - shape=(15, 20), - chunks=(10, 10), - dtype="int32", - zarr_format=2, - ) - grid = a._chunk_grid - # Interior chunk - spec = grid[(0, 0)] - assert spec is not None - assert spec.shape == (10, 10) - assert spec.codec_shape == (10, 10) - # Boundary chunk - spec = grid[(1, 1)] - assert spec is not None - assert spec.shape == (5, 10) # clipped data - assert spec.codec_shape == (10, 10) # full buffer - - -class TestChunkSizes: - """Tests for ChunkGrid.chunk_sizes and Array.read_chunk_sizes / write_chunk_sizes.""" - - def test_regular_grid(self) -> None: - grid = ChunkGrid.from_regular((100, 80), (30, 
40)) - assert grid.chunk_sizes == ((30, 30, 30, 10), (40, 40)) - - def test_regular_grid_exact(self) -> None: - grid = ChunkGrid.from_regular((90, 80), (30, 40)) - assert grid.chunk_sizes == ((30, 30, 30), (40, 40)) - - def test_rectilinear_grid(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [50, 50]], array_shape=(60, 100)) - assert grid.chunk_sizes == ((10, 20, 30), (50, 50)) - - def test_single_chunk(self) -> None: - grid = ChunkGrid.from_regular((10,), (10,)) - assert grid.chunk_sizes == ((10,),) - - def test_array_read_chunk_sizes_regular(self) -> None: - store = zarr.storage.MemoryStore() - arr = zarr.create_array( - store=store, shape=(100, 80), chunks=(30, 40), dtype="i4", zarr_format=3 - ) - assert arr.read_chunk_sizes == ((30, 30, 30, 10), (40, 40)) - assert arr.write_chunk_sizes == ((30, 30, 30, 10), (40, 40)) - - def test_array_read_chunk_sizes_rectilinear(self) -> None: - store = zarr.storage.MemoryStore() - arr = zarr.create_array( - store=store, shape=(60, 100), chunks=[[10, 20, 30], [50, 50]], dtype="i4", zarr_format=3 - ) - assert arr.read_chunk_sizes == ((10, 20, 30), (50, 50)) - assert arr.write_chunk_sizes == ((10, 20, 30), (50, 50)) - - def test_array_sharded_chunk_sizes(self) -> None: - store = zarr.storage.MemoryStore() - arr = zarr.create_array( - store=store, - shape=(120, 80), - chunks=(60, 40), - shards=(120, 80), - dtype="i4", - zarr_format=3, - ) - # read_chunk_sizes returns inner chunks - assert arr.read_chunk_sizes == ((60, 60), (40, 40)) - # write_chunk_sizes returns outer (shard) chunks - assert arr.write_chunk_sizes == ((120,), (80,)) - - -def test_info_display_rectilinear() -> None: - """Array.info should not crash for rectilinear grids.""" - store = zarr.storage.MemoryStore() - arr = zarr.create_array( - store=store, - shape=(30,), - chunks=[[10, 20]], - dtype="i4", - zarr_format=3, - ) - info = arr.info - text = repr(info) - assert "" in text - assert "Array" in text - - -class TestUpdateShape: - """Unit tests 
for ChunkGrid.update_shape().""" - - def test_no_change(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) - new_grid = grid.update_shape((60, 50)) - assert _edges(new_grid, 0) == (10, 20, 30) - assert _edges(new_grid, 1) == (25, 25) - - def test_grow_single_dim(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) - new_grid = grid.update_shape((80, 50)) - assert _edges(new_grid, 0) == (10, 20, 30, 20) - assert _edges(new_grid, 1) == (25, 25) - - def test_grow_multiple_dims(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20], [20, 30]], array_shape=(30, 50)) - # from (30, 50) to (45, 65) - new_grid = grid.update_shape((45, 65)) - assert _edges(new_grid, 0) == (10, 20, 15) - assert _edges(new_grid, 1) == (20, 30, 15) - - def test_shrink_single_dim(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30, 40], [25, 25]], array_shape=(100, 50)) - new_grid = grid.update_shape((35, 50)) - # All edges preserved (spec allows trailing edges beyond extent) - assert _edges(new_grid, 0) == (10, 20, 30, 40) - # But only 3 chunks are active (10+20+30=60 >= 35) - assert new_grid.dimensions[0].nchunks == 3 - assert _edges(new_grid, 1) == (25, 25) - - def test_shrink_to_single_chunk(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [25, 25]], array_shape=(60, 50)) - new_grid = grid.update_shape((5, 50)) - # All edges preserved - assert _edges(new_grid, 0) == (10, 20, 30) - # But only 1 chunk is active (10 >= 5) - assert new_grid.dimensions[0].nchunks == 1 - assert _edges(new_grid, 1) == (25, 25) - - def test_shrink_multiple_dims(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 10, 15, 5], [20, 25, 15]], array_shape=(40, 60)) - # from (40, 60) to (25, 35) - new_grid = grid.update_shape((25, 35)) - # All edges preserved, but nchunks reflects active chunks - assert _edges(new_grid, 0) == (10, 10, 15, 5) - assert new_grid.dimensions[0].nchunks == 3 # 
10+10+15=35 >= 25 - assert _edges(new_grid, 1) == (20, 25, 15) - assert new_grid.dimensions[1].nchunks == 2 # 20+25=45 >= 35 - - def test_dimension_mismatch_error(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20], [30, 40]], array_shape=(30, 70)) - with pytest.raises(ValueError, match="dimensions"): - grid.update_shape((30, 70, 100)) - - def test_boundary_cases(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25]], array_shape=(60, 40)) - # Grow to exact chunk boundary on dim 0, add 25 to dim 1 - new_grid = grid.update_shape((60, 65)) - assert _edges(new_grid, 0) == (10, 20, 30) # no change (60 == sum) - assert _edges(new_grid, 1) == (15, 25, 25) # added chunk of 25 - - # Shrink to exact chunk boundary — edges preserved, nchunks adjusts - grid2 = ChunkGrid.from_rectilinear([[10, 20, 30], [15, 25, 10]], array_shape=(60, 50)) - new_grid2 = grid2.update_shape((30, 40)) - # All edges preserved - assert _edges(new_grid2, 0) == (10, 20, 30) - assert new_grid2.dimensions[0].nchunks == 2 # 10+20=30 >= 30 - assert _edges(new_grid2, 1) == (15, 25, 10) - assert new_grid2.dimensions[1].nchunks == 2 # 15+25=40 >= 40 - - def test_regular_preserves_extents(self, tmp_path: Path) -> None: - """Resize a regular array — chunk_grid extents must match new shape.""" - z = zarr.create_array( - store=tmp_path / "regular.zarr", - shape=(100,), - chunks=(10,), - dtype="int32", - ) - z[:] = np.arange(100, dtype="int32") - z.resize(50) - assert z.shape == (50,) - assert z._chunk_grid.dimensions[0].extent == 50 - - -class TestResizeRectilinear: - """End-to-end resize tests on rectilinear arrays.""" - - async def test_async_resize_grow(self) -> None: - store = zarr.storage.MemoryStore() - arr = await zarr.api.asynchronous.create_array( - store=store, - shape=(30, 40), - chunks=[[10, 20], [20, 20]], - dtype="i4", - zarr_format=3, - ) - data = np.arange(30 * 40, dtype="i4").reshape(30, 40) - await arr.setitem(slice(None), data) - - await arr.resize((50, 60)) - 
assert arr.shape == (50, 60) - assert _edges(arr._chunk_grid, 0) == (10, 20, 20) - assert _edges(arr._chunk_grid, 1) == (20, 20, 20) - result = await arr.getitem((slice(0, 30), slice(0, 40))) - np.testing.assert_array_equal(result, data) - - async def test_async_resize_shrink(self) -> None: - store = zarr.storage.MemoryStore() - arr = await zarr.api.asynchronous.create_array( - store=store, - shape=(60, 50), - chunks=[[10, 20, 30], [25, 25]], - dtype="f4", - zarr_format=3, - ) - data = np.arange(60 * 50, dtype="f4").reshape(60, 50) - await arr.setitem(slice(None), data) - - await arr.resize((25, 30)) - assert arr.shape == (25, 30) - result = await arr.getitem(slice(None)) - np.testing.assert_array_equal(result, data[:25, :30]) - - def test_sync_resize_grow(self) -> None: - store = zarr.storage.MemoryStore() - arr = zarr.create_array( - store=store, - shape=(20, 30), - chunks=[[8, 12], [10, 20]], - dtype="u1", - zarr_format=3, - ) - data = np.arange(20 * 30, dtype="u1").reshape(20, 30) - arr[:] = data - arr.resize((35, 45)) - assert arr.shape == (35, 45) - np.testing.assert_array_equal(arr[:20, :30], data) - - def test_sync_resize_shrink(self) -> None: - store = zarr.storage.MemoryStore() - arr = zarr.create_array( - store=store, - shape=(40, 50), - chunks=[[10, 15, 15], [20, 30]], - dtype="i2", - zarr_format=3, - ) - data = np.arange(40 * 50, dtype="i2").reshape(40, 50) - arr[:] = data - arr.resize((15, 30)) - assert arr.shape == (15, 30) - np.testing.assert_array_equal(arr[:], data[:15, :30]) - - -class TestAppendRectilinear: - """End-to-end append tests on rectilinear arrays.""" - - async def test_append_first_axis(self) -> None: - store = zarr.storage.MemoryStore() - arr = await zarr.api.asynchronous.create_array( - store=store, - shape=(30, 20), - chunks=[[10, 20], [10, 10]], - dtype="i4", - zarr_format=3, - ) - initial = np.arange(30 * 20, dtype="i4").reshape(30, 20) - await arr.setitem(slice(None), initial) - - append_data = np.arange(30 * 20, 45 * 20, 
dtype="i4").reshape(15, 20) - await arr.append(append_data, axis=0) - assert arr.shape == (45, 20) - - result = await arr.getitem(slice(None)) - np.testing.assert_array_equal(result, np.vstack([initial, append_data])) - - async def test_append_second_axis(self) -> None: - store = zarr.storage.MemoryStore() - arr = await zarr.api.asynchronous.create_array( - store=store, - shape=(20, 30), - chunks=[[10, 10], [10, 20]], - dtype="f4", - zarr_format=3, - ) - initial = np.arange(20 * 30, dtype="f4").reshape(20, 30) - await arr.setitem(slice(None), initial) - - append_data = np.arange(20 * 30, 20 * 45, dtype="f4").reshape(20, 15) - await arr.append(append_data, axis=1) - assert arr.shape == (20, 45) - - result = await arr.getitem(slice(None)) - np.testing.assert_array_equal(result, np.hstack([initial, append_data])) - - def test_sync_append(self) -> None: - store = zarr.storage.MemoryStore() - arr = zarr.create_array( - store=store, - shape=(20, 20), - chunks=[[8, 12], [7, 13]], - dtype="u2", - zarr_format=3, - ) - initial = np.arange(20 * 20, dtype="u2").reshape(20, 20) - arr[:] = initial - - append_data = np.arange(20 * 20, 25 * 20, dtype="u2").reshape(5, 20) - arr.append(append_data, axis=0) - assert arr.shape == (25, 20) - np.testing.assert_array_equal(arr[:20, :], initial) - np.testing.assert_array_equal(arr[20:, :], append_data) - - async def test_multiple_appends(self) -> None: - store = zarr.storage.MemoryStore() - arr = await zarr.api.asynchronous.create_array( - store=store, - shape=(10, 10), - chunks=[[3, 7], [4, 6]], - dtype="i4", - zarr_format=3, - ) - initial = np.arange(10 * 10, dtype="i4").reshape(10, 10) - await arr.setitem(slice(None), initial) - - all_data = [initial] - for i in range(3): - chunk = np.full((5, 10), i + 100, dtype="i4") - await arr.append(chunk, axis=0) - all_data.append(chunk) - - assert arr.shape == (25, 10) - result = await arr.getitem(slice(None)) - np.testing.assert_array_equal(result, np.vstack(all_data)) - - async def 
test_append_with_partial_edge_chunks(self) -> None: - store = zarr.storage.MemoryStore() - arr = await zarr.api.asynchronous.create_array( - store=store, - shape=(25, 30), - chunks=[[10, 15], [12, 18]], - dtype="f8", - zarr_format=3, - ) - initial = np.random.default_rng(42).random((25, 30)) - await arr.setitem(slice(None), initial) - - append_data = np.random.default_rng(43).random((10, 30)) - await arr.append(append_data, axis=0) - assert arr.shape == (35, 30) - - result = np.asarray(await arr.getitem(slice(None))) - np.testing.assert_array_almost_equal(result, np.vstack([initial, append_data])) - - async def test_append_small_data(self) -> None: - store = zarr.storage.MemoryStore() - arr = await zarr.api.asynchronous.create_array( - store=store, - shape=(20, 20), - chunks=[[8, 12], [7, 13]], - dtype="i4", - zarr_format=3, - ) - data = np.arange(20 * 20, dtype="i4").reshape(20, 20) - await arr.setitem(slice(None), data) - - small = np.full((3, 20), 999, dtype="i4") - await arr.append(small, axis=0) - assert arr.shape == (23, 20) - result = await arr.getitem((slice(20, 23), slice(None))) - np.testing.assert_array_equal(result, small) - - -class TestVaryingDimensionBoundary: - """VaryingDimension with extent < sum(edges), mirroring how FixedDimension - handles boundary chunks.""" - - def test_extent_parameter(self) -> None: - d = VaryingDimension([10, 20, 30], extent=50) - assert d.extent == 50 - assert d.chunk_size(2) == 30 - assert d.data_size(2) == 20 - - def test_extent_equals_sum_no_clipping(self) -> None: - d = VaryingDimension([10, 20, 30], extent=60) - assert d.extent == 60 - assert d.data_size(2) == 30 - - def test_data_size_interior_chunks_unaffected(self) -> None: - d = VaryingDimension([10, 20, 30], extent=50) - assert d.data_size(0) == 10 - assert d.data_size(1) == 20 - - def test_data_size_at_exact_boundary(self) -> None: - d = VaryingDimension([10, 20, 30], extent=60) - assert d.data_size(2) == 30 - - def test_data_size_single_element_boundary(self) 
-> None: - d = VaryingDimension([10, 20, 30], extent=31) - assert d.data_size(0) == 10 - assert d.data_size(1) == 20 - assert d.data_size(2) == 1 - - def test_extent_exceeds_sum_rejected(self) -> None: - with pytest.raises(ValueError, match="exceeds sum of edges"): - VaryingDimension([10, 20], extent=50) - - def test_negative_extent_rejected(self) -> None: - with pytest.raises(ValueError, match="must be >= 0"): - VaryingDimension([10, 20], extent=-1) - - def test_chunk_spec_boundary_varying(self) -> None: - """ChunkGrid with a boundary VaryingDimension produces correct ChunkSpec.""" - g = ChunkGrid(dimensions=(VaryingDimension([10, 20, 30], extent=50),)) - spec = g[(2,)] - assert spec is not None - assert spec.codec_shape == (30,) - assert spec.shape == (20,) - assert spec.is_boundary is True - - def test_chunk_spec_interior_varying(self) -> None: - g = ChunkGrid(dimensions=(VaryingDimension([10, 20, 30], extent=50),)) - spec = g[(0,)] - assert spec is not None - assert spec.codec_shape == (10,) - assert spec.shape == (10,) - assert spec.is_boundary is False - - -class TestMultipleOverflowChunks: - """Rectilinear grids where multiple chunks extend past the array extent.""" - - def test_multiple_chunks_past_extent(self) -> None: - """Edges past extent are structural; nchunks counts active only.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) - d = g.dimensions[0] - assert d.ngridcells == 4 - assert d.nchunks == 3 - assert d.data_size(0) == 10 - assert d.data_size(1) == 20 - assert d.data_size(2) == 20 - assert d.chunk_size(2) == 30 - - def test_chunk_spec_past_extent_is_oob(self) -> None: - """Chunk entirely past the extent is out of bounds (not active).""" - g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) - spec = g[(3,)] - assert spec is None - - def test_chunk_spec_partial_overflow(self) -> None: - """ChunkSpec for a partially-overflowing chunk clips correctly.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30, 
40]], array_shape=(50,)) - spec = g[(2,)] - assert spec is not None - assert spec.shape == (20,) - assert spec.codec_shape == (30,) - assert spec.is_boundary is True - assert spec.slices == (slice(30, 50, 1),) - - def test_chunk_sizes_with_overflow(self) -> None: - """chunk_sizes only includes active chunks.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30, 40]], array_shape=(50,)) - assert g.chunk_sizes == ((10, 20, 20),) - - def test_multidim_overflow(self) -> None: - """Overflow in multiple dimensions simultaneously.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30], [40, 40, 40]], array_shape=(45, 100)) - # dim 0: edges sum to 60, extent 45 → chunk 2 partial (45-30=15) - # dim 1: edges sum to 120, extent 100 → chunk 2 partial (100-80=20) - assert g.chunk_sizes == ((10, 20, 15), (40, 40, 20)) - spec = g[(2, 2)] - assert spec is not None - assert spec.shape == (15, 20) - assert spec.codec_shape == (30, 40) - - def test_uniform_edges_with_overflow_collapses_to_fixed(self) -> None: - """Uniform edges where len == ceildiv(extent, edge) collapse to FixedDimension.""" - g = ChunkGrid.from_rectilinear([[10, 10, 10, 10]], array_shape=(35,)) - assert isinstance(g.dimensions[0], FixedDimension) - assert g.is_regular - assert g.chunk_sizes == ((10, 10, 10, 5),) - assert g.dimensions[0].nchunks == 4 - - def test_serialization_roundtrip_overflow(self) -> None: - """Overflow chunks survive metadata serialization round-trip.""" - from zarr.core.metadata.v3 import RectilinearChunkGrid as RectilinearChunkGridMeta - - meta = RectilinearChunkGridMeta(chunk_shapes=((10, 20, 30, 40),)) - d = meta.to_dict() - assert d["name"] == "rectilinear" - dim0 = d["configuration"]["chunk_shapes"][0] - assert isinstance(dim0, (list, tuple)) - assert list(dim0) == [10, 20, 30, 40] - meta2 = RectilinearChunkGridMeta.from_dict(d) - g2 = ChunkGrid.from_rectilinear(list(meta2.chunk_shapes), array_shape=(50,)) - assert g2.dimensions[0].ngridcells == 4 - assert g2.dimensions[0].nchunks == 3 - assert 
g2.chunk_sizes == ((10, 20, 20),) - - def test_index_to_chunk_near_extent(self) -> None: - """Index lookup near and at the extent boundary.""" - d = VaryingDimension([10, 20, 30, 40], extent=50) - assert d.index_to_chunk(29) == 1 # last index in chunk 1 - assert d.index_to_chunk(30) == 2 # first index in chunk 2 - assert d.index_to_chunk(49) == 2 # last valid index - - -class TestBoundaryIndexing: - """Indexing operations on boundary chunks for both FixedDimension and - VaryingDimension, ensuring the isinstance cleanup works correctly.""" - - def test_bool_indexer_fixed_boundary(self) -> None: - """BoolArrayDimIndexer pads to codec size for FixedDimension boundary.""" - from zarr.core.indexing import BoolArrayDimIndexer - - # array extent 7, chunk size 5 → 2 chunks, last has data_size=2 - dim = FixedDimension(size=5, extent=7) - mask = np.array([False, False, False, False, False, True, True]) - indexer = BoolArrayDimIndexer(mask, 7, dim) - projections = list(indexer) - assert len(projections) == 1 - p = projections[0] - assert p.dim_chunk_ix == 1 - # boolean selection should be padded to chunk_size (5) - sel = p.dim_chunk_sel - assert isinstance(sel, np.ndarray) - assert sel.shape[0] == 5 - assert sel[0] is np.True_ - assert sel[1] is np.True_ - assert sel[2] is np.False_ # padding - - def test_bool_indexer_varying_boundary(self) -> None: - """BoolArrayDimIndexer pads to codec size for VaryingDimension boundary.""" - from zarr.core.indexing import BoolArrayDimIndexer - - # edges [5, 10], extent=7 -> last chunk has data_size=2, chunk_size=10 - dim = VaryingDimension([5, 10], extent=7) - mask = np.array([False, False, False, False, False, True, True]) - indexer = BoolArrayDimIndexer(mask, 7, dim) - projections = list(indexer) - assert len(projections) == 1 - p = projections[0] - assert p.dim_chunk_ix == 1 - # boolean selection should be padded to chunk_size (10) - sel = p.dim_chunk_sel - assert isinstance(sel, np.ndarray) - assert sel.shape[0] == 10 - assert sel[0] 
is np.True_ - assert sel[1] is np.True_ - assert sel[2] is np.False_ # padding - - def test_bool_indexer_no_padding_interior(self) -> None: - """No padding needed for interior chunks.""" - from zarr.core.indexing import BoolArrayDimIndexer - - dim = FixedDimension(size=5, extent=10) - mask = np.array([True, False, False, False, False, False, False, False, False, False]) - indexer = BoolArrayDimIndexer(mask, 10, dim) - projections = list(indexer) - assert len(projections) == 1 - p = projections[0] - assert p.dim_chunk_ix == 0 - sel = p.dim_chunk_sel - assert isinstance(sel, np.ndarray) - assert sel.shape[0] == 5 # equals chunk_size, no padding needed - - def test_slice_indexer_varying_boundary(self) -> None: - """SliceDimIndexer clips to data_size at boundary for VaryingDimension.""" - from zarr.core.indexing import SliceDimIndexer - - dim = VaryingDimension([5, 10], extent=7) - # select all elements - indexer = SliceDimIndexer(slice(None), 7, dim) - projections = list(indexer) - assert len(projections) == 2 - # chunk 0: full chunk - assert projections[0].dim_chunk_sel == slice(0, 5, 1) - # chunk 1: clipped to data_size (2), not chunk_size (10) - assert projections[1].dim_chunk_sel == slice(0, 2, 1) - - def test_int_array_indexer_varying_boundary(self) -> None: - """IntArrayDimIndexer handles indices near boundary correctly.""" - from zarr.core.indexing import IntArrayDimIndexer - - dim = VaryingDimension([5, 10], extent=7) - indices = np.array([6]) # in chunk 1, offset 5, so chunk-local = 1 - indexer = IntArrayDimIndexer(indices, 7, dim) - projections = list(indexer) - assert len(projections) == 1 - assert projections[0].dim_chunk_ix == 1 - sel = projections[0].dim_chunk_sel - assert isinstance(sel, np.ndarray) - np.testing.assert_array_equal(sel, [1]) - - def test_slice_indexer_empty_slice_at_boundary(self) -> None: - """SliceDimIndexer yields no projections for an empty slice at the dimension boundary.""" - from zarr.core.indexing import SliceDimIndexer - - dim = 
FixedDimension(size=2, extent=10) - # slice(10, 10) is empty — start equals extent - indexer = SliceDimIndexer(slice(10, 10), 10, dim) - projections = list(indexer) - assert len(projections) == 0 - - # also works for VaryingDimension - dim_v = VaryingDimension([5, 5], extent=10) - indexer_v = SliceDimIndexer(slice(10, 10), 10, dim_v) - assert list(indexer_v) == [] - - def test_orthogonal_indexer_varying_boundary_advanced(self) -> None: - """OrthogonalIndexer with advanced indexing uses per-chunk chunk_size - for ix_() conversion, not a precomputed max.""" - from zarr.core.indexing import OrthogonalIndexer - - # 2D: dim 0 has boundary chunk, dim 1 is regular - g = ChunkGrid( - dimensions=( - VaryingDimension([5, 10], extent=7), - FixedDimension(size=4, extent=8), - ) - ) - indexer = OrthogonalIndexer( - selection=(np.array([0, 6]), slice(None)), - shape=(7, 8), - chunk_grid=g, - ) - projections = list(indexer) - # index 0 → chunk 0, index 6 → chunk 1; dim 1 has 2 chunks - assert len(projections) == 4 - coords = {p.chunk_coords for p in projections} - assert coords == {(0, 0), (0, 1), (1, 0), (1, 1)} - - -class TestUpdateShapeBoundary: - """Resize creates boundary VaryingDimensions with correct extent.""" - - def test_shrink_creates_boundary(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) - new_grid = grid.update_shape((45,)) - dim = new_grid.dimensions[0] - assert isinstance(dim, VaryingDimension) - assert dim.edges == (10, 20, 30) # last chunk kept (cumulative 60 >= 45) - assert dim.extent == 45 - assert dim.chunk_size(2) == 30 # codec buffer - assert dim.data_size(2) == 15 # clipped: 45 - 30 = 15 - - def test_shrink_to_exact_boundary(self) -> None: - grid = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) - new_grid = grid.update_shape((30,)) - dim = new_grid.dimensions[0] - assert isinstance(dim, VaryingDimension) - assert dim.edges == (10, 20, 30) # all edges preserved - assert dim.nchunks == 2 # only first two 
are active (10+20=30 >= 30) - assert dim.ngridcells == 3 - assert dim.extent == 30 - assert dim.data_size(1) == 20 # no clipping needed - - def test_shrink_chunk_spec(self) -> None: - """After shrink, ChunkSpec reflects boundary correctly.""" - grid = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) - new_grid = grid.update_shape((45,)) - spec = new_grid[(2,)] - assert spec is not None - assert spec.codec_shape == (30,) - assert spec.shape == (15,) - assert spec.is_boundary is True - - def test_with_extent_rebinds_varying_extent(self) -> None: - """with_extent re-binds VaryingDimension extent.""" - g = ChunkGrid.from_rectilinear([[10, 20, 30]], array_shape=(60,)) - # sum(edges)=60, new extent=50 → re-bind - dim = g.dimensions[0].with_extent(50) - assert isinstance(dim, VaryingDimension) - assert dim.extent == 50 - assert dim.data_size(2) == 20 # 50 - 30 = 20 - - -class TestNchunksWorksForRectilinear: - def test_nchunks_returns_correct_count(self) -> None: - """nchunks should work for rectilinear arrays.""" - store = MemoryStore() - a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") - assert a.nchunks == 2 - - def test_nchunks_2d_rectilinear(self) -> None: - store = MemoryStore() - a = zarr.create_array(store, shape=(30, 40), chunks=[[10, 20], [15, 25]], dtype="int32") - assert a.nchunks == 4 # 2 chunks x 2 chunks - - -class TestIterChunkRegionsWorksForRectilinear: - def test_iter_chunk_regions_rectilinear(self) -> None: - """_iter_chunk_regions should work for rectilinear arrays.""" - from zarr.core.array import _iter_chunk_regions - - store = MemoryStore() - a = zarr.create_array(store, shape=(30,), chunks=[[10, 20]], dtype="int32") - regions = list(_iter_chunk_regions(a)) - assert len(regions) == 2 - assert regions[0] == (slice(0, 10, 1),) - assert regions[1] == (slice(10, 30, 1),) From b3b593356e8807c7b2565e67fd1c434b8395587b Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 
Mar 2026 10:58:53 -0400 Subject: [PATCH 111/118] Fix typo --- docs/design/chunk-grid.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/chunk-grid.md b/docs/design/chunk-grid.md index 7a180859cd..ac16a4b264 100644 --- a/docs/design/chunk-grid.md +++ b/docs/design/chunk-grid.md @@ -25,7 +25,7 @@ Chunk grids form a hierarchy — the rectilinear grid is strictly more general t | HPC boundary-padded | Regular interior, larger boundary chunks ([VirtualiZarr#217](https://github.com/zarr-developers/VirtualiZarr/issues/217)) | `[10, 8, 8, 8, 10]` | | Fully variable | Arbitrary per-chunk sizes | `[5, 12, 3, 20]` | -Prior iterations on the chunk grid design were based on the Zarr V3 spec's definition of chunk grids as an extension point alongside codecs, dtypes, etc. Therefore, we started designing the chunk grid implementation following a similar registry based approach. However, in practice chunk grids are fundamentally different than codecs. Codecs are independent; supporting `zstd` tells you nothing about `gzip`. Chunk grids are not: every regular grid is a valid rectilinear grid. A registry-based plugin system makes sense for codecs but adds complexity without clear benefit for chunk grids. Here we start from some basic goals and propose a more fitting design for supporting different chunk grids in zarr-python. +Prior iterations on the chunk grid design were based on the Zarr V3 spec's definition of chunk grids as an extension point alongside codecs, dtypes, etc. Therefore, we started designing the chunk grid implementation following a similar registry-based approach. However, in practice chunk grids are fundamentally different than codecs. Codecs are independent; supporting `zstd` tells you nothing about `gzip`. Chunk grids are not: every regular grid is a valid rectilinear grid. A registry-based plugin system makes sense for codecs but adds complexity without clear benefit for chunk grids. 
Here we start from some basic goals and propose a more fitting design for supporting different chunk grids in zarr-python. ## Goals From f80f79882f9fc488b0349255b85629a4c2fa8e45 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 10:59:59 -0400 Subject: [PATCH 112/118] Normalize --- src/zarr/codecs/sharding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index deb25a5bf6..da5cfce3bd 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -396,7 +396,9 @@ def validate( if isinstance(chunk_grid, RegularChunkGrid): edges_per_dim: tuple[tuple[int, ...], ...] = tuple((s,) for s in chunk_grid.chunk_shape) elif isinstance(chunk_grid, RectilinearChunkGrid): - edges_per_dim = chunk_grid.chunk_shapes + edges_per_dim = tuple( + (s,) if isinstance(s, int) else s for s in chunk_grid.chunk_shapes + ) else: raise TypeError( f"Sharding is only compatible with regular and rectilinear chunk grids, " From 3327cc171d61ef18e13d1e69b7f27b283b1e244d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 11:00:10 -0400 Subject: [PATCH 113/118] Remove shim --- src/zarr/core/chunk_grids.py | 31 ------------------------- tests/test_unified_chunk_grid.py | 39 -------------------------------- 2 files changed, 70 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ed91e37461..607e5d7838 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -752,34 +752,3 @@ def _auto_partition( _shards_out = cast("tuple[int, ...]", shard_shape) return _shards_out, _chunks_out - - -class _RegularChunkGridMeta(type): - """Metaclass that makes ``isinstance(obj, RegularChunkGrid)`` work. - - Returns True when *obj* is a ``ChunkGrid`` whose ``is_regular`` flag is set. 
- """ - - def __instancecheck__(cls, instance: object) -> bool: - return isinstance(instance, ChunkGrid) and instance.is_regular - - -class RegularChunkGrid(metaclass=_RegularChunkGridMeta): - """Deprecated compatibility shim. - - .. deprecated:: 3.1 - Use ``ChunkGrid.from_sizes(array_shape, chunk_sizes)`` instead. - Use ``grid.is_regular`` instead of ``isinstance(grid, RegularChunkGrid)``. - """ - - def __new__(cls, *, chunk_shape: ShapeLike) -> ChunkGrid: # type: ignore[misc] - warnings.warn( - "RegularChunkGrid is deprecated. " - "Use ChunkGrid.from_sizes(array_shape, chunk_sizes) instead.", - DeprecationWarning, - stacklevel=2, - ) - # Without array_shape we cannot bind extents, so use chunk_shape as extent. - # This matches the old behavior where RegularChunkGrid was shape-unaware. - parsed = parse_shapelike(chunk_shape) - return ChunkGrid.from_sizes(array_shape=parsed, chunk_sizes=tuple(parsed)) diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index dc7d967745..4a2c3d925d 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -130,45 +130,6 @@ def test_rectilinear_feature_flag_enabled() -> None: assert grid.ndim == 2 -# --------------------------------------------------------------------------- -# RegularChunkGrid compatibility tests -# --------------------------------------------------------------------------- - - -def test_regular_chunk_grid_compat_construction_emits_deprecation_warning() -> None: - """Constructing RegularChunkGrid emits a DeprecationWarning and returns a ChunkGrid""" - from zarr.core.chunk_grids import RegularChunkGrid - - with pytest.warns(DeprecationWarning, match="RegularChunkGrid is deprecated"): - grid = RegularChunkGrid(chunk_shape=(10, 20)) - assert isinstance(grid, ChunkGrid) - assert grid.is_regular - assert grid.chunk_shape == (10, 20) - - -@pytest.mark.parametrize( - ("grid", "expected"), - [ - (ChunkGrid.from_sizes((100, 200), (10, 20)), True), - 
(ChunkGrid.from_sizes((30, 50), [[10, 20], [25, 25]]), False), - ], - ids=["regular-is-instance", "rectilinear-is-not-instance"], -) -def test_regular_chunk_grid_isinstance(grid: ChunkGrid, expected: bool) -> None: - """isinstance check against RegularChunkGrid matches only regular grids""" - from zarr.core.chunk_grids import RegularChunkGrid - - assert isinstance(grid, RegularChunkGrid) == expected - - -@pytest.mark.parametrize("obj", ["hello", 42], ids=["string", "int"]) -def test_regular_chunk_grid_isinstance_false_for_unrelated_types(obj: Any) -> None: - """Unrelated types are not instances of RegularChunkGrid""" - from zarr.core.chunk_grids import RegularChunkGrid - - assert not isinstance(obj, RegularChunkGrid) - - # --------------------------------------------------------------------------- # FixedDimension tests # --------------------------------------------------------------------------- From 5642e03d9e8fa17f1f66860b47f2dfece5dd5870 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 11:32:25 -0400 Subject: [PATCH 114/118] Consistent typing --- src/zarr/api/synchronous.py | 7 ++++--- src/zarr/core/array.py | 20 +++++++++++++------- src/zarr/core/group.py | 10 +++++----- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 2a9be16381..a865f97646 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -13,7 +13,7 @@ from zarr.errors import ZarrDeprecationWarning if TYPE_CHECKING: - from collections.abc import Iterable, Sequence + from collections.abc import Iterable import numpy as np import numpy.typing as npt @@ -33,6 +33,7 @@ from zarr.core.common import ( JSON, AccessModeLiteral, + ChunksLike, DimensionNamesLike, MemoryOrder, ShapeLike, @@ -822,7 +823,7 @@ def create_array( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: 
tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", + chunks: ChunksLike | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -997,7 +998,7 @@ def from_array( data: AnyArray | npt.ArrayLike, write_data: bool = True, name: str | None = None, - chunks: Literal["auto", "keep"] | tuple[int, ...] | Sequence[Sequence[int]] = "keep", + chunks: ChunksLike | Literal["auto", "keep"] = "keep", shards: ShardsLike | None | Literal["keep"] = "keep", filters: FiltersLike | Literal["keep"] = "keep", compressors: CompressorsLike | Literal["keep"] = "keep", diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fc8eaa76b6..be838f285c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -55,6 +55,7 @@ ZARR_JSON, ZARRAY_JSON, ZATTRS_JSON, + ChunksLike, DimensionNamesLike, MemoryOrder, ShapeLike, @@ -2828,7 +2829,7 @@ def __array__( raise ValueError(msg) arr = self[...] - arr_np: NDArrayLike = np.array(arr, dtype=dtype) + arr_np = np.array(arr, dtype=dtype) if dtype is not None: arr_np = arr_np.astype(dtype) @@ -4411,7 +4412,7 @@ async def from_array( data: AnyArray | npt.ArrayLike, write_data: bool = True, name: str | None = None, - chunks: Literal["auto", "keep"] | tuple[int, ...] | Sequence[Sequence[int]] = "keep", + chunks: ChunksLike | Literal["auto", "keep"] = "keep", shards: ShardsLike | None | Literal["keep"] = "keep", filters: FiltersLike | Literal["keep"] = "keep", compressors: CompressorsLike | Literal["keep"] = "keep", @@ -4684,7 +4685,7 @@ async def init_array( store_path: StorePath, shape: ShapeLike, dtype: ZDTypeLike, - chunks: tuple[int, ...] 
| Sequence[Sequence[int]] | Literal["auto"] = "auto", + chunks: ChunksLike | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -4821,6 +4822,9 @@ async def init_array( # Use first chunk size per dim as placeholder for _auto_partition chunks_flat: tuple[int, ...] | Literal["auto"] = tuple(dim_edges[0] for dim_edges in chunks) else: + # Normalize scalar int to per-dimension tuple (e.g. chunks=100000 for a 1D array) + if isinstance(chunks, int): + chunks = tuple(chunks for _ in shape_parsed) chunks_flat = cast("tuple[int, ...] | Literal['auto']", chunks) # Handle rectilinear shards: shards=[[60, 40, 20], [50, 50]] @@ -4945,7 +4949,7 @@ async def create_array( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", + chunks: ChunksLike | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -5134,7 +5138,7 @@ async def create_array( def _parse_keep_array_attr( data: AnyArray | npt.ArrayLike, - chunks: Literal["auto", "keep"] | tuple[int, ...] | Sequence[Sequence[int]], + chunks: ChunksLike | Literal["auto", "keep"], shards: ShardsLike | None | Literal["keep"], filters: FiltersLike | Literal["keep"], compressors: CompressorsLike | Literal["keep"], @@ -5145,7 +5149,7 @@ def _parse_keep_array_attr( chunk_key_encoding: ChunkKeyEncodingLike | None, dimension_names: DimensionNamesLike, ) -> tuple[ - tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"], + ChunksLike | Literal["auto"], ShardsLike | None, FiltersLike, CompressorsLike, @@ -5215,8 +5219,10 @@ def _parse_keep_array_attr( compressors = "auto" if serializer == "keep": serializer = "auto" + # After resolving "keep" above, chunks is never "keep" at this point. 
+ chunks_out: ChunksLike | Literal["auto"] = chunks # type: ignore[assignment] return ( - chunks, + chunks_out, shards, filters, compressors, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 7f92d0922a..b810041e7b 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -40,6 +40,7 @@ ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON, + ChunksLike, DimensionNamesLike, NodeType, ShapeLike, @@ -71,7 +72,6 @@ Iterable, Iterator, Mapping, - Sequence, ) from typing import Any @@ -1021,7 +1021,7 @@ async def create_array( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", + chunks: ChunksLike | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -2474,7 +2474,7 @@ def create( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", + chunks: ChunksLike | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -2618,7 +2618,7 @@ def create_array( shape: ShapeLike | None = None, dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, - chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", + chunks: ChunksLike | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", @@ -3016,7 +3016,7 @@ def array( *, shape: ShapeLike, dtype: npt.DTypeLike, - chunks: tuple[int, ...] | Sequence[Sequence[int]] | Literal["auto"] = "auto", + chunks: ChunksLike | Literal["auto"] = "auto", shards: tuple[int, ...] 
| Literal["auto"] | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", From 6a4c01a0bc0c475c9a27910050c992daebd86e5c Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 11:40:19 -0400 Subject: [PATCH 115/118] Move design doc outside public docs --- {docs/design => design}/chunk-grid.md | 0 mkdocs.yml | 2 -- 2 files changed, 2 deletions(-) rename {docs/design => design}/chunk-grid.md (100%) diff --git a/docs/design/chunk-grid.md b/design/chunk-grid.md similarity index 100% rename from docs/design/chunk-grid.md rename to design/chunk-grid.md diff --git a/mkdocs.yml b/mkdocs.yml index 1d86034d6b..ce39fd0f2e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,8 +77,6 @@ nav: - Creation sub-module: api/zarr/deprecated/creation.md - release-notes.md - contributing.md - - Design documents: - - design/chunk-grid.md watch: - src/zarr - docs From e3ba71fd844d2380b2bac0057d1c313e1a1cf78d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 12:43:06 -0400 Subject: [PATCH 116/118] Update config.md Co-authored-by: Davis Bennett --- docs/user-guide/config.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index e0cce321be..113217e097 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -30,7 +30,7 @@ Configuration options include the following: - Default Zarr format `default_zarr_version` - Default array order in memory `array.order` - Whether empty chunks are written to storage `array.write_empty_chunks` -- Enable experimental rectilinear chunk grids `array.rectilinear_chunks` +- Enable experimental rectilinear chunks `array.rectilinear_chunks` - Async and threading options, e.g. 
`async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. From 6667fda5dcd56f211c5007a8bada52d983a06cab Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 20:00:58 -0400 Subject: [PATCH 117/118] Remove get_chunk_shape --- src/zarr/core/indexing.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index d31e7c628e..af716d2870 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -329,15 +329,6 @@ def is_pure_orthogonal_indexing(selection: Selection, ndim: int) -> TypeGuard[Or ) -def get_chunk_shape(chunk_grid: ChunkGrid) -> tuple[int, ...]: - if not chunk_grid.is_regular: - raise ValueError( - "get_chunk_shape only works with regular chunk grids. " - "Use chunk_grid.dimensions for rectilinear grids." - ) - return chunk_grid.chunk_shape - - def normalize_integer_selection(dim_sel: int, dim_len: int) -> int: # normalize type to int dim_sel = int(dim_sel) From 0f0576e5691ed7b96af65145c76dcc1d07b83111 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 30 Mar 2026 20:14:35 -0400 Subject: [PATCH 118/118] Make dimensions private --- src/zarr/core/chunk_grids.py | 30 ++++++++----- src/zarr/core/indexing.py | 8 ++-- tests/test_unified_chunk_grid.py | 72 ++++++++++++++++---------------- 3 files changed, 61 insertions(+), 49 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 607e5d7838..dcea33f3bf 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -303,15 +303,25 @@ class ChunkGrid: or VaryingDimension (per-chunk edge lengths with prefix sums). """ - dimensions: tuple[DimensionGrid, ...] + _dimensions: tuple[DimensionGrid, ...] 
_is_regular: bool def __init__(self, *, dimensions: tuple[DimensionGrid, ...]) -> None: - object.__setattr__(self, "dimensions", dimensions) + object.__setattr__(self, "_dimensions", dimensions) object.__setattr__( self, "_is_regular", all(isinstance(d, FixedDimension) for d in dimensions) ) + def __repr__(self) -> str: + sizes: list[str] = [] + for d in self._dimensions: + if isinstance(d, FixedDimension): + sizes.append(str(d.size)) + elif isinstance(d, VaryingDimension): + sizes.append(repr(tuple(d.edges))) + shape = tuple(d.extent for d in self._dimensions) + return f"ChunkGrid(chunk_sizes=({', '.join(sizes)}), array_shape={shape})" + @classmethod def from_metadata(cls, metadata: ArrayMetadata) -> ChunkGrid: """Construct a behavioral ChunkGrid from array metadata. @@ -381,7 +391,7 @@ def from_sizes( @property def ndim(self) -> int: - return len(self.dimensions) + return len(self._dimensions) @property def is_regular(self) -> bool: @@ -390,7 +400,7 @@ def is_regular(self) -> bool: @property def grid_shape(self) -> tuple[int, ...]: """Number of chunks per dimension.""" - return tuple(d.nchunks for d in self.dimensions) + return tuple(d.nchunks for d in self._dimensions) @property def chunk_shape(self) -> tuple[int, ...]: @@ -400,7 +410,7 @@ def chunk_shape(self) -> tuple[int, ...]: "chunk_shape is only available for regular chunk grids. " "Use grid[coords] for per-chunk sizes." ) - return tuple(d.size for d in self.dimensions if isinstance(d, FixedDimension)) + return tuple(d.size for d in self._dimensions if isinstance(d, FixedDimension)) @property def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: @@ -416,7 +426,7 @@ def chunk_sizes(self) -> tuple[tuple[int, ...], ...]: One inner tuple per dimension, each containing the data size of every chunk along that dimension. 
""" - return tuple(tuple(d.data_size(i) for i in range(d.nchunks)) for d in self.dimensions) + return tuple(tuple(d.data_size(i) for i in range(d.nchunks)) for d in self._dimensions) # -- Collection interface -- @@ -431,7 +441,7 @@ def __getitem__(self, coords: int | tuple[int, ...]) -> ChunkSpec | None: ) slices: list[slice] = [] codec_shape: list[int] = [] - for dim, ix in zip(self.dimensions, coords, strict=True): + for dim, ix in zip(self._dimensions, coords, strict=True): if ix < 0 or ix >= dim.nchunks: return None offset = dim.chunk_offset(ix) @@ -441,7 +451,7 @@ def __getitem__(self, coords: int | tuple[int, ...]) -> ChunkSpec | None: def __iter__(self) -> Iterator[ChunkSpec]: """Iterate all chunks, yielding ChunkSpec for each.""" - for coords in itertools.product(*(range(d.nchunks) for d in self.dimensions)): + for coords in itertools.product(*(range(d.nchunks) for d in self._dimensions)): spec = self[coords] if spec is not None: yield spec @@ -499,7 +509,7 @@ def iter_chunk_regions( yield spec.slices def get_nchunks(self) -> int: - return reduce(operator.mul, (d.nchunks for d in self.dimensions), 1) + return reduce(operator.mul, (d.nchunks for d in self._dimensions), 1) # -- Resize -- @@ -526,7 +536,7 @@ def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: ) dims = tuple( dim.resize(new_extent) - for dim, new_extent in zip(self.dimensions, new_shape, strict=True) + for dim, new_extent in zip(self._dimensions, new_shape, strict=True) ) return ChunkGrid(dimensions=dims) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index af716d2870..cb81164209 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -578,7 +578,7 @@ def __init__( shape: tuple[int, ...], chunk_grid: ChunkGrid, ) -> None: - dim_grids = chunk_grid.dimensions + dim_grids = chunk_grid._dimensions # handle ellipsis selection_normalized = replace_ellipsis(selection, shape) @@ -905,7 +905,7 @@ class OrthogonalIndexer(Indexer): drop_axes: tuple[int, 
...] def __init__(self, selection: Selection, shape: tuple[int, ...], chunk_grid: ChunkGrid) -> None: - dim_grids = chunk_grid.dimensions + dim_grids = chunk_grid._dimensions # handle ellipsis selection = replace_ellipsis(selection, shape) @@ -1050,7 +1050,7 @@ class BlockIndexer(Indexer): def __init__( self, selection: BasicSelection, shape: tuple[int, ...], chunk_grid: ChunkGrid ) -> None: - dim_grids = chunk_grid.dimensions + dim_grids = chunk_grid._dimensions # handle ellipsis selection_normalized = replace_ellipsis(selection, shape) @@ -1180,7 +1180,7 @@ class CoordinateIndexer(Indexer): def __init__( self, selection: CoordinateSelection, shape: tuple[int, ...], chunk_grid: ChunkGrid ) -> None: - dim_grids = chunk_grid.dimensions + dim_grids = chunk_grid._dimensions cdata_shape: tuple[int, ...] if shape == (): diff --git a/tests/test_unified_chunk_grid.py b/tests/test_unified_chunk_grid.py index 4a2c3d925d..92bb1abae9 100644 --- a/tests/test_unified_chunk_grid.py +++ b/tests/test_unified_chunk_grid.py @@ -46,7 +46,7 @@ def _enable_rectilinear_chunks() -> Generator[None, None, None]: def _edges(grid: ChunkGrid, dim: int) -> tuple[int, ...]: """Extract the per-chunk edge lengths for *dim* from a ChunkGrid.""" - d = grid.dimensions[dim] + d = grid._dimensions[dim] if isinstance(d, FixedDimension): return tuple(d.size for _ in range(d.nchunks)) if isinstance(d, VaryingDimension): @@ -327,8 +327,8 @@ def test_chunk_grid_construction( def test_chunk_grid_rectilinear_uniform_dim_is_fixed() -> None: """A rectilinear grid with all-same sizes in one dim stores it as Fixed.""" g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [25, 25, 25, 25]]) - assert isinstance(g.dimensions[0], VaryingDimension) - assert isinstance(g.dimensions[1], FixedDimension) + assert isinstance(g._dimensions[0], VaryingDimension) + assert isinstance(g._dimensions[1], FixedDimension) # --------------------------------------------------------------------------- @@ -715,7 +715,7 @@ def 
test_parse_chunk_grid_varying_extent_mismatch_raises() -> None: with pytest.raises(ValueError, match="extent"): ChunkGrid( dimensions=tuple( - dim.with_extent(ext) for dim, ext in zip(g.dimensions, (100, 100), strict=True) + dim.with_extent(ext) for dim, ext in zip(g._dimensions, (100, 100), strict=True) ) ) @@ -725,10 +725,10 @@ def test_parse_chunk_grid_varying_extent_match_ok() -> None: g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) g2 = ChunkGrid( dimensions=tuple( - dim.with_extent(ext) for dim, ext in zip(g.dimensions, (60, 100), strict=True) + dim.with_extent(ext) for dim, ext in zip(g._dimensions, (60, 100), strict=True) ) ) - assert g2.dimensions[0].extent == 60 + assert g2._dimensions[0].extent == 60 @pytest.mark.parametrize( @@ -793,7 +793,7 @@ def test_parse_chunk_grid_varying_dimension_extent_mismatch_on_chunkgrid_input() with pytest.raises(ValueError, match="less than"): ChunkGrid( dimensions=tuple( - dim.with_extent(ext) for dim, ext in zip(g.dimensions, (100, 50), strict=True) + dim.with_extent(ext) for dim, ext in zip(g._dimensions, (100, 50), strict=True) ) ) @@ -1126,30 +1126,30 @@ def test_0d_grid_nchunks() -> None: def test_parse_chunk_grid_preserves_varying_extent() -> None: """parse_chunk_grid does not overwrite VaryingDimension extent.""" g = ChunkGrid.from_sizes((60, 100), [[10, 20, 30], [50, 50]]) - assert isinstance(g.dimensions[0], VaryingDimension) - assert g.dimensions[0].extent == 60 + assert isinstance(g._dimensions[0], VaryingDimension) + assert g._dimensions[0].extent == 60 g2 = ChunkGrid( dimensions=tuple( - dim.with_extent(ext) for dim, ext in zip(g.dimensions, (60, 100), strict=True) + dim.with_extent(ext) for dim, ext in zip(g._dimensions, (60, 100), strict=True) ) ) - assert isinstance(g2.dimensions[0], VaryingDimension) - assert g2.dimensions[0].extent == 60 + assert isinstance(g2._dimensions[0], VaryingDimension) + assert g2._dimensions[0].extent == 60 def test_parse_chunk_grid_rebinds_fixed_extent() -> None: 
"""parse_chunk_grid updates FixedDimension extent from array shape.""" g = ChunkGrid.from_sizes((100, 200), (10, 20)) - assert g.dimensions[0].extent == 100 + assert g._dimensions[0].extent == 100 g2 = ChunkGrid( dimensions=tuple( - dim.with_extent(ext) for dim, ext in zip(g.dimensions, (50, 100), strict=True) + dim.with_extent(ext) for dim, ext in zip(g._dimensions, (50, 100), strict=True) ) ) - assert isinstance(g2.dimensions[0], FixedDimension) - assert g2.dimensions[0].extent == 50 + assert isinstance(g2._dimensions[0], FixedDimension) + assert g2._dimensions[0].extent == 50 assert g2.grid_shape == (5, 5) @@ -1775,7 +1775,7 @@ def test_varying_dimension_interior_chunk_spec() -> None: def test_overflow_multiple_chunks_past_extent() -> None: """Edges past extent are structural; nchunks counts active only.""" g = ChunkGrid.from_sizes((50,), [[10, 20, 30, 40]]) - d = g.dimensions[0] + d = g._dimensions[0] assert d.ngridcells == 4 assert d.nchunks == 3 assert d.data_size(0) == 10 @@ -1821,10 +1821,10 @@ def test_overflow_multidim() -> None: def test_overflow_uniform_edges_collapses_to_fixed() -> None: """Uniform edges where len == ceildiv(extent, edge) collapse to FixedDimension.""" g = ChunkGrid.from_sizes((35,), [[10, 10, 10, 10]]) - assert isinstance(g.dimensions[0], FixedDimension) + assert isinstance(g._dimensions[0], FixedDimension) assert g.is_regular assert g.chunk_sizes == ((10, 10, 10, 5),) - assert g.dimensions[0].nchunks == 4 + assert g._dimensions[0].nchunks == 4 def test_overflow_index_to_chunk_near_extent() -> None: @@ -2009,7 +2009,7 @@ def test_update_shape_shrink_single_dim() -> None: grid = ChunkGrid.from_sizes((100, 50), [[10, 20, 30, 40], [25, 25]]) new_grid = grid.update_shape((35, 50)) assert _edges(new_grid, 0) == (10, 20, 30, 40) - assert new_grid.dimensions[0].nchunks == 3 + assert new_grid._dimensions[0].nchunks == 3 assert _edges(new_grid, 1) == (25, 25) @@ -2018,7 +2018,7 @@ def test_update_shape_shrink_to_single_chunk() -> None: grid = 
ChunkGrid.from_sizes((60, 50), [[10, 20, 30], [25, 25]]) new_grid = grid.update_shape((5, 50)) assert _edges(new_grid, 0) == (10, 20, 30) - assert new_grid.dimensions[0].nchunks == 1 + assert new_grid._dimensions[0].nchunks == 1 assert _edges(new_grid, 1) == (25, 25) @@ -2027,9 +2027,9 @@ def test_update_shape_shrink_multiple_dims() -> None: grid = ChunkGrid.from_sizes((40, 60), [[10, 10, 15, 5], [20, 25, 15]]) new_grid = grid.update_shape((25, 35)) assert _edges(new_grid, 0) == (10, 10, 15, 5) - assert new_grid.dimensions[0].nchunks == 3 + assert new_grid._dimensions[0].nchunks == 3 assert _edges(new_grid, 1) == (20, 25, 15) - assert new_grid.dimensions[1].nchunks == 2 + assert new_grid._dimensions[1].nchunks == 2 def test_update_shape_dimension_mismatch_error() -> None: @@ -2049,9 +2049,9 @@ def test_update_shape_boundary_cases() -> None: grid2 = ChunkGrid.from_sizes((60, 50), [[10, 20, 30], [15, 25, 10]]) new_grid2 = grid2.update_shape((30, 40)) assert _edges(new_grid2, 0) == (10, 20, 30) - assert new_grid2.dimensions[0].nchunks == 2 + assert new_grid2._dimensions[0].nchunks == 2 assert _edges(new_grid2, 1) == (15, 25, 10) - assert new_grid2.dimensions[1].nchunks == 2 + assert new_grid2._dimensions[1].nchunks == 2 def test_update_shape_regular_preserves_extents(tmp_path: Path) -> None: @@ -2065,7 +2065,7 @@ def test_update_shape_regular_preserves_extents(tmp_path: Path) -> None: z[:] = np.arange(100, dtype="int32") z.resize(50) assert z.shape == (50,) - assert ChunkGrid.from_metadata(z.metadata).dimensions[0].extent == 50 + assert ChunkGrid.from_metadata(z.metadata)._dimensions[0].extent == 50 # --------------------------------------------------------------------------- @@ -2077,7 +2077,7 @@ def test_update_shape_shrink_creates_boundary() -> None: """Shrinking extent into a chunk creates a boundary with clipped data_size""" grid = ChunkGrid.from_sizes((60,), [[10, 20, 30]]) new_grid = grid.update_shape((45,)) - dim = new_grid.dimensions[0] + dim = 
new_grid._dimensions[0] assert isinstance(dim, VaryingDimension) assert dim.edges == (10, 20, 30) assert dim.extent == 45 @@ -2089,7 +2089,7 @@ def test_update_shape_shrink_to_exact_boundary() -> None: """Shrinking to an exact chunk boundary reduces nchunks without partial data""" grid = ChunkGrid.from_sizes((60,), [[10, 20, 30]]) new_grid = grid.update_shape((30,)) - dim = new_grid.dimensions[0] + dim = new_grid._dimensions[0] assert isinstance(dim, VaryingDimension) assert dim.edges == (10, 20, 30) assert dim.nchunks == 2 @@ -2113,9 +2113,11 @@ def test_update_shape_parse_chunk_grid_rebinds_extent() -> None: """parse_chunk_grid re-binds VaryingDimension extent to array shape.""" g = ChunkGrid.from_sizes((60,), [[10, 20, 30]]) g2 = ChunkGrid( - dimensions=tuple(dim.with_extent(ext) for dim, ext in zip(g.dimensions, (50,), strict=True)) + dimensions=tuple( + dim.with_extent(ext) for dim, ext in zip(g._dimensions, (50,), strict=True) + ) ) - dim = g2.dimensions[0] + dim = g2._dimensions[0] assert isinstance(dim, VaryingDimension) assert dim.extent == 50 assert dim.data_size(2) == 20 @@ -2364,7 +2366,7 @@ def test_v2_chunk_grid_is_regular(tmp_path: Path) -> None: assert grid.is_regular assert grid.chunk_shape == (10, 15) assert grid.grid_shape == (2, 2) - assert all(isinstance(d, FixedDimension) for d in grid.dimensions) + assert all(isinstance(d, FixedDimension) for d in grid._dimensions) def test_v2_boundary_chunks(tmp_path: Path) -> None: @@ -2377,9 +2379,9 @@ def test_v2_boundary_chunks(tmp_path: Path) -> None: zarr_format=2, ) grid = ChunkGrid.from_metadata(a.metadata) - assert grid.dimensions[0].nchunks == 3 - assert grid.dimensions[0].chunk_size(2) == 10 - assert grid.dimensions[0].data_size(2) == 5 + assert grid._dimensions[0].nchunks == 3 + assert grid._dimensions[0].chunk_size(2) == 10 + assert grid._dimensions[0].data_size(2) == 5 def test_v2_slicing_with_boundary(tmp_path: Path) -> None: @@ -2714,7 +2716,7 @@ def 
test_property_block_indexing_rectilinear(data: st.DataObject) -> None: grid = ChunkGrid.from_metadata(z.metadata) for dim in range(a.ndim): - dim_grid = grid.dimensions[dim] + dim_grid = grid._dimensions[dim] block_ix = data.draw(st.integers(min_value=0, max_value=dim_grid.nchunks - 1)) sel = [slice(None)] * a.ndim start = dim_grid.chunk_offset(block_ix)