From d007c64ad9200463c97e1dffafd01a4dd66059c4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:09:40 +0200 Subject: [PATCH 01/55] feat(core): add _coalesce module skeleton with CoalesceOptions and stub --- src/zarr/core/_coalesce.py | 76 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 src/zarr/core/_coalesce.py diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py new file mode 100644 index 0000000000..d155c143f9 --- /dev/null +++ b/src/zarr/core/_coalesce.py @@ -0,0 +1,76 @@ +# src/zarr/core/_coalesce.py +from __future__ import annotations + +from typing import TYPE_CHECKING, TypedDict + +if TYPE_CHECKING: + from collections.abc import AsyncIterator, Awaitable, Callable, Iterable, Sequence + + from zarr.abc.store import ByteRequest + from zarr.core.buffer import Buffer + + +class CoalesceOptions(TypedDict): + """Knobs for coalescing contiguous byte ranges into fewer I/O requests. + + All fields required. See DEFAULT_COALESCE_OPTIONS for a sensible default. + """ + + max_gap_bytes: int + """Two RangeByteRequests separated by at most this many bytes may be merged into one fetch.""" + max_coalesced_bytes: int + """Upper bound on the size of a single merged fetch (ignored for an already-oversized single request).""" + max_concurrency: int + """Maximum number of merged fetches in flight at once.""" + + +DEFAULT_COALESCE_OPTIONS: CoalesceOptions = { + "max_gap_bytes": 1 << 20, # 1 MiB + "max_coalesced_bytes": 16 << 20, # 16 MiB + "max_concurrency": 10, +} + + +async def coalesced_get( + fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]], + byte_ranges: Iterable[ByteRequest | None], + *, + options: CoalesceOptions, +) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: + """Read many byte ranges through ``fetch``, coalescing nearby ranges and firing merged requests concurrently. + + Each yield corresponds to exactly one underlying I/O operation: a sequence of + ``(input_index, result)`` tuples for all input ranges served by that I/O. + Tuples within a yielded sequence are ordered by start offset. Yields across + groups are in completion order, not input order. + + Parameters + ---------- + fetch + Callable that reads one byte range and returns a ``Buffer`` (or ``None`` + if the underlying key does not exist). Typically constructed via + ``functools.partial(store.get, key, prototype)``. + byte_ranges + Input ranges. ``None`` means "the whole value". + options + Coalescing knobs. + + Yields + ------ + Sequence[tuple[int, Buffer | None]] + Per-I/O batch of ``(input_index, result)`` tuples. + + Notes + ----- + - Only ``RangeByteRequest`` inputs are coalesced. ``OffsetByteRequest``, + ``SuffixByteRequest``, and ``None`` are each treated as uncoalescable + (one fetch, one single-tuple yield per input). + - If any fetch returns ``None`` the iterator stops scheduling further fetches + and completes without yielding the missing group. Groups completed before + the miss remain observable. + - If a fetch raises, the exception propagates on the yield that produced the + failing group; earlier-completed groups remain observable. + """ + # Stub body; real implementation filled in by later tasks. + raise NotImplementedError + yield () # type: ignore[unreachable] # pragma: no cover -- keeps this function an async generator From abac6d37a6003ed7bc51bc0bf94c06fd879f0319 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:10:42 +0200 Subject: [PATCH 02/55] test(core): add failing tests for coalesced_get basic cases --- tests/test_coalesce.py | 160 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 tests/test_coalesce.py diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py new file mode 100644 index 0000000000..8642803b11 --- /dev/null +++ b/tests/test_coalesce.py @@ -0,0 +1,160 @@ +# tests/test_coalesce.py +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +import pytest + +from zarr.abc.store import ( + ByteRequest, + OffsetByteRequest, + RangeByteRequest, + SuffixByteRequest, +) +from zarr.core._coalesce import ( + DEFAULT_COALESCE_OPTIONS, + CoalesceOptions, + coalesced_get, +) +from zarr.core.buffer import Buffer, default_buffer_prototype + +if TYPE_CHECKING: + from collections.abc import AsyncIterator, Callable, Sequence + +pytestmark = pytest.mark.asyncio + + +def _buf(data: bytes) -> Buffer: + return default_buffer_prototype().buffer.from_bytes(data) + + +@dataclass +class FakeFetch: + """Records every call and serves canned bytes from an in-memory blob.""" + + blob: bytes + key_exists: bool = True + raise_on: Callable[[ByteRequest | None], bool] | None = None + calls: list[ByteRequest | None] = field(default_factory=list) + + async def __call__(self, byte_range: ByteRequest | None) -> Buffer | None: + self.calls.append(byte_range) + if not self.key_exists: + return None + if self.raise_on is not None and self.raise_on(byte_range): + raise OSError("injected") + if byte_range is None: + return _buf(self.blob) + if isinstance(byte_range, RangeByteRequest): + return _buf(self.blob[byte_range.start : byte_range.end]) + if isinstance(byte_range, OffsetByteRequest): + return _buf(self.blob[byte_range.offset :]) + if isinstance(byte_range, SuffixByteRequest): + return _buf(self.blob[-byte_range.suffix :]) + raise AssertionError(f"unknown byte_range {byte_range!r}") + + +# A permissive options value used by most tests (heavy merging allowed). +HEAVY_MERGE: CoalesceOptions = { + "max_gap_bytes": 1 << 20, + "max_coalesced_bytes": 1 << 30, + "max_concurrency": 10, +} + +# An options value that forbids all merging (gap threshold 0 and any adjacent +# merges would still be allowed at gap==0, so we also cap size). +NO_MERGE: CoalesceOptions = { + "max_gap_bytes": -1, # strictly less-than semantics -- any positive gap breaks + "max_coalesced_bytes": 1 << 30, + "max_concurrency": 10, +} + + +async def _collect( + agen: AsyncIterator[Sequence[tuple[int, Buffer | None]]], +) -> list[list[tuple[int, Buffer | None]]]: + """Drain an async generator of groups into a list of lists of tuples.""" + return [list(group) async for group in agen] + + +def _contents(groups: list[list[tuple[int, Buffer | None]]]) -> dict[int, bytes]: + """Flatten to {index: bytes}.""" + result: dict[int, bytes] = {} + for group in groups: + for idx, buf in group: + assert buf is not None + result[idx] = buf.to_bytes() + return result + + +async def test_empty_input() -> None: + fetch = FakeFetch(b"abc" * 1000) + groups = await _collect(coalesced_get(fetch, [], options=DEFAULT_COALESCE_OPTIONS)) + assert groups == [] + assert fetch.calls == [] + + +async def test_single_range() -> None: + fetch = FakeFetch(b"0123456789") + groups = await _collect( + coalesced_get( + fetch, + [RangeByteRequest(2, 5)], + options=DEFAULT_COALESCE_OPTIONS, + ) + ) + assert len(groups) == 1 + assert len(groups[0]) == 1 + assert _contents(groups) == {0: b"234"} + assert len(fetch.calls) == 1 + + +async def test_fully_disjoint_ranges_each_get_own_group() -> None: + # Ranges 100 bytes apart with max_gap_bytes < 100 will not merge. + fetch = FakeFetch(b"x" * 10_000) + opts: CoalesceOptions = { + "max_gap_bytes": 50, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 10, + } + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 10), + RangeByteRequest(200, 210), + RangeByteRequest(500, 510), + ] + groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + assert len(groups) == 3 + assert all(len(g) == 1 for g in groups) + # 3 fetches, one per input. + assert len(fetch.calls) == 3 + + +async def test_adjacent_ranges_merge_into_one_group() -> None: + # Three ranges within 10 bytes of each other; max_gap_bytes=50 -> one merged fetch. + fetch = FakeFetch(b"".join(bytes([i % 256]) for i in range(1000))) + opts: CoalesceOptions = { + "max_gap_bytes": 50, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 10, + } + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 5), + RangeByteRequest(10, 15), + RangeByteRequest(20, 25), + ] + groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + assert len(groups) == 1 + assert len(groups[0]) == 3 + # The single fetch should span the full merged region. + assert len(fetch.calls) == 1 + call = fetch.calls[0] + assert isinstance(call, RangeByteRequest) + assert call.start == 0 + assert call.end == 25 + # Contents correct. + assert _contents(groups) == { + 0: bytes(range(5)), + 1: bytes(range(10, 15)), + 2: bytes(range(20, 25)), + } From f65018a1d8a31a948a6ed8062b2e040ba0fae0b0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:11:26 +0200 Subject: [PATCH 03/55] feat(core): implement coalesced_get for basic sequential cases --- src/zarr/core/_coalesce.py | 60 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index d155c143f9..3d0d01185f 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -71,6 +71,60 @@ async def coalesced_get( - If a fetch raises, the exception propagates on the yield that produced the failing group; earlier-completed groups remain observable. """ - # Stub body; real implementation filled in by later tasks. - raise NotImplementedError - yield () # type: ignore[unreachable] # pragma: no cover -- keeps this function an async generator + # Local import to avoid cycles at module import time. + from zarr.abc.store import RangeByteRequest + + indexed: list[tuple[int, ByteRequest | None]] = list(enumerate(byte_ranges)) + if not indexed: + return + + # Split inputs into coalescable (RangeByteRequest only) and uncoalescable (the rest). + mergeable: list[tuple[int, RangeByteRequest]] = [ + (i, r) for i, r in indexed if isinstance(r, RangeByteRequest) + ] + uncoalescable: list[tuple[int, ByteRequest | None]] = [ + (i, r) for i, r in indexed if not isinstance(r, RangeByteRequest) + ] + + # Sort mergeables by start offset, then merge. + mergeable.sort(key=lambda pair: pair[1].start) + groups: list[list[tuple[int, RangeByteRequest]]] = [] + for pair in mergeable: + _i, r = pair + if groups: + last = groups[-1] + last_end = max(x[1].end for x in last) + gap = r.start - last_end + merged_start = min(x[1].start for x in last) + prospective_end = max(last_end, r.end) + prospective_size = prospective_end - merged_start + if ( + gap <= options["max_gap_bytes"] + and prospective_size <= options["max_coalesced_bytes"] + ): + last.append(pair) + continue + groups.append([pair]) + + # For now, serve groups sequentially (concurrency added in Task 5). + for group in groups: + group_start = min(x[1].start for x in group) + group_end = max(x[1].end for x in group) + big = await fetch(RangeByteRequest(group_start, group_end)) + if big is None: + return # key missing, stop yielding + # Slice back into per-input buffers, ordered by start offset. + group.sort(key=lambda pair: pair[1].start) + yielded: list[tuple[int, Buffer | None]] = [] + for i, r in group: + local_start = r.start - group_start + local_end = r.end - group_start + yielded.append((i, big[local_start:local_end])) + yield tuple(yielded) + + # Uncoalescable inputs are fetched one at a time, each as its own one-tuple group. + for idx, single in uncoalescable: + buf = await fetch(single) + if buf is None: + return # key missing, stop yielding + yield ((idx, buf),) From 9e1f1d2e1119fff1f2638b8c6f4dd7267e3f6241 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:12:03 +0200 Subject: [PATCH 04/55] test(core): cover Offset/Suffix/None and mixed-cluster cases in coalesced_get --- tests/test_coalesce.py | 81 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 8642803b11..6d3466d300 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -158,3 +158,84 @@ async def test_adjacent_ranges_merge_into_one_group() -> None: 1: bytes(range(10, 15)), 2: bytes(range(20, 25)), } + + +async def test_offset_and_suffix_and_none_each_get_own_group() -> None: + fetch = FakeFetch(b"abcdefghij") + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 3), + OffsetByteRequest(5), + SuffixByteRequest(2), + None, + ] + groups = await _collect(coalesced_get(fetch, ranges, options=DEFAULT_COALESCE_OPTIONS)) + # 1 group from the RangeByteRequest + 3 one-tuple groups from the rest. + assert len(groups) == 4 + # Contents. + flat = _contents(groups) + assert flat[0] == b"abc" + assert flat[1] == b"fghij" + assert flat[2] == b"ij" + assert flat[3] == b"abcdefghij" + + +async def test_indices_preserved_under_shuffled_input() -> None: + fetch = FakeFetch(b"".join(bytes([i % 256]) for i in range(1000))) + # Construct ranges in a deliberately non-sorted order. + ranges: list[ByteRequest | None] = [ + RangeByteRequest(500, 510), + RangeByteRequest(0, 10), + RangeByteRequest(200, 210), + RangeByteRequest(300, 310), + ] + opts: CoalesceOptions = { + "max_gap_bytes": 50, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 10, + } + groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + flat = _contents(groups) + # Indices match original positions, not sorted order. + assert flat[0] == bytes(b % 256 for b in range(500, 510)) + assert flat[1] == bytes(b % 256 for b in range(10)) + assert flat[2] == bytes(b % 256 for b in range(200, 210)) + assert flat[3] == bytes(b % 256 for b in range(300, 310)) + + +async def test_within_group_ordering_is_start_offset() -> None: + fetch = FakeFetch(b"".join(bytes([i % 256]) for i in range(100))) + # Two ranges will merge; one has a later start but is listed first in input. + ranges: list[ByteRequest | None] = [ + RangeByteRequest(20, 25), + RangeByteRequest(0, 5), + ] + opts: CoalesceOptions = { + "max_gap_bytes": 50, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 10, + } + groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + assert len(groups) == 1 + # Within the group, tuples are ordered by start offset. + # Input index 1 (start=0) comes first, then 0 (start=20). + assert [idx for idx, _ in groups[0]] == [1, 0] + + +async def test_mixed_mergeable_and_non_mergeable_counts_correct() -> None: + fetch = FakeFetch(b"x" * 10_000) + opts: CoalesceOptions = { + "max_gap_bytes": 50, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 10, + } + # Two clusters + one far-away singleton. + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 10), + RangeByteRequest(20, 30), + RangeByteRequest(500, 510), + ] + groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + assert len(groups) == 2 + # First group has 2, second has 1. + sizes = sorted(len(g) for g in groups) + assert sizes == [1, 2] From cd5097bc9805b5530317c737c93afcf4f50c6236 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:14:19 +0200 Subject: [PATCH 05/55] feat(core): run coalesced fetches concurrently under max_concurrency --- src/zarr/core/_coalesce.py | 116 ++++++++++++++++++++++++++++++------- tests/test_coalesce.py | 30 ++++++++++ 2 files changed, 124 insertions(+), 22 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 3d0d01185f..ae1937c873 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -1,6 +1,7 @@ # src/zarr/core/_coalesce.py from __future__ import annotations +import asyncio from typing import TYPE_CHECKING, TypedDict if TYPE_CHECKING: @@ -106,25 +107,96 @@ async def coalesced_get( continue groups.append([pair]) - # For now, serve groups sequentially (concurrency added in Task 5). - for group in groups: - group_start = min(x[1].start for x in group) - group_end = max(x[1].end for x in group) - big = await fetch(RangeByteRequest(group_start, group_end)) - if big is None: - return # key missing, stop yielding - # Slice back into per-input buffers, ordered by start offset. - group.sort(key=lambda pair: pair[1].start) - yielded: list[tuple[int, Buffer | None]] = [] - for i, r in group: - local_start = r.start - group_start - local_end = r.end - group_start - yielded.append((i, big[local_start:local_end])) - yield tuple(yielded) - - # Uncoalescable inputs are fetched one at a time, each as its own one-tuple group. - for idx, single in uncoalescable: - buf = await fetch(single) - if buf is None: - return # key missing, stop yielding - yield ((idx, buf),) + # Build a uniform list of work items. Each work item is a list of + # (input_index, ByteRequest | None) pairs. Merged groups have multiple + # members (all RangeByteRequest); uncoalescable items have a single member. + work_items: list[list[tuple[int, ByteRequest | None]]] = [ + [(idx, r) for idx, r in g] for g in groups + ] + work_items.extend([(idx, single)] for idx, single in uncoalescable) + + total = len(work_items) + if total == 0: + return + + # Completion queue entries are either ("ok", payload), ("missing", None), + # or ("error", exception). Kept as Any internally to avoid dragging + # Sequence out of TYPE_CHECKING. + completion_queue: asyncio.Queue[ + tuple[str, Sequence[tuple[int, Buffer | None]] | BaseException | None] + ] = asyncio.Queue() + semaphore = asyncio.Semaphore(options["max_concurrency"]) + + async def run_one(members: list[tuple[int, ByteRequest | None]]) -> None: + try: + async with semaphore: + if len(members) == 1 and not isinstance(members[0][1], RangeByteRequest): + # Uncoalescable single fetch. + idx, single = members[0] + buf = await fetch(single) + if buf is None: + await completion_queue.put(("missing", None)) + return + await completion_queue.put(("ok", ((idx, buf),))) + return + # Merged group path: all members are RangeByteRequest. + starts = [r.start for _, r in members if isinstance(r, RangeByteRequest)] + ends = [r.end for _, r in members if isinstance(r, RangeByteRequest)] + group_start = min(starts) + group_end = max(ends) + big = await fetch(RangeByteRequest(group_start, group_end)) + if big is None: + await completion_queue.put(("missing", None)) + return + ordered = sorted( + members, + key=lambda pair: pair[1].start if isinstance(pair[1], RangeByteRequest) else 0, + ) + sliced: list[tuple[int, Buffer | None]] = [] + for idx, r in ordered: + assert isinstance(r, RangeByteRequest) + sliced.append((idx, big[r.start - group_start : r.end - group_start])) + await completion_queue.put(("ok", tuple(sliced))) + except asyncio.CancelledError: + # Cancellation is expected when we stop scheduling on a missing key. + raise + except BaseException as exc: + await completion_queue.put(("error", exc)) + + # Launch all work items as tasks. The semaphore bounds actual concurrency. + tasks: set[asyncio.Task[None]] = set() + for item in work_items: + tasks.add(asyncio.create_task(run_one(item))) + + try: + drained = 0 + stopped = False + pending_error: BaseException | None = None + while drained < total: + kind, payload = await completion_queue.get() + drained += 1 + if stopped: + continue # Discard remaining results after a miss or error. + if kind == "ok": + assert payload is not None + assert not isinstance(payload, BaseException) + yield payload + elif kind == "missing": + stopped = True + # Cancel any still-pending tasks to avoid unnecessary I/O. + for t in tasks: + if not t.done(): + t.cancel() + else: # "error" + assert isinstance(payload, BaseException) + stopped = True + pending_error = payload + for t in tasks: + if not t.done(): + t.cancel() + if pending_error is not None: + raise pending_error + finally: + # Ensure we wait for any cancelled tasks to finish so no task escapes. + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 6d3466d300..69d3ee3a70 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -1,6 +1,7 @@ # tests/test_coalesce.py from __future__ import annotations +import asyncio from dataclasses import dataclass, field from typing import TYPE_CHECKING @@ -239,3 +240,32 @@ async def test_mixed_mergeable_and_non_mergeable_counts_correct() -> None: # First group has 2, second has 1. sizes = sorted(len(g) for g in groups) assert sizes == [1, 2] + + +async def test_max_concurrency_is_honored() -> None: + # Build 10 non-mergeable ranges, have the fetch hold a counter of in-flight calls. + in_flight = 0 + peak = 0 + lock = asyncio.Lock() + + async def fetch(byte_range: ByteRequest | None) -> Buffer | None: + nonlocal in_flight, peak + async with lock: + in_flight += 1 + peak = max(peak, in_flight) + # give the scheduler a chance to run other tasks + await asyncio.sleep(0.01) + async with lock: + in_flight -= 1 + return _buf(b"x") + + ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(10)] + opts: CoalesceOptions = { + "max_gap_bytes": 0, # force no merging + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 3, + } + async for _group in coalesced_get(fetch, ranges, options=opts): + pass + assert peak <= 3 + assert peak >= 2 # must have been some real concurrency From 3a85488becdc7a23362c6df3467c544e3a50612e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:14:49 +0200 Subject: [PATCH 06/55] test(core): cover key-missing (start/mid) and fetch-raises in coalesced_get --- tests/test_coalesce.py | 47 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 69d3ee3a70..6927c07482 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -269,3 +269,50 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: pass assert peak <= 3 assert peak >= 2 # must have been some real concurrency + + +async def test_key_missing_from_first_call_yields_nothing() -> None: + fetch = FakeFetch(b"x" * 100, key_exists=False) + ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(20, 30)] + groups = await _collect(coalesced_get(fetch, ranges, options=DEFAULT_COALESCE_OPTIONS)) + assert groups == [] + + +async def test_key_missing_mid_stream_yields_earlier_groups_only() -> None: + # Two non-mergeable ranges; the second fetch returns None. + call_count = 0 + + async def fetch(byte_range: ByteRequest | None) -> Buffer | None: + nonlocal call_count + call_count += 1 + # Ensure deterministic ordering: first call serves, second returns None. + await asyncio.sleep(0.01 if call_count == 1 else 0.02) + if call_count >= 2: + return None + return _buf(b"ok") + + opts: CoalesceOptions = { + "max_gap_bytes": -1, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 1, # serialize for determinism + } + ranges: list[ByteRequest | None] = [RangeByteRequest(0, 2), RangeByteRequest(100, 102)] + groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + # Exactly one group (the first) -- the second went missing. + assert len(groups) == 1 + assert len(groups[0]) == 1 + + +async def test_fetch_raises_propagates() -> None: + fetch = FakeFetch( + b"x" * 100, + raise_on=lambda r: isinstance(r, RangeByteRequest) and r.start >= 100, + ) + opts: CoalesceOptions = { + "max_gap_bytes": -1, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 1, + } + ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(200, 210)] + with pytest.raises(OSError, match="injected"): + await _collect(coalesced_get(fetch, ranges, options=opts)) From 4553523aa7179823ed830bd71d55b91ba2061370 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:15:16 +0200 Subject: [PATCH 07/55] test(core): cover max_coalesced_bytes cap in coalesced_get --- tests/test_coalesce.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 6927c07482..053c64d78f 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -316,3 +316,35 @@ async def test_fetch_raises_propagates() -> None: ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(200, 210)] with pytest.raises(OSError, match="injected"): await _collect(coalesced_get(fetch, ranges, options=opts)) + + +async def test_max_coalesced_bytes_prevents_over_cap_merge() -> None: + fetch = FakeFetch(b"x" * 10_000) + opts: CoalesceOptions = { + "max_gap_bytes": 1000, # allow gap + "max_coalesced_bytes": 50, # but cap the merged size + "max_concurrency": 10, + } + # Two ranges 20 bytes apart, each 20 bytes -- merged span would be 20+20+20=60 > 50. + ranges: list[ByteRequest | None] = [RangeByteRequest(0, 20), RangeByteRequest(40, 60)] + groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + assert len(groups) == 2 + assert all(len(g) == 1 for g in groups) + + +async def test_single_range_larger_than_cap_is_passed_through() -> None: + fetch = FakeFetch(b"x" * 10_000) + opts: CoalesceOptions = { + "max_gap_bytes": 1000, + "max_coalesced_bytes": 50, + "max_concurrency": 10, + } + # A single 200-byte range, larger than the cap. Should still be fetched. + ranges: list[ByteRequest | None] = [RangeByteRequest(0, 200)] + groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + assert len(groups) == 1 + assert len(groups[0]) == 1 + idx, buf = groups[0][0] + assert idx == 0 + assert buf is not None + assert buf.to_bytes() == b"x" * 200 From 162dd6de636a16cb6f0f43ad93c27c38e665a70c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:15:44 +0200 Subject: [PATCH 08/55] test(core): add coverage-invariant property test for coalesced_get --- tests/test_coalesce.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 053c64d78f..b34aa32a9d 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -348,3 +348,30 @@ async def test_single_range_larger_than_cap_is_passed_through() -> None: assert idx == 0 assert buf is not None assert buf.to_bytes() == b"x" * 200 + + +async def test_coverage_invariant_random_inputs() -> None: + import random + + rng = random.Random(42) + blob = bytes(i % 256 for i in range(10_000)) + fetch = FakeFetch(blob) + + # Generate 50 random RangeByteRequests within the blob. + ranges: list[ByteRequest | None] = [] + for _ in range(50): + start = rng.randint(0, 9000) + length = rng.randint(1, 500) + ranges.append(RangeByteRequest(start, start + length)) + + groups = await _collect(coalesced_get(fetch, ranges, options=DEFAULT_COALESCE_OPTIONS)) + seen: list[int] = [] + for group in groups: + for idx, _buf in group: + seen.append(idx) + assert sorted(seen) == list(range(len(ranges))) + # And the bytes are correct. + flat = _contents(groups) + for i, r in enumerate(ranges): + assert isinstance(r, RangeByteRequest) + assert flat[i] == blob[r.start : r.end] From 401e28bf681cba272bbb5ed31ac1547924d71583 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:26:11 +0200 Subject: [PATCH 09/55] test(core): drop unused HEAVY_MERGE/NO_MERGE constants Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_coalesce.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index b34aa32a9d..25962be6e4 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -56,22 +56,6 @@ async def __call__(self, byte_range: ByteRequest | None) -> Buffer | None: raise AssertionError(f"unknown byte_range {byte_range!r}") -# A permissive options value used by most tests (heavy merging allowed). -HEAVY_MERGE: CoalesceOptions = { - "max_gap_bytes": 1 << 20, - "max_coalesced_bytes": 1 << 30, - "max_concurrency": 10, -} - -# An options value that forbids all merging (gap threshold 0 and any adjacent -# merges would still be allowed at gap==0, so we also cap size). -NO_MERGE: CoalesceOptions = { - "max_gap_bytes": -1, # strictly less-than semantics -- any positive gap breaks - "max_coalesced_bytes": 1 << 30, - "max_concurrency": 10, -} - - async def _collect( agen: AsyncIterator[Sequence[tuple[int, Buffer | None]]], ) -> list[list[tuple[int, Buffer | None]]]: From 913928ce1513452275d1d3333a61f95506652633 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:26:41 +0200 Subject: [PATCH 10/55] docs(core): shorten coalesced_get docstring summary line Split the overlong first line into a short numpydoc summary plus an extended description. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/core/_coalesce.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index ae1937c873..216b8fd9a8 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -38,9 +38,11 @@ async def coalesced_get( *, options: CoalesceOptions, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: - """Read many byte ranges through ``fetch``, coalescing nearby ranges and firing merged requests concurrently. + """Read many byte ranges through ``fetch`` with coalescing and concurrency. - Each yield corresponds to exactly one underlying I/O operation: a sequence of + Nearby ranges are merged into a single underlying I/O (subject to + ``options``), and merged fetches are run concurrently. Each yield + corresponds to exactly one underlying I/O operation: a sequence of ``(input_index, result)`` tuples for all input ranges served by that I/O. Tuples within a yielded sequence are ordered by start offset. Yields across groups are in completion order, not input order. From 850b9cd92d5ec9be769baa5646371d2ce67fde17 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:27:05 +0200 Subject: [PATCH 11/55] refactor(core): drop dead invariant checks in merged-group path After the input split at the top of coalesced_get, merged groups only ever contain RangeByteRequest members. Replace the per-element isinstance filters (and the defensive ``else 0`` sort-key branch) with a single assertion at the top of the merged-group block and direct attribute access. Also remove the unreachable ``if total == 0: return`` guard (``indexed`` is non-empty by construction once we pass the earlier guard). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/core/_coalesce.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 216b8fd9a8..62d98334ce 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -118,8 +118,6 @@ async def coalesced_get( work_items.extend([(idx, single)] for idx, single in uncoalescable) total = len(work_items) - if total == 0: - return # Completion queue entries are either ("ok", payload), ("missing", None), # or ("error", exception). Kept as Any internally to avoid dragging @@ -142,22 +140,19 @@ async def run_one(members: list[tuple[int, ByteRequest | None]]) -> None: await completion_queue.put(("ok", ((idx, buf),))) return # Merged group path: all members are RangeByteRequest. - starts = [r.start for _, r in members if isinstance(r, RangeByteRequest)] - ends = [r.end for _, r in members if isinstance(r, RangeByteRequest)] + assert all(isinstance(r, RangeByteRequest) for _, r in members) + starts = [r.start for _, r in members] # type: ignore[union-attr] + ends = [r.end for _, r in members] # type: ignore[union-attr] group_start = min(starts) group_end = max(ends) big = await fetch(RangeByteRequest(group_start, group_end)) if big is None: await completion_queue.put(("missing", None)) return - ordered = sorted( - members, - key=lambda pair: pair[1].start if isinstance(pair[1], RangeByteRequest) else 0, - ) + ordered = sorted(members, key=lambda pair: pair[1].start) # type: ignore[union-attr] sliced: list[tuple[int, Buffer | None]] = [] for idx, r in ordered: - assert isinstance(r, RangeByteRequest) - sliced.append((idx, big[r.start - group_start : r.end - group_start])) + sliced.append((idx, big[r.start - group_start : r.end - group_start])) # type: ignore[union-attr] await completion_queue.put(("ok", tuple(sliced))) except asyncio.CancelledError: # Cancellation is expected when we stop scheduling on a missing key. From b2ec638f76847dd52512a88ec3e5f6e1db797a3f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:27:30 +0200 Subject: [PATCH 12/55] test(core): cover key-missing on uncoalescable input Exercise the ``kind == "missing"`` branch in the uncoalescable single-fetch arm for Offset/Suffix/None inputs, which was not hit by existing tests. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_coalesce.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 25962be6e4..1bd020a886 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -287,6 +287,25 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: assert len(groups[0]) == 1 +@pytest.mark.parametrize( + "byte_range", + [ + OffsetByteRequest(5), + SuffixByteRequest(5), + None, + ], + ids=["offset", "suffix", "none"], +) +async def test_key_missing_on_uncoalescable_input_yields_nothing( + byte_range: ByteRequest | None, +) -> None: + # Uncoalescable inputs take a distinct code path from the merged-group + # path; a missing key on that path must still short-circuit cleanly. + fetch = FakeFetch(b"x" * 100, key_exists=False) + groups = await _collect(coalesced_get(fetch, [byte_range], options=DEFAULT_COALESCE_OPTIONS)) + assert groups == [] + + async def test_fetch_raises_propagates() -> None: fetch = FakeFetch( b"x" * 100, From a4c333072298d630819e92d372a1b62868825a13 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:49:52 +0200 Subject: [PATCH 13/55] fix(core): cancel pending fetches on early exit and stop-after-miss Two related correctness issues in coalesced_get's drain loop: 1. When the consumer breaks out of the async-for (early exit), the generator's finally block only awaited in-flight tasks rather than cancelling them. That wasted I/O. Cancel first, then gather. 2. The drain loop waited on completion_queue for ``total`` entries, but after a "missing" or "error" we cancel pending tasks -- and cancelled tasks never enqueue a completion. With max_concurrency > 1 this could hang. Rework the drain loop to break out immediately on the first miss/error; the finally block handles cleanup. The new structure also collapses the redundant miss/error branches and removes the now-unused ``total``/``drained``/``stopped`` bookkeeping. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/core/_coalesce.py | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 62d98334ce..83f8e8b9f9 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -117,8 +117,6 @@ async def coalesced_get( ] work_items.extend([(idx, single)] for idx, single in uncoalescable) - total = len(work_items) - # Completion queue entries are either ("ok", payload), ("missing", None), # or ("error", exception). Kept as Any internally to avoid dragging # Sequence out of TYPE_CHECKING. @@ -166,34 +164,32 @@ async def run_one(members: list[tuple[int, ByteRequest | None]]) -> None: tasks.add(asyncio.create_task(run_one(item))) try: - drained = 0 - stopped = False pending_error: BaseException | None = None - while drained < total: + for _ in range(len(work_items)): kind, payload = await completion_queue.get() - drained += 1 - if stopped: - continue # Discard remaining results after a miss or error. if kind == "ok": assert payload is not None assert not isinstance(payload, BaseException) yield payload - elif kind == "missing": - stopped = True - # Cancel any still-pending tasks to avoid unnecessary I/O. - for t in tasks: - if not t.done(): - t.cancel() - else: # "error" + continue + # "missing" or "error": stop scheduling and cancel pending work. + # Late arrivals that raced to enqueue before cancellation took + # effect sit in the completion queue and are discarded by the + # finally block (the queue is local and will be garbage-collected). + for t in tasks: + if not t.done(): + t.cancel() + if kind == "error": assert isinstance(payload, BaseException) - stopped = True pending_error = payload - for t in tasks: - if not t.done(): - t.cancel() + break if pending_error is not None: raise pending_error finally: - # Ensure we wait for any cancelled tasks to finish so no task escapes. + # Best-effort cancellation for in-flight tasks (covers the consumer + # break / early-exit case where we did not proactively cancel). + for t in tasks: + if not t.done(): + t.cancel() if tasks: await asyncio.gather(*tasks, return_exceptions=True) From 5b1d8cd96955497d571582d7199b09da578fd61a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:50:24 +0200 Subject: [PATCH 14/55] test(core): cover mid-stream miss with concurrency > 1 Exercises the concurrent path where a missing key is observed while other fetches are still in flight. Uses an asyncio.Event to gate late arrivals until after the miss has been processed, giving the drain loop an opportunity to observe and discard post-stop completions, and verifies the iterator terminates cleanly without hanging or raising. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_coalesce.py | 59 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 1bd020a886..bfc53610c9 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -287,6 +287,65 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: assert len(groups[0]) == 1 +async def test_key_missing_mid_stream_with_concurrency_drains_late_arrivals() -> None: + # Schedule multiple non-mergeable fetches under max_concurrency=3. + # - Fetch #0 completes FIRST (short sleep) -> at least one yield observed. + # - Fetch #1 returns None shortly after -> triggers the miss path. + # - Fetches #2..#N are gated on an asyncio.Event so they only unblock + # AFTER the miss has been observed, producing late arrivals that + # exercise the `if stopped: continue` discard branch in the drain loop. + late_gate = asyncio.Event() + miss_fired = asyncio.Event() + + async def fetch(byte_range: ByteRequest | None) -> Buffer | None: + assert isinstance(byte_range, RangeByteRequest) + start = byte_range.start + if start == 0: + # First to complete: small sleep so it arrives before the miss. + await asyncio.sleep(0.01) + return _buf(b"ok") + if start == 1000: + # Miss: a little later than #0 so #0 yields first. + await asyncio.sleep(0.03) + miss_fired.set() + return None + # Late arrivals: wait until the miss has been processed, then return + # a buffer so the drain loop sees them post-stop. + await asyncio.wait_for(miss_fired.wait(), timeout=5.0) + await asyncio.wait_for(late_gate.wait(), timeout=5.0) + return _buf(b"ok") + + opts: CoalesceOptions = { + "max_gap_bytes": -1, # force no merging (each range its own fetch) + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 3, + } + # Stride by 1000 to avoid merging. 7 items fits within max_concurrency=3 + # while producing pending work after the miss. + ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(7)] + + groups: list[list[tuple[int, Buffer | None]]] = [] + agen = coalesced_get(fetch, ranges, options=opts) + try: + async for group in agen: + groups.append(list(group)) + # After the first yield, release the late gate so remaining + # in-flight tasks can complete and arrive post-stop. + late_gate.set() + finally: + # Guard against a bug preventing any yield: unblock waiters anyway. + late_gate.set() + + # We observed exactly the one pre-miss yield. + assert len(groups) == 1 + assert len(groups[0]) == 1 + idx, buf = groups[0][0] + assert idx == 0 + assert buf is not None + # The iterator completed cleanly without raising. + assert miss_fired.is_set() + + @pytest.mark.parametrize( "byte_range", [ From 6aa6f4b4cc0c1e3628be868b5ba47117aa8cecbe Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:50:45 +0200 Subject: [PATCH 15/55] test(core): verify consumer-break cancels pending fetches Drives many slow ranges with a small max_concurrency, breaks out of the async-for after the first yield, and verifies that at least one still-running fetch was cancelled rather than being left to run to completion. Cancellation is observed via a counter in the fetch's CancelledError branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_coalesce.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index bfc53610c9..4df3c7ce44 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -365,6 +365,48 @@ async def test_key_missing_on_uncoalescable_input_yields_nothing( assert groups == [] +async def test_consumer_break_cancels_pending_fetches() -> None: + # Kick off many slow ranges with small max_concurrency, break after the + # first yielded group, and verify the remaining tasks are cancelled rather + # than allowed to run to completion. + completed_calls = 0 + cancelled_calls = 0 + + async def fetch(byte_range: ByteRequest | None) -> Buffer | None: + nonlocal completed_calls, cancelled_calls + assert isinstance(byte_range, RangeByteRequest) + start = byte_range.start + try: + # First fetch returns fast so the async for body runs and can break. + # Later fetches sleep long enough that cancellation has room to land. + await asyncio.sleep(0.001 if start == 0 else 2.0) + except asyncio.CancelledError: + cancelled_calls += 1 + raise + completed_calls += 1 + return _buf(b"x") + + opts: CoalesceOptions = { + "max_gap_bytes": -1, # no merging + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 3, + } + ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(6)] + + agen = coalesced_get(fetch, ranges, options=opts) + # Break after receiving the first yield. + async for _group in agen: + break + # Make sure the async generator is fully closed so its finally runs. + await agen.aclose() + + # At least one slow fetch was actually running under the semaphore and got + # cancelled (rather than running to completion). + assert cancelled_calls >= 1 + # And the first range's fetch completed normally (no spurious cancels there). + assert completed_calls >= 1 + + async def test_fetch_raises_propagates() -> None: fetch = FakeFetch( b"x" * 100, From dded848a44305532e4690e397b67b8b983c60d92 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 15:54:05 +0200 Subject: [PATCH 16/55] fix(core): type coalesced_get as AsyncGenerator coalesced_get is implemented as an async generator (uses yield) and callers need access to aclose() to drive its finally block deterministically. Declaring the return type as AsyncGenerator instead of AsyncIterator exposes aclose()/asend()/athrow() through the type system, matches the runtime object, and lets consumers (e.g. the consumer-break test) avoid type-ignore escape hatches. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/core/_coalesce.py | 4 ++-- tests/test_coalesce.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 83f8e8b9f9..40fe14bf04 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, TypedDict if TYPE_CHECKING: - from collections.abc import AsyncIterator, Awaitable, Callable, Iterable, Sequence + from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence from zarr.abc.store import ByteRequest from zarr.core.buffer import Buffer @@ -37,7 +37,7 @@ async def coalesced_get( byte_ranges: Iterable[ByteRequest | None], *, options: CoalesceOptions, -) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: +) -> AsyncGenerator[Sequence[tuple[int, Buffer | None]], None]: """Read many byte ranges through ``fetch`` with coalescing and concurrency. Nearby ranges are merged into a single underlying I/O (subject to diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 4df3c7ce44..c95570f323 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -397,7 +397,8 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: # Break after receiving the first yield. async for _group in agen: break - # Make sure the async generator is fully closed so its finally runs. + # Explicitly close the generator so its finally block runs (cancelling + # in-flight tasks) before we make assertions. await agen.aclose() # At least one slow fetch was actually running under the semaphore and got From 865baf0828874b185b46b7c6fd64e94ff0722f0a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 16:05:08 +0200 Subject: [PATCH 17/55] refactor(test): drop redundant pytestmark asyncio pyproject asyncio_mode=auto already covers async test dispatch; the explicit pytestmark was a vestige. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_coalesce.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index c95570f323..41ccc71652 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -23,8 +23,6 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator, Callable, Sequence -pytestmark = pytest.mark.asyncio - def _buf(data: bytes) -> Buffer: return default_buffer_prototype().buffer.from_bytes(data) From 17d9f759b0c4ecbea4be575bfd70f5e8850c13a9 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 16:07:17 +0200 Subject: [PATCH 18/55] feat(storage): add private SupportsGetRanges protocol --- src/zarr/storage/_protocols.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 src/zarr/storage/_protocols.py diff --git a/src/zarr/storage/_protocols.py b/src/zarr/storage/_protocols.py new file mode 100644 index 0000000000..086a15dd90 --- /dev/null +++ b/src/zarr/storage/_protocols.py @@ -0,0 +1,34 @@ +# src/zarr/storage/_protocols.py +from __future__ import annotations + +from typing import TYPE_CHECKING, Protocol, runtime_checkable + +if TYPE_CHECKING: + from collections.abc import AsyncIterator, Iterable, Sequence + + from zarr.abc.store import ByteRequest + from zarr.core.buffer import Buffer, BufferPrototype + + +@runtime_checkable +class SupportsGetRanges(Protocol): + """Stores that satisfy this protocol can efficiently read many byte ranges + from a single key in a single call, typically via coalescing and concurrent fetch. + + Private / unstable. Shape may change before being made public. + """ + + def get_ranges( + self, + key: str, + byte_ranges: Iterable[ByteRequest | None], + *, + prototype: BufferPrototype, + ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: + """Read many byte ranges from ``key``. + + Each yield corresponds to one underlying I/O operation. + + See :func:`zarr.core._coalesce.coalesced_get` for full semantics. + """ + ... From 3ab711d64906e01d0fc21f5dce8e08b4c4c13f81 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 16:09:02 +0200 Subject: [PATCH 19/55] test(storage): add failing tests for FsspecStore.get_ranges --- tests/test_store/test_fsspec_get_ranges.py | 122 +++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 tests/test_store/test_fsspec_get_ranges.py diff --git a/tests/test_store/test_fsspec_get_ranges.py b/tests/test_store/test_fsspec_get_ranges.py new file mode 100644 index 0000000000..9eb517095a --- /dev/null +++ b/tests/test_store/test_fsspec_get_ranges.py @@ -0,0 +1,122 @@ +# tests/test_store/test_fsspec_get_ranges.py +"""Lightweight integration tests for FsspecStore.get_ranges using MemoryFileSystem. + +These don't need moto/s3 — they exercise the new method against an in-process +fsspec MemoryFileSystem wrapped in the async wrapper. +""" + +from __future__ import annotations + +import pytest + +from zarr.abc.store import RangeByteRequest +from zarr.core._coalesce import DEFAULT_COALESCE_OPTIONS, CoalesceOptions +from zarr.core.buffer import Buffer, default_buffer_prototype +from zarr.storage import FsspecStore +from zarr.storage._fsspec import _make_async + +fsspec = pytest.importorskip("fsspec") + + +@pytest.fixture +def memory_store() -> FsspecStore: + """An FsspecStore backed by fsspec MemoryFileSystem (wrapped async).""" + from fsspec.implementations.memory import MemoryFileSystem + + # Each test gets a clean filesystem; MemoryFileSystem is a singleton per target_options, + # so clear state explicitly. + fs: MemoryFileSystem = MemoryFileSystem() + fs.store.clear() + fs.pseudo_dirs.clear() + async_fs = _make_async(fs) + return FsspecStore(fs=async_fs, path="/root") + + +async def _write(store: FsspecStore, key: str, data: bytes) -> None: + buf = default_buffer_prototype().buffer.from_bytes(data) + await store.set(key, buf) + + +async def test_get_ranges_happy_path(memory_store: FsspecStore) -> None: + blob = bytes(i % 256 for i in range(1024)) + await _write(memory_store, "blob", blob) + proto = default_buffer_prototype() + + ranges = [ + RangeByteRequest(0, 10), + RangeByteRequest(100, 110), + RangeByteRequest(500, 520), + ] + groups: list[list[tuple[int, Buffer | None]]] = [ + list(group) async for group in memory_store.get_ranges("blob", ranges, prototype=proto) + ] + + flat: dict[int, bytes] = {} + for group in groups: + for idx, buf in group: + assert buf is not None + flat[idx] = buf.to_bytes() + + assert flat[0] == blob[0:10] + assert flat[1] == blob[100:110] + assert flat[2] == blob[500:520] + + +async def test_get_ranges_missing_key_yields_nothing(memory_store: FsspecStore) -> None: + proto = default_buffer_prototype() + groups: list[list[tuple[int, Buffer | None]]] = [ + list(group) + async for group in memory_store.get_ranges( + "does-not-exist", [RangeByteRequest(0, 10)], prototype=proto + ) + ] + assert groups == [] + + +async def test_default_coalesce_options_on_store_without_arg() -> None: + from fsspec.implementations.memory import MemoryFileSystem + + fs = MemoryFileSystem() + fs.store.clear() + store = FsspecStore(fs=_make_async(fs), path="/x") + assert store.coalesce_options == DEFAULT_COALESCE_OPTIONS + + +async def test_coalesce_options_wired_through() -> None: + from fsspec.implementations.memory import MemoryFileSystem + + fs = MemoryFileSystem() + fs.store.clear() + custom: CoalesceOptions = { + "max_gap_bytes": 0, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 2, + } + store = FsspecStore(fs=_make_async(fs), path="/x", coalesce_options=custom) + assert store.coalesce_options == custom + + +async def test_get_ranges_mixed_range_types(memory_store: FsspecStore) -> None: + """Covers RangeByteRequest, OffsetByteRequest, SuffixByteRequest, and None in one call.""" + from zarr.abc.store import OffsetByteRequest, SuffixByteRequest + + blob = bytes(i % 256 for i in range(512)) + await _write(memory_store, "mixed", blob) + proto = default_buffer_prototype() + + ranges = [ + RangeByteRequest(0, 10), + OffsetByteRequest(500), + SuffixByteRequest(12), + None, + ] + flat: dict[int, bytes] = {} + async for group in memory_store.get_ranges("mixed", ranges, prototype=proto): + for idx, buf in group: + assert buf is not None + flat[idx] = buf.to_bytes() + + assert flat[0] == blob[0:10] + assert flat[1] == blob[500:] + assert flat[2] == blob[-12:] + assert flat[3] == blob From 913be1067c203080484ef18ccf5f1e4e5a7a703c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 16:13:15 +0200 Subject: [PATCH 20/55] feat(storage): add FsspecStore.get_ranges and coalesce_options kwarg --- src/zarr/storage/_fsspec.py | 33 +++++++++++++++++++++- tests/test_store/test_fsspec_get_ranges.py | 4 +-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index 74e5869a66..d967e64ca2 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -3,6 +3,7 @@ import json import warnings from contextlib import suppress +from functools import partial from typing import TYPE_CHECKING, Any from packaging.version import parse as parse_version @@ -14,18 +15,24 @@ Store, SuffixByteRequest, ) +from zarr.core._coalesce import ( + DEFAULT_COALESCE_OPTIONS, + CoalesceOptions, + coalesced_get, +) from zarr.core.buffer import Buffer from zarr.errors import ZarrUserWarning from zarr.storage._utils import _join_paths, normalize_path if TYPE_CHECKING: - from collections.abc import AsyncIterator, Iterable + from collections.abc import AsyncIterator, Iterable, Sequence from fsspec import AbstractFileSystem from fsspec.asyn import AsyncFileSystem from fsspec.mapping import FSMap from zarr.core.buffer import BufferPrototype + from zarr.storage._protocols import SupportsGetRanges ALLOWED_EXCEPTIONS: tuple[type[Exception], ...] = ( @@ -124,11 +131,14 @@ def __init__( read_only: bool = False, path: str = "/", allowed_exceptions: tuple[type[Exception], ...] = ALLOWED_EXCEPTIONS, + *, + coalesce_options: CoalesceOptions = DEFAULT_COALESCE_OPTIONS, ) -> None: super().__init__(read_only=read_only) self.fs = fs self.path = normalize_path(path) self.allowed_exceptions = allowed_exceptions + self.coalesce_options = coalesce_options if not self.fs.async_impl: raise TypeError("Filesystem needs to support async operations.") @@ -315,6 +325,22 @@ async def get( else: return value + async def get_ranges( + self, + key: str, + byte_ranges: Iterable[ByteRequest | None], + *, + prototype: BufferPrototype, + ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: + """Read many byte ranges from ``key``, coalescing nearby ranges and fetching concurrently. + + See :class:`zarr.storage._protocols.SupportsGetRanges` for the contract and + :func:`zarr.core._coalesce.coalesced_get` for the full semantics. + """ + fetch = partial(self.get, key, prototype) + async for group in coalesced_get(fetch, byte_ranges, options=self.coalesce_options): + yield group + async def set( self, key: str, @@ -440,3 +466,8 @@ async def getsize(self, key: str) -> int: else: # fsspec doesn't have typing. We'll need to assume or verify this is true return int(size) + + +# Module-level type assertion: FsspecStore structurally satisfies SupportsGetRanges. +# This line is a no-op at runtime but causes mypy/pyright to complain if the shape drifts. +_: type[SupportsGetRanges] = FsspecStore diff --git a/tests/test_store/test_fsspec_get_ranges.py b/tests/test_store/test_fsspec_get_ranges.py index 9eb517095a..b30868a78b 100644 --- a/tests/test_store/test_fsspec_get_ranges.py +++ b/tests/test_store/test_fsspec_get_ranges.py @@ -98,13 +98,13 @@ async def test_coalesce_options_wired_through() -> None: async def test_get_ranges_mixed_range_types(memory_store: FsspecStore) -> None: """Covers RangeByteRequest, OffsetByteRequest, SuffixByteRequest, and None in one call.""" - from zarr.abc.store import OffsetByteRequest, SuffixByteRequest + from zarr.abc.store import ByteRequest, OffsetByteRequest, SuffixByteRequest blob = bytes(i % 256 for i in range(512)) await _write(memory_store, "mixed", blob) proto = default_buffer_prototype() - ranges = [ + ranges: list[ByteRequest | None] = [ RangeByteRequest(0, 10), OffsetByteRequest(500), SuffixByteRequest(12), From 0328e01e2b81550c6b0f54e06da542dcff93bbaa Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 16:13:34 +0200 Subject: [PATCH 21/55] test(storage): add SupportsGetRanges conformance tests --- tests/test_store/test_protocols.py | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/test_store/test_protocols.py diff --git a/tests/test_store/test_protocols.py b/tests/test_store/test_protocols.py new file mode 100644 index 0000000000..c47cf928c3 --- /dev/null +++ b/tests/test_store/test_protocols.py @@ -0,0 +1,38 @@ +# tests/test_store/test_protocols.py +"""Runtime and static conformance tests for zarr.storage._protocols.SupportsGetRanges.""" + +from __future__ import annotations + +import pytest + +from zarr.storage._protocols import SupportsGetRanges + + +def test_fsspec_store_satisfies_supports_get_ranges() -> None: + pytest.importorskip("fsspec") + from fsspec.implementations.memory import MemoryFileSystem + + from zarr.storage import FsspecStore + from zarr.storage._fsspec import _make_async + + fs = MemoryFileSystem() + fs.store.clear() + store = FsspecStore(fs=_make_async(fs), path="/x") + assert isinstance(store, SupportsGetRanges) + + +def test_memory_store_does_not_satisfy_supports_get_ranges() -> None: + """Sanity check: stores that don't implement get_ranges shouldn't satisfy the protocol.""" + from zarr.storage import MemoryStore + + store = MemoryStore() + assert not isinstance(store, SupportsGetRanges) + + +def test_type_assignment_at_module_level() -> None: + """Smoke-test the module-level `_: type[SupportsGetRanges] = FsspecStore`. + + If this runs without error the module imported cleanly; the static check is in mypy. + """ + pytest.importorskip("fsspec") + from zarr.storage import _fsspec # noqa: F401 From e7432c5c67209b7a15465c582d1827f63936457f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 16:16:46 +0200 Subject: [PATCH 22/55] chore: add towncrier fragment for get_ranges Used 0000.feature.md as a placeholder; rename to {pr-number}.feature.md once the PR is opened. Co-Authored-By: Claude Opus 4.7 (1M context) --- changes/0000.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/0000.feature.md diff --git a/changes/0000.feature.md b/changes/0000.feature.md new file mode 100644 index 0000000000..288c8af0b4 --- /dev/null +++ b/changes/0000.feature.md @@ -0,0 +1 @@ +Add `zarr.storage.FsspecStore.get_ranges` for concurrent, coalesced multi-range reads from a single key. The method satisfies the private `zarr.storage._protocols.SupportsGetRanges` protocol. A new keyword-only constructor argument `coalesce_options` on `FsspecStore` controls the max gap, max coalesced size, and max concurrency of the underlying requests. From 349bd9c6bc99e30349c817a3fd3d76e37cd05a64 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 16:58:44 +0200 Subject: [PATCH 23/55] docs: drop private-symbol mention from get_ranges changelog The SupportsGetRanges protocol is private; a user-facing release note shouldn't advertise it. Co-Authored-By: Claude Opus 4.7 (1M context) --- changes/0000.feature.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/0000.feature.md b/changes/0000.feature.md index 288c8af0b4..ce26a8c209 100644 --- a/changes/0000.feature.md +++ b/changes/0000.feature.md @@ -1 +1 @@ -Add `zarr.storage.FsspecStore.get_ranges` for concurrent, coalesced multi-range reads from a single key. The method satisfies the private `zarr.storage._protocols.SupportsGetRanges` protocol. A new keyword-only constructor argument `coalesce_options` on `FsspecStore` controls the max gap, max coalesced size, and max concurrency of the underlying requests. +Add `zarr.storage.FsspecStore.get_ranges` for concurrent, coalesced multi-range reads from a single key. A new keyword-only constructor argument `coalesce_options` on `FsspecStore` controls the max gap, max coalesced size, and max concurrency of the underlying requests. From 79e9927e6ecc05e77ecb08c0ab8c364959d7352f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 20:43:56 +0200 Subject: [PATCH 24/55] test: refactor tests --- tests/test_coalesce.py | 542 ++++++++++++++++++++++------------------- 1 file changed, 289 insertions(+), 253 deletions(-) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 41ccc71652..68a620ea5d 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -71,161 +71,236 @@ def _contents(groups: list[list[tuple[int, Buffer | None]]]) -> dict[int, bytes] return result -async def test_empty_input() -> None: - fetch = FakeFetch(b"abc" * 1000) - groups = await _collect(coalesced_get(fetch, [], options=DEFAULT_COALESCE_OPTIONS)) - assert groups == [] - assert fetch.calls == [] +# --------------------------------------------------------------------------- +# Shared option values for parametrized structural tests. +# --------------------------------------------------------------------------- + +DEFAULT: CoalesceOptions = DEFAULT_COALESCE_OPTIONS +"""The library default; permissive merging.""" + +MERGE_GAP_50: CoalesceOptions = { + "max_gap_bytes": 50, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 10, +} +"""Merge ranges within 50 bytes of each other.""" + +NO_MERGE: CoalesceOptions = { + "max_gap_bytes": -1, + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 10, +} +"""No merging: any positive gap is > -1, so no pair ever coalesces.""" + +CAP_50: CoalesceOptions = { + "max_gap_bytes": 1000, + "max_coalesced_bytes": 50, + "max_concurrency": 10, +} +"""Gap permissive but merged size capped at 50 bytes.""" + + +# A deterministic blob used for content-sensitive cases: byte i == (i % 256). +_INDEXED_BLOB = bytes(i % 256 for i in range(10_000)) + + +# --------------------------------------------------------------------------- +# Parametrized structural/content tests (cases without async timing or errors). +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class StructuralCase: + """One row of the parametrized structure-and-contents table.""" + + id: str + """pytest id for the case.""" + ranges: list[ByteRequest | None] + """Input to coalesced_get.""" + options: CoalesceOptions + """Coalescing knobs.""" + expected_group_sizes: list[int] + """Sorted list of group tuple-counts (order-independent).""" + expected_contents: dict[int, bytes] | None = None + """{input_index: bytes} to verify bytes, or None to skip the content check.""" + expected_n_fetches: int | None = None + """Exact number of calls to the fetch callable, or None to skip the check.""" + + +_STRUCTURAL_CASES: list[StructuralCase] = [ + StructuralCase( + id="empty-input", + ranges=[], + options=DEFAULT, + expected_group_sizes=[], + expected_n_fetches=0, + ), + StructuralCase( + id="single-range", + ranges=[RangeByteRequest(2, 5)], + options=DEFAULT, + expected_group_sizes=[1], + expected_contents={0: _INDEXED_BLOB[2:5]}, + expected_n_fetches=1, + ), + StructuralCase( + id="disjoint-3-no-merge", + ranges=[ + RangeByteRequest(0, 10), + RangeByteRequest(200, 210), + RangeByteRequest(500, 510), + ], + options=MERGE_GAP_50, + expected_group_sizes=[1, 1, 1], + expected_contents={ + 0: _INDEXED_BLOB[0:10], + 1: _INDEXED_BLOB[200:210], + 2: _INDEXED_BLOB[500:510], + }, + expected_n_fetches=3, + ), + StructuralCase( + id="adjacent-3-one-merged-group", + ranges=[ + RangeByteRequest(0, 5), + RangeByteRequest(10, 15), + RangeByteRequest(20, 25), + ], + options=MERGE_GAP_50, + expected_group_sizes=[3], + expected_contents={ + 0: _INDEXED_BLOB[0:5], + 1: _INDEXED_BLOB[10:15], + 2: _INDEXED_BLOB[20:25], + }, + expected_n_fetches=1, + ), + StructuralCase( + id="two-clusters-one-singleton", + ranges=[ + RangeByteRequest(0, 10), + RangeByteRequest(20, 30), + RangeByteRequest(500, 510), + ], + options=MERGE_GAP_50, + expected_group_sizes=[1, 2], + expected_contents={ + 0: _INDEXED_BLOB[0:10], + 1: _INDEXED_BLOB[20:30], + 2: _INDEXED_BLOB[500:510], + }, + expected_n_fetches=2, + ), + StructuralCase( + id="uncoalescable-mixed-with-range", + ranges=[ + RangeByteRequest(0, 3), + OffsetByteRequest(5), + SuffixByteRequest(2), + None, + ], + options=DEFAULT, + expected_group_sizes=[1, 1, 1, 1], + expected_contents={ + 0: _INDEXED_BLOB[0:3], + 1: _INDEXED_BLOB[5:], + 2: _INDEXED_BLOB[-2:], + 3: _INDEXED_BLOB, + }, + expected_n_fetches=4, + ), + StructuralCase( + id="shuffled-input-indices-preserved", + ranges=[ + RangeByteRequest(500, 510), + RangeByteRequest(0, 10), + RangeByteRequest(200, 210), + RangeByteRequest(300, 310), + ], + options=MERGE_GAP_50, + expected_group_sizes=[1, 1, 1, 1], + expected_contents={ + 0: _INDEXED_BLOB[500:510], + 1: _INDEXED_BLOB[0:10], + 2: _INDEXED_BLOB[200:210], + 3: _INDEXED_BLOB[300:310], + }, + expected_n_fetches=4, + ), + StructuralCase( + id="cap-prevents-merge-of-close-ranges", + # 20 + 20 gap + 20 = 60-byte merged span > cap of 50. + ranges=[RangeByteRequest(0, 20), RangeByteRequest(40, 60)], + options=CAP_50, + expected_group_sizes=[1, 1], + expected_n_fetches=2, + ), + StructuralCase( + id="single-range-larger-than-cap-passes-through", + # Cap only applies to MERGE decisions; a lone oversized range still fetches. + ranges=[RangeByteRequest(0, 200)], + options=CAP_50, + expected_group_sizes=[1], + expected_contents={0: _INDEXED_BLOB[0:200]}, + expected_n_fetches=1, + ), +] + + +@pytest.mark.parametrize("case", _STRUCTURAL_CASES, ids=lambda c: c.id) +async def test_coalescing_structure_and_contents(case: StructuralCase) -> None: + """Group structure, byte contents, and fetch-call count for the deterministic cases.""" + fetch = FakeFetch(_INDEXED_BLOB) + groups = await _collect(coalesced_get(fetch, case.ranges, options=case.options)) + + assert sorted(len(g) for g in groups) == sorted(case.expected_group_sizes) + + if case.expected_contents is not None: + assert _contents(groups) == case.expected_contents + + if case.expected_n_fetches is not None: + assert len(fetch.calls) == case.expected_n_fetches + + +# --------------------------------------------------------------------------- +# Focused non-parametrized tests for cases with distinctive assertion shapes. +# --------------------------------------------------------------------------- -async def test_single_range() -> None: - fetch = FakeFetch(b"0123456789") - groups = await _collect( - coalesced_get( - fetch, - [RangeByteRequest(2, 5)], - options=DEFAULT_COALESCE_OPTIONS, - ) - ) +async def test_within_group_ordering_is_start_offset() -> None: + """Within a merged group, tuples are ordered by start offset, not input order.""" + fetch = FakeFetch(_INDEXED_BLOB) + # Two ranges that merge; one has a later start but is listed first in input. + ranges: list[ByteRequest | None] = [RangeByteRequest(20, 25), RangeByteRequest(0, 5)] + groups = await _collect(coalesced_get(fetch, ranges, options=MERGE_GAP_50)) assert len(groups) == 1 - assert len(groups[0]) == 1 - assert _contents(groups) == {0: b"234"} - assert len(fetch.calls) == 1 - - -async def test_fully_disjoint_ranges_each_get_own_group() -> None: - # Ranges 100 bytes apart with max_gap_bytes < 100 will not merge. - fetch = FakeFetch(b"x" * 10_000) - opts: CoalesceOptions = { - "max_gap_bytes": 50, - "max_coalesced_bytes": 1 << 20, - "max_concurrency": 10, - } - ranges: list[ByteRequest | None] = [ - RangeByteRequest(0, 10), - RangeByteRequest(200, 210), - RangeByteRequest(500, 510), - ] - groups = await _collect(coalesced_get(fetch, ranges, options=opts)) - assert len(groups) == 3 - assert all(len(g) == 1 for g in groups) - # 3 fetches, one per input. - assert len(fetch.calls) == 3 + # Input index 1 (start=0) comes first, then 0 (start=20). + assert [idx for idx, _ in groups[0]] == [1, 0] -async def test_adjacent_ranges_merge_into_one_group() -> None: - # Three ranges within 10 bytes of each other; max_gap_bytes=50 -> one merged fetch. - fetch = FakeFetch(b"".join(bytes([i % 256]) for i in range(1000))) - opts: CoalesceOptions = { - "max_gap_bytes": 50, - "max_coalesced_bytes": 1 << 20, - "max_concurrency": 10, - } +async def test_adjacent_ranges_fire_single_fetch_spanning_merged_region() -> None: + """Verify the merged fetch covers exactly the span from min-start to max-end.""" + fetch = FakeFetch(_INDEXED_BLOB) ranges: list[ByteRequest | None] = [ RangeByteRequest(0, 5), RangeByteRequest(10, 15), RangeByteRequest(20, 25), ] - groups = await _collect(coalesced_get(fetch, ranges, options=opts)) - assert len(groups) == 1 - assert len(groups[0]) == 3 - # The single fetch should span the full merged region. + await _collect(coalesced_get(fetch, ranges, options=MERGE_GAP_50)) assert len(fetch.calls) == 1 call = fetch.calls[0] assert isinstance(call, RangeByteRequest) assert call.start == 0 assert call.end == 25 - # Contents correct. - assert _contents(groups) == { - 0: bytes(range(5)), - 1: bytes(range(10, 15)), - 2: bytes(range(20, 25)), - } -async def test_offset_and_suffix_and_none_each_get_own_group() -> None: - fetch = FakeFetch(b"abcdefghij") - ranges: list[ByteRequest | None] = [ - RangeByteRequest(0, 3), - OffsetByteRequest(5), - SuffixByteRequest(2), - None, - ] - groups = await _collect(coalesced_get(fetch, ranges, options=DEFAULT_COALESCE_OPTIONS)) - # 1 group from the RangeByteRequest + 3 one-tuple groups from the rest. - assert len(groups) == 4 - # Contents. - flat = _contents(groups) - assert flat[0] == b"abc" - assert flat[1] == b"fghij" - assert flat[2] == b"ij" - assert flat[3] == b"abcdefghij" - - -async def test_indices_preserved_under_shuffled_input() -> None: - fetch = FakeFetch(b"".join(bytes([i % 256]) for i in range(1000))) - # Construct ranges in a deliberately non-sorted order. - ranges: list[ByteRequest | None] = [ - RangeByteRequest(500, 510), - RangeByteRequest(0, 10), - RangeByteRequest(200, 210), - RangeByteRequest(300, 310), - ] - opts: CoalesceOptions = { - "max_gap_bytes": 50, - "max_coalesced_bytes": 1 << 20, - "max_concurrency": 10, - } - groups = await _collect(coalesced_get(fetch, ranges, options=opts)) - flat = _contents(groups) - # Indices match original positions, not sorted order. - assert flat[0] == bytes(b % 256 for b in range(500, 510)) - assert flat[1] == bytes(b % 256 for b in range(10)) - assert flat[2] == bytes(b % 256 for b in range(200, 210)) - assert flat[3] == bytes(b % 256 for b in range(300, 310)) - - -async def test_within_group_ordering_is_start_offset() -> None: - fetch = FakeFetch(b"".join(bytes([i % 256]) for i in range(100))) - # Two ranges will merge; one has a later start but is listed first in input. - ranges: list[ByteRequest | None] = [ - RangeByteRequest(20, 25), - RangeByteRequest(0, 5), - ] - opts: CoalesceOptions = { - "max_gap_bytes": 50, - "max_coalesced_bytes": 1 << 20, - "max_concurrency": 10, - } - groups = await _collect(coalesced_get(fetch, ranges, options=opts)) - assert len(groups) == 1 - # Within the group, tuples are ordered by start offset. - # Input index 1 (start=0) comes first, then 0 (start=20). - assert [idx for idx, _ in groups[0]] == [1, 0] - - -async def test_mixed_mergeable_and_non_mergeable_counts_correct() -> None: - fetch = FakeFetch(b"x" * 10_000) - opts: CoalesceOptions = { - "max_gap_bytes": 50, - "max_coalesced_bytes": 1 << 20, - "max_concurrency": 10, - } - # Two clusters + one far-away singleton. - ranges: list[ByteRequest | None] = [ - RangeByteRequest(0, 10), - RangeByteRequest(20, 30), - RangeByteRequest(500, 510), - ] - groups = await _collect(coalesced_get(fetch, ranges, options=opts)) - assert len(groups) == 2 - # First group has 2, second has 1. - sizes = sorted(len(g) for g in groups) - assert sizes == [1, 2] +# --------------------------------------------------------------------------- +# Concurrency and cancellation. +# --------------------------------------------------------------------------- async def test_max_concurrency_is_honored() -> None: - # Build 10 non-mergeable ranges, have the fetch hold a counter of in-flight calls. + """With 10 non-mergeable ranges and max_concurrency=3, peak in-flight must not exceed 3.""" in_flight = 0 peak = 0 lock = asyncio.Lock() @@ -253,21 +328,78 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: assert peak >= 2 # must have been some real concurrency +async def test_consumer_break_cancels_pending_fetches() -> None: + """Breaking out of the async for should cancel pending fetches rather than let them complete.""" + completed_calls = 0 + cancelled_calls = 0 + + async def fetch(byte_range: ByteRequest | None) -> Buffer | None: + nonlocal completed_calls, cancelled_calls + assert isinstance(byte_range, RangeByteRequest) + start = byte_range.start + try: + # First fetch returns fast so the async-for body runs and can break. + # Later fetches sleep long enough that cancellation has room to land. + await asyncio.sleep(0.001 if start == 0 else 2.0) + except asyncio.CancelledError: + cancelled_calls += 1 + raise + completed_calls += 1 + return _buf(b"x") + + opts: CoalesceOptions = { + "max_gap_bytes": -1, # no merging + "max_coalesced_bytes": 1 << 20, + "max_concurrency": 3, + } + ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(6)] + + agen = coalesced_get(fetch, ranges, options=opts) + async for _group in agen: + break + # Explicitly close the generator so its finally block runs (cancelling + # in-flight tasks) before we make assertions. + await agen.aclose() + + assert cancelled_calls >= 1 + assert completed_calls >= 1 + + +# --------------------------------------------------------------------------- +# Key-missing semantics. +# --------------------------------------------------------------------------- + + async def test_key_missing_from_first_call_yields_nothing() -> None: + """If the very first fetch returns None, the iterator yields no groups.""" fetch = FakeFetch(b"x" * 100, key_exists=False) ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(20, 30)] groups = await _collect(coalesced_get(fetch, ranges, options=DEFAULT_COALESCE_OPTIONS)) assert groups == [] +@pytest.mark.parametrize( + "byte_range", + [OffsetByteRequest(5), SuffixByteRequest(5), None], + ids=["offset", "suffix", "none"], +) +async def test_key_missing_on_uncoalescable_input_yields_nothing( + byte_range: ByteRequest | None, +) -> None: + """Uncoalescable inputs take a distinct path; key-missing must still short-circuit.""" + fetch = FakeFetch(b"x" * 100, key_exists=False) + groups = await _collect(coalesced_get(fetch, [byte_range], options=DEFAULT_COALESCE_OPTIONS)) + assert groups == [] + + async def test_key_missing_mid_stream_yields_earlier_groups_only() -> None: - # Two non-mergeable ranges; the second fetch returns None. + """If a later fetch returns None, earlier-completed groups remain observable.""" call_count = 0 async def fetch(byte_range: ByteRequest | None) -> Buffer | None: nonlocal call_count call_count += 1 - # Ensure deterministic ordering: first call serves, second returns None. + # Deterministic: first call serves, second returns None. await asyncio.sleep(0.01 if call_count == 1 else 0.02) if call_count >= 2: return None @@ -280,18 +412,16 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: } ranges: list[ByteRequest | None] = [RangeByteRequest(0, 2), RangeByteRequest(100, 102)] groups = await _collect(coalesced_get(fetch, ranges, options=opts)) - # Exactly one group (the first) -- the second went missing. assert len(groups) == 1 assert len(groups[0]) == 1 async def test_key_missing_mid_stream_with_concurrency_drains_late_arrivals() -> None: - # Schedule multiple non-mergeable fetches under max_concurrency=3. - # - Fetch #0 completes FIRST (short sleep) -> at least one yield observed. - # - Fetch #1 returns None shortly after -> triggers the miss path. - # - Fetches #2..#N are gated on an asyncio.Event so they only unblock - # AFTER the miss has been observed, producing late arrivals that - # exercise the `if stopped: continue` discard branch in the drain loop. + """ + Under max_concurrency > 1, a mid-stream miss should still cause the iterator + to complete cleanly even when unrelated tasks are still in flight and arrive + after the miss has been observed. + """ late_gate = asyncio.Event() miss_fired = asyncio.Event() @@ -299,7 +429,7 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: assert isinstance(byte_range, RangeByteRequest) start = byte_range.start if start == 0: - # First to complete: small sleep so it arrives before the miss. + # First to complete: arrives before the miss. await asyncio.sleep(0.01) return _buf(b"ok") if start == 1000: @@ -314,12 +444,10 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: return _buf(b"ok") opts: CoalesceOptions = { - "max_gap_bytes": -1, # force no merging (each range its own fetch) + "max_gap_bytes": -1, "max_coalesced_bytes": 1 << 20, "max_concurrency": 3, } - # Stride by 1000 to avoid merging. 7 items fits within max_concurrency=3 - # while producing pending work after the miss. ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(7)] groups: list[list[tuple[int, Buffer | None]]] = [] @@ -327,88 +455,27 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: try: async for group in agen: groups.append(list(group)) - # After the first yield, release the late gate so remaining - # in-flight tasks can complete and arrive post-stop. late_gate.set() finally: - # Guard against a bug preventing any yield: unblock waiters anyway. late_gate.set() - # We observed exactly the one pre-miss yield. assert len(groups) == 1 assert len(groups[0]) == 1 idx, buf = groups[0][0] assert idx == 0 assert buf is not None - # The iterator completed cleanly without raising. assert miss_fired.is_set() -@pytest.mark.parametrize( - "byte_range", - [ - OffsetByteRequest(5), - SuffixByteRequest(5), - None, - ], - ids=["offset", "suffix", "none"], -) -async def test_key_missing_on_uncoalescable_input_yields_nothing( - byte_range: ByteRequest | None, -) -> None: - # Uncoalescable inputs take a distinct code path from the merged-group - # path; a missing key on that path must still short-circuit cleanly. - fetch = FakeFetch(b"x" * 100, key_exists=False) - groups = await _collect(coalesced_get(fetch, [byte_range], options=DEFAULT_COALESCE_OPTIONS)) - assert groups == [] - - -async def test_consumer_break_cancels_pending_fetches() -> None: - # Kick off many slow ranges with small max_concurrency, break after the - # first yielded group, and verify the remaining tasks are cancelled rather - # than allowed to run to completion. - completed_calls = 0 - cancelled_calls = 0 - - async def fetch(byte_range: ByteRequest | None) -> Buffer | None: - nonlocal completed_calls, cancelled_calls - assert isinstance(byte_range, RangeByteRequest) - start = byte_range.start - try: - # First fetch returns fast so the async for body runs and can break. - # Later fetches sleep long enough that cancellation has room to land. - await asyncio.sleep(0.001 if start == 0 else 2.0) - except asyncio.CancelledError: - cancelled_calls += 1 - raise - completed_calls += 1 - return _buf(b"x") - - opts: CoalesceOptions = { - "max_gap_bytes": -1, # no merging - "max_coalesced_bytes": 1 << 20, - "max_concurrency": 3, - } - ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(6)] - - agen = coalesced_get(fetch, ranges, options=opts) - # Break after receiving the first yield. - async for _group in agen: - break - # Explicitly close the generator so its finally block runs (cancelling - # in-flight tasks) before we make assertions. - await agen.aclose() - - # At least one slow fetch was actually running under the semaphore and got - # cancelled (rather than running to completion). - assert cancelled_calls >= 1 - # And the first range's fetch completed normally (no spurious cancels there). - assert completed_calls >= 1 +# --------------------------------------------------------------------------- +# Error propagation. +# --------------------------------------------------------------------------- async def test_fetch_raises_propagates() -> None: + """An exception raised by fetch propagates on the yield that produced the failing group.""" fetch = FakeFetch( - b"x" * 100, + _INDEXED_BLOB, raise_on=lambda r: isinstance(r, RangeByteRequest) and r.start >= 100, ) opts: CoalesceOptions = { @@ -421,46 +488,18 @@ async def test_fetch_raises_propagates() -> None: await _collect(coalesced_get(fetch, ranges, options=opts)) -async def test_max_coalesced_bytes_prevents_over_cap_merge() -> None: - fetch = FakeFetch(b"x" * 10_000) - opts: CoalesceOptions = { - "max_gap_bytes": 1000, # allow gap - "max_coalesced_bytes": 50, # but cap the merged size - "max_concurrency": 10, - } - # Two ranges 20 bytes apart, each 20 bytes -- merged span would be 20+20+20=60 > 50. - ranges: list[ByteRequest | None] = [RangeByteRequest(0, 20), RangeByteRequest(40, 60)] - groups = await _collect(coalesced_get(fetch, ranges, options=opts)) - assert len(groups) == 2 - assert all(len(g) == 1 for g in groups) - - -async def test_single_range_larger_than_cap_is_passed_through() -> None: - fetch = FakeFetch(b"x" * 10_000) - opts: CoalesceOptions = { - "max_gap_bytes": 1000, - "max_coalesced_bytes": 50, - "max_concurrency": 10, - } - # A single 200-byte range, larger than the cap. Should still be fetched. - ranges: list[ByteRequest | None] = [RangeByteRequest(0, 200)] - groups = await _collect(coalesced_get(fetch, ranges, options=opts)) - assert len(groups) == 1 - assert len(groups[0]) == 1 - idx, buf = groups[0][0] - assert idx == 0 - assert buf is not None - assert buf.to_bytes() == b"x" * 200 +# --------------------------------------------------------------------------- +# Property-style coverage invariant. +# --------------------------------------------------------------------------- async def test_coverage_invariant_random_inputs() -> None: + """For any random RangeByteRequest input, every input index appears exactly once.""" import random rng = random.Random(42) - blob = bytes(i % 256 for i in range(10_000)) - fetch = FakeFetch(blob) + fetch = FakeFetch(_INDEXED_BLOB) - # Generate 50 random RangeByteRequests within the blob. ranges: list[ByteRequest | None] = [] for _ in range(50): start = rng.randint(0, 9000) @@ -468,13 +507,10 @@ async def test_coverage_invariant_random_inputs() -> None: ranges.append(RangeByteRequest(start, start + length)) groups = await _collect(coalesced_get(fetch, ranges, options=DEFAULT_COALESCE_OPTIONS)) - seen: list[int] = [] - for group in groups: - for idx, _buf in group: - seen.append(idx) + seen: list[int] = [idx for group in groups for idx, _buf in group] assert sorted(seen) == list(range(len(ranges))) - # And the bytes are correct. + flat = _contents(groups) for i, r in enumerate(ranges): assert isinstance(r, RangeByteRequest) - assert flat[i] == blob[r.start : r.end] + assert flat[i] == _INDEXED_BLOB[r.start : r.end] From 8754f854976a2fcfa9034bcc511bfc96cde5735f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 24 Apr 2026 21:20:29 +0200 Subject: [PATCH 25/55] test: skip fsspec-backed get_ranges tests on old fsspec The min_deps CI job pins fsspec to 2023.10.0, which predates AsyncFileSystemWrapper. Wrapping a sync MemoryFileSystem fails there at fixture setup. Guard the affected tests with the same skipif pattern already used in test_fsspec.py. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_store/test_fsspec_get_ranges.py | 8 ++++++++ tests/test_store/test_protocols.py | 14 ++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/test_store/test_fsspec_get_ranges.py b/tests/test_store/test_fsspec_get_ranges.py index b30868a78b..a659540182 100644 --- a/tests/test_store/test_fsspec_get_ranges.py +++ b/tests/test_store/test_fsspec_get_ranges.py @@ -8,6 +8,7 @@ from __future__ import annotations import pytest +from packaging.version import parse as parse_version from zarr.abc.store import RangeByteRequest from zarr.core._coalesce import DEFAULT_COALESCE_OPTIONS, CoalesceOptions @@ -17,6 +18,13 @@ fsspec = pytest.importorskip("fsspec") +# AsyncFileSystemWrapper (needed to wrap a sync MemoryFileSystem) landed in fsspec 2024.12.0. +# Older versions are pinned by the min-deps CI job, so skip the whole file there. +pytestmark = pytest.mark.skipif( + parse_version(fsspec.__version__) < parse_version("2024.12.0"), + reason="No AsyncFileSystemWrapper", +) + @pytest.fixture def memory_store() -> FsspecStore: diff --git a/tests/test_store/test_protocols.py b/tests/test_store/test_protocols.py index c47cf928c3..9dae19ffba 100644 --- a/tests/test_store/test_protocols.py +++ b/tests/test_store/test_protocols.py @@ -7,9 +7,20 @@ from zarr.storage._protocols import SupportsGetRanges +fsspec = pytest.importorskip("fsspec") +from packaging.version import parse as parse_version # noqa: E402 + +# AsyncFileSystemWrapper (needed to wrap a sync MemoryFileSystem) landed in fsspec 2024.12.0. +# Older versions are pinned by the min-deps CI job. +_needs_async_wrapper = pytest.mark.skipif( + parse_version(fsspec.__version__) < parse_version("2024.12.0"), + reason="No AsyncFileSystemWrapper", +) + + +@_needs_async_wrapper def test_fsspec_store_satisfies_supports_get_ranges() -> None: - pytest.importorskip("fsspec") from fsspec.implementations.memory import MemoryFileSystem from zarr.storage import FsspecStore @@ -34,5 +45,4 @@ def test_type_assignment_at_module_level() -> None: If this runs without error the module imported cleanly; the static check is in mypy. """ - pytest.importorskip("fsspec") from zarr.storage import _fsspec # noqa: F401 From f158af3144d6235b223f8a9015e8c73040f494e7 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 7 May 2026 15:30:12 -0400 Subject: [PATCH 26/55] Update src/zarr/core/_coalesce.py Co-authored-by: Ilan Gold --- src/zarr/core/_coalesce.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 40fe14bf04..35132678e7 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -91,22 +91,22 @@ async def coalesced_get( # Sort mergeables by start offset, then merge. mergeable.sort(key=lambda pair: pair[1].start) - groups: list[list[tuple[int, RangeByteRequest]]] = [] - for pair in mergeable: + iter_mergeable = iter(mergeable) + groups: list[list[tuple[int, RangeByteRequest]]] = [[next(iter_mergeable)]] + for pair in iter_mergeable: _i, r = pair - if groups: - last = groups[-1] - last_end = max(x[1].end for x in last) - gap = r.start - last_end - merged_start = min(x[1].start for x in last) - prospective_end = max(last_end, r.end) - prospective_size = prospective_end - merged_start - if ( - gap <= options["max_gap_bytes"] - and prospective_size <= options["max_coalesced_bytes"] - ): - last.append(pair) - continue + last = groups[-1] + last_end = max(x[1].end for x in last) + gap = r.start - last_end + merged_start = min(x[1].start for x in last) + prospective_end = max(last_end, r.end) + prospective_size = prospective_end - merged_start + if ( + gap <= options["max_gap_bytes"] + and prospective_size <= options["max_coalesced_bytes"] + ): + last.append(pair) + continue groups.append([pair]) # Build a uniform list of work items. Each work item is a list of From ef65b65f4731755c5d39947a5ce59f362c25ba5e Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 7 May 2026 15:30:29 -0400 Subject: [PATCH 27/55] Update src/zarr/core/_coalesce.py Co-authored-by: Ilan Gold --- src/zarr/core/_coalesce.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 35132678e7..86a556514a 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -148,9 +148,10 @@ async def run_one(members: list[tuple[int, ByteRequest | None]]) -> None: await completion_queue.put(("missing", None)) return ordered = sorted(members, key=lambda pair: pair[1].start) # type: ignore[union-attr] - sliced: list[tuple[int, Buffer | None]] = [] - for idx, r in ordered: - sliced.append((idx, big[r.start - group_start : r.end - group_start])) # type: ignore[union-attr] + sliced: list[tuple[int, Buffer | None]] = [ + (idx, big[r.start - group_start : r.end - group_start]) + for idx, r in ordered + ] await completion_queue.put(("ok", tuple(sliced))) except asyncio.CancelledError: # Cancellation is expected when we stop scheduling on a missing key. From 3e1f22c2e8163195e1f5bf965044bb396b9f7c6c Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 7 May 2026 15:30:53 -0400 Subject: [PATCH 28/55] Update src/zarr/core/_coalesce.py Co-authored-by: Ilan Gold --- src/zarr/core/_coalesce.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 86a556514a..c1c8cf603c 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -112,9 +112,7 @@ async def coalesced_get( # Build a uniform list of work items. Each work item is a list of # (input_index, ByteRequest | None) pairs. Merged groups have multiple # members (all RangeByteRequest); uncoalescable items have a single member. - work_items: list[list[tuple[int, ByteRequest | None]]] = [ - [(idx, r) for idx, r in g] for g in groups - ] + work_items: list[list[tuple[int, ByteRequest | None]]] = groups work_items.extend([(idx, single)] for idx, single in uncoalescable) # Completion queue entries are either ("ok", payload), ("missing", None), From 2d556d4a11bd22aa280aa1cb0b80343789dbac55 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 7 May 2026 15:46:45 -0400 Subject: [PATCH 29/55] chore: lint --- src/zarr/core/_coalesce.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index c1c8cf603c..d3c9dc948c 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -101,10 +101,7 @@ async def coalesced_get( merged_start = min(x[1].start for x in last) prospective_end = max(last_end, r.end) prospective_size = prospective_end - merged_start - if ( - gap <= options["max_gap_bytes"] - and prospective_size <= options["max_coalesced_bytes"] - ): + if gap <= options["max_gap_bytes"] and prospective_size <= options["max_coalesced_bytes"]: last.append(pair) continue groups.append([pair]) @@ -112,7 +109,7 @@ async def coalesced_get( # Build a uniform list of work items. Each work item is a list of # (input_index, ByteRequest | None) pairs. Merged groups have multiple # members (all RangeByteRequest); uncoalescable items have a single member. - work_items: list[list[tuple[int, ByteRequest | None]]] = groups + work_items: list[list[tuple[int, ByteRequest | None]]] = groups # type: ignore[assignment] work_items.extend([(idx, single)] for idx, single in uncoalescable) # Completion queue entries are either ("ok", payload), ("missing", None), @@ -137,8 +134,8 @@ async def run_one(members: list[tuple[int, ByteRequest | None]]) -> None: return # Merged group path: all members are RangeByteRequest. assert all(isinstance(r, RangeByteRequest) for _, r in members) - starts = [r.start for _, r in members] # type: ignore[union-attr] - ends = [r.end for _, r in members] # type: ignore[union-attr] + starts: list[int] = [r.start for _, r in members] # type: ignore[union-attr] + ends: list[int] = [r.end for _, r in members] # type: ignore[union-attr] group_start = min(starts) group_end = max(ends) big = await fetch(RangeByteRequest(group_start, group_end)) @@ -147,7 +144,7 @@ async def run_one(members: list[tuple[int, ByteRequest | None]]) -> None: return ordered = sorted(members, key=lambda pair: pair[1].start) # type: ignore[union-attr] sliced: list[tuple[int, Buffer | None]] = [ - (idx, big[r.start - group_start : r.end - group_start]) + (idx, big[r.start - group_start : r.end - group_start]) # type: ignore[union-attr] for idx, r in ordered ] await completion_queue.put(("ok", tuple(sliced))) From 0e5b2c5cc38837fe66c4940c79e2ef445f73f323 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 7 May 2026 16:07:19 -0400 Subject: [PATCH 30/55] refactor: better function design with explicit context --- src/zarr/core/_coalesce.py | 167 ++++++++++++++++++++----------------- 1 file changed, 92 insertions(+), 75 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index d3c9dc948c..1f11ae3a47 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -2,14 +2,72 @@ from __future__ import annotations import asyncio -from typing import TYPE_CHECKING, TypedDict +from typing import TYPE_CHECKING, Literal, NamedTuple, TypedDict if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence - from zarr.abc.store import ByteRequest + from zarr.abc.store import ByteRequest, RangeByteRequest from zarr.core.buffer import Buffer + _CompletionEntry = ( + tuple[Literal["ok"], Sequence[tuple[int, Buffer | None]]] + | tuple[Literal["missing"], None] + | tuple[Literal["error"], BaseException] + ) + + +class _WorkerCtx(NamedTuple): + """Shared state passed to the per-task worker coroutines. + + Bundling these lets the workers declare their dependencies as one + parameter instead of capturing them implicitly via closure. + """ + + fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]] + semaphore: asyncio.Semaphore + queue: asyncio.Queue[_CompletionEntry] + + +async def _fetch_single(ctx: _WorkerCtx, idx: int, req: ByteRequest | None) -> None: + try: + async with ctx.semaphore: + buf = await ctx.fetch(req) + if buf is None: + await ctx.queue.put(("missing", None)) + return + await ctx.queue.put(("ok", ((idx, buf),))) + except asyncio.CancelledError: + raise + except Exception as exc: + await ctx.queue.put(("error", exc)) + + +async def _fetch_group(ctx: _WorkerCtx, members: list[tuple[int, RangeByteRequest]]) -> None: + """Fetch one merged byte range and slice it back into per-input buffers. + + ``members`` must already be sorted by ``start``; callers in this module + build it from the sorted mergeable list. + """ + from zarr.abc.store import RangeByteRequest + + try: + start = members[0][1].start + end = max(r.end for _, r in members) + async with ctx.semaphore: + big = await ctx.fetch(RangeByteRequest(start, end)) + if big is None: + await ctx.queue.put(("missing", None)) + return + sliced: list[tuple[int, Buffer | None]] = [ + (idx, big[r.start - start : r.end - start]) for idx, r in members + ] + await ctx.queue.put(("ok", tuple(sliced))) + except asyncio.CancelledError: + raise + except Exception as exc: + await ctx.queue.put(("error", exc)) + class CoalesceOptions(TypedDict): """Knobs for coalescing contiguous byte ranges into fewer I/O requests. @@ -89,84 +147,44 @@ async def coalesced_get( (i, r) for i, r in indexed if not isinstance(r, RangeByteRequest) ] - # Sort mergeables by start offset, then merge. + # Sort mergeables by start offset, then merge. Track running start/end of the + # current group so each merge step is O(1) instead of O(group size). mergeable.sort(key=lambda pair: pair[1].start) - iter_mergeable = iter(mergeable) - groups: list[list[tuple[int, RangeByteRequest]]] = [[next(iter_mergeable)]] - for pair in iter_mergeable: + groups: list[list[tuple[int, RangeByteRequest]]] = [] + group_start = 0 + group_end = 0 + for pair in mergeable: _i, r = pair - last = groups[-1] - last_end = max(x[1].end for x in last) - gap = r.start - last_end - merged_start = min(x[1].start for x in last) - prospective_end = max(last_end, r.end) - prospective_size = prospective_end - merged_start - if gap <= options["max_gap_bytes"] and prospective_size <= options["max_coalesced_bytes"]: - last.append(pair) - continue + if groups and r.start - group_end <= options["max_gap_bytes"]: + prospective_end = max(group_end, r.end) + if prospective_end - group_start <= options["max_coalesced_bytes"]: + groups[-1].append(pair) + group_end = prospective_end + continue groups.append([pair]) + group_start = r.start + group_end = r.end + + ctx = _WorkerCtx( + fetch=fetch, + semaphore=asyncio.Semaphore(options["max_concurrency"]), + queue=asyncio.Queue(), + ) - # Build a uniform list of work items. Each work item is a list of - # (input_index, ByteRequest | None) pairs. Merged groups have multiple - # members (all RangeByteRequest); uncoalescable items have a single member. - work_items: list[list[tuple[int, ByteRequest | None]]] = groups # type: ignore[assignment] - work_items.extend([(idx, single)] for idx, single in uncoalescable) - - # Completion queue entries are either ("ok", payload), ("missing", None), - # or ("error", exception). Kept as Any internally to avoid dragging - # Sequence out of TYPE_CHECKING. - completion_queue: asyncio.Queue[ - tuple[str, Sequence[tuple[int, Buffer | None]] | BaseException | None] - ] = asyncio.Queue() - semaphore = asyncio.Semaphore(options["max_concurrency"]) - - async def run_one(members: list[tuple[int, ByteRequest | None]]) -> None: - try: - async with semaphore: - if len(members) == 1 and not isinstance(members[0][1], RangeByteRequest): - # Uncoalescable single fetch. - idx, single = members[0] - buf = await fetch(single) - if buf is None: - await completion_queue.put(("missing", None)) - return - await completion_queue.put(("ok", ((idx, buf),))) - return - # Merged group path: all members are RangeByteRequest. - assert all(isinstance(r, RangeByteRequest) for _, r in members) - starts: list[int] = [r.start for _, r in members] # type: ignore[union-attr] - ends: list[int] = [r.end for _, r in members] # type: ignore[union-attr] - group_start = min(starts) - group_end = max(ends) - big = await fetch(RangeByteRequest(group_start, group_end)) - if big is None: - await completion_queue.put(("missing", None)) - return - ordered = sorted(members, key=lambda pair: pair[1].start) # type: ignore[union-attr] - sliced: list[tuple[int, Buffer | None]] = [ - (idx, big[r.start - group_start : r.end - group_start]) # type: ignore[union-attr] - for idx, r in ordered - ] - await completion_queue.put(("ok", tuple(sliced))) - except asyncio.CancelledError: - # Cancellation is expected when we stop scheduling on a missing key. - raise - except BaseException as exc: - await completion_queue.put(("error", exc)) - - # Launch all work items as tasks. The semaphore bounds actual concurrency. + # Launch all work as tasks. The semaphore bounds actual I/O concurrency. tasks: set[asyncio.Task[None]] = set() - for item in work_items: - tasks.add(asyncio.create_task(run_one(item))) + for group in groups: + tasks.add(asyncio.create_task(_fetch_group(ctx, group))) + for idx, single in uncoalescable: + tasks.add(asyncio.create_task(_fetch_single(ctx, idx, single))) + total_work = len(tasks) try: pending_error: BaseException | None = None - for _ in range(len(work_items)): - kind, payload = await completion_queue.get() - if kind == "ok": - assert payload is not None - assert not isinstance(payload, BaseException) - yield payload + for _ in range(total_work): + entry = await ctx.queue.get() + if entry[0] == "ok": + yield entry[1] continue # "missing" or "error": stop scheduling and cancel pending work. # Late arrivals that raced to enqueue before cancellation took @@ -175,9 +193,8 @@ async def run_one(members: list[tuple[int, ByteRequest | None]]) -> None: for t in tasks: if not t.done(): t.cancel() - if kind == "error": - assert isinstance(payload, BaseException) - pending_error = payload + if entry[0] == "error": + pending_error = entry[1] break if pending_error is not None: raise pending_error From 17f8d64d475b4eb9e4e1b4ff06ffb3faecc324d8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 8 May 2026 13:47:29 -0400 Subject: [PATCH 31/55] refactor(coalesce): split coalescing from coalesced get --- src/zarr/core/_coalesce.py | 111 ++++++++++++++++++--------- src/zarr/storage/_fsspec.py | 6 +- src/zarr/storage/_protocols.py | 4 +- tests/test_coalesce.py | 132 +++++++++++++++++++++++++++++++++ 4 files changed, 214 insertions(+), 39 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 1f11ae3a47..3b0ffa0f01 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -46,7 +46,7 @@ async def _fetch_single(ctx: _WorkerCtx, idx: int, req: ByteRequest | None) -> N async def _fetch_group(ctx: _WorkerCtx, members: list[tuple[int, RangeByteRequest]]) -> None: """Fetch one merged byte range and slice it back into per-input buffers. - ``members`` must already be sorted by ``start``; callers in this module + `members` must already be sorted by `start`; callers in this module build it from the sorted mergeable list. """ from zarr.abc.store import RangeByteRequest @@ -90,56 +90,50 @@ class CoalesceOptions(TypedDict): } -async def coalesced_get( - fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]], +def coalesce_ranges( byte_ranges: Iterable[ByteRequest | None], *, options: CoalesceOptions, -) -> AsyncGenerator[Sequence[tuple[int, Buffer | None]], None]: - """Read many byte ranges through ``fetch`` with coalescing and concurrency. +) -> tuple[ + list[list[tuple[int, RangeByteRequest]]], + list[tuple[int, ByteRequest | None]], +]: + """Plan a set of byte-range fetches: which inputs merge, which stand alone. - Nearby ranges are merged into a single underlying I/O (subject to - ``options``), and merged fetches are run concurrently. Each yield - corresponds to exactly one underlying I/O operation: a sequence of - ``(input_index, result)`` tuples for all input ranges served by that I/O. - Tuples within a yielded sequence are ordered by start offset. Yields across - groups are in completion order, not input order. + Pure (no I/O). The result is the I/O plan a caller would execute: each + group corresponds to one fetch of a coalesced byte range, and each + uncoalescable item corresponds to one fetch of the original request. Parameters ---------- - fetch - Callable that reads one byte range and returns a ``Buffer`` (or ``None`` - if the underlying key does not exist). Typically constructed via - ``functools.partial(store.get, key, prototype)``. byte_ranges - Input ranges. ``None`` means "the whole value". + Input ranges. `None` means "the whole value". options - Coalescing knobs. - - Yields - ------ - Sequence[tuple[int, Buffer | None]] - Per-I/O batch of ``(input_index, result)`` tuples. + Coalescing knobs (see `CoalesceOptions`). + + Returns + ------- + groups + List of merged groups. Each group is a list of + `(input_index, RangeByteRequest)` pairs sorted by `start`. A + single-element group represents a `RangeByteRequest` that did not + merge with any neighbor. + uncoalescable + List of `(input_index, request)` pairs for inputs that are not + `RangeByteRequest` (`OffsetByteRequest`, `SuffixByteRequest`, + `None`). Indices are preserved from the input order. Notes ----- - - Only ``RangeByteRequest`` inputs are coalesced. ``OffsetByteRequest``, - ``SuffixByteRequest``, and ``None`` are each treated as uncoalescable - (one fetch, one single-tuple yield per input). - - If any fetch returns ``None`` the iterator stops scheduling further fetches - and completes without yielding the missing group. Groups completed before - the miss remain observable. - - If a fetch raises, the exception propagates on the yield that produced the - failing group; earlier-completed groups remain observable. + Only `RangeByteRequest` inputs participate in coalescing. Two ranges + merge when both: their gap (next `start` minus current group's running + `end`) is `<= options["max_gap_bytes"]`, and the resulting merged + span is `<= options["max_coalesced_bytes"]`. """ # Local import to avoid cycles at module import time. from zarr.abc.store import RangeByteRequest indexed: list[tuple[int, ByteRequest | None]] = list(enumerate(byte_ranges)) - if not indexed: - return - - # Split inputs into coalescable (RangeByteRequest only) and uncoalescable (the rest). mergeable: list[tuple[int, RangeByteRequest]] = [ (i, r) for i, r in indexed if isinstance(r, RangeByteRequest) ] @@ -165,6 +159,55 @@ async def coalesced_get( group_start = r.start group_end = r.end + return groups, uncoalescable + + +async def coalesced_get( + fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]], + byte_ranges: Iterable[ByteRequest | None], + *, + options: CoalesceOptions, +) -> AsyncGenerator[Sequence[tuple[int, Buffer | None]], None]: + """Read many byte ranges through `fetch` with coalescing and concurrency. + + Nearby ranges are merged into a single underlying I/O (subject to + `options`), and merged fetches are run concurrently. Each yield + corresponds to exactly one underlying I/O operation: a sequence of + `(input_index, result)` tuples for all input ranges served by that I/O. + Tuples within a yielded sequence are ordered by start offset. Yields across + groups are in completion order, not input order. + + Parameters + ---------- + fetch + Callable that reads one byte range and returns a `Buffer` (or `None` + if the underlying key does not exist). Typically constructed via + `functools.partial(store.get, key, prototype)`. + byte_ranges + Input ranges. `None` means "the whole value". + options + Coalescing knobs. + + Yields + ------ + Sequence[tuple[int, Buffer | None]] + Per-I/O batch of `(input_index, result)` tuples. + + Notes + ----- + - Only `RangeByteRequest` inputs are coalesced. `OffsetByteRequest`, + `SuffixByteRequest`, and `None` are each treated as uncoalescable + (one fetch, one single-tuple yield per input). + - If any fetch returns `None` the iterator stops scheduling further fetches + and completes without yielding the missing group. Groups completed before + the miss remain observable. + - If a fetch raises, the exception propagates on the yield that produced the + failing group; earlier-completed groups remain observable. + """ + groups, uncoalescable = coalesce_ranges(byte_ranges, options=options) + if not groups and not uncoalescable: + return + ctx = _WorkerCtx( fetch=fetch, semaphore=asyncio.Semaphore(options["max_concurrency"]), diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index 11e9af79e2..41591cadc2 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -332,10 +332,10 @@ async def get_ranges( *, prototype: BufferPrototype, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: - """Read many byte ranges from ``key``, coalescing nearby ranges and fetching concurrently. + """Read many byte ranges from `key`, coalescing nearby ranges and fetching concurrently. - See :class:`zarr.storage._protocols.SupportsGetRanges` for the contract and - :func:`zarr.core._coalesce.coalesced_get` for the full semantics. + See `zarr.storage._protocols.SupportsGetRanges` for the contract and + `zarr.core._coalesce.coalesced_get` for the full semantics. """ fetch = partial(self.get, key, prototype) async for group in coalesced_get(fetch, byte_ranges, options=self.coalesce_options): diff --git a/src/zarr/storage/_protocols.py b/src/zarr/storage/_protocols.py index 086a15dd90..579ff7936d 100644 --- a/src/zarr/storage/_protocols.py +++ b/src/zarr/storage/_protocols.py @@ -25,10 +25,10 @@ def get_ranges( *, prototype: BufferPrototype, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: - """Read many byte ranges from ``key``. + """Read many byte ranges from `key`. Each yield corresponds to one underlying I/O operation. - See :func:`zarr.core._coalesce.coalesced_get` for full semantics. + See `zarr.core._coalesce.coalesced_get` for full semantics. """ ... diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 68a620ea5d..2b8a5a6163 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -16,6 +16,7 @@ from zarr.core._coalesce import ( DEFAULT_COALESCE_OPTIONS, CoalesceOptions, + coalesce_ranges, coalesced_get, ) from zarr.core.buffer import Buffer, default_buffer_prototype @@ -514,3 +515,134 @@ async def test_coverage_invariant_random_inputs() -> None: for i, r in enumerate(ranges): assert isinstance(r, RangeByteRequest) assert flat[i] == _INDEXED_BLOB[r.start : r.end] + + +# --------------------------------------------------------------------------- +# Pure-function tests for coalesce_ranges (no async, no fetch). +# --------------------------------------------------------------------------- + + +def test_coalesce_ranges_empty_input() -> None: + groups, uncoalescable = coalesce_ranges([], options=DEFAULT_COALESCE_OPTIONS) + assert groups == [] + assert uncoalescable == [] + + +def test_coalesce_ranges_separates_coalescable_from_uncoalescable() -> None: + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 10), + OffsetByteRequest(100), + SuffixByteRequest(5), + None, + RangeByteRequest(20, 30), + ] + groups, uncoalescable = coalesce_ranges(ranges, options=MERGE_GAP_50) + + # Both range requests fall within MERGE_GAP_50's gap budget. + assert len(groups) == 1 + assert [idx for idx, _ in groups[0]] == [0, 4] + + # Non-RangeByteRequest entries preserve their original input indices. + assert [(idx, type(req).__name__ if req else None) for idx, req in uncoalescable] == [ + (1, "OffsetByteRequest"), + (2, "SuffixByteRequest"), + (3, None), + ] + + +def test_coalesce_ranges_no_merge_when_gap_exceeds_budget() -> None: + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 10), + RangeByteRequest(200, 210), + RangeByteRequest(500, 510), + ] + groups, uncoalescable = coalesce_ranges(ranges, options=MERGE_GAP_50) + assert uncoalescable == [] + assert [len(g) for g in groups] == [1, 1, 1] + assert [idx for g in groups for idx, _ in g] == [0, 1, 2] + + +def test_coalesce_ranges_merges_within_gap_budget() -> None: + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 5), + RangeByteRequest(10, 15), + RangeByteRequest(20, 25), + ] + groups, _ = coalesce_ranges(ranges, options=MERGE_GAP_50) + assert len(groups) == 1 + assert [idx for idx, _ in groups[0]] == [0, 1, 2] + + +def test_coalesce_ranges_respects_max_coalesced_bytes() -> None: + # Gap budget is permissive (1000), but the merged span would exceed CAP_50's + # 50-byte cap, so the second range starts a new group. + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 30), + RangeByteRequest(40, 80), + ] + groups, _ = coalesce_ranges(ranges, options=CAP_50) + assert [len(g) for g in groups] == [1, 1] + + +def test_coalesce_ranges_groups_are_sorted_by_start() -> None: + """Input order is irrelevant; groups always emerge in start-offset order.""" + ranges: list[ByteRequest | None] = [ + RangeByteRequest(500, 510), + RangeByteRequest(0, 10), + RangeByteRequest(20, 30), + RangeByteRequest(200, 210), + ] + groups, _ = coalesce_ranges(ranges, options=MERGE_GAP_50) + # First group is the {0-10, 20-30} cluster (from input indices 1, 2). + # Then the {200-210} singleton, then {500-510}. + flat = [idx for g in groups for idx, _ in g] + assert flat == [1, 2, 3, 0] + # Within each group, members are sorted by start. + for g in groups: + starts = [r.start for _, r in g] + assert starts == sorted(starts) + + +def test_coalesce_ranges_overlapping_ranges_merge() -> None: + """Nested/overlapping ranges have a non-positive 'gap' and always merge.""" + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 100), + RangeByteRequest(50, 60), # nested + RangeByteRequest(80, 120), # overlaps + ] + groups, _ = coalesce_ranges(ranges, options=MERGE_GAP_50) + assert len(groups) == 1 + assert [idx for idx, _ in groups[0]] == [0, 1, 2] + + +def test_coalesce_ranges_running_end_handles_nesting() -> None: + """A subsequent range fully inside the running span must not extend group_end backwards.""" + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 1000), # group_end=1000 + RangeByteRequest(100, 200), # nested; group_end stays at 1000 + RangeByteRequest(990, 1010), # gap = -10 from running end, still merges + ] + groups, _ = coalesce_ranges(ranges, options=MERGE_GAP_50) + assert len(groups) == 1 + assert {idx for idx, _ in groups[0]} == {0, 1, 2} + + +def test_coalesce_ranges_only_uncoalescable_inputs() -> None: + ranges: list[ByteRequest | None] = [None, OffsetByteRequest(10), SuffixByteRequest(5)] + groups, uncoalescable = coalesce_ranges(ranges, options=DEFAULT_COALESCE_OPTIONS) + assert groups == [] + assert [idx for idx, _ in uncoalescable] == [0, 1, 2] + + +def test_coalesce_ranges_total_index_coverage() -> None: + """Every input index appears exactly once across groups + uncoalescable.""" + ranges: list[ByteRequest | None] = [ + RangeByteRequest(0, 10), + None, + RangeByteRequest(15, 25), + OffsetByteRequest(100), + RangeByteRequest(30, 40), + ] + groups, uncoalescable = coalesce_ranges(ranges, options=MERGE_GAP_50) + seen = sorted([idx for g in groups for idx, _ in g] + [idx for idx, _ in uncoalescable]) + assert seen == list(range(len(ranges))) From 5eb25bc541156e17c8253e7e4d4bbd3d3ca24cd3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 8 May 2026 13:54:07 -0400 Subject: [PATCH 32/55] refactor(coalesce): use sequence instead of iterable --- src/zarr/core/_coalesce.py | 6 +++--- src/zarr/storage/_fsspec.py | 2 +- src/zarr/storage/_protocols.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 3b0ffa0f01..ba2043a48c 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Literal, NamedTuple, TypedDict if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence + from collections.abc import AsyncGenerator, Awaitable, Callable, Sequence from zarr.abc.store import ByteRequest, RangeByteRequest from zarr.core.buffer import Buffer @@ -91,7 +91,7 @@ class CoalesceOptions(TypedDict): def coalesce_ranges( - byte_ranges: Iterable[ByteRequest | None], + byte_ranges: Sequence[ByteRequest | None], *, options: CoalesceOptions, ) -> tuple[ @@ -164,7 +164,7 @@ def coalesce_ranges( async def coalesced_get( fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]], - byte_ranges: Iterable[ByteRequest | None], + byte_ranges: Sequence[ByteRequest | None], *, options: CoalesceOptions, ) -> AsyncGenerator[Sequence[tuple[int, Buffer | None]], None]: diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index 41591cadc2..dfeec1eb59 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -328,7 +328,7 @@ async def get( async def get_ranges( self, key: str, - byte_ranges: Iterable[ByteRequest | None], + byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: diff --git a/src/zarr/storage/_protocols.py b/src/zarr/storage/_protocols.py index 579ff7936d..29e20aeb23 100644 --- a/src/zarr/storage/_protocols.py +++ b/src/zarr/storage/_protocols.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Protocol, runtime_checkable if TYPE_CHECKING: - from collections.abc import AsyncIterator, Iterable, Sequence + from collections.abc import AsyncIterator, Sequence from zarr.abc.store import ByteRequest from zarr.core.buffer import Buffer, BufferPrototype @@ -21,7 +21,7 @@ class SupportsGetRanges(Protocol): def get_ranges( self, key: str, - byte_ranges: Iterable[ByteRequest | None], + byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: From 391985ae9b640d78f427f954e873a88c18a49ed9 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 11 May 2026 22:26:38 +0200 Subject: [PATCH 33/55] Update src/zarr/storage/_fsspec.py Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com> --- src/zarr/storage/_fsspec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index dfeec1eb59..251ff61a0d 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -132,7 +132,7 @@ def __init__( path: str = "/", allowed_exceptions: tuple[type[Exception], ...] = ALLOWED_EXCEPTIONS, *, - coalesce_options: CoalesceOptions = DEFAULT_COALESCE_OPTIONS, + coalesce_options: CoalesceOptions | None = None, ) -> None: super().__init__(read_only=read_only) self.fs = fs From 5731be6b11aaabb849c6758e9852ce365d1448f0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 May 2026 14:13:32 +0200 Subject: [PATCH 34/55] refactor: banish typeddict, we are fine with kwargs --- changes/0000.feature.md | 1 - changes/3925.feature.md | 1 + src/zarr/core/_coalesce.py | 93 +++++++++++++--------- src/zarr/storage/_fsspec.py | 17 ++-- src/zarr/storage/_protocols.py | 11 ++- tests/test_coalesce.py | 77 +++++++++--------- tests/test_store/test_fsspec_get_ranges.py | 39 +++++---- 7 files changed, 129 insertions(+), 110 deletions(-) delete mode 100644 changes/0000.feature.md create mode 100644 changes/3925.feature.md diff --git a/changes/0000.feature.md b/changes/0000.feature.md deleted file mode 100644 index ce26a8c209..0000000000 --- a/changes/0000.feature.md +++ /dev/null @@ -1 +0,0 @@ -Add `zarr.storage.FsspecStore.get_ranges` for concurrent, coalesced multi-range reads from a single key. A new keyword-only constructor argument `coalesce_options` on `FsspecStore` controls the max gap, max coalesced size, and max concurrency of the underlying requests. diff --git a/changes/3925.feature.md b/changes/3925.feature.md new file mode 100644 index 0000000000..f3ad5080d8 --- /dev/null +++ b/changes/3925.feature.md @@ -0,0 +1 @@ +Add `zarr.storage.FsspecStore.get_ranges` for concurrent, coalesced multi-range reads from a single key. Coalescing knobs (`max_concurrency`, `max_gap_bytes`, `max_coalesced_bytes`) are passed as keyword arguments to `get_ranges`, not configured on the store. diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index ba2043a48c..4d521cc73f 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -2,7 +2,7 @@ from __future__ import annotations import asyncio -from typing import TYPE_CHECKING, Literal, NamedTuple, TypedDict +from typing import TYPE_CHECKING, Final, Literal, NamedTuple if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Sequence @@ -69,31 +69,16 @@ async def _fetch_group(ctx: _WorkerCtx, members: list[tuple[int, RangeByteReques await ctx.queue.put(("error", exc)) -class CoalesceOptions(TypedDict): - """Knobs for coalescing contiguous byte ranges into fewer I/O requests. - - All fields required. See DEFAULT_COALESCE_OPTIONS for a sensible default. - """ - - max_gap_bytes: int - """Two RangeByteRequests separated by at most this many bytes may be merged into one fetch.""" - max_coalesced_bytes: int - """Upper bound on the size of a single merged fetch (ignored for an already-oversized single request).""" - max_concurrency: int - """Maximum number of merged fetches in flight at once.""" - - -DEFAULT_COALESCE_OPTIONS: CoalesceOptions = { - "max_gap_bytes": 1 << 20, # 1 MiB - "max_coalesced_bytes": 16 << 20, # 16 MiB - "max_concurrency": 10, -} +COALESCE_DEFAULT_MAX_GAP_BYTES: Final = 1 << 20 +COALESCE_DEFAULT_MAX_COALESCED_BYTES: Final = 16 << 20 +COALESCE_DEFAULT_MAX_CONCURRENCY: Final = 10 def coalesce_ranges( byte_ranges: Sequence[ByteRequest | None], *, - options: CoalesceOptions, + max_gap_bytes: int | None = None, + max_coalesced_bytes: int | None = None, ) -> tuple[ list[list[tuple[int, RangeByteRequest]]], list[tuple[int, ByteRequest | None]], @@ -108,8 +93,12 @@ def coalesce_ranges( ---------- byte_ranges Input ranges. `None` means "the whole value". - options - Coalescing knobs (see `CoalesceOptions`). + max_gap_bytes + Two `RangeByteRequest`s separated by at most this many bytes may be + merged into one fetch. Defaults to `COALESCE_DEFAULT_MAX_GAP_BYTES`. + max_coalesced_bytes + Upper bound on the size of a single merged fetch. Defaults to + `COALESCE_DEFAULT_MAX_COALESCED_BYTES`. Returns ------- @@ -127,12 +116,17 @@ def coalesce_ranges( ----- Only `RangeByteRequest` inputs participate in coalescing. Two ranges merge when both: their gap (next `start` minus current group's running - `end`) is `<= options["max_gap_bytes"]`, and the resulting merged - span is `<= options["max_coalesced_bytes"]`. + `end`) is `<= max_gap_bytes`, and the resulting merged span is + `<= max_coalesced_bytes`. """ # Local import to avoid cycles at module import time. from zarr.abc.store import RangeByteRequest + if max_gap_bytes is None: + max_gap_bytes = COALESCE_DEFAULT_MAX_GAP_BYTES + if max_coalesced_bytes is None: + max_coalesced_bytes = COALESCE_DEFAULT_MAX_COALESCED_BYTES + indexed: list[tuple[int, ByteRequest | None]] = list(enumerate(byte_ranges)) mergeable: list[tuple[int, RangeByteRequest]] = [ (i, r) for i, r in indexed if isinstance(r, RangeByteRequest) @@ -149,9 +143,9 @@ def coalesce_ranges( group_end = 0 for pair in mergeable: _i, r = pair - if groups and r.start - group_end <= options["max_gap_bytes"]: + if groups and r.start - group_end <= max_gap_bytes: prospective_end = max(group_end, r.end) - if prospective_end - group_start <= options["max_coalesced_bytes"]: + if prospective_end - group_start <= max_coalesced_bytes: groups[-1].append(pair) group_end = prospective_end continue @@ -166,16 +160,18 @@ async def coalesced_get( fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]], byte_ranges: Sequence[ByteRequest | None], *, - options: CoalesceOptions, + max_concurrency: int | None = None, + max_gap_bytes: int | None = None, + max_coalesced_bytes: int | None = None, ) -> AsyncGenerator[Sequence[tuple[int, Buffer | None]], None]: """Read many byte ranges through `fetch` with coalescing and concurrency. - Nearby ranges are merged into a single underlying I/O (subject to - `options`), and merged fetches are run concurrently. Each yield - corresponds to exactly one underlying I/O operation: a sequence of - `(input_index, result)` tuples for all input ranges served by that I/O. - Tuples within a yielded sequence are ordered by start offset. Yields across - groups are in completion order, not input order. + Nearby ranges are merged into a single underlying I/O, and merged fetches + are run concurrently. Each yield corresponds to exactly one underlying I/O + operation: a sequence of `(input_index, result)` tuples for all input + ranges served by that I/O. Tuples within a yielded sequence are ordered by + start offset. Yields across groups are in completion order, not input + order. Parameters ---------- @@ -185,8 +181,13 @@ async def coalesced_get( `functools.partial(store.get, key, prototype)`. byte_ranges Input ranges. `None` means "the whole value". - options - Coalescing knobs. + max_concurrency + Maximum number of merged fetches in flight at once. Defaults to + `COALESCE_DEFAULT_MAX_CONCURRENCY`. + max_gap_bytes + Forwarded to `coalesce_ranges`. + max_coalesced_bytes + Forwarded to `coalesce_ranges`. Yields ------ @@ -204,20 +205,34 @@ async def coalesced_get( - If a fetch raises, the exception propagates on the yield that produced the failing group; earlier-completed groups remain observable. """ - groups, uncoalescable = coalesce_ranges(byte_ranges, options=options) + if max_concurrency is None: + max_concurrency = COALESCE_DEFAULT_MAX_CONCURRENCY + + groups, uncoalescable = coalesce_ranges( + byte_ranges, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ) if not groups and not uncoalescable: return ctx = _WorkerCtx( fetch=fetch, - semaphore=asyncio.Semaphore(options["max_concurrency"]), + semaphore=asyncio.Semaphore(max_concurrency), queue=asyncio.Queue(), ) # Launch all work as tasks. The semaphore bounds actual I/O concurrency. + # A one-member group is a RangeByteRequest that did not merge with a + # neighbor; route it through _fetch_single so it skips the redundant + # slice-by-zero in _fetch_group. tasks: set[asyncio.Task[None]] = set() for group in groups: - tasks.add(asyncio.create_task(_fetch_group(ctx, group))) + if len(group) == 1: + solo_idx, solo_req = group[0] + tasks.add(asyncio.create_task(_fetch_single(ctx, solo_idx, solo_req))) + else: + tasks.add(asyncio.create_task(_fetch_group(ctx, group))) for idx, single in uncoalescable: tasks.add(asyncio.create_task(_fetch_single(ctx, idx, single))) total_work = len(tasks) diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index 251ff61a0d..60b82ec55d 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -16,8 +16,7 @@ SuffixByteRequest, ) from zarr.core._coalesce import ( - DEFAULT_COALESCE_OPTIONS, - CoalesceOptions, + COALESCE_DEFAULT_MAX_CONCURRENCY, coalesced_get, ) from zarr.core.buffer import Buffer @@ -131,14 +130,11 @@ def __init__( read_only: bool = False, path: str = "/", allowed_exceptions: tuple[type[Exception], ...] = ALLOWED_EXCEPTIONS, - *, - coalesce_options: CoalesceOptions | None = None, ) -> None: super().__init__(read_only=read_only) self.fs = fs self.path = path self.allowed_exceptions = allowed_exceptions - self.coalesce_options = coalesce_options if not self.fs.async_impl: raise TypeError("Filesystem needs to support async operations.") @@ -331,6 +327,9 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, + max_concurrency: int = COALESCE_DEFAULT_MAX_CONCURRENCY, + max_gap_bytes: int | None = None, + max_coalesced_bytes: int | None = None, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges from `key`, coalescing nearby ranges and fetching concurrently. @@ -338,7 +337,13 @@ async def get_ranges( `zarr.core._coalesce.coalesced_get` for the full semantics. """ fetch = partial(self.get, key, prototype) - async for group in coalesced_get(fetch, byte_ranges, options=self.coalesce_options): + async for group in coalesced_get( + fetch, + byte_ranges, + max_concurrency=max_concurrency, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ): yield group async def set( diff --git a/src/zarr/storage/_protocols.py b/src/zarr/storage/_protocols.py index 29e20aeb23..181b0694da 100644 --- a/src/zarr/storage/_protocols.py +++ b/src/zarr/storage/_protocols.py @@ -24,11 +24,16 @@ def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, + max_gap_bytes: int | None = ..., + max_coalesced_bytes: int | None = ..., ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges from `key`. - Each yield corresponds to one underlying I/O operation. - - See `zarr.core._coalesce.coalesced_get` for full semantics. + Each yield corresponds to one underlying I/O operation. The coalescing + knobs `max_gap_bytes` and `max_coalesced_bytes` describe *what counts + as coalesceable* and are part of this contract; implementations may + accept additional kwargs (e.g. concurrency knobs) but should honor + these two consistently. See `zarr.core._coalesce.coalesce_ranges` for + full semantics. """ ... diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 2b8a5a6163..3c722da476 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -14,15 +14,13 @@ SuffixByteRequest, ) from zarr.core._coalesce import ( - DEFAULT_COALESCE_OPTIONS, - CoalesceOptions, coalesce_ranges, coalesced_get, ) from zarr.core.buffer import Buffer, default_buffer_prototype if TYPE_CHECKING: - from collections.abc import AsyncIterator, Callable, Sequence + from collections.abc import AsyncIterator, Callable, Mapping, Sequence def _buf(data: bytes) -> Buffer: @@ -73,30 +71,29 @@ def _contents(groups: list[list[tuple[int, Buffer | None]]]) -> dict[int, bytes] # --------------------------------------------------------------------------- -# Shared option values for parametrized structural tests. +# Shared coalescing-knob bundles. Each is a mapping of kwargs to splat into +# `coalesce_ranges` and `coalesced_get`. Concurrency is orthogonal to merging +# and is set inline by tests that exercise it. # --------------------------------------------------------------------------- -DEFAULT: CoalesceOptions = DEFAULT_COALESCE_OPTIONS -"""The library default; permissive merging.""" +DEFAULT: Mapping[str, int] = {} +"""Empty: rely on the library defaults defined in `_coalesce`.""" -MERGE_GAP_50: CoalesceOptions = { +MERGE_GAP_50: Mapping[str, int] = { "max_gap_bytes": 50, "max_coalesced_bytes": 1 << 20, - "max_concurrency": 10, } """Merge ranges within 50 bytes of each other.""" -NO_MERGE: CoalesceOptions = { +NO_MERGE: Mapping[str, int] = { "max_gap_bytes": -1, "max_coalesced_bytes": 1 << 20, - "max_concurrency": 10, } """No merging: any positive gap is > -1, so no pair ever coalesces.""" -CAP_50: CoalesceOptions = { +CAP_50: Mapping[str, int] = { "max_gap_bytes": 1000, "max_coalesced_bytes": 50, - "max_concurrency": 10, } """Gap permissive but merged size capped at 50 bytes.""" @@ -118,8 +115,8 @@ class StructuralCase: """pytest id for the case.""" ranges: list[ByteRequest | None] """Input to coalesced_get.""" - options: CoalesceOptions - """Coalescing knobs.""" + options: Mapping[str, int] + """Coalescing knobs to splat into `coalesced_get`.""" expected_group_sizes: list[int] """Sorted list of group tuple-counts (order-independent).""" expected_contents: dict[int, bytes] | None = None @@ -252,7 +249,7 @@ class StructuralCase: async def test_coalescing_structure_and_contents(case: StructuralCase) -> None: """Group structure, byte contents, and fetch-call count for the deterministic cases.""" fetch = FakeFetch(_INDEXED_BLOB) - groups = await _collect(coalesced_get(fetch, case.ranges, options=case.options)) + groups = await _collect(coalesced_get(fetch, case.ranges, **case.options)) assert sorted(len(g) for g in groups) == sorted(case.expected_group_sizes) @@ -273,7 +270,7 @@ async def test_within_group_ordering_is_start_offset() -> None: fetch = FakeFetch(_INDEXED_BLOB) # Two ranges that merge; one has a later start but is listed first in input. ranges: list[ByteRequest | None] = [RangeByteRequest(20, 25), RangeByteRequest(0, 5)] - groups = await _collect(coalesced_get(fetch, ranges, options=MERGE_GAP_50)) + groups = await _collect(coalesced_get(fetch, ranges, **MERGE_GAP_50)) assert len(groups) == 1 # Input index 1 (start=0) comes first, then 0 (start=20). assert [idx for idx, _ in groups[0]] == [1, 0] @@ -287,7 +284,7 @@ async def test_adjacent_ranges_fire_single_fetch_spanning_merged_region() -> Non RangeByteRequest(10, 15), RangeByteRequest(20, 25), ] - await _collect(coalesced_get(fetch, ranges, options=MERGE_GAP_50)) + await _collect(coalesced_get(fetch, ranges, **MERGE_GAP_50)) assert len(fetch.calls) == 1 call = fetch.calls[0] assert isinstance(call, RangeByteRequest) @@ -318,12 +315,12 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: return _buf(b"x") ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(10)] - opts: CoalesceOptions = { + opts: Mapping[str, int] = { "max_gap_bytes": 0, # force no merging "max_coalesced_bytes": 1 << 20, "max_concurrency": 3, } - async for _group in coalesced_get(fetch, ranges, options=opts): + async for _group in coalesced_get(fetch, ranges, **opts): pass assert peak <= 3 assert peak >= 2 # must have been some real concurrency @@ -348,14 +345,14 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: completed_calls += 1 return _buf(b"x") - opts: CoalesceOptions = { + opts: Mapping[str, int] = { "max_gap_bytes": -1, # no merging "max_coalesced_bytes": 1 << 20, "max_concurrency": 3, } ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(6)] - agen = coalesced_get(fetch, ranges, options=opts) + agen = coalesced_get(fetch, ranges, **opts) async for _group in agen: break # Explicitly close the generator so its finally block runs (cancelling @@ -375,7 +372,7 @@ async def test_key_missing_from_first_call_yields_nothing() -> None: """If the very first fetch returns None, the iterator yields no groups.""" fetch = FakeFetch(b"x" * 100, key_exists=False) ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(20, 30)] - groups = await _collect(coalesced_get(fetch, ranges, options=DEFAULT_COALESCE_OPTIONS)) + groups = await _collect(coalesced_get(fetch, ranges)) assert groups == [] @@ -389,7 +386,7 @@ async def test_key_missing_on_uncoalescable_input_yields_nothing( ) -> None: """Uncoalescable inputs take a distinct path; key-missing must still short-circuit.""" fetch = FakeFetch(b"x" * 100, key_exists=False) - groups = await _collect(coalesced_get(fetch, [byte_range], options=DEFAULT_COALESCE_OPTIONS)) + groups = await _collect(coalesced_get(fetch, [byte_range])) assert groups == [] @@ -406,13 +403,13 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: return None return _buf(b"ok") - opts: CoalesceOptions = { + opts: Mapping[str, int] = { "max_gap_bytes": -1, "max_coalesced_bytes": 1 << 20, "max_concurrency": 1, # serialize for determinism } ranges: list[ByteRequest | None] = [RangeByteRequest(0, 2), RangeByteRequest(100, 102)] - groups = await _collect(coalesced_get(fetch, ranges, options=opts)) + groups = await _collect(coalesced_get(fetch, ranges, **opts)) assert len(groups) == 1 assert len(groups[0]) == 1 @@ -444,7 +441,7 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: await asyncio.wait_for(late_gate.wait(), timeout=5.0) return _buf(b"ok") - opts: CoalesceOptions = { + opts: Mapping[str, int] = { "max_gap_bytes": -1, "max_coalesced_bytes": 1 << 20, "max_concurrency": 3, @@ -452,7 +449,7 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(7)] groups: list[list[tuple[int, Buffer | None]]] = [] - agen = coalesced_get(fetch, ranges, options=opts) + agen = coalesced_get(fetch, ranges, **opts) try: async for group in agen: groups.append(list(group)) @@ -479,14 +476,14 @@ async def test_fetch_raises_propagates() -> None: _INDEXED_BLOB, raise_on=lambda r: isinstance(r, RangeByteRequest) and r.start >= 100, ) - opts: CoalesceOptions = { + opts: Mapping[str, int] = { "max_gap_bytes": -1, "max_coalesced_bytes": 1 << 20, "max_concurrency": 1, } ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(200, 210)] with pytest.raises(OSError, match="injected"): - await _collect(coalesced_get(fetch, ranges, options=opts)) + await _collect(coalesced_get(fetch, ranges, **opts)) # --------------------------------------------------------------------------- @@ -507,7 +504,7 @@ async def test_coverage_invariant_random_inputs() -> None: length = rng.randint(1, 500) ranges.append(RangeByteRequest(start, start + length)) - groups = await _collect(coalesced_get(fetch, ranges, options=DEFAULT_COALESCE_OPTIONS)) + groups = await _collect(coalesced_get(fetch, ranges)) seen: list[int] = [idx for group in groups for idx, _buf in group] assert sorted(seen) == list(range(len(ranges))) @@ -523,7 +520,7 @@ async def test_coverage_invariant_random_inputs() -> None: def test_coalesce_ranges_empty_input() -> None: - groups, uncoalescable = coalesce_ranges([], options=DEFAULT_COALESCE_OPTIONS) + groups, uncoalescable = coalesce_ranges([]) assert groups == [] assert uncoalescable == [] @@ -536,7 +533,7 @@ def test_coalesce_ranges_separates_coalescable_from_uncoalescable() -> None: None, RangeByteRequest(20, 30), ] - groups, uncoalescable = coalesce_ranges(ranges, options=MERGE_GAP_50) + groups, uncoalescable = coalesce_ranges(ranges, **MERGE_GAP_50) # Both range requests fall within MERGE_GAP_50's gap budget. assert len(groups) == 1 @@ -556,7 +553,7 @@ def test_coalesce_ranges_no_merge_when_gap_exceeds_budget() -> None: RangeByteRequest(200, 210), RangeByteRequest(500, 510), ] - groups, uncoalescable = coalesce_ranges(ranges, options=MERGE_GAP_50) + groups, uncoalescable = coalesce_ranges(ranges, **MERGE_GAP_50) assert uncoalescable == [] assert [len(g) for g in groups] == [1, 1, 1] assert [idx for g in groups for idx, _ in g] == [0, 1, 2] @@ -568,7 +565,7 @@ def test_coalesce_ranges_merges_within_gap_budget() -> None: RangeByteRequest(10, 15), RangeByteRequest(20, 25), ] - groups, _ = coalesce_ranges(ranges, options=MERGE_GAP_50) + groups, _ = coalesce_ranges(ranges, **MERGE_GAP_50) assert len(groups) == 1 assert [idx for idx, _ in groups[0]] == [0, 1, 2] @@ -580,7 +577,7 @@ def test_coalesce_ranges_respects_max_coalesced_bytes() -> None: RangeByteRequest(0, 30), RangeByteRequest(40, 80), ] - groups, _ = coalesce_ranges(ranges, options=CAP_50) + groups, _ = coalesce_ranges(ranges, **CAP_50) assert [len(g) for g in groups] == [1, 1] @@ -592,7 +589,7 @@ def test_coalesce_ranges_groups_are_sorted_by_start() -> None: RangeByteRequest(20, 30), RangeByteRequest(200, 210), ] - groups, _ = coalesce_ranges(ranges, options=MERGE_GAP_50) + groups, _ = coalesce_ranges(ranges, **MERGE_GAP_50) # First group is the {0-10, 20-30} cluster (from input indices 1, 2). # Then the {200-210} singleton, then {500-510}. flat = [idx for g in groups for idx, _ in g] @@ -610,7 +607,7 @@ def test_coalesce_ranges_overlapping_ranges_merge() -> None: RangeByteRequest(50, 60), # nested RangeByteRequest(80, 120), # overlaps ] - groups, _ = coalesce_ranges(ranges, options=MERGE_GAP_50) + groups, _ = coalesce_ranges(ranges, **MERGE_GAP_50) assert len(groups) == 1 assert [idx for idx, _ in groups[0]] == [0, 1, 2] @@ -622,14 +619,14 @@ def test_coalesce_ranges_running_end_handles_nesting() -> None: RangeByteRequest(100, 200), # nested; group_end stays at 1000 RangeByteRequest(990, 1010), # gap = -10 from running end, still merges ] - groups, _ = coalesce_ranges(ranges, options=MERGE_GAP_50) + groups, _ = coalesce_ranges(ranges, **MERGE_GAP_50) assert len(groups) == 1 assert {idx for idx, _ in groups[0]} == {0, 1, 2} def test_coalesce_ranges_only_uncoalescable_inputs() -> None: ranges: list[ByteRequest | None] = [None, OffsetByteRequest(10), SuffixByteRequest(5)] - groups, uncoalescable = coalesce_ranges(ranges, options=DEFAULT_COALESCE_OPTIONS) + groups, uncoalescable = coalesce_ranges(ranges) assert groups == [] assert [idx for idx, _ in uncoalescable] == [0, 1, 2] @@ -643,6 +640,6 @@ def test_coalesce_ranges_total_index_coverage() -> None: OffsetByteRequest(100), RangeByteRequest(30, 40), ] - groups, uncoalescable = coalesce_ranges(ranges, options=MERGE_GAP_50) + groups, uncoalescable = coalesce_ranges(ranges, **MERGE_GAP_50) seen = sorted([idx for g in groups for idx, _ in g] + [idx for idx, _ in uncoalescable]) assert seen == list(range(len(ranges))) diff --git a/tests/test_store/test_fsspec_get_ranges.py b/tests/test_store/test_fsspec_get_ranges.py index a659540182..b2b1a1f56c 100644 --- a/tests/test_store/test_fsspec_get_ranges.py +++ b/tests/test_store/test_fsspec_get_ranges.py @@ -11,7 +11,6 @@ from packaging.version import parse as parse_version from zarr.abc.store import RangeByteRequest -from zarr.core._coalesce import DEFAULT_COALESCE_OPTIONS, CoalesceOptions from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.storage import FsspecStore from zarr.storage._fsspec import _make_async @@ -81,27 +80,25 @@ async def test_get_ranges_missing_key_yields_nothing(memory_store: FsspecStore) assert groups == [] -async def test_default_coalesce_options_on_store_without_arg() -> None: - from fsspec.implementations.memory import MemoryFileSystem - - fs = MemoryFileSystem() - fs.store.clear() - store = FsspecStore(fs=_make_async(fs), path="/x") - assert store.coalesce_options == DEFAULT_COALESCE_OPTIONS - - -async def test_coalesce_options_wired_through() -> None: - from fsspec.implementations.memory import MemoryFileSystem +async def test_get_ranges_forwards_coalescing_kwargs(memory_store: FsspecStore) -> None: + """`max_gap_bytes=-1` forces no merging; we should see three groups for three ranges.""" + blob = bytes(i % 256 for i in range(1024)) + await _write(memory_store, "blob", blob) + proto = default_buffer_prototype() - fs = MemoryFileSystem() - fs.store.clear() - custom: CoalesceOptions = { - "max_gap_bytes": 0, - "max_coalesced_bytes": 1 << 20, - "max_concurrency": 2, - } - store = FsspecStore(fs=_make_async(fs), path="/x", coalesce_options=custom) - assert store.coalesce_options == custom + ranges = [ + RangeByteRequest(0, 10), + RangeByteRequest(11, 20), # adjacent: would merge under defaults + RangeByteRequest(21, 30), + ] + groups: list[list[tuple[int, Buffer | None]]] = [ + list(group) + async for group in memory_store.get_ranges( + "blob", ranges, prototype=proto, max_gap_bytes=-1 + ) + ] + # With merging disabled, every range becomes its own one-tuple group. + assert sorted(len(g) for g in groups) == [1, 1, 1] async def test_get_ranges_mixed_range_types(memory_store: FsspecStore) -> None: From b96dd1477b947dff2f08c424ad251bc8903b8e83 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 May 2026 14:26:32 +0200 Subject: [PATCH 35/55] refactor: apply suggestions from code review --- src/zarr/core/_coalesce.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 4d521cc73f..d1a19515c7 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Final, Literal, NamedTuple if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Awaitable, Callable, Sequence + from collections.abc import AsyncIterator, Awaitable, Callable, Sequence from zarr.abc.store import ByteRequest, RangeByteRequest from zarr.core.buffer import Buffer @@ -163,7 +163,7 @@ async def coalesced_get( max_concurrency: int | None = None, max_gap_bytes: int | None = None, max_coalesced_bytes: int | None = None, -) -> AsyncGenerator[Sequence[tuple[int, Buffer | None]], None]: +) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges through `fetch` with coalescing and concurrency. Nearby ranges are merged into a single underlying I/O, and merged fetches @@ -205,6 +205,9 @@ async def coalesced_get( - If a fetch raises, the exception propagates on the yield that produced the failing group; earlier-completed groups remain observable. """ + if not byte_ranges: + return + if max_concurrency is None: max_concurrency = COALESCE_DEFAULT_MAX_CONCURRENCY @@ -213,8 +216,6 @@ async def coalesced_get( max_gap_bytes=max_gap_bytes, max_coalesced_bytes=max_coalesced_bytes, ) - if not groups and not uncoalescable: - return ctx = _WorkerCtx( fetch=fetch, @@ -238,7 +239,6 @@ async def coalesced_get( total_work = len(tasks) try: - pending_error: BaseException | None = None for _ in range(total_work): entry = await ctx.queue.get() if entry[0] == "ok": @@ -252,10 +252,8 @@ async def coalesced_get( if not t.done(): t.cancel() if entry[0] == "error": - pending_error = entry[1] + raise entry[1] break - if pending_error is not None: - raise pending_error finally: # Best-effort cancellation for in-flight tasks (covers the consumer # break / early-exit case where we did not proactively cancel). From cbb42e99571a1fa11aaedfbf71065fa60169aaac Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 May 2026 14:38:27 +0200 Subject: [PATCH 36/55] refactor: use drop queue in favor of as_completed; abort early with FileNotFoundError when get yields None --- src/zarr/core/_coalesce.py | 114 ++++++++------------- tests/test_coalesce.py | 68 ++++++------ tests/test_store/test_fsspec_get_ranges.py | 13 +-- 3 files changed, 80 insertions(+), 115 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index d1a19515c7..5b46f5cf38 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -2,7 +2,7 @@ from __future__ import annotations import asyncio -from typing import TYPE_CHECKING, Final, Literal, NamedTuple +from typing import TYPE_CHECKING, Final, NamedTuple if TYPE_CHECKING: from collections.abc import AsyncIterator, Awaitable, Callable, Sequence @@ -10,12 +10,6 @@ from zarr.abc.store import ByteRequest, RangeByteRequest from zarr.core.buffer import Buffer - _CompletionEntry = ( - tuple[Literal["ok"], Sequence[tuple[int, Buffer | None]]] - | tuple[Literal["missing"], None] - | tuple[Literal["error"], BaseException] - ) - class _WorkerCtx(NamedTuple): """Shared state passed to the per-task worker coroutines. @@ -26,47 +20,40 @@ class _WorkerCtx(NamedTuple): fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]] semaphore: asyncio.Semaphore - queue: asyncio.Queue[_CompletionEntry] -async def _fetch_single(ctx: _WorkerCtx, idx: int, req: ByteRequest | None) -> None: - try: - async with ctx.semaphore: - buf = await ctx.fetch(req) - if buf is None: - await ctx.queue.put(("missing", None)) - return - await ctx.queue.put(("ok", ((idx, buf),))) - except asyncio.CancelledError: - raise - except Exception as exc: - await ctx.queue.put(("error", exc)) - - -async def _fetch_group(ctx: _WorkerCtx, members: list[tuple[int, RangeByteRequest]]) -> None: +async def _fetch_single( + ctx: _WorkerCtx, idx: int, req: ByteRequest | None +) -> Sequence[tuple[int, Buffer | None]]: + """Fetch one byte range. Raises FileNotFoundError if the key is absent.""" + async with ctx.semaphore: + buf = await ctx.fetch(req) + if buf is None: + raise FileNotFoundError + return ((idx, buf),) + + +async def _fetch_group( + ctx: _WorkerCtx, members: list[tuple[int, RangeByteRequest]] +) -> Sequence[tuple[int, Buffer | None]]: """Fetch one merged byte range and slice it back into per-input buffers. `members` must already be sorted by `start`; callers in this module - build it from the sorted mergeable list. + build it from the sorted mergeable list. Raises `FileNotFoundError` + if the key is absent. """ from zarr.abc.store import RangeByteRequest - try: - start = members[0][1].start - end = max(r.end for _, r in members) - async with ctx.semaphore: - big = await ctx.fetch(RangeByteRequest(start, end)) - if big is None: - await ctx.queue.put(("missing", None)) - return - sliced: list[tuple[int, Buffer | None]] = [ - (idx, big[r.start - start : r.end - start]) for idx, r in members - ] - await ctx.queue.put(("ok", tuple(sliced))) - except asyncio.CancelledError: - raise - except Exception as exc: - await ctx.queue.put(("error", exc)) + start = members[0][1].start + end = max(r.end for _, r in members) + async with ctx.semaphore: + big = await ctx.fetch(RangeByteRequest(start, end)) + if big is None: + raise FileNotFoundError + sliced: list[tuple[int, Buffer | None]] = [ + (idx, big[r.start - start : r.end - start]) for idx, r in members + ] + return tuple(sliced) COALESCE_DEFAULT_MAX_GAP_BYTES: Final = 1 << 20 @@ -199,11 +186,11 @@ async def coalesced_get( - Only `RangeByteRequest` inputs are coalesced. `OffsetByteRequest`, `SuffixByteRequest`, and `None` are each treated as uncoalescable (one fetch, one single-tuple yield per input). - - If any fetch returns `None` the iterator stops scheduling further fetches - and completes without yielding the missing group. Groups completed before - the miss remain observable. - - If a fetch raises, the exception propagates on the yield that produced the - failing group; earlier-completed groups remain observable. + - If any fetch returns `None`, the iterator raises `FileNotFoundError` + after cancelling pending fetches. Groups completed before the miss + remain observable on the yields preceding the raise. + - If a fetch raises, the exception propagates on the yield that produced + the failing group; earlier-completed groups remain observable. """ if not byte_ranges: return @@ -217,46 +204,29 @@ async def coalesced_get( max_coalesced_bytes=max_coalesced_bytes, ) - ctx = _WorkerCtx( - fetch=fetch, - semaphore=asyncio.Semaphore(max_concurrency), - queue=asyncio.Queue(), - ) + ctx = _WorkerCtx(fetch=fetch, semaphore=asyncio.Semaphore(max_concurrency)) # Launch all work as tasks. The semaphore bounds actual I/O concurrency. # A one-member group is a RangeByteRequest that did not merge with a # neighbor; route it through _fetch_single so it skips the redundant # slice-by-zero in _fetch_group. - tasks: set[asyncio.Task[None]] = set() + tasks: list[asyncio.Task[Sequence[tuple[int, Buffer | None]]]] = [] for group in groups: if len(group) == 1: solo_idx, solo_req = group[0] - tasks.add(asyncio.create_task(_fetch_single(ctx, solo_idx, solo_req))) + tasks.append(asyncio.create_task(_fetch_single(ctx, solo_idx, solo_req))) else: - tasks.add(asyncio.create_task(_fetch_group(ctx, group))) + tasks.append(asyncio.create_task(_fetch_group(ctx, group))) for idx, single in uncoalescable: - tasks.add(asyncio.create_task(_fetch_single(ctx, idx, single))) - total_work = len(tasks) + tasks.append(asyncio.create_task(_fetch_single(ctx, idx, single))) try: - for _ in range(total_work): - entry = await ctx.queue.get() - if entry[0] == "ok": - yield entry[1] - continue - # "missing" or "error": stop scheduling and cancel pending work. - # Late arrivals that raced to enqueue before cancellation took - # effect sit in the completion queue and are discarded by the - # finally block (the queue is local and will be garbage-collected). - for t in tasks: - if not t.done(): - t.cancel() - if entry[0] == "error": - raise entry[1] - break + for fut in asyncio.as_completed(tasks): + yield await fut finally: - # Best-effort cancellation for in-flight tasks (covers the consumer - # break / early-exit case where we did not proactively cancel). + # Best-effort cancellation for any task still in flight. Covers the + # consumer-break path as well as exception propagation from `await fut`, + # where the remaining tasks must not be left running. for t in tasks: if not t.done(): t.cancel() diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 3c722da476..4c7c24d643 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -3,7 +3,7 @@ import asyncio from dataclasses import dataclass, field -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, cast import pytest @@ -20,7 +20,7 @@ from zarr.core.buffer import Buffer, default_buffer_prototype if TYPE_CHECKING: - from collections.abc import AsyncIterator, Callable, Mapping, Sequence + from collections.abc import AsyncGenerator, AsyncIterator, Callable, Mapping, Sequence def _buf(data: bytes) -> Buffer: @@ -356,8 +356,10 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: async for _group in agen: break # Explicitly close the generator so its finally block runs (cancelling - # in-flight tasks) before we make assertions. - await agen.aclose() + # in-flight tasks) before we make assertions. The narrow AsyncIterator + # return type does not expose `.aclose()`, but the runtime object is an + # async generator and supports it. + await cast("AsyncGenerator[Any, None]", agen).aclose() assert cancelled_calls >= 1 assert completed_calls >= 1 @@ -368,12 +370,12 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: # --------------------------------------------------------------------------- -async def test_key_missing_from_first_call_yields_nothing() -> None: - """If the very first fetch returns None, the iterator yields no groups.""" +async def test_key_missing_from_first_call_raises() -> None: + """If the very first fetch returns None, the iterator raises FileNotFoundError.""" fetch = FakeFetch(b"x" * 100, key_exists=False) ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(20, 30)] - groups = await _collect(coalesced_get(fetch, ranges)) - assert groups == [] + with pytest.raises(FileNotFoundError): + await _collect(coalesced_get(fetch, ranges)) @pytest.mark.parametrize( @@ -381,17 +383,17 @@ async def test_key_missing_from_first_call_yields_nothing() -> None: [OffsetByteRequest(5), SuffixByteRequest(5), None], ids=["offset", "suffix", "none"], ) -async def test_key_missing_on_uncoalescable_input_yields_nothing( +async def test_key_missing_on_uncoalescable_input_raises( byte_range: ByteRequest | None, ) -> None: - """Uncoalescable inputs take a distinct path; key-missing must still short-circuit.""" + """Uncoalescable inputs take a distinct path; key-missing must still raise.""" fetch = FakeFetch(b"x" * 100, key_exists=False) - groups = await _collect(coalesced_get(fetch, [byte_range])) - assert groups == [] + with pytest.raises(FileNotFoundError): + await _collect(coalesced_get(fetch, [byte_range])) -async def test_key_missing_mid_stream_yields_earlier_groups_only() -> None: - """If a later fetch returns None, earlier-completed groups remain observable.""" +async def test_key_missing_mid_stream_raises_after_earlier_groups() -> None: + """If a later fetch returns None, earlier-completed groups yield before the raise.""" call_count = 0 async def fetch(byte_range: ByteRequest | None) -> Buffer | None: @@ -409,16 +411,17 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: "max_concurrency": 1, # serialize for determinism } ranges: list[ByteRequest | None] = [RangeByteRequest(0, 2), RangeByteRequest(100, 102)] - groups = await _collect(coalesced_get(fetch, ranges, **opts)) - assert len(groups) == 1 - assert len(groups[0]) == 1 + agen = coalesced_get(fetch, ranges, **opts) + first = await anext(agen) + assert len(first) == 1 + with pytest.raises(FileNotFoundError): + await anext(agen) -async def test_key_missing_mid_stream_with_concurrency_drains_late_arrivals() -> None: +async def test_key_missing_mid_stream_with_concurrency_cancels_late_arrivals() -> None: """ - Under max_concurrency > 1, a mid-stream miss should still cause the iterator - to complete cleanly even when unrelated tasks are still in flight and arrive - after the miss has been observed. + Under max_concurrency > 1, a mid-stream miss should raise FileNotFoundError + and cancel still-in-flight unrelated tasks rather than wait for them. """ late_gate = asyncio.Event() miss_fired = asyncio.Event() @@ -435,9 +438,8 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: await asyncio.sleep(0.03) miss_fired.set() return None - # Late arrivals: wait until the miss has been processed, then return - # a buffer so the drain loop sees them post-stop. - await asyncio.wait_for(miss_fired.wait(), timeout=5.0) + # Late arrivals would block on this gate; they should be cancelled + # before they ever return. await asyncio.wait_for(late_gate.wait(), timeout=5.0) return _buf(b"ok") @@ -448,21 +450,17 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: } ranges: list[ByteRequest | None] = [RangeByteRequest(i * 1000, i * 1000 + 1) for i in range(7)] - groups: list[list[tuple[int, Buffer | None]]] = [] agen = coalesced_get(fetch, ranges, **opts) - try: - async for group in agen: - groups.append(list(group)) - late_gate.set() - finally: - late_gate.set() - - assert len(groups) == 1 - assert len(groups[0]) == 1 - idx, buf = groups[0][0] + first = await anext(agen) + assert len(first) == 1 + idx, buf = first[0] assert idx == 0 assert buf is not None + with pytest.raises(FileNotFoundError): + await anext(agen) assert miss_fired.is_set() + # Sanity: late_gate was never set, so the cancellation path is what completed the test. + assert not late_gate.is_set() # --------------------------------------------------------------------------- diff --git a/tests/test_store/test_fsspec_get_ranges.py b/tests/test_store/test_fsspec_get_ranges.py index b2b1a1f56c..5853953c89 100644 --- a/tests/test_store/test_fsspec_get_ranges.py +++ b/tests/test_store/test_fsspec_get_ranges.py @@ -69,15 +69,12 @@ async def test_get_ranges_happy_path(memory_store: FsspecStore) -> None: assert flat[2] == blob[500:520] -async def test_get_ranges_missing_key_yields_nothing(memory_store: FsspecStore) -> None: +async def test_get_ranges_missing_key_raises(memory_store: FsspecStore) -> None: + """A request against a missing key raises FileNotFoundError.""" proto = default_buffer_prototype() - groups: list[list[tuple[int, Buffer | None]]] = [ - list(group) - async for group in memory_store.get_ranges( - "does-not-exist", [RangeByteRequest(0, 10)], prototype=proto - ) - ] - assert groups == [] + agen = memory_store.get_ranges("does-not-exist", [RangeByteRequest(0, 10)], prototype=proto) + with pytest.raises(FileNotFoundError): + await anext(agen) async def test_get_ranges_forwards_coalescing_kwargs(memory_store: FsspecStore) -> None: From 73682060900c235e6e1ccee9973aa2e4f570f116 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 May 2026 18:08:58 +0200 Subject: [PATCH 37/55] refactor: define get_ranges on the store abc --- src/zarr/abc/store.py | 56 ++++++++++- src/zarr/storage/_fsspec.py | 38 +------ src/zarr/storage/_protocols.py | 39 -------- src/zarr/storage/_wrapper.py | 22 +++- tests/test_store/test_protocols.py | 155 +++++++++++++++++++++++------ 5 files changed, 201 insertions(+), 109 deletions(-) delete mode 100644 src/zarr/storage/_protocols.py diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 600df17ee5..224923231a 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -10,7 +10,7 @@ from zarr.core.sync import sync if TYPE_CHECKING: - from collections.abc import AsyncGenerator, AsyncIterator, Iterable + from collections.abc import AsyncGenerator, AsyncIterator, Iterable, Sequence from types import TracebackType from typing import Any, Self @@ -616,6 +616,60 @@ async def _get_many( for req in requests: yield (req[0], await self.get(*req)) + async def get_ranges( + self, + key: str, + byte_ranges: Sequence[ByteRequest | None], + *, + prototype: BufferPrototype, + max_concurrency: int | None = None, + max_gap_bytes: int | None = None, + max_coalesced_bytes: int | None = None, + ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: + """Read many byte ranges from `key`. + + Yields one batch per underlying I/O operation, each a sequence of + `(input_index, Buffer | None)` tuples. Batches across yields arrive in + completion order, not input order. The default implementation built + into `Store` runs the coalescer over `self.get`, so subclasses get a + working implementation for free; stores that have a more efficient + backend (e.g. ranged HTTP, S3 byte-range fetches) should override. + + Parameters + ---------- + key + Storage key to read from. + byte_ranges + Input ranges. `None` means "the whole value". + prototype + Buffer prototype, forwarded to `self.get`. + max_concurrency + Maximum number of merged fetches in flight at once. + max_gap_bytes + See `zarr.core._coalesce.coalesce_ranges`. + max_coalesced_bytes + See `zarr.core._coalesce.coalesce_ranges`. + + Raises + ------ + FileNotFoundError + If any underlying fetch returns `None` (i.e. `key` is absent). + """ + # Local import: zarr.core._coalesce imports symbols from this module. + from functools import partial + + from zarr.core._coalesce import coalesced_get + + fetch = partial(self.get, key, prototype) + async for group in coalesced_get( + fetch, + byte_ranges, + max_concurrency=max_concurrency, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ): + yield group + async def getsize(self, key: str) -> int: """ Return the size, in bytes, of a value in a Store. diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index 60b82ec55d..14386d1aac 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -3,7 +3,6 @@ import json import warnings from contextlib import suppress -from functools import partial from typing import TYPE_CHECKING, Any from packaging.version import parse as parse_version @@ -15,23 +14,18 @@ Store, SuffixByteRequest, ) -from zarr.core._coalesce import ( - COALESCE_DEFAULT_MAX_CONCURRENCY, - coalesced_get, -) from zarr.core.buffer import Buffer from zarr.errors import ZarrUserWarning from zarr.storage._utils import _dereference_path if TYPE_CHECKING: - from collections.abc import AsyncIterator, Iterable, Sequence + from collections.abc import AsyncIterator, Iterable from fsspec import AbstractFileSystem from fsspec.asyn import AsyncFileSystem from fsspec.mapping import FSMap from zarr.core.buffer import BufferPrototype - from zarr.storage._protocols import SupportsGetRanges ALLOWED_EXCEPTIONS: tuple[type[Exception], ...] = ( @@ -321,31 +315,6 @@ async def get( else: return value - async def get_ranges( - self, - key: str, - byte_ranges: Sequence[ByteRequest | None], - *, - prototype: BufferPrototype, - max_concurrency: int = COALESCE_DEFAULT_MAX_CONCURRENCY, - max_gap_bytes: int | None = None, - max_coalesced_bytes: int | None = None, - ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: - """Read many byte ranges from `key`, coalescing nearby ranges and fetching concurrently. - - See `zarr.storage._protocols.SupportsGetRanges` for the contract and - `zarr.core._coalesce.coalesced_get` for the full semantics. - """ - fetch = partial(self.get, key, prototype) - async for group in coalesced_get( - fetch, - byte_ranges, - max_concurrency=max_concurrency, - max_gap_bytes=max_gap_bytes, - max_coalesced_bytes=max_coalesced_bytes, - ): - yield group - async def set( self, key: str, @@ -471,8 +440,3 @@ async def getsize(self, key: str) -> int: else: # fsspec doesn't have typing. We'll need to assume or verify this is true return int(size) - - -# Module-level type assertion: FsspecStore structurally satisfies SupportsGetRanges. -# This line is a no-op at runtime but causes mypy/pyright to complain if the shape drifts. -_: type[SupportsGetRanges] = FsspecStore diff --git a/src/zarr/storage/_protocols.py b/src/zarr/storage/_protocols.py deleted file mode 100644 index 181b0694da..0000000000 --- a/src/zarr/storage/_protocols.py +++ /dev/null @@ -1,39 +0,0 @@ -# src/zarr/storage/_protocols.py -from __future__ import annotations - -from typing import TYPE_CHECKING, Protocol, runtime_checkable - -if TYPE_CHECKING: - from collections.abc import AsyncIterator, Sequence - - from zarr.abc.store import ByteRequest - from zarr.core.buffer import Buffer, BufferPrototype - - -@runtime_checkable -class SupportsGetRanges(Protocol): - """Stores that satisfy this protocol can efficiently read many byte ranges - from a single key in a single call, typically via coalescing and concurrent fetch. - - Private / unstable. Shape may change before being made public. - """ - - def get_ranges( - self, - key: str, - byte_ranges: Sequence[ByteRequest | None], - *, - prototype: BufferPrototype, - max_gap_bytes: int | None = ..., - max_coalesced_bytes: int | None = ..., - ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: - """Read many byte ranges from `key`. - - Each yield corresponds to one underlying I/O operation. The coalescing - knobs `max_gap_bytes` and `max_coalesced_bytes` describe *what counts - as coalesceable* and are part of this contract; implementations may - accept additional kwargs (e.g. concurrency knobs) but should honor - these two consistently. See `zarr.core._coalesce.coalesce_ranges` for - full semantics. - """ - ... diff --git a/src/zarr/storage/_wrapper.py b/src/zarr/storage/_wrapper.py index d8ecfa6d45..051d3f2cb0 100644 --- a/src/zarr/storage/_wrapper.py +++ b/src/zarr/storage/_wrapper.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, cast if TYPE_CHECKING: - from collections.abc import AsyncGenerator, AsyncIterator, Iterable + from collections.abc import AsyncGenerator, AsyncIterator, Iterable, Sequence from types import TracebackType from typing import Any, Self @@ -103,6 +103,26 @@ async def get_partial_values( ) -> list[Buffer | None]: return await self._store.get_partial_values(prototype, key_ranges) + async def get_ranges( + self, + key: str, + byte_ranges: Sequence[ByteRequest | None], + *, + prototype: BufferPrototype, + max_concurrency: int | None = None, + max_gap_bytes: int | None = None, + max_coalesced_bytes: int | None = None, + ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: + async for group in self._store.get_ranges( + key, + byte_ranges, + prototype=prototype, + max_concurrency=max_concurrency, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ): + yield group + async def exists(self, key: str) -> bool: return await self._store.exists(key) diff --git a/tests/test_store/test_protocols.py b/tests/test_store/test_protocols.py index 9dae19ffba..529d172f92 100644 --- a/tests/test_store/test_protocols.py +++ b/tests/test_store/test_protocols.py @@ -1,48 +1,141 @@ # tests/test_store/test_protocols.py -"""Runtime and static conformance tests for zarr.storage._protocols.SupportsGetRanges.""" +"""Tests for `Store.get_ranges` — the ABC default implementation and wrapper delegation. + +This file used to validate a separate `SupportsGetRanges` protocol. That protocol +was folded into the `Store` ABC, so every store inherits a working `get_ranges` +built on `coalesced_get(self.get, ...)`. The tests here exercise that default +path and the explicit delegation in `WrapperStore`. +""" from __future__ import annotations +from typing import TYPE_CHECKING + import pytest -from zarr.storage._protocols import SupportsGetRanges +from zarr.abc.store import RangeByteRequest +from zarr.core.buffer import default_buffer_prototype +from zarr.storage import MemoryStore +from zarr.storage._wrapper import WrapperStore -fsspec = pytest.importorskip("fsspec") +if TYPE_CHECKING: + from collections.abc import AsyncIterator, Sequence -from packaging.version import parse as parse_version # noqa: E402 + from zarr.abc.store import ByteRequest + from zarr.core.buffer import Buffer, BufferPrototype -# AsyncFileSystemWrapper (needed to wrap a sync MemoryFileSystem) landed in fsspec 2024.12.0. -# Older versions are pinned by the min-deps CI job. -_needs_async_wrapper = pytest.mark.skipif( - parse_version(fsspec.__version__) < parse_version("2024.12.0"), - reason="No AsyncFileSystemWrapper", -) +async def _write(store: MemoryStore, key: str, data: bytes) -> None: + buf = default_buffer_prototype().buffer.from_bytes(data) + await store.set(key, buf) -@_needs_async_wrapper -def test_fsspec_store_satisfies_supports_get_ranges() -> None: - from fsspec.implementations.memory import MemoryFileSystem - from zarr.storage import FsspecStore - from zarr.storage._fsspec import _make_async +async def test_memory_store_inherits_get_ranges_from_abc() -> None: + """MemoryStore doesn't override `get_ranges`; the ABC default must work end-to-end.""" + store = MemoryStore() + blob = bytes(i % 256 for i in range(512)) + await _write(store, "blob", blob) - fs = MemoryFileSystem() - fs.store.clear() - store = FsspecStore(fs=_make_async(fs), path="/x") - assert isinstance(store, SupportsGetRanges) + ranges = [RangeByteRequest(0, 10), RangeByteRequest(100, 110)] + proto = default_buffer_prototype() + flat: dict[int, bytes] = {} + async for group in store.get_ranges("blob", ranges, prototype=proto): + for idx, buf in group: + assert buf is not None + flat[idx] = buf.to_bytes() + assert flat[0] == blob[0:10] + assert flat[1] == blob[100:110] -def test_memory_store_does_not_satisfy_supports_get_ranges() -> None: - """Sanity check: stores that don't implement get_ranges shouldn't satisfy the protocol.""" - from zarr.storage import MemoryStore +async def test_memory_store_get_ranges_missing_key_raises() -> None: + """A missing key on a default-impl store must raise FileNotFoundError, matching the contract.""" store = MemoryStore() - assert not isinstance(store, SupportsGetRanges) - - -def test_type_assignment_at_module_level() -> None: - """Smoke-test the module-level `_: type[SupportsGetRanges] = FsspecStore`. - - If this runs without error the module imported cleanly; the static check is in mypy. - """ - from zarr.storage import _fsspec # noqa: F401 + proto = default_buffer_prototype() + agen = store.get_ranges("does-not-exist", [RangeByteRequest(0, 10)], prototype=proto) + with pytest.raises(FileNotFoundError): + await anext(agen) + + +async def test_wrapper_store_delegates_get_ranges() -> None: + """WrapperStore.get_ranges must delegate to the wrapped store, not fall back to the default.""" + + class CountingMemoryStore(MemoryStore): + """Tallies get_ranges invocations so we can assert delegation.""" + + get_ranges_calls: int = 0 + + async def get_ranges( + self, + key: str, + byte_ranges: Sequence[ByteRequest | None], + *, + prototype: BufferPrototype, + max_concurrency: int | None = None, + max_gap_bytes: int | None = None, + max_coalesced_bytes: int | None = None, + ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: + type(self).get_ranges_calls += 1 + async for group in super().get_ranges( + key, + byte_ranges, + prototype=prototype, + max_concurrency=max_concurrency, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ): + yield group + + inner = CountingMemoryStore() + blob = b"x" * 100 + await _write(inner, "k", blob) + wrapped = WrapperStore(inner) + + proto = default_buffer_prototype() + groups: list[list[tuple[int, Buffer | None]]] = [ + list(group) + async for group in wrapped.get_ranges("k", [RangeByteRequest(0, 5)], prototype=proto) + ] + + assert CountingMemoryStore.get_ranges_calls == 1 + assert len(groups) == 1 + assert groups[0][0][0] == 0 + + +async def test_wrapper_store_forwards_coalescing_kwargs() -> None: + """Coalescing kwargs flow through WrapperStore to the wrapped store's get_ranges.""" + + class SpyMemoryStore(MemoryStore): + last_max_gap_bytes: int | None = None + + async def get_ranges( + self, + key: str, + byte_ranges: Sequence[ByteRequest | None], + *, + prototype: BufferPrototype, + max_concurrency: int | None = None, + max_gap_bytes: int | None = None, + max_coalesced_bytes: int | None = None, + ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: + type(self).last_max_gap_bytes = max_gap_bytes + async for group in super().get_ranges( + key, + byte_ranges, + prototype=prototype, + max_concurrency=max_concurrency, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ): + yield group + + inner = SpyMemoryStore() + await _write(inner, "k", b"y" * 100) + wrapped = WrapperStore(inner) + proto = default_buffer_prototype() + async for _ in wrapped.get_ranges( + "k", [RangeByteRequest(0, 5)], prototype=proto, max_gap_bytes=-1 + ): + pass + + assert SpyMemoryStore.last_max_gap_bytes == -1 From 76a3a9b4b94951949b9487580afd41557c48f084 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Tue, 12 May 2026 18:30:05 +0200 Subject: [PATCH 38/55] Update src/zarr/core/_coalesce.py Co-authored-by: Chuck Daniels --- src/zarr/core/_coalesce.py | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 5b46f5cf38..7e9a2f7de1 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -198,7 +198,7 @@ async def coalesced_get( if max_concurrency is None: max_concurrency = COALESCE_DEFAULT_MAX_CONCURRENCY - groups, uncoalescable = coalesce_ranges( + groups, singles = coalesce_ranges( byte_ranges, max_gap_bytes=max_gap_bytes, max_coalesced_bytes=max_coalesced_bytes, @@ -207,28 +207,12 @@ async def coalesced_get( ctx = _WorkerCtx(fetch=fetch, semaphore=asyncio.Semaphore(max_concurrency)) # Launch all work as tasks. The semaphore bounds actual I/O concurrency. - # A one-member group is a RangeByteRequest that did not merge with a - # neighbor; route it through _fetch_single so it skips the redundant - # slice-by-zero in _fetch_group. - tasks: list[asyncio.Task[Sequence[tuple[int, Buffer | None]]]] = [] - for group in groups: - if len(group) == 1: - solo_idx, solo_req = group[0] - tasks.append(asyncio.create_task(_fetch_single(ctx, solo_idx, solo_req))) - else: - tasks.append(asyncio.create_task(_fetch_group(ctx, group))) - for idx, single in uncoalescable: - tasks.append(asyncio.create_task(_fetch_single(ctx, idx, single))) - - try: + + async with asyncio.TaskGroup() as tg: + tasks = [ + *(tg.create_task(_fetch_group(ctx, group)) for group in groups), + *(tg.create_task(_fetch_single(ctx, i, single)) for i, single in singles), + ] + for fut in asyncio.as_completed(tasks): yield await fut - finally: - # Best-effort cancellation for any task still in flight. Covers the - # consumer-break path as well as exception propagation from `await fut`, - # where the remaining tasks must not be left running. - for t in tasks: - if not t.done(): - t.cancel() - if tasks: - await asyncio.gather(*tasks, return_exceptions=True) From 470ab80d65ecb39d09c75cad8dcd0b6069d39f91 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Tue, 12 May 2026 18:31:39 +0200 Subject: [PATCH 39/55] Apply suggestion from @chuckwondo Co-authored-by: Chuck Daniels --- src/zarr/core/_coalesce.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 7e9a2f7de1..3848193c51 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -44,6 +44,10 @@ async def _fetch_group( """ from zarr.abc.store import RangeByteRequest + if len(members) == 1: + solo_idx, solo_req = members[0] + return await _fetch_single(ctx, solo_idx, solo_req) + start = members[0][1].start end = max(r.end for _, r in members) async with ctx.semaphore: From 236fe05b7bff87196170b9eadbc76dcbe40207f2 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Tue, 12 May 2026 18:31:49 +0200 Subject: [PATCH 40/55] Apply suggestion from @chuckwondo Co-authored-by: Chuck Daniels --- src/zarr/core/_coalesce.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 3848193c51..9dae9220fe 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -54,9 +54,7 @@ async def _fetch_group( big = await ctx.fetch(RangeByteRequest(start, end)) if big is None: raise FileNotFoundError - sliced: list[tuple[int, Buffer | None]] = [ - (idx, big[r.start - start : r.end - start]) for idx, r in members - ] + sliced = [(idx, big[r.start - start : r.end - start]) for idx, r in members] return tuple(sliced) From 4d7a24e8c51a26f0f771e867040885f1466096fd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 May 2026 20:37:11 +0200 Subject: [PATCH 41/55] refactor: use function-scoped defaults --- src/zarr/abc/store.py | 25 ++++------ src/zarr/core/_coalesce.py | 75 ++++++++++++++++-------------- src/zarr/storage/_wrapper.py | 16 ++----- tests/test_store/test_protocols.py | 31 +++--------- 4 files changed, 59 insertions(+), 88 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 224923231a..1a3ae0d667 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from itertools import starmap -from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable +from typing import TYPE_CHECKING, Literal, Protocol, Unpack, runtime_checkable from zarr.core.sync import sync @@ -14,6 +14,7 @@ from types import TracebackType from typing import Any, Self + from zarr.core._coalesce import CoalesceKwargs from zarr.core.buffer import Buffer, BufferPrototype __all__ = [ @@ -622,9 +623,7 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, - max_concurrency: int | None = None, - max_gap_bytes: int | None = None, - max_coalesced_bytes: int | None = None, + **kwargs: Unpack[CoalesceKwargs], ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges from `key`. @@ -643,12 +642,10 @@ async def get_ranges( Input ranges. `None` means "the whole value". prototype Buffer prototype, forwarded to `self.get`. - max_concurrency - Maximum number of merged fetches in flight at once. - max_gap_bytes - See `zarr.core._coalesce.coalesce_ranges`. - max_coalesced_bytes - See `zarr.core._coalesce.coalesce_ranges`. + **kwargs + Forwarded to `zarr.core._coalesce.coalesced_get`. See its docstring + for supported tuning knobs (`max_concurrency`, `max_gap_bytes`, + `max_coalesced_bytes`). Raises ------ @@ -661,13 +658,7 @@ async def get_ranges( from zarr.core._coalesce import coalesced_get fetch = partial(self.get, key, prototype) - async for group in coalesced_get( - fetch, - byte_ranges, - max_concurrency=max_concurrency, - max_gap_bytes=max_gap_bytes, - max_coalesced_bytes=max_coalesced_bytes, - ): + async for group in coalesced_get(fetch, byte_ranges, **kwargs): yield group async def getsize(self, key: str) -> int: diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 5b46f5cf38..626e54fcc8 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -2,7 +2,7 @@ from __future__ import annotations import asyncio -from typing import TYPE_CHECKING, Final, NamedTuple +from typing import TYPE_CHECKING, NamedTuple, TypedDict, Unpack, cast if TYPE_CHECKING: from collections.abc import AsyncIterator, Awaitable, Callable, Sequence @@ -11,6 +11,28 @@ from zarr.core.buffer import Buffer +class _GroupingKwargs(TypedDict, total=False): + """Internal forwarding bundle for `coalesce_ranges`'s grouping knobs.""" + + max_gap_bytes: int + max_coalesced_bytes: int + + +class CoalesceKwargs(TypedDict, total=False): + """Internal forwarding bundle for `coalesced_get`'s tuning knobs. + + Used with `Unpack[CoalesceKwargs]` in forwarder signatures so intermediate + layers can pass overrides through without knowing or duplicating the leaf + defaults. An unset key means "use the leaf's default": `max_gap_bytes` and + `max_coalesced_bytes` resolve in `coalesce_ranges`; `max_concurrency` + resolves in `coalesced_get`. + """ + + max_gap_bytes: int + max_coalesced_bytes: int + max_concurrency: int + + class _WorkerCtx(NamedTuple): """Shared state passed to the per-task worker coroutines. @@ -56,16 +78,11 @@ async def _fetch_group( return tuple(sliced) -COALESCE_DEFAULT_MAX_GAP_BYTES: Final = 1 << 20 -COALESCE_DEFAULT_MAX_COALESCED_BYTES: Final = 16 << 20 -COALESCE_DEFAULT_MAX_CONCURRENCY: Final = 10 - - def coalesce_ranges( byte_ranges: Sequence[ByteRequest | None], *, - max_gap_bytes: int | None = None, - max_coalesced_bytes: int | None = None, + max_gap_bytes: int = 1 << 20, + max_coalesced_bytes: int = 16 << 20, ) -> tuple[ list[list[tuple[int, RangeByteRequest]]], list[tuple[int, ByteRequest | None]], @@ -82,10 +99,9 @@ def coalesce_ranges( Input ranges. `None` means "the whole value". max_gap_bytes Two `RangeByteRequest`s separated by at most this many bytes may be - merged into one fetch. Defaults to `COALESCE_DEFAULT_MAX_GAP_BYTES`. + merged into one fetch. max_coalesced_bytes - Upper bound on the size of a single merged fetch. Defaults to - `COALESCE_DEFAULT_MAX_COALESCED_BYTES`. + Upper bound on the size of a single merged fetch. Returns ------- @@ -109,11 +125,6 @@ def coalesce_ranges( # Local import to avoid cycles at module import time. from zarr.abc.store import RangeByteRequest - if max_gap_bytes is None: - max_gap_bytes = COALESCE_DEFAULT_MAX_GAP_BYTES - if max_coalesced_bytes is None: - max_coalesced_bytes = COALESCE_DEFAULT_MAX_COALESCED_BYTES - indexed: list[tuple[int, ByteRequest | None]] = list(enumerate(byte_ranges)) mergeable: list[tuple[int, RangeByteRequest]] = [ (i, r) for i, r in indexed if isinstance(r, RangeByteRequest) @@ -146,10 +157,7 @@ def coalesce_ranges( async def coalesced_get( fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]], byte_ranges: Sequence[ByteRequest | None], - *, - max_concurrency: int | None = None, - max_gap_bytes: int | None = None, - max_coalesced_bytes: int | None = None, + **kwargs: Unpack[CoalesceKwargs], ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges through `fetch` with coalescing and concurrency. @@ -168,13 +176,13 @@ async def coalesced_get( `functools.partial(store.get, key, prototype)`. byte_ranges Input ranges. `None` means "the whole value". - max_concurrency - Maximum number of merged fetches in flight at once. Defaults to - `COALESCE_DEFAULT_MAX_CONCURRENCY`. - max_gap_bytes - Forwarded to `coalesce_ranges`. - max_coalesced_bytes - Forwarded to `coalesce_ranges`. + **kwargs + Tuning knobs (see `CoalesceKwargs`): + + - `max_concurrency` — maximum merged fetches in flight at once. + Defaults to 10. + - `max_gap_bytes`, `max_coalesced_bytes` — forwarded to + `coalesce_ranges`; see that function for defaults. Yields ------ @@ -195,14 +203,11 @@ async def coalesced_get( if not byte_ranges: return - if max_concurrency is None: - max_concurrency = COALESCE_DEFAULT_MAX_CONCURRENCY - - groups, uncoalescable = coalesce_ranges( - byte_ranges, - max_gap_bytes=max_gap_bytes, - max_coalesced_bytes=max_coalesced_bytes, - ) + max_concurrency = kwargs.pop("max_concurrency", 10) + # After popping, `kwargs` only contains _GroupingKwargs keys, but type + # checkers can't narrow `**kwargs` parameters through dict mutation. + grouping_kwargs = cast("_GroupingKwargs", kwargs) + groups, uncoalescable = coalesce_ranges(byte_ranges, **grouping_kwargs) ctx = _WorkerCtx(fetch=fetch, semaphore=asyncio.Semaphore(max_concurrency)) diff --git a/src/zarr/storage/_wrapper.py b/src/zarr/storage/_wrapper.py index 051d3f2cb0..d78390f213 100644 --- a/src/zarr/storage/_wrapper.py +++ b/src/zarr/storage/_wrapper.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Unpack, cast if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable, Sequence @@ -9,6 +9,7 @@ from zarr.abc.buffer import Buffer from zarr.abc.store import ByteRequest + from zarr.core._coalesce import CoalesceKwargs from zarr.core.buffer import BufferPrototype from zarr.abc.store import Store @@ -109,18 +110,9 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, - max_concurrency: int | None = None, - max_gap_bytes: int | None = None, - max_coalesced_bytes: int | None = None, + **kwargs: Unpack[CoalesceKwargs], ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: - async for group in self._store.get_ranges( - key, - byte_ranges, - prototype=prototype, - max_concurrency=max_concurrency, - max_gap_bytes=max_gap_bytes, - max_coalesced_bytes=max_coalesced_bytes, - ): + async for group in self._store.get_ranges(key, byte_ranges, prototype=prototype, **kwargs): yield group async def exists(self, key: str) -> bool: diff --git a/tests/test_store/test_protocols.py b/tests/test_store/test_protocols.py index 529d172f92..f9949c43e7 100644 --- a/tests/test_store/test_protocols.py +++ b/tests/test_store/test_protocols.py @@ -9,7 +9,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Unpack import pytest @@ -22,6 +22,7 @@ from collections.abc import AsyncIterator, Sequence from zarr.abc.store import ByteRequest + from zarr.core._coalesce import CoalesceKwargs from zarr.core.buffer import Buffer, BufferPrototype @@ -71,19 +72,10 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, - max_concurrency: int | None = None, - max_gap_bytes: int | None = None, - max_coalesced_bytes: int | None = None, + **kwargs: Unpack[CoalesceKwargs], ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: type(self).get_ranges_calls += 1 - async for group in super().get_ranges( - key, - byte_ranges, - prototype=prototype, - max_concurrency=max_concurrency, - max_gap_bytes=max_gap_bytes, - max_coalesced_bytes=max_coalesced_bytes, - ): + async for group in super().get_ranges(key, byte_ranges, prototype=prototype, **kwargs): yield group inner = CountingMemoryStore() @@ -114,19 +106,10 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, - max_concurrency: int | None = None, - max_gap_bytes: int | None = None, - max_coalesced_bytes: int | None = None, + **kwargs: Unpack[CoalesceKwargs], ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: - type(self).last_max_gap_bytes = max_gap_bytes - async for group in super().get_ranges( - key, - byte_ranges, - prototype=prototype, - max_concurrency=max_concurrency, - max_gap_bytes=max_gap_bytes, - max_coalesced_bytes=max_coalesced_bytes, - ): + type(self).last_max_gap_bytes = kwargs.get("max_gap_bytes") + async for group in super().get_ranges(key, byte_ranges, prototype=prototype, **kwargs): yield group inner = SpyMemoryStore() From 11e6c398ba473a9f051dffd0f2217da47d629cd4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 May 2026 21:16:07 +0200 Subject: [PATCH 42/55] fix: fix failing test --- tests/test_coalesce.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 4c7c24d643..0b973c78ea 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -425,17 +425,19 @@ async def test_key_missing_mid_stream_with_concurrency_cancels_late_arrivals() - """ late_gate = asyncio.Event() miss_fired = asyncio.Event() + # Driven by the test body after the first successful yield, so the miss + # task can't race past the start=0 result. + fire_miss = asyncio.Event() async def fetch(byte_range: ByteRequest | None) -> Buffer | None: assert isinstance(byte_range, RangeByteRequest) start = byte_range.start if start == 0: - # First to complete: arrives before the miss. - await asyncio.sleep(0.01) return _buf(b"ok") if start == 1000: - # Miss: a little later than #0 so #0 yields first. - await asyncio.sleep(0.03) + # Wait for the test to give the green light before returning None. + # This makes ordering deterministic regardless of scheduling. + await asyncio.wait_for(fire_miss.wait(), timeout=5.0) miss_fired.set() return None # Late arrivals would block on this gate; they should be cancelled @@ -456,6 +458,8 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: idx, buf = first[0] assert idx == 0 assert buf is not None + # Now that #0 has yielded, signal the miss task to return None. + fire_miss.set() with pytest.raises(FileNotFoundError): await anext(agen) assert miss_fired.is_set() From 83cc824efb64659e45bada2b70c8c1a3f6da1c39 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 May 2026 22:02:07 +0200 Subject: [PATCH 43/55] refactor: defaults in one place --- changes/3925.feature.md | 2 +- src/zarr/abc/store.py | 26 +++++--- src/zarr/core/_coalesce.py | 64 ++++++++----------- src/zarr/storage/_wrapper.py | 20 +++++- tests/test_coalesce.py | 60 +++++++++++------ .../{test_protocols.py => test_get_ranges.py} | 42 ++++++++---- 6 files changed, 134 insertions(+), 80 deletions(-) rename tests/test_store/{test_protocols.py => test_get_ranges.py} (72%) diff --git a/changes/3925.feature.md b/changes/3925.feature.md index f3ad5080d8..14bda6ba56 100644 --- a/changes/3925.feature.md +++ b/changes/3925.feature.md @@ -1 +1 @@ -Add `zarr.storage.FsspecStore.get_ranges` for concurrent, coalesced multi-range reads from a single key. Coalescing knobs (`max_concurrency`, `max_gap_bytes`, `max_coalesced_bytes`) are passed as keyword arguments to `get_ranges`, not configured on the store. +Add `zarr.abc.store.Store.get_ranges` for concurrent, coalesced multi-range reads from a single key. The method is defined on the `Store` ABC with a default implementation built on `Store.get`, so every store inherits a working version; stores with native multi-range backends (e.g. `FsspecStore`) can override for efficiency. Coalescing knobs (`max_concurrency`, `max_gap_bytes`, `max_coalesced_bytes`) are passed as keyword arguments to `get_ranges`. diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 1a3ae0d667..fd62159ef6 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from itertools import starmap -from typing import TYPE_CHECKING, Literal, Protocol, Unpack, runtime_checkable +from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable from zarr.core.sync import sync @@ -14,7 +14,6 @@ from types import TracebackType from typing import Any, Self - from zarr.core._coalesce import CoalesceKwargs from zarr.core.buffer import Buffer, BufferPrototype __all__ = [ @@ -623,7 +622,9 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, - **kwargs: Unpack[CoalesceKwargs], + max_concurrency: int = 10, + max_gap_bytes: int = 1 << 20, + max_coalesced_bytes: int = 16 << 20, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges from `key`. @@ -642,10 +643,13 @@ async def get_ranges( Input ranges. `None` means "the whole value". prototype Buffer prototype, forwarded to `self.get`. - **kwargs - Forwarded to `zarr.core._coalesce.coalesced_get`. See its docstring - for supported tuning knobs (`max_concurrency`, `max_gap_bytes`, - `max_coalesced_bytes`). + max_concurrency + Maximum number of merged fetches in flight at once. + max_gap_bytes + Two `RangeByteRequest`s separated by at most this many bytes may + be merged into one fetch. + max_coalesced_bytes + Upper bound on the size of a single merged fetch. Raises ------ @@ -658,7 +662,13 @@ async def get_ranges( from zarr.core._coalesce import coalesced_get fetch = partial(self.get, key, prototype) - async for group in coalesced_get(fetch, byte_ranges, **kwargs): + async for group in coalesced_get( + fetch, + byte_ranges, + max_concurrency=max_concurrency, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ): yield group async def getsize(self, key: str) -> int: diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 9ed7a610c5..5d65de9318 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -2,7 +2,7 @@ from __future__ import annotations import asyncio -from typing import TYPE_CHECKING, NamedTuple, TypedDict, Unpack, cast +from typing import TYPE_CHECKING, NamedTuple if TYPE_CHECKING: from collections.abc import AsyncIterator, Awaitable, Callable, Sequence @@ -11,28 +11,6 @@ from zarr.core.buffer import Buffer -class _GroupingKwargs(TypedDict, total=False): - """Internal forwarding bundle for `coalesce_ranges`'s grouping knobs.""" - - max_gap_bytes: int - max_coalesced_bytes: int - - -class CoalesceKwargs(TypedDict, total=False): - """Internal forwarding bundle for `coalesced_get`'s tuning knobs. - - Used with `Unpack[CoalesceKwargs]` in forwarder signatures so intermediate - layers can pass overrides through without knowing or duplicating the leaf - defaults. An unset key means "use the leaf's default": `max_gap_bytes` and - `max_coalesced_bytes` resolve in `coalesce_ranges`; `max_concurrency` - resolves in `coalesced_get`. - """ - - max_gap_bytes: int - max_coalesced_bytes: int - max_concurrency: int - - class _WorkerCtx(NamedTuple): """Shared state passed to the per-task worker coroutines. @@ -83,8 +61,8 @@ async def _fetch_group( def coalesce_ranges( byte_ranges: Sequence[ByteRequest | None], *, - max_gap_bytes: int = 1 << 20, - max_coalesced_bytes: int = 16 << 20, + max_gap_bytes: int, + max_coalesced_bytes: int, ) -> tuple[ list[list[tuple[int, RangeByteRequest]]], list[tuple[int, ByteRequest | None]], @@ -95,6 +73,10 @@ def coalesce_ranges( group corresponds to one fetch of a coalesced byte range, and each uncoalescable item corresponds to one fetch of the original request. + All tuning knobs are required keyword arguments. `Store.get_ranges` is + the public entry point and owns the canonical default values; this + function takes them explicitly to avoid duplicating policy. + Parameters ---------- byte_ranges @@ -159,7 +141,10 @@ def coalesce_ranges( async def coalesced_get( fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]], byte_ranges: Sequence[ByteRequest | None], - **kwargs: Unpack[CoalesceKwargs], + *, + max_concurrency: int, + max_gap_bytes: int, + max_coalesced_bytes: int, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges through `fetch` with coalescing and concurrency. @@ -170,6 +155,10 @@ async def coalesced_get( start offset. Yields across groups are in completion order, not input order. + All tuning knobs are required keyword arguments. `Store.get_ranges` is + the public entry point and owns the canonical default values; this + function takes them explicitly to avoid duplicating policy. + Parameters ---------- fetch @@ -178,13 +167,12 @@ async def coalesced_get( `functools.partial(store.get, key, prototype)`. byte_ranges Input ranges. `None` means "the whole value". - **kwargs - Tuning knobs (see `CoalesceKwargs`): - - - `max_concurrency` — maximum merged fetches in flight at once. - Defaults to 10. - - `max_gap_bytes`, `max_coalesced_bytes` — forwarded to - `coalesce_ranges`; see that function for defaults. + max_concurrency + Maximum number of merged fetches in flight at once. + max_gap_bytes + Forwarded to `coalesce_ranges`. + max_coalesced_bytes + Forwarded to `coalesce_ranges`. Yields ------ @@ -205,11 +193,11 @@ async def coalesced_get( if not byte_ranges: return - max_concurrency = kwargs.pop("max_concurrency", 10) - # After popping, `kwargs` only contains _GroupingKwargs keys, but type - # checkers can't narrow `**kwargs` parameters through dict mutation. - grouping_kwargs = cast("_GroupingKwargs", kwargs) - groups, singles = coalesce_ranges(byte_ranges, **grouping_kwargs) + groups, singles = coalesce_ranges( + byte_ranges, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ) ctx = _WorkerCtx(fetch=fetch, semaphore=asyncio.Semaphore(max_concurrency)) diff --git a/src/zarr/storage/_wrapper.py b/src/zarr/storage/_wrapper.py index d78390f213..37aeb8166f 100644 --- a/src/zarr/storage/_wrapper.py +++ b/src/zarr/storage/_wrapper.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Unpack, cast +from typing import TYPE_CHECKING, cast if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable, Sequence @@ -9,7 +9,6 @@ from zarr.abc.buffer import Buffer from zarr.abc.store import ByteRequest - from zarr.core._coalesce import CoalesceKwargs from zarr.core.buffer import BufferPrototype from zarr.abc.store import Store @@ -110,8 +109,23 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, - **kwargs: Unpack[CoalesceKwargs], + max_concurrency: int | None = None, + max_gap_bytes: int | None = None, + max_coalesced_bytes: int | None = None, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: + """Forward `get_ranges` to the wrapped store. + + Default values for the coalescing kwargs are not declared here; the + wrapped store decides them. `None` means "don't override the wrapped + store's default". + """ + kwargs: dict[str, int] = {} + if max_concurrency is not None: + kwargs["max_concurrency"] = max_concurrency + if max_gap_bytes is not None: + kwargs["max_gap_bytes"] = max_gap_bytes + if max_coalesced_bytes is not None: + kwargs["max_coalesced_bytes"] = max_coalesced_bytes async for group in self._store.get_ranges(key, byte_ranges, prototype=prototype, **kwargs): yield group diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 0b973c78ea..57fe57130c 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -71,33 +71,55 @@ def _contents(groups: list[list[tuple[int, Buffer | None]]]) -> dict[int, bytes] # --------------------------------------------------------------------------- -# Shared coalescing-knob bundles. Each is a mapping of kwargs to splat into -# `coalesce_ranges` and `coalesced_get`. Concurrency is orthogonal to merging -# and is set inline by tests that exercise it. +# Shared coalescing-knob bundles. Each is a complete mapping of all three +# kwargs to splat into `coalesced_get`; `coalesce_ranges` ignores +# `max_concurrency`. The leaf functions in `_coalesce.py` require all knobs +# explicitly — `Store.get_ranges` is the public entry point and owns the +# canonical defaults. Tests pick their own values appropriate to the scenario. # --------------------------------------------------------------------------- -DEFAULT: Mapping[str, int] = {} -"""Empty: rely on the library defaults defined in `_coalesce`.""" +# Permissive default for tests that don't care about specific thresholds. Mirrors +# `Store.get_ranges`'s public defaults but the test file owns this independently +# of any production constants. +DEFAULT: Mapping[str, int] = { + "max_concurrency": 10, + "max_gap_bytes": 1 << 20, + "max_coalesced_bytes": 16 << 20, +} +"""Permissive defaults; mirrors `Store.get_ranges`'s baseline.""" MERGE_GAP_50: Mapping[str, int] = { + "max_concurrency": 10, "max_gap_bytes": 50, "max_coalesced_bytes": 1 << 20, } """Merge ranges within 50 bytes of each other.""" NO_MERGE: Mapping[str, int] = { + "max_concurrency": 10, "max_gap_bytes": -1, "max_coalesced_bytes": 1 << 20, } """No merging: any positive gap is > -1, so no pair ever coalesces.""" CAP_50: Mapping[str, int] = { + "max_concurrency": 10, "max_gap_bytes": 1000, "max_coalesced_bytes": 50, } """Gap permissive but merged size capped at 50 bytes.""" +def _grouping(opts: Mapping[str, int]) -> dict[str, int]: + """Return only the grouping knobs from a full options bundle. + + `coalesce_ranges` rejects `max_concurrency`; this lets test bundles be + full kwargs maps (for `coalesced_get`) and still be passed to the pure + planner via splat. + """ + return {k: v for k, v in opts.items() if k != "max_concurrency"} + + # A deterministic blob used for content-sensitive cases: byte i == (i % 256). _INDEXED_BLOB = bytes(i % 256 for i in range(10_000)) @@ -375,7 +397,7 @@ async def test_key_missing_from_first_call_raises() -> None: fetch = FakeFetch(b"x" * 100, key_exists=False) ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(20, 30)] with pytest.raises(FileNotFoundError): - await _collect(coalesced_get(fetch, ranges)) + await _collect(coalesced_get(fetch, ranges, **DEFAULT)) @pytest.mark.parametrize( @@ -389,7 +411,7 @@ async def test_key_missing_on_uncoalescable_input_raises( """Uncoalescable inputs take a distinct path; key-missing must still raise.""" fetch = FakeFetch(b"x" * 100, key_exists=False) with pytest.raises(FileNotFoundError): - await _collect(coalesced_get(fetch, [byte_range])) + await _collect(coalesced_get(fetch, [byte_range], **DEFAULT)) async def test_key_missing_mid_stream_raises_after_earlier_groups() -> None: @@ -506,7 +528,7 @@ async def test_coverage_invariant_random_inputs() -> None: length = rng.randint(1, 500) ranges.append(RangeByteRequest(start, start + length)) - groups = await _collect(coalesced_get(fetch, ranges)) + groups = await _collect(coalesced_get(fetch, ranges, **DEFAULT)) seen: list[int] = [idx for group in groups for idx, _buf in group] assert sorted(seen) == list(range(len(ranges))) @@ -522,7 +544,7 @@ async def test_coverage_invariant_random_inputs() -> None: def test_coalesce_ranges_empty_input() -> None: - groups, uncoalescable = coalesce_ranges([]) + groups, uncoalescable = coalesce_ranges([], max_gap_bytes=1 << 20, max_coalesced_bytes=16 << 20) assert groups == [] assert uncoalescable == [] @@ -535,7 +557,7 @@ def test_coalesce_ranges_separates_coalescable_from_uncoalescable() -> None: None, RangeByteRequest(20, 30), ] - groups, uncoalescable = coalesce_ranges(ranges, **MERGE_GAP_50) + groups, uncoalescable = coalesce_ranges(ranges, **_grouping(MERGE_GAP_50)) # Both range requests fall within MERGE_GAP_50's gap budget. assert len(groups) == 1 @@ -555,7 +577,7 @@ def test_coalesce_ranges_no_merge_when_gap_exceeds_budget() -> None: RangeByteRequest(200, 210), RangeByteRequest(500, 510), ] - groups, uncoalescable = coalesce_ranges(ranges, **MERGE_GAP_50) + groups, uncoalescable = coalesce_ranges(ranges, **_grouping(MERGE_GAP_50)) assert uncoalescable == [] assert [len(g) for g in groups] == [1, 1, 1] assert [idx for g in groups for idx, _ in g] == [0, 1, 2] @@ -567,7 +589,7 @@ def test_coalesce_ranges_merges_within_gap_budget() -> None: RangeByteRequest(10, 15), RangeByteRequest(20, 25), ] - groups, _ = coalesce_ranges(ranges, **MERGE_GAP_50) + groups, _ = coalesce_ranges(ranges, **_grouping(MERGE_GAP_50)) assert len(groups) == 1 assert [idx for idx, _ in groups[0]] == [0, 1, 2] @@ -579,7 +601,7 @@ def test_coalesce_ranges_respects_max_coalesced_bytes() -> None: RangeByteRequest(0, 30), RangeByteRequest(40, 80), ] - groups, _ = coalesce_ranges(ranges, **CAP_50) + groups, _ = coalesce_ranges(ranges, **_grouping(CAP_50)) assert [len(g) for g in groups] == [1, 1] @@ -591,7 +613,7 @@ def test_coalesce_ranges_groups_are_sorted_by_start() -> None: RangeByteRequest(20, 30), RangeByteRequest(200, 210), ] - groups, _ = coalesce_ranges(ranges, **MERGE_GAP_50) + groups, _ = coalesce_ranges(ranges, **_grouping(MERGE_GAP_50)) # First group is the {0-10, 20-30} cluster (from input indices 1, 2). # Then the {200-210} singleton, then {500-510}. flat = [idx for g in groups for idx, _ in g] @@ -609,7 +631,7 @@ def test_coalesce_ranges_overlapping_ranges_merge() -> None: RangeByteRequest(50, 60), # nested RangeByteRequest(80, 120), # overlaps ] - groups, _ = coalesce_ranges(ranges, **MERGE_GAP_50) + groups, _ = coalesce_ranges(ranges, **_grouping(MERGE_GAP_50)) assert len(groups) == 1 assert [idx for idx, _ in groups[0]] == [0, 1, 2] @@ -621,14 +643,16 @@ def test_coalesce_ranges_running_end_handles_nesting() -> None: RangeByteRequest(100, 200), # nested; group_end stays at 1000 RangeByteRequest(990, 1010), # gap = -10 from running end, still merges ] - groups, _ = coalesce_ranges(ranges, **MERGE_GAP_50) + groups, _ = coalesce_ranges(ranges, **_grouping(MERGE_GAP_50)) assert len(groups) == 1 assert {idx for idx, _ in groups[0]} == {0, 1, 2} def test_coalesce_ranges_only_uncoalescable_inputs() -> None: ranges: list[ByteRequest | None] = [None, OffsetByteRequest(10), SuffixByteRequest(5)] - groups, uncoalescable = coalesce_ranges(ranges) + groups, uncoalescable = coalesce_ranges( + ranges, max_gap_bytes=1 << 20, max_coalesced_bytes=16 << 20 + ) assert groups == [] assert [idx for idx, _ in uncoalescable] == [0, 1, 2] @@ -642,6 +666,6 @@ def test_coalesce_ranges_total_index_coverage() -> None: OffsetByteRequest(100), RangeByteRequest(30, 40), ] - groups, uncoalescable = coalesce_ranges(ranges, **MERGE_GAP_50) + groups, uncoalescable = coalesce_ranges(ranges, **_grouping(MERGE_GAP_50)) seen = sorted([idx for g in groups for idx, _ in g] + [idx for idx, _ in uncoalescable]) assert seen == list(range(len(ranges))) diff --git a/tests/test_store/test_protocols.py b/tests/test_store/test_get_ranges.py similarity index 72% rename from tests/test_store/test_protocols.py rename to tests/test_store/test_get_ranges.py index f9949c43e7..5d0036410c 100644 --- a/tests/test_store/test_protocols.py +++ b/tests/test_store/test_get_ranges.py @@ -1,15 +1,16 @@ -# tests/test_store/test_protocols.py +# tests/test_store/test_get_ranges.py """Tests for `Store.get_ranges` — the ABC default implementation and wrapper delegation. -This file used to validate a separate `SupportsGetRanges` protocol. That protocol -was folded into the `Store` ABC, so every store inherits a working `get_ranges` -built on `coalesced_get(self.get, ...)`. The tests here exercise that default -path and the explicit delegation in `WrapperStore`. +`Store.get_ranges` is defined on the ABC with a default implementation built +on `coalesced_get(self.get, ...)`, so every store inherits a working version. +These tests cover that inherited path and the explicit delegation in +`WrapperStore` (which ensures wrapped stores' optimized overrides are honored). +Store-specific overrides (e.g. `FsspecStore`) have their own test modules. """ from __future__ import annotations -from typing import TYPE_CHECKING, Unpack +from typing import TYPE_CHECKING import pytest @@ -22,7 +23,6 @@ from collections.abc import AsyncIterator, Sequence from zarr.abc.store import ByteRequest - from zarr.core._coalesce import CoalesceKwargs from zarr.core.buffer import Buffer, BufferPrototype @@ -72,10 +72,19 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, - **kwargs: Unpack[CoalesceKwargs], + max_concurrency: int = 10, + max_gap_bytes: int = 1 << 20, + max_coalesced_bytes: int = 16 << 20, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: type(self).get_ranges_calls += 1 - async for group in super().get_ranges(key, byte_ranges, prototype=prototype, **kwargs): + async for group in super().get_ranges( + key, + byte_ranges, + prototype=prototype, + max_concurrency=max_concurrency, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ): yield group inner = CountingMemoryStore() @@ -106,10 +115,19 @@ async def get_ranges( byte_ranges: Sequence[ByteRequest | None], *, prototype: BufferPrototype, - **kwargs: Unpack[CoalesceKwargs], + max_concurrency: int = 10, + max_gap_bytes: int = 1 << 20, + max_coalesced_bytes: int = 16 << 20, ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: - type(self).last_max_gap_bytes = kwargs.get("max_gap_bytes") - async for group in super().get_ranges(key, byte_ranges, prototype=prototype, **kwargs): + type(self).last_max_gap_bytes = max_gap_bytes + async for group in super().get_ranges( + key, + byte_ranges, + prototype=prototype, + max_concurrency=max_concurrency, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, + ): yield group inner = SpyMemoryStore() From c22eef64c0f4b1c8f13b6741848ffba168c68b5a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 May 2026 22:32:02 +0200 Subject: [PATCH 44/55] hoist imports --- src/zarr/abc/store.py | 3 +-- src/zarr/core/_coalesce.py | 9 +++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index fd62159ef6..8c6cccd6fb 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -4,6 +4,7 @@ import json from abc import ABC, abstractmethod from dataclasses import dataclass +from functools import partial from itertools import starmap from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable @@ -657,8 +658,6 @@ async def get_ranges( If any underlying fetch returns `None` (i.e. `key` is absent). """ # Local import: zarr.core._coalesce imports symbols from this module. - from functools import partial - from zarr.core._coalesce import coalesced_get fetch = partial(self.get, key, prototype) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 5d65de9318..302e029a24 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -4,10 +4,12 @@ import asyncio from typing import TYPE_CHECKING, NamedTuple +from zarr.abc.store import RangeByteRequest + if TYPE_CHECKING: from collections.abc import AsyncIterator, Awaitable, Callable, Sequence - from zarr.abc.store import ByteRequest, RangeByteRequest + from zarr.abc.store import ByteRequest from zarr.core.buffer import Buffer @@ -42,8 +44,6 @@ async def _fetch_group( build it from the sorted mergeable list. Raises `FileNotFoundError` if the key is absent. """ - from zarr.abc.store import RangeByteRequest - if len(members) == 1: solo_idx, solo_req = members[0] return await _fetch_single(ctx, solo_idx, solo_req) @@ -106,9 +106,6 @@ def coalesce_ranges( `end`) is `<= max_gap_bytes`, and the resulting merged span is `<= max_coalesced_bytes`. """ - # Local import to avoid cycles at module import time. - from zarr.abc.store import RangeByteRequest - indexed: list[tuple[int, ByteRequest | None]] = list(enumerate(byte_ranges)) mergeable: list[tuple[int, RangeByteRequest]] = [ (i, r) for i, r in indexed if isinstance(r, RangeByteRequest) From e2b943207a2dd8672084d0e2d1a096b4fbe2e5da Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 13 May 2026 10:49:32 +0200 Subject: [PATCH 45/55] Update tests/test_coalesce.py Co-authored-by: Chuck Daniels --- tests/test_coalesce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 57fe57130c..6c6b42d5cd 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -383,8 +383,8 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: # async generator and supports it. await cast("AsyncGenerator[Any, None]", agen).aclose() - assert cancelled_calls >= 1 assert completed_calls >= 1 + assert cancelled_calls == len(ranges) - completed_calls # --------------------------------------------------------------------------- From 07ed1cbdb8cba0f5765e78a2c117b6e730ddc9b9 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 13 May 2026 10:49:46 +0200 Subject: [PATCH 46/55] Update src/zarr/abc/store.py Co-authored-by: Chuck Daniels --- src/zarr/abc/store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 8c6cccd6fb..9dd11d95e1 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -624,8 +624,8 @@ async def get_ranges( *, prototype: BufferPrototype, max_concurrency: int = 10, - max_gap_bytes: int = 1 << 20, - max_coalesced_bytes: int = 16 << 20, + max_gap_bytes: int = 1 << 20, # 1 MiB + max_coalesced_bytes: int = 16 << 20, # 16 MiB ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges from `key`. From cf458f8db593f8021d7bf3eca7423f6bd578e1b4 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 13 May 2026 10:50:02 +0200 Subject: [PATCH 47/55] Update src/zarr/core/_coalesce.py Co-authored-by: Chuck Daniels --- src/zarr/core/_coalesce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 302e029a24..26b847b80a 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -142,7 +142,7 @@ async def coalesced_get( max_concurrency: int, max_gap_bytes: int, max_coalesced_bytes: int, -) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]: +) -> AsyncGenerator[Sequence[tuple[int, Buffer | None]]]: """Read many byte ranges through `fetch` with coalescing and concurrency. Nearby ranges are merged into a single underlying I/O, and merged fetches From 6fb2d37ab64d031bcc8015f237445a413874598a Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 13 May 2026 10:50:49 +0200 Subject: [PATCH 48/55] Update src/zarr/core/_coalesce.py Co-authored-by: Chuck Daniels --- src/zarr/core/_coalesce.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 26b847b80a..7fbc35febe 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -213,10 +213,7 @@ async def coalesced_get( for fut in asyncio.as_completed(tasks): yield await fut except BaseExceptionGroup as eg: - # Unwrap: prefer GeneratorExit, then a single inner exception, otherwise raise group. - for exc in eg.exceptions: - if isinstance(exc, GeneratorExit): - raise exc from None - if len(eg.exceptions) == 1: - raise eg.exceptions[0] from None - raise + # Unwrap: prefer a single inner exception, otherwise raise group. + if subgroup := eg.subgroup(lambda e: not isinstance(e, GeneratorExit)): + e = subgroup.exceptions[0] if len(subgroup.exceptions) == 1 else subgroup + raise e from None From b427017848e43bb173b157cd472c9a8dcefd9e34 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 13 May 2026 10:51:17 +0200 Subject: [PATCH 49/55] Update tests/test_coalesce.py Co-authored-by: Chuck Daniels --- tests/test_coalesce.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 6c6b42d5cd..42531ae295 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -378,10 +378,8 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: async for _group in agen: break # Explicitly close the generator so its finally block runs (cancelling - # in-flight tasks) before we make assertions. The narrow AsyncIterator - # return type does not expose `.aclose()`, but the runtime object is an - # async generator and supports it. - await cast("AsyncGenerator[Any, None]", agen).aclose() + # in-flight tasks) before we make assertions. + await agen.aclose() assert completed_calls >= 1 assert cancelled_calls == len(ranges) - completed_calls From a04082e0a33442c1b387ea61cae2af96d2a3f236 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 May 2026 11:21:03 +0200 Subject: [PATCH 50/55] apply suggestions from code review --- src/zarr/core/_coalesce.py | 2 +- tests/test_coalesce.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 7fbc35febe..d63c154eb6 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -7,7 +7,7 @@ from zarr.abc.store import RangeByteRequest if TYPE_CHECKING: - from collections.abc import AsyncIterator, Awaitable, Callable, Sequence + from collections.abc import AsyncGenerator, Awaitable, Callable, Sequence from zarr.abc.store import ByteRequest from zarr.core.buffer import Buffer diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 42531ae295..70bb52fc8d 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -3,7 +3,7 @@ import asyncio from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING import pytest @@ -20,7 +20,7 @@ from zarr.core.buffer import Buffer, default_buffer_prototype if TYPE_CHECKING: - from collections.abc import AsyncGenerator, AsyncIterator, Callable, Mapping, Sequence + from collections.abc import AsyncIterator, Callable, Mapping, Sequence def _buf(data: bytes) -> Buffer: @@ -381,8 +381,13 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: # in-flight tasks) before we make assertions. await agen.aclose() - assert completed_calls >= 1 - assert cancelled_calls == len(ranges) - completed_calls + # The fast task completes; the remaining tasks are either cancelled while + # sleeping (raising CancelledError into the user try block) or cancelled + # while still waiting on the semaphore (which doesn't enter the try at all). + # Either way, none of them should be allowed to complete. + assert completed_calls == 1 + assert cancelled_calls >= 1 + assert completed_calls + cancelled_calls <= len(ranges) # --------------------------------------------------------------------------- From 542fefdd8a344db6b5a0fadc0d8e3b8aa37415ef Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 13 May 2026 14:19:01 +0200 Subject: [PATCH 51/55] Update src/zarr/core/_coalesce.py Co-authored-by: Chuck Daniels --- src/zarr/core/_coalesce.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index d63c154eb6..318f019400 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -213,7 +213,6 @@ async def coalesced_get( for fut in asyncio.as_completed(tasks): yield await fut except BaseExceptionGroup as eg: - # Unwrap: prefer a single inner exception, otherwise raise group. + # Filter out GeneratorExits, which should NOT be reraised. if subgroup := eg.subgroup(lambda e: not isinstance(e, GeneratorExit)): - e = subgroup.exceptions[0] if len(subgroup.exceptions) == 1 else subgroup - raise e from None + raise subgroup from None From 2714ebaf46e3cd0cdaff0a467dcb1ca4f92a1c6b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 May 2026 22:27:11 +0200 Subject: [PATCH 52/55] raise exception group --- src/zarr/abc/store.py | 11 ++++++-- src/zarr/core/_coalesce.py | 33 ++++++++++++++-------- tests/test_coalesce.py | 14 ++++----- tests/test_store/test_fsspec_get_ranges.py | 4 +-- tests/test_store/test_get_ranges.py | 4 +-- 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 9dd11d95e1..3247649f10 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -654,8 +654,15 @@ async def get_ranges( Raises ------ - FileNotFoundError - If any underlying fetch returns `None` (i.e. `key` is absent). + BaseExceptionGroup + Failures from underlying fetches are reported as a + `BaseExceptionGroup` (PEP 654) and should be handled with + `except*`. Inner exceptions include `FileNotFoundError` if any + fetch returns `None` (i.e. `key` is absent), and any exception + raised by `self.get` for the corresponding range. Pending + fetches are cancelled as soon as one task fails, so the group + typically contains a single non-`CancelledError` exception even + under high concurrency. """ # Local import: zarr.core._coalesce imports symbols from this module. from zarr.core._coalesce import coalesced_get diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 318f019400..08100e822f 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -181,11 +181,15 @@ async def coalesced_get( - Only `RangeByteRequest` inputs are coalesced. `OffsetByteRequest`, `SuffixByteRequest`, and `None` are each treated as uncoalescable (one fetch, one single-tuple yield per input). - - If any fetch returns `None`, the iterator raises `FileNotFoundError` - after cancelling pending fetches. Groups completed before the miss - remain observable on the yields preceding the raise. - - If a fetch raises, the exception propagates on the yield that produced - the failing group; earlier-completed groups remain observable. + - Failures from underlying fetches surface as a `BaseExceptionGroup` + (PEP 654). Inner exceptions include `FileNotFoundError` if a fetch + returns `None`, plus any exception `fetch` raises. Pending fetches are + cancelled as soon as one task fails, so the group typically contains a + single non-`CancelledError` exception even under high concurrency. + - Groups completed before the failure remain observable on the yields + preceding the raise. + - `GeneratorExit` raised by `aclose()` is filtered out so the iterator + closes cleanly; callers don't see a group containing only it. """ if not byte_ranges: return @@ -199,10 +203,10 @@ async def coalesced_get( ctx = _WorkerCtx(fetch=fetch, semaphore=asyncio.Semaphore(max_concurrency)) # Launch all work as tasks. The semaphore bounds actual I/O concurrency. - # TaskGroup wraps any task exception in BaseExceptionGroup; we unwrap it - # so callers see the underlying error directly (e.g. FileNotFoundError). - # GeneratorExit (raised when the consumer calls aclose()) is also caught - # and re-raised bare so close completes cleanly. + # TaskGroup wraps task exceptions in BaseExceptionGroup; we propagate the + # group unchanged as part of the public contract (callers handle batch + # failures via `except*` / PEP 654). GeneratorExit (raised when the + # consumer calls aclose()) is filtered out so close completes cleanly. try: async with asyncio.TaskGroup() as tg: tasks = [ @@ -213,6 +217,11 @@ async def coalesced_get( for fut in asyncio.as_completed(tasks): yield await fut except BaseExceptionGroup as eg: - # Filter out GeneratorExits, which should NOT be reraised. - if subgroup := eg.subgroup(lambda e: not isinstance(e, GeneratorExit)): - raise subgroup from None + # Strip GeneratorExits (consumer aclose()) and propagate whatever + # remains. `split` is used instead of `subgroup` because the latter + # short-circuits on the group object itself, returning the unchanged + # group when a predicate lambda happens to be true for the wrapper. + _, rest = eg.split(GeneratorExit) + if rest is None: + return # only GeneratorExits — clean close + raise rest from None diff --git a/tests/test_coalesce.py b/tests/test_coalesce.py index 70bb52fc8d..cb8ff29ec7 100644 --- a/tests/test_coalesce.py +++ b/tests/test_coalesce.py @@ -396,10 +396,10 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: async def test_key_missing_from_first_call_raises() -> None: - """If the very first fetch returns None, the iterator raises FileNotFoundError.""" + """If the very first fetch returns None, the iterator raises an ExceptionGroup containing FileNotFoundError.""" fetch = FakeFetch(b"x" * 100, key_exists=False) ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(20, 30)] - with pytest.raises(FileNotFoundError): + with pytest.RaisesGroup(pytest.RaisesExc(FileNotFoundError)): await _collect(coalesced_get(fetch, ranges, **DEFAULT)) @@ -411,9 +411,9 @@ async def test_key_missing_from_first_call_raises() -> None: async def test_key_missing_on_uncoalescable_input_raises( byte_range: ByteRequest | None, ) -> None: - """Uncoalescable inputs take a distinct path; key-missing must still raise.""" + """Uncoalescable inputs take a distinct path; key-missing must still raise (wrapped in a group).""" fetch = FakeFetch(b"x" * 100, key_exists=False) - with pytest.raises(FileNotFoundError): + with pytest.RaisesGroup(pytest.RaisesExc(FileNotFoundError)): await _collect(coalesced_get(fetch, [byte_range], **DEFAULT)) @@ -439,7 +439,7 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: agen = coalesced_get(fetch, ranges, **opts) first = await anext(agen) assert len(first) == 1 - with pytest.raises(FileNotFoundError): + with pytest.RaisesGroup(pytest.RaisesExc(FileNotFoundError)): await anext(agen) @@ -485,7 +485,7 @@ async def fetch(byte_range: ByteRequest | None) -> Buffer | None: assert buf is not None # Now that #0 has yielded, signal the miss task to return None. fire_miss.set() - with pytest.raises(FileNotFoundError): + with pytest.RaisesGroup(pytest.RaisesExc(FileNotFoundError)): await anext(agen) assert miss_fired.is_set() # Sanity: late_gate was never set, so the cancellation path is what completed the test. @@ -509,7 +509,7 @@ async def test_fetch_raises_propagates() -> None: "max_concurrency": 1, } ranges: list[ByteRequest | None] = [RangeByteRequest(0, 10), RangeByteRequest(200, 210)] - with pytest.raises(OSError, match="injected"): + with pytest.RaisesGroup(pytest.RaisesExc(OSError, match="injected")): await _collect(coalesced_get(fetch, ranges, **opts)) diff --git a/tests/test_store/test_fsspec_get_ranges.py b/tests/test_store/test_fsspec_get_ranges.py index 5853953c89..61d834d22f 100644 --- a/tests/test_store/test_fsspec_get_ranges.py +++ b/tests/test_store/test_fsspec_get_ranges.py @@ -70,10 +70,10 @@ async def test_get_ranges_happy_path(memory_store: FsspecStore) -> None: async def test_get_ranges_missing_key_raises(memory_store: FsspecStore) -> None: - """A request against a missing key raises FileNotFoundError.""" + """A request against a missing key raises BaseExceptionGroup containing FileNotFoundError.""" proto = default_buffer_prototype() agen = memory_store.get_ranges("does-not-exist", [RangeByteRequest(0, 10)], prototype=proto) - with pytest.raises(FileNotFoundError): + with pytest.RaisesGroup(pytest.RaisesExc(FileNotFoundError)): await anext(agen) diff --git a/tests/test_store/test_get_ranges.py b/tests/test_store/test_get_ranges.py index 5d0036410c..8f0c6a4814 100644 --- a/tests/test_store/test_get_ranges.py +++ b/tests/test_store/test_get_ranges.py @@ -50,11 +50,11 @@ async def test_memory_store_inherits_get_ranges_from_abc() -> None: async def test_memory_store_get_ranges_missing_key_raises() -> None: - """A missing key on a default-impl store must raise FileNotFoundError, matching the contract.""" + """A missing key on a default-impl store raises BaseExceptionGroup containing FileNotFoundError.""" store = MemoryStore() proto = default_buffer_prototype() agen = store.get_ranges("does-not-exist", [RangeByteRequest(0, 10)], prototype=proto) - with pytest.raises(FileNotFoundError): + with pytest.RaisesGroup(pytest.RaisesExc(FileNotFoundError)): await anext(agen) From 4e29540dcb5e0b1c0e7bf31772bab643f17a9244 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 May 2026 22:27:24 +0200 Subject: [PATCH 53/55] raise exception group --- changes/3925.feature.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/3925.feature.md b/changes/3925.feature.md index 14bda6ba56..ed07be309c 100644 --- a/changes/3925.feature.md +++ b/changes/3925.feature.md @@ -1 +1 @@ -Add `zarr.abc.store.Store.get_ranges` for concurrent, coalesced multi-range reads from a single key. The method is defined on the `Store` ABC with a default implementation built on `Store.get`, so every store inherits a working version; stores with native multi-range backends (e.g. `FsspecStore`) can override for efficiency. Coalescing knobs (`max_concurrency`, `max_gap_bytes`, `max_coalesced_bytes`) are passed as keyword arguments to `get_ranges`. +Add `zarr.abc.store.Store.get_ranges` for concurrent, coalesced multi-range reads from a single key. The method is defined on the `Store` ABC with a default implementation built on `Store.get`, so every store inherits a working version; stores with native multi-range backends (e.g. `FsspecStore`) can override for efficiency. Coalescing knobs (`max_concurrency`, `max_gap_bytes`, `max_coalesced_bytes`) are passed as keyword arguments to `get_ranges`. Failures from underlying fetches surface as a `BaseExceptionGroup` (PEP 654); callers should use `except*` to filter for specific exception types such as `FileNotFoundError`. From 42790d9a7d2a41556d16bd80820cde140b0e576d Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 14 May 2026 14:11:32 +0200 Subject: [PATCH 54/55] Update src/zarr/core/_coalesce.py Co-authored-by: Chuck Daniels --- src/zarr/core/_coalesce.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 08100e822f..7d6cb66d89 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -106,10 +106,8 @@ def coalesce_ranges( `end`) is `<= max_gap_bytes`, and the resulting merged span is `<= max_coalesced_bytes`. """ - indexed: list[tuple[int, ByteRequest | None]] = list(enumerate(byte_ranges)) - mergeable: list[tuple[int, RangeByteRequest]] = [ - (i, r) for i, r in indexed if isinstance(r, RangeByteRequest) - ] + indexed = list(enumerate(byte_ranges)) + mergeable = [(i, r) for i, r in indexed if isinstance(r, RangeByteRequest)] uncoalescable: list[tuple[int, ByteRequest | None]] = [ (i, r) for i, r in indexed if not isinstance(r, RangeByteRequest) ] From 8963ef9a68b4ec5a7f485e35a02bed26d7735ef6 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 14 May 2026 14:11:40 +0200 Subject: [PATCH 55/55] Update src/zarr/core/_coalesce.py Co-authored-by: Chuck Daniels --- src/zarr/core/_coalesce.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/_coalesce.py b/src/zarr/core/_coalesce.py index 7d6cb66d89..bfee9b4052 100644 --- a/src/zarr/core/_coalesce.py +++ b/src/zarr/core/_coalesce.py @@ -215,11 +215,8 @@ async def coalesced_get( for fut in asyncio.as_completed(tasks): yield await fut except BaseExceptionGroup as eg: - # Strip GeneratorExits (consumer aclose()) and propagate whatever - # remains. `split` is used instead of `subgroup` because the latter - # short-circuits on the group object itself, returning the unchanged - # group when a predicate lambda happens to be true for the wrapper. - _, rest = eg.split(GeneratorExit) - if rest is None: - return # only GeneratorExits — clean close - raise rest from None + # Strip GeneratorExits (consumer aclose()) and propagate whatever remains. + _, other_errors = eg.split(GeneratorExit) + + if other_errors is not None: + raise other_errors from None