Merged
Changes from all commits
38 commits
697eb52
(feat): first pass remove dtype + fill val handling per chunk
ilan-gold Nov 2, 2025
1ad5cef
chore: bump `zarrs` to 0.23.0-beta.1
LDeakin Dec 26, 2025
57e2e8f
chore: bump `zarrs` to 0.23.0-beta.2
LDeakin Dec 30, 2025
95f2886
chore: incr to 0.2.2-dev
LDeakin Dec 30, 2025
7f9b244
chore: minimise diff
LDeakin Dec 31, 2025
10b1f4c
chore: bump `zarrs` to 0.23.0-beta.3
LDeakin Dec 31, 2025
ea9c3e5
Merge branch 'main' into ig/refactor_chunk_handling
ilan-gold Dec 31, 2025
9c43e81
feat: upgrade zarr v3
ilan-gold Jan 1, 2026
c6a5839
Revert "chore: incr to 0.2.2-dev"
LDeakin Jan 3, 2026
3cbe4be
fix: unsupported data type tests
LDeakin Jan 3, 2026
50c3560
fix: give a real title to zarr store
ilan-gold Jan 3, 2026
faf922b
fix: don't pass in any metadata
ilan-gold Jan 3, 2026
fc8c057
Merge branch 'ld/zarrs_0.23.0' into ig/refactor_chunk_handling
ilan-gold Jan 3, 2026
c922dd0
fix: warning
ilan-gold Jan 3, 2026
66096a5
fix: cleanups
ilan-gold Jan 3, 2026
dc5e60d
chore: small cleanups
ilan-gold Jan 4, 2026
613033f
chore: use `is_whole_chunk` more
ilan-gold Jan 5, 2026
d25b2f9
chore: bump `zarrs` to 0.23.0-beta.4
LDeakin Jan 9, 2026
53b3af6
Merge branch 'ld/zarrs_0.23.0' into ig/refactor_chunk_handling
ilan-gold Jan 9, 2026
5fe3132
chore: bump `zarrs` to 0.23.0-beta.5
LDeakin Jan 12, 2026
efd4c38
chore: bump `zarrs` to 0.23.0-beta.6
LDeakin Jan 13, 2026
5dfa55d
chore: bump `zarrs` to 0.23.0
LDeakin Feb 1, 2026
2db97c2
Merge remote-tracking branch 'origin/main' into ld/zarrs_0.23.0
LDeakin Feb 1, 2026
f1ee1b7
fix: use `map_py_err` in `WithSubset::new`
LDeakin Feb 1, 2026
ea833f9
Merge branch 'ld/zarrs_0.23.0' into ig/refactor_chunk_handling
ilan-gold Feb 1, 2026
536f5dc
rename: `WithSubset`
ilan-gold Feb 1, 2026
4eca1b8
run on ci while waiting for rustfmt to install
ilan-gold Feb 1, 2026
e262eca
fix: import
ilan-gold Feb 1, 2026
6101bd3
remove unused import
ilan-gold Feb 1, 2026
be5b36a
fix: no fill warning
ilan-gold Feb 1, 2026
5c279d9
key/shape
ilan-gold Feb 1, 2026
6c8c7a5
fix: pyi
ilan-gold Feb 1, 2026
4c0544f
remove old `ValueError`
ilan-gold Feb 1, 2026
de1b2e3
feat: improve data type / fill value incompatibility error
LDeakin Feb 2, 2026
68e0881
merge
ilan-gold Feb 2, 2026
86b8118
v2
ilan-gold Feb 2, 2026
3500c61
Revert "v2"
LDeakin Feb 2, 2026
0812f0a
Merge remote-tracking branch 'origin/main' into ig/refactor_chunk_han…
LDeakin Feb 2, 2026
25 changes: 11 additions & 14 deletions python/zarrs/_internal.pyi
@@ -8,8 +8,15 @@ import numpy.typing
import zarr.abc.store

@typing.final
class Basic:
def __new__(cls, byte_interface: typing.Any, chunk_spec: typing.Any) -> Basic: ...
class ChunkItem:
def __new__(
cls,
key: builtins.str,
chunk_subset: typing.Sequence[slice],
chunk_shape: typing.Sequence[builtins.int],
subset: typing.Sequence[slice],
shape: typing.Sequence[builtins.int],
) -> ChunkItem: ...

@typing.final
class CodecPipelineImpl:
@@ -26,22 +33,12 @@ class CodecPipelineImpl:
) -> CodecPipelineImpl: ...
def retrieve_chunks_and_apply_index(
self,
chunk_descriptions: typing.Sequence[WithSubset],
chunk_descriptions: typing.Sequence[ChunkItem],
value: numpy.typing.NDArray[typing.Any],
) -> None: ...
def store_chunks_with_indices(
self,
chunk_descriptions: typing.Sequence[WithSubset],
chunk_descriptions: typing.Sequence[ChunkItem],
value: numpy.typing.NDArray[typing.Any],
write_empty_chunks: builtins.bool,
) -> None: ...

@typing.final
class WithSubset:
def __new__(
cls,
item: Basic,
chunk_subset: typing.Sequence[slice],
subset: typing.Sequence[slice],
shape: typing.Sequence[builtins.int],
) -> WithSubset: ...
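
For reference, the updated stub collapses the former Basic/WithSubset pair into a single ChunkItem whose constructor takes the store key and the slice selections directly. A minimal construction sketch with purely illustrative values, assuming the built zarrs._internal extension is importable:

from zarrs._internal import ChunkItem

# Hypothetical 2D case: one (64, 64) chunk written into a (128, 128) output array.
item = ChunkItem(
    key="c/0/0",                                # store key of the chunk (illustrative)
    chunk_subset=[slice(0, 64), slice(0, 64)],  # region selected within the chunk
    chunk_shape=[64, 64],                       # full shape of the chunk
    subset=[slice(0, 64), slice(0, 64)],        # matching region of the output array
    shape=[128, 128],                           # shape of the output array
)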
12 changes: 6 additions & 6 deletions python/zarrs/utils.py
@@ -10,7 +10,7 @@
from zarr.core.array_spec import ArraySpec
from zarr.core.indexing import SelectorTuple, is_integer

from zarrs._internal import Basic, WithSubset
from zarrs._internal import ChunkItem

if TYPE_CHECKING:
from collections.abc import Iterable
@@ -148,7 +148,7 @@ def get_implicit_fill_value(dtype: ZDType, fill_value: Any) -> Any:

@dataclass(frozen=True)
class RustChunkInfo:
chunk_info_with_indices: list[WithSubset]
chunk_info_with_indices: list[ChunkItem]
write_empty_chunks: bool


@@ -160,7 +160,7 @@ def make_chunk_info_for_rust_with_indices(
shape: tuple[int, ...],
) -> RustChunkInfo:
shape = shape if shape else (1,) # constant array
chunk_info_with_indices: list[WithSubset] = []
chunk_info_with_indices: list[ChunkItem] = []
write_empty_chunks: bool = True
for (
byte_getter,
@@ -178,7 +178,6 @@
chunk_spec.config,
chunk_spec.prototype,
)
chunk_info = Basic(byte_getter, chunk_spec)
out_selection_as_slices = selector_tuple_to_slice_selection(out_selection)
chunk_selection_as_slices = selector_tuple_to_slice_selection(chunk_selection)
shape_chunk_selection_slices = get_shape_for_selector(
@@ -195,9 +194,10 @@
f"{shape_chunk_selection} != {shape_chunk_selection_slices}"
)
chunk_info_with_indices.append(
WithSubset(
chunk_info,
ChunkItem(
key=byte_getter.path,
chunk_subset=chunk_selection_as_slices,
chunk_shape=chunk_spec.shape,
subset=out_selection_as_slices,
shape=shape,
)
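
The consistency check in make_chunk_info_for_rust_with_indices compares the shape zarr reports for the chunk selection against the shape implied by the slice-based selection. A standalone sketch of that shape computation, my own illustration for already-normalized step-1 slices rather than the helper used in utils.py:

def shape_from_slices(slices: tuple[slice, ...]) -> tuple[int, ...]:
    # Shape implied by a tuple of normalized slices with step 1.
    return tuple(s.stop - s.start for s in slices)

# e.g. a chunk selection covering rows 10..20 and columns 0..5
assert shape_from_slices((slice(10, 20), slice(0, 5))) == (10, 5)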
159 changes: 29 additions & 130 deletions src/chunk_item.rs
@@ -1,175 +1,74 @@
use std::num::NonZeroU64;

use pyo3::{
Bound, PyAny, PyErr, PyResult,
exceptions::{PyIndexError, PyRuntimeError, PyValueError},
Bound, PyErr, PyResult,
exceptions::{PyIndexError, PyValueError},
pyclass, pymethods,
types::{PyAnyMethods, PyBytes, PyBytesMethods, PyInt, PySlice, PySliceMethods as _},
types::{PySlice, PySliceMethods as _},
};
use pyo3_stub_gen::derive::{gen_stub_pyclass, gen_stub_pymethods};
use zarrs::{
array::{ArraySubset, ChunkShape, DataType, FillValue},
metadata::v3::MetadataV3,
storage::StoreKey,
};
use zarrs::{array::ArraySubset, storage::StoreKey};

use crate::utils::PyErrExt;

pub(crate) trait ChunksItem {
fn key(&self) -> &StoreKey;
fn shape(&self) -> &[NonZeroU64];
fn data_type(&self) -> &DataType;
fn fill_value(&self) -> &FillValue;
}

#[derive(Clone)]
#[gen_stub_pyclass]
#[pyclass]
pub(crate) struct Basic {
key: StoreKey,
shape: ChunkShape,
data_type: DataType,
fill_value: FillValue,
}

fn fill_value_to_bytes(dtype: &str, fill_value: &Bound<'_, PyAny>) -> PyResult<Vec<u8>> {
if dtype == "string" {
// Match zarr-python 2.x.x string fill value behaviour with a 0 fill value
// See https://github.com/zarr-developers/zarr-python/issues/2792#issuecomment-2644362122
if let Ok(fill_value_downcast) = fill_value.cast::<PyInt>() {
let fill_value_usize: usize = fill_value_downcast.extract()?;
if fill_value_usize == 0 {
return Ok(vec![]);
}
Err(PyErr::new::<PyValueError, _>(format!(
"Cannot understand non-zero integer {fill_value_usize} fill value for dtype {dtype}"
)))?;
}
}

if let Ok(fill_value_downcast) = fill_value.cast::<PyBytes>() {
Ok(fill_value_downcast.as_bytes().to_vec())
} else if fill_value.hasattr("tobytes")? {
Ok(fill_value.call_method0("tobytes")?.extract()?)
} else {
Err(PyErr::new::<PyValueError, _>(format!(
"Unsupported fill value {fill_value:?}"
)))
}
}

#[gen_stub_pymethods]
#[pymethods]
impl Basic {
#[new]
fn new(byte_interface: &Bound<'_, PyAny>, chunk_spec: &Bound<'_, PyAny>) -> PyResult<Self> {
let path: String = byte_interface.getattr("path")?.extract()?;

let shape: Vec<NonZeroU64> = chunk_spec.getattr("shape")?.extract()?;

let mut dtype: String = chunk_spec
.getattr("dtype")?
.call_method0("to_native_dtype")?
.call_method0("__str__")?
.extract()?;
if dtype == "object" {
// zarrs doesn't understand `object` which is the output of `np.dtype("|O").__str__()`
// but maps it to "string" internally https://github.com/LDeakin/zarrs/blob/0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce/zarrs_metadata/src/v2_to_v3.rs#L288
dtype = String::from("string");
}
let data_type = get_data_type_from_dtype(&dtype)?;
let fill_value: Bound<'_, PyAny> = chunk_spec.getattr("fill_value")?;
let fill_value = FillValue::new(fill_value_to_bytes(&dtype, &fill_value)?);
Ok(Self {
key: StoreKey::new(path).map_py_err::<PyValueError>()?,
shape,
data_type,
fill_value,
fn to_nonzero_u64_vec(v: Vec<u64>) -> PyResult<Vec<NonZeroU64>> {
v.into_iter()
.map(|dim| {
NonZeroU64::new(dim).ok_or_else(|| {
PyErr::new::<PyValueError, _>(
"subset dimensions must be greater than zero".to_string(),
)
})
})
}
.collect::<PyResult<Vec<NonZeroU64>>>()
}

#[derive(Clone)]
#[gen_stub_pyclass]
#[pyclass]
pub(crate) struct WithSubset {
pub item: Basic,
pub(crate) struct ChunkItem {
pub key: StoreKey,
pub chunk_subset: ArraySubset,
pub subset: ArraySubset,
pub shape: Vec<NonZeroU64>,
pub num_elements: u64,
}

#[gen_stub_pymethods]
#[pymethods]
impl WithSubset {
impl ChunkItem {
#[new]
#[allow(clippy::needless_pass_by_value)]
fn new(
item: Basic,
key: String,
chunk_subset: Vec<Bound<'_, PySlice>>,
chunk_shape: Vec<u64>,
subset: Vec<Bound<'_, PySlice>>,
shape: Vec<u64>,
) -> PyResult<Self> {
let chunk_subset = selection_to_array_subset(&chunk_subset, &item.shape)?;
let shape: Vec<NonZeroU64> = shape
.into_iter()
.map(|dim| {
NonZeroU64::new(dim)
.ok_or("subset dimensions must be greater than zero")
.map_py_err::<PyValueError>()
})
.collect::<PyResult<Vec<NonZeroU64>>>()?;
let subset = selection_to_array_subset(&subset, &shape)?;
let num_elements = chunk_shape.iter().product();
let shape_nonzero_u64 = to_nonzero_u64_vec(shape)?;
let chunk_shape_nonzero_u64 = to_nonzero_u64_vec(chunk_shape)?;
let chunk_subset = selection_to_array_subset(&chunk_subset, &chunk_shape_nonzero_u64)?;
let subset = selection_to_array_subset(&subset, &shape_nonzero_u64)?;
// Check that subset and chunk_subset have the same number of elements.
// This permits broadcasting of a constant input.
if subset.num_elements() != chunk_subset.num_elements() && subset.num_elements() > 1 {
return Err(PyErr::new::<PyIndexError, _>(format!(
"the size of the chunk subset {chunk_subset} and input/output subset {subset} are incompatible",
)));
}

Ok(Self {
item,
key: StoreKey::new(key).map_py_err::<PyValueError>()?,
chunk_subset,
subset,
shape: chunk_shape_nonzero_u64,
num_elements,
})
}
}

impl ChunksItem for Basic {
fn key(&self) -> &StoreKey {
&self.key
}
fn shape(&self) -> &[NonZeroU64] {
&self.shape
}
fn data_type(&self) -> &DataType {
&self.data_type
}
fn fill_value(&self) -> &FillValue {
&self.fill_value
}
}

impl ChunksItem for WithSubset {
fn key(&self) -> &StoreKey {
&self.item.key
}
fn shape(&self) -> &[NonZeroU64] {
&self.item.shape
}
fn data_type(&self) -> &DataType {
&self.item.data_type
}
fn fill_value(&self) -> &FillValue {
&self.item.fill_value
}
}

fn get_data_type_from_dtype(dtype: &str) -> PyResult<DataType> {
let data_type =
DataType::from_metadata(&MetadataV3::new(dtype)).map_py_err::<PyRuntimeError>()?;
Ok(data_type)
}

fn slice_to_range(slice: &Bound<'_, PySlice>, length: isize) -> PyResult<std::ops::Range<u64>> {
let indices = slice.indices(length)?;
if indices.start < 0 {
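
ChunkItem::new keeps a broadcast-friendly size check: the input/output subset must either contain the same number of elements as the chunk subset or at most one element, so a constant value can be broadcast into a whole chunk. A Python sketch of the same rule, purely illustrative:

import math

def subsets_compatible(subset_shape, chunk_subset_shape):
    # Mirrors the Rust check: equal element counts, or a <=1-element subset (broadcast).
    n_subset = math.prod(subset_shape)
    n_chunk = math.prod(chunk_subset_shape)
    return n_subset == n_chunk or n_subset <= 1

assert subsets_compatible((1,), (64, 64))        # constant broadcast into a whole chunk
assert subsets_compatible((10, 5), (10, 5))      # matching selections
assert not subsets_compatible((10, 5), (8, 5))   # size mismatch -> IndexError in the Rust code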
11 changes: 4 additions & 7 deletions src/concurrency.rs
@@ -4,7 +4,7 @@ use zarrs::array::{
concurrency::calc_concurrency_outer_inner,
};

use crate::{CodecPipelineImpl, chunk_item::ChunksItem, utils::PyCodecErrExt as _};
use crate::{CodecPipelineImpl, chunk_item::ChunkItem, utils::PyCodecErrExt as _};

pub trait ChunkConcurrentLimitAndCodecOptions {
fn get_chunk_concurrent_limit_and_codec_options(
@@ -13,22 +13,19 @@
) -> PyResult<Option<(usize, CodecOptions)>>;
}

impl<T> ChunkConcurrentLimitAndCodecOptions for Vec<T>
where
T: ChunksItem,
{
impl ChunkConcurrentLimitAndCodecOptions for Vec<ChunkItem> {
fn get_chunk_concurrent_limit_and_codec_options(
&self,
codec_pipeline_impl: &CodecPipelineImpl,
) -> PyResult<Option<(usize, CodecOptions)>> {
let num_chunks = self.len();
let Some(chunk_descriptions0) = self.first() else {
let Some(item) = self.first() else {
return Ok(None);
};

let codec_concurrency = codec_pipeline_impl
.codec_chain
.recommended_concurrency(chunk_descriptions0.shape(), chunk_descriptions0.data_type())
.recommended_concurrency(&item.shape, &codec_pipeline_impl.data_type)
.map_codec_err()?;

let min_concurrent_chunks =