From 5b84bedfaec9a7721b9b49b0bcd07d9b31acfcc6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 14 May 2026 12:32:28 +0200 Subject: [PATCH 1/3] chore: remove old stringdtype specialized for numpy < 2 --- src/zarr/core/dtype/npy/string.py | 154 ++++++----------------- tests/test_array.py | 16 +-- tests/test_codecs/test_vlen.py | 18 +-- tests/test_dtype/test_npy/test_string.py | 101 +++++---------- tests/test_dtype_registry.py | 2 - 5 files changed, 80 insertions(+), 211 deletions(-) diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 069d0b128d..ce44173bd9 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -39,8 +39,6 @@ from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.wrapper import TBaseDType -_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") - @runtime_checkable class SupportsStr(Protocol): @@ -449,28 +447,31 @@ class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8 """ -# VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy. -# If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length -# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object -# dtype as the native dtype. -class UTF8Base[DType: TBaseDType](ZDType[DType, str], HasObjectCodec): +@dataclass(frozen=True, kw_only=True) +class VariableLengthUTF8(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] """ - A base class for variable-length UTF-8 string data types. + A Zarr data type for arrays containing variable-length UTF-8 strings. + + Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances + of ``str``. - Not intended for direct use, but as a base for concrete implementations. Attributes ---------- - object_codec_id : ClassVar[Literal["vlen-utf8"]] + dtype_cls : Type[np.dtypes.StringDType] + The NumPy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + The name of this data type in Zarr V3. + object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" The object codec ID for this data type. References ---------- - This data type does not have a Zarr V3 specification. + https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/string - The Zarr V2 data type specification can be found [here](https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding). """ + dtype_cls = np.dtypes.StringDType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["string"]] = "string" object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" @@ -478,7 +479,8 @@ class UTF8Base[DType: TBaseDType](ZDType[DType, str], HasObjectCodec): def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ Create an instance of this data type from a compatible NumPy data type. - + We reject NumPy StringDType instances that have the `na_object` field set, + because this is not representable by the Zarr `string` data type. Parameters ---------- @@ -494,13 +496,33 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ------ DataTypeValidationError If the input is not compatible with this data type. + ValueError + If the input is `numpy.dtypes.StringDType` and has `na_object` set. """ if cls._check_native_dtype(dtype): + if hasattr(dtype, "na_object"): + msg = ( + f"Zarr data type resolution from {dtype} failed. " + "Attempted to resolve a zarr data type from a `numpy.dtypes.StringDType` " + "with `na_object` set, which is not supported." + ) + raise ValueError(msg) return cls() raise DataTypeValidationError( f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) + def to_native_dtype(self) -> np.dtypes.StringDType: + """ + Create a NumPy string dtype from this VariableLengthUTF8 ZDType. + + Returns + ------- + np.dtypes.StringDType + The NumPy string dtype. + """ + return self.dtype_cls() + @classmethod def _check_json_v2( cls, @@ -717,109 +739,3 @@ def cast_scalar(self, data: object) -> str: f"data type {self}." ) raise TypeError(msg) # pragma: no cover - - -if _NUMPY_SUPPORTS_VLEN_STRING: - - @dataclass(frozen=True, kw_only=True) - class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] - """ - A Zarr data type for arrays containing variable-length UTF-8 strings. - - Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances - of ``str``. - - - Attributes - ---------- - dtype_cls : Type[np.dtypes.StringDType] - The NumPy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" - The name of this data type in Zarr V3. - object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" - The object codec ID for this data type. - """ - - dtype_cls = np.dtypes.StringDType # type: ignore[assignment] - - @classmethod - def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create an instance of this data type from a compatible NumPy data type. - We reject NumPy StringDType instances that have the `na_object` field set, - because this is not representable by the Zarr `string` data type. - - Parameters - ---------- - dtype : TBaseDType - The native data type. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input is not compatible with this data type. - ValueError - If the input is `numpy.dtypes.StringDType` and has `na_object` set. - """ - if cls._check_native_dtype(dtype): - if hasattr(dtype, "na_object"): - msg = ( - f"Zarr data type resolution from {dtype} failed. " - "Attempted to resolve a zarr data type from a `numpy.dtypes.StringDType` " - "with `na_object` set, which is not supported." - ) - raise ValueError(msg) - return cls() - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> np.dtypes.StringDType: - """ - Create a NumPy string dtype from this VariableLengthUTF8 ZDType. - - Returns - ------- - np.dtypes.StringDType - The NumPy string dtype. - """ - return self.dtype_cls() - -else: - # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. - @dataclass(frozen=True, kw_only=True) - class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] - """ - A Zarr data type for arrays containing variable-length UTF-8 strings. - - Wraps the ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances - of ``str``. - - - Attributes - ---------- - dtype_cls : Type[np.dtypes.ObjectDType] - The NumPy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" - The name of this data type in Zarr V3. - object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" - The object codec ID for this data type. - """ - - dtype_cls = np.dtypes.ObjectDType - - def to_native_dtype(self) -> np.dtypes.ObjectDType: - """ - Create a NumPy object dtype from this VariableLengthUTF8 ZDType. - - Returns - ------- - np.dtypes.ObjectDType - The NumPy object dtype. - """ - return self.dtype_cls() diff --git a/tests/test_array.py b/tests/test_array.py index 51df8d12c6..ff70b7c148 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -69,7 +69,6 @@ ) from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str -from zarr.core.dtype.npy.string import UTF8Base from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, _iter_grid, _iter_regions from zarr.core.metadata.v2 import ArrayV2Metadata @@ -1981,19 +1980,10 @@ def test_array_repr(store: Store) -> None: assert str(arr) == f"" -class UnknownObjectDtype(UTF8Base[np.dtypes.ObjectDType]): - object_codec_id = "unknown" # type: ignore[assignment] - - def to_native_dtype(self) -> np.dtypes.ObjectDType: - """ - Create a NumPy object dtype from this VariableLengthUTF8 ZDType. +class UnknownObjectDtype(VariableLengthUTF8): + """A data type that requires an object codec with an unknown id, used for error-path tests.""" - Returns - ------- - np.dtypes.ObjectDType - The NumPy object dtype. - """ - return np.dtype("o") # type: ignore[return-value] + object_codec_id = "unknown" # type: ignore[assignment] @pytest.mark.parametrize( diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index f3445824b3..3422090a28 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -10,17 +10,19 @@ from zarr.codecs import ZstdCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.storage import StorePath -numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"] -expected_array_string_dtype: np.dtype[Any] -if _NUMPY_SUPPORTS_VLEN_STRING: - numpy_str_dtypes.append(np.dtypes.StringDType) - expected_array_string_dtype = np.dtypes.StringDType() -else: - expected_array_string_dtype = np.dtype("O") +numpy_str_dtypes: list[type | str | None] = [ + None, + str, + "str", + np.dtypes.StrDType, + "S", + "U", + np.dtypes.StringDType, +] +expected_array_string_dtype: np.dtype[Any] = np.dtypes.StringDType() @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 19d202d164..d538c50da9 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -5,80 +5,43 @@ from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype import FixedLengthUTF32 -from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthUTF8 +from zarr.core.dtype.npy.string import VariableLengthUTF8 from zarr.errors import UnstableSpecificationWarning -if _NUMPY_SUPPORTS_VLEN_STRING: - class TestVariableLengthString(BaseTestZDType): - test_cls = VariableLengthUTF8 # type: ignore[assignment] - valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment] - invalid_dtype = ( - np.dtype(np.int8), - np.dtype(np.float64), - np.dtype("|S10"), - ) - valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},) - valid_json_v3 = ("string",) - invalid_json_v2 = ( - "|S10", - "|f8", - "invalid", - ) - invalid_json_v3 = ( - {"name": "variable_length_utf8", "configuration": {"invalid_key": "value"}}, - {"name": "invalid_name"}, - ) - - scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi")) - scalar_v3_params = ( - (VariableLengthUTF8(), ""), - (VariableLengthUTF8(), "hi"), - ) - - cast_value_params = ( - (VariableLengthUTF8(), "", np.str_("")), - (VariableLengthUTF8(), "hi", np.str_("hi")), - ) - # anything can become a string - invalid_scalar_params = (None,) - item_size_params = (VariableLengthUTF8(),) - -else: - - class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] - test_cls = VariableLengthUTF8 # type: ignore[assignment] - valid_dtype = (np.dtype("O"),) - invalid_dtype = ( - np.dtype(np.int8), - np.dtype(np.float64), - np.dtype("|S10"), - ) - valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},) - valid_json_v3 = ("string",) - invalid_json_v2 = ( - "|S10", - "|f8", - "invalid", - ) - invalid_json_v3 = ( - {"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, - {"name": "invalid_name"}, - ) +class TestVariableLengthString(BaseTestZDType): + test_cls = VariableLengthUTF8 # type: ignore[assignment] + valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment] + invalid_dtype = ( + np.dtype(np.int8), + np.dtype(np.float64), + np.dtype("|S10"), + ) + valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},) + valid_json_v3 = ("string",) + invalid_json_v2 = ( + "|S10", + "|f8", + "invalid", + ) + invalid_json_v3 = ( + {"name": "variable_length_utf8", "configuration": {"invalid_key": "value"}}, + {"name": "invalid_name"}, + ) - scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi")) - scalar_v3_params = ( - (VariableLengthUTF8(), ""), - (VariableLengthUTF8(), "hi"), - ) + scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi")) + scalar_v3_params = ( + (VariableLengthUTF8(), ""), + (VariableLengthUTF8(), "hi"), + ) - cast_value_params = ( - (VariableLengthUTF8(), "", np.str_("")), - (VariableLengthUTF8(), "hi", np.str_("hi")), - ) - # anything can become a string - invalid_scalar_params = (None,) - item_size_params = (VariableLengthUTF8(),) + cast_value_params = ( + (VariableLengthUTF8(), "", np.str_("")), + (VariableLengthUTF8(), "hi", np.str_("hi")), + ) + # anything can become a string + invalid_scalar_params = (None,) + item_size_params = (VariableLengthUTF8(),) class TestFixedLengthUTF32(BaseTestZDType): diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index b7ceb502b7..a0cbfb3408 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -15,7 +15,6 @@ get_data_type_from_json, ) from zarr.core.dtype.common import unpack_dtype_json -from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.dtype import ( # type: ignore[attr-defined] Bool, FixedLengthUTF32, @@ -76,7 +75,6 @@ def test_match_dtype( data_type_registry_fixture.register(wrapper_cls._zarr_v3_name, wrapper_cls) assert isinstance(data_type_registry_fixture.match_dtype(np.dtype(dtype_str)), wrapper_cls) - @pytest.mark.skipif(not _NUMPY_SUPPORTS_VLEN_STRING, reason="requires numpy with T dtype") @staticmethod def test_match_dtype_string_na_object_error( data_type_registry_fixture: DataTypeRegistry, From 78255d53b51485dfe65be18a0c29e241dc839406 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 14 May 2026 12:42:07 +0200 Subject: [PATCH 2/3] chore: narrow test fn signature --- tests/test_dtype/test_npy/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index d538c50da9..358a87e4e0 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -94,7 +94,7 @@ class TestFixedLengthUTF32(BaseTestZDType): FixedLengthUTF32(length=10), ], ) -def test_unstable_dtype_warning(zdtype: FixedLengthUTF32 | VariableLengthUTF8) -> None: +def test_unstable_dtype_warning(zdtype: FixedLengthUTF32) -> None: """ Test that we get a warning when serializing a dtype without a zarr v3 spec to json when zarr_format is 3 From a54666991cd0ab5e9b04e459829517865df4ca15 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 14 May 2026 13:56:39 +0200 Subject: [PATCH 3/3] docs: changelog --- changes/3973.removal.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/3973.removal.md diff --git a/changes/3973.removal.md b/changes/3973.removal.md new file mode 100644 index 0000000000..cdb734c7c6 --- /dev/null +++ b/changes/3973.removal.md @@ -0,0 +1 @@ +Removed the NumPy 1.x implementation of the `VariableLengthUTF8` data type. \ No newline at end of file