diff --git a/src/neo4j/_typing.py b/src/neo4j/_typing.py
index 1961715c..2dfd9415 100644
--- a/src/neo4j/_typing.py
+++ b/src/neo4j/_typing.py
@@ -30,6 +30,7 @@
     Mapping,
     Sequence,
     Set,
+    Sized,
     ValuesView,
 )
 from importlib.util import find_spec as _find_spec
@@ -79,6 +80,7 @@
     "Protocol",
     "Sequence",
     "Set",
+    "Sized",
     "SupportsIndex",
     "TextIO",
     "TypeAlias",
diff --git a/src/neo4j/vector.py b/src/neo4j/vector.py
index 41a3c2e9..01dfc898 100644
--- a/src/neo4j/vector.py
+++ b/src/neo4j/vector.py
@@ -43,11 +43,9 @@
 
 
 try:
-    from ._rust import vector as _vec_rust
     from ._rust.vector import swap_endian as _swap_endian_unchecked_rust
 except ImportError:
     _swap_endian_unchecked_rust = None
-    _vec_rust = None
 
 
 __all__ = [
@@ -602,11 +600,19 @@ def _swap_endian_unchecked_np(type_size: int, data: bytes, /) -> bytes:
 
 
 def _swap_endian_unchecked_py(type_size: int, data: bytes, /) -> bytes:
-    return bytes(
-        byte
-        for i in range(0, len(data), type_size)
-        for byte in data[i : i + type_size][::-1]
-    )
+    match type_size:
+        case 2:
+            fmt = "h"
+        case 4:
+            fmt = "i"
+        case 8:
+            fmt = "q"
+        case _:
+            raise ValueError(f"Unsupported type size: {type_size}")
+    count = len(data) // type_size
+    fmt_be = f">{count}{fmt}"
+    fmt_le = f"<{count}{fmt}"
+    return _struct.pack(fmt_be, *_struct.unpack(fmt_le, data))
 
 
 if _swap_endian_unchecked_rust is not None:
@@ -777,58 +783,21 @@ class _VecF64(_InnerVectorFloat):
     cypher_inner_type_repr = "FLOAT NOT NULL"
 
     @classmethod
-    def _from_native_rust(cls, data: _t.Iterable[object], /) -> _t.Self:
-        return cls(_vec_rust.vec_f64_from_native(data))
+    def from_native(cls, data: _t.Iterable[object], /) -> _t.Self:
+        if not isinstance(data, _t.Sized):
+            data = tuple(data)
+        if not all(isinstance(item, float) for item in data):
+            for item in data:
+                if not isinstance(item, float):
+                    raise TypeError(
+                        f"Cannot build f64 vector from {type(item).__name__}, "
+                        "expected float."
+                    )
+        return cls(_struct.pack(f">{len(data)}d", *data))
 
-    @classmethod
-    def _from_native_np(cls, data: _t.Iterable[object], /) -> _t.Self:
-        data = tuple(data)
-        non_float_gen = (item for item in data if not isinstance(item, float))
-        non_float = next(non_float_gen, _DEFAULT)
-        if non_float is not _DEFAULT:
-            raise TypeError(
-                f"Cannot build f64 vector from {type(non_float).__name__}, "
-                "expected float."
-            )
-        return cls(_np.fromiter(data, dtype=_np.dtype(">f8")).tobytes())
-
-    @classmethod
-    def _from_native_py(cls, data: _t.Iterable[object], /) -> _t.Self:
-        bytes_ = bytearray()
-        for item in data:
-            if not isinstance(item, float):
-                raise TypeError(
-                    f"Cannot build f64 vector from {type(item).__name__}, "
-                    "expected float."
-                )
-            bytes_.extend(_struct.pack(">d", item))
-        return cls(bytes(bytes_))
-
-    if _vec_rust is not None:
-        from_native = _from_native_rust
-    elif _np is not None:
-        from_native = _from_native_np
-    else:
-        from_native = _from_native_py
-
-    def _to_native_rust(self) -> list[object]:
-        return _vec_rust.vec_f64_to_native(self.data)
-
-    def _to_native_np(self) -> list[object]:
-        return _np.frombuffer(self.data, dtype=_np.dtype(">f8")).tolist()
-
-    def _to_native_py(self) -> list[object]:
-        return [
-            _struct.unpack(">d", self.data[i : i + self.size])[0]
-            for i in range(0, len(self.data), self.size)
-        ]
-
-    if _vec_rust is not None:
-        to_native = _to_native_rust
-    elif _np is not None:
-        to_native = _to_native_np
-    else:
-        to_native = _to_native_py
+    def to_native(self) -> list[object]:
+        struct_format = f">{len(self.data) // self.size}d"
+        return list(_struct.unpack(struct_format, self.data))
 
     def to_numpy(self) -> _np.ndarray:
         import numpy
@@ -852,58 +821,33 @@ class _VecF32(_InnerVectorFloat):
     cypher_inner_type_repr = "FLOAT32 NOT NULL"
 
     @classmethod
-    def _from_native_rust(cls, data: _t.Iterable[object], /) -> _t.Self:
-        return cls(_vec_rust.vec_f32_from_native(data))
+    def from_native(cls, data: _t.Iterable[object], /) -> _t.Self:
+        if not isinstance(data, _t.Sized):
+            data = tuple(data)
+        if not all(isinstance(item, float) for item in data):
+            for item in data:
+                if not isinstance(item, float):
+                    raise TypeError(
+                        f"Cannot build f32 vector from {type(item).__name__}, "
+                        "expected float."
+                    )
+        try:
+            bytes_ = _struct.pack(f">{len(data)}f", *data)
+        except OverflowError:
+            for item in data:
+                try:
+                    _struct.pack(">f", item)
+                except OverflowError:
+                    raise OverflowError(
+                        f"Value {item} is out of range for f32: "
+                        f"[-3.4028234e+38, 3.4028234e+38]"
+                    ) from None
+            raise
+        return cls(bytes_)
 
-    @classmethod
-    def _from_native_np(cls, data: _t.Iterable[object], /) -> _t.Self:
-        data = tuple(data)
-        non_float_gen = (item for item in data if not isinstance(item, float))
-        non_float = next(non_float_gen, _DEFAULT)
-        if non_float is not _DEFAULT:
-            raise TypeError(
-                f"Cannot build f32 vector from {type(non_float).__name__}, "
-                "expected float."
-            )
-        return cls(_np.fromiter(data, dtype=_np.dtype(">f4")).tobytes())
-
-    @classmethod
-    def _from_native_py(cls, data: _t.Iterable[object], /) -> _t.Self:
-        bytes_ = bytearray()
-        for item in data:
-            if not isinstance(item, float):
-                raise TypeError(
-                    f"Cannot build f32 vector from {type(item).__name__}, "
-                    "expected float."
-                )
-            bytes_.extend(_struct.pack(">f", item))
-        return cls(bytes(bytes_))
-
-    if _vec_rust is not None:
-        from_native = _from_native_rust
-    elif _np is not None:
-        from_native = _from_native_np
-    else:
-        from_native = _from_native_py
-
-    def _to_native_rust(self) -> list[object]:
-        return _vec_rust.vec_f32_to_native(self.data)
-
-    def _to_native_np(self) -> list[object]:
-        return _np.frombuffer(self.data, dtype=_np.dtype(">f4")).tolist()
-
-    def _to_native_py(self) -> list[object]:
-        return [
-            _struct.unpack(">f", self.data[i : i + self.size])[0]
-            for i in range(0, len(self.data), self.size)
-        ]
-
-    if _vec_rust is not None:
-        to_native = _to_native_rust
-    elif _np is not None:
-        to_native = _to_native_np
-    else:
-        to_native = _to_native_py
+    def to_native(self) -> list[object]:
+        struct_format = f">{len(self.data) // self.size}f"
+        return list(_struct.unpack(struct_format, self.data))
 
     def to_numpy(self) -> _np.ndarray:
        import numpy
@@ -938,72 +882,29 @@ class _VecI64(_InnerVectorInt):
     cypher_inner_type_repr = "INTEGER NOT NULL"
 
     @classmethod
-    def _from_native_rust(cls, data: _t.Iterable[object], /) -> _t.Self:
-        return cls(_vec_rust.vec_i64_from_native(data))
-
-    @classmethod
-    def _from_native_np(cls, data: _t.Iterable[object], /) -> _t.Self:
-        data = tuple(data)
-        non_int_gen = (item for item in data if not isinstance(item, int))
-        non_int = next(non_int_gen, _DEFAULT)
-        if non_int is not _DEFAULT:
-            raise TypeError(
-                f"Cannot build i64 vector from {type(non_int).__name__}, "
-                "expected int."
-            )
-        data = _t.cast(tuple[int, ...], data)
-        overflow_int = tuple(
-            item for item in data if not _I64_MIN <= item <= _I64_MAX
-        )
-        if overflow_int:
-            raise OverflowError(
-                f"Value {overflow_int[0]} is out of range for i64: "
-                f"[-{_I64_MIN}, {_I64_MAX}]"
-            )
-        return cls(_np.fromiter(data, dtype=_np.dtype(">i8")).tobytes())
-
-    @classmethod
-    def _from_native_py(cls, data: _t.Iterable[object], /) -> _t.Self:
-        bytes_ = bytearray()
-        for item in data:
-            if not isinstance(item, int):
-                raise TypeError(
-                    f"Cannot build i64 vector from {type(item).__name__}, "
-                    "expected int."
-                )
-            if not _I64_MIN <= item <= _I64_MAX:
-                raise OverflowError(
-                    f"Value {item} is out of range for i64: "
-                    f"[-{_I64_MIN}, {_I64_MAX}]"
-                )
-            bytes_.extend(_struct.pack(">q", item))
-        return cls(bytes(bytes_))
-
-    if _vec_rust is not None:
-        from_native = _from_native_rust
-    elif _np is not None:
-        from_native = _from_native_np
-    else:
-        from_native = _from_native_py
-
-    def _to_native_rust(self) -> list[object]:
-        return _vec_rust.vec_i64_to_native(self.data)
-
-    def _to_native_np(self) -> list[object]:
-        return _np.frombuffer(self.data, dtype=_np.dtype(">i8")).tolist()
-
-    def _to_native_py(self) -> list[object]:
-        return [
-            _struct.unpack(">q", self.data[i : i + self.size])[0]
-            for i in range(0, len(self.data), self.size)
-        ]
+    def from_native(cls, data: _t.Iterable[object], /) -> _t.Self:
+        if not isinstance(data, _t.Sized):
+            data = tuple(data)
+        try:
+            bytes_ = _struct.pack(f">{len(data)}q", *data)
+        except _struct.error:
+            for item in data:
+                if not isinstance(item, int):
+                    raise TypeError(
+                        f"Cannot build i64 vector from {type(item).__name__}, "
+                        "expected int."
+                    ) from None
+                if not _I64_MIN <= item <= _I64_MAX:
+                    raise OverflowError(
+                        f"Value {item} is out of range for i64: "
+                        f"[{_I64_MIN}, {_I64_MAX}]"
+                    ) from None
+            raise
+        return cls(bytes_)
 
-    if _vec_rust is not None:
-        to_native = _to_native_rust
-    elif _np is not None:
-        to_native = _to_native_np
-    else:
-        to_native = _to_native_py
+    def to_native(self) -> list[object]:
+        struct_format = f">{len(self.data) // self.size}q"
+        return list(_struct.unpack(struct_format, self.data))
 
     def to_numpy(self) -> _np.ndarray:
         import numpy
@@ -1031,72 +932,29 @@ class _VecI32(_InnerVectorInt):
     cypher_inner_type_repr = "INTEGER32 NOT NULL"
 
     @classmethod
-    def _from_native_rust(cls, data: _t.Iterable[object], /) -> _t.Self:
-        return cls(_vec_rust.vec_i32_from_native(data))
+    def from_native(cls, data: _t.Iterable[object], /) -> _t.Self:
+        if not isinstance(data, _t.Sized):
+            data = tuple(data)
+        try:
+            bytes_ = _struct.pack(f">{len(data)}i", *data)
+        except _struct.error:
+            for item in data:
+                if not isinstance(item, int):
+                    raise TypeError(
+                        f"Cannot build i32 vector from {type(item).__name__}, "
+                        "expected int."
+                    ) from None
+                if not _I32_MIN <= item <= _I32_MAX:
+                    raise OverflowError(
+                        f"Value {item} is out of range for i32: "
+                        f"[{_I32_MIN}, {_I32_MAX}]"
+                    ) from None
+            raise
+        return cls(bytes_)
 
-    @classmethod
-    def _from_native_np(cls, data: _t.Iterable[object], /) -> _t.Self:
-        data = tuple(data)
-        non_int_gen = (item for item in data if not isinstance(item, int))
-        non_int = next(non_int_gen, _DEFAULT)
-        if non_int is not _DEFAULT:
-            raise TypeError(
-                f"Cannot build i32 vector from {type(non_int).__name__}, "
-                "expected int."
-            )
-        data = _t.cast(tuple[int, ...], data)
-        overflow_int = tuple(
-            item for item in data if not _I32_MIN <= item <= _I32_MAX
-        )
-        if overflow_int:
-            raise OverflowError(
-                f"Value {overflow_int[0]} is out of range for i32: "
-                f"[-{_I32_MIN}, {_I32_MAX}]"
-            )
-        return cls(_np.fromiter(data, dtype=_np.dtype(">i4")).tobytes())
-
-    @classmethod
-    def _from_native_py(cls, data: _t.Iterable[object], /) -> _t.Self:
-        bytes_ = bytearray()
-        for item in data:
-            if not isinstance(item, int):
-                raise TypeError(
-                    f"Cannot build i32 vector from {type(item).__name__}, "
-                    "expected int."
-                )
-            if not _I32_MIN <= item <= _I32_MAX:
-                raise OverflowError(
-                    f"Value {item} is out of range for i32: "
-                    f"[-{_I32_MIN}, {_I32_MAX}]"
-                )
-            bytes_.extend(_struct.pack(">i", item))
-        return cls(bytes(bytes_))
-
-    if _vec_rust is not None:
-        from_native = _from_native_rust
-    elif _np is not None:
-        from_native = _from_native_np
-    else:
-        from_native = _from_native_py
-
-    def _to_native_rust(self) -> list[object]:
-        return _vec_rust.vec_i32_to_native(self.data)
-
-    def _to_native_np(self) -> list[object]:
-        return _np.frombuffer(self.data, dtype=_np.dtype(">i4")).tolist()
-
-    def _to_native_py(self) -> list[object]:
-        return [
-            _struct.unpack(">i", self.data[i : i + self.size])[0]
-            for i in range(0, len(self.data), self.size)
-        ]
-
-    if _vec_rust is not None:
-        to_native = _to_native_rust
-    elif _np is not None:
-        to_native = _to_native_np
-    else:
-        to_native = _to_native_py
+    def to_native(self) -> list[object]:
+        struct_format = f">{len(self.data) // self.size}i"
+        return list(_struct.unpack(struct_format, self.data))
 
     def to_numpy(self) -> _np.ndarray:
         import numpy
@@ -1124,72 +982,29 @@ class _VecI16(_InnerVectorInt):
     cypher_inner_type_repr = "INTEGER16 NOT NULL"
 
     @classmethod
-    def _from_native_rust(cls, data: _t.Iterable[object], /) -> _t.Self:
-        return cls(_vec_rust.vec_i16_from_native(data))
-
-    @classmethod
-    def _from_native_np(cls, data: _t.Iterable[object], /) -> _t.Self:
-        data = tuple(data)
-        non_int_gen = (item for item in data if not isinstance(item, int))
-        non_int = next(non_int_gen, _DEFAULT)
-        if non_int is not _DEFAULT:
-            raise TypeError(
-                f"Cannot build i16 vector from {type(non_int).__name__}, "
-                "expected int."
-            )
-        data = _t.cast(tuple[int, ...], data)
-        overflow_int = tuple(
-            item for item in data if not _I16_MIN <= item <= _I16_MAX
-        )
-        if overflow_int:
-            raise OverflowError(
-                f"Value {overflow_int[0]} is out of range for i16: "
-                f"[-{_I16_MIN}, {_I16_MAX}]"
-            )
-        return cls(_np.fromiter(data, dtype=_np.dtype(">i2")).tobytes())
-
-    @classmethod
-    def _from_native_py(cls, data: _t.Iterable[object], /) -> _t.Self:
-        bytes_ = bytearray()
-        for item in data:
-            if not isinstance(item, int):
-                raise TypeError(
-                    f"Cannot build i16 vector from {type(item).__name__}, "
-                    "expected int."
-                )
-            if not _I16_MIN <= item <= _I16_MAX:
-                raise OverflowError(
-                    f"Value {item} is out of range for i16: "
-                    f"[-{_I16_MIN}, {_I16_MAX}]"
-                )
-            bytes_.extend(_struct.pack(">h", item))
-        return cls(bytes(bytes_))
-
-    if _vec_rust is not None:
-        from_native = _from_native_rust
-    elif _np is not None:
-        from_native = _from_native_np
-    else:
-        from_native = _from_native_py
-
-    def _to_native_rust(self) -> list[object]:
-        return _vec_rust.vec_i16_to_native(self.data)
-
-    def _to_native_np(self) -> list[object]:
-        return _np.frombuffer(self.data, dtype=_np.dtype(">i2")).tolist()
-
-    def _to_native_py(self) -> list[object]:
-        return [
-            _struct.unpack(">h", self.data[i : i + self.size])[0]
-            for i in range(0, len(self.data), self.size)
-        ]
+    def from_native(cls, data: _t.Iterable[object], /) -> _t.Self:
+        if not isinstance(data, _t.Sized):
+            data = tuple(data)
+        try:
+            bytes_ = _struct.pack(f">{len(data)}h", *data)
+        except _struct.error:
+            for item in data:
+                if not isinstance(item, int):
+                    raise TypeError(
+                        f"Cannot build i16 vector from {type(item).__name__}, "
+                        "expected int."
+                    ) from None
+                if not _I16_MIN <= item <= _I16_MAX:
+                    raise OverflowError(
+                        f"Value {item} is out of range for i16: "
+                        f"[{_I16_MIN}, {_I16_MAX}]"
+                    ) from None
+            raise
+        return cls(bytes_)
 
-    if _vec_rust is not None:
-        to_native = _to_native_rust
-    elif _np is not None:
-        to_native = _to_native_np
-    else:
-        to_native = _to_native_py
+    def to_native(self) -> list[object]:
+        struct_format = f">{len(self.data) // self.size}h"
+        return list(_struct.unpack(struct_format, self.data))
 
     def to_numpy(self) -> _np.ndarray:
         import numpy
@@ -1217,72 +1032,29 @@ class _VecI8(_InnerVectorInt):
     cypher_inner_type_repr = "INTEGER8 NOT NULL"
 
     @classmethod
-    def _from_native_rust(cls, data: _t.Iterable[object], /) -> _t.Self:
-        return cls(_vec_rust.vec_i8_from_native(data))
-
-    @classmethod
-    def _from_native_np(cls, data: _t.Iterable[object], /) -> _t.Self:
-        data = tuple(data)
-        non_int_gen = (item for item in data if not isinstance(item, int))
-        non_int = next(non_int_gen, _DEFAULT)
-        if non_int is not _DEFAULT:
-            raise TypeError(
-                f"Cannot build i8 vector from {type(non_int).__name__}, "
-                "expected int."
-            )
-        data = _t.cast(tuple[int, ...], data)
-        overflow_int = tuple(
-            item for item in data if not _I8_MIN <= item <= _I8_MAX
-        )
-        if overflow_int:
-            raise OverflowError(
-                f"Value {overflow_int[0]} is out of range for i8: "
-                f"[-{_I8_MIN}, {_I8_MAX}]"
-            )
-        return cls(_np.fromiter(data, dtype=_np.dtype(">i1")).tobytes())
-
-    @classmethod
-    def _from_native_py(cls, data: _t.Iterable[object], /) -> _t.Self:
-        bytes_ = bytearray()
-        for item in data:
-            if not isinstance(item, int):
-                raise TypeError(
-                    f"Cannot build i8 vector from {type(item).__name__}, "
-                    "expected int."
-                )
-            if not _I8_MIN <= item <= _I8_MAX:
-                raise OverflowError(
-                    f"Value {item} is out of range for i8: "
-                    f"[-{_I8_MIN}, {_I8_MAX}]"
-                )
-            bytes_.extend(_struct.pack(">b", item))
-        return cls(bytes(bytes_))
-
-    if _vec_rust is not None:
-        from_native = _from_native_rust
-    elif _np is not None:
-        from_native = _from_native_np
-    else:
-        from_native = _from_native_py
-
-    def _to_native_rust(self) -> list[object]:
-        return _vec_rust.vec_i8_to_native(self.data)
-
-    def _to_native_np(self) -> list[object]:
-        return _np.frombuffer(self.data, dtype=_np.dtype(">i1")).tolist()
-
-    def _to_native_py(self) -> list[object]:
-        return [
-            _struct.unpack(">b", self.data[i : i + self.size])[0]
-            for i in range(0, len(self.data), self.size)
-        ]
+    def from_native(cls, data: _t.Iterable[object], /) -> _t.Self:
+        if not isinstance(data, _t.Sized):
+            data = tuple(data)
+        try:
+            bytes_ = _struct.pack(f">{len(data)}b", *data)
+        except _struct.error:
+            for item in data:
+                if not isinstance(item, int):
+                    raise TypeError(
+                        f"Cannot build i8 vector from {type(item).__name__}, "
+                        "expected int."
+                    ) from None
+                if not _I8_MIN <= item <= _I8_MAX:
+                    raise OverflowError(
+                        f"Value {item} is out of range for i8: "
+                        f"[{_I8_MIN}, {_I8_MAX}]"
+                    ) from None
+            raise
+        return cls(bytes_)
 
-    if _vec_rust is not None:
-        to_native = _to_native_rust
-    elif _np is not None:
-        to_native = _to_native_np
-    else:
-        to_native = _to_native_py
+    def to_native(self) -> list[object]:
+        struct_format = f">{len(self.data) // self.size}b"
+        return list(_struct.unpack(struct_format, self.data))
 
     def to_numpy(self) -> _np.ndarray:
         import numpy
diff --git a/tests/unit/common/vector/test_vector.py b/tests/unit/common/vector/test_vector.py
index 7844517d..d7702b48 100644
--- a/tests/unit/common/vector/test_vector.py
+++ b/tests/unit/common/vector/test_vector.py
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import abc
 import math
 import random
 import struct
@@ -39,6 +40,7 @@
 if t.TYPE_CHECKING:
     import numpy
     import pyarrow
+    from pytest_mock import MockFixture
 
     T_ENDIAN_LITERAL: t.TypeAlias = t.Literal["big", "little"] | VectorEndian
     T_DTYPE_LITERAL: t.TypeAlias = (
@@ -57,6 +59,7 @@
     T_DTYPE_FLOAT_LITERAL: t.TypeAlias = t.Literal[
        "f32", "f64", VectorDType.F32, VectorDType.F64
    ]
+    T_EXT_LITERAL: t.TypeAlias = t.Literal["numpy", "rust", "python"]
 
 
 ENDIAN_LITERALS: tuple[T_ENDIAN_LITERAL, ...] = (
@@ -152,17 +155,56 @@ def _get_type_size(dtype: str) -> t.Literal[1, 2, 4, 8]:
     return lookup[dtype]
 
 
-def _normalize_float_bytes(dtype: str, data: bytes) -> bytes:
-    if dtype not in {"f32", "f64"}:
-        raise ValueError(f"Invalid dtype {dtype}")
-    type_size = _get_type_size(dtype)
-    pack_format = _dtype_to_pack_format(dtype)
-    chunks = (data[i : i + type_size] for i in range(0, len(data), type_size))
-    return bytes(
-        b
-        for chunk in chunks
-        for b in struct.pack(pack_format, struct.unpack(pack_format, chunk)[0])
-    )
+class NormalizableBytes(abc.ABC):
+    @abc.abstractmethod
+    def normalized_bytes(self) -> bytes: ...
+
+    @abc.abstractmethod
+    def raw_bytes(self) -> bytes: ...
+
+
+class Bytes(NormalizableBytes):
+    _data: bytes
+
+    def __init__(self, data: bytes) -> None:
+        self._data = data
+
+    def normalized_bytes(self) -> bytes:
+        return self._data
+
+    def raw_bytes(self) -> bytes:
+        return self._data
+
+
+class Float32NanPayloadBytes(NormalizableBytes):
+    _data: bytes
+
+    def __init__(self, data: bytes) -> None:
+        self._data = data
+
+    def normalized_bytes(self) -> bytes:
+        type_size = _get_type_size("f32")
+        pack_format = _dtype_to_pack_format("f32")
+
+        # Python <3.14 does not preserve NaN payloads on struct pack/unpack
+        # for float32:
+        # https://github.com/python/cpython/issues/130317
+        if sys.version_info >= (3, 14):
+            return self._data
+        chunks = (
+            self._data[i : i + type_size]
+            for i in range(0, len(self._data), type_size)
+        )
+        return bytes(
+            b
+            for chunk in chunks
+            for b in struct.pack(
+                pack_format, struct.unpack(pack_format, chunk)[0]
+            )
+        )
+
+    def raw_bytes(self) -> bytes:
+        return self._data
 
 
 def _dtype_to_pack_format(dtype: str) -> str:
@@ -176,20 +218,15 @@ def _dtype_to_pack_format(dtype: str) -> str:
     }[dtype]
 
 
-def _mock_mask_extensions(mocker, used_ext):
+def _mock_mask_extensions(
+    used_ext: T_EXT_LITERAL, mocker: MockFixture
+) -> None:
     from neo4j.vector import (
         _swap_endian_unchecked_np,
         _swap_endian_unchecked_py,
         _swap_endian_unchecked_rust,
-        _VecF32,
-        _VecF64,
-        _VecI8,
-        _VecI16,
-        _VecI32,
-        _VecI64,
     )
 
-    vec_types = (_VecF64, _VecF32, _VecI64, _VecI32, _VecI16, _VecI8)
     match used_ext:
         case "numpy":
             if _swap_endian_unchecked_np is None:
@@ -198,15 +235,6 @@ def _mock_mask_extensions(mocker, used_ext):
                 "neo4j.vector._swap_endian_unchecked",
                 new=_swap_endian_unchecked_np,
             )
-            for vec_type in vec_types:
-                mocker.patch(
-                    f"neo4j.vector.{vec_type.__name__}.from_native",
-                    new=vec_type._from_native_np,
-                )
-                mocker.patch(
-                    f"neo4j.vector.{vec_type.__name__}.to_native",
-                    new=vec_type._to_native_np,
-                )
         case "rust":
             if _swap_endian_unchecked_rust is None:
                 pytest.skip("rust extensions are not installed")
@@ -214,37 +242,19 @@ def _mock_mask_extensions(mocker, used_ext):
                 "neo4j.vector._swap_endian_unchecked",
                 new=_swap_endian_unchecked_rust,
            )
-            for vec_type in vec_types:
-                mocker.patch(
-                    f"neo4j.vector.{vec_type.__name__}.from_native",
-                    new=vec_type._from_native_rust,
-                )
-                mocker.patch(
-                    f"neo4j.vector.{vec_type.__name__}.to_native",
-                    new=vec_type._to_native_rust,
-                )
         case "python":
             mocker.patch(
                 "neo4j.vector._swap_endian_unchecked",
                 new=_swap_endian_unchecked_py,
             )
-            for vec_type in vec_types:
-                mocker.patch(
-                    f"neo4j.vector.{vec_type.__name__}.from_native",
-                    new=vec_type._from_native_py,
-                )
-                mocker.patch(
-                    f"neo4j.vector.{vec_type.__name__}.to_native",
-                    new=vec_type._to_native_py,
-                )
         case _:
             raise ValueError(f"Invalid ext value {used_ext}")
 
 
 @pytest.mark.parametrize("ext", ("numpy", "rust", "python"))
-def test_swap_endian(mocker, ext):
+def test_swap_endian(mocker: MockFixture, ext: T_EXT_LITERAL) -> None:
     data = bytes(range(1, 17))
-    _mock_mask_extensions(mocker, ext)
+    _mock_mask_extensions(ext, mocker)
     res = _swap_endian(2, data)
     assert isinstance(res, bytes)
     assert res == bytes(
@@ -264,9 +274,11 @@ def test_swap_endian(mocker, ext):
 
 @pytest.mark.parametrize("ext", ("numpy", "rust", "python"))
 @pytest.mark.parametrize("type_size", (-1, 0, 3, 5, 7, 9, 16, 32))
-def test_swap_endian_unhandled_size(mocker, ext, type_size):
+def test_swap_endian_unhandled_size(
+    ext: T_EXT_LITERAL, type_size: int, mocker: MockFixture
+) -> None:
     data = bytes(i % 256 for i in range(1, abs(type_size) * 4))
-    _mock_mask_extensions(mocker, ext)
+    _mock_mask_extensions(ext, mocker)
     with pytest.raises(ValueError, match=str(type_size)):
         _swap_endian(type_size, data)
 
@@ -282,12 +294,12 @@ def test_swap_endian_unhandled_size(mocker, ext, type_size):
         ),
         pytest.param(
             "i8",
-            b"\x01",
+            bytes.fromhex("01"),
             id="i8-single",
         ),
         pytest.param(
             "i8",
-            b"\x01\x02\x03\x04",
+            bytes.fromhex("01020304"),
             id="i8-some",
         ),
         pytest.param(
@@ -302,12 +314,12 @@ def test_swap_endian_unhandled_size(mocker, ext, type_size):
         ),
         pytest.param(
             "i16",
-            b"\x00\x01",
+            bytes.fromhex("0001"),
            id="i16-single",
         ),
         pytest.param(
             "i16",
-            b"\x00\x01\x00\x02",
+            bytes.fromhex("00010002"),
             id="i16-some",
         ),
         pytest.param(
@@ -322,12 +334,12 @@ def test_swap_endian_unhandled_size(mocker, ext, type_size):
         ),
         pytest.param(
             "i32",
-            b"\x00\x00\x00\x01",
+            bytes.fromhex("00000001"),
             id="i32-single",
         ),
         pytest.param(
             "i32",
-            b"\x00\x00\x00\x01\x00\x00\x00\x02",
+            bytes.fromhex("0000000100000002"),
             id="i32-some",
         ),
         pytest.param(
@@ -342,15 +354,12 @@ def test_swap_endian_unhandled_size(mocker, ext, type_size):
         ),
         pytest.param(
             "i64",
-            b"\x00\x00\x00\x00\x00\x00\x00\x01",
+            bytes.fromhex("0000000000000001"),
             id="i64-single",
         ),
         pytest.param(
             "i64",
-            (
-                b"\x00\x00\x00\x00\x00\x00\x00\x01"
-                b"\x00\x00\x00\x00\x00\x00\x00\x02"
-            ),
+            bytes.fromhex("0000000000000001 0000000000000002"),
             id="i64-some",
         ),
         pytest.param(
@@ -426,17 +435,13 @@ def nan_equals(a: list[object], b: list[object]) -> bool:
 
 @pytest.mark.parametrize("dtype", DTYPE_INT_LITERALS)
 @pytest.mark.parametrize(("repeat", "size"), ((10_000, 1), (1, 10_000)))
-@pytest.mark.parametrize("ext", ("numpy", "rust", "python"))
 @pytest.mark.parametrize("use_init", (False, True))
 def test_from_native_int_random(
     dtype: T_DTYPE_INT_LITERAL,
     repeat: int,
     size: int,
-    ext: str,
     use_init: bool,
-    mocker: t.Any,
 ) -> None:
-    _mock_mask_extensions(mocker, ext)
     type_size = _get_type_size(dtype)
     for _ in range(repeat):
         data = _random_value_be_bytes(type_size, size)
@@ -452,24 +457,20 @@ def test_from_native_int_random(
         else:
             v = Vector.from_native(values, dtype)
         expected_raw = data
-        if dtype.startswith("f"):
-            expected_raw = _normalize_float_bytes(dtype, data)
+        if dtype == "f32":
+            expected_raw = Float32NanPayloadBytes(data).normalized_bytes()
         assert v.raw() == expected_raw
 
 
 @pytest.mark.parametrize("dtype", DTYPE_FLOAT_LITERALS)
 @pytest.mark.parametrize(("repeat", "size"), ((10_000, 1), (1, 10_000)))
-@pytest.mark.parametrize("ext", ("numpy", "rust", "python"))
 @pytest.mark.parametrize("use_init", (False, True))
-def test_from_native_floatgst_random(
+def test_from_native_float_random(
     dtype: T_DTYPE_FLOAT_LITERAL,
     repeat: int,
     size: int,
-    ext: str,
     use_init: bool,
-    mocker: t.Any,
 ) -> None:
-    _mock_mask_extensions(mocker, ext)
     type_size = _get_type_size(dtype)
     for _ in range(repeat):
         data = _random_value_be_bytes(type_size, size)
@@ -485,156 +486,212 @@ def test_from_native_float_random(
         else:
             v = Vector.from_native(values, dtype)
         expected_raw = data
-        if dtype.startswith("f"):
-            expected_raw = _normalize_float_bytes(dtype, data)
+        if dtype == "f32":
+            expected_raw = Float32NanPayloadBytes(data).normalized_bytes()
         assert v.raw() == expected_raw
 
 
-SPECIAL_INT_VALUES: tuple[tuple[T_DTYPE_INT_LITERAL, int, bytes], ...] = (
+SPECIAL_INT_VALUES: tuple[
+    tuple[T_DTYPE_INT_LITERAL, int, NormalizableBytes], ...
+] = (
     # (dtype, value, packed_bytes_be)
     # i8
-    ("i8", -128, b"\x80"),
-    ("i8", 0, b"\x00"),
-    ("i8", 127, b"\x7f"),
+    ("i8", -128, Bytes(bytes.fromhex("80"))),
+    ("i8", 0, Bytes(bytes.fromhex("00"))),
+    ("i8", 127, Bytes(bytes.fromhex("7f"))),
     # i16
-    ("i16", -32768, b"\x80\x00"),
-    ("i16", 0, b"\x00\x00"),
-    ("i16", 32767, b"\x7f\xff"),
+    ("i16", -32768, Bytes(bytes.fromhex("8000"))),
+    ("i16", 0, Bytes(bytes.fromhex("0000"))),
+    ("i16", 32767, Bytes(bytes.fromhex("7fff"))),
     # i32
-    ("i32", -2147483648, b"\x80\x00\x00\x00"),
-    ("i32", 0, b"\x00\x00\x00\x00"),
-    ("i32", 2147483647, b"\x7f\xff\xff\xff"),
+    ("i32", -2147483648, Bytes(bytes.fromhex("80000000"))),
+    ("i32", 0, Bytes(bytes.fromhex("00000000"))),
+    ("i32", 2147483647, Bytes(bytes.fromhex("7fffffff"))),
     # i64
-    ("i64", -9223372036854775808, b"\x80\x00\x00\x00\x00\x00\x00\x00"),
-    ("i64", 0, b"\x00\x00\x00\x00\x00\x00\x00\x00"),
-    ("i64", 9223372036854775807, b"\x7f\xff\xff\xff\xff\xff\xff\xff"),
+    ("i64", -9223372036854775808, Bytes(bytes.fromhex("8000000000000000"))),
+    ("i64", 0, Bytes(bytes.fromhex("0000000000000000"))),
+    ("i64", 9223372036854775807, Bytes(bytes.fromhex("7fffffffffffffff"))),
 )
 
 
 SPECIAL_FLOAT_VALUES: tuple[
-    tuple[T_DTYPE_FLOAT_LITERAL, float, bytes], ...
+    tuple[T_DTYPE_FLOAT_LITERAL, float, NormalizableBytes], ...
 ] = (
     # (dtype, value, packed_bytes_be)
     # f32
     # NaN
-    ("f32", float("nan"), b"\x7f\xc0\x00\x00"),
-    ("f32", float("-nan"), b"\xff\xc0\x00\x00"),
     (
         "f32",
-        struct.unpack(">f", b"\x7f\xc0\x00\x11")[0],
-        b"\x7f\xc0\x00\x11",
+        float("nan"),
+        Bytes(bytes.fromhex("7fc00000")),
+    ),
+    (
+        "f32",
+        float("-nan"),
+        Bytes(bytes.fromhex("ffc00000")),
+    ),
+    (
+        "f32",
+        struct.unpack(">f", bytes.fromhex("7fc00011"))[0],
+        Bytes(bytes.fromhex("7fc00011")),
     ),
     (
         "f32",
-        struct.unpack(">f", b"\x7f\x80\x00\x01")[0],
-        # Python < 3.14 does not properly preserver all NaN payload
-        # when calling struct.pack
-        _normalize_float_bytes("f32", b"\x7f\x80\x00\x01"),
+        struct.unpack(">f", bytes.fromhex("7f800001"))[0],
+        Float32NanPayloadBytes(bytes.fromhex("7f800001")),
     ),
     # ±inf
-    ("f32", float("inf"), b"\x7f\x80\x00\x00"),
-    ("f32", float("-inf"), b"\xff\x80\x00\x00"),
+    (
+        "f32",
+        float("inf"),
+        Bytes(bytes.fromhex("7f800000")),
+    ),
+    (
+        "f32",
+        float("-inf"),
+        Bytes(bytes.fromhex("ff800000")),
+    ),
     # ±0.0
-    ("f32", 0.0, b"\x00\x00\x00\x00"),
-    ("f32", -0.0, b"\x80\x00\x00\x00"),
+    (
+        "f32",
+        0.0,
+        Bytes(bytes.fromhex("00000000")),
+    ),
+    (
+        "f32",
+        -0.0,
+        Bytes(bytes.fromhex("80000000")),
+    ),
     # smallest normal
     (
         "f32",
-        struct.unpack(">f", b"\x00\x80\x00\x00")[0],
-        b"\x00\x80\x00\x00",
+        struct.unpack(">f", bytes.fromhex("00800000"))[0],
+        Bytes(bytes.fromhex("00800000")),
     ),
     (
         "f32",
-        struct.unpack(">f", b"\x80\x80\x00\x00")[0],
-        b"\x80\x80\x00\x00",
+        struct.unpack(">f", bytes.fromhex("80800000"))[0],
+        Bytes(bytes.fromhex("80800000")),
     ),
     # subnormal
     (
         "f32",
-        struct.unpack(">f", b"\x00\x00\x00\x01")[0],
-        b"\x00\x00\x00\x01",
+        struct.unpack(">f", bytes.fromhex("00000001"))[0],
+        Bytes(bytes.fromhex("00000001")),
     ),
     (
         "f32",
-        struct.unpack(">f", b"\x80\x00\x00\x01")[0],
-        b"\x80\x00\x00\x01",
+        struct.unpack(">f", bytes.fromhex("80000001"))[0],
+        Bytes(bytes.fromhex("80000001")),
     ),
     # largest normal
     (
         "f32",
-        struct.unpack(">f", b"\x7f\x7f\xff\xff")[0],
-        b"\x7f\x7f\xff\xff",
+        struct.unpack(">f", bytes.fromhex("7f7fffff"))[0],
+        Bytes(bytes.fromhex("7f7fffff")),
+    ),
+    (
+        "f32",
+        struct.unpack(">f", bytes.fromhex("ff7fffff"))[0],
+        Bytes(bytes.fromhex("ff7fffff")),
     ),
+    # very small f64 being rounded to ±0 in f32
     (
         "f32",
-        struct.unpack(">f", b"\xff\x7f\xff\xff")[0],
-        b"\xff\x7f\xff\xff",
+        struct.unpack(">d", bytes.fromhex("3686d601ad376ab9"))[0],
+        Bytes(bytes.fromhex("00000000")),
+    ),
+    (
+        "f32",
+        struct.unpack(">d", bytes.fromhex("b686d601ad376ab9"))[0],
+        Bytes(bytes.fromhex("80000000")),
     ),
     # f64
     # NaN
-    ("f64", float("nan"), b"\x7f\xf8\x00\x00\x00\x00\x00\x00"),
-    ("f64", float("-nan"), b"\xff\xf8\x00\x00\x00\x00\x00\x00"),
     (
         "f64",
-        struct.unpack(">d", b"\x7f\xf8\x00\x00\x00\x00\x00\x11")[0],
-        b"\x7f\xf8\x00\x00\x00\x00\x00\x11",
+        float("nan"),
+        Bytes(bytes.fromhex("7ff8000000000000")),
+    ),
+    (
+        "f64",
+        float("-nan"),
+        Bytes(bytes.fromhex("fff8000000000000")),
+    ),
+    (
+        "f64",
+        struct.unpack(">d", bytes.fromhex("7ff8000000000011"))[0],
+        Bytes(bytes.fromhex("7ff8000000000011")),
     ),
     (
         "f64",
-        struct.unpack(">d", b"\x7f\xf0\x00\x01\x00\x00\x00\x01")[0],
-        b"\x7f\xf0\x00\x01\x00\x00\x00\x01",
+        struct.unpack(">d", bytes.fromhex("7ff0000100000001"))[0],
+        Bytes(bytes.fromhex("7ff0000100000001")),
     ),
     # ±inf
-    ("f64", float("inf"), b"\x7f\xf0\x00\x00\x00\x00\x00\x00"),
-    ("f64", float("-inf"), b"\xff\xf0\x00\x00\x00\x00\x00\x00"),
+    (
+        "f64",
+        float("inf"),
+        Bytes(bytes.fromhex("7ff0000000000000")),
+    ),
+    (
+        "f64",
+        float("-inf"),
+        Bytes(bytes.fromhex("fff0000000000000")),
+    ),
     # ±0.0
-    ("f64", 0.0, b"\x00\x00\x00\x00\x00\x00\x00\x00"),
-    ("f64", -0.0, b"\x80\x00\x00\x00\x00\x00\x00\x00"),
+    (
+        "f64",
+        0.0,
+        Bytes(bytes.fromhex("0000000000000000")),
+    ),
+    (
+        "f64",
+        -0.0,
+        Bytes(bytes.fromhex("8000000000000000")),
+    ),
     # smallest normal
     (
         "f64",
-        struct.unpack(">d", b"\x00\x10\x00\x00\x00\x00\x00\x00")[0],
-        b"\x00\x10\x00\x00\x00\x00\x00\x00",
+        struct.unpack(">d", bytes.fromhex("0010000000000000"))[0],
+        Bytes(bytes.fromhex("0010000000000000")),
     ),
     (
         "f64",
-        struct.unpack(">d", b"\x80\x10\x00\x00\x00\x00\x00\x00")[0],
-        b"\x80\x10\x00\x00\x00\x00\x00\x00",
+        struct.unpack(">d", bytes.fromhex("8010000000000000"))[0],
+        Bytes(bytes.fromhex("8010000000000000")),
     ),
     # subnormal
     (
         "f64",
-        struct.unpack(">d", b"\x00\x00\x00\x00\x00\x00\x00\x01")[0],
-        b"\x00\x00\x00\x00\x00\x00\x00\x01",
+        struct.unpack(">d", bytes.fromhex("0000000000000001"))[0],
+        Bytes(bytes.fromhex("0000000000000001")),
     ),
     (
         "f64",
-        struct.unpack(">d", b"\x80\x00\x00\x00\x00\x00\x00\x01")[0],
-        b"\x80\x00\x00\x00\x00\x00\x00\x01",
+        struct.unpack(">d", bytes.fromhex("8000000000000001"))[0],
+        Bytes(bytes.fromhex("8000000000000001")),
     ),
     # largest normal
     (
         "f64",
-        struct.unpack(">d", b"\x7f\xef\xff\xff\xff\xff\xff\xff")[0],
-        b"\x7f\xef\xff\xff\xff\xff\xff\xff",
+        struct.unpack(">d", bytes.fromhex("7fefffffffffffff"))[0],
+        Bytes(bytes.fromhex("7fefffffffffffff")),
     ),
     (
         "f64",
-        struct.unpack(">d", b"\xff\xef\xff\xff\xff\xff\xff\xff")[0],
-        b"\xff\xef\xff\xff\xff\xff\xff\xff",
+        struct.unpack(">d", bytes.fromhex("ffefffffffffffff"))[0],
+        Bytes(bytes.fromhex("ffefffffffffffff")),
     ),
 )
 
 
 SPECIAL_VALUES = SPECIAL_INT_VALUES + SPECIAL_FLOAT_VALUES
 
 
-@pytest.mark.parametrize(("dtype", "value", "data_be"), SPECIAL_VALUES)
-@pytest.mark.parametrize("ext", ("numpy", "rust", "python"))
+@pytest.mark.parametrize(("dtype", "value", "data_be_raw"), SPECIAL_VALUES)
 def test_from_native_special_values(
     dtype: t.Literal["i8", "i16", "i32", "i64", "f32", "f64"],
     value: object,
-    data_be: bytes,
-    ext: str,
-    mocker: t.Any,
+    data_be_raw: NormalizableBytes,
 ) -> None:
-    _mock_mask_extensions(mocker, ext)
+    data_be = data_be_raw.normalized_bytes()
     if dtype in {"f32", "f64"}:
         assert isinstance(value, float)
         dtype_f = t.cast(t.Literal["f32", "f64"], dtype)
@@ -671,14 +728,10 @@ def test_from_native_special_values(
         ("f64", 1),
     ),
 )
-@pytest.mark.parametrize("ext", ("numpy", "rust", "python"))
 def test_from_native_wrong_type(
     dtype: t.Literal["i8", "i16", "i32", "i64", "f32", "f64"],
     value: object,
-    ext: str,
-    mocker: t.Any,
 ) -> None:
-    _mock_mask_extensions(mocker, ext)
     with pytest.raises(TypeError) as exc:
         Vector.from_native([value], dtype)  # type: ignore
 
@@ -697,16 +750,22 @@ def test_from_native_wrong_type(
         ("i32", 2147483648),
         ("i64", -9223372036854775809),
         ("i64", 9223372036854775808),
+        # positive value, positive exponent overflow
+        ("f32", struct.unpack(">d", bytes.fromhex("47f0000020000000"))[0]),
+        # negative value, positive exponent overflow
+        ("f32", struct.unpack(">d", bytes.fromhex("c7f0000020000000"))[0]),
+        # no such thing as negative exponent overflow:
+        # very small values become 0.0
+        # positive value, positive exponent, mantissa overflow
+        ("f32", struct.unpack(">d", bytes.fromhex("47effffff0000000"))[0]),
+        # negative value, positive exponent, mantissa overflow
+        ("f32", struct.unpack(">d", bytes.fromhex("c7effffff0000000"))[0]),
     ),
 )
-@pytest.mark.parametrize("ext", ("numpy", "rust", "python"))
 def test_from_native_overflow(
     dtype: t.Literal["i8", "i16", "i32", "i64", "f32", "f64"],
     value: object,
-    ext: str,
-    mocker: t.Any,
 ) -> None:
-    _mock_mask_extensions(mocker, ext)
     with pytest.raises(OverflowError) as exc:
         Vector.from_native([value], dtype)  # type: ignore
 
@@ -759,12 +818,13 @@ def test_to_native_random(
     assert nan_equals(v.to_native(), expected)
 
 
-@pytest.mark.parametrize(("dtype", "value", "data_be"), SPECIAL_VALUES)
+@pytest.mark.parametrize(("dtype", "value", "data_be_raw"), SPECIAL_VALUES)
 def test_to_native_special_values(
     dtype: t.Literal["i8", "i16", "i32", "i64", "f32", "f64"],
     value: object,
-    data_be: bytes,
+    data_be_raw: NormalizableBytes,
 ) -> None:
+    data_be = data_be_raw.raw_bytes()
     type_size = _get_type_size(dtype)
     pack_format = _dtype_to_pack_format(dtype)
     expected = [
@@ -829,14 +889,15 @@ def test_from_numpy_random(
 
 
 @pytest.mark.skipif(np is None, reason="numpy not installed")
-@pytest.mark.parametrize(("dtype", "value", "data_be"), SPECIAL_VALUES)
+@pytest.mark.parametrize(("dtype", "value", "data_be_raw"), SPECIAL_VALUES)
 @pytest.mark.parametrize("endian", ("big", "little", "native"))
 def test_from_numpy_special_values(
     dtype: t.Literal["i8", "i16", "i32", "i64", "f32", "f64"],
     endian: t.Literal["big", "little", "native"],
     value: object,
-    data_be: bytes,
+    data_be_raw: NormalizableBytes,
 ) -> None:
+    data_be = data_be_raw.raw_bytes()
     array = _get_numpy_array(data_be, dtype, endian)
     v = Vector.from_numpy(array)
     assert v.dtype == dtype
@@ -873,7 +934,7 @@ def test_to_numpy_random(
 
 
 @pytest.mark.skipif(np is None, reason="numpy not installed")
-@pytest.mark.parametrize(("dtype", "value", "data_be"), SPECIAL_VALUES)
+@pytest.mark.parametrize(("dtype", "value", "data_be_raw"), SPECIAL_VALUES)
 @pytest.mark.parametrize(
     "endian",
     (
@@ -885,8 +946,9 @@ def test_to_numpy_special_values(
     dtype: t.Literal["i8", "i16", "i32", "i64", "f32", "f64"],
     endian: T_ENDIAN_LITERAL | None,
     value: object,
-    data_be: bytes,
+    data_be_raw: NormalizableBytes,
 ) -> None:
+    data_be = data_be_raw.raw_bytes()
     np_type = _get_numpy_dtype(dtype)
     v = _vector_from_data(data_be, dtype, endian)
     array = v.to_numpy()
@@ -942,12 +1004,13 @@ def test_from_pyarrow_random(
 
 
 @pytest.mark.skipif(pa is None, reason="pyarrow not installed")
-@pytest.mark.parametrize(("dtype", "value", "data_be"), SPECIAL_VALUES)
+@pytest.mark.parametrize(("dtype", "value", "data_be_raw"), SPECIAL_VALUES)
 def test_from_pyarrow_special_values(
     dtype: t.Literal["i8", "i16", "i32", "i64", "f32", "f64"],
     value: object,
-    data_be: bytes,
+    data_be_raw: NormalizableBytes,
 ) -> None:
+    data_be = data_be_raw.raw_bytes()
     array = _get_pyarrow_array(data_be, dtype)
     v = Vector.from_pyarrow(array)
     assert v.dtype == dtype
@@ -990,7 +1053,7 @@ def test_to_pyarrow_random(
 
 
 @pytest.mark.skipif(pa is None, reason="pyarrow not installed")
-@pytest.mark.parametrize(("dtype", "value", "data_be"), SPECIAL_VALUES)
+@pytest.mark.parametrize(("dtype", "value", "data_be_raw"), SPECIAL_VALUES)
 @pytest.mark.parametrize(
     "endian",
     (
@@ -1002,8 +1065,9 @@ def test_to_pyarrow_special_values(
     dtype: t.Literal["i8", "i16", "i32", "i64", "f32", "f64"],
     endian: T_ENDIAN_LITERAL | None,
     value: object,
-    data_be: bytes,
+    data_be_raw: NormalizableBytes,
 ) -> None:
+    data_be = data_be_raw.raw_bytes()
     type_size = _get_type_size(dtype)
     data_ne = data_be
     if sys.byteorder == "little":
@@ -1021,31 +1085,26 @@ def test_to_pyarrow_special_values(
 
 
 @pytest.mark.parametrize(
-    ("vector", "expected"),
+    "vector",
    (
-        (Vector([], "i8"), "Vector(b'', 'i8')"),
-        (Vector([], "i16"), "Vector(b'', 'i16')"),
-        (Vector([], "i32"), "Vector(b'', 'i32')"),
-        (Vector([], "i64"), "Vector(b'', 'i64')"),
-        (Vector([], "f32"), "Vector(b'', 'f32')"),
-        (Vector([], "f64"), "Vector(b'', 'f64')"),
+        Vector([], "i8"),
+        Vector([], "i16"),
+        Vector([], "i32"),
+        Vector([], "i64"),
+        Vector([], "f32"),
+        Vector([], "f64"),
         *(
-            (
-                Vector([value], dtype),
-                f"Vector({packed_bytes_be!r}, {dtype!r})",
-            )
-            for (dtype, value, packed_bytes_be) in SPECIAL_INT_VALUES
+            Vector([value], dtype)
+            for (dtype, value, packed_bytes_be_) in SPECIAL_INT_VALUES
         ),
         *(
-            (
-                Vector([value], dtype),
-                f"Vector({packed_bytes_be!r}, {dtype!r})",
-            )
-            for (dtype, value, packed_bytes_be) in SPECIAL_FLOAT_VALUES
+            Vector([value], dtype)
+            for (dtype, value, packed_bytes_be_) in SPECIAL_FLOAT_VALUES
        ),
     ),
 )
-def test_vector_repr(vector: Vector, expected: str) -> None:
+def test_vector_repr(vector: Vector) -> None:
+    expected = f"Vector({vector.raw()!r}, {vector.dtype.value!r})"
     assert repr(vector) == expected
 
 
@@ -1079,12 +1138,16 @@ def _dtype_to_cypher_type(dtype: T_DTYPE_LITERAL) -> str:
     }[dtype]
 
 
-def _vec_element_cypher_repr(value: t.Any) -> str:
-    if isinstance(value, float):
+def _vec_element_cypher_repr(value: t.Any, dtype: T_DTYPE_LITERAL) -> str:
+    if isinstance(value, float) and dtype in {"f32", "f64"}:
         if math.isnan(value):
             return "NaN"
         if math.isinf(value):
             return "Infinity" if value > 0 else "-Infinity"
+        if dtype == "f32":
+            # account for float32 precision loss
+            compressed = struct.unpack(">f", struct.pack(">f", value))[0]
+            return repr(compressed)
     return repr(value)
 
 
@@ -1101,7 +1164,7 @@ def _vec_element_cypher_repr(value: t.Any) -> str:
        (
             Vector([value], dtype),
             (
-                f"vector([{_vec_element_cypher_repr(value)}], 1, "
+                f"vector([{_vec_element_cypher_repr(value, dtype)}], 1, "
                 f"{_dtype_to_cypher_type(dtype)})"
             ),
         )
@@ -1111,7 +1174,7 @@ def _vec_element_cypher_repr(value: t.Any) -> str:
         (
             Vector([value], dtype),
             (
-                f"vector([{_vec_element_cypher_repr(value)}], 1, "
+                f"vector([{_vec_element_cypher_repr(value, dtype)}], 1, "
                 f"{_dtype_to_cypher_type(dtype)})"
             ),
         )
@@ -1135,8 +1198,9 @@ def test_vector_str_random(
     for _ in range(repeat):
         data = _random_value_be_bytes(type_size, size)
         v = Vector(data, dtype)
-        values_repr = (
-            f"[{', '.join(map(_vec_element_cypher_repr, v.to_native()))}]"
+        values_reprs = (
+            _vec_element_cypher_repr(value, dtype) for value in v.to_native()
         )
+        values_repr = f"[{', '.join(values_reprs)}]"
         expected = f"vector({values_repr}, {size}, {cypher_dtype})"
         assert str(v) == expected
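Reviewer note (illustrative sketch only, not part of the patch): the recurring technique in this diff is replacing per-element Python loops with a single bulk struct pack/unpack call, both for the pure-Python endian swap and for the from_native()/to_native() conversions. The standalone snippet below shows the same idea using only the standard library; the names swap_endian_bulk and pack_f64_vector are hypothetical and do not exist in the driver.

import struct

def swap_endian_bulk(type_size: int, data: bytes) -> bytes:
    # One unpack/pack round trip swaps the byte order of every element at once.
    fmt = {2: "h", 4: "i", 8: "q"}.get(type_size)
    if fmt is None:
        raise ValueError(f"Unsupported type size: {type_size}")
    count = len(data) // type_size
    return struct.pack(f">{count}{fmt}", *struct.unpack(f"<{count}{fmt}", data))

def pack_f64_vector(values: list[float]) -> bytes:
    # Bulk-pack all float64 values big-endian in a single call, mirroring the
    # shape of the from_native() rewrite above (type checking kept explicit,
    # since struct would silently accept ints for the "d" format).
    if not all(isinstance(v, float) for v in values):
        raise TypeError("expected float values")
    return struct.pack(f">{len(values)}d", *values)

# Example: two 16-bit values 0x0102 and 0x0304, byte-swapped in one call.
assert swap_endian_bulk(2, bytes([1, 2, 3, 4])) == bytes([2, 1, 4, 3])
assert pack_f64_vector([1.0]) == bytes.fromhex("3ff0000000000000")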