Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,33 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

-------------------------------

This product depends on xxhash (the python-xxhash package, https://github.com/ifduyue/python-xxhash),
which is licensed under the BSD 2-Clause License.

BSD 2-Clause License

Copyright (c) 2014-2024, Yue Du
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45 changes: 31 additions & 14 deletions hamilton/caching/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@
import base64
import datetime
import functools
import hashlib
import logging
import sys
from collections.abc import Mapping, Sequence, Set

import xxhash

from hamilton.experimental import h_databackends

# NoneType is introduced in Python 3.10
Expand Down Expand Up @@ -77,12 +78,16 @@ def _compact_hash(digest: bytes) -> str:


def _hash_bytes(data: bytes) -> str:
    """Hash raw bytes with xxh3_128 and compact-encode the digest.

    All hashing in this module routes through this single helper so the
    underlying hashing algorithm can be changed in exactly one place.

    xxh3_128 is a non-cryptographic hash, so the result is a fingerprint and
    must not be relied on for anything security-sensitive. It produces a
    16-byte digest (24 base64url chars, the same width as the md5 digest it
    replaces) and was chosen for speed on buffer-bound paths.

    NOTE: ``xxhash.xxh3_128`` is only available in python-xxhash >= 2.0.0.

    :param data: raw bytes to fingerprint.
    :return: the digest encoded by ``_compact_hash``.
    """
    return _compact_hash(xxhash.xxh3_128(data).digest())


@functools.singledispatch
Expand Down Expand Up @@ -249,28 +254,40 @@ def hash_set(obj, *args, depth: int = 0, **kwargs) -> str:
@hash_value.register(h_databackends.AbstractPandasDataFrame)
@hash_value.register(h_databackends.AbstractPandasColumn)
def hash_pandas_obj(obj, *args, depth: int = 0, **kwargs) -> str:
    """Hash a pandas DataFrame, Series, or Index via vectorized row hashing.

    ``pandas.util.hash_pandas_object`` computes a uint64 hash per row in a
    single vectorized pass; that buffer is hashed in one shot rather than
    iterating over rows in Python. With its default arguments the index
    values take part in each row hash.

    Column names and dtypes (the schema) are folded in so that frames
    carrying identical cell values under different schemas do not collide.
    Objects without a ``columns`` attribute (Series, Index) contribute their
    ``name`` and ``dtype`` instead.

    The hash is order-sensitive: reordering rows changes the per-row hash
    buffer and therefore the fingerprint.

    :param obj: the pandas object to fingerprint.
    :param depth: accepted for signature parity with the other ``hash_value``
        handlers; not used here because nothing is hashed recursively.
    :return: compact fingerprint string produced by ``_hash_bytes``.
    """
    # Imported lazily so the import only runs when a pandas object is hashed.
    from pandas.util import hash_pandas_object

    row_hashes = hash_pandas_object(obj).values.tobytes()
    if hasattr(obj, "columns"):
        # DataFrame: one name and one dtype per column.
        schema = f"{list(obj.columns)}:{[str(dtype) for dtype in obj.dtypes]}"
    else:
        # Series / Index: a single (possibly missing) name and a single dtype.
        schema = f"{getattr(obj, 'name', None)}:{obj.dtype}"
    return _hash_bytes(schema.encode() + row_hashes)


@hash_value.register(h_databackends.AbstractPolarsDataFrame)
def hash_polars_dataframe(obj, *args, depth: int = 0, **kwargs) -> str:
    """Hash a polars DataFrame via vectorized row hashing.

    ``DataFrame.hash_rows`` computes a per-row hash in a single vectorized
    pass; that buffer (``to_numpy().tobytes()``) is hashed in one shot rather
    than iterating element-by-element in Python. Column names and dtypes
    (the schema) are hashed separately and folded in so frames carrying
    identical cell values under different schemas do not collide.

    NOTE(review): the polars documentation states that ``hash_rows`` output
    is only guaranteed stable within a single polars version, so
    fingerprints may change after a polars upgrade -- confirm this is
    acceptable for persisted caches.

    :param obj: the polars DataFrame to fingerprint.
    :param depth: accepted for signature parity with the other ``hash_value``
        handlers; not used here because nothing is hashed recursively.
    :return: compact fingerprint string produced by ``_hash_bytes``.
    """
    schema_str = ",".join(f"{name}:{dtype}" for name, dtype in obj.schema.items())
    schema_hash = hash_bytes(schema_str.encode())
    row_hash = hash_bytes(obj.hash_rows().to_numpy().tobytes())
    # Both parts are fixed-width digests, so plain concatenation is unambiguous.
    return _hash_bytes(schema_hash.encode() + row_hash.encode())


Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ dependencies = [
"pandas",
"typing_extensions > 4.0.0",
"typing_inspect",
"xxhash>=2.0.0",
]

[project.optional-dependencies]
Expand Down
53 changes: 41 additions & 12 deletions tests/caching/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,11 @@ def __init__(self, obj):
@pytest.mark.parametrize(
("obj", "expected_hash"),
[
("hello-world", "L1Q1Kh6_t1atHO_H8RbBeA=="),
(17.31231, "mJPTpPyXDSZgU-u8NuztIQ=="),
(16474, "6MgAp1NbMW0ZZpe_8iKVsg=="),
(True, "J2eGynSuIpd5bwVQzO9VVg=="),
(b"\x951!\x89u=\xe6\xadG\xdf", "d1DufDgRQmqi9Kt4Z2PeUQ=="),
("hello-world", "EXXR8_e47ElS18aP2lThJA=="),
(17.31231, "tVUSIslYiBcW52c-7w4gvA=="),
(16474, "FAJ-iXM_Hwg9TCRreY8AyA=="),
(True, "qkJEg3-XQKmGWk5sWqmonw=="),
(b"\x951!\x89u=\xe6\xadG\xdf", "pPTyYkSU_x7NLB1Fp_YTyA=="),
],
)
def test_hash_primitive(obj, expected_hash):
Expand All @@ -161,8 +161,8 @@ def test_hash_primitive(obj, expected_hash):
@pytest.mark.parametrize(
("obj", "expected_hash"),
[
([0, True, "hello-world"], "mlOjj4yeCrSDFSn5zgdEIg=="),
((17.0, False, "world"), "BcRSGfyKeIOdym9B6TmAyQ=="),
([0, True, "hello-world"], "I98OkNhfxtScJrYNTs4ZfQ=="),
((17.0, False, "world"), "catgOMSnsbQj1_KELNQscw=="),
],
)
def test_hash_sequence(obj, expected_hash):
Expand All @@ -173,7 +173,7 @@ def test_hash_sequence(obj, expected_hash):
def test_hash_equals_for_different_sequence_types():
list_obj = [0, True, "hello-world"]
tuple_obj = (0, True, "hello-world")
expected_hash = "mlOjj4yeCrSDFSn5zgdEIg=="
expected_hash = "I98OkNhfxtScJrYNTs4ZfQ=="

list_fingerprint = fingerprinting.hash_sequence(list_obj)
tuple_fingerprint = fingerprinting.hash_sequence(tuple_obj)
Expand All @@ -182,7 +182,7 @@ def test_hash_equals_for_different_sequence_types():

def test_hash_ordered_mapping():
    """Pin the fingerprint of a mixed-key mapping hashed with order preserved."""
    obj = {0: True, "key": "value", 17.0: None}
    expected_hash = "zX6MzhWGAOvxateHIPxOvA=="
    fingerprint = fingerprinting.hash_mapping(obj, ignore_order=False)
    assert fingerprint == expected_hash

Expand All @@ -197,7 +197,7 @@ def test_hash_mapping_where_order_matters():

def test_hash_unordered_mapping():
    """Pin the fingerprint of a mixed-key mapping hashed with order ignored."""
    obj = {0: True, "key": "value", 17.0: None}
    expected_hash = "4cnTFA4MEEzmBN4a04k6tA=="
    fingerprint = fingerprinting.hash_mapping(obj, ignore_order=True)
    assert fingerprint == expected_hash

Expand All @@ -212,7 +212,7 @@ def test_hash_mapping_where_order_doesnt_matter():

def test_hash_set():
    """Pin the fingerprint of a set holding values of several types."""
    obj = {0, True, "key", "value", 17.0, None}
    expected_hash = "mswHhNBBYN5mv6i-LcEeVw=="
    fingerprint = fingerprinting.hash_set(obj)
    assert fingerprint == expected_hash

Expand All @@ -221,7 +221,7 @@ def test_hash_numpy():
# dtype is pinned explicitly so the literal digest is reproducible across
# platforms (the default integer dtype is platform-dependent).
array = np.array([[0, 1], [2, 3]], dtype=np.int64)
expected_hash = "024zwZIcWy6r4dlX4AMTow=="
expected_hash = "Y1uek_eQTHejo2YtRvdWPQ=="
fingerprint = fingerprinting.hash_value(array)
assert fingerprint == expected_hash

Expand All @@ -248,6 +248,27 @@ def test_hash_pandas_same_data_matches():
assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b)


def test_hash_pandas_different_columns_differ():
    """Renaming columns must change the fingerprint even when cell values match.

    pandas counterpart of ``test_hash_polars_different_columns_differ``.
    """
    sales = pd.DataFrame({"region": ["East", "West"], "revenue": [100, 200]})
    students = pd.DataFrame({"student": ["East", "West"], "height_cm": [100, 200]})

    sales_fingerprint = fingerprinting.hash_value(sales)
    students_fingerprint = fingerprinting.hash_value(students)

    assert sales_fingerprint != students_fingerprint


def test_hash_pandas_different_dtypes_differ():
    """Equal values stored under different dtypes must not share a fingerprint."""
    # Same column names and numeric values; only the dtype (integer vs float) differs.
    integer_frame = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    float_frame = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

    integer_fingerprint = fingerprinting.hash_value(integer_frame)
    float_fingerprint = fingerprinting.hash_value(float_frame)

    assert integer_fingerprint != float_fingerprint


def test_hash_pandas_order_sensitive():
    """A frame and its row-reversed view must produce different fingerprints."""
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    reversed_frame = frame.iloc[::-1]

    original_fingerprint = fingerprinting.hash_value(frame)
    reversed_fingerprint = fingerprinting.hash_value(reversed_frame)

    assert original_fingerprint != reversed_fingerprint


def test_hash_polars_different_columns_differ():
"""DataFrames with identical values but different column names must hash differently."""
polars = pytest.importorskip("polars")
Expand All @@ -264,6 +285,14 @@ def test_hash_polars_same_schema_same_data_matches():
assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b)


def test_hash_polars_different_dtypes_differ():
    """Equal values stored as Int64 vs Float64 must not share a fingerprint."""
    polars = pytest.importorskip("polars")

    int_frame = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Int64})
    float_frame = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Float64})

    int_fingerprint = fingerprinting.hash_value(int_frame)
    float_fingerprint = fingerprinting.hash_value(float_frame)

    assert int_fingerprint != float_fingerprint


def test_hash_cross_type_primitives_differ():
"""Values with the same string form but different types must hash differently.

Expand Down
Loading