From 045a5db86958ed65efedfd2fa5fe61279c896575 Mon Sep 17 00:00:00 2001 From: Dev-iL <6509619+Dev-iL@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:45:27 +0300 Subject: [PATCH 1/2] Vectorize pandas and polars DataFrame hashing Replace the per-row Python loops in the DataFrame fingerprinting paths with single-buffer hashing: - pandas: hash the `hash_pandas_object(obj).values` uint64 buffer in one shot instead of round-tripping through `.to_dict()` and an ordered `hash_mapping`; fold column names + dtypes (schema) into the hash so frames with identical values but different schemas no longer collide; keep the path order-sensitive. - polars: hash the `hash_rows().to_numpy()` buffer in one shot instead of `.to_list()` through a per-element `hash_sequence` loop. Both paths route through the existing `_hash_bytes` chokepoint, so the algorithm is unchanged here. The DataFrame digest is deliberately not pinned to a literal (it depends on library-version-specific dtype reprs); coverage is via relational schema-collision, dtype-collision and order-sensitivity tests for both backends. Co-Authored-By: Claude Opus 4.8 --- hamilton/caching/fingerprinting.py | 34 +++++++++++++++++++--------- tests/caching/test_fingerprinting.py | 29 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/hamilton/caching/fingerprinting.py b/hamilton/caching/fingerprinting.py index f791755fa..df488b281 100644 --- a/hamilton/caching/fingerprinting.py +++ b/hamilton/caching/fingerprinting.py @@ -249,28 +249,40 @@ def hash_set(obj, *args, depth: int = 0, **kwargs) -> str: @hash_value.register(h_databackends.AbstractPandasDataFrame) @hash_value.register(h_databackends.AbstractPandasColumn) def hash_pandas_obj(obj, *args, depth: int = 0, **kwargs) -> str: - """Convert a pandas dataframe, series, or index to - a dictionary of {index: row_hash} then hash it. + """Hash a pandas DataFrame, Series, or Index via vectorized row hashing. 
- Given the hashing for mappings, the physical ordering or rows doesn't matter. - For example, if the index is a date, the hash will represent the {date: row_hash}, - and won't preserve how dates were ordered in the DataFrame. + ``pandas.util.hash_pandas_object`` computes a uint64 hash per row in a + single vectorized pass; we hash that buffer in one shot rather than + iterating over rows in Python. Column names and dtypes (the schema) are + folded in so that frames carrying identical cell values under different + schemas do not collide. + + The hash is order-sensitive: reordering rows changes the per-row hash + buffer and therefore the fingerprint. """ from pandas.util import hash_pandas_object - hash_per_row = hash_pandas_object(obj) - return hash_mapping(hash_per_row.to_dict(), ignore_order=False, depth=depth + 1) + row_hashes = hash_pandas_object(obj).values.tobytes() + if hasattr(obj, "columns"): + schema = f"{list(obj.columns)}:{[str(dtype) for dtype in obj.dtypes]}" + else: + schema = f"{getattr(obj, 'name', None)}:{obj.dtype}" + return _hash_bytes(schema.encode() + row_hashes) @hash_value.register(h_databackends.AbstractPolarsDataFrame) def hash_polars_dataframe(obj, *args, depth: int = 0, **kwargs) -> str: - """Convert a polars dataframe to a hash that includes column names - and dtypes (schema) alongside row hashes. This prevents collisions - between DataFrames with identical cell values but different schemas. + """Hash a polars DataFrame via vectorized row hashing. + + ``DataFrame.hash_rows`` computes a per-row hash in a single vectorized + pass; we hash that buffer (``to_numpy().tobytes()``) in one shot rather + than iterating element-by-element in Python. Column names and dtypes + (the schema) are folded in so frames carrying identical cell values under + different schemas do not collide. 
""" schema_str = ",".join(f"{name}:{dtype}" for name, dtype in obj.schema.items()) schema_hash = hash_bytes(schema_str.encode()) - row_hash = hash_sequence(obj.hash_rows().to_list(), depth=depth + 1) + row_hash = hash_bytes(obj.hash_rows().to_numpy().tobytes()) return _hash_bytes(schema_hash.encode() + row_hash.encode()) diff --git a/tests/caching/test_fingerprinting.py b/tests/caching/test_fingerprinting.py index 28ea96085..9c00e072e 100644 --- a/tests/caching/test_fingerprinting.py +++ b/tests/caching/test_fingerprinting.py @@ -248,6 +248,27 @@ def test_hash_pandas_same_data_matches(): assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b) +def test_hash_pandas_different_columns_differ(): + """pandas analog of test_hash_polars_different_columns_differ: identical + values under different column names must hash differently.""" + a = pd.DataFrame({"region": ["East", "West"], "revenue": [100, 200]}) + b = pd.DataFrame({"student": ["East", "West"], "height_cm": [100, 200]}) + assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + +def test_hash_pandas_different_dtypes_differ(): + """pandas frames with identical values but different dtypes must hash differently.""" + a = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) # int64 + b = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]}) # float64 + assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + +def test_hash_pandas_order_sensitive(): + """Reordering rows must change the fingerprint (order-sensitivity preserved).""" + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + assert fingerprinting.hash_value(df) != fingerprinting.hash_value(df.iloc[::-1]) + + def test_hash_polars_different_columns_differ(): """DataFrames with identical values but different column names must hash differently.""" polars = pytest.importorskip("polars") @@ -264,6 +285,14 @@ def test_hash_polars_same_schema_same_data_matches(): assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b) +def 
test_hash_polars_different_dtypes_differ(): + """polars frames with identical values but different dtypes must hash differently.""" + polars = pytest.importorskip("polars") + a = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Int64}) + b = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Float64}) + assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + def test_hash_cross_type_primitives_differ(): """Values with the same string form but different types must hash differently. From b1d0e680523dab6dcfd75a342cf07e9cf0cf40a4 Mon Sep 17 00:00:00 2001 From: Dev-iL <6509619+Dev-iL@users.noreply.github.com> Date: Mon, 8 Jun 2026 12:02:50 +0300 Subject: [PATCH 2/2] Standardize fingerprint hashing on xxh3_128 Swap the single `_hash_bytes` chokepoint from md5 to the non-cryptographic `xxhash.xxh3_128`. xxh3_128 produces a 16-byte digest (24 base64url chars, identical width to the md5 it replaces), so digest width and resistance to accidental collisions are preserved while throughput on buffer-bound paths rises substantially. Declare `xxhash>=2.0.0` as a core runtime dependency (xxh3_128 was added in python-xxhash 2.0.0, the first release to bundle the xxHash 0.8.0 C library; the 1.x releases do not provide it); fingerprinting is imported eagerly via the caching adapter, so it must be a hard dependency rather than an optional extra. Add the xxhash BSD-2-Clause attribution to LICENSE. Recompute the portable literal-digest pins (primitives, sequences, mappings, sets, numpy) against xxh3_128. This is a fingerprint-changing release: prior cached fingerprints no longer match and will be recomputed. 
Co-Authored-By: Claude Opus 4.8 --- LICENSE | 30 ++++++++++++++++++++++++++++ hamilton/caching/fingerprinting.py | 11 +++++++--- pyproject.toml | 1 + tests/caching/test_fingerprinting.py | 24 +++++++++++----------- 4 files changed, 51 insertions(+), 15 deletions(-) diff --git a/LICENSE b/LICENSE index 60a996edb..4d72e4c07 100644 --- a/LICENSE +++ b/LICENSE @@ -226,3 +226,33 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +------------------------------- + +This product depends on xxhash (the python-xxhash package, https://github.com/ifduyue/python-xxhash), +which is licensed under the BSD 2-Clause License. + +BSD 2-Clause License + +Copyright (c) 2014-2024, Yue Du +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/hamilton/caching/fingerprinting.py b/hamilton/caching/fingerprinting.py index df488b281..f3c1b90dd 100644 --- a/hamilton/caching/fingerprinting.py +++ b/hamilton/caching/fingerprinting.py @@ -36,11 +36,12 @@ import base64 import datetime import functools -import hashlib import logging import sys from collections.abc import Mapping, Sequence, Set +import xxhash + from hamilton.experimental import h_databackends # NoneType is introduced in Python 3.10 @@ -77,12 +78,16 @@ def _compact_hash(digest: bytes) -> str: def _hash_bytes(data: bytes) -> str: - """Hash raw bytes and compact-encode the digest. + """Hash raw bytes with the non-cryptographic xxh3_128 algorithm and + compact-encode the digest. All hashing in this module routes through this single helper so the underlying hashing algorithm can be changed in exactly one place. + xxh3_128 produces a 16-byte digest (24 base64url chars, the same width + as the md5 it replaces) while running substantially faster on + buffer-bound paths. 
""" - return _compact_hash(hashlib.md5(data).digest()) + return _compact_hash(xxhash.xxh3_128(data).digest()) @functools.singledispatch diff --git a/pyproject.toml b/pyproject.toml index f0ece93a6..ea6703f7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ "pandas", "typing_extensions > 4.0.0", "typing_inspect", + "xxhash>=2.0.0", ] [project.optional-dependencies] diff --git a/tests/caching/test_fingerprinting.py b/tests/caching/test_fingerprinting.py index 9c00e072e..296affd6b 100644 --- a/tests/caching/test_fingerprinting.py +++ b/tests/caching/test_fingerprinting.py @@ -146,11 +146,11 @@ def __init__(self, obj): @pytest.mark.parametrize( ("obj", "expected_hash"), [ - ("hello-world", "L1Q1Kh6_t1atHO_H8RbBeA=="), - (17.31231, "mJPTpPyXDSZgU-u8NuztIQ=="), - (16474, "6MgAp1NbMW0ZZpe_8iKVsg=="), - (True, "J2eGynSuIpd5bwVQzO9VVg=="), - (b"\x951!\x89u=\xe6\xadG\xdf", "d1DufDgRQmqi9Kt4Z2PeUQ=="), + ("hello-world", "EXXR8_e47ElS18aP2lThJA=="), + (17.31231, "tVUSIslYiBcW52c-7w4gvA=="), + (16474, "FAJ-iXM_Hwg9TCRreY8AyA=="), + (True, "qkJEg3-XQKmGWk5sWqmonw=="), + (b"\x951!\x89u=\xe6\xadG\xdf", "pPTyYkSU_x7NLB1Fp_YTyA=="), ], ) def test_hash_primitive(obj, expected_hash): @@ -161,8 +161,8 @@ def test_hash_primitive(obj, expected_hash): @pytest.mark.parametrize( ("obj", "expected_hash"), [ - ([0, True, "hello-world"], "mlOjj4yeCrSDFSn5zgdEIg=="), - ((17.0, False, "world"), "BcRSGfyKeIOdym9B6TmAyQ=="), + ([0, True, "hello-world"], "I98OkNhfxtScJrYNTs4ZfQ=="), + ((17.0, False, "world"), "catgOMSnsbQj1_KELNQscw=="), ], ) def test_hash_sequence(obj, expected_hash): @@ -173,7 +173,7 @@ def test_hash_equals_for_different_sequence_types(): list_obj = [0, True, "hello-world"] tuple_obj = (0, True, "hello-world") - expected_hash = "mlOjj4yeCrSDFSn5zgdEIg==" + expected_hash = "I98OkNhfxtScJrYNTs4ZfQ==" list_fingerprint = fingerprinting.hash_sequence(list_obj) tuple_fingerprint = 
fingerprinting.hash_sequence(tuple_obj) @@ -182,7 +182,7 @@ def test_hash_equals_for_different_sequence_types(): def test_hash_ordered_mapping(): obj = {0: True, "key": "value", 17.0: None} - expected_hash = "GyxyI9-pq-EJJvSAIN509g==" + expected_hash = "zX6MzhWGAOvxateHIPxOvA==" fingerprint = fingerprinting.hash_mapping(obj, ignore_order=False) assert fingerprint == expected_hash @@ -197,7 +197,7 @@ def test_hash_mapping_where_order_matters(): def test_hash_unordered_mapping(): obj = {0: True, "key": "value", 17.0: None} - expected_hash = "cDuuL2eA3DaSWlWW3u7o9g==" + expected_hash = "4cnTFA4MEEzmBN4a04k6tA==" fingerprint = fingerprinting.hash_mapping(obj, ignore_order=True) assert fingerprint == expected_hash @@ -212,7 +212,7 @@ def test_hash_mapping_where_order_doesnt_matter(): def test_hash_set(): obj = {0, True, "key", "value", 17.0, None} - expected_hash = "E_f_tjbi6qn7KL3NUCZayg==" + expected_hash = "mswHhNBBYN5mv6i-LcEeVw==" fingerprint = fingerprinting.hash_set(obj) assert fingerprint == expected_hash @@ -221,7 +221,7 @@ def test_hash_numpy(): # dtype is pinned explicitly so the literal digest is reproducible across # platforms (the default integer dtype is platform-dependent). array = np.array([[0, 1], [2, 3]], dtype=np.int64) - expected_hash = "024zwZIcWy6r4dlX4AMTow==" + expected_hash = "Y1uek_eQTHejo2YtRvdWPQ==" fingerprint = fingerprinting.hash_value(array) assert fingerprint == expected_hash