From 045a5db86958ed65efedfd2fa5fe61279c896575 Mon Sep 17 00:00:00 2001 From: Dev-iL <6509619+Dev-iL@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:45:27 +0300 Subject: [PATCH] Vectorize pandas and polars DataFrame hashing Replace the per-row Python loops in the DataFrame fingerprinting paths with single-buffer hashing: - pandas: hash the `hash_pandas_object(obj).values` uint64 buffer in one shot instead of round-tripping through `.to_dict()` and an ordered `hash_mapping`; fold column names + dtypes (schema) into the hash so frames with identical values but different schemas no longer collide; keep the path order-sensitive. - polars: hash the `hash_rows().to_numpy()` buffer in one shot instead of `.to_list()` through a per-element `hash_sequence` loop. Both paths route through the existing `_hash_bytes` chokepoint, so the algorithm is unchanged here. The DataFrame digest is deliberately not pinned to a literal (it depends on library-version-specific dtype reprs); coverage is via relational schema-collision, dtype-collision and order-sensitivity tests for both backends. Co-Authored-By: Claude Opus 4.8 --- hamilton/caching/fingerprinting.py | 34 +++++++++++++++++++--------- tests/caching/test_fingerprinting.py | 29 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/hamilton/caching/fingerprinting.py b/hamilton/caching/fingerprinting.py index f791755fa..df488b281 100644 --- a/hamilton/caching/fingerprinting.py +++ b/hamilton/caching/fingerprinting.py @@ -249,28 +249,40 @@ def hash_set(obj, *args, depth: int = 0, **kwargs) -> str: @hash_value.register(h_databackends.AbstractPandasDataFrame) @hash_value.register(h_databackends.AbstractPandasColumn) def hash_pandas_obj(obj, *args, depth: int = 0, **kwargs) -> str: - """Convert a pandas dataframe, series, or index to - a dictionary of {index: row_hash} then hash it. + """Hash a pandas DataFrame, Series, or Index via vectorized row hashing. - Given the hashing for mappings, the physical ordering or rows doesn't matter. - For example, if the index is a date, the hash will represent the {date: row_hash}, - and won't preserve how dates were ordered in the DataFrame. + ``pandas.util.hash_pandas_object`` computes a uint64 hash per row in a + single vectorized pass; we hash that buffer in one shot rather than + iterating over rows in Python. Column names and dtypes (the schema) are + folded in so that frames carrying identical cell values under different + schemas do not collide. + + The hash is order-sensitive: reordering rows changes the per-row hash + buffer and therefore the fingerprint. """ from pandas.util import hash_pandas_object - hash_per_row = hash_pandas_object(obj) - return hash_mapping(hash_per_row.to_dict(), ignore_order=False, depth=depth + 1) + row_hashes = hash_pandas_object(obj).values.tobytes() + if hasattr(obj, "columns"): + schema = f"{list(obj.columns)}:{[str(dtype) for dtype in obj.dtypes]}" + else: + schema = f"{getattr(obj, 'name', None)}:{obj.dtype}" + return _hash_bytes(schema.encode() + row_hashes) @hash_value.register(h_databackends.AbstractPolarsDataFrame) def hash_polars_dataframe(obj, *args, depth: int = 0, **kwargs) -> str: - """Convert a polars dataframe to a hash that includes column names - and dtypes (schema) alongside row hashes. This prevents collisions - between DataFrames with identical cell values but different schemas. + """Hash a polars DataFrame via vectorized row hashing. + + ``DataFrame.hash_rows`` computes a per-row hash in a single vectorized + pass; we hash that buffer (``to_numpy().tobytes()``) in one shot rather + than iterating element-by-element in Python. Column names and dtypes + (the schema) are folded in so frames carrying identical cell values under + different schemas do not collide. """ schema_str = ",".join(f"{name}:{dtype}" for name, dtype in obj.schema.items()) schema_hash = hash_bytes(schema_str.encode()) - row_hash = hash_sequence(obj.hash_rows().to_list(), depth=depth + 1) + row_hash = hash_bytes(obj.hash_rows().to_numpy().tobytes()) return _hash_bytes(schema_hash.encode() + row_hash.encode()) diff --git a/tests/caching/test_fingerprinting.py b/tests/caching/test_fingerprinting.py index 28ea96085..9c00e072e 100644 --- a/tests/caching/test_fingerprinting.py +++ b/tests/caching/test_fingerprinting.py @@ -248,6 +248,27 @@ def test_hash_pandas_same_data_matches(): assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b) +def test_hash_pandas_different_columns_differ(): + """pandas analog of test_hash_polars_different_columns_differ: identical + values under different column names must hash differently.""" + a = pd.DataFrame({"region": ["East", "West"], "revenue": [100, 200]}) + b = pd.DataFrame({"student": ["East", "West"], "height_cm": [100, 200]}) + assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + +def test_hash_pandas_different_dtypes_differ(): + """pandas frames with identical values but different dtypes must hash differently.""" + a = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) # int64 + b = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]}) # float64 + assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + +def test_hash_pandas_order_sensitive(): + """Reordering rows must change the fingerprint (order-sensitivity preserved).""" + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + assert fingerprinting.hash_value(df) != fingerprinting.hash_value(df.iloc[::-1]) + + def test_hash_polars_different_columns_differ(): """DataFrames with identical values but different column names must hash differently.""" polars = pytest.importorskip("polars") @@ -264,6 +285,14 @@ def test_hash_polars_same_schema_same_data_matches(): assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b) +def test_hash_polars_different_dtypes_differ(): + """polars frames with identical values but different dtypes must hash differently.""" + polars = pytest.importorskip("polars") + a = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Int64}) + b = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Float64}) + assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + def test_hash_cross_type_primitives_differ(): """Values with the same string form but different types must hash differently.