From 045a5db86958ed65efedfd2fa5fe61279c896575 Mon Sep 17 00:00:00 2001
From: Dev-iL <6509619+Dev-iL@users.noreply.github.com>
Date: Mon, 8 Jun 2026 11:45:27 +0300
Subject: [PATCH] Vectorize pandas and polars DataFrame hashing

Replace the per-row Python loops in the DataFrame fingerprinting paths
with single-buffer hashing:

- pandas: hash the uint64 buffer from `hash_pandas_object(obj).values`
  in one shot instead of round-tripping through `.to_dict()` and an
  ordered `hash_mapping`. Fold the schema into the hash (column names
  and dtypes for a DataFrame; name and dtype for a Series or Index) so
  that objects with identical values but different schemas no longer
  collide. The path stays order-sensitive.
- polars: hash the `hash_rows().to_numpy()` buffer in one shot instead of
  `.to_list()` through a per-element `hash_sequence` loop.

Both paths route their final digest through the existing `_hash_bytes`
chokepoint, so the underlying hash algorithm is unchanged here. The
DataFrame digest is deliberately not pinned to a literal (it depends on
library-version-specific dtype reprs); coverage is via relational tests
instead: this patch adds schema-collision, dtype-collision and
order-sensitivity tests for pandas and a dtype-collision test for
polars, next to the existing polars schema-collision test.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 hamilton/caching/fingerprinting.py   | 34 +++++++++++++++++++---------
 tests/caching/test_fingerprinting.py | 29 ++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/hamilton/caching/fingerprinting.py b/hamilton/caching/fingerprinting.py
index f791755fa..df488b281 100644
--- a/hamilton/caching/fingerprinting.py
+++ b/hamilton/caching/fingerprinting.py
@@ -249,28 +249,40 @@ def hash_set(obj, *args, depth: int = 0, **kwargs) -> str:
 @hash_value.register(h_databackends.AbstractPandasDataFrame)
 @hash_value.register(h_databackends.AbstractPandasColumn)
 def hash_pandas_obj(obj, *args, depth: int = 0, **kwargs) -> str:
-    """Convert a pandas dataframe, series, or index to
-    a dictionary of {index: row_hash} then hash it.
+    """Hash a pandas DataFrame, Series, or Index via vectorized row hashing.
 
-    Given the hashing for mappings, the physical ordering or rows doesn't matter.
-    For example, if the index is a date, the hash will represent the {date: row_hash},
-    and won't preserve how dates were ordered in the DataFrame.
+    ``pandas.util.hash_pandas_object`` computes a uint64 hash per row in a
+    single vectorized pass; we hash that buffer in one shot rather than
+    iterating over rows in Python. Column names and dtypes (the schema) are
+    folded in so that frames carrying identical cell values under different
+    schemas do not collide.
+
+    The hash is order-sensitive: reordering rows changes the per-row hash
+    buffer and therefore the fingerprint.
     """
     from pandas.util import hash_pandas_object
 
-    hash_per_row = hash_pandas_object(obj)
-    return hash_mapping(hash_per_row.to_dict(), ignore_order=False, depth=depth + 1)
+    row_hashes = hash_pandas_object(obj).values.tobytes()
+    if hasattr(obj, "columns"):
+        schema = f"{list(obj.columns)}:{[str(dtype) for dtype in obj.dtypes]}"
+    else:
+        schema = f"{getattr(obj, 'name', None)}:{obj.dtype}"
+    return _hash_bytes(schema.encode() + row_hashes)
 
 
 @hash_value.register(h_databackends.AbstractPolarsDataFrame)
 def hash_polars_dataframe(obj, *args, depth: int = 0, **kwargs) -> str:
-    """Convert a polars dataframe to a hash that includes column names
-    and dtypes (schema) alongside row hashes. This prevents collisions
-    between DataFrames with identical cell values but different schemas.
+    """Hash a polars DataFrame via vectorized row hashing.
+
+    ``DataFrame.hash_rows`` computes a per-row hash in a single vectorized
+    pass; we hash that buffer (``to_numpy().tobytes()``) in one shot rather
+    than iterating element-by-element in Python. Column names and dtypes
+    (the schema) are folded in so frames carrying identical cell values under
+    different schemas do not collide.
     """
     schema_str = ",".join(f"{name}:{dtype}" for name, dtype in obj.schema.items())
     schema_hash = hash_bytes(schema_str.encode())
-    row_hash = hash_sequence(obj.hash_rows().to_list(), depth=depth + 1)
+    row_hash = hash_bytes(obj.hash_rows().to_numpy().tobytes())
     return _hash_bytes(schema_hash.encode() + row_hash.encode())
 
 
diff --git a/tests/caching/test_fingerprinting.py b/tests/caching/test_fingerprinting.py
index 28ea96085..9c00e072e 100644
--- a/tests/caching/test_fingerprinting.py
+++ b/tests/caching/test_fingerprinting.py
@@ -248,6 +248,27 @@ def test_hash_pandas_same_data_matches():
     assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b)
 
 
+def test_hash_pandas_different_columns_differ():
+    """pandas analog of test_hash_polars_different_columns_differ: identical
+    values under different column names must hash differently."""
+    a = pd.DataFrame({"region": ["East", "West"], "revenue": [100, 200]})
+    b = pd.DataFrame({"student": ["East", "West"], "height_cm": [100, 200]})
+    assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b)
+
+
+def test_hash_pandas_different_dtypes_differ():
+    """pandas frames with identical values but different dtypes must hash differently."""
+    a = pd.DataFrame({"a": [1, 2], "b": [3, 4]})  # int64
+    b = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})  # float64
+    assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b)
+
+
+def test_hash_pandas_order_sensitive():
+    """Reordering rows must change the fingerprint (order-sensitivity preserved)."""
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    assert fingerprinting.hash_value(df) != fingerprinting.hash_value(df.iloc[::-1])
+
+
 def test_hash_polars_different_columns_differ():
     """DataFrames with identical values but different column names must hash differently."""
     polars = pytest.importorskip("polars")
@@ -264,6 +285,14 @@ def test_hash_polars_same_schema_same_data_matches():
     assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b)
 
 
+def test_hash_polars_different_dtypes_differ():
+    """polars frames with identical values but different dtypes must hash differently."""
+    polars = pytest.importorskip("polars")
+    a = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Int64})
+    b = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Float64})
+    assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b)
+
+
 def test_hash_cross_type_primitives_differ():
     """Values with the same string form but different types must hash differently.