diff --git a/hamilton/caching/fingerprinting.py b/hamilton/caching/fingerprinting.py
index f791755fa..df488b281 100644
--- a/hamilton/caching/fingerprinting.py
+++ b/hamilton/caching/fingerprinting.py
@@ -249,28 +249,43 @@ def hash_set(obj, *args, depth: int = 0, **kwargs) -> str:
 @hash_value.register(h_databackends.AbstractPandasDataFrame)
 @hash_value.register(h_databackends.AbstractPandasColumn)
 def hash_pandas_obj(obj, *args, depth: int = 0, **kwargs) -> str:
-    """Convert a pandas dataframe, series, or index to
-    a dictionary of {index: row_hash} then hash it.
+    """Hash a pandas DataFrame, Series, or Index via vectorized row hashing.
 
-    Given the hashing for mappings, the physical ordering or rows doesn't matter.
-    For example, if the index is a date, the hash will represent the {date: row_hash},
-    and won't preserve how dates were ordered in the DataFrame.
+    ``pandas.util.hash_pandas_object`` computes a uint64 hash per row in a
+    single vectorized pass; we hash that buffer in one shot rather than
+    iterating over rows in Python. Column names and dtypes (the schema) are
+    folded in so that frames carrying identical cell values under different
+    schemas do not collide.
+
+    The hash is order-sensitive: reordering rows changes the per-row hash
+    buffer and therefore the fingerprint.
     """
     from pandas.util import hash_pandas_object
 
-    hash_per_row = hash_pandas_object(obj)
-    return hash_mapping(hash_per_row.to_dict(), ignore_order=False, depth=depth + 1)
+    row_hash = hash_bytes(hash_pandas_object(obj).to_numpy().tobytes())
+    if hasattr(obj, "columns"):
+        schema = f"{list(obj.columns)}:{[str(dtype) for dtype in obj.dtypes]}"
+    else:
+        schema = f"{getattr(obj, 'name', None)}:{obj.dtype}"
+    # Digest the schema and the row buffer separately (as hash_polars_dataframe
+    # does) so the boundary between the two parts is unambiguous.
+    schema_hash = hash_bytes(schema.encode())
+    return _hash_bytes(schema_hash.encode() + row_hash.encode())
 
 
 @hash_value.register(h_databackends.AbstractPolarsDataFrame)
 def hash_polars_dataframe(obj, *args, depth: int = 0, **kwargs) -> str:
-    """Convert a polars dataframe to a hash that includes column names
-    and dtypes (schema) alongside row hashes. This prevents collisions
-    between DataFrames with identical cell values but different schemas.
+    """Hash a polars DataFrame via vectorized row hashing.
+
+    ``DataFrame.hash_rows`` computes a per-row hash in a single vectorized
+    pass; we hash that buffer (``to_numpy().tobytes()``) in one shot rather
+    than iterating element-by-element in Python. Column names and dtypes
+    (the schema) are folded in so frames carrying identical cell values under
+    different schemas do not collide.
     """
     schema_str = ",".join(f"{name}:{dtype}" for name, dtype in obj.schema.items())
     schema_hash = hash_bytes(schema_str.encode())
-    row_hash = hash_sequence(obj.hash_rows().to_list(), depth=depth + 1)
+    row_hash = hash_bytes(obj.hash_rows().to_numpy().tobytes())
     return _hash_bytes(schema_hash.encode() + row_hash.encode())
 
 
diff --git a/tests/caching/test_fingerprinting.py b/tests/caching/test_fingerprinting.py
index 28ea96085..9c00e072e 100644
--- a/tests/caching/test_fingerprinting.py
+++ b/tests/caching/test_fingerprinting.py
@@ -248,6 +248,27 @@ def test_hash_pandas_same_data_matches():
     assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b)
 
 
+def test_hash_pandas_different_columns_differ():
+    """pandas analog of test_hash_polars_different_columns_differ: identical
+    values under different column names must hash differently."""
+    a = pd.DataFrame({"region": ["East", "West"], "revenue": [100, 200]})
+    b = pd.DataFrame({"student": ["East", "West"], "height_cm": [100, 200]})
+    assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b)
+
+
+def test_hash_pandas_different_dtypes_differ():
+    """pandas frames with identical values but different dtypes must hash differently."""
+    a = pd.DataFrame({"a": [1, 2], "b": [3, 4]})  # int64
+    b = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})  # float64
+    assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b)
+
+
+def test_hash_pandas_order_sensitive():
+    """Reordering rows must change the fingerprint (order-sensitivity preserved)."""
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    assert fingerprinting.hash_value(df) != fingerprinting.hash_value(df.iloc[::-1])
+
+
 def test_hash_polars_different_columns_differ():
     """DataFrames with identical values but different column names must hash differently."""
     polars = pytest.importorskip("polars")
@@ -264,6 +285,14 @@ def test_hash_polars_same_schema_same_data_matches():
     assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b)
 
 
+def test_hash_polars_different_dtypes_differ():
+    """polars frames with identical values but different dtypes must hash differently."""
+    polars = pytest.importorskip("polars")
+    a = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Int64})
+    b = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Float64})
+    assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b)
+
+
 def test_hash_cross_type_primitives_differ():
     """Values with the same string form but different types must hash differently.
 