diff --git a/LICENSE b/LICENSE index 60a996edb..4d72e4c07 100644 --- a/LICENSE +++ b/LICENSE @@ -226,3 +226,33 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +------------------------------- + +This product depends on xxhash (the python-xxhash package, https://github.com/ifduyue/python-xxhash), +which is licensed under the BSD 2-Clause License. + +BSD 2-Clause License + +Copyright (c) 2014-2024, Yue Du +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/hamilton/caching/fingerprinting.py b/hamilton/caching/fingerprinting.py index f791755fa..f3c1b90dd 100644 --- a/hamilton/caching/fingerprinting.py +++ b/hamilton/caching/fingerprinting.py @@ -36,11 +36,12 @@ import base64 import datetime import functools -import hashlib import logging import sys from collections.abc import Mapping, Sequence, Set +import xxhash + from hamilton.experimental import h_databackends # NoneType is introduced in Python 3.10 @@ -77,12 +78,16 @@ def _compact_hash(digest: bytes) -> str: def _hash_bytes(data: bytes) -> str: - """Hash raw bytes and compact-encode the digest. + """Hash raw bytes with the non-cryptographic xxh3_128 algorithm and + compact-encode the digest. All hashing in this module routes through this single helper so the underlying hashing algorithm can be changed in exactly one place. + xxh3_128 produces a 16-byte digest (24 base64url chars, the same width + as the md5 it replaces) while running substantially faster on + buffer-bound paths. """ - return _compact_hash(hashlib.md5(data).digest()) + return _compact_hash(xxhash.xxh3_128(data).digest()) @functools.singledispatch @@ -249,28 +254,40 @@ def hash_set(obj, *args, depth: int = 0, **kwargs) -> str: @hash_value.register(h_databackends.AbstractPandasDataFrame) @hash_value.register(h_databackends.AbstractPandasColumn) def hash_pandas_obj(obj, *args, depth: int = 0, **kwargs) -> str: - """Convert a pandas dataframe, series, or index to - a dictionary of {index: row_hash} then hash it. + """Hash a pandas DataFrame, Series, or Index via vectorized row hashing. + + ``pandas.util.hash_pandas_object`` computes a uint64 hash per row in a + single vectorized pass; we hash that buffer in one shot rather than + iterating over rows in Python. Column names and dtypes (the schema) are + folded in so that frames carrying identical cell values under different + schemas do not collide. - Given the hashing for mappings, the physical ordering or rows doesn't matter. 
- For example, if the index is a date, the hash will represent the {date: row_hash}, - and won't preserve how dates were ordered in the DataFrame. + The hash is order-sensitive: reordering rows changes the per-row hash + buffer and therefore the fingerprint. """ from pandas.util import hash_pandas_object - hash_per_row = hash_pandas_object(obj) - return hash_mapping(hash_per_row.to_dict(), ignore_order=False, depth=depth + 1) + row_hashes = hash_pandas_object(obj).values.tobytes() + if hasattr(obj, "columns"): + schema = f"{list(obj.columns)}:{[str(dtype) for dtype in obj.dtypes]}" + else: + schema = f"{getattr(obj, 'name', None)}:{obj.dtype}" + return _hash_bytes(schema.encode() + row_hashes) @hash_value.register(h_databackends.AbstractPolarsDataFrame) def hash_polars_dataframe(obj, *args, depth: int = 0, **kwargs) -> str: - """Convert a polars dataframe to a hash that includes column names - and dtypes (schema) alongside row hashes. This prevents collisions - between DataFrames with identical cell values but different schemas. + """Hash a polars DataFrame via vectorized row hashing. + + ``DataFrame.hash_rows`` computes a per-row hash in a single vectorized + pass; we hash that buffer (``to_numpy().tobytes()``) in one shot rather + than iterating element-by-element in Python. Column names and dtypes + (the schema) are folded in so frames carrying identical cell values under + different schemas do not collide. 
""" schema_str = ",".join(f"{name}:{dtype}" for name, dtype in obj.schema.items()) schema_hash = hash_bytes(schema_str.encode()) - row_hash = hash_sequence(obj.hash_rows().to_list(), depth=depth + 1) + row_hash = hash_bytes(obj.hash_rows().to_numpy().tobytes()) return _hash_bytes(schema_hash.encode() + row_hash.encode()) diff --git a/pyproject.toml b/pyproject.toml index f0ece93a6..ea6703f7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ "pandas", "typing_extensions > 4.0.0", "typing_inspect", + "xxhash>=2.0.0", ] [project.optional-dependencies] diff --git a/tests/caching/test_fingerprinting.py b/tests/caching/test_fingerprinting.py index 28ea96085..296affd6b 100644 --- a/tests/caching/test_fingerprinting.py +++ b/tests/caching/test_fingerprinting.py @@ -146,11 +146,11 @@ def __init__(self, obj): @pytest.mark.parametrize( ("obj", "expected_hash"), [ - ("hello-world", "L1Q1Kh6_t1atHO_H8RbBeA=="), - (17.31231, "mJPTpPyXDSZgU-u8NuztIQ=="), - (16474, "6MgAp1NbMW0ZZpe_8iKVsg=="), - (True, "J2eGynSuIpd5bwVQzO9VVg=="), - (b"\x951!\x89u=\xe6\xadG\xdf", "d1DufDgRQmqi9Kt4Z2PeUQ=="), + ("hello-world", "EXXR8_e47ElS18aP2lThJA=="), + (17.31231, "tVUSIslYiBcW52c-7w4gvA=="), + (16474, "FAJ-iXM_Hwg9TCRreY8AyA=="), + (True, "qkJEg3-XQKmGWk5sWqmonw=="), + (b"\x951!\x89u=\xe6\xadG\xdf", "pPTyYkSU_x7NLB1Fp_YTyA=="), ], ) def test_hash_primitive(obj, expected_hash): @@ -161,8 +161,8 @@ def test_hash_primitive(obj, expected_hash): @pytest.mark.parametrize( ("obj", "expected_hash"), [ - ([0, True, "hello-world"], "mlOjj4yeCrSDFSn5zgdEIg=="), - ((17.0, False, "world"), "BcRSGfyKeIOdym9B6TmAyQ=="), + ([0, True, "hello-world"], "I98OkNhfxtScJrYNTs4ZfQ=="), + ((17.0, False, "world"), "catgOMSnsbQj1_KELNQscw=="), ], ) def test_hash_sequence(obj, expected_hash): @@ -173,7 +173,7 @@ def test_hash_sequence(obj, expected_hash): def test_hash_equals_for_different_sequence_types(): list_obj = [0, True, "hello-world"] tuple_obj = (0, True, "hello-world") - 
expected_hash = "mlOjj4yeCrSDFSn5zgdEIg==" + expected_hash = "I98OkNhfxtScJrYNTs4ZfQ==" list_fingerprint = fingerprinting.hash_sequence(list_obj) tuple_fingerprint = fingerprinting.hash_sequence(tuple_obj) @@ -182,7 +182,7 @@ def test_hash_equals_for_different_sequence_types(): def test_hash_ordered_mapping(): obj = {0: True, "key": "value", 17.0: None} - expected_hash = "GyxyI9-pq-EJJvSAIN509g==" + expected_hash = "zX6MzhWGAOvxateHIPxOvA==" fingerprint = fingerprinting.hash_mapping(obj, ignore_order=False) assert fingerprint == expected_hash @@ -197,7 +197,7 @@ def test_hash_mapping_where_order_matters(): def test_hash_unordered_mapping(): obj = {0: True, "key": "value", 17.0: None} - expected_hash = "cDuuL2eA3DaSWlWW3u7o9g==" + expected_hash = "4cnTFA4MEEzmBN4a04k6tA==" fingerprint = fingerprinting.hash_mapping(obj, ignore_order=True) assert fingerprint == expected_hash @@ -212,7 +212,7 @@ def test_hash_mapping_where_order_doesnt_matter(): def test_hash_set(): obj = {0, True, "key", "value", 17.0, None} - expected_hash = "E_f_tjbi6qn7KL3NUCZayg==" + expected_hash = "mswHhNBBYN5mv6i-LcEeVw==" fingerprint = fingerprinting.hash_set(obj) assert fingerprint == expected_hash @@ -221,7 +221,7 @@ def test_hash_numpy(): # dtype is pinned explicitly so the literal digest is reproducible across # platforms (the default integer dtype is platform-dependent). 
array = np.array([[0, 1], [2, 3]], dtype=np.int64) - expected_hash = "024zwZIcWy6r4dlX4AMTow==" + expected_hash = "Y1uek_eQTHejo2YtRvdWPQ==" fingerprint = fingerprinting.hash_value(array) assert fingerprint == expected_hash @@ -248,6 +248,27 @@ def test_hash_pandas_same_data_matches(): assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b) +def test_hash_pandas_different_columns_differ(): + """pandas analog of test_hash_polars_different_columns_differ: identical + values under different column names must hash differently.""" + a = pd.DataFrame({"region": ["East", "West"], "revenue": [100, 200]}) + b = pd.DataFrame({"student": ["East", "West"], "height_cm": [100, 200]}) + assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + +def test_hash_pandas_different_dtypes_differ(): + """pandas frames with identical values but different dtypes must hash differently.""" + a = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) # int64 + b = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]}) # float64 + assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + +def test_hash_pandas_order_sensitive(): + """Reordering rows must change the fingerprint (order-sensitivity preserved).""" + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + assert fingerprinting.hash_value(df) != fingerprinting.hash_value(df.iloc[::-1]) + + def test_hash_polars_different_columns_differ(): """DataFrames with identical values but different column names must hash differently.""" polars = pytest.importorskip("polars") @@ -264,6 +285,14 @@ def test_hash_polars_same_schema_same_data_matches(): assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b) +def test_hash_polars_different_dtypes_differ(): + """polars frames with identical values but different dtypes must hash differently.""" + polars = pytest.importorskip("polars") + a = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Int64}) + b = polars.DataFrame({"a": [1, 2]}, schema={"a": polars.Float64}) + 
assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b) + + def test_hash_cross_type_primitives_differ(): """Values with the same string form but different types must hash differently.