33import logging
44from collections import defaultdict
55from typing import Iterator
6- from operator import attrgetter
76
87import attrs
98
@@ -71,7 +70,8 @@ class HashDiffer(TableDiffer):
7170 """
7271
7372 bisection_factor : int = DEFAULT_BISECTION_FACTOR
74- bisection_threshold : Number = DEFAULT_BISECTION_THRESHOLD # Accepts inf for tests
73+ bisection_threshold : int = DEFAULT_BISECTION_THRESHOLD
74+ bisection_disabled : bool = False # i.e. always download the rows (used in tests)
7575
7676 stats : dict = attrs .field (factory = dict )
7777
@@ -82,7 +82,7 @@ def __attrs_post_init__(self):
8282 if self .bisection_factor < 2 :
8383 raise ValueError ("Must have at least two segments per iteration (i.e. bisection_factor >= 2)" )
8484
85- def _validate_and_adjust_columns (self , table1 , table2 ):
85+ def _validate_and_adjust_columns (self , table1 , table2 , * , strict : bool = True ):
8686 for c1 , c2 in safezip (table1 .relevant_columns , table2 .relevant_columns ):
8787 if c1 not in table1 ._schema :
8888 raise ValueError (f"Column '{ c1 } ' not found in schema for table { table1 } " )
@@ -92,23 +92,23 @@ def _validate_and_adjust_columns(self, table1, table2):
9292 # Update schemas to minimal mutual precision
9393 col1 = table1 ._schema [c1 ]
9494 col2 = table2 ._schema [c2 ]
95- if isinstance (col1 , PrecisionType ):
96- if not isinstance (col2 , PrecisionType ):
95+ if isinstance (col1 , PrecisionType ) and isinstance ( col2 , PrecisionType ) :
96+ if strict and not isinstance (col2 , PrecisionType ):
9797 raise TypeError (f"Incompatible types for column '{ c1 } ': { col1 } <-> { col2 } " )
9898
99- lowest = min (col1 , col2 , key = attrgetter ( " precision" ) )
99+ lowest = min (col1 , col2 , key = lambda col : col . precision )
100100
101101 if col1 .precision != col2 .precision :
102102 logger .warning (f"Using reduced precision { lowest } for column '{ c1 } '. Types={ col1 } , { col2 } " )
103103
104104 table1 ._schema [c1 ] = attrs .evolve (col1 , precision = lowest .precision , rounds = lowest .rounds )
105105 table2 ._schema [c2 ] = attrs .evolve (col2 , precision = lowest .precision , rounds = lowest .rounds )
106106
107- elif isinstance (col1 , (NumericType , Boolean )):
108- if not isinstance (col2 , (NumericType , Boolean )):
107+ elif isinstance (col1 , (NumericType , Boolean )) and isinstance ( col2 , ( NumericType , Boolean )) :
108+ if strict and not isinstance (col2 , (NumericType , Boolean )):
109109 raise TypeError (f"Incompatible types for column '{ c1 } ': { col1 } <-> { col2 } " )
110110
111- lowest = min (col1 , col2 , key = attrgetter ( " precision" ) )
111+ lowest = min (col1 , col2 , key = lambda col : col . precision )
112112
113113 if col1 .precision != col2 .precision :
114114 logger .warning (f"Using reduced precision { lowest } for column '{ c1 } '. Types={ col1 } , { col2 } " )
@@ -119,11 +119,11 @@ def _validate_and_adjust_columns(self, table1, table2):
119119 table2 ._schema [c2 ] = attrs .evolve (col2 , precision = lowest .precision )
120120
121121 elif isinstance (col1 , ColType_UUID ):
122- if not isinstance (col2 , ColType_UUID ):
122+ if strict and not isinstance (col2 , ColType_UUID ):
123123 raise TypeError (f"Incompatible types for column '{ c1 } ': { col1 } <-> { col2 } " )
124124
125125 elif isinstance (col1 , StringType ):
126- if not isinstance (col2 , StringType ):
126+ if strict and not isinstance (col2 , StringType ):
127127 raise TypeError (f"Incompatible types for column '{ c1 } ': { col1 } <-> { col2 } " )
128128
129129 for t in [table1 , table2 ]:
@@ -157,7 +157,7 @@ def _diff_segments(
157157 # default, data-diff will checksum the section first (when it's below
158158 # the threshold) and _then_ download it.
159159 if BENCHMARK :
160- if max_rows < self .bisection_threshold :
160+ if self . bisection_disabled or max_rows < self .bisection_threshold :
161161 return self ._bisect_and_diff_segments (ti , table1 , table2 , info_tree , level = level , max_rows = max_rows )
162162
163163 (count1 , checksum1 ), (count2 , checksum2 ) = self ._threaded_call ("count_and_checksum" , [table1 , table2 ])
@@ -202,7 +202,7 @@ def _bisect_and_diff_segments(
202202
203203 # If count is below the threshold, just download and compare the columns locally
204204 # This saves time, as bisection speed is limited by ping and query performance.
205- if max_rows < self .bisection_threshold or max_space_size < self .bisection_factor * 2 :
205+ if self . bisection_disabled or max_rows < self .bisection_threshold or max_space_size < self .bisection_factor * 2 :
206206 rows1 , rows2 = self ._threaded_call ("get_values" , [table1 , table2 ])
207207 json_cols = {
208208 i : colname
0 commit comments