@@ -231,13 +231,14 @@ def _bisect_and_diff_tables(
231231 ):
232232 assert table1 .is_bounded and table2 .is_bounded
233233
234+ max_space_size = max (table1 .approximate_size (), table2 .approximate_size ())
234235 if max_rows is None :
235- # We can be sure that row_count <= max_rows
236- max_rows = max ( table1 . approximate_size (), table2 . approximate_size ())
236+ # We can be sure that row_count <= max_rows iff the table key is unique
237+ max_rows = max_space_size
237238
238239 # If count is below the threshold, just download and compare the columns locally
239240 # This saves time, as bisection speed is limited by ping and query performance.
240- if max_rows < self .bisection_threshold :
241+ if max_rows < self .bisection_threshold or max_space_size < self . bisection_factor * 2 :
241242 rows1 , rows2 = self ._threaded_call ("get_values" , [table1 , table2 ])
242243 diff = list (diff_sets (rows1 , rows2 ))
243244
@@ -255,7 +256,8 @@ def _bisect_and_diff_tables(
255256 return diff
256257
257258 # Choose evenly spaced checkpoints (according to min_key and max_key)
258- checkpoints = table1 .choose_checkpoints (self .bisection_factor - 1 )
259+ biggest_table = max (table1 , table2 , key = methodcaller ('approximate_size' ))
260+ checkpoints = biggest_table .choose_checkpoints (self .bisection_factor - 1 )
259261
260262 # Create new instances of TableSegment between each checkpoint
261263 segmented1 = table1 .segment_by_checkpoints (checkpoints )
0 commit comments