Refactor ListArray hashing to consider only sliced values #19500
@@ -23,6 +23,7 @@ use arrow::array::*;
 use arrow::datatypes::*;
 #[cfg(not(feature = "force_hash_collisions"))]
 use arrow::{downcast_dictionary_array, downcast_primitive_array};
+use itertools::Itertools;

 #[cfg(not(feature = "force_hash_collisions"))]
 use crate::cast::{

@@ -513,24 +514,41 @@ fn hash_list_array<OffsetSize>(
 where
     OffsetSize: OffsetSizeTrait,
 {
-    let values = array.values();
-    let offsets = array.value_offsets();
-    let nulls = array.nulls();
-    let mut values_hashes = vec![0u64; values.len()];
-    create_hashes([values], random_state, &mut values_hashes)?;
-    if let Some(nulls) = nulls {
-        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
-            if nulls.is_valid(i) {
+    // In case values is sliced, hash only the bytes used by the offsets of this ListArray
+    let first_offset = array.value_offsets().first().cloned().unwrap_or_default();
+    let last_offset = array.value_offsets().last().cloned().unwrap_or_default();
+    let value_bytes_len = (last_offset - first_offset).as_usize();
+    let mut values_hashes = vec![0u64; value_bytes_len];
+    create_hashes(
+        [array
+            .values()
+            .slice(first_offset.as_usize(), value_bytes_len)],
+        random_state,
+        &mut values_hashes,
+    )?;
+
+    if array.null_count() > 0 {

PR author comment: Switching to this count probably doesn't affect much (how often do we see a null buffer that's present but has all bits valid?), but it's consistent with how we check for nulls in the other functions.

+        for (i, (start, stop)) in array.value_offsets().iter().tuple_windows().enumerate()

PR author comment: Using tuple_windows from itertools makes this more ergonomic.

+        {
+            if array.is_valid(i) {
                 let hash = &mut hashes_buffer[i];
-                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
+                for values_hash in &values_hashes[(*start - first_offset).as_usize()
+                    ..(*stop - first_offset).as_usize()]
+                {
                     *hash = combine_hashes(*hash, *values_hash);
                 }
             }
         }
     } else {
-        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
-            let hash = &mut hashes_buffer[i];
-            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
+        for ((start, stop), hash) in array
+            .value_offsets()
+            .iter()
+            .tuple_windows()
+            .zip(hashes_buffer.iter_mut())
+        {
+            for values_hash in &values_hashes
+                [(*start - first_offset).as_usize()..(*stop - first_offset).as_usize()]
+            {
                 *hash = combine_hashes(*hash, *values_hash);
             }
         }

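On the author's note above about switching to array.null_count() > 0: the distinction only matters when a null buffer is physically present but every bit in it is valid. Below is a minimal sketch of that case (not part of this PR; the array construction is purely illustrative), using the arrow crate:

    use arrow::array::{Array, Int32Array};
    use arrow::buffer::{NullBuffer, ScalarBuffer};

    fn main() {
        // An array whose null buffer exists but marks every slot as valid.
        let array = Int32Array::new(
            ScalarBuffer::from(vec![1, 2, 3]),
            Some(NullBuffer::new_valid(3)),
        );

        assert!(array.nulls().is_some()); // the old `if let Some(nulls)` check takes the "has nulls" path
        assert_eq!(array.null_count(), 0); // the new check takes the simpler no-null path
    }
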
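On the tuple_windows comment: Itertools::tuple_windows yields adjacent pairs from an iterator, so over the offsets it produces the same (start, stop) pairs as the previous offsets.iter().zip(offsets.iter().skip(1)) pattern. A PR-independent sketch of the equivalence (the real code iterates over references; this clones into owned values for brevity):

    use itertools::Itertools;

    fn main() {
        let offsets: [i32; 4] = [0, 3, 5, 6];

        // Previous pattern: pair each offset with its successor via zip + skip(1).
        let zipped: Vec<(i32, i32)> = offsets
            .iter()
            .zip(offsets.iter().skip(1))
            .map(|(start, stop)| (*start, *stop))
            .collect();

        // tuple_windows yields the same adjacent (start, stop) pairs.
        let windows: Vec<(i32, i32)> = offsets.iter().copied().tuple_windows().collect();

        assert_eq!(zipped, windows); // both are [(0, 3), (3, 5), (5, 6)]
    }
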
@@ -1128,6 +1146,30 @@ mod tests {
         assert_eq!(hashes[1], hashes[6]); // null vs empty list
     }

+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_sliced_list_arrays() {
+        let data = vec![
+            Some(vec![Some(0), Some(1), Some(2)]),
+            None,
+            // Slice from here
+            Some(vec![Some(3), None, Some(5)]),
+            Some(vec![Some(3), None, Some(5)]),
+            None,
+            // To here
+            Some(vec![Some(0), Some(1), Some(2)]),
+            Some(vec![]),
+        ];
+        let list_array =
+            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
+        let list_array = list_array.slice(2, 3);
+        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let mut hashes = vec![0; list_array.len()];
+        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
+        assert_eq!(hashes[0], hashes[1]);
+        assert_ne!(hashes[1], hashes[2]);
+    }
+
     #[test]
     // Tests actual values of hashes, which are different if forcing collisions
     #[cfg(not(feature = "force_hash_collisions"))]

Review comment: Main change here.
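To illustrate the behavior the main change addresses, here is a minimal sketch (not part of the PR) using the same arrow APIs as the new test: slicing a ListArray narrows its offsets window and null buffer, but the child values array is left untouched, so the per-list offsets stay absolute into the full values buffer and must be re-based by first_offset before indexing values_hashes.

    use arrow::array::{Array, ListArray};
    use arrow::datatypes::Int32Type;

    fn main() {
        let data = vec![
            Some(vec![Some(0), Some(1), Some(2)]),
            Some(vec![Some(3), Some(4)]),
            Some(vec![Some(5)]),
        ];
        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(data);
        let sliced = list.slice(1, 2); // keep only the last two lists

        // The child values array is still the full, unsliced buffer of 6 values...
        assert_eq!(sliced.values().len(), 6);
        // ...and the offsets remain absolute into that buffer.
        assert_eq!(sliced.value_offsets(), &[3, 5, 6]);
        // Hence the hashing code slices values to [3, 6) and subtracts
        // first_offset (3) when indexing the per-value hashes.
    }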