-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Refactor ListArray hashing to consider only sliced values #19500
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ use arrow::array::*; | |
| use arrow::datatypes::*; | ||
| #[cfg(not(feature = "force_hash_collisions"))] | ||
| use arrow::{downcast_dictionary_array, downcast_primitive_array}; | ||
| use itertools::Itertools; | ||
|
|
||
| #[cfg(not(feature = "force_hash_collisions"))] | ||
| use crate::cast::{ | ||
|
|
@@ -513,24 +514,41 @@ fn hash_list_array<OffsetSize>( | |
| where | ||
| OffsetSize: OffsetSizeTrait, | ||
| { | ||
| let values = array.values(); | ||
| let offsets = array.value_offsets(); | ||
| let nulls = array.nulls(); | ||
| let mut values_hashes = vec![0u64; values.len()]; | ||
| create_hashes([values], random_state, &mut values_hashes)?; | ||
| if let Some(nulls) = nulls { | ||
| for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { | ||
| if nulls.is_valid(i) { | ||
| // In case values is sliced, hash only the bytes used by the offsets of this ListArray | ||
| let first_offset = array.value_offsets().first().cloned().unwrap_or_default(); | ||
| let last_offset = array.value_offsets().last().cloned().unwrap_or_default(); | ||
| let value_bytes_len = (last_offset - first_offset).as_usize(); | ||
| let mut values_hashes = vec![0u64; value_bytes_len]; | ||
| create_hashes( | ||
| [array | ||
| .values() | ||
| .slice(first_offset.as_usize(), value_bytes_len)], | ||
| random_state, | ||
| &mut values_hashes, | ||
| )?; | ||
|
Comment on lines
+518
to
+528
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Main change here |
||
|
|
||
| if array.null_count() > 0 { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Switching to this count probably doesn't affect much (how often do we see a null buffer that's present but has all bits valid?) but it's consistent with how we check for nulls in the other functions |
||
| for (i, (start, stop)) in array.value_offsets().iter().tuple_windows().enumerate() | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using `tuple_windows` from itertools makes this more ergonomic |
||
| { | ||
| if array.is_valid(i) { | ||
| let hash = &mut hashes_buffer[i]; | ||
| for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { | ||
| for values_hash in &values_hashes[(*start - first_offset).as_usize() | ||
| ..(*stop - first_offset).as_usize()] | ||
| { | ||
| *hash = combine_hashes(*hash, *values_hash); | ||
| } | ||
| } | ||
| } | ||
| } else { | ||
| for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { | ||
| let hash = &mut hashes_buffer[i]; | ||
| for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { | ||
| for ((start, stop), hash) in array | ||
| .value_offsets() | ||
| .iter() | ||
| .tuple_windows() | ||
| .zip(hashes_buffer.iter_mut()) | ||
| { | ||
| for values_hash in &values_hashes | ||
| [(*start - first_offset).as_usize()..(*stop - first_offset).as_usize()] | ||
| { | ||
| *hash = combine_hashes(*hash, *values_hash); | ||
| } | ||
| } | ||
|
|
@@ -1128,6 +1146,30 @@ mod tests { | |
| assert_eq!(hashes[1], hashes[6]); // null vs empty list | ||
| } | ||
|
|
||
| #[test] | ||
| #[cfg(not(feature = "force_hash_collisions"))] | ||
| fn create_hashes_for_sliced_list_arrays() { | ||
| let data = vec![ | ||
| Some(vec![Some(0), Some(1), Some(2)]), | ||
| None, | ||
| // Slice from here | ||
| Some(vec![Some(3), None, Some(5)]), | ||
| Some(vec![Some(3), None, Some(5)]), | ||
| None, | ||
| // To here | ||
| Some(vec![Some(0), Some(1), Some(2)]), | ||
| Some(vec![]), | ||
| ]; | ||
| let list_array = | ||
| Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef; | ||
| let list_array = list_array.slice(2, 3); | ||
| let random_state = RandomState::with_seeds(0, 0, 0, 0); | ||
| let mut hashes = vec![0; list_array.len()]; | ||
| create_hashes(&[list_array], &random_state, &mut hashes).unwrap(); | ||
| assert_eq!(hashes[0], hashes[1]); | ||
| assert_ne!(hashes[1], hashes[2]); | ||
| } | ||
|
|
||
| #[test] | ||
| // Tests actual values of hashes, which are different if forcing collisions | ||
| #[cfg(not(feature = "force_hash_collisions"))] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This allocates a fresh `values_hashes` Vec for every list column hashed. Could we reuse a buffer (similar to `HASH_BUFFER` above) or early-return for an empty `value_bytes_len` to trim repeated allocations?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mean reuse a buffer between invocations of
`hash_list_array()` itself? We could look into that, but I'd say that's beyond the scope of changes here, especially as the other functions don't do this, so it might need more plumbing etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree removing the allocation would be good -- however, the existing code also creates a `vec!`, so I don't think this change is any worse in this regard.