Is your feature request related to a problem or challenge?
|
/// Updates the hash slot of one row from the hash of the dictionary value
/// that the row's key points at.
///
/// * `hash` - hash slot of the current row; modified in place.
/// * `dict_hashes` - hashes indexed by dictionary value position (presumably
///   precomputed by the caller); indexing with `idx` panics if `idx` is out
///   of bounds.
/// * `dict_values` - dictionary values array; consulted only for the
///   validity of slot `idx`.
/// * `idx` - position of the referenced dictionary value.
/// * `multi_col` - when `true`, `dict_hashes[idx]` is merged with the
///   existing `*hash` through `combine_hashes`; when `false`, it overwrites
///   `*hash`.
///
/// When the dictionary value at `idx` is not valid, `*hash` is left untouched.
fn update_hash_for_dict_key( |
|
hash: &mut u64, |
|
dict_hashes: &[u64], |
|
dict_values: &dyn Array, |
|
idx: usize, |
|
multi_col: bool, |
|
) { |
|
if dict_values.is_valid(idx) { |
|
// NOTE(review): `multi_col` is tested on every call of this helper, even
// though its value is a plain `bool` argument that the helper never changes.
if multi_col { |
|
*hash = combine_hashes(dict_hashes[idx], *hash); |
|
} else { |
|
*hash = dict_hashes[idx]; |
|
} |
|
} |
|
// no update for invalid dictionary value |
|
} |
|
|
|
/// Hash the values in a dictionary array |
///
/// Each dictionary value is hashed once into a temporary `dict_hashes`
/// vector, and every key then looks up the hash of the value it references
/// instead of hashing that value again.
///
/// * `array` - dictionary array whose keys and values are read.
/// * `random_state` - forwarded unchanged to `create_hashes`.
/// * `hashes_buffer` - one slot per row, updated in place; it is zipped with
///   the keys, so only as many rows as the shorter of the two are visited.
/// * `multi_col` - forwarded to `update_hash_for_dict_key`, which either
///   combines the value hash with the existing slot (`true`) or overwrites
///   the slot (`false`).
///
/// Rows whose key is null are left unchanged. Any error returned by
/// `create_hashes` is propagated to the caller.
///
/// Only compiled when the `force_hash_collisions` feature is not enabled.
|
#[cfg(not(feature = "force_hash_collisions"))] |
|
fn hash_dictionary<K: ArrowDictionaryKeyType>( |
|
array: &DictionaryArray<K>, |
|
random_state: &RandomState, |
|
hashes_buffer: &mut [u64], |
|
multi_col: bool, |
|
) -> Result<()> { |
|
// Hash each dictionary value once, and then use that computed |
|
// hash for each key value to avoid a potentially expensive |
|
// redundant hashing for large dictionary elements (e.g. strings) |
|
let dict_values = array.values(); |
|
let mut dict_hashes = vec![0; dict_values.len()]; |
|
create_hashes([dict_values], random_state, &mut dict_hashes)?; |
|

|
// combine hash for each index in values |
|
// NOTE(review): the loop always goes through the `Option` key iterator and
// passes `multi_col` down per key; there is no separate path for arrays
// without null keys.
for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) { |
|
if let Some(key) = key { |
|
let idx = key.as_usize(); |
|
update_hash_for_dict_key( |
|
hash, |
|
&dict_hashes, |
|
dict_values.as_ref(), |
|
idx, |
|
multi_col, |
|
); |
|
} // no update for Null key |
|
} |
|
Ok(()) |
|
} |
- Dictionary doesn't take a separate fast path when there are 0 null keys, and has the rehash (`multi_col`) check inside the hot loop
|
let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls { |
|
nulls.valid_indices().collect() |
|
} else { |
|
(0..row_len).collect() |
|
}; |
|
|
|
// Create hashes for each row that combines the hashes over all the column at that row. |
|
let mut values_hashes = vec![0u64; row_len]; |
|
create_hashes(array.columns(), random_state, &mut values_hashes)?; |
|
|
|
for i in valid_row_indices { |
|
let hash = &mut hashes_buffer[i]; |
|
*hash = combine_hashes(*hash, values_hashes[i]); |
|
} |
- Struct collects the valid indices into a `Vec` even if there are no nulls
|
// Combine the hashes for entries on each row with each other and previous hash for that row |
|
if let Some(nulls) = nulls { |
|
for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { |
|
create_hashes([values], random_state, &mut values_hashes)?; |
|
if let Some(nulls) = nulls { |
|
for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() { |
|
create_hashes([values], random_state, &mut values_hashes)?; |
|
if let Some(nulls) = nulls { |
|
for i in 0..array.len() { |
- Map/ListView/FixedSizeList check only for the existence of a null buffer, not for whether it actually contains any nulls (null_count)
|
|
|
if rehash { |
|
if !is_null_value { |
- Run array has the rehash check inside the hot loop
Describe the solution you'd like
- Always choose the null-handling path based on null_count; for dictionary + struct we should have separate paths for when there are nulls vs. when there are no nulls (to be consistent with how the other functions handle this)
- This may be a pedantic case (how often would we have an array with a null buffer but no nulls in it?), but consider cases such as slicing an array that has nulls down to a section that contains no null values
- Pull the rehash check outside the hot loop
Describe alternatives you've considered
If there are no noticeable performance improvements, this might not be worth pursuing. Branch prediction may already be good enough for a rehash check inside the hot loop, since its outcome doesn't change between iterations 🤔
Additional context
Can take inspiration from
Is your feature request related to a problem or challenge?
datafusion/datafusion/common/src/hash_utils.rs
Lines 398 to 444 in b80bf2c
datafusion/datafusion/common/src/hash_utils.rs
Lines 455 to 468 in b80bf2c
datafusion/datafusion/common/src/hash_utils.rs
Lines 487 to 489 in b80bf2c
datafusion/datafusion/common/src/hash_utils.rs
Lines 574 to 576 in b80bf2c
datafusion/datafusion/common/src/hash_utils.rs
Lines 643 to 645 in b80bf2c
datafusion/datafusion/common/src/hash_utils.rs
Lines 713 to 715 in b80bf2c
Describe the solution you'd like
Describe alternatives you've considered
If there are no noticeable performance improvements, this might not be worth pursuing. Branch prediction may already be good enough for a rehash check inside the hot loop, since its outcome doesn't change between iterations 🤔
Additional context
Can take inspiration from