diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index 6886220ccf948..56c681ad3b236 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -162,7 +162,7 @@ macro_rules! hash_value { })+ }; } -hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64); +hash_value!(i8, i16, i32, i64, u128, i128, i256, u8, u16, u32, u64); hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano); macro_rules! hash_float_value { @@ -221,6 +221,125 @@ fn hash_array_primitive( } } +/// Hash a string view array. +/// +/// Templated to optimize inner loop based on presence of nulls and external buffers. +/// +/// HAS_NULLS: do we have to check in the inner loop +/// HAS_BUFFERS: if true, array has external buffers; if false, all strings are inlined/ less then 12 bytes +/// REHASH: if true, combining with existing hash, otherwise initializing +#[inline(never)] +fn hash_string_view_array_inner< + const HAS_NULLS: bool, + const HAS_BUFFERS: bool, + const REHASH: bool, +>( + array: &StringViewArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], +) { + assert_eq!( + hashes_buffer.len(), + array.len(), + "hashes_buffer and array should be of equal length" + ); + + for (i, hash) in hashes_buffer.iter_mut().enumerate() { + if HAS_NULLS && array.is_null(i) { + continue; + } + // SAFETY: length is verified above so i is within bounds + let v = unsafe { *array.views().get_unchecked(i) }; + let view_len = v as u32; + // all views are inlined, no need to access external buffers + if !HAS_BUFFERS || view_len <= 12 { + if REHASH { + *hash = combine_hashes(v.hash_one(random_state), *hash); + } else { + *hash = v.hash_one(random_state); + } + continue; + } + // view is not inlined, so we need to hash the bytes as well + let view = ByteView::from(v); + let data = unsafe { + array + .data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + // SAFETY: view is a valid view as it came from the array + let view_bytes = + unsafe { data.get_unchecked(offset..offset + view_len as usize) }; + if REHASH { + *hash = combine_hashes(view_bytes.hash_one(random_state), *hash); + } else { + *hash = view_bytes.hash_one(random_state); + } + } +} + +/// Builds hash values for array views and writes them into `hashes_buffer` +/// If `rehash==true` this combines the previous hash value in the buffer +/// with the new hash using `combine_hashes` +/// +/// TODO: make general for butesview as well +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_string_view_array( + array: &StringViewArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], + rehash: bool, +) { + // instantiate the correct version based on presence of nulls and external buffers + match ( + array.null_count() != 0, + !array.data_buffers().is_empty(), + rehash, + ) { + (false, false, false) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (false, false, true) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (false, true, false) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (false, true, true) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (true, false, false) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (true, false, true) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (true, true, false) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (true, true, true) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + } +} + /// Hashes one array into the `hashes_buffer` /// If `rehash==true` this combines the previous hash value in the buffer /// with the new hash using `combine_hashes` @@ -568,7 +687,7 @@ fn hash_single_array( DataType::Null => hash_null(random_state, hashes_buffer, rehash), DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash), DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8View => hash_string_view_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash), DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash), DataType::Binary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash), diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs index 31a152aa74174..cc255a5bbcb6a 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs @@ -117,15 +117,38 @@ impl ByteViewGroupValueBuilder { self.do_append_val_inner(arr, row); } - fn vectorized_equal_to_inner( + fn vectorized_equal_to_inner_no_nulls( &self, lhs_rows: &[usize], - array: &ArrayRef, + array: &GenericByteViewArray, rhs_rows: &[usize], equal_to_results: &mut [bool], ) { - let array = array.as_byte_view::(); + let iter = izip!( + lhs_rows.iter(), + rhs_rows.iter(), + equal_to_results.iter_mut(), + ); + // if the input array has no nulls, can skip null check + for (&lhs_row, &rhs_row, equal_to_result) in iter { + // Has found not equal to, don't need to check + if !*equal_to_result { + continue; + } + + *equal_to_result = + self.do_equal_to_inner_values_only(lhs_row, array, rhs_row); + } + } + + fn vectorized_equal_to_inner_nulls( + &self, + lhs_rows: &[usize], + array: &GenericByteViewArray, + rhs_rows: &[usize], + equal_to_results: &mut [bool], + ) { let iter = izip!( lhs_rows.iter(), rhs_rows.iter(), @@ -226,14 +249,26 @@ impl ByteViewGroupValueBuilder { let exist_null = self.nulls.is_null(lhs_row); let input_null = array.is_null(rhs_row); if let Some(result) = nulls_equal_to(exist_null, input_null) { - return result; + result + } else { + self.do_equal_to_inner_values_only(lhs_row, array, rhs_row) } + } - // Otherwise, we need to check their values - let exist_view = self.views[lhs_row]; + /// Same as equal_to_inner, but without null checks + /// + /// Checks only the values for equality + #[inline(always)] + fn do_equal_to_inner_values_only( + &self, + lhs_row: usize, + array: &GenericByteViewArray, + rhs_row: usize, + ) -> bool { + let exist_view = unsafe { *self.views.get_unchecked(lhs_row) }; let exist_view_len = exist_view as u32; - let input_view = array.views()[rhs_row]; + let input_view = unsafe { *array.views().get_unchecked(rhs_row) }; let input_view_len = input_view as u32; // The check logic @@ -246,19 +281,9 @@ impl ByteViewGroupValueBuilder { } if exist_view_len <= 12 { - let exist_inline = unsafe { - GenericByteViewArray::::inline_value( - &exist_view, - exist_view_len as usize, - ) - }; - let input_inline = unsafe { - GenericByteViewArray::::inline_value( - &input_view, - input_view_len as usize, - ) - }; - exist_inline == input_inline + // the views are inlined and the lengths are equal, so just + // compare the views directly + exist_view == input_view } else { let exist_prefix = unsafe { GenericByteViewArray::::inline_value(&exist_view, 4) }; @@ -507,7 +532,22 @@ impl GroupColumn for ByteViewGroupValueBuilder { rows: &[usize], equal_to_results: &mut [bool], ) { - self.vectorized_equal_to_inner(group_indices, array, rows, equal_to_results); + let array = array.as_byte_view::(); + if array.null_count() == 0 { + self.vectorized_equal_to_inner_no_nulls( + group_indices, + array, + rows, + equal_to_results, + ); + } else { + self.vectorized_equal_to_inner_nulls( + group_indices, + array, + rows, + equal_to_results, + ); + } } fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]) -> Result<()> {