diff --git a/cpp/src/arrow/compute/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc
index f5867b405ec..1902b9ce9a8 100644
--- a/cpp/src/arrow/compute/key_hash.cc
+++ b/cpp/src/arrow/compute/key_hash.cc
@@ -105,23 +105,23 @@ inline void Hashing32::StripeMask(int i, uint32_t* mask1, uint32_t* mask2,
 }

 template <bool T_COMBINE_HASHES>
-void Hashing32::HashFixedLenImp(uint32_t num_rows, uint64_t length, const uint8_t* keys,
-                                uint32_t* hashes) {
+void Hashing32::HashFixedLenImp(uint32_t num_rows, uint64_t key_length,
+                                const uint8_t* keys, uint32_t* hashes) {
   // Calculate the number of rows that skip the last 16 bytes
   //
   uint32_t num_rows_safe = num_rows;
-  while (num_rows_safe > 0 && (num_rows - num_rows_safe) * length < kStripeSize) {
+  while (num_rows_safe > 0 && (num_rows - num_rows_safe) * key_length < kStripeSize) {
     --num_rows_safe;
   }

   // Compute masks for the last 16 byte stripe
   //
-  uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize);
+  uint64_t num_stripes = bit_util::CeilDiv(key_length, kStripeSize);
   uint32_t mask1, mask2, mask3, mask4;
-  StripeMask(((length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, &mask3, &mask4);
+  StripeMask(((key_length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, &mask3, &mask4);

   for (uint32_t i = 0; i < num_rows_safe; ++i) {
-    const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+    const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
     uint32_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
     ProcessLastStripe(mask1, mask2, mask3, mask4, key + (num_stripes - 1) * kStripeSize,
@@ -138,11 +138,11 @@ void Hashing32::HashFixedLenImp(uint32_t num_rows, uint64_t length, const uint8_

   uint32_t last_stripe_copy[4];
   for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
-    const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+    const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
     uint32_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
     memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
-           length - (num_stripes - 1) * kStripeSize);
+           key_length - (num_stripes - 1) * kStripeSize);
     ProcessLastStripe(mask1, mask2, mask3, mask4,
                       reinterpret_cast<const uint8_t*>(last_stripe_copy), &acc1, &acc2,
                       &acc3, &acc4);
@@ -168,15 +168,16 @@ void Hashing32::HashVarLenImp(uint32_t num_rows, const T* offsets,
   }

   for (uint32_t i = 0; i < num_rows_safe; ++i) {
-    uint64_t length = offsets[i + 1] - offsets[i];
+    uint64_t key_length = offsets[i + 1] - offsets[i];

     // Compute masks for the last 16 byte stripe.
     // For an empty string set number of stripes to 1 but mask to all zeroes.
     //
-    int is_non_empty = length == 0 ? 0 : 1;
-    uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 - is_non_empty);
+    int is_non_empty = key_length == 0 ? 0 : 1;
+    uint64_t num_stripes =
+        bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
     uint32_t mask1, mask2, mask3, mask4;
-    StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, &mask1,
+    StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, &mask1,
                &mask2, &mask3, &mask4);

     const uint8_t* key = concatenated_keys + offsets[i];
@@ -198,23 +199,24 @@ void Hashing32::HashVarLenImp(uint32_t num_rows, const T* offsets,

   uint32_t last_stripe_copy[4];
   for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
-    uint64_t length = offsets[i + 1] - offsets[i];
+    uint64_t key_length = offsets[i + 1] - offsets[i];

     // Compute masks for the last 16 byte stripe.
     // For an empty string set number of stripes to 1 but mask to all zeroes.
     //
-    int is_non_empty = length == 0 ? 0 : 1;
-    uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 - is_non_empty);
+    int is_non_empty = key_length == 0 ? 0 : 1;
+    uint64_t num_stripes =
+        bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
     uint32_t mask1, mask2, mask3, mask4;
-    StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, &mask1,
+    StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, &mask1,
                &mask2, &mask3, &mask4);

     const uint8_t* key = concatenated_keys + offsets[i];
     uint32_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
-    if (length > 0) {
+    if (key_length > 0) {
       memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
-             length - (num_stripes - 1) * kStripeSize);
+             key_length - (num_stripes - 1) * kStripeSize);
     }
     if (num_stripes > 0) {
       ProcessLastStripe(mask1, mask2, mask3, mask4,
@@ -309,9 +311,9 @@ void Hashing32::HashIntImp(uint32_t num_keys, const T* keys, uint32_t* hashes) {
   }
 }

-void Hashing32::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t length_key,
+void Hashing32::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                         const uint8_t* keys, uint32_t* hashes) {
-  switch (length_key) {
+  switch (key_length) {
     case sizeof(uint8_t):
       if (combine_hashes) {
         HashIntImp<true, uint8_t>(num_keys, keys, hashes);
@@ -352,27 +354,27 @@ void Hashing32::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t length_
   }
 }

-void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t num_rows,
-                          uint64_t length, const uint8_t* keys, uint32_t* hashes,
-                          uint32_t* hashes_temp_for_combine) {
-  if (ARROW_POPCOUNT64(length) == 1 && length <= sizeof(uint64_t)) {
-    HashInt(combine_hashes, num_rows, length, keys, hashes);
+void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t num_keys,
+                          uint64_t key_length, const uint8_t* keys, uint32_t* hashes,
+                          uint32_t* temp_hashes_for_combine) {
+  if (ARROW_POPCOUNT64(key_length) == 1 && key_length <= sizeof(uint64_t)) {
+    HashInt(combine_hashes, num_keys, key_length, keys, hashes);
     return;
   }

   uint32_t num_processed = 0;
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-    num_processed = HashFixedLen_avx2(combine_hashes, num_rows, length, keys, hashes,
-                                      hashes_temp_for_combine);
+    num_processed = HashFixedLen_avx2(combine_hashes, num_keys, key_length, keys, hashes,
+                                      temp_hashes_for_combine);
   }
 #endif

   if (combine_hashes) {
-    HashFixedLenImp<true>(num_rows - num_processed, length, keys + length * num_processed,
-                          hashes + num_processed);
+    HashFixedLenImp<true>(num_keys - num_processed, key_length,
+                          keys + key_length * num_processed, hashes + num_processed);
   } else {
-    HashFixedLenImp<false>(num_rows - num_processed, length,
-                           keys + length * num_processed, hashes + num_processed);
+    HashFixedLenImp<false>(num_keys - num_processed, key_length,
+                           keys + key_length * num_processed, hashes + num_processed);
   }
 }
@@ -423,13 +425,13 @@ void Hashing32::HashMultiColumn(const std::vector<KeyColumnArray>& cols,
     }

     if (cols[icol].metadata().is_fixed_length) {
-      uint32_t col_width = cols[icol].metadata().fixed_length;
-      if (col_width == 0) {
+      uint32_t key_length = cols[icol].metadata().fixed_length;
+      if (key_length == 0) {
         HashBit(icol > 0, cols[icol].bit_offset(1), batch_size_next,
                 cols[icol].data(1) + first_row / 8, hashes + first_row);
       } else {
-        HashFixed(ctx->hardware_flags, icol > 0, batch_size_next, col_width,
-                  cols[icol].data(1) + first_row * col_width, hashes + first_row,
+        HashFixed(ctx->hardware_flags, icol > 0, batch_size_next, key_length,
+                  cols[icol].data(1) + first_row * key_length, hashes + first_row,
                   hash_temp);
       }
     } else if (cols[icol].metadata().fixed_length == sizeof(uint32_t)) {
@@ -463,8 +465,9 @@ void Hashing32::HashMultiColumn(const std::vector<KeyColumnArray>& cols,
 Status Hashing32::HashBatch(const ExecBatch& key_batch, uint32_t* hashes,
                             std::vector<KeyColumnArray>& column_arrays,
                             int64_t hardware_flags, util::TempVectorStack* temp_stack,
-                            int64_t offset, int64_t length) {
-  RETURN_NOT_OK(ColumnArraysFromExecBatch(key_batch, offset, length, &column_arrays));
+                            int64_t start_row, int64_t num_rows) {
+  RETURN_NOT_OK(
+      ColumnArraysFromExecBatch(key_batch, start_row, num_rows, &column_arrays));

   LightContext ctx;
   ctx.hardware_flags = hardware_flags;
@@ -574,23 +577,23 @@ inline void Hashing64::StripeMask(int i, uint64_t* mask1, uint64_t* mask2,
 }

 template <bool T_COMBINE_HASHES>
-void Hashing64::HashFixedLenImp(uint32_t num_rows, uint64_t length, const uint8_t* keys,
-                                uint64_t* hashes) {
+void Hashing64::HashFixedLenImp(uint32_t num_rows, uint64_t key_length,
+                                const uint8_t* keys, uint64_t* hashes) {
   // Calculate the number of rows that skip the last 32 bytes
   //
   uint32_t num_rows_safe = num_rows;
-  while (num_rows_safe > 0 && (num_rows - num_rows_safe) * length < kStripeSize) {
+  while (num_rows_safe > 0 && (num_rows - num_rows_safe) * key_length < kStripeSize) {
     --num_rows_safe;
   }

   // Compute masks for the last 32 byte stripe
   //
-  uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize);
+  uint64_t num_stripes = bit_util::CeilDiv(key_length, kStripeSize);
   uint64_t mask1, mask2, mask3, mask4;
-  StripeMask(((length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, &mask3, &mask4);
+  StripeMask(((key_length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, &mask3, &mask4);

   for (uint32_t i = 0; i < num_rows_safe; ++i) {
-    const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+    const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
     uint64_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
     ProcessLastStripe(mask1, mask2, mask3, mask4, key + (num_stripes - 1) * kStripeSize,
@@ -607,11 +610,11 @@ void Hashing64::HashFixedLenImp(uint32_t num_rows, uint64_t length, const uint8_

   uint64_t last_stripe_copy[4];
   for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
-    const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+    const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
     uint64_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
     memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
-           length - (num_stripes - 1) * kStripeSize);
+           key_length - (num_stripes - 1) * kStripeSize);
     ProcessLastStripe(mask1, mask2, mask3, mask4,
                       reinterpret_cast<const uint8_t*>(last_stripe_copy), &acc1, &acc2,
                       &acc3, &acc4);
@@ -637,15 +640,16 @@ void Hashing64::HashVarLenImp(uint32_t num_rows, const T* offsets,
   }

   for (uint32_t i = 0; i < num_rows_safe; ++i) {
-    uint64_t length = offsets[i + 1] - offsets[i];
+    uint64_t key_length = offsets[i + 1] - offsets[i];

     // Compute masks for the last 32 byte stripe.
     // For an empty string set number of stripes to 1 but mask to all zeroes.
     //
-    int is_non_empty = length == 0 ? 0 : 1;
-    uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 - is_non_empty);
+    int is_non_empty = key_length == 0 ? 0 : 1;
+    uint64_t num_stripes =
+        bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
     uint64_t mask1, mask2, mask3, mask4;
-    StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, &mask1,
+    StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, &mask1,
                &mask2, &mask3, &mask4);

     const uint8_t* key = concatenated_keys + offsets[i];
@@ -667,22 +671,23 @@ void Hashing64::HashVarLenImp(uint32_t num_rows, const T* offsets,

   uint64_t last_stripe_copy[4];
   for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
-    uint64_t length = offsets[i + 1] - offsets[i];
+    uint64_t key_length = offsets[i + 1] - offsets[i];

     // Compute masks for the last 32 byte stripe
     //
-    int is_non_empty = length == 0 ? 0 : 1;
-    uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 - is_non_empty);
+    int is_non_empty = key_length == 0 ? 0 : 1;
+    uint64_t num_stripes =
+        bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
     uint64_t mask1, mask2, mask3, mask4;
-    StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, &mask1,
+    StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, &mask1,
                &mask2, &mask3, &mask4);

     const uint8_t* key = concatenated_keys + offsets[i];
     uint64_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
-    if (length > 0) {
+    if (key_length > 0) {
       memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
-             length - (num_stripes - 1) * kStripeSize);
+             key_length - (num_stripes - 1) * kStripeSize);
     }
     if (num_stripes > 0) {
       ProcessLastStripe(mask1, mask2, mask3, mask4,
@@ -759,9 +764,9 @@ void Hashing64::HashIntImp(uint32_t num_keys, const T* keys, uint64_t* hashes) {
   }
 }

-void Hashing64::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t length_key,
+void Hashing64::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                         const uint8_t* keys, uint64_t* hashes) {
-  switch (length_key) {
+  switch (key_length) {
     case sizeof(uint8_t):
       if (combine_hashes) {
         HashIntImp<true, uint8_t>(num_keys, keys, hashes);
@@ -802,17 +807,17 @@ void Hashing64::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t length_
   }
 }

-void Hashing64::HashFixed(bool combine_hashes, uint32_t num_rows, uint64_t length,
+void Hashing64::HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                           const uint8_t* keys, uint64_t* hashes) {
-  if (ARROW_POPCOUNT64(length) == 1 && length <= sizeof(uint64_t)) {
-    HashInt(combine_hashes, num_rows, length, keys, hashes);
+  if (ARROW_POPCOUNT64(key_length) == 1 && key_length <= sizeof(uint64_t)) {
+    HashInt(combine_hashes, num_keys, key_length, keys, hashes);
     return;
   }

   if (combine_hashes) {
-    HashFixedLenImp<true>(num_rows, length, keys, hashes);
+    HashFixedLenImp<true>(num_keys, key_length, keys, hashes);
   } else {
-    HashFixedLenImp<false>(num_rows, length, keys, hashes);
+    HashFixedLenImp<false>(num_keys, key_length, keys, hashes);
   }
 }
@@ -860,13 +865,13 @@ void Hashing64::HashMultiColumn(const std::vector<KeyColumnArray>& cols,
     }

     if (cols[icol].metadata().is_fixed_length) {
-      uint64_t col_width = cols[icol].metadata().fixed_length;
-      if (col_width == 0) {
+      uint64_t key_length = cols[icol].metadata().fixed_length;
+      if (key_length == 0) {
         HashBit(icol > 0, cols[icol].bit_offset(1), batch_size_next,
                 cols[icol].data(1) + first_row / 8, hashes + first_row);
       } else {
-        HashFixed(icol > 0, batch_size_next, col_width,
-                  cols[icol].data(1) + first_row * col_width, hashes + first_row);
+        HashFixed(icol > 0, batch_size_next, key_length,
+                  cols[icol].data(1) + first_row * key_length, hashes + first_row);
       }
     } else if (cols[icol].metadata().fixed_length == sizeof(uint32_t)) {
       HashVarLen(icol > 0, batch_size_next, cols[icol].offsets() + first_row,
@@ -897,8 +902,9 @@ void Hashing64::HashMultiColumn(const std::vector<KeyColumnArray>& cols,
 Status Hashing64::HashBatch(const ExecBatch& key_batch, uint64_t* hashes,
                             std::vector<KeyColumnArray>& column_arrays,
                             int64_t hardware_flags, util::TempVectorStack* temp_stack,
-                            int64_t offset, int64_t length) {
-  RETURN_NOT_OK(ColumnArraysFromExecBatch(key_batch, offset, length, &column_arrays));
+                            int64_t start_row, int64_t num_rows) {
+  RETURN_NOT_OK(
+      ColumnArraysFromExecBatch(key_batch, start_row, num_rows, &column_arrays));

   LightContext ctx;
   ctx.hardware_flags = hardware_flags;
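
Note: the tail-byte invariant behind the renamed code above is the num_rows_safe loop in HashFixedLenImp. Below is a minimal standalone sketch of that loop (illustrative only, not part of the patch; kStripeSize, NumRowsSafe, and main() are local stand-ins for this example):

    #include <cstdint>
    #include <cstdio>

    // Stand-in for Hashing32::kStripeSize (16-byte SIMD stripe loads).
    constexpr uint64_t kStripeSize = 16;

    // Same loop as in Hashing32::HashFixedLenImp above: shrink num_rows_safe
    // until the rows excluded at the tail cover at least one full stripe of
    // bytes, so a 16-byte load issued for any remaining row cannot run past
    // the end of the packed fixed-length key buffer.
    uint32_t NumRowsSafe(uint32_t num_rows, uint64_t key_length) {
      uint32_t num_rows_safe = num_rows;
      while (num_rows_safe > 0 && (num_rows - num_rows_safe) * key_length < kStripeSize) {
        --num_rows_safe;
      }
      return num_rows_safe;
    }

    int main() {
      // 1450 rows of 3-byte keys: the last 6 rows (18 bytes >= 16) are peeled
      // off and hashed through the memcpy scratch path instead.
      printf("%u\n", NumRowsSafe(1450, 3));  // prints 1444
      return 0;
    }

The key_hash_avx2.cc hunk further down applies the same invariant to the AVX2 path, which is where GH-39778 originated.
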
diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h
index b193716c9bd..1173df5ed10 100644
--- a/cpp/src/arrow/compute/key_hash.h
+++ b/cpp/src/arrow/compute/key_hash.h
@@ -51,10 +51,10 @@ class ARROW_EXPORT Hashing32 {
   static Status HashBatch(const ExecBatch& key_batch, uint32_t* hashes,
                           std::vector<KeyColumnArray>& column_arrays,
                           int64_t hardware_flags, util::TempVectorStack* temp_stack,
-                          int64_t offset, int64_t length);
+                          int64_t start_row, int64_t num_rows);

   static void HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t num_keys,
-                        uint64_t length_key, const uint8_t* keys, uint32_t* hashes,
+                        uint64_t key_length, const uint8_t* keys, uint32_t* hashes,
                         uint32_t* temp_hashes_for_combine);

  private:
@@ -100,7 +100,7 @@ class ARROW_EXPORT Hashing32 {
   static inline void StripeMask(int i, uint32_t* mask1, uint32_t* mask2, uint32_t* mask3,
                                 uint32_t* mask4);
   template <bool T_COMBINE_HASHES>
-  static void HashFixedLenImp(uint32_t num_rows, uint64_t length, const uint8_t* keys,
+  static void HashFixedLenImp(uint32_t num_rows, uint64_t key_length, const uint8_t* keys,
                               uint32_t* hashes);
   template <typename T, bool T_COMBINE_HASHES>
   static void HashVarLenImp(uint32_t num_rows, const T* offsets,
@@ -112,7 +112,7 @@ class ARROW_EXPORT Hashing32 {
                       const uint8_t* keys, uint32_t* hashes);
   template <bool T_COMBINE_HASHES, typename T>
   static void HashIntImp(uint32_t num_keys, const T* keys, uint32_t* hashes);
-  static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t length_key,
+  static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                       const uint8_t* keys, uint32_t* hashes);

 #if defined(ARROW_HAVE_RUNTIME_AVX2)
@@ -129,11 +129,11 @@ class ARROW_EXPORT Hashing32 {
                                          __m256i mask_last_stripe, const uint8_t* keys,
                                          int64_t offset_A, int64_t offset_B);
   template <bool T_COMBINE_HASHES>
-  static uint32_t HashFixedLenImp_avx2(uint32_t num_rows, uint64_t length,
+  static uint32_t HashFixedLenImp_avx2(uint32_t num_rows, uint64_t key_length,
                                        const uint8_t* keys, uint32_t* hashes,
                                        uint32_t* hashes_temp_for_combine);
   static uint32_t HashFixedLen_avx2(bool combine_hashes, uint32_t num_rows,
-                                    uint64_t length, const uint8_t* keys,
+                                    uint64_t key_length, const uint8_t* keys,
                                     uint32_t* hashes, uint32_t* hashes_temp_for_combine);
   template <typename T, bool T_COMBINE_HASHES>
   static uint32_t HashVarLenImp_avx2(uint32_t num_rows, const T* offsets,
@@ -164,9 +164,9 @@ class ARROW_EXPORT Hashing64 {
   static Status HashBatch(const ExecBatch& key_batch, uint64_t* hashes,
                           std::vector<KeyColumnArray>& column_arrays,
                           int64_t hardware_flags, util::TempVectorStack* temp_stack,
-                          int64_t offset, int64_t length);
+                          int64_t start_row, int64_t num_rows);

-  static void HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t length_key,
+  static void HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                         const uint8_t* keys, uint64_t* hashes);

  private:
@@ -203,7 +203,7 @@ class ARROW_EXPORT Hashing64 {
   static inline void StripeMask(int i, uint64_t* mask1, uint64_t* mask2, uint64_t* mask3,
                                 uint64_t* mask4);
   template <bool T_COMBINE_HASHES>
-  static void HashFixedLenImp(uint32_t num_rows, uint64_t length, const uint8_t* keys,
+  static void HashFixedLenImp(uint32_t num_rows, uint64_t key_length, const uint8_t* keys,
                               uint64_t* hashes);
   template <typename T, bool T_COMBINE_HASHES>
   static void HashVarLenImp(uint32_t num_rows, const T* offsets,
@@ -211,11 +211,11 @@ class ARROW_EXPORT Hashing64 {
   template <bool T_COMBINE_HASHES>
   static void HashBitImp(int64_t bit_offset, uint32_t num_keys, const uint8_t* keys,
                          uint64_t* hashes);
-  static void HashBit(bool T_COMBINE_HASHES, int64_t bit_offset, uint32_t num_keys,
+  static void HashBit(bool combine_hashes, int64_t bit_offset, uint32_t num_keys,
                       const uint8_t* keys, uint64_t* hashes);
   template <bool T_COMBINE_HASHES, typename T>
   static void HashIntImp(uint32_t num_keys, const T* keys, uint64_t* hashes);
-  static void HashInt(bool T_COMBINE_HASHES, uint32_t num_keys, uint64_t length_key,
+  static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                       const uint8_t* keys, uint64_t* hashes);
 };

diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_avx2.cc
index 1b444b57678..aec2800c647 100644
--- a/cpp/src/arrow/compute/key_hash_avx2.cc
+++ b/cpp/src/arrow/compute/key_hash_avx2.cc
@@ -190,7 +190,7 @@ uint32_t Hashing32::HashFixedLenImp_avx2(uint32_t num_rows, uint64_t length,
   // Do not process rows that could read past the end of the buffer using 16
   // byte loads. Round down number of rows to process to multiple of 2.
   //
-  uint64_t num_rows_to_skip = bit_util::CeilDiv(length, kStripeSize);
+  uint64_t num_rows_to_skip = bit_util::CeilDiv(kStripeSize, length);
   uint32_t num_rows_to_process =
       (num_rows_to_skip > num_rows)
           ? 0
diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc
index 3e6d41525cf..c998df7169c 100644
--- a/cpp/src/arrow/compute/key_hash_test.cc
+++ b/cpp/src/arrow/compute/key_hash_test.cc
@@ -252,5 +252,64 @@ TEST(VectorHash, BasicString) { RunTestVectorHash<StringType>(); }

 TEST(VectorHash, BasicLargeString) { RunTestVectorHash<LargeStringType>(); }

+void HashFixedLengthFrom(int key_length, int num_rows, int start_row) {
+  int num_rows_to_hash = num_rows - start_row;
+  auto num_bytes_aligned = arrow::bit_util::RoundUpToMultipleOf64(key_length * num_rows);
+
+  const auto hardware_flags_for_testing = HardwareFlagsForTesting();
+  ASSERT_GT(hardware_flags_for_testing.size(), 0);
+
+  std::vector<std::vector<uint32_t>> hashes32(hardware_flags_for_testing.size());
+  std::vector<std::vector<uint64_t>> hashes64(hardware_flags_for_testing.size());
+  for (auto& h : hashes32) {
+    h.resize(num_rows_to_hash);
+  }
+  for (auto& h : hashes64) {
+    h.resize(num_rows_to_hash);
+  }
+
+  FixedSizeBinaryBuilder keys_builder(fixed_size_binary(key_length));
+  for (int j = 0; j < num_rows; ++j) {
+    ASSERT_OK(keys_builder.Append(std::string(key_length, 42)));
+  }
+  ASSERT_OK_AND_ASSIGN(auto keys, keys_builder.Finish());
+  // Make sure the buffer is aligned as expected.
+  ASSERT_EQ(keys->data()->buffers[1]->capacity(), num_bytes_aligned);
+
+  constexpr int mini_batch_size = 1024;
+  std::vector<uint32_t> temp_buffer;
+  temp_buffer.resize(mini_batch_size * 4);
+
+  for (int i = 0; i < static_cast<int>(hardware_flags_for_testing.size()); ++i) {
+    const auto hardware_flags = hardware_flags_for_testing[i];
+    Hashing32::HashFixed(hardware_flags,
+                         /*combine_hashes=*/false, num_rows_to_hash, key_length,
+                         keys->data()->GetValues<uint8_t>(1) + start_row * key_length,
+                         hashes32[i].data(), temp_buffer.data());
+    Hashing64::HashFixed(
+        /*combine_hashes=*/false, num_rows_to_hash, key_length,
+        keys->data()->GetValues<uint8_t>(1) + start_row * key_length, hashes64[i].data());
+  }
+
+  // Verify that all implementations (scalar, SIMD) give the same hashes.
+  for (int i = 1; i < static_cast<int>(hardware_flags_for_testing.size()); ++i) {
+    for (int j = 0; j < num_rows_to_hash; ++j) {
+      ASSERT_EQ(hashes32[i][j], hashes32[0][j])
+          << "scalar and simd approaches yielded different 32-bit hashes";
+      ASSERT_EQ(hashes64[i][j], hashes64[0][j])
+          << "scalar and simd approaches yielded different 64-bit hashes";
+    }
+  }
+}
+
+// Some carefully chosen cases that may cause troubles like GH-39778.
+TEST(VectorHash, FixedLengthTailByteSafety) {
+  // Two cases of key_length < stripe (16-byte).
+  HashFixedLengthFrom(/*key_length=*/3, /*num_rows=*/1450, /*start_row=*/1447);
+  HashFixedLengthFrom(/*key_length=*/5, /*num_rows=*/883, /*start_row=*/858);
+  // Case of key_length > stripe (16-byte).
+  HashFixedLengthFrom(/*key_length=*/19, /*num_rows=*/64, /*start_row=*/63);
+}
+
 }  // namespace compute
 }  // namespace arrow
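
Note: the single-line key_hash_avx2.cc change above is the only behavioral change in this diff (the rest are renames and a new test). The AVX2 path hashes rows with raw 16-byte stripe loads and must therefore skip ceil(kStripeSize / key_length) trailing rows, not ceil(key_length / kStripeSize). A worked check of the arithmetic, as a standalone sketch (CeilDiv is re-implemented locally for illustration rather than taken from arrow::bit_util):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    constexpr uint64_t kStripeSize = 16;  // stand-in for Hashing32::kStripeSize

    uint64_t CeilDiv(uint64_t a, uint64_t b) { return (a + b - 1) / b; }

    int main() {
      // First case from the FixedLengthTailByteSafety test above.
      const uint64_t key_length = 3;

      // Buggy argument order: ceil(3 / 16) == 1 row skipped, i.e. only
      // 3 bytes of headroom -- a 16-byte stripe load can overrun the buffer.
      uint64_t skip_before_fix = CeilDiv(key_length, kStripeSize);
      // Fixed argument order: ceil(16 / 3) == 6 rows skipped, i.e. 18 bytes
      // of headroom, enough for every 16-byte load of a processed row.
      uint64_t skip_after_fix = CeilDiv(kStripeSize, key_length);

      assert(skip_before_fix * key_length < kStripeSize);   // 3 < 16: unsafe
      assert(skip_after_fix * key_length >= kStripeSize);   // 18 >= 16: safe
      printf("skip before fix: %llu row(s), after fix: %llu rows\n",
             static_cast<unsigned long long>(skip_before_fix),
             static_cast<unsigned long long>(skip_after_fix));
      return 0;
    }

With key_length >= kStripeSize both argument orders skip at least one full stripe of bytes, so the overrun only bites for short keys; that is consistent with the new test stressing key lengths 3 and 5 below the stripe size, plus 19 above it as a guard.
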