From 82278426410592dea775066cd9234620089ab6ec Mon Sep 17 00:00:00 2001 From: michalursa Date: Tue, 3 Aug 2021 18:30:25 -0700 Subject: [PATCH 1/6] Column-at-a-time comparison and hashing for Group Identifier --- cpp/src/arrow/compute/exec/key_compare.cc | 548 ++++++---- cpp/src/arrow/compute/exec/key_compare.h | 138 ++- .../arrow/compute/exec/key_compare_avx2.cc | 648 +++++++++--- cpp/src/arrow/compute/exec/key_encode.cc | 946 ++++++------------ cpp/src/arrow/compute/exec/key_encode.h | 184 ++-- cpp/src/arrow/compute/exec/key_encode_avx2.cc | 308 +----- cpp/src/arrow/compute/exec/key_hash.cc | 204 ++-- cpp/src/arrow/compute/exec/key_hash.h | 18 +- cpp/src/arrow/compute/exec/key_hash_avx2.cc | 36 +- cpp/src/arrow/compute/exec/key_map.cc | 2 +- cpp/src/arrow/compute/exec/key_map.h | 2 +- cpp/src/arrow/compute/exec/util.cc | 40 +- cpp/src/arrow/compute/exec/util.h | 13 +- cpp/src/arrow/compute/exec/util_avx2.cc | 14 +- .../arrow/compute/kernels/hash_aggregate.cc | 27 +- .../compute/kernels/hash_aggregate_test.cc | 3 + 16 files changed, 1570 insertions(+), 1561 deletions(-) diff --git a/cpp/src/arrow/compute/exec/key_compare.cc b/cpp/src/arrow/compute/exec/key_compare.cc index 7a5b0be9990..2881940cf9c 100644 --- a/cpp/src/arrow/compute/exec/key_compare.cc +++ b/cpp/src/arrow/compute/exec/key_compare.cc @@ -17,250 +17,406 @@ #include "arrow/compute/exec/key_compare.h" +#include + #include #include #include "arrow/compute/exec/util.h" +#include "arrow/util/bit_util.h" #include "arrow/util/ubsan.h" namespace arrow { namespace compute { -void KeyCompare::CompareRows(uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows, - uint16_t* out_sel_left_maybe_same, - const KeyEncoder::KeyRowArray& rows_left, - const KeyEncoder::KeyRowArray& rows_right) { - ARROW_DCHECK(rows_left.metadata().is_compatible(rows_right.metadata())); - - if (num_rows_to_compare == 0) { - *out_num_rows = 0; +template +void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector) { + if (!rows.has_any_nulls(ctx) && !col.data(0)) { return; } - - // Allocate temporary byte and bit vectors - auto bytevector_holder = - util::TempVectorHolder(ctx->stack, num_rows_to_compare); - auto bitvector_holder = - util::TempVectorHolder(ctx->stack, num_rows_to_compare); - - uint8_t* match_bytevector = bytevector_holder.mutable_data(); - uint8_t* match_bitvector = bitvector_holder.mutable_data(); - - // All comparison functions called here will update match byte vector - // (AND it with comparison result) instead of overwriting it. - memset(match_bytevector, 0xff, num_rows_to_compare); - - if (rows_left.metadata().is_fixed_length) { - CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - match_bytevector, ctx, rows_left.metadata().fixed_length, - rows_left.data(1), rows_right.data(1)); - } else { - CompareVaryingLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - match_bytevector, ctx, rows_left.data(2), rows_right.data(2), - rows_left.offsets(), rows_right.offsets()); + uint32_t num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2()) { + num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); } +#endif - // CompareFixedLength can be used to compare nulls as well - bool nulls_present = rows_left.has_any_nulls(ctx) || rows_right.has_any_nulls(ctx); - if (nulls_present) { - CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - match_bytevector, ctx, - rows_left.metadata().null_masks_bytes_per_row, - rows_left.null_masks(), rows_right.null_masks()); + if (!col.data(0)) { + // Remove rows from the result for which the column value is a null + const uint8_t* null_masks = rows.null_masks(); + uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; + for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; + uint32_t irow_right = left_to_right_map[irow_left]; + int64_t bitid = irow_right * null_mask_num_bytes * 8 + id_col; + match_bytevector[i] &= (BitUtil::GetBit(null_masks, bitid) ? 0 : 0xff); + } + } else if (!rows.has_any_nulls(ctx)) { + // Remove rows from the result for which the column value on left side is null + const uint8_t* non_nulls = col.data(0); + ARROW_DCHECK(non_nulls); + for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; + match_bytevector[i] &= + BitUtil::GetBit(non_nulls, irow_left + col.bit_offset(0)) ? 0xff : 0; + } + } else { + const uint8_t* null_masks = rows.null_masks(); + uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; + const uint8_t* non_nulls = col.data(0); + ARROW_DCHECK(non_nulls); + for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; + uint32_t irow_right = left_to_right_map[irow_left]; + int64_t bitid_right = irow_right * null_mask_num_bytes * 8 + id_col; + int right_null = BitUtil::GetBit(null_masks, bitid_right) ? 0xff : 0; + int left_null = + BitUtil::GetBit(non_nulls, irow_left + col.bit_offset(0)) ? 0 : 0xff; + match_bytevector[i] |= left_null & right_null; + match_bytevector[i] &= ~(left_null ^ right_null); + } } +} - util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, match_bytevector, - match_bitvector); - if (sel_left_maybe_null) { - int out_num_rows_int; - util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare, - match_bitvector, sel_left_maybe_null, - &out_num_rows_int, out_sel_left_maybe_same); - *out_num_rows = out_num_rows_int; +template +void KeyCompare::CompareBinaryColumnToRowHelper( + uint32_t offset_within_row, uint32_t first_row_to_compare, + uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector, COMPARE_FN compare_fn) { + bool is_fixed_length = rows.metadata().is_fixed_length; + if (is_fixed_length) { + uint32_t fixed_length = rows.metadata().fixed_length; + const uint8_t* rows_left = col.data(1); + const uint8_t* rows_right = rows.data(1); + for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; + uint32_t irow_right = left_to_right_map[irow_left]; + uint32_t offset_right = irow_right * fixed_length + offset_within_row; + match_bytevector[i] = compare_fn(rows_left, rows_right, irow_left, offset_right); + } } else { - int out_num_rows_int; - util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare, - match_bitvector, &out_num_rows_int, - out_sel_left_maybe_same); - *out_num_rows = out_num_rows_int; + const uint8_t* rows_left = col.data(1); + const uint32_t* offsets_right = rows.offsets(); + const uint8_t* rows_right = rows.data(2); + for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; + uint32_t irow_right = left_to_right_map[irow_left]; + uint32_t offset_right = offsets_right[irow_right] + offset_within_row; + match_bytevector[i] = compare_fn(rows_left, rows_right, irow_left, offset_right); + } } } -void KeyCompare::CompareFixedLength(uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - uint8_t* match_bytevector, - KeyEncoder::KeyEncoderContext* ctx, - uint32_t fixed_length, const uint8_t* rows_left, - const uint8_t* rows_right) { - bool use_selection = (sel_left_maybe_null != nullptr); - - uint32_t num_rows_already_processed = 0; - +template +void KeyCompare::CompareBinaryColumnToRow( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + uint32_t num_processed = 0; #if defined(ARROW_HAVE_AVX2) - if (ctx->has_avx2() && !use_selection) { - // Choose between up-to-8B length, up-to-16B length and any size versions - if (fixed_length <= 8) { - num_rows_already_processed = CompareFixedLength_UpTo8B_avx2( - num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length, - rows_left, rows_right); - } else if (fixed_length <= 16) { - num_rows_already_processed = CompareFixedLength_UpTo16B_avx2( - num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length, - rows_left, rows_right); - } else { - num_rows_already_processed = - CompareFixedLength_avx2(num_rows_to_compare, left_to_right_map, - match_bytevector, fixed_length, rows_left, rows_right); - } + if (ctx->has_avx2()) { + num_processed = CompareBinaryColumnToRow_avx2( + use_selection, offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector); } #endif - typedef void (*CompareFixedLengthImp_t)(uint32_t, uint32_t, const uint16_t*, - const uint32_t*, uint8_t*, uint32_t, - const uint8_t*, const uint8_t*); - static const CompareFixedLengthImp_t CompareFixedLengthImp_fn[] = { - CompareFixedLengthImp, CompareFixedLengthImp, - CompareFixedLengthImp, CompareFixedLengthImp, - CompareFixedLengthImp, CompareFixedLengthImp}; - int dispatch_const = (use_selection ? 3 : 0) + - ((fixed_length <= 8) ? 0 : ((fixed_length <= 16) ? 1 : 2)); - CompareFixedLengthImp_fn[dispatch_const]( - num_rows_already_processed, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, match_bytevector, fixed_length, rows_left, rows_right); -} + uint32_t col_width = col.metadata().fixed_length; + if (col_width == 0) { + int bit_offset = col.bit_offset(1); + CompareBinaryColumnToRowHelper( + offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [bit_offset](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left, uint32_t offset_right) { + uint8_t left = BitUtil::GetBit(left_base, irow_left + bit_offset) ? 0xff : 0x00; + uint8_t right = right_base[offset_right]; + return left == right ? 0xff : 0; + }); + } else if (col_width == 1) { + CompareBinaryColumnToRowHelper( + offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, + uint32_t offset_right) { + uint8_t left = left_base[irow_left]; + uint8_t right = right_base[offset_right]; + return left == right ? 0xff : 0; + }); + } else if (col_width == 2) { + CompareBinaryColumnToRowHelper( + offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, + uint32_t offset_right) { + util::CheckAlignment(left_base); + util::CheckAlignment(right_base + offset_right); + uint16_t left = reinterpret_cast(left_base)[irow_left]; + uint16_t right = *reinterpret_cast(right_base + offset_right); + return left == right ? 0xff : 0; + }); + } else if (col_width == 4) { + CompareBinaryColumnToRowHelper( + offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, + uint32_t offset_right) { + util::CheckAlignment(left_base); + util::CheckAlignment(right_base + offset_right); + uint32_t left = reinterpret_cast(left_base)[irow_left]; + uint32_t right = *reinterpret_cast(right_base + offset_right); + return left == right ? 0xff : 0; + }); + } else if (col_width == 8) { + CompareBinaryColumnToRowHelper( + offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, + uint32_t offset_right) { + util::CheckAlignment(left_base); + util::CheckAlignment(right_base + offset_right); + uint64_t left = reinterpret_cast(left_base)[irow_left]; + uint64_t right = *reinterpret_cast(right_base + offset_right); + return left == right ? 0xff : 0; + }); + } else { + CompareBinaryColumnToRowHelper( + offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [&col](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, + uint32_t offset_right) { + uint32_t length = col.metadata().fixed_length; -template -void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed, - uint32_t num_rows, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - uint8_t* match_bytevector, uint32_t length, - const uint8_t* rows_left, - const uint8_t* rows_right) { - // Key length (for encoded key) has to be non-zero - ARROW_DCHECK(length > 0); + // Non-zero length guarantees no underflow + int32_t num_loops_less_one = + BitUtil::CeilDiv(static_cast(length), 8) - 1; + + uint64_t tail_mask = ~0ULL >> (64 - 8 * (length - num_loops_less_one * 8)); - // Non-zero length guarantees no underflow - int32_t num_loops_less_one = (static_cast(length) + 7) / 8 - 1; + const uint64_t* key_left_ptr = + reinterpret_cast(left_base + irow_left * length); + util::CheckAlignment(right_base + offset_right); + const uint64_t* key_right_ptr = + reinterpret_cast(right_base + offset_right); + uint64_t result_or = 0; + int32_t i; + // length cannot be zero + for (i = 0; i < num_loops_less_one; ++i) { + uint64_t key_left = util::SafeLoad(key_left_ptr + i); + uint64_t key_right = key_right_ptr[i]; + result_or |= key_left ^ key_right; + } + uint64_t key_left = util::SafeLoad(key_left_ptr + i); + uint64_t key_right = key_right_ptr[i]; + result_or |= tail_mask & (key_left ^ key_right); + return result_or == 0 ? 0xff : 0; + }); + } +} - // Length remaining in last loop can only be zero for input length equal to zero - uint32_t length_remaining_last_loop = length - num_loops_less_one * 8; - uint64_t tail_mask = (~0ULL) >> (8 * (8 - length_remaining_last_loop)); +// Overwrites the match_bytevector instead of updating it +template +void KeyCompare::CompareVarBinaryColumnToRow( + uint32_t id_varbinary_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2()) { + CompareVarBinaryColumnToRow_avx2( + use_selection, is_first_varbinary_col, id_varbinary_col, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector); + return; + } +#endif - for (uint32_t id_input = num_rows_already_processed; id_input < num_rows; ++id_input) { - uint32_t irow_left = use_selection ? sel_left_maybe_null[id_input] : id_input; + const uint32_t* offsets_left = col.offsets(); + const uint32_t* offsets_right = rows.offsets(); + const uint8_t* rows_left = col.data(2); + const uint8_t* rows_right = rows.data(2); + for (uint32_t i = 0; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t begin_left = length * irow_left; - uint32_t begin_right = length * irow_right; + uint32_t begin_left = offsets_left[irow_left]; + uint32_t length_left = offsets_left[irow_left + 1] - begin_left; + uint32_t begin_right = offsets_right[irow_right]; + uint32_t length_right; + uint32_t offset_within_row; + if (!is_first_varbinary_col) { + rows.metadata().nth_varbinary_offset_and_length( + rows_right + begin_right, id_varbinary_col, &offset_within_row, &length_right); + } else { + rows.metadata().first_varbinary_offset_and_length( + rows_right + begin_right, &offset_within_row, &length_right); + } + begin_right += offset_within_row; + uint32_t length = std::min(length_left, length_right); const uint64_t* key_left_ptr = reinterpret_cast(rows_left + begin_left); + util::CheckAlignment(rows_right + begin_right); const uint64_t* key_right_ptr = reinterpret_cast(rows_right + begin_right); - uint64_t result_or = 0ULL; - int32_t istripe = 0; - - // Specializations for keys up to 8 bytes and between 9 and 16 bytes to - // avoid internal loop over words in the value for short ones. - // - // Template argument 0 means arbitrarily many 64-bit words, - // 1 means up to 1 and 2 means up to 2. - // - if (num_64bit_words == 0) { - for (; istripe < num_loops_less_one; ++istripe) { - uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); - uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); - result_or |= (key_left ^ key_right); + uint64_t result_or = 0; + if (length > 0) { + int32_t j; + // length can be zero + for (j = 0; j < BitUtil::CeilDiv(static_cast(length), 8) - 1; ++j) { + uint64_t key_left = util::SafeLoad(key_left_ptr + j); + uint64_t key_right = key_right_ptr[j]; + result_or |= key_left ^ key_right; } - } else if (num_64bit_words == 2) { - uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); - uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); - result_or |= (key_left ^ key_right); - ++istripe; + uint64_t tail_mask = ~0ULL >> (64 - 8 * (length - j * 8)); + uint64_t key_left = util::SafeLoad(key_left_ptr + j); + uint64_t key_right = key_right_ptr[j]; + result_or |= tail_mask & (key_left ^ key_right); } + int result = result_or == 0 ? 0xff : 0; + result *= (length_left == length_right ? 1 : 0); + match_bytevector[i] = result; + } +} - uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); - uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); - result_or |= (tail_mask & (key_left ^ key_right)); +void KeyCompare::AndByteVectors(KeyEncoder::KeyEncoderContext* ctx, uint32_t num_elements, + uint8_t* bytevector_A, const uint8_t* bytevector_B) { + uint32_t num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2()) { + num_processed = AndByteVectors_avx2(num_elements, bytevector_A, bytevector_B); + } +#endif - int result = (result_or == 0 ? 0xff : 0); - match_bytevector[id_input] &= result; + for (uint32_t i = num_processed / 8; i < BitUtil::CeilDiv(num_elements, 8); ++i) { + uint64_t* a = reinterpret_cast(bytevector_A); + const uint64_t* b = reinterpret_cast(bytevector_B); + a[i] &= b[i]; } } -void KeyCompare::CompareVaryingLength(uint32_t num_rows_to_compare, +void KeyCompare::CompareColumnsToRows(uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - uint8_t* match_bytevector, KeyEncoder::KeyEncoderContext* ctx, - const uint8_t* rows_left, const uint8_t* rows_right, - const uint32_t* offsets_left, - const uint32_t* offsets_right) { - bool use_selection = (sel_left_maybe_null != nullptr); + uint32_t* out_num_rows, + uint16_t* out_sel_left_maybe_same, + const std::vector& cols, + const KeyEncoder::KeyRowArray& rows) { + if (num_rows_to_compare == 0) { + *out_num_rows = 0; + return; + } -#if defined(ARROW_HAVE_AVX2) - if (ctx->has_avx2() && !use_selection) { - CompareVaryingLength_avx2(num_rows_to_compare, left_to_right_map, match_bytevector, - rows_left, rows_right, offsets_left, offsets_right); - } else { -#endif - if (use_selection) { - CompareVaryingLengthImp(num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, match_bytevector, rows_left, - rows_right, offsets_left, offsets_right); - } else { - CompareVaryingLengthImp(num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, match_bytevector, rows_left, - rows_right, offsets_left, offsets_right); + // Allocate temporary byte and bit vectors + auto bytevector_A_holder = + util::TempVectorHolder(ctx->stack, num_rows_to_compare); + auto bytevector_B_holder = + util::TempVectorHolder(ctx->stack, num_rows_to_compare); + auto bitvector_holder = + util::TempVectorHolder(ctx->stack, num_rows_to_compare); + + uint8_t* match_bytevector_A = bytevector_A_holder.mutable_data(); + uint8_t* match_bytevector_B = bytevector_B_holder.mutable_data(); + uint8_t* match_bitvector = bitvector_holder.mutable_data(); + + bool is_first_column = true; + for (size_t icol = 0; icol < cols.size(); ++icol) { + const KeyEncoder::KeyColumnArray& col = cols[icol]; + uint32_t offset_within_row = + rows.metadata().encoded_field_offset(static_cast(icol)); + if (col.metadata().is_fixed_length) { + if (sel_left_maybe_null) { + CompareBinaryColumnToRow( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, + is_first_column ? match_bytevector_A : match_bytevector_B); + NullUpdateColumnToRow( + static_cast(icol), num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, + is_first_column ? match_bytevector_A : match_bytevector_B); + } else { + // Version without using selection vector + CompareBinaryColumnToRow( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, + is_first_column ? match_bytevector_A : match_bytevector_B); + NullUpdateColumnToRow( + static_cast(icol), num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, + is_first_column ? match_bytevector_A : match_bytevector_B); + } + if (!is_first_column) { + AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); + } + is_first_column = false; } -#if defined(ARROW_HAVE_AVX2) } -#endif -} -template -void KeyCompare::CompareVaryingLengthImp( - uint32_t num_rows, const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, uint8_t* match_bytevector, - const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left, - const uint32_t* offsets_right) { - static const uint64_t tail_masks[] = { - 0x0000000000000000ULL, 0x00000000000000ffULL, 0x000000000000ffffULL, - 0x0000000000ffffffULL, 0x00000000ffffffffULL, 0x000000ffffffffffULL, - 0x0000ffffffffffffULL, 0x00ffffffffffffffULL, 0xffffffffffffffffULL}; - for (uint32_t i = 0; i < num_rows; ++i) { - uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t begin_left = offsets_left[irow_left]; - uint32_t begin_right = offsets_right[irow_right]; - uint32_t length_left = offsets_left[irow_left + 1] - begin_left; - uint32_t length_right = offsets_right[irow_right + 1] - begin_right; - uint32_t length = std::min(length_left, length_right); - const uint64_t* key_left_ptr = - reinterpret_cast(rows_left + begin_left); - const uint64_t* key_right_ptr = - reinterpret_cast(rows_right + begin_right); - uint64_t result_or = 0; - int32_t istripe; - // length can be zero - for (istripe = 0; istripe < (static_cast(length) + 7) / 8 - 1; ++istripe) { - uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); - uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); - result_or |= (key_left ^ key_right); + uint32_t ivarbinary = 0; + for (size_t icol = 0; icol < cols.size(); ++icol) { + const KeyEncoder::KeyColumnArray& col = cols[icol]; + if (!col.metadata().is_fixed_length) { + // Process varbinary and nulls + if (sel_left_maybe_null) { + if (ivarbinary == 0) { + CompareVarBinaryColumnToRow( + ivarbinary, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, is_first_column ? match_bytevector_A : match_bytevector_B); + } else { + CompareVarBinaryColumnToRow(ivarbinary, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector_B); + } + NullUpdateColumnToRow( + static_cast(icol), num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, + is_first_column ? match_bytevector_A : match_bytevector_B); + } else { + if (ivarbinary == 0) { + CompareVarBinaryColumnToRow( + ivarbinary, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, is_first_column ? match_bytevector_A : match_bytevector_B); + } else { + CompareVarBinaryColumnToRow( + ivarbinary, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector_B); + } + NullUpdateColumnToRow( + static_cast(icol), num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, + is_first_column ? match_bytevector_A : match_bytevector_B); + } + if (!is_first_column) { + AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); + } + is_first_column = false; + ++ivarbinary; } + } - uint32_t length_remaining = length - static_cast(istripe) * 8; - uint64_t tail_mask = tail_masks[length_remaining]; - - uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); - uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); - result_or |= (tail_mask & (key_left ^ key_right)); - - int result = (result_or == 0 ? 0xff : 0); - match_bytevector[i] &= result; + util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, + match_bytevector_A, match_bitvector); + if (sel_left_maybe_null) { + int out_num_rows_int; + util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare, + match_bitvector, sel_left_maybe_null, + &out_num_rows_int, out_sel_left_maybe_same); + *out_num_rows = out_num_rows_int; + } else { + int out_num_rows_int; + util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare, + match_bitvector, &out_num_rows_int, + out_sel_left_maybe_same); + *out_num_rows = out_num_rows_int; } } diff --git a/cpp/src/arrow/compute/exec/key_compare.h b/cpp/src/arrow/compute/exec/key_compare.h index 1dffabb884b..aeb5abbdd14 100644 --- a/cpp/src/arrow/compute/exec/key_compare.h +++ b/cpp/src/arrow/compute/exec/key_compare.h @@ -33,66 +33,102 @@ class KeyCompare { // Returns a single 16-bit selection vector of rows that failed comparison. // If there is input selection on the left, the resulting selection is a filtered image // of input selection. - static void CompareRows(uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows, - uint16_t* out_sel_left_maybe_same, - const KeyEncoder::KeyRowArray& rows_left, - const KeyEncoder::KeyRowArray& rows_right); - - private: - static void CompareFixedLength(uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - uint8_t* match_bytevector, - KeyEncoder::KeyEncoderContext* ctx, - uint32_t fixed_length, const uint8_t* rows_left, - const uint8_t* rows_right); - static void CompareVaryingLength(uint32_t num_rows_to_compare, + static void CompareColumnsToRows(uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - uint8_t* match_bytevector, KeyEncoder::KeyEncoderContext* ctx, - const uint8_t* rows_left, const uint8_t* rows_right, - const uint32_t* offsets_left, - const uint32_t* offsets_right); - - // Second template argument is 0, 1 or 2. - // 0 means arbitrarily many 64-bit words, 1 means up to 1 and 2 means up to 2. - template - static void CompareFixedLengthImp(uint32_t num_rows_already_processed, - uint32_t num_rows, + uint32_t* out_num_rows, + uint16_t* out_sel_left_maybe_same, + const std::vector& cols, + const KeyEncoder::KeyRowArray& rows); + + private: + template + static void NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - uint8_t* match_bytevector, uint32_t length, - const uint8_t* rows_left, const uint8_t* rows_right); + KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector); + + template + static void CompareBinaryColumnToRowHelper( + uint32_t offset_within_row, uint32_t first_row_to_compare, + uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector, COMPARE_FN compare_fn); + template - static void CompareVaryingLengthImp(uint32_t num_rows, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - uint8_t* match_bytevector, const uint8_t* rows_left, - const uint8_t* rows_right, - const uint32_t* offsets_left, - const uint32_t* offsets_right); + static void CompareBinaryColumnToRow( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector); + + template + static void CompareVarBinaryColumnToRow( + uint32_t id_varlen_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector); + + static void AndByteVectors(KeyEncoder::KeyEncoderContext* ctx, uint32_t num_elements, + uint8_t* bytevector_A, const uint8_t* bytevector_B); #if defined(ARROW_HAVE_AVX2) - static uint32_t CompareFixedLength_UpTo8B_avx2( - uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, - uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right); - static uint32_t CompareFixedLength_UpTo16B_avx2( - uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, - uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right); - static uint32_t CompareFixedLength_avx2(uint32_t num_rows, - const uint32_t* left_to_right_map, - uint8_t* match_bytevector, uint32_t length, - const uint8_t* rows_left, - const uint8_t* rows_right); - static void CompareVaryingLength_avx2( - uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, - const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left, - const uint32_t* offsets_right); + template + static uint32_t NullUpdateColumnToRowImp_avx2( + uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector); + + template + static uint32_t CompareBinaryColumnToRowHelper_avx2( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector, + COMPARE8_FN compare8_fn); + + template + static uint32_t CompareBinaryColumnToRowImp_avx2( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector); + + template + static void CompareVarBinaryColumnToRowImp_avx2( + uint32_t id_varlen_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector); + + static uint32_t AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A, + const uint8_t* bytevector_B); + + static uint32_t NullUpdateColumnToRow_avx2( + bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector); + + static uint32_t CompareBinaryColumnToRow_avx2( + bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector); + + static void CompareVarBinaryColumnToRow_avx2( + bool use_selection, bool is_first_varbinary_col, uint32_t id_varlen_col, + uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector); #endif }; diff --git a/cpp/src/arrow/compute/exec/key_compare_avx2.cc b/cpp/src/arrow/compute/exec/key_compare_avx2.cc index 6abdf6c3c3a..fba6d2c6ac9 100644 --- a/cpp/src/arrow/compute/exec/key_compare_avx2.cc +++ b/cpp/src/arrow/compute/exec/key_compare_avx2.cc @@ -25,160 +25,550 @@ namespace compute { #if defined(ARROW_HAVE_AVX2) -uint32_t KeyCompare::CompareFixedLength_UpTo8B_avx2( - uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, - uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right) { - ARROW_DCHECK(length <= 8); - __m256i offset_left = _mm256_setr_epi64x(0, length, length * 2, length * 3); - __m256i offset_left_incr = _mm256_set1_epi64x(length * 4); - __m256i mask = _mm256_set1_epi64x(~0ULL >> (8 * (8 - length))); - - constexpr uint32_t unroll = 4; - for (uint32_t i = 0; i < num_rows / unroll; ++i) { - auto key_left = _mm256_i64gather_epi64( - reinterpret_cast(rows_left), offset_left, 1); - offset_left = _mm256_add_epi64(offset_left, offset_left_incr); - __m128i offset_right = - _mm_loadu_si128(reinterpret_cast(left_to_right_map) + i); - offset_right = _mm_mullo_epi32(offset_right, _mm_set1_epi32(length)); - - auto key_right = _mm256_i32gather_epi64( - reinterpret_cast(rows_right), offset_right, 1); - uint32_t cmp = _mm256_movemask_epi8(_mm256_cmpeq_epi64( - _mm256_and_si256(key_left, mask), _mm256_and_si256(key_right, mask))); - reinterpret_cast(match_bytevector)[i] &= cmp; - } +inline __m256i set_first_n_bytes_avx2(int n) { + constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; + constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; + constexpr uint64_t kByteSequence16To23 = 0x1716151413121110ULL; + constexpr uint64_t kByteSequence24To31 = 0x1f1e1d1c1b1a1918ULL; - uint32_t num_rows_processed = num_rows - (num_rows % unroll); - return num_rows_processed; + return _mm256_cmpgt_epi8(_mm256_set1_epi8(n), + _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, + kByteSequence16To23, kByteSequence24To31)); } -uint32_t KeyCompare::CompareFixedLength_UpTo16B_avx2( - uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, - uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right) { - ARROW_DCHECK(length <= 16); +template +uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( + uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector) { + if (!rows.has_any_nulls(ctx) && !col.data(0)) { + return num_rows_to_compare; + } + if (!col.data(0)) { + // Remove rows from the result for which the column value is a null + const uint8_t* null_masks = rows.null_masks(); + uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; - constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; - constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; + uint32_t num_processed = 0; + constexpr uint32_t unroll = 8; + for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { + __m256i irow_right; + if (use_selection) { + __m256i irow_left = _mm256_cvtepu16_epi32( + _mm_loadu_si128(reinterpret_cast(sel_left_maybe_null) + i)); + irow_right = _mm256_i32gather_epi32((const int*)left_to_right_map, irow_left, 4); + } else { + irow_right = + _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); + } + __m256i bitid = + _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(null_mask_num_bytes * 8)); + bitid = _mm256_add_epi32(bitid, _mm256_set1_epi32(id_col)); + __m256i right = + _mm256_i32gather_epi32((const int*)null_masks, _mm256_srli_epi32(bitid, 3), 1); + right = _mm256_and_si256( + _mm256_set1_epi32(1), + _mm256_srlv_epi32(right, _mm256_and_si256(bitid, _mm256_set1_epi32(7)))); + __m256i cmp = _mm256_cmpeq_epi32(right, _mm256_setzero_si256()); + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); + uint32_t result_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); + reinterpret_cast(match_bytevector)[i] &= + result_lo | (static_cast(result_hi) << 32); + } + num_processed = num_rows_to_compare / unroll * unroll; + return num_processed; + } else if (!rows.has_any_nulls(ctx)) { + // Remove rows from the result for which the column value on left side is null + const uint8_t* non_nulls = col.data(0); + ARROW_DCHECK(non_nulls); + uint32_t num_processed = 0; + constexpr uint32_t unroll = 8; + for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { + __m256i cmp; + if (use_selection) { + __m256i irow_left = _mm256_cvtepu16_epi32( + _mm_loadu_si128(reinterpret_cast(sel_left_maybe_null) + i)); + irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(col.bit_offset(0))); + __m256i left = _mm256_i32gather_epi32((const int*)non_nulls, + _mm256_srli_epi32(irow_left, 3), 1); + left = _mm256_and_si256( + _mm256_set1_epi32(1), + _mm256_srlv_epi32(left, _mm256_and_si256(irow_left, _mm256_set1_epi32(7)))); + cmp = _mm256_cmpeq_epi32(left, _mm256_set1_epi32(1)); + } else { + __m256i left = _mm256_cvtepu8_epi32(_mm_set1_epi8(static_cast( + reinterpret_cast(non_nulls + i)[0] >> col.bit_offset(0)))); + __m256i bits = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + cmp = _mm256_cmpeq_epi32(_mm256_and_si256(left, bits), bits); + } + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); + uint32_t result_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); + reinterpret_cast(match_bytevector)[i] &= + result_lo | (static_cast(result_hi) << 32); + num_processed = num_rows_to_compare / unroll * unroll; + } + return num_processed; + } else { + const uint8_t* null_masks = rows.null_masks(); + uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; + const uint8_t* non_nulls = col.data(0); + ARROW_DCHECK(non_nulls); - __m256i mask = - _mm256_cmpgt_epi8(_mm256_set1_epi8(length), - _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, - kByteSequence0To7, kByteSequence8To15)); - const uint8_t* key_left_ptr = rows_left; - - constexpr uint32_t unroll = 2; - for (uint32_t i = 0; i < num_rows / unroll; ++i) { - auto key_left = _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128(reinterpret_cast(key_left_ptr))), - _mm_loadu_si128(reinterpret_cast(key_left_ptr + length)), 1); - key_left_ptr += length * 2; - auto key_right = _mm256_inserti128_si256( - _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast( - rows_right + length * left_to_right_map[2 * i]))), - _mm_loadu_si128(reinterpret_cast( - rows_right + length * left_to_right_map[2 * i + 1])), - 1); - __m256i cmp = _mm256_cmpeq_epi64(_mm256_and_si256(key_left, mask), - _mm256_and_si256(key_right, mask)); - cmp = _mm256_and_si256(cmp, _mm256_shuffle_epi32(cmp, 0xee)); // 0b11101110 - cmp = _mm256_permute4x64_epi64(cmp, 0x08); // 0b00001000 - reinterpret_cast(match_bytevector)[i] &= - (_mm256_movemask_epi8(cmp) & 0xffff); - } + uint32_t num_processed = 0; + constexpr uint32_t unroll = 8; + for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { + __m256i left_null; + __m256i irow_right; + if (use_selection) { + __m256i irow_left = _mm256_cvtepu16_epi32( + _mm_loadu_si128(reinterpret_cast(sel_left_maybe_null) + i)); + irow_right = _mm256_i32gather_epi32((const int*)left_to_right_map, irow_left, 4); + irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(col.bit_offset(0))); + __m256i left = _mm256_i32gather_epi32((const int*)non_nulls, + _mm256_srli_epi32(irow_left, 3), 1); + left = _mm256_and_si256( + _mm256_set1_epi32(1), + _mm256_srlv_epi32(left, _mm256_and_si256(irow_left, _mm256_set1_epi32(7)))); + left_null = _mm256_cmpeq_epi32(left, _mm256_setzero_si256()); + } else { + irow_right = + _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); + __m256i left = _mm256_cvtepu8_epi32(_mm_set1_epi8(static_cast( + reinterpret_cast(non_nulls + i)[0] >> col.bit_offset(0)))); + __m256i bits = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + left_null = + _mm256_cmpeq_epi32(_mm256_and_si256(left, bits), _mm256_setzero_si256()); + } + __m256i bitid = + _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(null_mask_num_bytes * 8)); + bitid = _mm256_add_epi32(bitid, _mm256_set1_epi32(id_col)); + __m256i right = + _mm256_i32gather_epi32((const int*)null_masks, _mm256_srli_epi32(bitid, 3), 1); + right = _mm256_and_si256( + _mm256_set1_epi32(1), + _mm256_srlv_epi32(right, _mm256_and_si256(bitid, _mm256_set1_epi32(7)))); + __m256i right_null = _mm256_cmpeq_epi32(right, _mm256_set1_epi32(1)); - uint32_t num_rows_processed = num_rows - (num_rows % unroll); - return num_rows_processed; -} + uint64_t left_null_64 = + static_cast(_mm256_movemask_epi8( + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(left_null)))) | + (static_cast(static_cast(_mm256_movemask_epi8( + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(left_null, 1))))) + << 32); -uint32_t KeyCompare::CompareFixedLength_avx2(uint32_t num_rows, - const uint32_t* left_to_right_map, - uint8_t* match_bytevector, uint32_t length, - const uint8_t* rows_left, - const uint8_t* rows_right) { - ARROW_DCHECK(length > 0); + uint64_t right_null_64 = + static_cast(_mm256_movemask_epi8( + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(right_null)))) | + (static_cast(static_cast(_mm256_movemask_epi8( + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(right_null, 1))))) + << 32); - constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; - constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; - constexpr uint64_t kByteSequence16To23 = 0x1716151413121110ULL; - constexpr uint64_t kByteSequence24To31 = 0x1f1e1d1c1b1a1918ULL; + reinterpret_cast(match_bytevector)[i] |= left_null_64 & right_null_64; + reinterpret_cast(match_bytevector)[i] &= ~(left_null_64 ^ right_null_64); + } + num_processed = num_rows_to_compare / unroll * unroll; + return num_processed; + } +} - // Non-zero length guarantees no underflow - int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; +template +uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector, + COMPARE8_FN compare8_fn) { + bool is_fixed_length = rows.metadata().is_fixed_length; + if (is_fixed_length) { + uint32_t fixed_length = rows.metadata().fixed_length; + const uint8_t* rows_left = col.data(1); + const uint8_t* rows_right = rows.data(1); + constexpr uint32_t unroll = 8; + __m256i irow_left = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { + if (use_selection) { + irow_left = _mm256_cvtepu16_epi32( + _mm_loadu_si128(reinterpret_cast(sel_left_maybe_null) + i)); + } + __m256i irow_right; + if (use_selection) { + irow_right = _mm256_i32gather_epi32((const int*)left_to_right_map, irow_left, 4); + } else { + irow_right = + _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); + } - __m256i tail_mask = - _mm256_cmpgt_epi8(_mm256_set1_epi8(length - num_loops_less_one * 32), - _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, - kByteSequence16To23, kByteSequence24To31)); + __m256i offset_right = + _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(fixed_length)); + offset_right = _mm256_add_epi32(offset_right, _mm256_set1_epi32(offset_within_row)); - for (uint32_t irow_left = 0; irow_left < num_rows; ++irow_left) { - uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t begin_left = length * irow_left; - uint32_t begin_right = length * irow_right; - const __m256i* key_left_ptr = - reinterpret_cast(rows_left + begin_left); - const __m256i* key_right_ptr = - reinterpret_cast(rows_right + begin_right); - __m256i result_or = _mm256_setzero_si256(); - int32_t i; - // length cannot be zero - for (i = 0; i < num_loops_less_one; ++i) { - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + reinterpret_cast(match_bytevector)[i] = + compare8_fn(rows_left, rows_right, i * unroll, irow_left, offset_right); + + if (!use_selection) { + irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(8)); + } } + return num_rows_to_compare - (num_rows_to_compare % unroll); + } else { + const uint8_t* rows_left = col.data(1); + const uint32_t* offsets_right = rows.offsets(); + const uint8_t* rows_right = rows.data(2); + constexpr uint32_t unroll = 8; + __m256i irow_left = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { + if (use_selection) { + irow_left = _mm256_cvtepu16_epi32( + _mm_loadu_si128(reinterpret_cast(sel_left_maybe_null) + i)); + } + __m256i irow_right; + if (use_selection) { + irow_right = _mm256_i32gather_epi32((const int*)left_to_right_map, irow_left, 4); + } else { + irow_right = + _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); + } + __m256i offset_right = + _mm256_i32gather_epi32((const int*)offsets_right, irow_right, 4); + offset_right = _mm256_add_epi32(offset_right, _mm256_set1_epi32(offset_within_row)); - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256( - result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); - int result = _mm256_testz_si256(result_or, result_or) * 0xff; - match_bytevector[irow_left] &= result; + reinterpret_cast(match_bytevector)[i] = + compare8_fn(rows_left, rows_right, i * unroll, irow_left, offset_right); + + if (!use_selection) { + irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(8)); + } + } + return num_rows_to_compare - (num_rows_to_compare % unroll); } +} - uint32_t num_rows_processed = num_rows; - return num_rows_processed; +template +uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + uint32_t col_width = col.metadata().fixed_length; + if (col_width == 0) { + int bit_offset = col.bit_offset(1); + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [bit_offset](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + __m256i left; + if (use_selection) { + irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(bit_offset)); + left = _mm256_i32gather_epi32((const int*)left_base, + _mm256_srli_epi32(irow_left, 3), 1); + left = _mm256_and_si256( + _mm256_set1_epi32(1), + _mm256_srlv_epi32(left, + _mm256_and_si256(irow_left, _mm256_set1_epi32(7)))); + left = _mm256_mullo_epi32(left, _mm256_set1_epi32(0xff)); + } else { + __m256i bits = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + uint32_t start_bit_index = irow_left_base + bit_offset; + uint8_t left_bits_8 = + (reinterpret_cast(left_base + start_bit_index / 8)[0] >> + (start_bit_index % 8)) & + 0xff; + left = _mm256_cmpeq_epi32( + _mm256_and_si256(bits, _mm256_set1_epi8(left_bits_8)), bits); + left = _mm256_and_si256(left, _mm256_set1_epi32(0xff)); + } + __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); + right = _mm256_and_si256(right, _mm256_set1_epi32(0xff)); + __m256i cmp = _mm256_cmpeq_epi32(left, right); + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); + uint32_t result_hi = _mm256_movemask_epi8( + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); + return result_lo | (static_cast(result_hi) << 32); + }); + } else if (col_width == 1) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, + __m256i irow_left, __m256i offset_right) { + __m256i left; + if (use_selection) { + left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 1); + left = _mm256_and_si256(left, _mm256_set1_epi32(0xff)); + } else { + left = _mm256_cvtepu8_epi32(_mm_set1_epi64x( + reinterpret_cast(left_base)[irow_left_base / 8])); + } + __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); + right = _mm256_and_si256(right, _mm256_set1_epi32(0xff)); + __m256i cmp = _mm256_cmpeq_epi32(left, right); + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); + uint32_t result_hi = _mm256_movemask_epi8( + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); + return result_lo | (static_cast(result_hi) << 32); + }); + } else if (col_width == 2) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, + __m256i irow_left, __m256i offset_right) { + __m256i left; + if (use_selection) { + left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 2); + left = _mm256_and_si256(left, _mm256_set1_epi32(0xffff)); + } else { + left = _mm256_cvtepu16_epi32(_mm_loadu_si128( + reinterpret_cast(left_base) + irow_left_base / 8)); + } + __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); + right = _mm256_and_si256(right, _mm256_set1_epi32(0xffff)); + __m256i cmp = _mm256_cmpeq_epi32(left, right); + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); + uint32_t result_hi = _mm256_movemask_epi8( + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); + return result_lo | (static_cast(result_hi) << 32); + }); + } else if (col_width == 4) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, + __m256i irow_left, __m256i offset_right) { + __m256i left; + if (use_selection) { + left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 4); + } else { + left = _mm256_loadu_si256(reinterpret_cast(left_base) + + irow_left_base / 8); + } + __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); + __m256i cmp = _mm256_cmpeq_epi32(left, right); + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); + uint32_t result_hi = _mm256_movemask_epi8( + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); + return result_lo | (static_cast(result_hi) << 32); + }); + } else if (col_width == 8) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, + __m256i irow_left, __m256i offset_right) { + auto left_base_i64 = + reinterpret_cast(left_base); + __m256i left_lo = + _mm256_i32gather_epi64(left_base_i64, _mm256_castsi256_si128(irow_left), 8); + __m256i left_hi = _mm256_i32gather_epi64( + left_base_i64, _mm256_extracti128_si256(irow_left, 1), 8); + if (use_selection) { + left_lo = _mm256_i32gather_epi64(left_base_i64, + _mm256_castsi256_si128(irow_left), 8); + left_hi = _mm256_i32gather_epi64(left_base_i64, + _mm256_extracti128_si256(irow_left, 1), 8); + } else { + left_lo = _mm256_loadu_si256(reinterpret_cast(left_base) + + irow_left_base / 4); + left_hi = _mm256_loadu_si256(reinterpret_cast(left_base) + + irow_left_base / 4 + 1); + } + auto right_base_i64 = + reinterpret_cast(right_base); + __m256i right_lo = _mm256_i32gather_epi64( + right_base_i64, _mm256_castsi256_si128(offset_right), 1); + __m256i right_hi = _mm256_i32gather_epi64( + right_base_i64, _mm256_extracti128_si256(offset_right, 1), 1); + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_lo, right_lo)); + uint32_t result_hi = + _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_hi, right_hi)); + return result_lo | (static_cast(result_hi) << 32); + }); + } else { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [&col](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + uint32_t irow_left_array[8]; + uint32_t offset_right_array[8]; + if (use_selection) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(irow_left_array), irow_left); + } + _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), + offset_right); + uint32_t length = col.metadata().fixed_length; + + // Non-zero length guarantees no underflow + int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; + + __m256i tail_mask = set_first_n_bytes_avx2(length - num_loops_less_one * 32); + + uint64_t result = 0; + for (uint32_t irow = 0; irow < 8; ++irow) { + const __m256i* key_left_ptr = reinterpret_cast( + left_base + + (use_selection ? irow_left_array[irow] : irow_left_base + irow) * length); + const __m256i* key_right_ptr = + reinterpret_cast(right_base + offset_right_array[irow]); + __m256i result_or = _mm256_setzero_si256(); + int32_t i; + // length cannot be zero + for (i = 0; i < num_loops_less_one; ++i) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = + _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + } + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256( + result_or, + _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + uint64_t result_single = _mm256_testz_si256(result_or, result_or) * 0xff; + result |= result_single << (8 * irow); + } + return result; + }); + } } -void KeyCompare::CompareVaryingLength_avx2( - uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, - const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left, - const uint32_t* offsets_right) { - for (uint32_t irow_left = 0; irow_left < num_rows; ++irow_left) { +// Overwrites the match_bytevector instead of updating it +template +void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( + uint32_t id_varbinary_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + const uint32_t* offsets_left = col.offsets(); + const uint32_t* offsets_right = rows.offsets(); + const uint8_t* rows_left = col.data(2); + const uint8_t* rows_right = rows.data(2); + for (uint32_t i = 0; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; uint32_t begin_left = offsets_left[irow_left]; - uint32_t begin_right = offsets_right[irow_right]; uint32_t length_left = offsets_left[irow_left + 1] - begin_left; - uint32_t length_right = offsets_right[irow_right + 1] - begin_right; - uint32_t length = std::min(length_left, length_right); - auto key_left_ptr = reinterpret_cast(rows_left + begin_left); - auto key_right_ptr = reinterpret_cast(rows_right + begin_right); - __m256i result_or = _mm256_setzero_si256(); - int32_t i; - // length can be zero - for (i = 0; i < (static_cast(length) + 31) / 32 - 1; ++i) { - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + uint32_t begin_right = offsets_right[irow_right]; + uint32_t length_right; + uint32_t offset_within_row; + if (!is_first_varbinary_col) { + rows.metadata().nth_varbinary_offset_and_length( + rows_right + begin_right, id_varbinary_col, &offset_within_row, &length_right); + } else { + rows.metadata().first_varbinary_offset_and_length( + rows_right + begin_right, &offset_within_row, &length_right); } + begin_right += offset_within_row; - constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; - constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; - constexpr uint64_t kByteSequence16To23 = 0x1716151413121110ULL; - constexpr uint64_t kByteSequence24To31 = 0x1f1e1d1c1b1a1918ULL; + __m256i result_or = _mm256_setzero_si256(); + uint32_t length = std::min(length_left, length_right); + if (length > 0) { + const __m256i* key_left_ptr = + reinterpret_cast(rows_left + begin_left); + const __m256i* key_right_ptr = + reinterpret_cast(rows_right + begin_right); + int32_t j; + // length can be zero + for (j = 0; j < (static_cast(length) + 31) / 32 - 1; ++j) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); + result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + } - __m256i tail_mask = - _mm256_cmpgt_epi8(_mm256_set1_epi8(length - i * 32), - _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, - kByteSequence16To23, kByteSequence24To31)); + __m256i tail_mask = set_first_n_bytes_avx2(length - j * 32); - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256( - result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); + result_or = _mm256_or_si256( + result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + } int result = _mm256_testz_si256(result_or, result_or) * 0xff; - match_bytevector[irow_left] &= result; + result *= (length_left == length_right ? 1 : 0); + match_bytevector[i] = result; + } +} + +uint32_t KeyCompare::AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A, + const uint8_t* bytevector_B) { + constexpr int unroll = 32; + for (uint32_t i = 0; i < num_elements / unroll; ++i) { + __m256i result = _mm256_and_si256( + _mm256_loadu_si256(reinterpret_cast(bytevector_A) + i), + _mm256_loadu_si256(reinterpret_cast(bytevector_B) + i)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(bytevector_A) + i, result); + } + return (num_elements - (num_elements % unroll)); +} + +uint32_t KeyCompare::NullUpdateColumnToRow_avx2( + bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + if (use_selection) { + return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); + } else { + return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); + } +} + +uint32_t KeyCompare::CompareBinaryColumnToRow_avx2( + bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + if (use_selection) { + return CompareBinaryColumnToRowImp_avx2(offset_within_row, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); + } else { + return CompareBinaryColumnToRowImp_avx2(offset_within_row, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); + } +} + +void KeyCompare::CompareVarBinaryColumnToRow_avx2( + bool use_selection, bool is_first_varbinary_col, uint32_t id_varlen_col, + uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector) { + if (use_selection) { + if (is_first_varbinary_col) { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } else { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } + } else { + if (is_first_varbinary_col) { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } else { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } } } diff --git a/cpp/src/arrow/compute/exec/key_encode.cc b/cpp/src/arrow/compute/exec/key_encode.cc index de79558f2c2..1a563867e90 100644 --- a/cpp/src/arrow/compute/exec/key_encode.cc +++ b/cpp/src/arrow/compute/exec/key_encode.cc @@ -189,7 +189,7 @@ Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from, uint32_t total_length = to_offsets[num_rows_]; uint32_t total_length_to_append = 0; for (uint32_t i = 0; i < num_rows_to_append; ++i) { - uint16_t row_id = source_row_ids[i]; + uint16_t row_id = source_row_ids ? source_row_ids[i] : i; uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; total_length_to_append += length; to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append; @@ -200,11 +200,11 @@ Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from, const uint8_t* src = from.rows_->data(); uint8_t* dst = rows_->mutable_data() + total_length; for (uint32_t i = 0; i < num_rows_to_append; ++i) { - uint16_t row_id = source_row_ids[i]; + uint16_t row_id = source_row_ids ? source_row_ids[i] : i; uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; auto src64 = reinterpret_cast(src + from_offsets[row_id]); auto dst64 = reinterpret_cast(dst); - for (uint32_t j = 0; j < (length + 7) / 8; ++j) { + for (uint32_t j = 0; j < BitUtil::CeilDiv(length, 8); ++j) { dst64[j] = src64[j]; } dst += length; @@ -214,11 +214,11 @@ Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from, const uint8_t* src = from.rows_->data(); uint8_t* dst = rows_->mutable_data() + num_rows_ * metadata_.fixed_length; for (uint32_t i = 0; i < num_rows_to_append; ++i) { - uint16_t row_id = source_row_ids[i]; + uint16_t row_id = source_row_ids ? source_row_ids[i] : i; uint32_t length = metadata_.fixed_length; auto src64 = reinterpret_cast(src + length * row_id); auto dst64 = reinterpret_cast(dst); - for (uint32_t j = 0; j < (length + 7) / 8; ++j) { + for (uint32_t j = 0; j < BitUtil::CeilDiv(length, 8); ++j) { dst64[j] = src64[j]; } dst += length; @@ -231,7 +231,7 @@ Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from, const uint8_t* src_base = from.null_masks_->data(); uint8_t* dst_base = null_masks_->mutable_data(); for (uint32_t i = 0; i < num_rows_to_append; ++i) { - uint32_t row_id = source_row_ids[i]; + uint32_t row_id = source_row_ids ? source_row_ids[i] : i; int64_t src_byte_offset = row_id * byte_length; const uint8_t* src = src_base + src_byte_offset; uint8_t* dst = dst_base + dst_byte_offset; @@ -363,21 +363,6 @@ KeyEncoder::KeyColumnArray KeyEncoder::TransformBoolean::ArrayReplace( return result; } -void KeyEncoder::TransformBoolean::PreEncode(const KeyColumnArray& input, - KeyColumnArray* output, - KeyEncoderContext* ctx) { - // Make sure that metadata and lengths are compatible. - DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length); - DCHECK(output->metadata().fixed_length == 1 && input.metadata().fixed_length == 0); - DCHECK(output->length() == input.length()); - constexpr int buffer_index = 1; - DCHECK(input.data(buffer_index) != nullptr); - DCHECK(output->mutable_data(buffer_index) != nullptr); - util::BitUtil::bits_to_bytes( - ctx->hardware_flags, static_cast(input.length()), input.data(buffer_index), - output->mutable_data(buffer_index), input.bit_offset(buffer_index)); -} - void KeyEncoder::TransformBoolean::PostDecode(const KeyColumnArray& input, KeyColumnArray* output, KeyEncoderContext* ctx) { @@ -410,14 +395,6 @@ KeyEncoder::KeyColumnArray KeyEncoder::EncoderInteger::ArrayReplace( return column; } -void KeyEncoder::EncoderInteger::PreEncode(const KeyColumnArray& input, - KeyColumnArray* output, - KeyEncoderContext* ctx) { - if (IsBoolean(input.metadata())) { - TransformBoolean::PreEncode(input, output, ctx); - } -} - void KeyEncoder::EncoderInteger::PostDecode(const KeyColumnArray& input, KeyColumnArray* output, KeyEncoderContext* ctx) { @@ -426,90 +403,6 @@ void KeyEncoder::EncoderInteger::PostDecode(const KeyColumnArray& input, } } -void KeyEncoder::EncoderInteger::Encode(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col, KeyEncoderContext* ctx, - KeyColumnArray* temp) { - KeyColumnArray col_prep; - if (UsesTransform(col)) { - col_prep = ArrayReplace(col, *temp); - PreEncode(col, &col_prep, ctx); - } else { - col_prep = col; - } - - const auto num_rows = static_cast(col.length()); - - // When we have a single fixed length column we can just do memcpy - if (rows->metadata().is_fixed_length && - rows->metadata().fixed_length == col.metadata().fixed_length) { - DCHECK_EQ(offset_within_row, 0); - uint32_t row_size = col.metadata().fixed_length; - memcpy(rows->mutable_data(1), col.data(1), num_rows * row_size); - } else if (rows->metadata().is_fixed_length) { - uint32_t row_size = rows->metadata().fixed_length; - uint8_t* row_base = rows->mutable_data(1) + offset_within_row; - const uint8_t* col_base = col_prep.data(1); - switch (col_prep.metadata().fixed_length) { - case 1: - for (uint32_t i = 0; i < num_rows; ++i) { - row_base[i * row_size] = col_base[i]; - } - break; - case 2: - for (uint32_t i = 0; i < num_rows; ++i) { - *reinterpret_cast(row_base + i * row_size) = - reinterpret_cast(col_base)[i]; - } - break; - case 4: - for (uint32_t i = 0; i < num_rows; ++i) { - *reinterpret_cast(row_base + i * row_size) = - reinterpret_cast(col_base)[i]; - } - break; - case 8: - for (uint32_t i = 0; i < num_rows; ++i) { - *reinterpret_cast(row_base + i * row_size) = - reinterpret_cast(col_base)[i]; - } - break; - default: - DCHECK(false); - } - } else { - const uint32_t* row_offsets = rows->offsets(); - uint8_t* row_base = rows->mutable_data(2) + offset_within_row; - const uint8_t* col_base = col_prep.data(1); - switch (col_prep.metadata().fixed_length) { - case 1: - for (uint32_t i = 0; i < num_rows; ++i) { - row_base[row_offsets[i]] = col_base[i]; - } - break; - case 2: - for (uint32_t i = 0; i < num_rows; ++i) { - *reinterpret_cast(row_base + row_offsets[i]) = - reinterpret_cast(col_base)[i]; - } - break; - case 4: - for (uint32_t i = 0; i < num_rows; ++i) { - *reinterpret_cast(row_base + row_offsets[i]) = - reinterpret_cast(col_base)[i]; - } - break; - case 8: - for (uint32_t i = 0; i < num_rows; ++i) { - *reinterpret_cast(row_base + row_offsets[i]) = - reinterpret_cast(col_base)[i]; - } - break; - default: - DCHECK(false); - } - } -} - void KeyEncoder::EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col, @@ -606,46 +499,6 @@ bool KeyEncoder::EncoderBinary::IsInteger(const KeyColumnMetadata& metadata) { (size == 0 || size == 1 || size == 2 || size == 4 || size == 8); } -void KeyEncoder::EncoderBinary::Encode(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col, KeyEncoderContext* ctx, - KeyColumnArray* temp) { - if (IsInteger(col.metadata())) { - EncoderInteger::Encode(offset_within_row, rows, col, ctx, temp); - } else { - KeyColumnArray col_prep; - if (EncoderInteger::UsesTransform(col)) { - col_prep = EncoderInteger::ArrayReplace(col, *temp); - EncoderInteger::PreEncode(col, &col_prep, ctx); - } else { - col_prep = col; - } - - bool is_row_fixed_length = rows->metadata().is_fixed_length; - -#if defined(ARROW_HAVE_AVX2) - if (ctx->has_avx2()) { - EncodeHelper_avx2(is_row_fixed_length, offset_within_row, rows, col); - } else { -#endif - if (is_row_fixed_length) { - EncodeImp(offset_within_row, rows, col); - } else { - EncodeImp(offset_within_row, rows, col); - } -#if defined(ARROW_HAVE_AVX2) - } -#endif - } - - DCHECK(temp->metadata().is_fixed_length); - DCHECK(temp->length() * temp->metadata().fixed_length >= - col.length() * static_cast(sizeof(uint16_t))); - - KeyColumnArray temp16bit(KeyColumnMetadata(true, sizeof(uint16_t)), col.length(), - nullptr, temp->mutable_data(1), nullptr); - ColumnMemsetNulls(offset_within_row, rows, col, ctx, &temp16bit, 0xae); -} - void KeyEncoder::EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col, @@ -683,34 +536,14 @@ void KeyEncoder::EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, } } -template -void KeyEncoder::EncoderBinary::EncodeImp(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col) { - EncodeDecodeHelper( - 0, static_cast(col.length()), offset_within_row, rows, rows, &col, - nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) { - auto dst64 = reinterpret_cast(dst); - auto src64 = reinterpret_cast(src); - uint32_t istripe; - for (istripe = 0; istripe < length / 8; ++istripe) { - dst64[istripe] = util::SafeLoad(src64 + istripe); - } - if ((length % 8) > 0) { - uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length)); - dst64[istripe] = (dst64[istripe] & ~mask_last) | - (util::SafeLoad(src64 + istripe) & mask_last); - } - }); -} - template void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col) { - EncodeDecodeHelper( + DecodeHelper( start_row, num_rows, offset_within_row, &rows, nullptr, col, col, [](uint8_t* dst, const uint8_t* src, int64_t length) { - for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) { + for (uint32_t istripe = 0; istripe < BitUtil::CeilDiv(length, 8); ++istripe) { auto dst64 = reinterpret_cast(dst); auto src64 = reinterpret_cast(src); util::SafeStore(dst64 + istripe, src64[istripe]); @@ -718,197 +551,6 @@ void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows, }); } -void KeyEncoder::EncoderBinary::ColumnMemsetNulls( - uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col, - KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) { - using ColumnMemsetNullsImp_t = void (*)(uint32_t, KeyRowArray*, const KeyColumnArray&, - KeyEncoderContext*, KeyColumnArray*, uint8_t); - static const ColumnMemsetNullsImp_t ColumnMemsetNullsImp_fn[] = { - ColumnMemsetNullsImp, ColumnMemsetNullsImp, - ColumnMemsetNullsImp, ColumnMemsetNullsImp, - ColumnMemsetNullsImp, ColumnMemsetNullsImp, - ColumnMemsetNullsImp, ColumnMemsetNullsImp, - ColumnMemsetNullsImp, ColumnMemsetNullsImp}; - uint32_t col_width = col.metadata().fixed_length; - int dispatch_const = - (rows->metadata().is_fixed_length ? 5 : 0) + - (col_width == 1 ? 0 - : col_width == 2 ? 1 : col_width == 4 ? 2 : col_width == 8 ? 3 : 4); - ColumnMemsetNullsImp_fn[dispatch_const](offset_within_row, rows, col, ctx, - temp_vector_16bit, byte_value); -} - -template -void KeyEncoder::EncoderBinary::ColumnMemsetNullsImp( - uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col, - KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) { - // Nothing to do when there are no nulls - if (!col.data(0)) { - return; - } - - const auto num_rows = static_cast(col.length()); - - // Temp vector needs space for the required number of rows - DCHECK(temp_vector_16bit->length() >= num_rows); - DCHECK(temp_vector_16bit->metadata().is_fixed_length && - temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t)); - auto temp_vector = reinterpret_cast(temp_vector_16bit->mutable_data(1)); - - // Bit vector to index vector of null positions - int num_selected; - util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, static_cast(col.length()), - col.data(0), &num_selected, temp_vector, - col.bit_offset(0)); - - for (int i = 0; i < num_selected; ++i) { - uint32_t row_id = temp_vector[i]; - - // Target binary field pointer - uint8_t* dst; - if (is_row_fixed_length) { - dst = rows->mutable_data(1) + rows->metadata().fixed_length * row_id; - } else { - dst = rows->mutable_data(2) + rows->offsets()[row_id]; - } - dst += offset_within_row; - - if (col_width == 1) { - *dst = byte_value; - } else if (col_width == 2) { - *reinterpret_cast(dst) = - (static_cast(byte_value) * static_cast(0x0101)); - } else if (col_width == 4) { - *reinterpret_cast(dst) = - (static_cast(byte_value) * static_cast(0x01010101)); - } else if (col_width == 8) { - *reinterpret_cast(dst) = - (static_cast(byte_value) * 0x0101010101010101ULL); - } else { - uint64_t value = (static_cast(byte_value) * 0x0101010101010101ULL); - uint32_t col_width_actual = col.metadata().fixed_length; - uint32_t j; - for (j = 0; j < col_width_actual / 8; ++j) { - reinterpret_cast(dst)[j] = value; - } - int tail = col_width_actual % 8; - if (tail) { - uint64_t mask = ~0ULL >> (8 * (8 - tail)); - reinterpret_cast(dst)[j] = - (reinterpret_cast(dst)[j] & ~mask) | (value & mask); - } - } - } -} - -void KeyEncoder::EncoderBinaryPair::Encode(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col1, - const KeyColumnArray& col2, - KeyEncoderContext* ctx, KeyColumnArray* temp1, - KeyColumnArray* temp2) { - DCHECK(CanProcessPair(col1.metadata(), col2.metadata())); - - KeyColumnArray col_prep[2]; - if (EncoderInteger::UsesTransform(col1)) { - col_prep[0] = EncoderInteger::ArrayReplace(col1, *temp1); - EncoderInteger::PreEncode(col1, &(col_prep[0]), ctx); - } else { - col_prep[0] = col1; - } - if (EncoderInteger::UsesTransform(col2)) { - col_prep[1] = EncoderInteger::ArrayReplace(col2, *temp2); - EncoderInteger::PreEncode(col2, &(col_prep[1]), ctx); - } else { - col_prep[1] = col2; - } - - uint32_t col_width1 = col_prep[0].metadata().fixed_length; - uint32_t col_width2 = col_prep[1].metadata().fixed_length; - int log_col_width1 = - col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0; - int log_col_width2 = - col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0; - - bool is_row_fixed_length = rows->metadata().is_fixed_length; - - const auto num_rows = static_cast(col1.length()); - uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) - if (ctx->has_avx2() && col_width1 == col_width2) { - num_processed = EncodeHelper_avx2(is_row_fixed_length, col_width1, offset_within_row, - rows, col_prep[0], col_prep[1]); - } -#endif - if (num_processed < num_rows) { - using EncodeImp_t = void (*)(uint32_t, uint32_t, KeyRowArray*, const KeyColumnArray&, - const KeyColumnArray&); - static const EncodeImp_t EncodeImp_fn[] = { - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp, - EncodeImp, EncodeImp}; - int dispatch_const = (log_col_width2 << 2) | log_col_width1; - dispatch_const += (is_row_fixed_length ? 16 : 0); - EncodeImp_fn[dispatch_const](num_processed, offset_within_row, rows, col_prep[0], - col_prep[1]); - } -} - -template -void KeyEncoder::EncoderBinaryPair::EncodeImp(uint32_t num_rows_to_skip, - uint32_t offset_within_row, - KeyRowArray* rows, - const KeyColumnArray& col1, - const KeyColumnArray& col2) { - const uint8_t* src_A = col1.data(1); - const uint8_t* src_B = col2.data(1); - - const auto num_rows = static_cast(col1.length()); - - uint32_t fixed_length = rows->metadata().fixed_length; - const uint32_t* offsets; - uint8_t* dst_base; - if (is_row_fixed_length) { - dst_base = rows->mutable_data(1) + offset_within_row; - offsets = nullptr; - } else { - dst_base = rows->mutable_data(2) + offset_within_row; - offsets = rows->offsets(); - } - - using col1_type_const = typename std::add_const::type; - using col2_type_const = typename std::add_const::type; - - if (is_row_fixed_length) { - uint8_t* dst = dst_base + num_rows_to_skip * fixed_length; - for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) { - *reinterpret_cast(dst) = reinterpret_cast(src_A)[i]; - *reinterpret_cast(dst + sizeof(col1_type)) = - reinterpret_cast(src_B)[i]; - dst += fixed_length; - } - } else { - for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) { - uint8_t* dst = dst_base + offsets[i]; - *reinterpret_cast(dst) = reinterpret_cast(src_A)[i]; - *reinterpret_cast(dst + sizeof(col1_type)) = - reinterpret_cast(src_B)[i]; - } - } -} - void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col1, @@ -1024,101 +666,6 @@ void KeyEncoder::EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip, } } -void KeyEncoder::EncoderOffsets::Encode(KeyRowArray* rows, - const std::vector& varbinary_cols, - KeyEncoderContext* ctx) { - DCHECK(!varbinary_cols.empty()); - - // Rows and columns must all be varying-length - DCHECK(!rows->metadata().is_fixed_length); - for (const auto& col : varbinary_cols) { - DCHECK(!col.metadata().is_fixed_length); - } - - const auto num_rows = static_cast(varbinary_cols[0].length()); - - uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) - // Whether any of the columns has non-zero starting bit offset for non-nulls bit vector - bool has_bit_offset = false; - - // The space in columns must be exactly equal to a space for offsets in rows - DCHECK(rows->length() == num_rows); - for (const auto& col : varbinary_cols) { - DCHECK(col.length() == num_rows); - if (col.bit_offset(0) != 0) { - has_bit_offset = true; - } - } - - if (ctx->has_avx2() && !has_bit_offset) { - // Create a temp vector sized based on the number of columns - auto temp_buffer_holder = util::TempVectorHolder( - ctx->stack, static_cast(varbinary_cols.size()) * 8); - auto temp_buffer_32B_per_col = KeyColumnArray( - KeyColumnMetadata(true, sizeof(uint32_t)), varbinary_cols.size() * 8, nullptr, - reinterpret_cast(temp_buffer_holder.mutable_data()), nullptr); - - num_processed = EncodeImp_avx2(rows, varbinary_cols, &temp_buffer_32B_per_col); - } -#endif - if (num_processed < num_rows) { - EncodeImp(num_processed, rows, varbinary_cols); - } -} - -void KeyEncoder::EncoderOffsets::EncodeImp( - uint32_t num_rows_already_processed, KeyRowArray* rows, - const std::vector& varbinary_cols) { - DCHECK_GT(varbinary_cols.size(), 0); - - int row_alignment = rows->metadata().row_alignment; - int string_alignment = rows->metadata().string_alignment; - - uint32_t* row_offsets = rows->mutable_offsets(); - uint8_t* row_values = rows->mutable_data(2); - const auto num_rows = static_cast(varbinary_cols[0].length()); - - if (num_rows_already_processed == 0) { - row_offsets[0] = 0; - } - - uint32_t row_offset = row_offsets[num_rows_already_processed]; - for (uint32_t i = num_rows_already_processed; i < num_rows; ++i) { - uint32_t* varbinary_end = - rows->metadata().varbinary_end_array(row_values + row_offset); - - // Zero out lengths for nulls. - // Add lengths of all columns to get row size. - // Store varbinary field ends while summing their lengths. - - uint32_t offset_within_row = rows->metadata().fixed_length; - - for (size_t col = 0; col < varbinary_cols.size(); ++col) { - const uint32_t* col_offsets = varbinary_cols[col].offsets(); - uint32_t col_length = col_offsets[i + 1] - col_offsets[i]; - - const int bit_offset = varbinary_cols[col].bit_offset(0); - - const uint8_t* non_nulls = varbinary_cols[col].data(0); - if (non_nulls && BitUtil::GetBit(non_nulls, bit_offset + i) == 0) { - col_length = 0; - } - - offset_within_row += - KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment); - offset_within_row += col_length; - - varbinary_end[col] = offset_within_row; - } - - offset_within_row += - KeyRowMetadata::padding_for_alignment(offset_within_row, row_alignment); - row_offset += offset_within_row; - row_offsets[i + 1] = row_offset; - } -} - void KeyEncoder::EncoderOffsets::Decode( uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows, std::vector* varbinary_cols, @@ -1170,24 +717,6 @@ void KeyEncoder::EncoderOffsets::Decode( } } -void KeyEncoder::EncoderVarBinary::Encode(uint32_t varbinary_col_id, KeyRowArray* rows, - const KeyColumnArray& col, - KeyEncoderContext* ctx) { -#if defined(ARROW_HAVE_AVX2) - if (ctx->has_avx2()) { - EncodeHelper_avx2(varbinary_col_id, rows, col); - } else { -#endif - if (varbinary_col_id == 0) { - EncodeImp(varbinary_col_id, rows, col); - } else { - EncodeImp(varbinary_col_id, rows, col); - } -#if defined(ARROW_HAVE_AVX2) - } -#endif -} - void KeyEncoder::EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const KeyRowArray& rows, KeyColumnArray* col, @@ -1209,35 +738,15 @@ void KeyEncoder::EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, #endif } -template -void KeyEncoder::EncoderVarBinary::EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows, - const KeyColumnArray& col) { - EncodeDecodeHelper( - 0, static_cast(col.length()), varbinary_col_id, rows, rows, &col, nullptr, - [](uint8_t* dst, const uint8_t* src, int64_t length) { - auto dst64 = reinterpret_cast(dst); - auto src64 = reinterpret_cast(src); - uint32_t istripe; - for (istripe = 0; istripe < length / 8; ++istripe) { - dst64[istripe] = util::SafeLoad(src64 + istripe); - } - if ((length % 8) > 0) { - uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length)); - dst64[istripe] = (dst64[istripe] & ~mask_last) | - (util::SafeLoad(src64 + istripe) & mask_last); - } - }); -} - template void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const KeyRowArray& rows, KeyColumnArray* col) { - EncodeDecodeHelper( + DecodeHelper( start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col, [](uint8_t* dst, const uint8_t* src, int64_t length) { - for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) { + for (uint32_t istripe = 0; istripe < BitUtil::CeilDiv(length, 8); ++istripe) { auto dst64 = reinterpret_cast(dst); auto src64 = reinterpret_cast(src); util::SafeStore(dst64 + istripe, src64[istripe]); @@ -1245,46 +754,6 @@ void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_ro }); } -void KeyEncoder::EncoderNulls::Encode(KeyRowArray* rows, - const std::vector& cols, - KeyEncoderContext* ctx, - KeyColumnArray* temp_vector_16bit) { - DCHECK_GT(cols.size(), 0); - const auto num_rows = static_cast(rows->length()); - - // All input columns should have the same number of rows. - // They may or may not have non-nulls bit-vectors allocated. - for (const auto& col : cols) { - DCHECK(col.length() == num_rows); - } - - // Temp vector needs space for the required number of rows - DCHECK(temp_vector_16bit->length() >= num_rows); - DCHECK(temp_vector_16bit->metadata().is_fixed_length && - temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t)); - - uint8_t* null_masks = rows->null_masks(); - uint32_t null_masks_bytes_per_row = rows->metadata().null_masks_bytes_per_row; - memset(null_masks, 0, null_masks_bytes_per_row * num_rows); - for (size_t col = 0; col < cols.size(); ++col) { - const uint8_t* non_nulls = cols[col].data(0); - if (!non_nulls) { - continue; - } - int bit_offset = cols[col].bit_offset(0); - DCHECK_LT(bit_offset, 8); - int num_selected; - util::BitUtil::bits_to_indexes( - 0, ctx->hardware_flags, num_rows, non_nulls, &num_selected, - reinterpret_cast(temp_vector_16bit->mutable_data(1)), bit_offset); - for (int i = 0; i < num_selected; ++i) { - uint16_t row_id = reinterpret_cast(temp_vector_16bit->data(1))[i]; - int64_t null_masks_bit_id = row_id * null_masks_bytes_per_row * 8 + col; - BitUtil::SetBit(null_masks, null_masks_bit_id); - } - } -} - void KeyEncoder::EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows, std::vector* cols) { @@ -1408,8 +877,11 @@ void KeyEncoder::KeyRowMetadata::FromColumnMetadataVector( uint32_t offset_within_row = 0; for (uint32_t i = 0; i < num_cols; ++i) { const KeyColumnMetadata& col = cols[column_order[i]]; - offset_within_row += - KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment, col); + if (col.is_fixed_length && col.fixed_length != 0 && + ARROW_POPCOUNT64(col.fixed_length) != 1) { + offset_within_row += + KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment, col); + } column_offsets[i] = offset_within_row; if (!col.is_fixed_length) { if (num_varbinary_cols == 0) { @@ -1481,94 +953,6 @@ void KeyEncoder::PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows, } } -Status KeyEncoder::PrepareOutputForEncode(int64_t start_row, int64_t num_rows, - KeyRowArray* rows, - const std::vector& all_cols) { - int64_t num_bytes_required = 0; - - int64_t fixed_part = row_metadata_.fixed_length * num_rows; - int64_t var_part = 0; - for (const auto& col : all_cols) { - if (!col.metadata().is_fixed_length) { - DCHECK(col.length() >= start_row + num_rows); - const uint32_t* offsets = col.offsets(); - var_part += offsets[start_row + num_rows] - offsets[start_row]; - // Include maximum padding that can be added to align the start of varbinary fields. - var_part += num_rows * row_metadata_.string_alignment; - } - } - // Include maximum padding that can be added to align the start of the rows. - if (!row_metadata_.is_fixed_length) { - fixed_part += row_metadata_.row_alignment * num_rows; - } - num_bytes_required = fixed_part + var_part; - - rows->Clean(); - RETURN_NOT_OK(rows->AppendEmpty(static_cast(num_rows), - static_cast(num_bytes_required))); - - return Status::OK(); -} - -void KeyEncoder::Encode(int64_t start_row, int64_t num_rows, KeyRowArray* rows, - const std::vector& cols) { - // Prepare column array vectors - PrepareKeyColumnArrays(start_row, num_rows, cols); - - // Create two temp vectors with 16-bit elements - auto temp_buffer_holder_A = - util::TempVectorHolder(ctx_->stack, static_cast(num_rows)); - auto temp_buffer_A = KeyColumnArray( - KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr, - reinterpret_cast(temp_buffer_holder_A.mutable_data()), nullptr); - auto temp_buffer_holder_B = - util::TempVectorHolder(ctx_->stack, static_cast(num_rows)); - auto temp_buffer_B = KeyColumnArray( - KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr, - reinterpret_cast(temp_buffer_holder_B.mutable_data()), nullptr); - - bool is_row_fixed_length = row_metadata_.is_fixed_length; - if (!is_row_fixed_length) { - // This call will generate and fill in data for both: - // - offsets to the entire encoded arrays - // - offsets for individual varbinary fields within each row - EncoderOffsets::Encode(rows, batch_varbinary_cols_, ctx_); - - for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) { - // Memcpy varbinary fields into precomputed in the previous step - // positions in the output row buffer. - EncoderVarBinary::Encode(static_cast(i), rows, batch_varbinary_cols_[i], - ctx_); - } - } - - // Process fixed length columns - const auto num_cols = static_cast(batch_all_cols_.size()); - for (uint32_t i = 0; i < num_cols;) { - if (!batch_all_cols_[i].metadata().is_fixed_length) { - i += 1; - continue; - } - bool can_process_pair = - (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length && - EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(), - batch_all_cols_[i + 1].metadata()); - if (!can_process_pair) { - EncoderBinary::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i], - ctx_, &temp_buffer_A); - i += 1; - } else { - EncoderBinaryPair::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i], - batch_all_cols_[i + 1], ctx_, &temp_buffer_A, - &temp_buffer_B); - i += 2; - } - } - - // Process nulls - EncoderNulls::Encode(rows, batch_all_cols_, ctx_, &temp_buffer_A); -} - void KeyEncoder::DecodeFixedLengthBuffers(int64_t start_row_input, int64_t start_row_output, int64_t num_rows, const KeyRowArray& rows, @@ -1645,5 +1029,305 @@ void KeyEncoder::DecodeVaryingLengthBuffers(int64_t start_row_input, } } +template +void KeyEncoder::EncoderBinary::EncodeSelectedImp( + uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col, + uint32_t num_selected, const uint16_t* selection, COPY_FN copy_fn, + SET_NULL_FN set_null_fn) { + bool is_fixed_length = rows->metadata().is_fixed_length; + if (is_fixed_length) { + uint32_t row_width = rows->metadata().fixed_length; + const uint8_t* src_base = col.data(1); + uint8_t* dst = rows->mutable_data(1) + offset_within_row; + for (uint32_t i = 0; i < num_selected; ++i) { + copy_fn(dst, src_base, selection[i]); + dst += row_width; + } + if (col.data(0)) { + const uint8_t* non_null_bits = col.data(0); + uint8_t* dst = rows->mutable_data(1) + offset_within_row; + for (uint32_t i = 0; i < num_selected; ++i) { + bool is_null = !BitUtil::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); + if (is_null) { + set_null_fn(dst); + } + dst += row_width; + } + } + } else { + const uint8_t* src_base = col.data(1); + uint8_t* dst = rows->mutable_data(2) + offset_within_row; + const uint32_t* offsets = rows->offsets(); + for (uint32_t i = 0; i < num_selected; ++i) { + copy_fn(dst + offsets[i], src_base, selection[i]); + } + if (col.data(0)) { + const uint8_t* non_null_bits = col.data(0); + uint8_t* dst = rows->mutable_data(2) + offset_within_row; + const uint32_t* offsets = rows->offsets(); + for (uint32_t i = 0; i < num_selected; ++i) { + bool is_null = !BitUtil::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); + if (is_null) { + set_null_fn(dst + offsets[i]); + } + } + } + } +} + +void KeyEncoder::EncoderBinary::EncodeSelected(uint32_t offset_within_row, + KeyRowArray* rows, + const KeyColumnArray& col, + uint32_t num_selected, + const uint16_t* selection) { + uint32_t col_width = col.metadata().fixed_length; + if (col_width == 0) { + int bit_offset = col.bit_offset(1); + EncodeSelectedImp( + offset_within_row, rows, col, num_selected, selection, + [bit_offset](uint8_t* dst, const uint8_t* src_base, uint16_t irow) { + *dst = BitUtil::GetBit(src_base, irow + bit_offset) ? 0xff : 0x00; + }, + [](uint8_t* dst) { *dst = 0xae; }); + } else if (col_width == 1) { + EncodeSelectedImp( + offset_within_row, rows, col, num_selected, selection, + [](uint8_t* dst, const uint8_t* src_base, uint16_t irow) { + *dst = src_base[irow]; + }, + [](uint8_t* dst) { *dst = 0xae; }); + } else if (col_width == 2) { + EncodeSelectedImp( + offset_within_row, rows, col, num_selected, selection, + [](uint8_t* dst, const uint8_t* src_base, uint16_t irow) { + *reinterpret_cast(dst) = + reinterpret_cast(src_base)[irow]; + }, + [](uint8_t* dst) { *reinterpret_cast(dst) = 0xaeae; }); + } else if (col_width == 4) { + EncodeSelectedImp( + offset_within_row, rows, col, num_selected, selection, + [](uint8_t* dst, const uint8_t* src_base, uint16_t irow) { + *reinterpret_cast(dst) = + reinterpret_cast(src_base)[irow]; + }, + [](uint8_t* dst) { + *reinterpret_cast(dst) = static_cast(0xaeaeaeae); + }); + } else if (col_width == 8) { + EncodeSelectedImp( + offset_within_row, rows, col, num_selected, selection, + [](uint8_t* dst, const uint8_t* src_base, uint16_t irow) { + *reinterpret_cast(dst) = + reinterpret_cast(src_base)[irow]; + }, + [](uint8_t* dst) { *reinterpret_cast(dst) = 0xaeaeaeaeaeaeaeaeULL; }); + } else { + EncodeSelectedImp( + offset_within_row, rows, col, num_selected, selection, + [col_width](uint8_t* dst, const uint8_t* src_base, uint16_t irow) { + memcpy(dst, src_base + col_width * irow, col_width); + }, + [col_width](uint8_t* dst) { memset(dst, 0xae, col_width); }); + } +} + +void KeyEncoder::EncoderOffsets::GetRowOffsetsSelected( + KeyRowArray* rows, const std::vector& cols, uint32_t num_selected, + const uint16_t* selection) { + if (rows->metadata().is_fixed_length) { + return; + } + + uint32_t* row_offsets = rows->mutable_offsets(); + for (uint32_t i = 0; i < num_selected; ++i) { + row_offsets[i] = rows->metadata().fixed_length; + } + + for (size_t icol = 0; icol < cols.size(); ++icol) { + bool is_fixed_length = (cols[icol].metadata().is_fixed_length); + if (!is_fixed_length) { + const uint32_t* col_offsets = cols[icol].offsets(); + for (uint32_t i = 0; i < num_selected; ++i) { + uint32_t irow = selection[i]; + uint32_t length = col_offsets[irow + 1] - col_offsets[irow]; + row_offsets[i] += KeyRowMetadata::padding_for_alignment( + row_offsets[i], rows->metadata().string_alignment); + row_offsets[i] += length; + } + const uint8_t* non_null_bits = cols[icol].data(0); + if (non_null_bits) { + const uint32_t* col_offsets = cols[icol].offsets(); + for (uint32_t i = 0; i < num_selected; ++i) { + uint32_t irow = selection[i]; + bool is_null = !BitUtil::GetBit(non_null_bits, irow + cols[icol].bit_offset(0)); + if (is_null) { + uint32_t length = col_offsets[irow + 1] - col_offsets[irow]; + row_offsets[i] -= length; + } + } + } + } + } + + uint32_t sum = 0; + int row_alignment = rows->metadata().row_alignment; + for (uint32_t i = 0; i < num_selected; ++i) { + uint32_t length = row_offsets[i]; + length += KeyRowMetadata::padding_for_alignment(length, row_alignment); + row_offsets[i] = sum; + sum += length; + } + row_offsets[num_selected] = sum; +} + +template +void KeyEncoder::EncoderOffsets::EncodeSelectedImp( + uint32_t ivarbinary, KeyRowArray* rows, const std::vector& cols, + uint32_t num_selected, const uint16_t* selection) { + const uint32_t* row_offsets = rows->offsets(); + uint8_t* row_base = rows->mutable_data(2) + + rows->metadata().varbinary_end_array_offset + + ivarbinary * sizeof(uint32_t); + const uint32_t* col_offsets = cols[ivarbinary].offsets(); + const uint8_t* col_non_null_bits = cols[ivarbinary].data(0); + + for (uint32_t i = 0; i < num_selected; ++i) { + uint32_t irow = selection[i]; + uint32_t length = col_offsets[irow + 1] - col_offsets[irow]; + if (has_nulls) { + uint32_t null_multiplier = + BitUtil::GetBit(col_non_null_bits, irow + cols[ivarbinary].bit_offset(0)) ? 1 + : 0; + length *= null_multiplier; + } + uint32_t* row = reinterpret_cast(row_base + row_offsets[i]); + if (is_first_varbinary) { + row[0] = rows->metadata().fixed_length + length; + } else { + row[0] = row[-1] + + KeyRowMetadata::padding_for_alignment(row[-1], + rows->metadata().string_alignment) + + length; + } + } +} + +void KeyEncoder::EncoderOffsets::EncodeSelected(KeyRowArray* rows, + const std::vector& cols, + uint32_t num_selected, + const uint16_t* selection) { + if (rows->metadata().is_fixed_length) { + return; + } + uint32_t ivarbinary = 0; + for (size_t icol = 0; icol < cols.size(); ++icol) { + if (!cols[icol].metadata().is_fixed_length) { + const uint8_t* non_null_bits = cols[icol].data(0); + if (non_null_bits && ivarbinary == 0) { + EncodeSelectedImp(ivarbinary, rows, cols, num_selected, selection); + } else if (non_null_bits && ivarbinary > 0) { + EncodeSelectedImp(ivarbinary, rows, cols, num_selected, selection); + } else if (!non_null_bits && ivarbinary == 0) { + EncodeSelectedImp(ivarbinary, rows, cols, num_selected, selection); + } else { + EncodeSelectedImp(ivarbinary, rows, cols, num_selected, selection); + } + ivarbinary++; + } + } +} + +void KeyEncoder::EncoderVarBinary::EncodeSelected(uint32_t ivarbinary, KeyRowArray* rows, + const KeyColumnArray& cols, + uint32_t num_selected, + const uint16_t* selection) { + const uint32_t* row_offsets = rows->offsets(); + uint8_t* row_base = rows->mutable_data(2); + const uint32_t* col_offsets = cols.offsets(); + const uint8_t* col_base = cols.data(2); + + if (ivarbinary == 0) { + for (uint32_t i = 0; i < num_selected; ++i) { + uint8_t* row = row_base + row_offsets[i]; + uint32_t row_offset; + uint32_t length; + rows->metadata().first_varbinary_offset_and_length(row, &row_offset, &length); + uint32_t irow = selection[i]; + memcpy(row + row_offset, col_base + col_offsets[irow], length); + } + } else { + for (uint32_t i = 0; i < num_selected; ++i) { + uint8_t* row = row_base + row_offsets[i]; + uint32_t row_offset; + uint32_t length; + rows->metadata().nth_varbinary_offset_and_length(row, ivarbinary, &row_offset, + &length); + uint32_t irow = selection[i]; + memcpy(row + row_offset, col_base + col_offsets[irow], length); + } + } +} + +void KeyEncoder::EncoderNulls::EncodeSelected(KeyRowArray* rows, + const std::vector& cols, + uint32_t num_selected, + const uint16_t* selection) { + uint8_t* null_masks = rows->null_masks(); + uint32_t null_mask_num_bytes = rows->metadata().null_masks_bytes_per_row; + memset(null_masks, 0, null_mask_num_bytes * num_selected); + for (size_t icol = 0; icol < cols.size(); ++icol) { + const uint8_t* non_null_bits = cols[icol].data(0); + if (non_null_bits) { + for (uint32_t i = 0; i < num_selected; ++i) { + uint32_t irow = selection[i]; + bool is_null = !BitUtil::GetBit(non_null_bits, irow + cols[icol].bit_offset(0)); + if (is_null) { + BitUtil::SetBit(null_masks, i * null_mask_num_bytes * 8 + icol); + } + } + } + } +} + +void KeyEncoder::PrepareEncodeSelected(int64_t start_row, int64_t num_rows, + const std::vector& cols) { + // Prepare column array vectors + PrepareKeyColumnArrays(start_row, num_rows, cols); +} + +Status KeyEncoder::EncodeSelected(KeyRowArray* rows, uint32_t num_selected, + const uint16_t* selection) { + rows->Clean(); + RETURN_NOT_OK( + rows->AppendEmpty(static_cast(num_selected), static_cast(0))); + + EncoderOffsets::GetRowOffsetsSelected(rows, batch_varbinary_cols_, num_selected, + selection); + + RETURN_NOT_OK(rows->AppendEmpty(static_cast(0), + static_cast(rows->offsets()[num_selected]))); + + for (size_t icol = 0; icol < batch_all_cols_.size(); ++icol) { + if (batch_all_cols_[icol].metadata().is_fixed_length) { + uint32_t offset_within_row = rows->metadata().column_offsets[icol]; + EncoderBinary::EncodeSelected(offset_within_row, rows, batch_all_cols_[icol], + num_selected, selection); + } + } + + EncoderOffsets::EncodeSelected(rows, batch_varbinary_cols_, num_selected, selection); + + for (size_t icol = 0; icol < batch_varbinary_cols_.size(); ++icol) { + EncoderVarBinary::EncodeSelected(static_cast(icol), rows, + batch_varbinary_cols_[icol], num_selected, + selection); + } + + EncoderNulls::EncodeSelected(rows, batch_all_cols_, num_selected, selection); + + return Status::OK(); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_encode.h b/cpp/src/arrow/compute/exec/key_encode.h index e5397b9dfd4..69f4a1694d9 100644 --- a/cpp/src/arrow/compute/exec/key_encode.h +++ b/cpp/src/arrow/compute/exec/key_encode.h @@ -291,20 +291,10 @@ class KeyEncoder { const KeyRowMetadata& row_metadata() { return row_metadata_; } - /// Find out the required sizes of all buffers output buffers for encoding - /// (including varying-length buffers). - /// Use that information to resize provided row array so that it can fit - /// encoded data. - Status PrepareOutputForEncode(int64_t start_input_row, int64_t num_input_rows, - KeyRowArray* rows, - const std::vector& all_cols); - - /// Encode a window of column oriented data into the entire output - /// row oriented storage. - /// The output buffers for encoding need to be correctly sized before - /// starting encoding. - void Encode(int64_t start_input_row, int64_t num_input_rows, KeyRowArray* rows, - const std::vector& cols); + void PrepareEncodeSelected(int64_t start_row, int64_t num_rows, + const std::vector& cols); + Status EncodeSelected(KeyRowArray* rows, uint32_t num_selected, + const uint16_t* selection); /// Decode a window of row oriented data into a corresponding /// window of column oriented storage. @@ -322,6 +312,8 @@ class KeyEncoder { int64_t num_rows, const KeyRowArray& rows, std::vector* cols); + const std::vector& GetBatchColumns() const { return batch_all_cols_; } + private: /// Prepare column array vectors. /// Output column arrays represent a range of input column arrays @@ -337,25 +329,18 @@ class KeyEncoder { public: static KeyColumnArray ArrayReplace(const KeyColumnArray& column, const KeyColumnArray& temp); - static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output, - KeyEncoderContext* ctx); static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output, KeyEncoderContext* ctx); }; class EncoderInteger { public: - static void Encode(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col, KeyEncoderContext* ctx, - KeyColumnArray* temp); static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col, KeyEncoderContext* ctx, KeyColumnArray* temp); static bool UsesTransform(const KeyColumnArray& column); static KeyColumnArray ArrayReplace(const KeyColumnArray& column, const KeyColumnArray& temp); - static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output, - KeyEncoderContext* ctx); static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output, KeyEncoderContext* ctx); @@ -365,52 +350,42 @@ class KeyEncoder { class EncoderBinary { public: - static void Encode(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col, KeyEncoderContext* ctx, - KeyColumnArray* temp); + static void EncodeSelected(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col, uint32_t num_selected, + const uint16_t* selection); static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col, KeyEncoderContext* ctx, KeyColumnArray* temp); static bool IsInteger(const KeyColumnMetadata& metadata); private: - template - static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows, - uint32_t offset_within_row, - const KeyRowArray* rows_const, - KeyRowArray* rows_mutable_maybe_null, - const KeyColumnArray* col_const, - KeyColumnArray* col_mutable_maybe_null, - COPY_FN copy_fn); - template - static void EncodeImp(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col); + template + static void EncodeSelectedImp(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col, uint32_t num_selected, + const uint16_t* selection, COPY_FN copy_fn, + SET_NULL_FN set_null_fn); + + template + static inline void DecodeHelper(uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, + const KeyRowArray* rows_const, + KeyRowArray* rows_mutable_maybe_null, + const KeyColumnArray* col_const, + KeyColumnArray* col_mutable_maybe_null, + COPY_FN copy_fn); template static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col); #if defined(ARROW_HAVE_AVX2) - static void EncodeHelper_avx2(bool is_row_fixed_length, uint32_t offset_within_row, - KeyRowArray* rows, const KeyColumnArray& col); static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col); template - static void EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col); - template static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col); #endif - static void ColumnMemsetNulls(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col, KeyEncoderContext* ctx, - KeyColumnArray* temp_vector_16bit, uint8_t byte_value); - template - static void ColumnMemsetNullsImp(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col, KeyEncoderContext* ctx, - KeyColumnArray* temp_vector_16bit, - uint8_t byte_value); }; class EncoderBinaryPair { @@ -419,10 +394,6 @@ class KeyEncoder { const KeyColumnMetadata& col2) { return EncoderBinary::IsInteger(col1) && EncoderBinary::IsInteger(col2); } - static void Encode(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col1, const KeyColumnArray& col2, - KeyEncoderContext* ctx, KeyColumnArray* temp1, - KeyColumnArray* temp2); static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col1, KeyColumnArray* col2, KeyEncoderContext* ctx, @@ -430,28 +401,16 @@ class KeyEncoder { private: template - static void EncodeImp(uint32_t num_rows_to_skip, uint32_t offset_within_row, - KeyRowArray* rows, const KeyColumnArray& col1, - const KeyColumnArray& col2); - template static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col1, KeyColumnArray* col2); #if defined(ARROW_HAVE_AVX2) - static uint32_t EncodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width, - uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col1, - const KeyColumnArray& col2); static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col1, KeyColumnArray* col2); template - static uint32_t EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows, - const KeyColumnArray& col1, - const KeyColumnArray& col2); - template static uint32_t DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col1, KeyColumnArray* col2); @@ -460,63 +419,52 @@ class KeyEncoder { class EncoderOffsets { public: - // In order not to repeat work twice, - // encoding combines in a single pass computing of: - // a) row offsets for varying-length rows - // b) within each new row, the cumulative length array - // of varying-length values within a row. - static void Encode(KeyRowArray* rows, - const std::vector& varbinary_cols, - KeyEncoderContext* ctx); + static void GetRowOffsetsSelected(KeyRowArray* rows, + const std::vector& cols, + uint32_t num_selected, const uint16_t* selection); + static void EncodeSelected(KeyRowArray* rows, const std::vector& cols, + uint32_t num_selected, const uint16_t* selection); + static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows, std::vector* varbinary_cols, const std::vector& varbinary_cols_base_offset, KeyEncoderContext* ctx); private: - static void EncodeImp(uint32_t num_rows_already_processed, KeyRowArray* rows, - const std::vector& varbinary_cols); -#if defined(ARROW_HAVE_AVX2) - static uint32_t EncodeImp_avx2(KeyRowArray* rows, - const std::vector& varbinary_cols, - KeyColumnArray* temp_buffer_32B_per_col); -#endif + template + static void EncodeSelectedImp(uint32_t ivarbinary, KeyRowArray* rows, + const std::vector& cols, + uint32_t num_selected, const uint16_t* selection); }; class EncoderVarBinary { public: - static void Encode(uint32_t varbinary_col_id, KeyRowArray* rows, - const KeyColumnArray& col, KeyEncoderContext* ctx); + static void EncodeSelected(uint32_t ivarbinary, KeyRowArray* rows, + const KeyColumnArray& cols, uint32_t num_selected, + const uint16_t* selection); + static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const KeyRowArray& rows, KeyColumnArray* col, KeyEncoderContext* ctx); private: - template - static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows, - uint32_t varbinary_col_id, - const KeyRowArray* rows_const, - KeyRowArray* rows_mutable_maybe_null, - const KeyColumnArray* col_const, - KeyColumnArray* col_mutable_maybe_null, - COPY_FN copy_fn); - template - static void EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows, - const KeyColumnArray& col); + template + static inline void DecodeHelper(uint32_t start_row, uint32_t num_rows, + uint32_t varbinary_col_id, + const KeyRowArray* rows_const, + KeyRowArray* rows_mutable_maybe_null, + const KeyColumnArray* col_const, + KeyColumnArray* col_mutable_maybe_null, + COPY_FN copy_fn); template static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const KeyRowArray& rows, KeyColumnArray* col); #if defined(ARROW_HAVE_AVX2) - static void EncodeHelper_avx2(uint32_t varbinary_col_id, KeyRowArray* rows, - const KeyColumnArray& col); static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const KeyRowArray& rows, KeyColumnArray* col); template - static void EncodeImp_avx2(uint32_t varbinary_col_id, KeyRowArray* rows, - const KeyColumnArray& col); - template static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const KeyRowArray& rows, KeyColumnArray* col); @@ -525,8 +473,9 @@ class KeyEncoder { class EncoderNulls { public: - static void Encode(KeyRowArray* rows, const std::vector& cols, - KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit); + static void EncodeSelected(KeyRowArray* rows, const std::vector& cols, + uint32_t num_selected, const uint16_t* selection); + static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows, std::vector* cols); }; @@ -543,8 +492,8 @@ class KeyEncoder { std::vector batch_varbinary_cols_base_offsets_; }; -template -inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper( +template +inline void KeyEncoder::EncoderBinary::DecodeHelper( uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null, const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null, @@ -557,14 +506,8 @@ inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper( for (uint32_t i = 0; i < num_rows; ++i) { const uint8_t* src; uint8_t* dst; - if (is_encoding) { - src = col_const->data(1) + col_width * i; - dst = rows_mutable_maybe_null->mutable_data(1) + row_width * (start_row + i) + - offset_within_row; - } else { - src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row; - dst = col_mutable_maybe_null->mutable_data(1) + col_width * i; - } + src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row; + dst = col_mutable_maybe_null->mutable_data(1) + col_width * i; copy_fn(dst, src, col_width); } } else { @@ -572,21 +515,15 @@ inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper( for (uint32_t i = 0; i < num_rows; ++i) { const uint8_t* src; uint8_t* dst; - if (is_encoding) { - src = col_const->data(1) + col_width * i; - dst = rows_mutable_maybe_null->mutable_data(2) + row_offsets[start_row + i] + - offset_within_row; - } else { - src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row; - dst = col_mutable_maybe_null->mutable_data(1) + col_width * i; - } + src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row; + dst = col_mutable_maybe_null->mutable_data(1) + col_width * i; copy_fn(dst, src, col_width); } } } -template -inline void KeyEncoder::EncoderVarBinary::EncodeDecodeHelper( +template +inline void KeyEncoder::EncoderVarBinary::DecodeHelper( uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null, const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null, @@ -620,13 +557,8 @@ inline void KeyEncoder::EncoderVarBinary::EncodeDecodeHelper( const uint8_t* src; uint8_t* dst; - if (is_encoding) { - src = col_const->data(2) + col_offset; - dst = rows_mutable_maybe_null->mutable_data(2) + row_offset; - } else { - src = rows_const->data(2) + row_offset; - dst = col_mutable_maybe_null->mutable_data(2) + col_offset; - } + src = rows_const->data(2) + row_offset; + dst = col_mutable_maybe_null->mutable_data(2) + col_offset; copy_fn(dst, src, length); } } diff --git a/cpp/src/arrow/compute/exec/key_encode_avx2.cc b/cpp/src/arrow/compute/exec/key_encode_avx2.cc index d875412cf88..832bb0361d8 100644 --- a/cpp/src/arrow/compute/exec/key_encode_avx2.cc +++ b/cpp/src/arrow/compute/exec/key_encode_avx2.cc @@ -24,69 +24,6 @@ namespace compute { #if defined(ARROW_HAVE_AVX2) -inline __m256i set_first_n_bytes_avx2(int n) { - constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; - constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; - constexpr uint64_t kByteSequence16To23 = 0x1716151413121110ULL; - constexpr uint64_t kByteSequence24To31 = 0x1f1e1d1c1b1a1918ULL; - - return _mm256_cmpgt_epi8(_mm256_set1_epi8(n), - _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, - kByteSequence16To23, kByteSequence24To31)); -} - -inline __m256i inclusive_prefix_sum_32bit_avx2(__m256i x) { - x = _mm256_add_epi32( - x, _mm256_permutevar8x32_epi32( - _mm256_andnot_si256(_mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0xffffffff), x), - _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6))); - x = _mm256_add_epi32( - x, _mm256_permute4x64_epi64( - _mm256_andnot_si256( - _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0xffffffff, 0xffffffff), x), - 0x93)); // 0b10010011 - x = _mm256_add_epi32( - x, _mm256_permute4x64_epi64( - _mm256_andnot_si256( - _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0xffffffff, 0xffffffff), x), - 0x4f)); // 0b01001111 - return x; -} - -void KeyEncoder::EncoderBinary::EncodeHelper_avx2(bool is_row_fixed_length, - uint32_t offset_within_row, - KeyRowArray* rows, - const KeyColumnArray& col) { - if (is_row_fixed_length) { - EncodeImp_avx2(offset_within_row, rows, col); - } else { - EncodeImp_avx2(offset_within_row, rows, col); - } -} - -template -void KeyEncoder::EncoderBinary::EncodeImp_avx2(uint32_t offset_within_row, - KeyRowArray* rows, - const KeyColumnArray& col) { - EncodeDecodeHelper( - 0, static_cast(col.length()), offset_within_row, rows, rows, &col, - nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) { - __m256i* dst256 = reinterpret_cast<__m256i*>(dst); - const __m256i* src256 = reinterpret_cast(src); - uint32_t istripe; - for (istripe = 0; istripe < length / 32; ++istripe) { - _mm256_storeu_si256(dst256 + istripe, _mm256_loadu_si256(src256 + istripe)); - } - if ((length % 32) > 0) { - __m256i mask = set_first_n_bytes_avx2(length % 32); - _mm256_storeu_si256( - dst256 + istripe, - _mm256_blendv_epi8(_mm256_loadu_si256(dst256 + istripe), - _mm256_loadu_si256(src256 + istripe), mask)); - } - }); -} - void KeyEncoder::EncoderBinary::DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, @@ -104,7 +41,7 @@ void KeyEncoder::EncoderBinary::DecodeImp_avx2(uint32_t start_row, uint32_t num_ uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col) { - EncodeDecodeHelper( + DecodeHelper( start_row, num_rows, offset_within_row, &rows, nullptr, col, col, [](uint8_t* dst, const uint8_t* src, int64_t length) { for (uint32_t istripe = 0; istripe < (length + 31) / 32; ++istripe) { @@ -115,128 +52,6 @@ void KeyEncoder::EncoderBinary::DecodeImp_avx2(uint32_t start_row, uint32_t num_ }); } -uint32_t KeyEncoder::EncoderBinaryPair::EncodeHelper_avx2( - bool is_row_fixed_length, uint32_t col_width, uint32_t offset_within_row, - KeyRowArray* rows, const KeyColumnArray& col1, const KeyColumnArray& col2) { - using EncodeImp_avx2_t = - uint32_t (*)(uint32_t, KeyRowArray*, const KeyColumnArray&, const KeyColumnArray&); - static const EncodeImp_avx2_t EncodeImp_avx2_fn[] = { - EncodeImp_avx2, EncodeImp_avx2, EncodeImp_avx2, - EncodeImp_avx2, EncodeImp_avx2, EncodeImp_avx2, - EncodeImp_avx2, EncodeImp_avx2, - }; - int log_col_width = col_width == 8 ? 3 : col_width == 4 ? 2 : col_width == 2 ? 1 : 0; - int dispatch_const = (is_row_fixed_length ? 4 : 0) + log_col_width; - return EncodeImp_avx2_fn[dispatch_const](offset_within_row, rows, col1, col2); -} - -template -uint32_t KeyEncoder::EncoderBinaryPair::EncodeImp_avx2(uint32_t offset_within_row, - KeyRowArray* rows, - const KeyColumnArray& col1, - const KeyColumnArray& col2) { - uint32_t num_rows = static_cast(col1.length()); - ARROW_DCHECK(col_width == 1 || col_width == 2 || col_width == 4 || col_width == 8); - - const uint8_t* col_vals_A = col1.data(1); - const uint8_t* col_vals_B = col2.data(1); - uint8_t* row_vals = is_row_fixed_length ? rows->mutable_data(1) : rows->mutable_data(2); - - constexpr int unroll = 32 / col_width; - - uint32_t num_processed = num_rows / unroll * unroll; - - for (uint32_t i = 0; i < num_rows / unroll; ++i) { - __m256i col_A = _mm256_loadu_si256(reinterpret_cast(col_vals_A) + i); - __m256i col_B = _mm256_loadu_si256(reinterpret_cast(col_vals_B) + i); - __m256i r0, r1; - if (col_width == 1) { - // results in 16-bit outputs in the order: 0..7, 16..23 - r0 = _mm256_unpacklo_epi8(col_A, col_B); - // results in 16-bit outputs in the order: 8..15, 24..31 - r1 = _mm256_unpackhi_epi8(col_A, col_B); - } else if (col_width == 2) { - // results in 32-bit outputs in the order: 0..3, 8..11 - r0 = _mm256_unpacklo_epi16(col_A, col_B); - // results in 32-bit outputs in the order: 4..7, 12..15 - r1 = _mm256_unpackhi_epi16(col_A, col_B); - } else if (col_width == 4) { - // results in 64-bit outputs in the order: 0..1, 4..5 - r0 = _mm256_unpacklo_epi32(col_A, col_B); - // results in 64-bit outputs in the order: 2..3, 6..7 - r1 = _mm256_unpackhi_epi32(col_A, col_B); - } else if (col_width == 8) { - // results in 128-bit outputs in the order: 0, 2 - r0 = _mm256_unpacklo_epi64(col_A, col_B); - // results in 128-bit outputs in the order: 1, 3 - r1 = _mm256_unpackhi_epi64(col_A, col_B); - } - col_A = _mm256_permute2x128_si256(r0, r1, 0x20); - col_B = _mm256_permute2x128_si256(r0, r1, 0x31); - if (col_width == 8) { - __m128i *dst0, *dst1, *dst2, *dst3; - if (is_row_fixed_length) { - uint32_t fixed_length = rows->metadata().fixed_length; - uint8_t* dst = row_vals + offset_within_row + fixed_length * i * unroll; - dst0 = reinterpret_cast<__m128i*>(dst); - dst1 = reinterpret_cast<__m128i*>(dst + fixed_length); - dst2 = reinterpret_cast<__m128i*>(dst + fixed_length * 2); - dst3 = reinterpret_cast<__m128i*>(dst + fixed_length * 3); - } else { - const uint32_t* row_offsets = rows->offsets() + i * unroll; - uint8_t* dst = row_vals + offset_within_row; - dst0 = reinterpret_cast<__m128i*>(dst + row_offsets[0]); - dst1 = reinterpret_cast<__m128i*>(dst + row_offsets[1]); - dst2 = reinterpret_cast<__m128i*>(dst + row_offsets[2]); - dst3 = reinterpret_cast<__m128i*>(dst + row_offsets[3]); - } - _mm_storeu_si128(dst0, _mm256_castsi256_si128(r0)); - _mm_storeu_si128(dst1, _mm256_castsi256_si128(r1)); - _mm_storeu_si128(dst2, _mm256_extracti128_si256(r0, 1)); - _mm_storeu_si128(dst3, _mm256_extracti128_si256(r1, 1)); - - } else { - uint8_t buffer[64]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(buffer), col_A); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(buffer) + 1, col_B); - - if (is_row_fixed_length) { - uint32_t fixed_length = rows->metadata().fixed_length; - uint8_t* dst = row_vals + offset_within_row + fixed_length * i * unroll; - for (int j = 0; j < unroll; ++j) { - if (col_width == 1) { - *reinterpret_cast(dst + fixed_length * j) = - reinterpret_cast(buffer)[j]; - } else if (col_width == 2) { - *reinterpret_cast(dst + fixed_length * j) = - reinterpret_cast(buffer)[j]; - } else if (col_width == 4) { - *reinterpret_cast(dst + fixed_length * j) = - reinterpret_cast(buffer)[j]; - } - } - } else { - const uint32_t* row_offsets = rows->offsets() + i * unroll; - uint8_t* dst = row_vals + offset_within_row; - for (int j = 0; j < unroll; ++j) { - if (col_width == 1) { - *reinterpret_cast(dst + row_offsets[j]) = - reinterpret_cast(buffer)[j]; - } else if (col_width == 2) { - *reinterpret_cast(dst + row_offsets[j]) = - reinterpret_cast(buffer)[j]; - } else if (col_width == 4) { - *reinterpret_cast(dst + row_offsets[j]) = - reinterpret_cast(buffer)[j]; - } - } - } - } - } - - return num_processed; -} - uint32_t KeyEncoder::EncoderBinaryPair::DecodeHelper_avx2( bool is_row_fixed_length, uint32_t col_width, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col1, @@ -392,125 +207,6 @@ uint32_t KeyEncoder::EncoderBinaryPair::DecodeImp_avx2( return num_processed; } -uint32_t KeyEncoder::EncoderOffsets::EncodeImp_avx2( - KeyRowArray* rows, const std::vector& varbinary_cols, - KeyColumnArray* temp_buffer_32B_per_col) { - ARROW_DCHECK(temp_buffer_32B_per_col->metadata().is_fixed_length && - temp_buffer_32B_per_col->metadata().fixed_length == - static_cast(sizeof(uint32_t)) && - temp_buffer_32B_per_col->length() >= - static_cast(varbinary_cols.size()) * 8); - ARROW_DCHECK(varbinary_cols.size() > 0); - - int row_alignment = rows->metadata().row_alignment; - int string_alignment = rows->metadata().string_alignment; - - uint32_t* row_offsets = rows->mutable_offsets(); - uint8_t* row_values = rows->mutable_data(2); - uint32_t num_rows = static_cast(varbinary_cols[0].length()); - - constexpr int unroll = 8; - uint32_t num_processed = num_rows / unroll * unroll; - uint32_t* temp_varbinary_ends = - reinterpret_cast(temp_buffer_32B_per_col->mutable_data(1)); - - row_offsets[0] = 0; - - __m256i row_offset = _mm256_setzero_si256(); - for (uint32_t i = 0; i < num_rows / unroll; ++i) { - // Zero out lengths for nulls. - // Add lengths of all columns to get row size. - // Store in temp buffer varbinary field ends while summing their lengths. - - __m256i offset_within_row = _mm256_set1_epi32(rows->metadata().fixed_length); - - for (size_t col = 0; col < varbinary_cols.size(); ++col) { - const uint32_t* col_offsets = varbinary_cols[col].offsets(); - __m256i col_length = _mm256_sub_epi32( - _mm256_loadu_si256(reinterpret_cast(col_offsets + 1) + i), - _mm256_loadu_si256(reinterpret_cast(col_offsets + 0) + i)); - - const uint8_t* non_nulls = varbinary_cols[col].data(0); - if (non_nulls && non_nulls[i] != 0xff) { - // Zero out lengths for values that are not null - const __m256i individual_bits = - _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); - __m256i null_mask = _mm256_cmpeq_epi32( - _mm256_setzero_si256(), - _mm256_and_si256(_mm256_set1_epi32(non_nulls[i]), individual_bits)); - col_length = _mm256_andnot_si256(null_mask, col_length); - } - - __m256i padding = - _mm256_and_si256(_mm256_sub_epi32(_mm256_setzero_si256(), offset_within_row), - _mm256_set1_epi32(string_alignment - 1)); - offset_within_row = _mm256_add_epi32(offset_within_row, padding); - offset_within_row = _mm256_add_epi32(offset_within_row, col_length); - - _mm256_storeu_si256(reinterpret_cast<__m256i*>(temp_varbinary_ends) + col, - offset_within_row); - } - - __m256i padding = - _mm256_and_si256(_mm256_sub_epi32(_mm256_setzero_si256(), offset_within_row), - _mm256_set1_epi32(row_alignment - 1)); - offset_within_row = _mm256_add_epi32(offset_within_row, padding); - - // Inclusive prefix sum of 32-bit elements - __m256i row_offset_delta = inclusive_prefix_sum_32bit_avx2(offset_within_row); - row_offset = _mm256_add_epi32( - _mm256_permutevar8x32_epi32(row_offset, _mm256_set1_epi32(7)), row_offset_delta); - - _mm256_storeu_si256(reinterpret_cast<__m256i*>(row_offsets + 1) + i, row_offset); - - // Output varbinary ends for all fields in each row - for (size_t col = 0; col < varbinary_cols.size(); ++col) { - for (uint32_t row = 0; row < unroll; ++row) { - uint32_t* dst = rows->metadata().varbinary_end_array( - row_values + row_offsets[i * unroll + row]) + - col; - const uint32_t* src = temp_varbinary_ends + (col * unroll + row); - *dst = *src; - } - } - } - - return num_processed; -} - -void KeyEncoder::EncoderVarBinary::EncodeHelper_avx2(uint32_t varbinary_col_id, - KeyRowArray* rows, - const KeyColumnArray& col) { - if (varbinary_col_id == 0) { - EncodeImp_avx2(varbinary_col_id, rows, col); - } else { - EncodeImp_avx2(varbinary_col_id, rows, col); - } -} - -template -void KeyEncoder::EncoderVarBinary::EncodeImp_avx2(uint32_t varbinary_col_id, - KeyRowArray* rows, - const KeyColumnArray& col) { - EncodeDecodeHelper( - 0, static_cast(col.length()), varbinary_col_id, rows, rows, &col, nullptr, - [](uint8_t* dst, const uint8_t* src, int64_t length) { - __m256i* dst256 = reinterpret_cast<__m256i*>(dst); - const __m256i* src256 = reinterpret_cast(src); - uint32_t istripe; - for (istripe = 0; istripe < length / 32; ++istripe) { - _mm256_storeu_si256(dst256 + istripe, _mm256_loadu_si256(src256 + istripe)); - } - if ((length % 32) > 0) { - __m256i mask = set_first_n_bytes_avx2(length % 32); - _mm256_storeu_si256( - dst256 + istripe, - _mm256_blendv_epi8(_mm256_loadu_si256(dst256 + istripe), - _mm256_loadu_si256(src256 + istripe), mask)); - } - }); -} - void KeyEncoder::EncoderVarBinary::DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, @@ -528,7 +224,7 @@ void KeyEncoder::EncoderVarBinary::DecodeImp_avx2(uint32_t start_row, uint32_t n uint32_t varbinary_col_id, const KeyRowArray& rows, KeyColumnArray* col) { - EncodeDecodeHelper( + DecodeHelper( start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col, [](uint8_t* dst, const uint8_t* src, int64_t length) { for (uint32_t istripe = 0; istripe < (length + 31) / 32; ++istripe) { diff --git a/cpp/src/arrow/compute/exec/key_hash.cc b/cpp/src/arrow/compute/exec/key_hash.cc index dcc42ed913d..76c8ed1ef30 100644 --- a/cpp/src/arrow/compute/exec/key_hash.cc +++ b/cpp/src/arrow/compute/exec/key_hash.cc @@ -23,6 +23,7 @@ #include #include "arrow/compute/exec/util.h" +#include "arrow/util/ubsan.h" namespace arrow { namespace compute { @@ -55,27 +56,24 @@ inline uint32_t Hashing::combine_accumulators(const uint32_t acc1, const uint32_ return ROTL(acc1, 1) + ROTL(acc2, 7) + ROTL(acc3, 12) + ROTL(acc4, 18); } -inline void Hashing::helper_8B(uint32_t key_length, uint32_t num_keys, - const uint8_t* keys, uint32_t* hashes) { +template +inline void Hashing::helper_8B(uint32_t key_length, uint32_t num_keys, const T* keys, + uint32_t* hashes) { ARROW_DCHECK(key_length <= 8); - uint64_t mask = ~0ULL >> (8 * (8 - key_length)); constexpr uint64_t multiplier = 14029467366897019727ULL; - uint32_t offset = 0; for (uint32_t ikey = 0; ikey < num_keys; ++ikey) { - uint64_t x = *reinterpret_cast(keys + offset); - x &= mask; + uint64_t x = static_cast(keys[ikey]); hashes[ikey] = static_cast(BYTESWAP(x * multiplier)); - offset += key_length; } } inline void Hashing::helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys, uint32_t& acc1, uint32_t& acc2, uint32_t& acc3, uint32_t& acc4) { - uint64_t v1 = reinterpret_cast(keys + offset)[0]; + uint64_t v1 = util::SafeLoadAs(keys + offset); // We do not need to mask v1, because we will not process a stripe // unless at least 9 bytes of it are part of the key. - uint64_t v2 = reinterpret_cast(keys + offset)[1]; + uint64_t v2 = util::SafeLoadAs(keys + offset + 8); v2 &= mask_hi; uint32_t x1 = static_cast(v1); uint32_t x2 = static_cast(v1 >> 32); @@ -129,7 +127,7 @@ void Hashing::helper_stripes(int64_t hardware_flags, uint32_t num_keys, inline uint32_t Hashing::helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys, uint32_t acc) { - uint64_t v = reinterpret_cast(keys + offset)[0]; + uint64_t v = util::SafeLoadAs(keys + offset); v &= mask; uint32_t x1 = static_cast(v); uint32_t x2 = static_cast(v >> 32); @@ -163,8 +161,23 @@ void Hashing::hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t len const uint8_t* keys, uint32_t* hashes) { ARROW_DCHECK(length_key > 0); - if (length_key <= 8) { - helper_8B(length_key, num_keys, keys, hashes); + if (length_key <= 8 && ARROW_POPCOUNT64(length_key) == 1) { + switch (length_key) { + case 1: + helper_8B(length_key, num_keys, keys, hashes); + break; + case 2: + helper_8B(length_key, num_keys, reinterpret_cast(keys), hashes); + break; + case 4: + helper_8B(length_key, num_keys, reinterpret_cast(keys), hashes); + break; + case 8: + helper_8B(length_key, num_keys, reinterpret_cast(keys), hashes); + break; + default: + ARROW_DCHECK(false); + } return; } helper_stripes(hardware_flags, num_keys, length_key, keys, hashes); @@ -174,46 +187,6 @@ void Hashing::hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t len avalanche(hardware_flags, num_keys, hashes); } -void Hashing::hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc) { - for (uint32_t i = 0; i < length / 16; ++i) { - for (int j = 0; j < 4; ++j) { - uint32_t lane = reinterpret_cast(key)[i * 4 + j]; - acc[j] += (lane * PRIME32_2); - acc[j] = ROTL(acc[j], 13); - acc[j] *= PRIME32_1; - } - } - - int tail = length % 16; - if (tail) { - uint64_t last_stripe[2]; - const uint64_t* last_stripe_base = - reinterpret_cast(key + length - (length % 16)); - last_stripe[0] = last_stripe_base[0]; - uint64_t mask = ~0ULL >> (8 * ((length + 7) / 8 * 8 - length)); - if (tail <= 8) { - last_stripe[1] = 0; - last_stripe[0] &= mask; - } else { - last_stripe[1] = last_stripe_base[1]; - last_stripe[1] &= mask; - } - - // The stack allocation and memcpy here should be optimized out by the compiler. - // Using a reinterpret_cast causes a compiler warning on gcc and can lead to incorrect - // results. See https://issues.apache.org/jira/browse/ARROW-13600 for more info. - uint32_t lanes[4]; - memcpy(&lanes, &last_stripe, sizeof(last_stripe)); - - for (int j = 0; j < 4; ++j) { - uint32_t lane = lanes[j]; - acc[j] += (lane * PRIME32_2); - acc[j] = ROTL(acc[j], 13); - acc[j] *= PRIME32_1; - } - } -} - void Hashing::hash_varlen(int64_t hardware_flags, uint32_t num_rows, const uint32_t* offsets, const uint8_t* concatenated_keys, uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row @@ -221,24 +194,125 @@ void Hashing::hash_varlen(int64_t hardware_flags, uint32_t num_rows, #if defined(ARROW_HAVE_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { hash_varlen_avx2(num_rows, offsets, concatenated_keys, temp_buffer, hashes); - } else { + return; + } #endif - for (uint32_t i = 0; i < num_rows; ++i) { - uint32_t acc[4]; - acc[0] = static_cast( - (static_cast(PRIME32_1) + static_cast(PRIME32_2)) & - 0xffffffff); - acc[1] = PRIME32_2; - acc[2] = 0; - acc[3] = static_cast(-static_cast(PRIME32_1)); - uint32_t length = offsets[i + 1] - offsets[i]; - hash_varlen_helper(length, concatenated_keys + offsets[i], acc); - hashes[i] = combine_accumulators(acc[0], acc[1], acc[2], acc[3]); + static const uint64_t masks[9] = {0, + 0xffULL, + 0xffffULL, + 0xffffffULL, + 0xffffffffULL, + 0xffffffffffULL, + 0xffffffffffffULL, + 0xffffffffffffffULL, + ~0ULL}; + + for (uint32_t i = 0; i < num_rows; ++i) { + uint32_t offset = offsets[i]; + uint32_t key_length = offsets[i + 1] - offsets[i]; + const uint32_t num_stripes = key_length / 16; + + uint32_t acc1, acc2, acc3, acc4; + acc1 = static_cast( + (static_cast(PRIME32_1) + static_cast(PRIME32_2)) & + 0xffffffff); + acc2 = PRIME32_2; + acc3 = 0; + acc4 = static_cast(-static_cast(PRIME32_1)); + + for (uint32_t stripe = 0; stripe < num_stripes; ++stripe) { + helper_stripe(offset, ~0ULL, concatenated_keys, acc1, acc2, acc3, acc4); + offset += 16; } - avalanche(hardware_flags, num_rows, hashes); + uint32_t key_length_remaining = key_length - num_stripes * 16; + if (key_length_remaining > 8) { + helper_stripe(offset, masks[key_length_remaining - 8], concatenated_keys, acc1, + acc2, acc3, acc4); + hashes[i] = combine_accumulators(acc1, acc2, acc3, acc4); + } else if (key_length > 0) { + uint32_t acc_combined = combine_accumulators(acc1, acc2, acc3, acc4); + hashes[i] = helper_tail(offset, masks[key_length_remaining], concatenated_keys, + acc_combined); + } else { + hashes[i] = combine_accumulators(acc1, acc2, acc3, acc4); + } + } + avalanche(hardware_flags, num_rows, hashes); +} + +// From: +// https://www.boost.org/doc/libs/1_37_0/doc/html/hash/reference.html#boost.hash_combine +// template +// inline void hash_combine(std::size_t& seed, const T& v) +//{ +// std::hash hasher; +// seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2); +//} +void Hashing::HashCombine(KeyEncoder::KeyEncoderContext* ctx, uint32_t num_rows, + uint32_t* accumulated_hash, const uint32_t* next_column_hash) { + uint32_t num_processed = 0; #if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2()) { + num_processed = HashCombine_avx2(num_rows, accumulated_hash, next_column_hash); } #endif + for (uint32_t i = num_processed; i < num_rows; ++i) { + uint32_t acc = accumulated_hash[i]; + uint32_t next = next_column_hash[i]; + next += 0x9e3779b9 + (acc << 6) + (acc >> 2); + acc ^= next; + accumulated_hash[i] = acc; + } +} + +void Hashing::HashMultiColumn(const std::vector& cols, + KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_hash) { + uint32_t num_rows = static_cast(cols[0].length()); + + auto hash_temp_buf = util::TempVectorHolder(ctx->stack, num_rows); + auto hash_null_index_buf = util::TempVectorHolder(ctx->stack, num_rows); + auto byte_temp_buf = util::TempVectorHolder(ctx->stack, num_rows); + auto varbin_temp_buf = util::TempVectorHolder(ctx->stack, 4 * num_rows); + + bool is_first = true; + + for (size_t icol = 0; icol < cols.size(); ++icol) { + if (cols[icol].metadata().is_fixed_length) { + uint32_t col_width = cols[icol].metadata().fixed_length; + if (col_width == 0) { + util::BitUtil::bits_to_bytes(ctx->hardware_flags, num_rows, cols[icol].data(1), + byte_temp_buf.mutable_data(), + cols[icol].bit_offset(1)); + } + Hashing::hash_fixed( + ctx->hardware_flags, num_rows, col_width == 0 ? 1 : col_width, + col_width == 0 ? byte_temp_buf.mutable_data() : cols[icol].data(1), + is_first ? out_hash : hash_temp_buf.mutable_data()); + } else { + Hashing::hash_varlen( + ctx->hardware_flags, num_rows, cols[icol].offsets(), cols[icol].data(2), + varbin_temp_buf.mutable_data(), // Needs to hold 4 x 32-bit per row + is_first ? out_hash : hash_temp_buf.mutable_data()); + } + + // Zero hash for nulls + if (cols[icol].data(0)) { + uint32_t* dst_hash = is_first ? out_hash : hash_temp_buf.mutable_data(); + int num_nulls; + util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows, cols[icol].data(0), + &num_nulls, hash_null_index_buf.mutable_data(), + cols[icol].bit_offset(0)); + for (int i = 0; i < num_nulls; ++i) { + uint16_t row_id = hash_null_index_buf.mutable_data()[i]; + dst_hash[row_id] = 0; + } + } + + if (!is_first) { + HashCombine(ctx, num_rows, out_hash, hash_temp_buf.mutable_data()); + } + is_first = false; + } } } // namespace compute diff --git a/cpp/src/arrow/compute/exec/key_hash.h b/cpp/src/arrow/compute/exec/key_hash.h index 7f8ab5185cc..a0ed42cf86d 100644 --- a/cpp/src/arrow/compute/exec/key_hash.h +++ b/cpp/src/arrow/compute/exec/key_hash.h @@ -23,6 +23,7 @@ #include +#include "arrow/compute/exec/key_encode.h" #include "arrow/compute/exec/util.h" namespace arrow { @@ -41,6 +42,9 @@ class Hashing { uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row uint32_t* hashes); + static void HashMultiColumn(const std::vector& cols, + KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_hash); + private: static const uint32_t PRIME32_1 = 0x9E3779B1; // 0b10011110001101110111100110110001 static const uint32_t PRIME32_2 = 0x85EBCA77; // 0b10000101111010111100101001110111 @@ -48,6 +52,14 @@ class Hashing { static const uint32_t PRIME32_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111 static const uint32_t PRIME32_5 = 0x165667B1; // 0b00010110010101100110011110110001 + static void HashCombine(KeyEncoder::KeyEncoderContext* ctx, uint32_t num_rows, + uint32_t* accumulated_hash, const uint32_t* next_column_hash); + +#if defined(ARROW_HAVE_AVX2) + static uint32_t HashCombine_avx2(uint32_t num_rows, uint32_t* accumulated_hash, + const uint32_t* next_column_hash); +#endif + // Avalanche static inline uint32_t avalanche_helper(uint32_t acc); #if defined(ARROW_HAVE_AVX2) @@ -63,8 +75,9 @@ class Hashing { #endif // Helpers - static inline void helper_8B(uint32_t key_length, uint32_t num_keys, - const uint8_t* keys, uint32_t* hashes); + template + static inline void helper_8B(uint32_t key_length, uint32_t num_keys, const T* keys, + uint32_t* hashes); static inline void helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys, uint32_t& acc1, uint32_t& acc2, uint32_t& acc3, uint32_t& acc4); @@ -81,7 +94,6 @@ class Hashing { static void helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length, const uint8_t* keys, uint32_t* hash); - static void hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc); #if defined(ARROW_HAVE_AVX2) static void hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets, const uint8_t* concatenated_keys, diff --git a/cpp/src/arrow/compute/exec/key_hash_avx2.cc b/cpp/src/arrow/compute/exec/key_hash_avx2.cc index b58db015088..3804afe106d 100644 --- a/cpp/src/arrow/compute/exec/key_hash_avx2.cc +++ b/cpp/src/arrow/compute/exec/key_hash_avx2.cc @@ -165,19 +165,21 @@ void Hashing::hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets, __m128i acc = acc_init; - uint32_t i; - for (i = 0; i < (length - 1) / 16; ++i) { + if (length) { + uint32_t i; + for (i = 0; i < (length - 1) / 16; ++i) { + __m128i key_stripe = _mm_loadu_si128(reinterpret_cast(base) + i); + acc = _mm_add_epi32(acc, _mm_mullo_epi32(key_stripe, _mm_set1_epi32(PRIME32_2))); + acc = _mm_or_si128(_mm_slli_epi32(acc, 13), _mm_srli_epi32(acc, 32 - 13)); + acc = _mm_mullo_epi32(acc, _mm_set1_epi32(PRIME32_1)); + } __m128i key_stripe = _mm_loadu_si128(reinterpret_cast(base) + i); + __m128i mask = _mm_cmpgt_epi8(_mm_set1_epi8(((length - 1) % 16) + 1), sequence); + key_stripe = _mm_and_si128(key_stripe, mask); acc = _mm_add_epi32(acc, _mm_mullo_epi32(key_stripe, _mm_set1_epi32(PRIME32_2))); acc = _mm_or_si128(_mm_slli_epi32(acc, 13), _mm_srli_epi32(acc, 32 - 13)); acc = _mm_mullo_epi32(acc, _mm_set1_epi32(PRIME32_1)); } - __m128i key_stripe = _mm_loadu_si128(reinterpret_cast(base) + i); - __m128i mask = _mm_cmpgt_epi8(_mm_set1_epi8(((length - 1) % 16) + 1), sequence); - key_stripe = _mm_and_si128(key_stripe, mask); - acc = _mm_add_epi32(acc, _mm_mullo_epi32(key_stripe, _mm_set1_epi32(PRIME32_2))); - acc = _mm_or_si128(_mm_slli_epi32(acc, 13), _mm_srli_epi32(acc, 32 - 13)); - acc = _mm_mullo_epi32(acc, _mm_set1_epi32(PRIME32_1)); _mm_storeu_si128(reinterpret_cast<__m128i*>(temp_buffer) + ikey, acc); } @@ -242,6 +244,24 @@ void Hashing::hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets, } } +uint32_t Hashing::HashCombine_avx2(uint32_t num_rows, uint32_t* accumulated_hash, + const uint32_t* next_column_hash) { + constexpr uint32_t unroll = 8; + for (uint32_t i = 0; i < num_rows / unroll; ++i) { + __m256i acc = + _mm256_loadu_si256(reinterpret_cast(accumulated_hash) + i); + __m256i next = + _mm256_loadu_si256(reinterpret_cast(next_column_hash) + i); + next = _mm256_add_epi32(next, _mm256_set1_epi32(0x9e3779b9)); + next = _mm256_add_epi32(next, _mm256_slli_epi32(acc, 6)); + next = _mm256_add_epi32(next, _mm256_srli_epi32(acc, 2)); + acc = _mm256_xor_si256(acc, next); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(accumulated_hash) + i, acc); + } + uint32_t num_processed = num_rows / unroll * unroll; + return num_processed; +} + #endif } // namespace compute diff --git a/cpp/src/arrow/compute/exec/key_map.cc b/cpp/src/arrow/compute/exec/key_map.cc index ac47c04403c..eaa2ae3e39f 100644 --- a/cpp/src/arrow/compute/exec/key_map.cc +++ b/cpp/src/arrow/compute/exec/key_map.cc @@ -100,7 +100,7 @@ inline void SwissTable::search_block(uint64_t block, int stamp, int start_slot, // zero, which is the default value stored in empty slots. // inline uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot, - uint64_t group_id_mask) { + uint64_t group_id_mask) const { // Input slot can be equal to 8, in which case we need to output any valid group id // value, so we take the one from slot 0 in the block. int clamped_slot = slot & 7; diff --git a/cpp/src/arrow/compute/exec/key_map.h b/cpp/src/arrow/compute/exec/key_map.h index 8c472736ec4..7ee28b82898 100644 --- a/cpp/src/arrow/compute/exec/key_map.h +++ b/cpp/src/arrow/compute/exec/key_map.h @@ -77,7 +77,7 @@ class SwissTable { /// Group ids are bit packed using that maximum to determine the necessary number of /// bits. inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot, - uint64_t group_id_mask); + uint64_t group_id_mask) const; inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found); diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc index aad6dc3d587..e2fe61a63c6 100644 --- a/cpp/src/arrow/compute/exec/util.cc +++ b/cpp/src/arrow/compute/exec/util.cc @@ -53,7 +53,8 @@ inline void BitUtil::bits_filter_indexes_helper(uint64_t word, template void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, const uint8_t* bits, const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes) { + int* num_indexes, uint16_t* indexes, + uint16_t base_index) { // 64 bits at a time constexpr int unroll = 64; int tail = num_bits % unroll; @@ -63,7 +64,8 @@ void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bit bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, num_indexes, indexes); } else { - bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes); + bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes, + base_index); } } else { #endif @@ -76,7 +78,7 @@ void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bit if (filter_input_indexes) { bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes); } else { - bits_to_indexes_helper(word, i * 64, num_indexes, indexes); + bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes); } } #if defined(ARROW_HAVE_AVX2) @@ -94,41 +96,43 @@ void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bit bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes, indexes); } else { - bits_to_indexes_helper(word, num_bits - tail, num_indexes, indexes); + bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes); } } } -void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags, - const int num_bits, const uint8_t* bits, int* num_indexes, - uint16_t* indexes, int bit_offset) { +void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits, + const uint8_t* bits, int* num_indexes, uint16_t* indexes, + int bit_offset) { bits += bit_offset / 8; bit_offset %= 8; + *num_indexes = 0; + uint16_t base_index = 0; if (bit_offset != 0) { - int num_indexes_head = 0; uint64_t bits_head = util::SafeLoad(reinterpret_cast(bits)) >> bit_offset; int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte, - reinterpret_cast(&bits_head), &num_indexes_head, - indexes); - int num_indexes_tail = 0; - if (num_bits > bits_in_first_byte) { - bits_to_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte, - bits + 1, &num_indexes_tail, indexes + num_indexes_head); + reinterpret_cast(&bits_head), num_indexes, indexes); + if (num_bits <= bits_in_first_byte) { + return; } - *num_indexes = num_indexes_head + num_indexes_tail; - return; + num_bits -= bits_in_first_byte; + indexes += *num_indexes; + bits += 1; + base_index = bits_in_first_byte; } + int num_indexes_new = 0; if (bit_to_search == 0) { bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr, - num_indexes, indexes); + &num_indexes_new, indexes, base_index); } else { ARROW_DCHECK(bit_to_search == 1); bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr, - num_indexes, indexes); + &num_indexes_new, indexes, base_index); } + *num_indexes += num_indexes_new; } void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags, diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h index 8bd6a3c5d62..f5c55afe0f5 100644 --- a/cpp/src/arrow/compute/exec/util.h +++ b/cpp/src/arrow/compute/exec/util.h @@ -43,6 +43,11 @@ namespace arrow { namespace util { +template +inline void CheckAlignment(const void* ptr) { + ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); +} + // Some platforms typedef int64_t as long int instead of long long int, // which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics // which need long long. @@ -159,18 +164,20 @@ class BitUtil { template static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, const uint8_t* bits, const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes); + int* num_indexes, uint16_t* indexes, + uint16_t base_index = 0); #if defined(ARROW_HAVE_AVX2) static void bits_to_indexes_avx2(int bit_to_search, const int num_bits, const uint8_t* bits, int* num_indexes, - uint16_t* indexes); + uint16_t* indexes, uint16_t base_index = 0); static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, const uint8_t* bits, const uint16_t* input_indexes, int* num_indexes, uint16_t* indexes); template static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, - int* num_indexes, uint16_t* indexes); + int* num_indexes, uint16_t* indexes, + uint16_t base_index = 0); template static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, const uint16_t* input_indexes, diff --git a/cpp/src/arrow/compute/exec/util_avx2.cc b/cpp/src/arrow/compute/exec/util_avx2.cc index 8cf0104db46..bdc0e41f576 100644 --- a/cpp/src/arrow/compute/exec/util_avx2.cc +++ b/cpp/src/arrow/compute/exec/util_avx2.cc @@ -27,18 +27,19 @@ namespace util { void BitUtil::bits_to_indexes_avx2(int bit_to_search, const int num_bits, const uint8_t* bits, int* num_indexes, - uint16_t* indexes) { + uint16_t* indexes, uint16_t base_index) { if (bit_to_search == 0) { - bits_to_indexes_imp_avx2<0>(num_bits, bits, num_indexes, indexes); + bits_to_indexes_imp_avx2<0>(num_bits, bits, num_indexes, indexes, base_index); } else { ARROW_DCHECK(bit_to_search == 1); - bits_to_indexes_imp_avx2<1>(num_bits, bits, num_indexes, indexes); + bits_to_indexes_imp_avx2<1>(num_bits, bits, num_indexes, indexes, base_index); } } template void BitUtil::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, - int* num_indexes, uint16_t* indexes) { + int* num_indexes, uint16_t* indexes, + uint16_t base_index) { // 64 bits at a time constexpr int unroll = 64; @@ -74,7 +75,7 @@ void BitUtil::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, for (int j = 0; j < (num_indexes_loop + 15) / 16; ++j) { __m256i output = _mm256_cvtepi8_epi16( _mm_loadu_si128(reinterpret_cast(byte_indexes) + j)); - output = _mm256_add_epi16(output, _mm256_set1_epi16(i * 64)); + output = _mm256_add_epi16(output, _mm256_set1_epi16(i * 64 + base_index)); _mm256_storeu_si256(((__m256i*)(indexes + *num_indexes)) + j, output); } *num_indexes += num_indexes_loop; @@ -203,6 +204,9 @@ bool BitUtil::are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) __m256i x = _mm256_loadu_si256(reinterpret_cast(bytes) + i); result_or = _mm256_or_si256(result_or, x); } + result_or = _mm256_cmpeq_epi8(result_or, _mm256_set1_epi8(0)); + result_or = + _mm256_andnot_si256(result_or, _mm256_set1_epi8(static_cast(0xff))); uint32_t result_or32 = _mm256_movemask_epi8(result_or); if (num_bytes % 32 > 0) { uint64_t tail[4] = {0, 0, 0, 0}; diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 4fd6af9b190..8235900d492 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -516,14 +516,16 @@ struct GrouperFastImpl : Grouper { int num_keys_to_compare, const uint16_t* selection_may_be_null, const uint32_t* group_ids, uint32_t* out_num_keys_mismatch, uint16_t* out_selection_mismatch) { - arrow::compute::KeyCompare::CompareRows( + arrow::compute::KeyCompare::CompareColumnsToRows( num_keys_to_compare, selection_may_be_null, group_ids, &impl_ptr->encode_ctx_, - out_num_keys_mismatch, out_selection_mismatch, impl_ptr->rows_minibatch_, - impl_ptr->rows_); + out_num_keys_mismatch, out_selection_mismatch, + impl_ptr->encoder_.GetBatchColumns(), impl_ptr->rows_); }; auto append_func = [impl_ptr](int num_keys, const uint16_t* selection) { + RETURN_NOT_OK(impl_ptr->encoder_.EncodeSelected(&impl_ptr->rows_minibatch_, + num_keys, selection)); return impl_ptr->rows_.AppendSelectionFrom(impl_ptr->rows_minibatch_, num_keys, - selection); + nullptr); }; RETURN_NOT_OK(impl->map_.init(impl->encode_ctx_.hardware_flags, ctx->memory_pool(), impl->encode_ctx_.stack, impl->log_minibatch_max_, @@ -590,22 +592,11 @@ struct GrouperFastImpl : Grouper { // Encode rows_minibatch_.Clean(); - RETURN_NOT_OK(encoder_.PrepareOutputForEncode(start_row, batch_size_next, - &rows_minibatch_, cols_)); - encoder_.Encode(start_row, batch_size_next, &rows_minibatch_, cols_); + encoder_.PrepareEncodeSelected(start_row, batch_size_next, cols_); // Compute hash - if (encoder_.row_metadata().is_fixed_length) { - Hashing::hash_fixed(encode_ctx_.hardware_flags, batch_size_next, - encoder_.row_metadata().fixed_length, rows_minibatch_.data(1), - minibatch_hashes_.data()); - } else { - auto hash_temp_buf = - util::TempVectorHolder(&temp_stack_, 4 * batch_size_next); - Hashing::hash_varlen(encode_ctx_.hardware_flags, batch_size_next, - rows_minibatch_.offsets(), rows_minibatch_.data(2), - hash_temp_buf.mutable_data(), minibatch_hashes_.data()); - } + Hashing::HashMultiColumn(encoder_.GetBatchColumns(), &encode_ctx_, + minibatch_hashes_.data()); // Map RETURN_NOT_OK( diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index c69b51e71fc..a4f55efe301 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -1293,6 +1293,9 @@ TEST(GroupBy, SumOnlyStringAndDictKeys) { { {"hash_sum", nullptr}, })); + if (key_type->Equals(utf8())) { + SortBy({"key_0"}, &aggregated_and_grouped); + } AssertDatumsEqual(ArrayFromJSON(struct_({ field("hash_sum", float64()), From 5e1272d7cd3d61866511c0863402be25ba7928d8 Mon Sep 17 00:00:00 2001 From: michalursa Date: Thu, 12 Aug 2021 22:13:15 -0700 Subject: [PATCH 2/6] Column-at-a-time comparison and hashing - refactoring of binary comparison --- cpp/src/arrow/compute/exec/key_compare.cc | 4 +- .../arrow/compute/exec/key_compare_avx2.cc | 313 ++++++++++-------- 2 files changed, 186 insertions(+), 131 deletions(-) diff --git a/cpp/src/arrow/compute/exec/key_compare.cc b/cpp/src/arrow/compute/exec/key_compare.cc index 2881940cf9c..55b0e5e998b 100644 --- a/cpp/src/arrow/compute/exec/key_compare.cc +++ b/cpp/src/arrow/compute/exec/key_compare.cc @@ -200,7 +200,7 @@ void KeyCompare::CompareBinaryColumnToRow( // Non-zero length guarantees no underflow int32_t num_loops_less_one = - BitUtil::CeilDiv(static_cast(length), 8) - 1; + static_cast(BitUtil::CeilDiv(length, 8)) - 1; uint64_t tail_mask = ~0ULL >> (64 - 8 * (length - num_loops_less_one * 8)); @@ -271,7 +271,7 @@ void KeyCompare::CompareVarBinaryColumnToRow( if (length > 0) { int32_t j; // length can be zero - for (j = 0; j < BitUtil::CeilDiv(static_cast(length), 8) - 1; ++j) { + for (j = 0; j < static_cast(BitUtil::CeilDiv(length, 8)) - 1; ++j) { uint64_t key_left = util::SafeLoad(key_left_ptr + j); uint64_t key_right = key_right_ptr[j]; result_or |= key_left ^ key_right; diff --git a/cpp/src/arrow/compute/exec/key_compare_avx2.cc b/cpp/src/arrow/compute/exec/key_compare_avx2.cc index fba6d2c6ac9..df13e8cae3c 100644 --- a/cpp/src/arrow/compute/exec/key_compare_avx2.cc +++ b/cpp/src/arrow/compute/exec/key_compare_avx2.cc @@ -248,6 +248,173 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( } } +template +inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* right_base, + __m256i irow_left, __m256i offset_right, + int bit_offset = 0) { + __m256i left; + switch (column_width) { + case 0: + irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(bit_offset)); + left = _mm256_i32gather_epi32((const int*)left_base, + _mm256_srli_epi32(irow_left, 3), 1); + left = _mm256_and_si256( + _mm256_set1_epi32(1), + _mm256_srlv_epi32(left, _mm256_and_si256(irow_left, _mm256_set1_epi32(7)))); + left = _mm256_mullo_epi32(left, _mm256_set1_epi32(0xff)); + break; + case 1: + left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 1); + left = _mm256_and_si256(left, _mm256_set1_epi32(0xff)); + break; + case 2: + left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 2); + left = _mm256_and_si256(left, _mm256_set1_epi32(0xff)); + break; + case 4: + left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 4); + break; + default: + ARROW_DCHECK(false); + } + + __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); + if (column_width != sizeof(uint32_t)) { + constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; + right = _mm256_and_si256(right, _mm256_set1_epi32(mask)); + } + + __m256i cmp = _mm256_cmpeq_epi32(left, right); + + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); + uint32_t result_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); + + return result_lo | (static_cast(result_hi) << 32); +} + +template +inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_first, __m256i offset_right, + int bit_offset = 0) { + __m256i left; + switch (column_width) { + case 0: { + __m256i bits = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + uint32_t start_bit_index = irow_left_first + bit_offset; + uint8_t left_bits_8 = + (reinterpret_cast(left_base + start_bit_index / 8)[0] >> + (start_bit_index % 8)) & + 0xff; + left = + _mm256_cmpeq_epi32(_mm256_and_si256(bits, _mm256_set1_epi8(left_bits_8)), bits); + left = _mm256_and_si256(left, _mm256_set1_epi32(0xff)); + } break; + case 1: + left = _mm256_cvtepu8_epi32(_mm_set1_epi64x( + reinterpret_cast(left_base)[irow_left_first / 8])); + break; + case 2: + left = _mm256_cvtepu16_epi32(_mm_loadu_si128( + reinterpret_cast(left_base) + irow_left_first / 8)); + break; + case 4: + left = _mm256_loadu_si256(reinterpret_cast(left_base) + + irow_left_first / 8); + break; + default: + ARROW_DCHECK(false); + } + + __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); + if (column_width != sizeof(uint32_t)) { + constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; + right = _mm256_and_si256(right, _mm256_set1_epi32(mask)); + } + + __m256i cmp = _mm256_cmpeq_epi32(left, right); + + uint32_t result_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); + uint32_t result_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); + + return result_lo | (static_cast(result_hi) << 32); +} + +template +inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* right_base, + __m256i irow_left, uint32_t irow_left_first, + __m256i offset_right) { + auto left_base_i64 = + reinterpret_cast(left_base); + __m256i left_lo = + _mm256_i32gather_epi64(left_base_i64, _mm256_castsi256_si128(irow_left), 8); + __m256i left_hi = + _mm256_i32gather_epi64(left_base_i64, _mm256_extracti128_si256(irow_left, 1), 8); + if (use_selection) { + left_lo = _mm256_i32gather_epi64(left_base_i64, _mm256_castsi256_si128(irow_left), 8); + left_hi = + _mm256_i32gather_epi64(left_base_i64, _mm256_extracti128_si256(irow_left, 1), 8); + } else { + left_lo = _mm256_loadu_si256(reinterpret_cast(left_base) + + irow_left_first / 4); + left_hi = _mm256_loadu_si256(reinterpret_cast(left_base) + + irow_left_first / 4 + 1); + } + auto right_base_i64 = + reinterpret_cast(right_base); + __m256i right_lo = + _mm256_i32gather_epi64(right_base_i64, _mm256_castsi256_si128(offset_right), 1); + __m256i right_hi = _mm256_i32gather_epi64(right_base_i64, + _mm256_extracti128_si256(offset_right, 1), 1); + uint32_t result_lo = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_lo, right_lo)); + uint32_t result_hi = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_hi, right_hi)); + return result_lo | (static_cast(result_hi) << 32); +} + +template +inline uint64_t Compare8_Binary_avx2(uint32_t length, const uint8_t* left_base, + const uint8_t* right_base, __m256i irow_left, + uint32_t irow_left_first, __m256i offset_right) { + uint32_t irow_left_array[8]; + uint32_t offset_right_array[8]; + if (use_selection) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(irow_left_array), irow_left); + } + _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right); + + // Non-zero length guarantees no underflow + int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; + + __m256i tail_mask = set_first_n_bytes_avx2(length - num_loops_less_one * 32); + + uint64_t result = 0; + for (uint32_t irow = 0; irow < 8; ++irow) { + const __m256i* key_left_ptr = reinterpret_cast( + left_base + + (use_selection ? irow_left_array[irow] : irow_left_first + irow) * length); + const __m256i* key_right_ptr = + reinterpret_cast(right_base + offset_right_array[irow]); + __m256i result_or = _mm256_setzero_si256(); + int32_t i; + // length cannot be zero + for (i = 0; i < num_loops_less_one; ++i) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + } + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256( + result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + uint64_t result_single = _mm256_testz_si256(result_or, result_or) * 0xff; + result |= result_single << (8 * irow); + } + return result; +} + template uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( uint32_t offset_within_row, uint32_t num_rows_to_compare, @@ -262,35 +429,13 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( ctx, col, rows, match_bytevector, [bit_offset](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - __m256i left; if (use_selection) { - irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(bit_offset)); - left = _mm256_i32gather_epi32((const int*)left_base, - _mm256_srli_epi32(irow_left, 3), 1); - left = _mm256_and_si256( - _mm256_set1_epi32(1), - _mm256_srlv_epi32(left, - _mm256_and_si256(irow_left, _mm256_set1_epi32(7)))); - left = _mm256_mullo_epi32(left, _mm256_set1_epi32(0xff)); + return CompareSelected8_avx2<0>(left_base, right_base, irow_left, + offset_right, bit_offset); } else { - __m256i bits = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); - uint32_t start_bit_index = irow_left_base + bit_offset; - uint8_t left_bits_8 = - (reinterpret_cast(left_base + start_bit_index / 8)[0] >> - (start_bit_index % 8)) & - 0xff; - left = _mm256_cmpeq_epi32( - _mm256_and_si256(bits, _mm256_set1_epi8(left_bits_8)), bits); - left = _mm256_and_si256(left, _mm256_set1_epi32(0xff)); + return Compare8_avx2<0>(left_base, right_base, irow_left_base, offset_right, + bit_offset); } - __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); - right = _mm256_and_si256(right, _mm256_set1_epi32(0xff)); - __m256i cmp = _mm256_cmpeq_epi32(left, right); - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); - uint32_t result_hi = _mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); - return result_lo | (static_cast(result_hi) << 32); }); } else if (col_width == 1) { return CompareBinaryColumnToRowHelper_avx2( @@ -298,22 +443,12 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - __m256i left; if (use_selection) { - left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 1); - left = _mm256_and_si256(left, _mm256_set1_epi32(0xff)); + return CompareSelected8_avx2<1>(left_base, right_base, irow_left, + offset_right); } else { - left = _mm256_cvtepu8_epi32(_mm_set1_epi64x( - reinterpret_cast(left_base)[irow_left_base / 8])); + return Compare8_avx2<1>(left_base, right_base, irow_left_base, offset_right); } - __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); - right = _mm256_and_si256(right, _mm256_set1_epi32(0xff)); - __m256i cmp = _mm256_cmpeq_epi32(left, right); - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); - uint32_t result_hi = _mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); - return result_lo | (static_cast(result_hi) << 32); }); } else if (col_width == 2) { return CompareBinaryColumnToRowHelper_avx2( @@ -321,22 +456,12 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - __m256i left; if (use_selection) { - left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 2); - left = _mm256_and_si256(left, _mm256_set1_epi32(0xffff)); + return CompareSelected8_avx2<2>(left_base, right_base, irow_left, + offset_right); } else { - left = _mm256_cvtepu16_epi32(_mm_loadu_si128( - reinterpret_cast(left_base) + irow_left_base / 8)); + return Compare8_avx2<2>(left_base, right_base, irow_left_base, offset_right); } - __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); - right = _mm256_and_si256(right, _mm256_set1_epi32(0xffff)); - __m256i cmp = _mm256_cmpeq_epi32(left, right); - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); - uint32_t result_hi = _mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); - return result_lo | (static_cast(result_hi) << 32); }); } else if (col_width == 4) { return CompareBinaryColumnToRowHelper_avx2( @@ -344,20 +469,12 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - __m256i left; if (use_selection) { - left = _mm256_i32gather_epi32((const int*)left_base, irow_left, 4); + return CompareSelected8_avx2<4>(left_base, right_base, irow_left, + offset_right); } else { - left = _mm256_loadu_si256(reinterpret_cast(left_base) + - irow_left_base / 8); + return Compare8_avx2<4>(left_base, right_base, irow_left_base, offset_right); } - __m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1); - __m256i cmp = _mm256_cmpeq_epi32(left, right); - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); - uint32_t result_hi = _mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); - return result_lo | (static_cast(result_hi) << 32); }); } else if (col_width == 8) { return CompareBinaryColumnToRowHelper_avx2( @@ -365,34 +482,8 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - auto left_base_i64 = - reinterpret_cast(left_base); - __m256i left_lo = - _mm256_i32gather_epi64(left_base_i64, _mm256_castsi256_si128(irow_left), 8); - __m256i left_hi = _mm256_i32gather_epi64( - left_base_i64, _mm256_extracti128_si256(irow_left, 1), 8); - if (use_selection) { - left_lo = _mm256_i32gather_epi64(left_base_i64, - _mm256_castsi256_si128(irow_left), 8); - left_hi = _mm256_i32gather_epi64(left_base_i64, - _mm256_extracti128_si256(irow_left, 1), 8); - } else { - left_lo = _mm256_loadu_si256(reinterpret_cast(left_base) + - irow_left_base / 4); - left_hi = _mm256_loadu_si256(reinterpret_cast(left_base) + - irow_left_base / 4 + 1); - } - auto right_base_i64 = - reinterpret_cast(right_base); - __m256i right_lo = _mm256_i32gather_epi64( - right_base_i64, _mm256_castsi256_si128(offset_right), 1); - __m256i right_hi = _mm256_i32gather_epi64( - right_base_i64, _mm256_extracti128_si256(offset_right, 1), 1); - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_lo, right_lo)); - uint32_t result_hi = - _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_hi, right_hi)); - return result_lo | (static_cast(result_hi) << 32); + return Compare8_64bit_avx2(left_base, right_base, irow_left, + irow_left_base, offset_right); }); } else { return CompareBinaryColumnToRowHelper_avx2( @@ -400,45 +491,9 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( ctx, col, rows, match_bytevector, [&col](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - uint32_t irow_left_array[8]; - uint32_t offset_right_array[8]; - if (use_selection) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>(irow_left_array), irow_left); - } - _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), - offset_right); uint32_t length = col.metadata().fixed_length; - - // Non-zero length guarantees no underflow - int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; - - __m256i tail_mask = set_first_n_bytes_avx2(length - num_loops_less_one * 32); - - uint64_t result = 0; - for (uint32_t irow = 0; irow < 8; ++irow) { - const __m256i* key_left_ptr = reinterpret_cast( - left_base + - (use_selection ? irow_left_array[irow] : irow_left_base + irow) * length); - const __m256i* key_right_ptr = - reinterpret_cast(right_base + offset_right_array[irow]); - __m256i result_or = _mm256_setzero_si256(); - int32_t i; - // length cannot be zero - for (i = 0; i < num_loops_less_one; ++i) { - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = - _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); - } - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256( - result_or, - _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); - uint64_t result_single = _mm256_testz_si256(result_or, result_or) * 0xff; - result |= result_single << (8 * irow); - } - return result; + return Compare8_Binary_avx2( + length, left_base, right_base, irow_left, irow_left_base, offset_right); }); } } From 5981c5f092fc937ab61d3074d8bf1de34db71ab7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 18 Aug 2021 14:03:17 -0400 Subject: [PATCH 3/6] support dict columns in SortBy --- .../compute/kernels/hash_aggregate_test.cc | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index a4f55efe301..bcd4914384d 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -638,10 +638,24 @@ TEST(GroupBy, Errors) { namespace { void SortBy(std::vector names, Datum* aggregated_and_grouped) { - SortOptions options{{SortKey("key_0", SortOrder::Ascending)}}; + SortOptions options; + for (auto&& name : names) { + options.sort_keys.emplace_back(std::move(name), SortOrder::Ascending); + } ASSERT_OK_AND_ASSIGN( auto batch, RecordBatch::FromStructArray(aggregated_and_grouped->make_array())); + + // decode any dictionary columns: + ArrayVector cols = batch->columns(); + for (auto& col : cols) { + if (col->type_id() != Type::DICTIONARY) continue; + + auto dict_col = checked_cast(col.get()); + ASSERT_OK_AND_ASSIGN(col, Take(*dict_col->dictionary(), *dict_col->indices())); + } + batch = RecordBatch::Make(batch->schema(), batch->num_rows(), std::move(cols)); + ASSERT_OK_AND_ASSIGN(Datum sort_indices, SortIndices(batch, options)); ASSERT_OK_AND_ASSIGN(*aggregated_and_grouped, @@ -1293,9 +1307,7 @@ TEST(GroupBy, SumOnlyStringAndDictKeys) { { {"hash_sum", nullptr}, })); - if (key_type->Equals(utf8())) { - SortBy({"key_0"}, &aggregated_and_grouped); - } + SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsEqual(ArrayFromJSON(struct_({ field("hash_sum", float64()), From 4025a8202f49133054d0c35b37be1da19dec2377 Mon Sep 17 00:00:00 2001 From: michalursa Date: Tue, 3 Aug 2021 18:30:25 -0700 Subject: [PATCH 4/6] Column-at-a-time comparison and hashing for Group Identifier --- cpp/src/arrow/compute/exec/key_compare.cc | 4 +- .../arrow/compute/exec/key_compare_avx2.cc | 500 ++++++++++-------- 2 files changed, 269 insertions(+), 235 deletions(-) diff --git a/cpp/src/arrow/compute/exec/key_compare.cc b/cpp/src/arrow/compute/exec/key_compare.cc index 55b0e5e998b..2881940cf9c 100644 --- a/cpp/src/arrow/compute/exec/key_compare.cc +++ b/cpp/src/arrow/compute/exec/key_compare.cc @@ -200,7 +200,7 @@ void KeyCompare::CompareBinaryColumnToRow( // Non-zero length guarantees no underflow int32_t num_loops_less_one = - static_cast(BitUtil::CeilDiv(length, 8)) - 1; + BitUtil::CeilDiv(static_cast(length), 8) - 1; uint64_t tail_mask = ~0ULL >> (64 - 8 * (length - num_loops_less_one * 8)); @@ -271,7 +271,7 @@ void KeyCompare::CompareVarBinaryColumnToRow( if (length > 0) { int32_t j; // length can be zero - for (j = 0; j < static_cast(BitUtil::CeilDiv(length, 8)) - 1; ++j) { + for (j = 0; j < BitUtil::CeilDiv(static_cast(length), 8) - 1; ++j) { uint64_t key_left = util::SafeLoad(key_left_ptr + j); uint64_t key_right = key_right_ptr[j]; result_or |= key_left ^ key_right; diff --git a/cpp/src/arrow/compute/exec/key_compare_avx2.cc b/cpp/src/arrow/compute/exec/key_compare_avx2.cc index df13e8cae3c..3866ad2d751 100644 --- a/cpp/src/arrow/compute/exec/key_compare_avx2.cc +++ b/cpp/src/arrow/compute/exec/key_compare_avx2.cc @@ -385,249 +385,283 @@ inline uint64_t Compare8_Binary_avx2(uint32_t length, const uint8_t* left_base, } _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right); - // Non-zero length guarantees no underflow - int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; - - __m256i tail_mask = set_first_n_bytes_avx2(length - num_loops_less_one * 32); - - uint64_t result = 0; - for (uint32_t irow = 0; irow < 8; ++irow) { - const __m256i* key_left_ptr = reinterpret_cast( - left_base + - (use_selection ? irow_left_array[irow] : irow_left_first + irow) * length); - const __m256i* key_right_ptr = - reinterpret_cast(right_base + offset_right_array[irow]); - __m256i result_or = _mm256_setzero_si256(); - int32_t i; - // length cannot be zero - for (i = 0; i < num_loops_less_one; ++i) { - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); - } - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256( - result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); - uint64_t result_single = _mm256_testz_si256(result_or, result_or) * 0xff; - result |= result_single << (8 * irow); - } - return result; -} + template + uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector, + COMPARE8_FN compare8_fn) { + bool is_fixed_length = rows.metadata().is_fixed_length; + if (is_fixed_length) { + uint32_t fixed_length = rows.metadata().fixed_length; + const uint8_t* rows_left = col.data(1); + const uint8_t* rows_right = rows.data(1); + constexpr uint32_t unroll = 8; + __m256i irow_left = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { + if (use_selection) { + irow_left = _mm256_cvtepu16_epi32( + _mm_loadu_si128(reinterpret_cast(sel_left_maybe_null) + i)); + } + __m256i irow_right; + if (use_selection) { + irow_right = + _mm256_i32gather_epi32((const int*)left_to_right_map, irow_left, 4); + } else { + irow_right = + _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); + } + + __m256i tail_mask = set_first_n_bytes_avx2(length - num_loops_less_one * 32); + + uint64_t result = 0; + for (uint32_t irow = 0; irow < 8; ++irow) { + const __m256i* key_left_ptr = reinterpret_cast( + left_base + + (use_selection ? irow_left_array[irow] : irow_left_first + irow) * length); + const __m256i* key_right_ptr = + reinterpret_cast(right_base + offset_right_array[irow]); + __m256i result_or = _mm256_setzero_si256(); + int32_t i; + // length cannot be zero + for (i = 0; i < num_loops_less_one; ++i) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + } + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256( + result_or, + _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + uint64_t result_single = _mm256_testz_si256(result_or, result_or) * 0xff; + result |= result_single << (8 * irow); + } + return result; + } -template -uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( - uint32_t offset_within_row, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { - uint32_t col_width = col.metadata().fixed_length; - if (col_width == 0) { - int bit_offset = col.bit_offset(1); - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector, - [bit_offset](const uint8_t* left_base, const uint8_t* right_base, + template + uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + uint32_t col_width = col.metadata().fixed_length; + if (col_width == 0) { + int bit_offset = col.bit_offset(1); + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [bit_offset](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_base, __m256i irow_left, + __m256i offset_right) { + if (use_selection) { + return CompareSelected8_avx2<0>(left_base, right_base, irow_left, + offset_right, bit_offset); + } else { + return Compare8_avx2<0>(left_base, right_base, irow_left_base, + offset_right, bit_offset); + } + }); + } else if (col_width == 1) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + if (use_selection) { + return CompareSelected8_avx2<1>(left_base, right_base, irow_left, + offset_right); + } else { + return Compare8_avx2<1>(left_base, right_base, irow_left_base, + offset_right); + } + }); + } else if (col_width == 2) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + if (use_selection) { + return CompareSelected8_avx2<2>(left_base, right_base, irow_left, + offset_right); + } else { + return Compare8_avx2<2>(left_base, right_base, irow_left_base, + offset_right); + } + }); + } else if (col_width == 4) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + if (use_selection) { + return CompareSelected8_avx2<4>(left_base, right_base, irow_left, + offset_right); + } else { + return Compare8_avx2<4>(left_base, right_base, irow_left_base, + offset_right); + } + }); + } else if (col_width == 8) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + return Compare8_64bit_avx2( + left_base, right_base, irow_left, irow_left_base, offset_right); + }); + } else { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector, + [&col](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - if (use_selection) { - return CompareSelected8_avx2<0>(left_base, right_base, irow_left, - offset_right, bit_offset); - } else { - return Compare8_avx2<0>(left_base, right_base, irow_left_base, offset_right, - bit_offset); - } - }); - } else if (col_width == 1) { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector, - [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { - if (use_selection) { - return CompareSelected8_avx2<1>(left_base, right_base, irow_left, - offset_right); - } else { - return Compare8_avx2<1>(left_base, right_base, irow_left_base, offset_right); - } - }); - } else if (col_width == 2) { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector, - [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { - if (use_selection) { - return CompareSelected8_avx2<2>(left_base, right_base, irow_left, - offset_right); + uint32_t length = col.metadata().fixed_length; + return Compare8_Binary_avx2(length, left_base, right_base, + irow_left, irow_left_base, + offset_right); + }); + } + } + + // Overwrites the match_bytevector instead of updating it + template + void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( + uint32_t id_varbinary_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + const uint32_t* offsets_left = col.offsets(); + const uint32_t* offsets_right = rows.offsets(); + const uint8_t* rows_left = col.data(2); + const uint8_t* rows_right = rows.data(2); + for (uint32_t i = 0; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; + uint32_t irow_right = left_to_right_map[irow_left]; + uint32_t begin_left = offsets_left[irow_left]; + uint32_t length_left = offsets_left[irow_left + 1] - begin_left; + uint32_t begin_right = offsets_right[irow_right]; + uint32_t length_right; + uint32_t offset_within_row; + if (!is_first_varbinary_col) { + rows.metadata().nth_varbinary_offset_and_length( + rows_right + begin_right, id_varbinary_col, &offset_within_row, + &length_right); } else { - return Compare8_avx2<2>(left_base, right_base, irow_left_base, offset_right); + rows.metadata().first_varbinary_offset_and_length( + rows_right + begin_right, &offset_within_row, &length_right); } - }); - } else if (col_width == 4) { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector, - [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { - if (use_selection) { - return CompareSelected8_avx2<4>(left_base, right_base, irow_left, - offset_right); - } else { - return Compare8_avx2<4>(left_base, right_base, irow_left_base, offset_right); + begin_right += offset_within_row; + + __m256i result_or = _mm256_setzero_si256(); + uint32_t length = std::min(length_left, length_right); + if (length > 0) { + const __m256i* key_left_ptr = + reinterpret_cast(rows_left + begin_left); + const __m256i* key_right_ptr = + reinterpret_cast(rows_right + begin_right); + int32_t j; + // length can be zero + for (j = 0; j < (static_cast(length) + 31) / 32 - 1; ++j) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); + result_or = + _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + } + + __m256i tail_mask = set_first_n_bytes_avx2(length - j * 32); + + __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); + result_or = _mm256_or_si256( + result_or, + _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); } - }); - } else if (col_width == 8) { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector, - [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { - return Compare8_64bit_avx2(left_base, right_base, irow_left, - irow_left_base, offset_right); - }); - } else { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector, - [&col](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - uint32_t length = col.metadata().fixed_length; - return Compare8_Binary_avx2( - length, left_base, right_base, irow_left, irow_left_base, offset_right); - }); - } -} - -// Overwrites the match_bytevector instead of updating it -template -void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( - uint32_t id_varbinary_col, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { - const uint32_t* offsets_left = col.offsets(); - const uint32_t* offsets_right = rows.offsets(); - const uint8_t* rows_left = col.data(2); - const uint8_t* rows_right = rows.data(2); - for (uint32_t i = 0; i < num_rows_to_compare; ++i) { - uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t begin_left = offsets_left[irow_left]; - uint32_t length_left = offsets_left[irow_left + 1] - begin_left; - uint32_t begin_right = offsets_right[irow_right]; - uint32_t length_right; - uint32_t offset_within_row; - if (!is_first_varbinary_col) { - rows.metadata().nth_varbinary_offset_and_length( - rows_right + begin_right, id_varbinary_col, &offset_within_row, &length_right); - } else { - rows.metadata().first_varbinary_offset_and_length( - rows_right + begin_right, &offset_within_row, &length_right); - } - begin_right += offset_within_row; - - __m256i result_or = _mm256_setzero_si256(); - uint32_t length = std::min(length_left, length_right); - if (length > 0) { - const __m256i* key_left_ptr = - reinterpret_cast(rows_left + begin_left); - const __m256i* key_right_ptr = - reinterpret_cast(rows_right + begin_right); - int32_t j; - // length can be zero - for (j = 0; j < (static_cast(length) + 31) / 32 - 1; ++j) { - __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); - result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + int result = _mm256_testz_si256(result_or, result_or) * 0xff; + result *= (length_left == length_right ? 1 : 0); + match_bytevector[i] = result; + } } - __m256i tail_mask = set_first_n_bytes_avx2(length - j * 32); - - __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); - result_or = _mm256_or_si256( - result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); - } - int result = _mm256_testz_si256(result_or, result_or) * 0xff; - result *= (length_left == length_right ? 1 : 0); - match_bytevector[i] = result; - } -} - -uint32_t KeyCompare::AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A, - const uint8_t* bytevector_B) { - constexpr int unroll = 32; - for (uint32_t i = 0; i < num_elements / unroll; ++i) { - __m256i result = _mm256_and_si256( - _mm256_loadu_si256(reinterpret_cast(bytevector_A) + i), - _mm256_loadu_si256(reinterpret_cast(bytevector_B) + i)); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(bytevector_A) + i, result); - } - return (num_elements - (num_elements % unroll)); -} + uint32_t KeyCompare::AndByteVectors_avx2( + uint32_t num_elements, uint8_t * bytevector_A, const uint8_t* bytevector_B) { + constexpr int unroll = 32; + for (uint32_t i = 0; i < num_elements / unroll; ++i) { + __m256i result = _mm256_and_si256( + _mm256_loadu_si256(reinterpret_cast(bytevector_A) + i), + _mm256_loadu_si256(reinterpret_cast(bytevector_B) + i)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(bytevector_A) + i, result); + } + return (num_elements - (num_elements % unroll)); + } -uint32_t KeyCompare::NullUpdateColumnToRow_avx2( - bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { - if (use_selection) { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); - } else { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); - } -} + uint32_t KeyCompare::NullUpdateColumnToRow_avx2( + bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + if (use_selection) { + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } else { + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } + } -uint32_t KeyCompare::CompareBinaryColumnToRow_avx2( - bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { - if (use_selection) { - return CompareBinaryColumnToRowImp_avx2(offset_within_row, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); - } else { - return CompareBinaryColumnToRowImp_avx2(offset_within_row, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); - } -} + uint32_t KeyCompare::CompareBinaryColumnToRow_avx2( + bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + if (use_selection) { + return CompareBinaryColumnToRowImp_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector); + } else { + return CompareBinaryColumnToRowImp_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector); + } + } -void KeyCompare::CompareVarBinaryColumnToRow_avx2( - bool use_selection, bool is_first_varbinary_col, uint32_t id_varlen_col, - uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, - const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, - uint8_t* match_bytevector) { - if (use_selection) { - if (is_first_varbinary_col) { - CompareVarBinaryColumnToRowImp_avx2( - id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, - col, rows, match_bytevector); - } else { - CompareVarBinaryColumnToRowImp_avx2( - id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, - col, rows, match_bytevector); - } - } else { - if (is_first_varbinary_col) { - CompareVarBinaryColumnToRowImp_avx2( - id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, - col, rows, match_bytevector); - } else { - CompareVarBinaryColumnToRowImp_avx2( - id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, - col, rows, match_bytevector); - } - } -} + void KeyCompare::CompareVarBinaryColumnToRow_avx2( + bool use_selection, bool is_first_varbinary_col, uint32_t id_varlen_col, + uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector) { + if (use_selection) { + if (is_first_varbinary_col) { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector); + } else { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector); + } + } else { + if (is_first_varbinary_col) { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector); + } else { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, match_bytevector); + } + } + } #endif -} // namespace compute -} // namespace arrow + } // namespace compute + } // namespace arrow From aee5b11438cc94cb3239416dcdd8f2fa8b901afd Mon Sep 17 00:00:00 2001 From: michalursa Date: Thu, 12 Aug 2021 22:13:15 -0700 Subject: [PATCH 5/6] Column-at-a-time comparison and hashing - refactoring of binary comparison --- cpp/src/arrow/compute/exec/key_compare.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/exec/key_compare.cc b/cpp/src/arrow/compute/exec/key_compare.cc index 2881940cf9c..55b0e5e998b 100644 --- a/cpp/src/arrow/compute/exec/key_compare.cc +++ b/cpp/src/arrow/compute/exec/key_compare.cc @@ -200,7 +200,7 @@ void KeyCompare::CompareBinaryColumnToRow( // Non-zero length guarantees no underflow int32_t num_loops_less_one = - BitUtil::CeilDiv(static_cast(length), 8) - 1; + static_cast(BitUtil::CeilDiv(length, 8)) - 1; uint64_t tail_mask = ~0ULL >> (64 - 8 * (length - num_loops_less_one * 8)); @@ -271,7 +271,7 @@ void KeyCompare::CompareVarBinaryColumnToRow( if (length > 0) { int32_t j; // length can be zero - for (j = 0; j < BitUtil::CeilDiv(static_cast(length), 8) - 1; ++j) { + for (j = 0; j < static_cast(BitUtil::CeilDiv(length, 8)) - 1; ++j) { uint64_t key_left = util::SafeLoad(key_left_ptr + j); uint64_t key_right = key_right_ptr[j]; result_or |= key_left ^ key_right; From 5294f98a57f070ea3b2c82c4b797231fb72fd776 Mon Sep 17 00:00:00 2001 From: michalursa Date: Wed, 18 Aug 2021 13:52:36 -0700 Subject: [PATCH 6/6] Column-at-a-time comparison and hashing - fixing merge problems --- .../arrow/compute/exec/key_compare_avx2.cc | 500 ++++++++---------- 1 file changed, 233 insertions(+), 267 deletions(-) diff --git a/cpp/src/arrow/compute/exec/key_compare_avx2.cc b/cpp/src/arrow/compute/exec/key_compare_avx2.cc index 3866ad2d751..df13e8cae3c 100644 --- a/cpp/src/arrow/compute/exec/key_compare_avx2.cc +++ b/cpp/src/arrow/compute/exec/key_compare_avx2.cc @@ -385,283 +385,249 @@ inline uint64_t Compare8_Binary_avx2(uint32_t length, const uint8_t* left_base, } _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right); - template - uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( - uint32_t offset_within_row, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector, - COMPARE8_FN compare8_fn) { - bool is_fixed_length = rows.metadata().is_fixed_length; - if (is_fixed_length) { - uint32_t fixed_length = rows.metadata().fixed_length; - const uint8_t* rows_left = col.data(1); - const uint8_t* rows_right = rows.data(1); - constexpr uint32_t unroll = 8; - __m256i irow_left = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { - if (use_selection) { - irow_left = _mm256_cvtepu16_epi32( - _mm_loadu_si128(reinterpret_cast(sel_left_maybe_null) + i)); - } - __m256i irow_right; - if (use_selection) { - irow_right = - _mm256_i32gather_epi32((const int*)left_to_right_map, irow_left, 4); - } else { - irow_right = - _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); - } - - __m256i tail_mask = set_first_n_bytes_avx2(length - num_loops_less_one * 32); - - uint64_t result = 0; - for (uint32_t irow = 0; irow < 8; ++irow) { - const __m256i* key_left_ptr = reinterpret_cast( - left_base + - (use_selection ? irow_left_array[irow] : irow_left_first + irow) * length); - const __m256i* key_right_ptr = - reinterpret_cast(right_base + offset_right_array[irow]); - __m256i result_or = _mm256_setzero_si256(); - int32_t i; - // length cannot be zero - for (i = 0; i < num_loops_less_one; ++i) { - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); - } - __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); - result_or = _mm256_or_si256( - result_or, - _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); - uint64_t result_single = _mm256_testz_si256(result_or, result_or) * 0xff; - result |= result_single << (8 * irow); - } - return result; - } + // Non-zero length guarantees no underflow + int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; + + __m256i tail_mask = set_first_n_bytes_avx2(length - num_loops_less_one * 32); + + uint64_t result = 0; + for (uint32_t irow = 0; irow < 8; ++irow) { + const __m256i* key_left_ptr = reinterpret_cast( + left_base + + (use_selection ? irow_left_array[irow] : irow_left_first + irow) * length); + const __m256i* key_right_ptr = + reinterpret_cast(right_base + offset_right_array[irow]); + __m256i result_or = _mm256_setzero_si256(); + int32_t i; + // length cannot be zero + for (i = 0; i < num_loops_less_one; ++i) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + } + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256( + result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + uint64_t result_single = _mm256_testz_si256(result_or, result_or) * 0xff; + result |= result_single << (8 * irow); + } + return result; +} - template - uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( - uint32_t offset_within_row, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { - uint32_t col_width = col.metadata().fixed_length; - if (col_width == 0) { - int bit_offset = col.bit_offset(1); - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector, - [bit_offset](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, - __m256i offset_right) { - if (use_selection) { - return CompareSelected8_avx2<0>(left_base, right_base, irow_left, - offset_right, bit_offset); - } else { - return Compare8_avx2<0>(left_base, right_base, irow_left_base, - offset_right, bit_offset); - } - }); - } else if (col_width == 1) { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector, - [](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - if (use_selection) { - return CompareSelected8_avx2<1>(left_base, right_base, irow_left, - offset_right); - } else { - return Compare8_avx2<1>(left_base, right_base, irow_left_base, - offset_right); - } - }); - } else if (col_width == 2) { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector, - [](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - if (use_selection) { - return CompareSelected8_avx2<2>(left_base, right_base, irow_left, - offset_right); - } else { - return Compare8_avx2<2>(left_base, right_base, irow_left_base, - offset_right); - } - }); - } else if (col_width == 4) { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector, - [](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - if (use_selection) { - return CompareSelected8_avx2<4>(left_base, right_base, irow_left, - offset_right); - } else { - return Compare8_avx2<4>(left_base, right_base, irow_left_base, - offset_right); - } - }); - } else if (col_width == 8) { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector, - [](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - return Compare8_64bit_avx2( - left_base, right_base, irow_left, irow_left_base, offset_right); - }); - } else { - return CompareBinaryColumnToRowHelper_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector, - [&col](const uint8_t* left_base, const uint8_t* right_base, +template +uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( + uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + uint32_t col_width = col.metadata().fixed_length; + if (col_width == 0) { + int bit_offset = col.bit_offset(1); + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [bit_offset](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { - uint32_t length = col.metadata().fixed_length; - return Compare8_Binary_avx2(length, left_base, right_base, - irow_left, irow_left_base, - offset_right); - }); - } - } - - // Overwrites the match_bytevector instead of updating it - template - void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( - uint32_t id_varbinary_col, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { - const uint32_t* offsets_left = col.offsets(); - const uint32_t* offsets_right = rows.offsets(); - const uint8_t* rows_left = col.data(2); - const uint8_t* rows_right = rows.data(2); - for (uint32_t i = 0; i < num_rows_to_compare; ++i) { - uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t begin_left = offsets_left[irow_left]; - uint32_t length_left = offsets_left[irow_left + 1] - begin_left; - uint32_t begin_right = offsets_right[irow_right]; - uint32_t length_right; - uint32_t offset_within_row; - if (!is_first_varbinary_col) { - rows.metadata().nth_varbinary_offset_and_length( - rows_right + begin_right, id_varbinary_col, &offset_within_row, - &length_right); + if (use_selection) { + return CompareSelected8_avx2<0>(left_base, right_base, irow_left, + offset_right, bit_offset); } else { - rows.metadata().first_varbinary_offset_and_length( - rows_right + begin_right, &offset_within_row, &length_right); + return Compare8_avx2<0>(left_base, right_base, irow_left_base, offset_right, + bit_offset); } - begin_right += offset_within_row; - - __m256i result_or = _mm256_setzero_si256(); - uint32_t length = std::min(length_left, length_right); - if (length > 0) { - const __m256i* key_left_ptr = - reinterpret_cast(rows_left + begin_left); - const __m256i* key_right_ptr = - reinterpret_cast(rows_right + begin_right); - int32_t j; - // length can be zero - for (j = 0; j < (static_cast(length) + 31) / 32 - 1; ++j) { - __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); - result_or = - _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); - } - - __m256i tail_mask = set_first_n_bytes_avx2(length - j * 32); - - __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); - __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); - result_or = _mm256_or_si256( - result_or, - _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + }); + } else if (col_width == 1) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, + __m256i irow_left, __m256i offset_right) { + if (use_selection) { + return CompareSelected8_avx2<1>(left_base, right_base, irow_left, + offset_right); + } else { + return Compare8_avx2<1>(left_base, right_base, irow_left_base, offset_right); } - int result = _mm256_testz_si256(result_or, result_or) * 0xff; - result *= (length_left == length_right ? 1 : 0); - match_bytevector[i] = result; - } - } - - uint32_t KeyCompare::AndByteVectors_avx2( - uint32_t num_elements, uint8_t * bytevector_A, const uint8_t* bytevector_B) { - constexpr int unroll = 32; - for (uint32_t i = 0; i < num_elements / unroll; ++i) { - __m256i result = _mm256_and_si256( - _mm256_loadu_si256(reinterpret_cast(bytevector_A) + i), - _mm256_loadu_si256(reinterpret_cast(bytevector_B) + i)); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(bytevector_A) + i, result); - } - return (num_elements - (num_elements % unroll)); - } - - uint32_t KeyCompare::NullUpdateColumnToRow_avx2( - bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { - if (use_selection) { - return NullUpdateColumnToRowImp_avx2( - id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, - col, rows, match_bytevector); - } else { - return NullUpdateColumnToRowImp_avx2( - id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, - col, rows, match_bytevector); - } - } - - uint32_t KeyCompare::CompareBinaryColumnToRow_avx2( - bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, - KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, - const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { - if (use_selection) { - return CompareBinaryColumnToRowImp_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector); - } else { - return CompareBinaryColumnToRowImp_avx2( - offset_within_row, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector); - } - } - - void KeyCompare::CompareVarBinaryColumnToRow_avx2( - bool use_selection, bool is_first_varbinary_col, uint32_t id_varlen_col, - uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, - const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, - uint8_t* match_bytevector) { - if (use_selection) { - if (is_first_varbinary_col) { - CompareVarBinaryColumnToRowImp_avx2( - id_varlen_col, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector); + }); + } else if (col_width == 2) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, + __m256i irow_left, __m256i offset_right) { + if (use_selection) { + return CompareSelected8_avx2<2>(left_base, right_base, irow_left, + offset_right); } else { - CompareVarBinaryColumnToRowImp_avx2( - id_varlen_col, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector); + return Compare8_avx2<2>(left_base, right_base, irow_left_base, offset_right); } - } else { - if (is_first_varbinary_col) { - CompareVarBinaryColumnToRowImp_avx2( - id_varlen_col, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector); + }); + } else if (col_width == 4) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, + __m256i irow_left, __m256i offset_right) { + if (use_selection) { + return CompareSelected8_avx2<4>(left_base, right_base, irow_left, + offset_right); } else { - CompareVarBinaryColumnToRowImp_avx2( - id_varlen_col, num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, match_bytevector); + return Compare8_avx2<4>(left_base, right_base, irow_left_base, offset_right); } - } + }); + } else if (col_width == 8) { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, + __m256i irow_left, __m256i offset_right) { + return Compare8_64bit_avx2(left_base, right_base, irow_left, + irow_left_base, offset_right); + }); + } else { + return CompareBinaryColumnToRowHelper_avx2( + offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector, + [&col](const uint8_t* left_base, const uint8_t* right_base, + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + uint32_t length = col.metadata().fixed_length; + return Compare8_Binary_avx2( + length, left_base, right_base, irow_left, irow_left_base, offset_right); + }); + } +} + +// Overwrites the match_bytevector instead of updating it +template +void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( + uint32_t id_varbinary_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + const uint32_t* offsets_left = col.offsets(); + const uint32_t* offsets_right = rows.offsets(); + const uint8_t* rows_left = col.data(2); + const uint8_t* rows_right = rows.data(2); + for (uint32_t i = 0; i < num_rows_to_compare; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; + uint32_t irow_right = left_to_right_map[irow_left]; + uint32_t begin_left = offsets_left[irow_left]; + uint32_t length_left = offsets_left[irow_left + 1] - begin_left; + uint32_t begin_right = offsets_right[irow_right]; + uint32_t length_right; + uint32_t offset_within_row; + if (!is_first_varbinary_col) { + rows.metadata().nth_varbinary_offset_and_length( + rows_right + begin_right, id_varbinary_col, &offset_within_row, &length_right); + } else { + rows.metadata().first_varbinary_offset_and_length( + rows_right + begin_right, &offset_within_row, &length_right); + } + begin_right += offset_within_row; + + __m256i result_or = _mm256_setzero_si256(); + uint32_t length = std::min(length_left, length_right); + if (length > 0) { + const __m256i* key_left_ptr = + reinterpret_cast(rows_left + begin_left); + const __m256i* key_right_ptr = + reinterpret_cast(rows_right + begin_right); + int32_t j; + // length can be zero + for (j = 0; j < (static_cast(length) + 31) / 32 - 1; ++j) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); + result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); } + __m256i tail_mask = set_first_n_bytes_avx2(length - j * 32); + + __m256i key_left = _mm256_loadu_si256(key_left_ptr + j); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + j); + result_or = _mm256_or_si256( + result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + } + int result = _mm256_testz_si256(result_or, result_or) * 0xff; + result *= (length_left == length_right ? 1 : 0); + match_bytevector[i] = result; + } +} + +uint32_t KeyCompare::AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A, + const uint8_t* bytevector_B) { + constexpr int unroll = 32; + for (uint32_t i = 0; i < num_elements / unroll; ++i) { + __m256i result = _mm256_and_si256( + _mm256_loadu_si256(reinterpret_cast(bytevector_A) + i), + _mm256_loadu_si256(reinterpret_cast(bytevector_B) + i)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(bytevector_A) + i, result); + } + return (num_elements - (num_elements % unroll)); +} + +uint32_t KeyCompare::NullUpdateColumnToRow_avx2( + bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + if (use_selection) { + return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); + } else { + return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); + } +} + +uint32_t KeyCompare::CompareBinaryColumnToRow_avx2( + bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, const KeyEncoder::KeyColumnArray& col, + const KeyEncoder::KeyRowArray& rows, uint8_t* match_bytevector) { + if (use_selection) { + return CompareBinaryColumnToRowImp_avx2(offset_within_row, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); + } else { + return CompareBinaryColumnToRowImp_avx2(offset_within_row, num_rows_to_compare, + sel_left_maybe_null, left_to_right_map, + ctx, col, rows, match_bytevector); + } +} + +void KeyCompare::CompareVarBinaryColumnToRow_avx2( + bool use_selection, bool is_first_varbinary_col, uint32_t id_varlen_col, + uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, KeyEncoder::KeyEncoderContext* ctx, + const KeyEncoder::KeyColumnArray& col, const KeyEncoder::KeyRowArray& rows, + uint8_t* match_bytevector) { + if (use_selection) { + if (is_first_varbinary_col) { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } else { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } + } else { + if (is_first_varbinary_col) { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } else { + CompareVarBinaryColumnToRowImp_avx2( + id_varlen_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, + col, rows, match_bytevector); + } + } +} + #endif - } // namespace compute - } // namespace arrow +} // namespace compute +} // namespace arrow