From 937ff8b55a56f5d88ef4d49ea804777279c295e4 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Tue, 23 Jul 2024 19:58:31 +0800 Subject: [PATCH 01/34] Widen the offset of row table to 64b --- cpp/src/arrow/acero/swiss_join.cc | 11 ++-- cpp/src/arrow/compute/row/compare_internal.cc | 21 +++--- cpp/src/arrow/compute/row/encode_internal.cc | 39 +++++------ cpp/src/arrow/compute/row/encode_internal.h | 7 +- cpp/src/arrow/compute/row/row_internal.cc | 31 ++++----- cpp/src/arrow/compute/row/row_internal.h | 35 +++++++--- cpp/src/arrow/compute/row/row_test.cc | 64 ++++++------------- 7 files changed, 95 insertions(+), 113 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 732deb72861..e5b609879b3 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -122,7 +122,7 @@ void RowArrayAccessor::Visit(const RowTableImpl& rows, int column_id, int num_ro if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); uint32_t field_offset_within_row, field_length; if (varbinary_column_id == 0) { @@ -173,7 +173,7 @@ void RowArrayAccessor::Visit(const RowTableImpl& rows, int column_id, int num_ro // Case 4: This is a fixed length column in a varying length row // const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; @@ -565,8 +565,8 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& int64_t first_target_row_offset, const int64_t* source_rows_permutation) { int64_t num_source_rows = source.length(); - uint32_t* target_offsets = target->mutable_offsets(); - const uint32_t* source_offsets = source.offsets(); + RowTableImpl::offset_type* target_offsets = target->mutable_offsets(); + const RowTableImpl::offset_type* source_offsets = source.offsets(); // Permutation of source rows is optional. // @@ -593,7 +593,8 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& int64_t source_row_id = source_rows_permutation[i]; const uint64_t* source_row_ptr = reinterpret_cast( source.data(2) + source_offsets[source_row_id]); - uint32_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; + int64_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; + DCHECK_LE(length, std::numeric_limits::max()); // Rows should be 64-bit aligned. // In that case we can copy them using a sequence of 64-bit read/writes. diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 98aea901126..7cb2612a2c4 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -110,12 +110,13 @@ void KeyCompare::CompareBinaryColumnToRowHelper( } } else { const uint8_t* rows_left = col.data(1); - const uint32_t* offsets_right = rows.offsets(); + const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_right = rows.data(2); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t offset_right = offsets_right[irow_right] + offset_within_row; + RowTableImpl::offset_type offset_right = + offsets_right[irow_right] + offset_within_row; match_bytevector[i] = compare_fn(rows_left, rows_right, irow_left, offset_right); } } @@ -145,7 +146,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [bit_offset](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left, uint32_t offset_right) { + uint32_t irow_left, RowTableImpl::offset_type offset_right) { uint8_t left = bit_util::GetBit(left_base, irow_left + bit_offset) ? 0xff : 0x00; uint8_t right = right_base[offset_right]; @@ -156,7 +157,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { uint8_t left = left_base[irow_left]; uint8_t right = right_base[offset_right]; return left == right ? 0xff : 0; @@ -166,7 +167,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { util::CheckAlignment(left_base); util::CheckAlignment(right_base + offset_right); uint16_t left = reinterpret_cast(left_base)[irow_left]; @@ -178,7 +179,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { util::CheckAlignment(left_base); util::CheckAlignment(right_base + offset_right); uint32_t left = reinterpret_cast(left_base)[irow_left]; @@ -190,7 +191,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { util::CheckAlignment(left_base); util::CheckAlignment(right_base + offset_right); uint64_t left = reinterpret_cast(left_base)[irow_left]; @@ -202,7 +203,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [&col](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { uint32_t length = col.metadata().fixed_length; // Non-zero length guarantees no underflow @@ -241,7 +242,7 @@ void KeyCompare::CompareVarBinaryColumnToRowHelper( const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, uint8_t* match_bytevector) { const uint32_t* offsets_left = col.offsets(); - const uint32_t* offsets_right = rows.offsets(); + const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_left = col.data(2); const uint8_t* rows_right = rows.data(2); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { @@ -249,7 +250,7 @@ void KeyCompare::CompareVarBinaryColumnToRowHelper( uint32_t irow_right = left_to_right_map[irow_left]; uint32_t begin_left = offsets_left[irow_left]; uint32_t length_left = offsets_left[irow_left + 1] - begin_left; - uint32_t begin_right = offsets_right[irow_right]; + RowTableImpl::offset_type begin_right = offsets_right[irow_right]; uint32_t length_right; uint32_t offset_within_row; if (!is_first_varbinary_col) { diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 658e0dffcac..a02ee312d03 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -296,7 +296,7 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, DCHECK(false); } } else { - const uint32_t* row_offsets = rows.offsets() + start_row; + const RowTableImpl::offset_type* row_offsets = rows.offsets() + start_row; const uint8_t* row_base = rows.data(2); row_base += offset_within_row; uint8_t* col_base = col_prep.mutable_data(1); @@ -362,14 +362,14 @@ void EncoderBinary::EncodeSelectedImp(uint32_t offset_within_row, RowTableImpl* } else { const uint8_t* src_base = col.data(1); uint8_t* dst = rows->mutable_data(2) + offset_within_row; - const uint32_t* offsets = rows->offsets(); + const RowTableImpl::offset_type* offsets = rows->offsets(); for (uint32_t i = 0; i < num_selected; ++i) { copy_fn(dst + offsets[i], src_base, selection[i]); } if (col.data(0)) { const uint8_t* non_null_bits = col.data(0); uint8_t* dst = rows->mutable_data(2) + offset_within_row; - const uint32_t* offsets = rows->offsets(); + const RowTableImpl::offset_type* offsets = rows->offsets(); for (uint32_t i = 0; i < num_selected; ++i) { bool is_null = !bit_util::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); if (is_null) { @@ -585,7 +585,7 @@ void EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, uint8_t* dst_B = col2->mutable_data(1); uint32_t fixed_length = rows.metadata().fixed_length; - const uint32_t* offsets; + const RowTableImpl::offset_type* offsets; const uint8_t* src_base; if (is_row_fixed_length) { src_base = rows.data(1) + fixed_length * start_row + offset_within_row; @@ -640,7 +640,7 @@ void EncoderOffsets::Decode(uint32_t start_row, uint32_t num_rows, // The Nth element is the sum of all the lengths of varbinary columns data in // that row, up to and including Nth varbinary column. - const uint32_t* row_offsets = rows.offsets() + start_row; + const RowTableImpl::offset_type* row_offsets = rows.offsets() + start_row; // Set the base offset for each column for (size_t col = 0; col < varbinary_cols->size(); ++col) { @@ -658,8 +658,8 @@ void EncoderOffsets::Decode(uint32_t start_row, uint32_t num_rows, // Update the offset of each column uint32_t offset_within_row = rows.metadata().fixed_length; for (size_t col = 0; col < varbinary_cols->size(); ++col) { - offset_within_row += - RowTableMetadata::padding_for_alignment(offset_within_row, string_alignment); + offset_within_row += RowTableMetadata::padding_for_alignment_within_row( + offset_within_row, string_alignment); uint32_t length = varbinary_ends[col] - offset_within_row; offset_within_row = varbinary_ends[col]; uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets(); @@ -676,7 +676,7 @@ Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, return Status::OK(); } - uint32_t* row_offsets = rows->mutable_offsets(); + RowTableImpl::offset_type* row_offsets = rows->mutable_offsets(); for (uint32_t i = 0; i < num_selected; ++i) { row_offsets[i] = rows->metadata().fixed_length; } @@ -688,7 +688,7 @@ Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, for (uint32_t i = 0; i < num_selected; ++i) { uint32_t irow = selection[i]; uint32_t length = col_offsets[irow + 1] - col_offsets[irow]; - row_offsets[i] += RowTableMetadata::padding_for_alignment( + row_offsets[i] += RowTableMetadata::padding_for_alignment_row( row_offsets[i], rows->metadata().string_alignment); row_offsets[i] += length; } @@ -708,20 +708,13 @@ Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, } } - uint32_t sum = 0; + int64_t sum = 0; int row_alignment = rows->metadata().row_alignment; for (uint32_t i = 0; i < num_selected; ++i) { - uint32_t length = row_offsets[i]; - length += RowTableMetadata::padding_for_alignment(length, row_alignment); + RowTableImpl::offset_type length = row_offsets[i]; + length += RowTableMetadata::padding_for_alignment_row(length, row_alignment); row_offsets[i] = sum; - uint32_t sum_maybe_overflow = 0; - if (ARROW_PREDICT_FALSE( - arrow::internal::AddWithOverflow(sum, length, &sum_maybe_overflow))) { - return Status::Invalid( - "Offset overflow detected in EncoderOffsets::GetRowOffsetsSelected for row ", i, - " of length ", length, " bytes, current length in total is ", sum, " bytes"); - } - sum = sum_maybe_overflow; + sum += length; } row_offsets[num_selected] = sum; @@ -732,7 +725,7 @@ template void EncoderOffsets::EncodeSelectedImp(uint32_t ivarbinary, RowTableImpl* rows, const std::vector& cols, uint32_t num_selected, const uint16_t* selection) { - const uint32_t* row_offsets = rows->offsets(); + const RowTableImpl::offset_type* row_offsets = rows->offsets(); uint8_t* row_base = rows->mutable_data(2) + rows->metadata().varbinary_end_array_offset + ivarbinary * sizeof(uint32_t); @@ -753,7 +746,7 @@ void EncoderOffsets::EncodeSelectedImp(uint32_t ivarbinary, RowTableImpl* rows, row[0] = rows->metadata().fixed_length + length; } else { row[0] = row[-1] + - RowTableMetadata::padding_for_alignment( + RowTableMetadata::padding_for_alignment_within_row( row[-1], rows->metadata().string_alignment) + length; } @@ -857,7 +850,7 @@ void EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, const RowTableI void EncoderVarBinary::EncodeSelected(uint32_t ivarbinary, RowTableImpl* rows, const KeyColumnArray& cols, uint32_t num_selected, const uint16_t* selection) { - const uint32_t* row_offsets = rows->offsets(); + const RowTableImpl::offset_type* row_offsets = rows->offsets(); uint8_t* row_base = rows->mutable_data(2); const uint32_t* col_offsets = cols.offsets(); const uint8_t* col_base = cols.data(2); diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 0618ddd8e4b..37538fcc4b8 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -173,7 +173,7 @@ class EncoderBinary { copy_fn(dst, src, col_width); } } else { - const uint32_t* row_offsets = rows_const->offsets(); + const RowTableImpl::offset_type* row_offsets = rows_const->offsets(); for (uint32_t i = 0; i < num_rows; ++i) { const uint8_t* src; uint8_t* dst; @@ -267,7 +267,8 @@ class EncoderVarBinary { ARROW_DCHECK(!rows_const->metadata().is_fixed_length && !col_const->metadata().is_fixed_length); - const uint32_t* row_offsets_for_batch = rows_const->offsets() + start_row; + const RowTableImpl::offset_type* row_offsets_for_batch = + rows_const->offsets() + start_row; const uint32_t* col_offsets = col_const->offsets(); uint32_t col_offset_next = col_offsets[0]; @@ -275,7 +276,7 @@ class EncoderVarBinary { uint32_t col_offset = col_offset_next; col_offset_next = col_offsets[i + 1]; - uint32_t row_offset = row_offsets_for_batch[i]; + RowTableImpl::offset_type row_offset = row_offsets_for_batch[i]; const uint8_t* row = rows_const->data(2) + row_offset; uint32_t offset_within_row; diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 746ed950ffa..3789491dc95 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -128,8 +128,8 @@ void RowTableMetadata::FromColumnMetadataVector( const KeyColumnMetadata& col = cols[column_order[i]]; if (col.is_fixed_length && col.fixed_length != 0 && ARROW_POPCOUNT64(col.fixed_length) != 1) { - offset_within_row += RowTableMetadata::padding_for_alignment(offset_within_row, - string_alignment, col); + offset_within_row += RowTableMetadata::padding_for_alignment_within_row( + offset_within_row, string_alignment, col); } column_offsets[i] = offset_within_row; if (!col.is_fixed_length) { @@ -155,7 +155,7 @@ void RowTableMetadata::FromColumnMetadataVector( is_fixed_length = (num_varbinary_cols == 0); fixed_length = offset_within_row + - RowTableMetadata::padding_for_alignment( + RowTableMetadata::padding_for_alignment_within_row( offset_within_row, num_varbinary_cols == 0 ? row_alignment : string_alignment); // We set the number of bytes per row storing null masks of individual key columns @@ -235,7 +235,7 @@ int64_t RowTableImpl::size_null_masks(int64_t num_rows) const { } int64_t RowTableImpl::size_offsets(int64_t num_rows) const { - return (num_rows + 1) * sizeof(uint32_t) + kPaddingForVectors; + return (num_rows + 1) * sizeof(offset_type) + kPaddingForVectors; } int64_t RowTableImpl::size_rows_fixed_length(int64_t num_rows) const { @@ -326,23 +326,15 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, if (!metadata_.is_fixed_length) { // Varying-length rows - auto from_offsets = reinterpret_cast(from.offsets_->data()); - auto to_offsets = reinterpret_cast(offsets_->mutable_data()); - uint32_t total_length = to_offsets[num_rows_]; - uint32_t total_length_to_append = 0; + auto from_offsets = reinterpret_cast(from.offsets_->data()); + auto to_offsets = reinterpret_cast(offsets_->mutable_data()); + offset_type total_length = to_offsets[num_rows_]; + int64_t total_length_to_append = 0; for (uint32_t i = 0; i < num_rows_to_append; ++i) { uint16_t row_id = source_row_ids ? source_row_ids[i] : i; - uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; + int64_t length = from_offsets[row_id + 1] - from_offsets[row_id]; total_length_to_append += length; - uint32_t to_offset_maybe_overflow = 0; - if (ARROW_PREDICT_FALSE(arrow::internal::AddWithOverflow( - total_length, total_length_to_append, &to_offset_maybe_overflow))) { - return Status::Invalid( - "Offset overflow detected in RowTableImpl::AppendSelectionFrom for row ", - num_rows_ + i, " of length ", length, " bytes, current length in total is ", - to_offsets[num_rows_ + i], " bytes"); - } - to_offsets[num_rows_ + i + 1] = to_offset_maybe_overflow; + to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append; } RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append)); @@ -351,7 +343,8 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, uint8_t* dst = rows_->mutable_data() + total_length; for (uint32_t i = 0; i < num_rows_to_append; ++i) { uint16_t row_id = source_row_ids ? source_row_ids[i] : i; - uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; + int64_t length = from_offsets[row_id + 1] - from_offsets[row_id]; + DCHECK_LE(length, std::numeric_limits::max()); auto src64 = reinterpret_cast(src + from_offsets[row_id]); auto dst64 = reinterpret_cast(dst); for (uint32_t j = 0; j < bit_util::CeilDiv(length, 8); ++j) { diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 93818fb14d6..4e1d1f664e0 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -30,6 +30,8 @@ namespace compute { /// Description of the data stored in a RowTable struct ARROW_EXPORT RowTableMetadata { + using offset_type = int64_t; + /// \brief True if there are no variable length columns in the table bool is_fixed_length; @@ -78,26 +80,35 @@ struct ARROW_EXPORT RowTableMetadata { /// Offsets within a row to fields in their encoding order. std::vector column_offsets; - /// Rounding up offset to the nearest multiple of alignment value. + /// Rounding up offset within row to the nearest multiple of alignment value. /// Alignment must be a power of 2. - static inline uint32_t padding_for_alignment(uint32_t offset, int required_alignment) { + static inline uint32_t padding_for_alignment_within_row(uint32_t offset, + int required_alignment) { ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1); return static_cast((-static_cast(offset)) & (required_alignment - 1)); } - /// Rounding up offset to the beginning of next column, + /// Rounding up offset within row to the beginning of next column, /// choosing required alignment based on the data type of that column. - static inline uint32_t padding_for_alignment(uint32_t offset, int string_alignment, - const KeyColumnMetadata& col_metadata) { + static inline uint32_t padding_for_alignment_within_row( + uint32_t offset, int string_alignment, const KeyColumnMetadata& col_metadata) { if (!col_metadata.is_fixed_length || ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) { return 0; } else { - return padding_for_alignment(offset, string_alignment); + return padding_for_alignment_within_row(offset, string_alignment); } } + /// Rounding up row offset to the nearest multiple of alignment value. + /// Alignment must be a power of 2. + static inline offset_type padding_for_alignment_row(offset_type row_offset, + int required_alignment) { + ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1); + return (-row_offset) & (required_alignment - 1); + } + /// Returns an array of offsets within a row of ends of varbinary fields. inline const uint32_t* varbinary_end_array(const uint8_t* row) const { ARROW_DCHECK(!is_fixed_length); @@ -127,7 +138,7 @@ struct ARROW_EXPORT RowTableMetadata { ARROW_DCHECK(varbinary_id > 0); const uint32_t* varbinary_end = varbinary_end_array(row); uint32_t offset = varbinary_end[varbinary_id - 1]; - offset += padding_for_alignment(offset, string_alignment); + offset += padding_for_alignment_within_row(offset, string_alignment); *out_offset = offset; *out_length = varbinary_end[varbinary_id] - offset; } @@ -161,6 +172,8 @@ struct ARROW_EXPORT RowTableMetadata { /// The row table is not safe class ARROW_EXPORT RowTableImpl { public: + using offset_type = RowTableMetadata::offset_type; + RowTableImpl(); /// \brief Initialize a row array for use /// @@ -201,8 +214,12 @@ class ARROW_EXPORT RowTableImpl { } return NULLPTR; } - const uint32_t* offsets() const { return reinterpret_cast(data(1)); } - uint32_t* mutable_offsets() { return reinterpret_cast(mutable_data(1)); } + const offset_type* offsets() const { + return reinterpret_cast(data(1)); + } + offset_type* mutable_offsets() { + return reinterpret_cast(mutable_data(1)); + } const uint8_t* null_masks() const { return null_masks_->data(); } uint8_t* null_masks() { return null_masks_->mutable_data(); } diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 75f981fb128..4eff2df361b 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -123,7 +123,7 @@ TEST(RowTableMemoryConsumption, Encode) { ASSERT_GT(actual_null_mask_size * 2, row_table.buffer_size(0) - padding_for_vectors); - int64_t actual_offset_size = num_rows * sizeof(uint32_t); + int64_t actual_offset_size = num_rows * sizeof(RowTableImpl::offset_type); ASSERT_LE(actual_offset_size, row_table.buffer_size(1) - padding_for_vectors); ASSERT_GT(actual_offset_size * 2, row_table.buffer_size(1) - padding_for_vectors); @@ -134,15 +134,14 @@ TEST(RowTableMemoryConsumption, Encode) { } } -// GH-43202: Ensure that when offset overflow happens in encoding the row table, an -// explicit error is raised instead of a silent wrong result. -TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) { +// GH-XXXXX: Ensure that we can build a row table with more than 4GB row data. +TEST(RowTableLarge, LARGE_MEMORY_TEST(Encode)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; } - // Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table. - constexpr int64_t num_rows = 8; + // Use 9 512MB var-length rows to occupy more than 4GB memory. + constexpr int64_t num_rows = 9; constexpr int64_t length_per_binary = 512 * 1024 * 1024; constexpr int64_t row_alignment = sizeof(uint32_t); constexpr int64_t var_length_alignment = sizeof(uint32_t); @@ -174,39 +173,24 @@ TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) { // The rows to encode. std::vector row_ids(num_rows, 0); - // Encoding 7 rows should be fine. - { - row_encoder.PrepareEncodeSelected(0, num_rows - 1, columns); - ASSERT_OK(row_encoder.EncodeSelected(&row_table, static_cast(num_rows - 1), - row_ids.data())); - } + // Encode num_rows rows. + row_encoder.PrepareEncodeSelected(0, num_rows, columns); + ASSERT_OK(row_encoder.EncodeSelected(&row_table, static_cast(num_rows), + row_ids.data())); - // Encoding 8 rows should overflow. - { - int64_t length_per_row = table_metadata.fixed_length + length_per_binary; - std::stringstream expected_error_message; - expected_error_message << "Invalid: Offset overflow detected in " - "EncoderOffsets::GetRowOffsetsSelected for row " - << num_rows - 1 << " of length " << length_per_row - << " bytes, current length in total is " - << length_per_row * (num_rows - 1) << " bytes"; - row_encoder.PrepareEncodeSelected(0, num_rows, columns); - ASSERT_RAISES_WITH_MESSAGE( - Invalid, expected_error_message.str(), - row_encoder.EncodeSelected(&row_table, static_cast(num_rows), - row_ids.data())); - } + ASSERT_GT(row_table.offsets()[num_rows - 1], std::numeric_limits::max()); + ASSERT_GT(row_table.offsets()[num_rows], + row_table.offsets()[num_rows - 1] + length_per_binary); } -// GH-43202: Ensure that when offset overflow happens in appending to the row table, an -// explicit error is raised instead of a silent wrong result. +// GH-XXXXX: Ensure that we can build a row table with more than 4GB row data. TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(AppendFrom)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; } - // Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table. - constexpr int64_t num_rows = 8; + // Use 9 512MB var-length rows to occupy more than 4GB memory. + constexpr int64_t num_rows = 9; constexpr int64_t length_per_binary = 512 * 1024 * 1024; constexpr int64_t num_rows_seed = 1; constexpr int64_t row_alignment = sizeof(uint32_t); @@ -244,23 +228,15 @@ TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(AppendFrom)) { RowTableImpl row_table; ASSERT_OK(row_table.Init(pool, table_metadata)); - // Appending the seed 7 times should be fine. - for (int i = 0; i < num_rows - 1; ++i) { + // Append seed num_rows times. + for (int i = 0; i < num_rows; ++i) { ASSERT_OK(row_table.AppendSelectionFrom(row_table_seed, num_rows_seed, /*source_row_ids=*/NULLPTR)); } - // Appending the seed the 8-th time should overflow. - int64_t length_per_row = table_metadata.fixed_length + length_per_binary; - std::stringstream expected_error_message; - expected_error_message - << "Invalid: Offset overflow detected in RowTableImpl::AppendSelectionFrom for row " - << num_rows - 1 << " of length " << length_per_row - << " bytes, current length in total is " << length_per_row * (num_rows - 1) - << " bytes"; - ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error_message.str(), - row_table.AppendSelectionFrom(row_table_seed, num_rows_seed, - /*source_row_ids=*/NULLPTR)); + ASSERT_GT(row_table.offsets()[num_rows - 1], std::numeric_limits::max()); + ASSERT_GT(row_table.offsets()[num_rows], + row_table.offsets()[num_rows - 1] + length_per_binary); } } // namespace compute From ab93a63c1669e741085b161ac39087ed111f38ef Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Wed, 24 Jul 2024 23:37:39 +0800 Subject: [PATCH 02/34] More fix --- cpp/src/arrow/acero/swiss_join.cc | 4 ++-- cpp/src/arrow/compute/row/compare_internal.cc | 10 ++++++---- cpp/src/arrow/compute/row/encode_internal.cc | 7 +++++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index e5b609879b3..95bf876e89e 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -573,7 +573,7 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& if (!source_rows_permutation) { int64_t target_row_offset = first_target_row_offset; for (int64_t i = 0; i < num_source_rows; ++i) { - target_offsets[first_target_row_id + i] = static_cast(target_row_offset); + target_offsets[first_target_row_id + i] = target_row_offset; target_row_offset += source_offsets[i + 1] - source_offsets[i]; } // We purposefully skip outputting of N+1 offset, to allow concurrent @@ -605,7 +605,7 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& *target_row_ptr++ = *source_row_ptr++; } - target_offsets[first_target_row_id + i] = static_cast(target_row_offset); + target_offsets[first_target_row_id + i] = target_row_offset; target_row_offset += length; } } diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 7cb2612a2c4..1839e5ea864 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -59,7 +59,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - uint32_t irow_right = left_to_right_map[irow_left]; + int64_t irow_right = left_to_right_map[irow_left]; int64_t bitid = irow_right * null_mask_num_bytes * 8 + null_bit_id; match_bytevector[i] &= (bit_util::GetBit(null_masks, bitid) ? 0 : 0xff); } @@ -80,7 +80,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com ARROW_DCHECK(non_nulls); for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - uint32_t irow_right = left_to_right_map[irow_left]; + int64_t irow_right = left_to_right_map[irow_left]; int64_t bitid_right = irow_right * null_mask_num_bytes * 8 + null_bit_id; int right_null = bit_util::GetBit(null_masks, bitid_right) ? 0xff : 0; int left_null = @@ -104,8 +104,10 @@ void KeyCompare::CompareBinaryColumnToRowHelper( const uint8_t* rows_right = rows.data(1); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t offset_right = irow_right * fixed_length + offset_within_row; + // irow_right is used to index into row data so elevate to 64b. + int64_t irow_right = left_to_right_map[irow_left]; + RowTableImpl::offset_type offset_right = + irow_right * fixed_length + offset_within_row; match_bytevector[i] = compare_fn(rows_left, rows_right, irow_left, offset_right); } } else { diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index a02ee312d03..4a8db6f61c2 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -265,7 +265,8 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, num_rows * row_size); } else if (rows.metadata().is_fixed_length) { uint32_t row_size = rows.metadata().fixed_length; - const uint8_t* row_base = rows.data(1) + start_row * row_size; + const uint8_t* row_base = + rows.data(1) + static_cast(start_row) * row_size; row_base += offset_within_row; uint8_t* col_base = col_prep.mutable_data(1); switch (col_prep.metadata().fixed_length) { @@ -588,7 +589,9 @@ void EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, const RowTableImpl::offset_type* offsets; const uint8_t* src_base; if (is_row_fixed_length) { - src_base = rows.data(1) + fixed_length * start_row + offset_within_row; + src_base = rows.data(1) + + static_cast(start_row) * fixed_length + + offset_within_row; offsets = nullptr; } else { src_base = rows.data(2) + offset_within_row; From e50e2669ec892b007e9eee711a58ea8a473b1519 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Wed, 24 Jul 2024 23:46:27 +0800 Subject: [PATCH 03/34] More fix --- cpp/src/arrow/compute/row/compare_internal.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 1839e5ea864..d5089674f1d 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -104,8 +104,8 @@ void KeyCompare::CompareBinaryColumnToRowHelper( const uint8_t* rows_right = rows.data(1); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - // irow_right is used to index into row data so elevate to 64b. - int64_t irow_right = left_to_right_map[irow_left]; + // irow_right is used to index into row data so elevate to the row offset type. + RowTableImpl::offset_type irow_right = left_to_right_map[irow_left]; RowTableImpl::offset_type offset_right = irow_right * fixed_length + offset_within_row; match_bytevector[i] = compare_fn(rows_left, rows_right, irow_left, offset_right); From c7e522512e80ef208daedde04aeccd220d79fd04 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 00:02:38 +0800 Subject: [PATCH 04/34] Fix test name --- cpp/src/arrow/compute/row/row_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 4eff2df361b..40a3fb6dea2 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -184,7 +184,7 @@ TEST(RowTableLarge, LARGE_MEMORY_TEST(Encode)) { } // GH-XXXXX: Ensure that we can build a row table with more than 4GB row data. -TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(AppendFrom)) { +TEST(RowTableLarge, LARGE_MEMORY_TEST(AppendFrom)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; } From 02669e1b579cc78ff7bc68079d6e036336932799 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 00:48:16 +0800 Subject: [PATCH 05/34] Remove header --- cpp/src/arrow/compute/row/encode_internal.cc | 1 - cpp/src/arrow/compute/row/row_internal.cc | 1 - 2 files changed, 2 deletions(-) diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 4a8db6f61c2..127d43021d6 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -17,7 +17,6 @@ #include "arrow/compute/row/encode_internal.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/int_util_overflow.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 3789491dc95..7fed07e3741 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -18,7 +18,6 @@ #include "arrow/compute/row/row_internal.h" #include "arrow/compute/util.h" -#include "arrow/util/int_util_overflow.h" namespace arrow { namespace compute { From 148118ab67673e78cfb7f42933c11ca3ca6f7e36 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 01:38:15 +0800 Subject: [PATCH 06/34] Support avx2 --- cpp/src/arrow/acero/swiss_join_avx2.cc | 90 +++++++---- .../compute/row/compare_internal_avx2.cc | 153 +++++++++--------- .../arrow/compute/row/encode_internal_avx2.cc | 10 +- 3 files changed, 139 insertions(+), 114 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 0888dd89384..b02391dbb89 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -45,48 +45,62 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); if (varbinary_column_id == 0) { // Case 1: This is the first varbinary column // __m256i field_offset_within_row = _mm256_set1_epi32(rows.metadata().fixed_length); __m256i varbinary_end_array_offset = - _mm256_set1_epi32(rows.metadata().varbinary_end_array_offset); + _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset); for (int i = 0; i < num_rows / unroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + __m128i field_length_lo = _mm256_i64gather_epi32( + reinterpret_cast(row_ptr_base), + _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset), 1); + __m128i field_length_hi = _mm256_i64gather_epi32( + reinterpret_cast(row_ptr_base), + _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset), 1); __m256i field_length = _mm256_sub_epi32( - _mm256_i32gather_epi32( - reinterpret_cast(row_ptr_base), - _mm256_add_epi32(row_offset, varbinary_end_array_offset), 1), - field_offset_within_row); + _mm256_set_m128i(field_length_hi, field_length_lo), field_offset_within_row); process_8_values_fn(i * unroll, row_ptr_base, - _mm256_add_epi32(row_offset, field_offset_within_row), + _mm256_add_epi64(row_offset_lo, field_offset_within_row), + _mm256_add_epi64(row_offset_hi, field_offset_within_row), field_length); } } else { // Case 2: This is second or later varbinary column // __m256i varbinary_end_array_offset = - _mm256_set1_epi32(rows.metadata().varbinary_end_array_offset + - sizeof(uint32_t) * (varbinary_column_id - 1)); + _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset + + sizeof(uint32_t) * (varbinary_column_id - 1)); auto row_ptr_base_i64 = reinterpret_cast(row_ptr_base); for (int i = 0; i < num_rows / unroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); - __m256i end_array_offset = - _mm256_add_epi32(row_offset, varbinary_end_array_offset); - - __m256i field_offset_within_row_A = _mm256_i32gather_epi64( - row_ptr_base_i64, _mm256_castsi256_si128(end_array_offset), 1); - __m256i field_offset_within_row_B = _mm256_i32gather_epi64( - row_ptr_base_i64, _mm256_extracti128_si256(end_array_offset, 1), 1); + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + __m256i end_array_offset_lo = + _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset); + __m256i end_array_offset_hi = + _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset); + + __m256i field_offset_within_row_A = + _mm256_i64gather_epi64(row_ptr_base_i64, end_array_offset_lo, 1); + __m256i field_offset_within_row_B = + _mm256_i64gather_epi64(row_ptr_base_i64, end_array_offset_hi, 1); field_offset_within_row_A = _mm256_permutevar8x32_epi32( field_offset_within_row_A, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); field_offset_within_row_B = _mm256_permutevar8x32_epi32( @@ -111,7 +125,8 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu field_length = _mm256_sub_epi32(field_length, field_offset_within_row); process_8_values_fn(i * unroll, row_ptr_base, - _mm256_add_epi32(row_offset, field_offset_within_row), + _mm256_add_epi64(row_offset_lo, field_offset_within_row), + _mm256_add_epi64(row_offset_hi, field_offset_within_row), field_length); } } @@ -119,7 +134,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (is_fixed_length_column) { __m256i field_offset_within_row = - _mm256_set1_epi32(rows.metadata().encoded_field_offset( + _mm256_set1_epi64x(rows.metadata().encoded_field_offset( rows.metadata().pos_after_encoding(column_id))); __m256i field_length = _mm256_set1_epi32(rows.metadata().column_metadatas[column_id].fixed_length); @@ -132,22 +147,37 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu for (int i = 0; i < num_rows / unroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_mullo_epi32(row_id, field_length); - __m256i field_offset = _mm256_add_epi32(row_offset, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset, field_length); + __m256i row_id_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id)); + __m256i row_id_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id, 1)); + __m256i row_offset_lo = _mm256_mul_epi32(row_id_lo, field_length); + __m256i row_offset_hi = _mm256_mul_epi32(row_id_hi, field_length); + __m256i field_offset_lo = + _mm256_add_epi64(row_offset_lo, field_offset_within_row); + __m256i field_offset_hi = + _mm256_add_epi64(row_offset_hi, field_offset_within_row); + process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + field_length); } } else { // Case 4: This is a fixed length column in varying length row // const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows / unroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); - __m256i field_offset = _mm256_add_epi32(row_offset, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset, field_length); + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + __m256i field_offset_lo = + _mm256_add_epi64(row_offset_lo, field_offset_within_row); + __m256i field_offset_hi = + _mm256_add_epi64(row_offset_hi, field_offset_within_row); + process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + field_length); } } } diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 23238a3691c..65e8e21dc89 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -180,40 +180,6 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( } } -namespace { - -// Intrinsics `_mm256_i32gather_epi32/64` treat the `vindex` as signed integer, and we -// are using `uint32_t` to represent the offset, in range of [0, 4G), within the row -// table. When the offset is larger than `0x80000000` (2GB), those intrinsics will treat -// it as negative offset and gather the data from undesired address. To avoid this issue, -// we normalize the addresses by translating `base` `0x80000000` higher, and `offset` -// `0x80000000` lower. This way, the offset is always in range of [-2G, 2G) and those -// intrinsics are safe. - -constexpr uint64_t kTwoGB = 0x80000000ull; - -template -inline __m256i UnsignedOffsetSafeGather32(int const* base, __m256i offset) { - int const* normalized_base = base + kTwoGB / sizeof(int); - __m256i normalized_offset = - _mm256_sub_epi32(offset, _mm256_set1_epi32(static_cast(kTwoGB / kScale))); - return _mm256_i32gather_epi32(normalized_base, normalized_offset, - static_cast(kScale)); -} - -template -inline __m256i UnsignedOffsetSafeGather64(arrow::util::int64_for_gather_t const* base, - __m128i offset) { - arrow::util::int64_for_gather_t const* normalized_base = - base + kTwoGB / sizeof(arrow::util::int64_for_gather_t); - __m128i normalized_offset = - _mm_sub_epi32(offset, _mm_set1_epi32(static_cast(kTwoGB / kScale))); - return _mm256_i32gather_epi64(normalized_base, normalized_offset, - static_cast(kScale)); -} - -} // namespace - template uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( uint32_t offset_within_row, uint32_t num_rows_to_compare, @@ -240,12 +206,20 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } - __m256i offset_right = - _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(fixed_length)); - offset_right = _mm256_add_epi32(offset_right, _mm256_set1_epi32(offset_within_row)); - - reinterpret_cast(match_bytevector)[i] = - compare8_fn(rows_left, rows_right, i * unroll, irow_left, offset_right); + __m256i irow_right_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right)); + __m256i irow_right_hi = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1)); + __m256i offset_right_lo = + _mm256_mul_epi32(irow_right_lo, _mm256_set1_epi64x(fixed_length)); + __m256i offset_right_hi = + _mm256_mul_epi32(irow_right_hi, _mm256_set1_epi64x(fixed_length)); + offset_right_lo = + _mm256_add_epi64(offset_right_lo, _mm256_set1_epi64x(offset_within_row)); + offset_right_hi = + _mm256_add_epi64(offset_right_hi, _mm256_set1_epi64x(offset_within_row)); + + reinterpret_cast(match_bytevector)[i] = compare8_fn( + rows_left, rows_right, i * unroll, irow_left, offset_right_lo, offset_right_hi); if (!use_selection) { irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(8)); @@ -254,7 +228,7 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( return num_rows_to_compare - (num_rows_to_compare % unroll); } else { const uint8_t* rows_left = col.data(1); - const uint32_t* offsets_right = rows.offsets(); + const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_right = rows.data(2); constexpr uint32_t unroll = 8; __m256i irow_left = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); @@ -270,12 +244,20 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( irow_right = _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } - __m256i offset_right = - UnsignedOffsetSafeGather32<4>((int const*)offsets_right, irow_right); - offset_right = _mm256_add_epi32(offset_right, _mm256_set1_epi32(offset_within_row)); - reinterpret_cast(match_bytevector)[i] = - compare8_fn(rows_left, rows_right, i * unroll, irow_left, offset_right); + __m256i offset_right_lo = + _mm256_i32gather_epi64(offsets_right, _mm256_castsi256_si128(irow_right), + sizeof(RowTableImpl::offset_type)); + __m256i offset_right_hi = + _mm256_i32gather_epi64(offsets_right, _mm256_extracti128_si256(irow_right, 1), + sizeof(RowTableImpl::offset_type)); + offset_right_lo = + _mm256_add_epi64(offset_right_lo, _mm256_set1_epi64x(offset_within_row)); + offset_right_hi = + _mm256_add_epi64(offset_right_hi, _mm256_set1_epi64x(offset_within_row)); + + reinterpret_cast(match_bytevector)[i] = compare8_fn( + rows_left, rows_right, i * unroll, irow_left, offset_right_lo, offset_right_hi); if (!use_selection) { irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(8)); @@ -287,8 +269,8 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( template inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* right_base, - __m256i irow_left, __m256i offset_right, - int bit_offset = 0) { + __m256i irow_left, __m256i offset_right_lo, + __m256i offset_right_hi, int bit_offset = 0) { __m256i left; switch (column_width) { case 0: { @@ -315,7 +297,9 @@ inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* r ARROW_DCHECK(false); } - __m256i right = UnsignedOffsetSafeGather32<1>((int const*)right_base, offset_right); + __m128i right_lo = _mm256_i64gather_epi32(right_base, offset_right_lo, 1); + __m128i right_hi = _mm256_i64gather_epi32(right_base, offset_right_hi, 1); + __m256i right = _mm256_set_m128i(right_hi, right_lo); if (column_width != sizeof(uint32_t)) { constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; right = _mm256_and_si256(right, _mm256_set1_epi32(mask)); @@ -333,8 +317,8 @@ inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* r template inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_first, __m256i offset_right, - int bit_offset = 0) { + uint32_t irow_left_first, __m256i offset_right_lo, + __m256i offset_right_hi, int bit_offset = 0) { __m256i left; switch (column_width) { case 0: { @@ -364,7 +348,9 @@ inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_bas ARROW_DCHECK(false); } - __m256i right = UnsignedOffsetSafeGather32<1>((int const*)right_base, offset_right); + __m128i right_lo = _mm256_i64gather_epi32(right_base, offset_right_lo, 1); + __m128i right_hi = _mm256_i64gather_epi32(right_base, offset_right_hi, 1); + __m256i right = _mm256_set_m128i(right_hi, right_lo); if (column_width != sizeof(uint32_t)) { constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; right = _mm256_and_si256(right, _mm256_set1_epi32(mask)); @@ -383,7 +369,7 @@ inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_bas template inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* right_base, __m256i irow_left, uint32_t irow_left_first, - __m256i offset_right) { + __m256i offset_right_lo, __m256i offset_right_hi) { auto left_base_i64 = reinterpret_cast(left_base); __m256i left_lo, left_hi; @@ -400,10 +386,8 @@ inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* rig } auto right_base_i64 = reinterpret_cast(right_base); - __m256i right_lo = - UnsignedOffsetSafeGather64<1>(right_base_i64, _mm256_castsi256_si128(offset_right)); - __m256i right_hi = UnsignedOffsetSafeGather64<1>( - right_base_i64, _mm256_extracti128_si256(offset_right, 1)); + __m256i right_lo = _mm256_i64gather_epi64(right_base_i64, offset_right_lo, 1); + __m256i right_hi = _mm256_i64gather_epi64(right_base_i64, offset_right_hi, 1); uint32_t result_lo = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_lo, right_lo)); uint32_t result_hi = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_hi, right_hi)); return result_lo | (static_cast(result_hi) << 32); @@ -412,13 +396,15 @@ inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* rig template inline uint64_t Compare8_Binary_avx2(uint32_t length, const uint8_t* left_base, const uint8_t* right_base, __m256i irow_left, - uint32_t irow_left_first, __m256i offset_right) { + uint32_t irow_left_first, __m256i offset_right_lo, + __m256i offset_right_hi) { uint32_t irow_left_array[8]; - uint32_t offset_right_array[8]; + RowTableImpl::offset_type offset_right_array[8]; if (use_selection) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(irow_left_array), irow_left); } - _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right); + _mm256_store_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right_lo); + _mm256_store_si256(reinterpret_cast<__m256i*>(&offset_right_array[4]), offset_right_hi); // Non-zero length guarantees no underflow int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; @@ -463,13 +449,14 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [bit_offset](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right_lo, + __m256i offset_right_hi) { if (use_selection) { return CompareSelected8_avx2<0>(left_base, right_base, irow_left, - offset_right, bit_offset); + offset_right_lo, offset_right_hi, bit_offset); } else { - return Compare8_avx2<0>(left_base, right_base, irow_left_base, offset_right, - bit_offset); + return Compare8_avx2<0>(left_base, right_base, irow_left_base, + offset_right_lo, offset_right_hi, bit_offset); } }); } else if (col_width == 1) { @@ -477,12 +464,13 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { + __m256i irow_left, __m256i offset_right_lo, __m256i offset_right_hi) { if (use_selection) { return CompareSelected8_avx2<1>(left_base, right_base, irow_left, - offset_right); + offset_right_lo, offset_right_hi); } else { - return Compare8_avx2<1>(left_base, right_base, irow_left_base, offset_right); + return Compare8_avx2<1>(left_base, right_base, irow_left_base, + offset_right_lo, offset_right_hi); } }); } else if (col_width == 2) { @@ -490,12 +478,13 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { + __m256i irow_left, __m256i offset_right_lo, __m256i offset_right_hi) { if (use_selection) { return CompareSelected8_avx2<2>(left_base, right_base, irow_left, - offset_right); + offset_right_lo, offset_right_hi); } else { - return Compare8_avx2<2>(left_base, right_base, irow_left_base, offset_right); + return Compare8_avx2<2>(left_base, right_base, irow_left_base, + offset_right_lo, offset_right_hi); } }); } else if (col_width == 4) { @@ -503,12 +492,13 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { + __m256i irow_left, __m256i offset_right_lo, __m256i offset_right_hi) { if (use_selection) { return CompareSelected8_avx2<4>(left_base, right_base, irow_left, - offset_right); + offset_right_lo, offset_right_hi); } else { - return Compare8_avx2<4>(left_base, right_base, irow_left_base, offset_right); + return Compare8_avx2<4>(left_base, right_base, irow_left_base, + offset_right_lo, offset_right_hi); } }); } else if (col_width == 8) { @@ -516,19 +506,22 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { + __m256i irow_left, __m256i offset_right_lo, __m256i offset_right_hi) { return Compare8_64bit_avx2(left_base, right_base, irow_left, - irow_left_base, offset_right); + irow_left_base, offset_right_lo, + offset_right_hi); }); } else { return CompareBinaryColumnToRowHelper_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [&col](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right_lo, + __m256i offset_right_hi) { uint32_t length = col.metadata().fixed_length; - return Compare8_Binary_avx2( - length, left_base, right_base, irow_left, irow_left_base, offset_right); + return Compare8_Binary_avx2(length, left_base, right_base, + irow_left, irow_left_base, + offset_right_lo, offset_right_hi); }); } } @@ -541,7 +534,7 @@ void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, uint8_t* match_bytevector) { const uint32_t* offsets_left = col.offsets(); - const uint32_t* offsets_right = rows.offsets(); + const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_left = col.data(2); const uint8_t* rows_right = rows.data(2); for (uint32_t i = 0; i < num_rows_to_compare; ++i) { @@ -549,7 +542,7 @@ void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( uint32_t irow_right = left_to_right_map[irow_left]; uint32_t begin_left = offsets_left[irow_left]; uint32_t length_left = offsets_left[irow_left + 1] - begin_left; - uint32_t begin_right = offsets_right[irow_right]; + RowTableImpl::offset_type begin_right = offsets_right[irow_right]; uint32_t length_right; uint32_t offset_within_row; if (!is_first_varbinary_col) { diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc b/cpp/src/arrow/compute/row/encode_internal_avx2.cc index 50969c7bd60..26f8e3a63de 100644 --- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc @@ -75,10 +75,12 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows uint8_t* col_vals_B = col2->mutable_data(1); uint32_t fixed_length = rows.metadata().fixed_length; - const uint32_t* offsets; + const RowTableImpl::offset_type* offsets; const uint8_t* src_base; if (is_row_fixed_length) { - src_base = rows.data(1) + fixed_length * start_row + offset_within_row; + src_base = rows.data(1) + + static_cast(fixed_length) * start_row + + offset_within_row; offsets = nullptr; } else { src_base = rows.data(2) + offset_within_row; @@ -99,7 +101,7 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows src2 = reinterpret_cast(src + fixed_length * 2); src3 = reinterpret_cast(src + fixed_length * 3); } else { - const uint32_t* row_offsets = offsets + i * unroll; + const RowTableImpl::offset_type* row_offsets = offsets + i * unroll; const uint8_t* src = src_base; src0 = reinterpret_cast(src + row_offsets[0]); src1 = reinterpret_cast(src + row_offsets[1]); @@ -140,7 +142,7 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows } } } else { - const uint32_t* row_offsets = offsets + i * unroll; + const RowTableImpl::offset_type* row_offsets = offsets + i * unroll; const uint8_t* src = src_base; for (int j = 0; j < unroll; ++j) { if (col_width == 1) { From 65aa81e942c068acea0b965fb4bd03445654a69d Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 01:46:57 +0800 Subject: [PATCH 07/34] Fix warning --- cpp/src/arrow/compute/row/compare_internal_avx2.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 65e8e21dc89..5194f179e53 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -297,8 +297,8 @@ inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* r ARROW_DCHECK(false); } - __m128i right_lo = _mm256_i64gather_epi32(right_base, offset_right_lo, 1); - __m128i right_hi = _mm256_i64gather_epi32(right_base, offset_right_hi, 1); + __m128i right_lo = _mm256_i64gather_epi32((int const*)right_base, offset_right_lo, 1); + __m128i right_hi = _mm256_i64gather_epi32((int const*)right_base, offset_right_hi, 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); if (column_width != sizeof(uint32_t)) { constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; @@ -348,8 +348,8 @@ inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_bas ARROW_DCHECK(false); } - __m128i right_lo = _mm256_i64gather_epi32(right_base, offset_right_lo, 1); - __m128i right_hi = _mm256_i64gather_epi32(right_base, offset_right_hi, 1); + __m128i right_lo = _mm256_i64gather_epi32((int const*)right_base, offset_right_lo, 1); + __m128i right_hi = _mm256_i64gather_epi32((int const*)right_base, offset_right_hi, 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); if (column_width != sizeof(uint32_t)) { constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; From 2c4d57f0c19287cd4262a19b2cef68da02e497af Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 15:44:47 +0800 Subject: [PATCH 08/34] Fix unaligned store --- cpp/src/arrow/compute/row/compare_internal_avx2.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 5194f179e53..4efd67de840 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -403,8 +403,9 @@ inline uint64_t Compare8_Binary_avx2(uint32_t length, const uint8_t* left_base, if (use_selection) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(irow_left_array), irow_left); } - _mm256_store_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right_lo); - _mm256_store_si256(reinterpret_cast<__m256i*>(&offset_right_array[4]), offset_right_hi); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&offset_right_array[4]), + offset_right_hi); // Non-zero length guarantees no underflow int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; From db125925e5e39eb258797fb74aed24292608c2d1 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 17:17:20 +0800 Subject: [PATCH 09/34] Fix warning --- cpp/src/arrow/compute/row/compare_internal_avx2.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 4efd67de840..e4c72d4440b 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -245,12 +245,12 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } - __m256i offset_right_lo = - _mm256_i32gather_epi64(offsets_right, _mm256_castsi256_si128(irow_right), - sizeof(RowTableImpl::offset_type)); - __m256i offset_right_hi = - _mm256_i32gather_epi64(offsets_right, _mm256_extracti128_si256(irow_right, 1), - sizeof(RowTableImpl::offset_type)); + __m256i offset_right_lo = _mm256_i32gather_epi64((const long long*)offsets_right, + _mm256_castsi256_si128(irow_right), + sizeof(RowTableImpl::offset_type)); + __m256i offset_right_hi = _mm256_i32gather_epi64( + (const long long*)offsets_right, _mm256_extracti128_si256(irow_right, 1), + sizeof(RowTableImpl::offset_type)); offset_right_lo = _mm256_add_epi64(offset_right_lo, _mm256_set1_epi64x(offset_within_row)); offset_right_hi = From 5bd2981256d13f32a8048302a545dd956dac4a8e Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 17:38:52 +0800 Subject: [PATCH 10/34] Fix warning --- cpp/src/arrow/compute/row/compare_internal_avx2.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index e4c72d4440b..a23545804fe 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -245,11 +245,13 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } - __m256i offset_right_lo = _mm256_i32gather_epi64((const long long*)offsets_right, - _mm256_castsi256_si128(irow_right), - sizeof(RowTableImpl::offset_type)); + auto offsets_right_i64 = + reinterpret_cast(offsets_right); + __m256i offset_right_lo = + _mm256_i32gather_epi64(offsets_right_i64, _mm256_castsi256_si128(irow_right), + sizeof(RowTableImpl::offset_type)); __m256i offset_right_hi = _mm256_i32gather_epi64( - (const long long*)offsets_right, _mm256_extracti128_si256(irow_right, 1), + offsets_right_i64, _mm256_extracti128_si256(irow_right, 1), sizeof(RowTableImpl::offset_type)); offset_right_lo = _mm256_add_epi64(offset_right_lo, _mm256_set1_epi64x(offset_within_row)); From 5f40f97bcea0213bbdacd0f8014dfa801024bc0b Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 20:18:34 +0800 Subject: [PATCH 11/34] Add test for compare rows bigger than 4GB --- cpp/src/arrow/compute/row/compare_test.cc | 176 ++++++++++++++++++++-- 1 file changed, 161 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 22af7e067d8..7665df6673c 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -27,7 +27,12 @@ namespace arrow { namespace compute { using arrow::bit_util::BytesForBits; +using arrow::bit_util::GetBit; +using arrow::gen::Constant; +using arrow::gen::Random; +using arrow::internal::CountSetBits; using arrow::internal::CpuInfo; +using arrow::random::kSeedMax; using arrow::random::RandomArrayGenerator; using arrow::util::MiniBatch; using arrow::util::TempVectorStack; @@ -106,7 +111,7 @@ TEST(KeyCompare, CompareColumnsToRowsCuriousFSB) { true, match_bitvector.data()); for (int i = 0; i < num_rows; ++i) { SCOPED_TRACE(i); - ASSERT_EQ(arrow::bit_util::GetBit(match_bitvector.data(), i), i != 6); + ASSERT_EQ(GetBit(match_bitvector.data(), i), i != 6); } } } @@ -168,7 +173,7 @@ TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) { // Compare columns to rows at offsets over 2GB within a row table. // Certain AVX2 instructions may behave unexpectedly causing troubles like GH-41813. -TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) { +TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; } @@ -176,21 +181,21 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) { // The idea of this case is to create a row table using several fixed length columns and // one var length column (so the row is hence var length and has offset buffer), with // the overall data size exceeding 2GB. Then compare each row with itself. - constexpr int64_t two_gb = 2ll * 1024ll * 1024ll * 1024ll; + constexpr int64_t k2GB = 2ll * 1024ll * 1024ll * 1024ll; // The compare function requires the row id of the left column to be uint16_t, hence the // number of rows. constexpr int64_t num_rows = std::numeric_limits::max() + 1; const std::vector> fixed_length_types{uint64(), uint32()}; // The var length column should be a little smaller than 2GB to workaround the capacity // limitation in the var length builder. - constexpr int32_t var_length = two_gb / num_rows - 1; + constexpr int32_t var_length = k2GB / num_rows - 1; auto row_size = std::accumulate(fixed_length_types.begin(), fixed_length_types.end(), static_cast(var_length), [](int64_t acc, const std::shared_ptr& type) { return acc + type->byte_width(); }); // The overall size should be larger than 2GB. - ASSERT_GT(row_size * num_rows, two_gb); + ASSERT_GT(row_size * num_rows, k2GB); MemoryPool* pool = default_memory_pool(); @@ -202,14 +207,14 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) { // Several fixed length arrays containing random content. for (const auto& type : fixed_length_types) { - ASSERT_OK_AND_ASSIGN(auto value, ::arrow::gen::Random(type)->Generate(num_rows)); + ASSERT_OK_AND_ASSIGN(auto value, Random(type)->Generate(num_rows)); values.push_back(std::move(value)); } // A var length array containing 'X' repeated var_length times. - ASSERT_OK_AND_ASSIGN(auto value_var_length, - ::arrow::gen::Constant( - std::make_shared(std::string(var_length, 'X'))) - ->Generate(num_rows)); + ASSERT_OK_AND_ASSIGN( + auto value_var_length, + Constant(std::make_shared(std::string(var_length, 'X'))) + ->Generate(num_rows)); values.push_back(std::move(value_var_length)); batch_left = ExecBatch(std::move(values), num_rows); @@ -237,7 +242,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) { // The row table must contain an offset buffer. ASSERT_NE(row_table_right.offsets(), NULLPTR); // The whole point of this test. - ASSERT_GT(row_table_right.offsets()[num_rows - 1], two_gb); + ASSERT_GT(row_table_right.offsets()[num_rows - 1], k2GB); } // The rows to compare. @@ -268,8 +273,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) { /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, row_table_right, /*are_cols_in_encoding_order=*/true, match_bitvector.data()); - ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows), - num_rows); + ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows), num_rows); } std::vector selection_left(num_rows); @@ -295,8 +299,150 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) { /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, row_table_right, /*are_cols_in_encoding_order=*/true, match_bitvector.data()); - ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows), - num_rows); + ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows), num_rows); + } +} + +// GH-XXXXX: Compare columns to rows at offset over 4GB within a row table. +TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GB)) { + if constexpr (sizeof(void*) == 4) { + GTEST_SKIP() << "Test only works on 64-bit platforms"; + } + + // The idea of this case is to create a row table using one fixed length column and one + // var length column (so the row is hence var length and has offset buffer), with more + // than 4GB data. Then compare the rows located at over 4GB. + + // A small batch to append to the row table repeatedly to grow the row table to big + // enough. + constexpr int64_t num_rows_batch = std::numeric_limits::max() + 1ll; + constexpr int fixed_length = 128; + // Involve some small randomness in the var length column. + constexpr int var_length_min = 128; + constexpr int var_length_max = 129; + constexpr double null_probability = 0.01; + + // The size of the row table is one batch larger than 4GB, and we'll compare the last + // num_rows_batch rows. + constexpr int64_t k4GB = 4ll * 1024 * 1024 * 1024; + constexpr int64_t size_row_min = fixed_length + var_length_min; + constexpr int64_t num_rows_row_table = k4GB / size_row_min + num_rows_batch; + static_assert(num_rows_row_table * size_row_min > k4GB, + "row table size must be greater than 4GB"); + + MemoryPool* pool = default_memory_pool(); + + // The left side columns with num_rows_batch rows. + std::vector columns_left; + ExecBatch batch_left; + { + std::vector values; + + // A fixed length array containing random values. + ASSERT_OK_AND_ASSIGN( + auto value_fixed_length, + Random(fixed_size_binary(fixed_length))->Generate(num_rows_batch)); + values.push_back(std::move(value_fixed_length)); + + // A var length array containing small var length values ("X"). + auto value_var_length = RandomArrayGenerator(kSeedMax).String( + num_rows_batch, var_length_min, var_length_max, null_probability); + values.push_back(std::move(value_var_length)); + + batch_left = ExecBatch(std::move(values), num_rows_batch); + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); + } + + // The right side row table with num_rows_row_table rows. + RowTableImpl row_table_right; + { + // Encode the batch row table with the left columns repeatedly. + std::vector column_metadatas; + ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas)); + RowTableMetadata table_metadata; + table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t), + sizeof(uint64_t)); + ASSERT_OK(row_table_right.Init(pool, table_metadata)); + RowTableImpl row_table_batch; + ASSERT_OK(row_table_batch.Init(pool, table_metadata)); + std::vector row_ids(num_rows_batch); + std::iota(row_ids.begin(), row_ids.end(), 0); + RowTableEncoder row_encoder; + row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); + row_encoder.PrepareEncodeSelected(0, num_rows_batch, columns_left); + ASSERT_OK(row_encoder.EncodeSelected( + &row_table_batch, static_cast(num_rows_batch), row_ids.data())); + + // Append the batch row table repeatedly to grow the row table to big enough. + while (row_table_right.length() < num_rows_row_table) { + ASSERT_OK(row_table_right.AppendSelectionFrom(row_table_batch, num_rows_batch, + /*source_row_ids=*/NULLPTR)); + } + + // The row table must contain an offset buffer. + ASSERT_NE(row_table_right.offsets(), NULLPTR); + // At least the last row should be located at over 4GB. + ASSERT_GT(row_table_right.offsets()[num_rows_row_table - 1], k4GB); + } + + // The rows to compare: the last num_rows_batch rows in the row table VS. the whole + // batch. + std::vector row_ids_to_compare(num_rows_batch); + std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), + num_rows_row_table - num_rows_batch); + + TempVectorStack stack; + ASSERT_OK( + stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows_batch))); + LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; + + { + // No selection, output no match row ids. + uint32_t num_rows_no_match; + std::vector row_ids_out(num_rows_batch); + KeyCompare::CompareColumnsToRows(num_rows_batch, /*sel_left_maybe_null=*/NULLPTR, + row_ids_to_compare.data(), &ctx, &num_rows_no_match, + row_ids_out.data(), columns_left, row_table_right, + /*are_cols_in_encoding_order=*/true, + /*out_match_bitvector_maybe_null=*/NULLPTR); + ASSERT_EQ(num_rows_no_match, 0); + } + + { + // No selection, output match bit vector. + std::vector match_bitvector(BytesForBits(num_rows_batch)); + KeyCompare::CompareColumnsToRows( + num_rows_batch, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), &ctx, + /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, + row_table_right, + /*are_cols_in_encoding_order=*/true, match_bitvector.data()); + ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_batch), num_rows_batch); + } + + std::vector selection_left(num_rows_batch); + std::iota(selection_left.begin(), selection_left.end(), 0); + + { + // With selection, output no match row ids. + uint32_t num_rows_no_match; + std::vector row_ids_out(num_rows_batch); + KeyCompare::CompareColumnsToRows(num_rows_batch, selection_left.data(), + row_ids_to_compare.data(), &ctx, &num_rows_no_match, + row_ids_out.data(), columns_left, row_table_right, + /*are_cols_in_encoding_order=*/true, + /*out_match_bitvector_maybe_null=*/NULLPTR); + ASSERT_EQ(num_rows_no_match, 0); + } + + { + // With selection, output match bit vector. + std::vector match_bitvector(BytesForBits(num_rows_batch)); + KeyCompare::CompareColumnsToRows( + num_rows_batch, selection_left.data(), row_ids_to_compare.data(), &ctx, + /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, + row_table_right, + /*are_cols_in_encoding_order=*/true, match_bitvector.data()); + ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_batch), num_rows_batch); } } From 9c7c3b77858b4c041225f67ed5d4a5d04c946af4 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 25 Jul 2024 21:22:54 +0800 Subject: [PATCH 12/34] Fix warning on windows --- cpp/src/arrow/compute/row/compare_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 7665df6673c..64666a43ec6 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -327,6 +327,8 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GB)) { constexpr int64_t k4GB = 4ll * 1024 * 1024 * 1024; constexpr int64_t size_row_min = fixed_length + var_length_min; constexpr int64_t num_rows_row_table = k4GB / size_row_min + num_rows_batch; + static_assert(num_rows_row_table < std::numeric_limits::max(), + "row table length must be less than uint32 max"); static_assert(num_rows_row_table * size_row_min > k4GB, "row table size must be greater than 4GB"); @@ -389,7 +391,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GB)) { // batch. std::vector row_ids_to_compare(num_rows_batch); std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), - num_rows_row_table - num_rows_batch); + static_cast(num_rows_row_table - num_rows_batch)); TempVectorStack stack; ASSERT_OK( From 136b5887cde182056b70f022a011941d57226be7 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Fri, 26 Jul 2024 18:02:45 +0800 Subject: [PATCH 13/34] Refine compare tests --- cpp/src/arrow/compute/row/compare_test.cc | 308 ++++++++++++---------- 1 file changed, 164 insertions(+), 144 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 64666a43ec6..ea563a76a45 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -171,6 +171,108 @@ TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) { } } +namespace { + +Result MakeRowTableFromExecBatch(const ExecBatch& batch) { + RowTableImpl row_table; + + std::vector column_metadatas; + RETURN_NOT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas)); + RowTableMetadata table_metadata; + table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t), + sizeof(uint64_t)); + RETURN_NOT_OK(row_table.Init(default_memory_pool(), table_metadata)); + std::vector row_ids(batch.length); + std::iota(row_ids.begin(), row_ids.end(), 0); + RowTableEncoder row_encoder; + row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); + std::vector column_arrays; + RETURN_NOT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); + row_encoder.PrepareEncodeSelected(0, batch.length, column_arrays); + RETURN_NOT_OK(row_encoder.EncodeSelected( + &row_table, static_cast(batch.length), row_ids.data())); + + return row_table; +} + +Result RepeatRowTableUntil(const RowTableImpl& seed, int64_t num_rows) { + RowTableImpl row_table; + + RETURN_NOT_OK(row_table.Init(default_memory_pool(), seed.metadata())); + // Append the seed row table repeatedly to grow the row table to big enough. + while (row_table.length() < num_rows) { + RETURN_NOT_OK(row_table.AppendSelectionFrom(seed, + static_cast(seed.length()), + /*source_row_ids=*/NULLPTR)); + } + + return row_table; +} + +void AssertCompareColumnsToRowsAllMatch(const std::vector& columns, + const RowTableImpl& row_table, + const std::vector& row_ids_to_compare) { + uint32_t num_rows_to_compare = static_cast(row_ids_to_compare.size()); + + TempVectorStack stack; + ASSERT_OK( + stack.Init(default_memory_pool(), + KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows_to_compare))); + LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; + + { + // No selection, output no match row ids. + uint32_t num_rows_no_match; + std::vector row_ids_out(num_rows_to_compare); + KeyCompare::CompareColumnsToRows(num_rows_to_compare, /*sel_left_maybe_null=*/NULLPTR, + row_ids_to_compare.data(), &ctx, &num_rows_no_match, + row_ids_out.data(), columns, row_table, + /*are_cols_in_encoding_order=*/true, + /*out_match_bitvector_maybe_null=*/NULLPTR); + ASSERT_EQ(num_rows_no_match, 0); + } + + { + // No selection, output match bit vector. + std::vector match_bitvector(BytesForBits(num_rows_to_compare)); + KeyCompare::CompareColumnsToRows( + num_rows_to_compare, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), + &ctx, + /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns, row_table, + /*are_cols_in_encoding_order=*/true, match_bitvector.data()); + ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_to_compare), + num_rows_to_compare); + } + + std::vector selection_left(num_rows_to_compare); + std::iota(selection_left.begin(), selection_left.end(), 0); + + { + // With selection, output no match row ids. + uint32_t num_rows_no_match; + std::vector row_ids_out(num_rows_to_compare); + KeyCompare::CompareColumnsToRows(num_rows_to_compare, selection_left.data(), + row_ids_to_compare.data(), &ctx, &num_rows_no_match, + row_ids_out.data(), columns, row_table, + /*are_cols_in_encoding_order=*/true, + /*out_match_bitvector_maybe_null=*/NULLPTR); + ASSERT_EQ(num_rows_no_match, 0); + } + + { + // With selection, output match bit vector. + std::vector match_bitvector(BytesForBits(num_rows_to_compare)); + KeyCompare::CompareColumnsToRows( + num_rows_to_compare, selection_left.data(), row_ids_to_compare.data(), &ctx, + /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns, row_table, + /*are_cols_in_encoding_order=*/true, match_bitvector.data()); + ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_to_compare), + num_rows_to_compare); + } +} + +} // namespace + // Compare columns to rows at offsets over 2GB within a row table. // Certain AVX2 instructions may behave unexpectedly causing troubles like GH-41813. TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { @@ -197,8 +299,6 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { // The overall size should be larger than 2GB. ASSERT_GT(row_size * num_rows, k2GB); - MemoryPool* pool = default_memory_pool(); - // The left side columns. std::vector columns_left; ExecBatch batch_left; @@ -222,25 +322,11 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { } // The right side row table. - RowTableImpl row_table_right; + ASSERT_OK_AND_ASSIGN(RowTableImpl row_table_right, + MakeRowTableFromExecBatch(batch_left)); { - // Encode the row table with the left columns. - std::vector column_metadatas; - ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas)); - RowTableMetadata table_metadata; - table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t), - sizeof(uint64_t)); - ASSERT_OK(row_table_right.Init(pool, table_metadata)); - std::vector row_ids(num_rows); - std::iota(row_ids.begin(), row_ids.end(), 0); - RowTableEncoder row_encoder; - row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); - row_encoder.PrepareEncodeSelected(0, num_rows, columns_left); - ASSERT_OK(row_encoder.EncodeSelected( - &row_table_right, static_cast(num_rows), row_ids.data())); - // The row table must contain an offset buffer. - ASSERT_NE(row_table_right.offsets(), NULLPTR); + ASSERT_NE(row_table_right.data(2), NULLPTR); // The whole point of this test. ASSERT_GT(row_table_right.offsets()[num_rows - 1], k2GB); } @@ -249,62 +335,69 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { std::vector row_ids_to_compare(num_rows); std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), 0); - TempVectorStack stack; - ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); - LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; + AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, row_ids_to_compare); +} - { - // No selection, output no match row ids. - uint32_t num_rows_no_match; - std::vector row_ids_out(num_rows); - KeyCompare::CompareColumnsToRows(num_rows, /*sel_left_maybe_null=*/NULLPTR, - row_ids_to_compare.data(), &ctx, &num_rows_no_match, - row_ids_out.data(), columns_left, row_table_right, - /*are_cols_in_encoding_order=*/true, - /*out_match_bitvector_maybe_null=*/NULLPTR); - ASSERT_EQ(num_rows_no_match, 0); +// GH-XXXXX: Compare fixed length columns to rows over 4GB within a row table. +TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { + if constexpr (sizeof(void*) == 4) { + GTEST_SKIP() << "Test only works on 64-bit platforms"; } - { - // No selection, output match bit vector. - std::vector match_bitvector(BytesForBits(num_rows)); - KeyCompare::CompareColumnsToRows( - num_rows, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), &ctx, - /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, - row_table_right, - /*are_cols_in_encoding_order=*/true, match_bitvector.data()); - ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows), num_rows); - } + // The idea of this case is to create a row table using one fixed length column (so the + // row is hence fixed length), with more than 4GB data. Then compare the rows located at + // over 4GB. - std::vector selection_left(num_rows); - std::iota(selection_left.begin(), selection_left.end(), 0); + // A small batch to append to the row table repeatedly to grow the row table to big + // enough. + constexpr int64_t num_rows_batch = std::numeric_limits::max() + 1ll; + constexpr int fixed_length = 256; + + // The size of the row table is one batch larger than 4GB, and we'll compare the last + // num_rows_batch rows. + constexpr int64_t k4GB = 4ll * 1024 * 1024 * 1024; + constexpr int64_t num_rows_row_table = k4GB / fixed_length + num_rows_batch; + static_assert(num_rows_row_table < std::numeric_limits::max(), + "row table length must be less than uint32 max"); + static_assert(num_rows_row_table * fixed_length > k4GB, + "row table size must be greater than 4GB"); + // The left side columns with num_rows_batch rows. + std::vector columns_left; + ExecBatch batch_left; { - // With selection, output no match row ids. - uint32_t num_rows_no_match; - std::vector row_ids_out(num_rows); - KeyCompare::CompareColumnsToRows(num_rows, selection_left.data(), - row_ids_to_compare.data(), &ctx, &num_rows_no_match, - row_ids_out.data(), columns_left, row_table_right, - /*are_cols_in_encoding_order=*/true, - /*out_match_bitvector_maybe_null=*/NULLPTR); - ASSERT_EQ(num_rows_no_match, 0); + std::vector values; + + // A fixed length array containing random values. + ASSERT_OK_AND_ASSIGN( + auto value_fixed_length, + Random(fixed_size_binary(fixed_length))->Generate(num_rows_batch)); + values.push_back(std::move(value_fixed_length)); } + // The right side row table with num_rows_row_table rows. + ASSERT_OK_AND_ASSIGN( + RowTableImpl row_table_right, + RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), + num_rows_row_table)); { - // With selection, output match bit vector. - std::vector match_bitvector(BytesForBits(num_rows)); - KeyCompare::CompareColumnsToRows( - num_rows, selection_left.data(), row_ids_to_compare.data(), &ctx, - /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, - row_table_right, - /*are_cols_in_encoding_order=*/true, match_bitvector.data()); - ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows), num_rows); + // The row table must not contain a third buffer. + ASSERT_EQ(row_table_right.data(2), NULLPTR); + // The row data must be greater than 4GB. + ASSERT_GT(row_table_right.buffer_size(1), k4GB); } + + // The rows to compare: the last num_rows_batch rows in the row table VS. the whole + // batch. + std::vector row_ids_to_compare(num_rows_batch); + std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), + static_cast(num_rows_row_table - num_rows_batch)); + + AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, row_ids_to_compare); } -// GH-XXXXX: Compare columns to rows at offset over 4GB within a row table. -TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GB)) { +// GH-XXXXX: Compare var length columns to rows at offset over 4GB within a row table. +TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBVarLength)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; } @@ -332,8 +425,6 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GB)) { static_assert(num_rows_row_table * size_row_min > k4GB, "row table size must be greater than 4GB"); - MemoryPool* pool = default_memory_pool(); - // The left side columns with num_rows_batch rows. std::vector columns_left; ExecBatch batch_left; @@ -346,7 +437,8 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GB)) { Random(fixed_size_binary(fixed_length))->Generate(num_rows_batch)); values.push_back(std::move(value_fixed_length)); - // A var length array containing small var length values ("X"). + // A var length array containing random binary of 128 or 129 bytes with small portion + // of nulls. auto value_var_length = RandomArrayGenerator(kSeedMax).String( num_rows_batch, var_length_min, var_length_max, null_probability); values.push_back(std::move(value_var_length)); @@ -356,33 +448,13 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GB)) { } // The right side row table with num_rows_row_table rows. - RowTableImpl row_table_right; + ASSERT_OK_AND_ASSIGN( + RowTableImpl row_table_right, + RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), + num_rows_row_table)); { - // Encode the batch row table with the left columns repeatedly. - std::vector column_metadatas; - ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas)); - RowTableMetadata table_metadata; - table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t), - sizeof(uint64_t)); - ASSERT_OK(row_table_right.Init(pool, table_metadata)); - RowTableImpl row_table_batch; - ASSERT_OK(row_table_batch.Init(pool, table_metadata)); - std::vector row_ids(num_rows_batch); - std::iota(row_ids.begin(), row_ids.end(), 0); - RowTableEncoder row_encoder; - row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); - row_encoder.PrepareEncodeSelected(0, num_rows_batch, columns_left); - ASSERT_OK(row_encoder.EncodeSelected( - &row_table_batch, static_cast(num_rows_batch), row_ids.data())); - - // Append the batch row table repeatedly to grow the row table to big enough. - while (row_table_right.length() < num_rows_row_table) { - ASSERT_OK(row_table_right.AppendSelectionFrom(row_table_batch, num_rows_batch, - /*source_row_ids=*/NULLPTR)); - } - // The row table must contain an offset buffer. - ASSERT_NE(row_table_right.offsets(), NULLPTR); + ASSERT_NE(row_table_right.data(2), NULLPTR); // At least the last row should be located at over 4GB. ASSERT_GT(row_table_right.offsets()[num_rows_row_table - 1], k4GB); } @@ -393,59 +465,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GB)) { std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), static_cast(num_rows_row_table - num_rows_batch)); - TempVectorStack stack; - ASSERT_OK( - stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows_batch))); - LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; - - { - // No selection, output no match row ids. - uint32_t num_rows_no_match; - std::vector row_ids_out(num_rows_batch); - KeyCompare::CompareColumnsToRows(num_rows_batch, /*sel_left_maybe_null=*/NULLPTR, - row_ids_to_compare.data(), &ctx, &num_rows_no_match, - row_ids_out.data(), columns_left, row_table_right, - /*are_cols_in_encoding_order=*/true, - /*out_match_bitvector_maybe_null=*/NULLPTR); - ASSERT_EQ(num_rows_no_match, 0); - } - - { - // No selection, output match bit vector. - std::vector match_bitvector(BytesForBits(num_rows_batch)); - KeyCompare::CompareColumnsToRows( - num_rows_batch, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), &ctx, - /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, - row_table_right, - /*are_cols_in_encoding_order=*/true, match_bitvector.data()); - ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_batch), num_rows_batch); - } - - std::vector selection_left(num_rows_batch); - std::iota(selection_left.begin(), selection_left.end(), 0); - - { - // With selection, output no match row ids. - uint32_t num_rows_no_match; - std::vector row_ids_out(num_rows_batch); - KeyCompare::CompareColumnsToRows(num_rows_batch, selection_left.data(), - row_ids_to_compare.data(), &ctx, &num_rows_no_match, - row_ids_out.data(), columns_left, row_table_right, - /*are_cols_in_encoding_order=*/true, - /*out_match_bitvector_maybe_null=*/NULLPTR); - ASSERT_EQ(num_rows_no_match, 0); - } - - { - // With selection, output match bit vector. - std::vector match_bitvector(BytesForBits(num_rows_batch)); - KeyCompare::CompareColumnsToRows( - num_rows_batch, selection_left.data(), row_ids_to_compare.data(), &ctx, - /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, - row_table_right, - /*are_cols_in_encoding_order=*/true, match_bitvector.data()); - ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_batch), num_rows_batch); - } + AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, row_ids_to_compare); } } // namespace compute From 9f9db1bc53b2f6c546b36f5d46ac4fd841890e59 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Fri, 26 Jul 2024 18:07:09 +0800 Subject: [PATCH 14/34] WIP join test --- cpp/src/arrow/acero/hash_join_node_test.cc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index f7b442cc3c6..7208529ba77 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -30,6 +30,7 @@ #include "arrow/compute/kernels/test_util.h" #include "arrow/compute/light_array_internal.h" #include "arrow/testing/extension_type.h" +#include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" @@ -3253,5 +3254,24 @@ TEST(HashJoin, ManyJoins) { ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); } +// Test that both the key and the payload of the right side (the build side) are larger +// than 4GB, and the 64-bit offset in the hash table can handle it correctly. +TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GB)) { + // constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; + // constexpr int16_t num_rows_per_batch = 1024; + // constexpr int bytes_per_column = 8; + // constexpr int64_t num_batches = k5GB / (num_rows_per_batch * bytes_per_column); + // constexpr int64_t num_rows = num_rows_per_batch * num_batches; + // const int num_left_rows = ExecBatchBuilder::num_rows_max(); + + // std::vector values; + // ASSERT_OK_AND_ASSIGN(auto value_fixed_length, + // ::arrow::gen::Random(fixed_size_binary(bytes_per_column)) + // ->Generate(num_rows_per_batch)); + // values.push_back(std::move(value_fixed_length)); + + // ExecBatch batch = ExecBatch(std::move(values), num_rows_per_batch); +} + } // namespace acero } // namespace arrow From 26cfd77ae1fcb236fe5956c3774523150d035e7b Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Sat, 27 Jul 2024 01:04:26 +0800 Subject: [PATCH 15/34] Fix --- cpp/src/arrow/compute/row/row_internal.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 7fed07e3741..bda744ec197 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -190,7 +190,7 @@ Status RowTableImpl::Init(MemoryPool* pool, const RowTableMetadata& metadata) { auto offsets, AllocateResizableBuffer(size_offsets(kInitialRowsCapacity), pool_)); offsets_ = std::move(offsets); memset(offsets_->mutable_data(), 0, size_offsets(kInitialRowsCapacity)); - reinterpret_cast(offsets_->mutable_data())[0] = 0; + reinterpret_cast(offsets_->mutable_data())[0] = 0; ARROW_ASSIGN_OR_RAISE( auto rows, @@ -225,7 +225,7 @@ void RowTableImpl::Clean() { has_any_nulls_ = false; if (!metadata_.is_fixed_length) { - reinterpret_cast(offsets_->mutable_data())[0] = 0; + reinterpret_cast(offsets_->mutable_data())[0] = 0; } } From c4005d460844c0742c9e1471b9fb3aff5accd4b8 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Sat, 27 Jul 2024 01:20:40 +0800 Subject: [PATCH 16/34] Fix refined test --- cpp/src/arrow/compute/row/compare_test.cc | 55 ++++++++++++----------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index ea563a76a45..ee8087d1ce8 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -299,8 +299,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { // The overall size should be larger than 2GB. ASSERT_GT(row_size * num_rows, k2GB); - // The left side columns. - std::vector columns_left; + // The left side batch. ExecBatch batch_left; { std::vector values; @@ -318,18 +317,19 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { values.push_back(std::move(value_var_length)); batch_left = ExecBatch(std::move(values), num_rows); - ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); } + // The left side columns. + std::vector columns_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); + // The right side row table. ASSERT_OK_AND_ASSIGN(RowTableImpl row_table_right, MakeRowTableFromExecBatch(batch_left)); - { - // The row table must contain an offset buffer. - ASSERT_NE(row_table_right.data(2), NULLPTR); - // The whole point of this test. - ASSERT_GT(row_table_right.offsets()[num_rows - 1], k2GB); - } + // The row table must contain an offset buffer. + ASSERT_NE(row_table_right.data(2), NULLPTR); + // The whole point of this test. + ASSERT_GT(row_table_right.offsets()[num_rows - 1], k2GB); // The rows to compare. std::vector row_ids_to_compare(num_rows); @@ -362,8 +362,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { static_assert(num_rows_row_table * fixed_length > k4GB, "row table size must be greater than 4GB"); - // The left side columns with num_rows_batch rows. - std::vector columns_left; + // The left side batch with num_rows_batch rows. ExecBatch batch_left; { std::vector values; @@ -373,19 +372,23 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { auto value_fixed_length, Random(fixed_size_binary(fixed_length))->Generate(num_rows_batch)); values.push_back(std::move(value_fixed_length)); + + batch_left = ExecBatch(std::move(values), num_rows_batch); } + // The left side columns with num_rows_batch rows. + std::vector columns_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); + // The right side row table with num_rows_row_table rows. ASSERT_OK_AND_ASSIGN( RowTableImpl row_table_right, RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), num_rows_row_table)); - { - // The row table must not contain a third buffer. - ASSERT_EQ(row_table_right.data(2), NULLPTR); - // The row data must be greater than 4GB. - ASSERT_GT(row_table_right.buffer_size(1), k4GB); - } + // The row table must not contain a third buffer. + ASSERT_EQ(row_table_right.data(2), NULLPTR); + // The row data must be greater than 4GB. + ASSERT_GT(row_table_right.buffer_size(1), k4GB); // The rows to compare: the last num_rows_batch rows in the row table VS. the whole // batch. @@ -425,8 +428,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBVarLength)) { static_assert(num_rows_row_table * size_row_min > k4GB, "row table size must be greater than 4GB"); - // The left side columns with num_rows_batch rows. - std::vector columns_left; + // The left side batch with num_rows_batch rows. ExecBatch batch_left; { std::vector values; @@ -444,20 +446,21 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBVarLength)) { values.push_back(std::move(value_var_length)); batch_left = ExecBatch(std::move(values), num_rows_batch); - ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); } + // The left side columns with num_rows_batch rows. + std::vector columns_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); + // The right side row table with num_rows_row_table rows. ASSERT_OK_AND_ASSIGN( RowTableImpl row_table_right, RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), num_rows_row_table)); - { - // The row table must contain an offset buffer. - ASSERT_NE(row_table_right.data(2), NULLPTR); - // At least the last row should be located at over 4GB. - ASSERT_GT(row_table_right.offsets()[num_rows_row_table - 1], k4GB); - } + // The row table must contain an offset buffer. + ASSERT_NE(row_table_right.data(2), NULLPTR); + // At least the last row should be located at over 4GB. + ASSERT_GT(row_table_right.offsets()[num_rows_row_table - 1], k4GB); // The rows to compare: the last num_rows_batch rows in the row table VS. the whole // batch. From ef7c0c72e2841b65f46b8af6b6349e12143aeb68 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Sat, 27 Jul 2024 01:25:20 +0800 Subject: [PATCH 17/34] Add a todo to be fixed for large hash join test --- cpp/src/arrow/acero/swiss_join.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 95bf876e89e..6afdf8b86fe 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -473,6 +473,7 @@ Status RowArrayMerge::PrepareForMerge(RowArray* target, (*first_target_row_id)[sources.size()] = num_rows; } + // TODO: WIP, this should fire for a join larger than 4GB. if (num_bytes > std::numeric_limits::max()) { return Status::Invalid( "There are more than 2^32 bytes of key data. Acero cannot " From f171e54cfab0bceeab40480f9ae765218849eec3 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Sun, 28 Jul 2024 00:37:34 +0800 Subject: [PATCH 18/34] Hash join test for fixed length type --- cpp/src/arrow/acero/hash_join_node_test.cc | 94 ++++++++++++++++++---- 1 file changed, 77 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 7208529ba77..7c317352f18 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -41,6 +41,9 @@ using testing::UnorderedElementsAreArray; namespace arrow { +using arrow::gen::Constant; +using arrow::random::kSeedMax; +using arrow::random::RandomArrayGenerator; using compute::call; using compute::default_exec_context; using compute::ExecBatchBuilder; @@ -3254,24 +3257,81 @@ TEST(HashJoin, ManyJoins) { ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); } -// Test that both the key and the payload of the right side (the build side) are larger -// than 4GB, and the 64-bit offset in the hash table can handle it correctly. -TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GB)) { - // constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; - // constexpr int16_t num_rows_per_batch = 1024; - // constexpr int bytes_per_column = 8; - // constexpr int64_t num_batches = k5GB / (num_rows_per_batch * bytes_per_column); - // constexpr int64_t num_rows = num_rows_per_batch * num_batches; - // const int num_left_rows = ExecBatchBuilder::num_rows_max(); - - // std::vector values; - // ASSERT_OK_AND_ASSIGN(auto value_fixed_length, - // ::arrow::gen::Random(fixed_size_binary(bytes_per_column)) - // ->Generate(num_rows_per_batch)); - // values.push_back(std::move(value_fixed_length)); - - // ExecBatch batch = ExecBatch(std::move(values), num_rows_per_batch); +// Test that both the key and the payload of the right side (the build side) are fixed +// length and larger than 4GB, and the 64-bit offset in the hash table can handle it +// correctly. +TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { + constexpr int64_t k5GB = 4ll * 1024 * 1024 * 1024; + const auto type = uint64(); + constexpr int16_t num_rows_per_batch_left = 1024; + constexpr int16_t num_rows_per_batch_right = 4096; + const int64_t num_batches_left = 8; + const int64_t num_batches_right = + k5GB / (num_rows_per_batch_right * type->byte_width()); + + // Left side composed of num_batches_left identical batches of num_rows_per_batch_left + // rows of zeros. + BatchesWithSchema batches_left; + { + // A column with num_rows_per_batch_left zeros. + ASSERT_OK_AND_ASSIGN( + auto column, + Constant(std::make_shared(0))->Generate(num_rows_per_batch_left)); + + // Use the column as both the key and the payload. + std::vector columns; + columns.push_back(column); + columns.push_back(column); + ExecBatch batch(std::move(columns), num_rows_per_batch_left); + batches_left = + BatchesWithSchema{std::vector(num_batches_left, std::move(batch)), + schema({field("l_key", type), field("l_payload", type)})}; + } + + // Right side composed of num_batches_right identical batches of + // num_rows_per_batch_right rows containing only 1 zero. + BatchesWithSchema batches_right; + { + // A column with (num_rows_per_batch_right - 1) non-zeros (possibly null) and 1 zero. + auto non_zeros = + RandomArrayGenerator(kSeedMax).UInt64(num_rows_per_batch_right - 1, + /*min=*/1, num_rows_per_batch_right, + /*null_probability =*/0.01); + ASSERT_OK_AND_ASSIGN(auto zero, + Constant(std::make_shared(0))->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_zeros, zero})); + + // Use the column as both the key and the payload. + std::vector columns; + columns.push_back(column); + columns.push_back(column); + ExecBatch batch(std::move(columns), num_rows_per_batch_right); + batches_right = + BatchesWithSchema{std::vector(num_batches_right, std::move(batch)), + schema({field("r_key", type), field("r_payload", type)})}; + } + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_left.schema), + std::move(batches_left.batches))}; + + Declaration right{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_right.schema), + std::move(batches_right.batches))}; + + HashJoinNodeOptions join_opts(JoinType::INNER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToTable(std::move(join))); + ASSERT_EQ(result->num_rows(), + num_batches_left * num_rows_per_batch_left * num_batches_right); } +// Test that both the key and the payload of the right side (the build side) are var +// length and larger than 4GB, and the 64-bit offset in the hash table can handle it +// correctly. +TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) {} + } // namespace acero } // namespace arrow From 25e86ffa225f28db8b6b3ef900b5cbbc82022ee1 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Sun, 28 Jul 2024 15:10:57 +0800 Subject: [PATCH 19/34] Pass all hash join tests --- cpp/src/arrow/acero/hash_join_node_test.cc | 165 +++++++++++++++++---- cpp/src/arrow/acero/swiss_join.cc | 10 +- cpp/src/arrow/compute/row/row_internal.cc | 2 +- cpp/src/arrow/compute/row/row_internal.h | 2 +- 4 files changed, 140 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 7c317352f18..06453bca84c 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -44,6 +44,7 @@ namespace arrow { using arrow::gen::Constant; using arrow::random::kSeedMax; using arrow::random::RandomArrayGenerator; +using compute::and_; using compute::call; using compute::default_exec_context; using compute::ExecBatchBuilder; @@ -3257,55 +3258,68 @@ TEST(HashJoin, ManyJoins) { ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); } +namespace { + +void AssertRowCountEq(Declaration source, int64_t expected) { + Declaration count{"aggregate", + {std::move(source)}, + AggregateNodeOptions{/*aggregates=*/{{"count_all", "count(*)"}}}}; + ASSERT_OK_AND_ASSIGN(auto batches, DeclarationToExecBatches(std::move(count))); + ASSERT_EQ(batches.batches.size(), 1); + ASSERT_EQ(batches.batches[0].values.size(), 1); + ASSERT_TRUE(batches.batches[0].values[0].is_scalar()); + ASSERT_EQ(batches.batches[0].values[0].scalar()->type->id(), Type::INT64); + ASSERT_TRUE(batches.batches[0].values[0].scalar_as().is_valid); + ASSERT_EQ(batches.batches[0].values[0].scalar_as().value, expected); +} + +} // namespace + // Test that both the key and the payload of the right side (the build side) are fixed // length and larger than 4GB, and the 64-bit offset in the hash table can handle it // correctly. TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { - constexpr int64_t k5GB = 4ll * 1024 * 1024 * 1024; + constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; const auto type = uint64(); - constexpr int16_t num_rows_per_batch_left = 1024; + const uint64_t value_match = 42; + constexpr int16_t num_rows_per_batch_left = 128; constexpr int16_t num_rows_per_batch_right = 4096; const int64_t num_batches_left = 8; const int64_t num_batches_right = k5GB / (num_rows_per_batch_right * type->byte_width()); // Left side composed of num_batches_left identical batches of num_rows_per_batch_left - // rows of zeros. + // rows of value_match-es. BatchesWithSchema batches_left; { - // A column with num_rows_per_batch_left zeros. - ASSERT_OK_AND_ASSIGN( - auto column, - Constant(std::make_shared(0))->Generate(num_rows_per_batch_left)); + // A column with num_rows_per_batch_left value_match-es. + ASSERT_OK_AND_ASSIGN(auto column, + Constant(std::make_shared(value_match)) + ->Generate(num_rows_per_batch_left)); // Use the column as both the key and the payload. - std::vector columns; - columns.push_back(column); - columns.push_back(column); - ExecBatch batch(std::move(columns), num_rows_per_batch_left); + ExecBatch batch({column, column}, num_rows_per_batch_left); batches_left = BatchesWithSchema{std::vector(num_batches_left, std::move(batch)), schema({field("l_key", type), field("l_payload", type)})}; } // Right side composed of num_batches_right identical batches of - // num_rows_per_batch_right rows containing only 1 zero. + // num_rows_per_batch_right rows containing only 1 value_match. BatchesWithSchema batches_right; { - // A column with (num_rows_per_batch_right - 1) non-zeros (possibly null) and 1 zero. - auto non_zeros = - RandomArrayGenerator(kSeedMax).UInt64(num_rows_per_batch_right - 1, - /*min=*/1, num_rows_per_batch_right, - /*null_probability =*/0.01); - ASSERT_OK_AND_ASSIGN(auto zero, - Constant(std::make_shared(0))->Generate(1)); - ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_zeros, zero})); + // A column with (num_rows_per_batch_right - 1) non-value_match-es (possibly null) and + // 1 value_match. + auto non_matches = RandomArrayGenerator(kSeedMax).UInt64( + num_rows_per_batch_right - 1, + /*min=*/value_match + 1, /*max=*/value_match + 1 + num_rows_per_batch_right, + /*null_probability =*/0.01); + ASSERT_OK_AND_ASSIGN( + auto match, Constant(std::make_shared(value_match))->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_matches, match})); // Use the column as both the key and the payload. - std::vector columns; - columns.push_back(column); - columns.push_back(column); - ExecBatch batch(std::move(columns), num_rows_per_batch_right); + ExecBatch batch({column, column}, num_rows_per_batch_right); batches_right = BatchesWithSchema{std::vector(num_batches_right, std::move(batch)), schema({field("r_key", type), field("r_payload", type)})}; @@ -3323,15 +3337,110 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { /*right_keys=*/{"r_key"}); Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; - ASSERT_OK_AND_ASSIGN(auto result, DeclarationToTable(std::move(join))); - ASSERT_EQ(result->num_rows(), - num_batches_left * num_rows_per_batch_left * num_batches_right); + ASSERT_OK_AND_ASSIGN(auto batches_result, DeclarationToExecBatches(std::move(join))); + Declaration result{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_result.schema), + std::move(batches_result.batches))}; + + // The row count of hash join should be (number of value_match-es in left side) * + // (number of value_match-es in right side). + AssertRowCountEq(result, + num_batches_left * num_rows_per_batch_left * num_batches_right); + + // All rows should be value_match-es. + auto predicate = and_({equal(field_ref("l_key"), literal(value_match)), + equal(field_ref("l_payload"), literal(value_match)), + equal(field_ref("r_key"), literal(value_match)), + equal(field_ref("r_payload"), literal(value_match))}); + Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; + AssertRowCountEq(std::move(filter), + num_batches_left * num_rows_per_batch_left * num_batches_right); } // Test that both the key and the payload of the right side (the build side) are var // length and larger than 4GB, and the 64-bit offset in the hash table can handle it // correctly. -TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) {} +TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { + constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; + const auto type = utf8(); + constexpr int value_no_match_length_min = 128; + constexpr int value_no_match_length_max = 129; + constexpr int value_match_length = 130; + const auto value_match = + std::make_shared(std::string(value_match_length, 'X')); + constexpr int16_t num_rows_per_batch_left = 128; + constexpr int16_t num_rows_per_batch_right = 4096; + const int64_t num_batches_left = 8; + const int64_t num_batches_right = + k5GB / (num_rows_per_batch_right * value_no_match_length_min); + + // Left side composed of num_batches_left identical batches of num_rows_per_batch_left + // rows of value_match-es. + BatchesWithSchema batches_left; + { + // A column with num_rows_per_batch_left value_match-es. + ASSERT_OK_AND_ASSIGN(auto column, + Constant(value_match)->Generate(num_rows_per_batch_left)); + + // Use the column as both the key and the payload. + ExecBatch batch({column, column}, num_rows_per_batch_left); + batches_left = + BatchesWithSchema{std::vector(num_batches_left, std::move(batch)), + schema({field("l_key", type), field("l_payload", type)})}; + } + + // Right side composed of num_batches_right identical batches of + // num_rows_per_batch_right rows containing only 1 value_match. + BatchesWithSchema batches_right; + { + // A column with (num_rows_per_batch_right - 1) non-value_match-es (possibly null) and + // 1 value_match. + auto non_matches = + RandomArrayGenerator(kSeedMax).String(num_rows_per_batch_right - 1, + /*min_length=*/value_no_match_length_min, + /*max_length=*/value_no_match_length_max, + /*null_probability =*/0.01); + ASSERT_OK_AND_ASSIGN(auto match, Constant(value_match)->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_matches, match})); + + // Use the column as both the key and the payload. + ExecBatch batch({column, column}, num_rows_per_batch_right); + batches_right = + BatchesWithSchema{std::vector(num_batches_right, std::move(batch)), + schema({field("r_key", type), field("r_payload", type)})}; + } + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_left.schema), + std::move(batches_left.batches))}; + + Declaration right{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_right.schema), + std::move(batches_right.batches))}; + + HashJoinNodeOptions join_opts(JoinType::INNER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + + ASSERT_OK_AND_ASSIGN(auto batches_result, DeclarationToExecBatches(std::move(join))); + Declaration result{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_result.schema), + std::move(batches_result.batches))}; + + // The row count of hash join should be (number of value_match-es in left side) * + // (number of value_match-es in right side). + AssertRowCountEq(result, + num_batches_left * num_rows_per_batch_left * num_batches_right); + + // All rows should be value_match-es. + auto predicate = and_({equal(field_ref("l_key"), literal(value_match)), + equal(field_ref("l_payload"), literal(value_match)), + equal(field_ref("r_key"), literal(value_match)), + equal(field_ref("r_payload"), literal(value_match))}); + Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; + AssertRowCountEq(std::move(filter), + num_batches_left * num_rows_per_batch_left * num_batches_right); +} } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 6afdf8b86fe..5182ec0a620 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -473,18 +473,10 @@ Status RowArrayMerge::PrepareForMerge(RowArray* target, (*first_target_row_id)[sources.size()] = num_rows; } - // TODO: WIP, this should fire for a join larger than 4GB. - if (num_bytes > std::numeric_limits::max()) { - return Status::Invalid( - "There are more than 2^32 bytes of key data. Acero cannot " - "process a join of this magnitude"); - } - // Allocate target memory // target->rows_.Clean(); - RETURN_NOT_OK(target->rows_.AppendEmpty(static_cast(num_rows), - static_cast(num_bytes))); + RETURN_NOT_OK(target->rows_.AppendEmpty(static_cast(num_rows), num_bytes)); // In case of varying length rows, // initialize the first row offset for each range of rows corresponding to a diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index bda744ec197..aa7e62add45 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -389,7 +389,7 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, } Status RowTableImpl::AppendEmpty(uint32_t num_rows_to_append, - uint32_t num_extra_bytes_to_append) { + int64_t num_extra_bytes_to_append) { RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append)); if (!metadata_.is_fixed_length) { RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append)); diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 4e1d1f664e0..094a9c31efe 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -188,7 +188,7 @@ class ARROW_EXPORT RowTableImpl { /// \param num_extra_bytes_to_append For tables storing variable-length data this /// should be a guess of how many data bytes will be needed to populate the /// data. This is ignored if there are no variable-length columns - Status AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append); + Status AppendEmpty(uint32_t num_rows_to_append, int64_t num_extra_bytes_to_append); /// \brief Append rows from a source table /// \param from The table to append from /// \param num_rows_to_append The number of rows to append From c4ab99a38eaf07f9923105ce0d97259042b5476d Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 29 Jul 2024 02:16:44 +0800 Subject: [PATCH 20/34] Refine tests --- cpp/src/arrow/acero/hash_join_node_test.cc | 21 ++++++++++++--------- cpp/src/arrow/compute/row/compare_test.cc | 10 ++++++---- cpp/src/arrow/testing/random.cc | 22 +++++++++++----------- cpp/src/arrow/testing/random.h | 6 ++++++ 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 06453bca84c..d58e23de291 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3280,8 +3280,13 @@ void AssertRowCountEq(Declaration source, int64_t expected) { // correctly. TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; - const auto type = uint64(); - const uint64_t value_match = 42; + constexpr int fixed_length = 128; + const auto type = fixed_size_binary(fixed_length); + constexpr uint8_t byte_no_match_min = static_cast('A'); + constexpr uint8_t byte_no_match_max = static_cast('y'); + constexpr uint8_t byte_match = static_cast('z'); + const auto value_match = + std::make_shared(std::string(fixed_length, byte_match)); constexpr int16_t num_rows_per_batch_left = 128; constexpr int16_t num_rows_per_batch_right = 4096; const int64_t num_batches_left = 8; @@ -3294,8 +3299,7 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { { // A column with num_rows_per_batch_left value_match-es. ASSERT_OK_AND_ASSIGN(auto column, - Constant(std::make_shared(value_match)) - ->Generate(num_rows_per_batch_left)); + Constant(value_match)->Generate(num_rows_per_batch_left)); // Use the column as both the key and the payload. ExecBatch batch({column, column}, num_rows_per_batch_left); @@ -3310,12 +3314,11 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { { // A column with (num_rows_per_batch_right - 1) non-value_match-es (possibly null) and // 1 value_match. - auto non_matches = RandomArrayGenerator(kSeedMax).UInt64( - num_rows_per_batch_right - 1, - /*min=*/value_match + 1, /*max=*/value_match + 1 + num_rows_per_batch_right, + auto non_matches = RandomArrayGenerator(kSeedMax).FixedSizeBinary( + num_rows_per_batch_right - 1, fixed_length, + /*min_byte=*/byte_no_match_min, /*max_byte=*/byte_no_match_max, /*null_probability =*/0.01); - ASSERT_OK_AND_ASSIGN( - auto match, Constant(std::make_shared(value_match))->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto match, Constant(value_match)->Generate(1)); ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_matches, match})); // Use the column as both the key and the payload. diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index ee8087d1ce8..19f9367dfcc 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -350,13 +350,14 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { // A small batch to append to the row table repeatedly to grow the row table to big // enough. - constexpr int64_t num_rows_batch = std::numeric_limits::max() + 1ll; + constexpr int64_t num_rows_batch = std::numeric_limits::max(); constexpr int fixed_length = 256; // The size of the row table is one batch larger than 4GB, and we'll compare the last // num_rows_batch rows. constexpr int64_t k4GB = 4ll * 1024 * 1024 * 1024; - constexpr int64_t num_rows_row_table = k4GB / fixed_length + num_rows_batch; + constexpr int64_t num_rows_row_table = + (k4GB / (fixed_length * num_rows_batch) + 1) * num_rows_batch; static_assert(num_rows_row_table < std::numeric_limits::max(), "row table length must be less than uint32 max"); static_assert(num_rows_row_table * fixed_length > k4GB, @@ -411,7 +412,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBVarLength)) { // A small batch to append to the row table repeatedly to grow the row table to big // enough. - constexpr int64_t num_rows_batch = std::numeric_limits::max() + 1ll; + constexpr int64_t num_rows_batch = std::numeric_limits::max(); constexpr int fixed_length = 128; // Involve some small randomness in the var length column. constexpr int var_length_min = 128; @@ -422,7 +423,8 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBVarLength)) { // num_rows_batch rows. constexpr int64_t k4GB = 4ll * 1024 * 1024 * 1024; constexpr int64_t size_row_min = fixed_length + var_length_min; - constexpr int64_t num_rows_row_table = k4GB / size_row_min + num_rows_batch; + constexpr int64_t num_rows_row_table = + (k4GB / (size_row_min * num_rows_batch) + 1) * num_rows_batch; static_assert(num_rows_row_table < std::numeric_limits::max(), "row table length must be less than uint32 max"); static_assert(num_rows_row_table * size_row_min > k4GB, diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index c317fe7aef4..23458b53614 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -473,19 +473,16 @@ std::shared_ptr RandomArrayGenerator::StringWithRepeats( return result; } -std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, - int32_t byte_width, - double null_probability, - int64_t alignment, - MemoryPool* memory_pool) { +std::shared_ptr RandomArrayGenerator::FixedSizeBinary( + int64_t size, int32_t byte_width, uint8_t min_byte, uint8_t max_byte, + double null_probability, int64_t alignment, MemoryPool* memory_pool) { if (null_probability < 0 || null_probability > 1) { ABORT_NOT_OK(Status::Invalid("null_probability must be between 0 and 1")); } // Visual Studio does not implement uniform_int_distribution for char types. using GenOpt = GenerateOptions>; - GenOpt options(seed(), static_cast('A'), static_cast('z'), - null_probability); + GenOpt options(seed(), min_byte, max_byte, null_probability); int64_t null_count = 0; auto null_bitmap = *AllocateEmptyBitmap(size, alignment, memory_pool); @@ -1087,8 +1084,9 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t case Type::type::FIXED_SIZE_BINARY: { auto byte_width = internal::checked_pointer_cast(field.type())->byte_width(); - return *FixedSizeBinary(length, byte_width, null_probability, alignment, - memory_pool) + return *FixedSizeBinary(length, byte_width, /*min_byte=*/static_cast('A'), + /*min_byte=*/static_cast('z'), null_probability, + alignment, memory_pool) ->View(field.type()); } @@ -1143,8 +1141,10 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t // type means it's not a (useful) composition of other generators GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType); case Type::type::INTERVAL_MONTH_DAY_NANO: { - return *FixedSizeBinary(length, /*byte_width=*/16, null_probability, alignment, - memory_pool) + return *FixedSizeBinary(length, /*byte_width=*/16, + /*min_byte=*/static_cast('A'), + /*min_byte=*/static_cast('z'), null_probability, + alignment, memory_pool) ->View(month_day_nano_interval()); } diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 1d97a3ada72..a567b697e07 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -433,12 +433,18 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// /// \param[in] size the size of the array to generate /// \param[in] byte_width the byte width of fixed-size binary items + /// \param[in] min_byte the lower bound of each byte in the binary determined by the + /// uniform distribution + /// \param[in] max_byte the upper bound of each byte in the binary determined by the + /// uniform distribution /// \param[in] null_probability the probability of a value being null /// \param[in] alignment alignment for memory allocations (in bytes) /// \param[in] memory_pool memory pool to allocate memory from /// /// \return a generated Array std::shared_ptr FixedSizeBinary(int64_t size, int32_t byte_width, + uint8_t min_byte = static_cast('A'), + uint8_t max_byte = static_cast('z'), double null_probability = 0, int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); From 2b0ad20953b6ba81e18f1241d21eea6726212a06 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 29 Jul 2024 02:22:05 +0800 Subject: [PATCH 21/34] Fix --- cpp/src/arrow/acero/hash_join_node_test.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index d58e23de291..a8e39721f62 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -419,8 +419,9 @@ std::vector> GenRandomRecords( data_types[i].null_probability)); break; default: - result.push_back(rag.FixedSizeBinary(num_rows, data_types[i].fixed_length, - data_types[i].null_probability)); + result.push_back(rag.FixedSizeBinary( + num_rows, data_types[i].fixed_length, static_cast('A'), + static_cast('z'), data_types[i].null_probability)); break; } } else { From 0f4b770bf9a779d1c2962d66e6ceb82ad729d492 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 29 Jul 2024 02:28:08 +0800 Subject: [PATCH 22/34] Refine --- cpp/src/arrow/compute/row/compare_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index d5089674f1d..3cae04fd37d 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -104,7 +104,7 @@ void KeyCompare::CompareBinaryColumnToRowHelper( const uint8_t* rows_right = rows.data(1); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - // irow_right is used to index into row data so elevate to the row offset type. + // irow_right is used to index into row data so promote to the row offset type. RowTableImpl::offset_type irow_right = left_to_right_map[irow_left]; RowTableImpl::offset_type offset_right = irow_right * fixed_length + offset_within_row; From 984b5e6fc947a3b88f6cf9147f8daf4146f3e15a Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 29 Jul 2024 02:41:05 +0800 Subject: [PATCH 23/34] Fix compile --- cpp/src/arrow/acero/hash_join_node_test.cc | 7 ++++--- .../arrow/compute/kernels/vector_selection_benchmark.cc | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index a8e39721f62..032e9319564 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -419,9 +419,10 @@ std::vector> GenRandomRecords( data_types[i].null_probability)); break; default: - result.push_back(rag.FixedSizeBinary( - num_rows, data_types[i].fixed_length, static_cast('A'), - static_cast('z'), data_types[i].null_probability)); + result.push_back(rag.FixedSizeBinary(num_rows, data_types[i].fixed_length, + /*min_byte=*/static_cast('A'), + /*max_byte=*/static_cast('z'), + data_types[i].null_probability)); break; } } else { diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index c2a27dfe434..90292946eb6 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -148,7 +148,9 @@ struct TakeBenchmark { void FixedSizeBinary() { const auto byte_width = static_cast(state.range(kByteWidthRange)); - auto values = rand.FixedSizeBinary(args.size, byte_width, args.null_proportion); + auto values = rand.FixedSizeBinary( + args.size, byte_width, /*min_byte=*/static_cast('A'), + /*max_byte=*/static_cast('z'), args.null_proportion); Bench(values); state.counters["byte_width"] = byte_width; } From 49a40679a6e8b351d58b4ed98a7d61907c9c2959 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 29 Jul 2024 11:45:17 +0800 Subject: [PATCH 24/34] Fix warning --- .../arrow/compute/kernels/vector_selection_benchmark.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index 90292946eb6..71a4dace7c8 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -172,7 +172,9 @@ struct TakeBenchmark { const auto byte_width = static_cast(state.range(kByteWidthRange)); auto chunked_array = GenChunkedArray(num_chunks, [this, byte_width](int64_t chunk_length) { - return rand.FixedSizeBinary(chunk_length, byte_width, args.null_proportion); + return rand.FixedSizeBinary( + chunk_length, byte_width, /*min_byte=*/static_cast('A'), + /*max_byte=*/static_cast('z'), args.null_proportion); }); BenchChunked(chunked_array, chunk_indices_too); state.counters["byte_width"] = byte_width; @@ -269,8 +271,9 @@ struct FilterBenchmark { void FixedSizeBinary() { const int32_t byte_width = static_cast(state.range(2)); const int64_t array_size = args.size / byte_width; - auto values = - rand.FixedSizeBinary(array_size, byte_width, args.values_null_proportion); + auto values = rand.FixedSizeBinary( + array_size, byte_width, /*min_byte=*/static_cast('A'), + /*max_byte=*/static_cast('z'), args.values_null_proportion); Bench(values); state.counters["byte_width"] = byte_width; } From 2ef0b46dfa9a8f451501acb123ce048b1da05394 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 29 Jul 2024 12:41:58 +0800 Subject: [PATCH 25/34] Fix warning --- cpp/src/arrow/compare_benchmark.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compare_benchmark.cc b/cpp/src/arrow/compare_benchmark.cc index 2699f90f69e..4b26288d1b2 100644 --- a/cpp/src/arrow/compare_benchmark.cc +++ b/cpp/src/arrow/compare_benchmark.cc @@ -93,7 +93,9 @@ static void ArrayRangeEqualsFixedSizeBinary(benchmark::State& state) { RegressionArgs args(state, /*size_is_bytes=*/false); auto rng = random::RandomArrayGenerator(kSeed); - auto array = rng.FixedSizeBinary(args.size, /*byte_width=*/8, args.null_proportion); + auto array = rng.FixedSizeBinary( + args.size, /*byte_width=*/8, /*min_byte=*/static_cast('A'), + /*max_byte=*/static_cast('z'), args.null_proportion); BenchmarkArrayRangeEquals(array, state); } From e65657a72f50158a0c7968bfea97a62f8c5eeb70 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 29 Jul 2024 22:53:16 +0800 Subject: [PATCH 26/34] Remove useless fix --- cpp/src/arrow/compute/row/compare_internal.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 3cae04fd37d..6f0cffdaeda 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -59,7 +59,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - int64_t irow_right = left_to_right_map[irow_left]; + uint32_t irow_right = left_to_right_map[irow_left]; int64_t bitid = irow_right * null_mask_num_bytes * 8 + null_bit_id; match_bytevector[i] &= (bit_util::GetBit(null_masks, bitid) ? 0 : 0xff); } @@ -80,7 +80,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com ARROW_DCHECK(non_nulls); for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - int64_t irow_right = left_to_right_map[irow_left]; + uint32_t irow_right = left_to_right_map[irow_left]; int64_t bitid_right = irow_right * null_mask_num_bytes * 8 + null_bit_id; int right_null = bit_util::GetBit(null_masks, bitid_right) ? 0xff : 0; int left_null = From 780e2be7002c0c6c7c3a36bf13c90fde574b7c64 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 8 Aug 2024 12:35:22 +0800 Subject: [PATCH 27/34] Update gh issue number in tests --- cpp/src/arrow/acero/hash_join_node_test.cc | 8 ++++---- cpp/src/arrow/compute/row/compare_test.cc | 4 ++-- cpp/src/arrow/compute/row/row_test.cc | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 032e9319564..964f312af7d 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3277,8 +3277,8 @@ void AssertRowCountEq(Declaration source, int64_t expected) { } // namespace -// Test that both the key and the payload of the right side (the build side) are fixed -// length and larger than 4GB, and the 64-bit offset in the hash table can handle it +// GH-43495: Test that both the key and the payload of the right side (the build side) are +// fixed length and larger than 4GB, and the 64-bit offset in the hash table can handle it // correctly. TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; @@ -3362,8 +3362,8 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { num_batches_left * num_rows_per_batch_left * num_batches_right); } -// Test that both the key and the payload of the right side (the build side) are var -// length and larger than 4GB, and the 64-bit offset in the hash table can handle it +// GH-43495: Test that both the key and the payload of the right side (the build side) are +// var length and larger than 4GB, and the 64-bit offset in the hash table can handle it // correctly. TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 19f9367dfcc..5e8ee7c58a7 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -338,7 +338,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, row_ids_to_compare); } -// GH-XXXXX: Compare fixed length columns to rows over 4GB within a row table. +// GH-43495: Compare fixed length columns to rows over 4GB within a row table. TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; @@ -400,7 +400,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, row_ids_to_compare); } -// GH-XXXXX: Compare var length columns to rows at offset over 4GB within a row table. +// GH-43495: Compare var length columns to rows at offset over 4GB within a row table. TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBVarLength)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 40a3fb6dea2..40f668f341b 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -134,7 +134,7 @@ TEST(RowTableMemoryConsumption, Encode) { } } -// GH-XXXXX: Ensure that we can build a row table with more than 4GB row data. +// GH-43495: Ensure that we can build a row table with more than 4GB row data. TEST(RowTableLarge, LARGE_MEMORY_TEST(Encode)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; @@ -183,7 +183,7 @@ TEST(RowTableLarge, LARGE_MEMORY_TEST(Encode)) { row_table.offsets()[num_rows - 1] + length_per_binary); } -// GH-XXXXX: Ensure that we can build a row table with more than 4GB row data. +// GH-43495: Ensure that we can build a row table with more than 4GB row data. TEST(RowTableLarge, LARGE_MEMORY_TEST(AppendFrom)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; From 348ce66bbc6cf224d97c7a0b0f1a62ae0ed3eb13 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 8 Aug 2024 19:26:14 +0800 Subject: [PATCH 28/34] More accurate offset assertion for large row table tests --- cpp/src/arrow/compute/row/row_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 40f668f341b..6aed9e43278 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -178,9 +178,9 @@ TEST(RowTableLarge, LARGE_MEMORY_TEST(Encode)) { ASSERT_OK(row_encoder.EncodeSelected(&row_table, static_cast(num_rows), row_ids.data())); - ASSERT_GT(row_table.offsets()[num_rows - 1], std::numeric_limits::max()); - ASSERT_GT(row_table.offsets()[num_rows], - row_table.offsets()[num_rows - 1] + length_per_binary); + auto encoded_row_length = table_metadata.fixed_length + length_per_binary; + ASSERT_EQ(row_table.offsets()[num_rows - 1], encoded_row_length * (num_rows - 1)); + ASSERT_EQ(row_table.offsets()[num_rows], encoded_row_length * num_rows); } // GH-43495: Ensure that we can build a row table with more than 4GB row data. @@ -234,9 +234,9 @@ TEST(RowTableLarge, LARGE_MEMORY_TEST(AppendFrom)) { /*source_row_ids=*/NULLPTR)); } - ASSERT_GT(row_table.offsets()[num_rows - 1], std::numeric_limits::max()); - ASSERT_GT(row_table.offsets()[num_rows], - row_table.offsets()[num_rows - 1] + length_per_binary); + auto encoded_row_length = table_metadata.fixed_length + length_per_binary; + ASSERT_EQ(row_table.offsets()[num_rows - 1], encoded_row_length * (num_rows - 1)); + ASSERT_EQ(row_table.offsets()[num_rows], encoded_row_length * num_rows); } } // namespace compute From ad0f78e561bf93da59b34fa3ee6d12b519ba9cfd Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Fri, 9 Aug 2024 11:12:37 +0800 Subject: [PATCH 29/34] Add comment about the row length invariant --- cpp/src/arrow/acero/swiss_join.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 5182ec0a620..40a4b5886e4 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -587,6 +587,8 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& const uint64_t* source_row_ptr = reinterpret_cast( source.data(2) + source_offsets[source_row_id]); int64_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; + // Though the row offset is 64-bit, the length of a single row must be 32-bit as + // required by current row table implementation. DCHECK_LE(length, std::numeric_limits::max()); // Rows should be 64-bit aligned. From 1215355177628212bca68cad8e95c5f57cd6bcf0 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Fri, 9 Aug 2024 11:23:23 +0800 Subject: [PATCH 30/34] Make random FixedSizeBinary more backward compatible --- cpp/src/arrow/acero/hash_join_node_test.cc | 6 ++---- cpp/src/arrow/compare_benchmark.cc | 4 +--- .../kernels/vector_selection_benchmark.cc | 13 ++++--------- cpp/src/arrow/testing/random.cc | 17 +++++++++-------- cpp/src/arrow/testing/random.h | 4 ++-- 5 files changed, 18 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 964f312af7d..88f9a9e71b7 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -420,8 +420,6 @@ std::vector> GenRandomRecords( break; default: result.push_back(rag.FixedSizeBinary(num_rows, data_types[i].fixed_length, - /*min_byte=*/static_cast('A'), - /*max_byte=*/static_cast('z'), data_types[i].null_probability)); break; } @@ -3318,8 +3316,8 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { // 1 value_match. auto non_matches = RandomArrayGenerator(kSeedMax).FixedSizeBinary( num_rows_per_batch_right - 1, fixed_length, - /*min_byte=*/byte_no_match_min, /*max_byte=*/byte_no_match_max, - /*null_probability =*/0.01); + /*null_probability =*/0.01, /*min_byte=*/byte_no_match_min, + /*max_byte=*/byte_no_match_max); ASSERT_OK_AND_ASSIGN(auto match, Constant(value_match)->Generate(1)); ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_matches, match})); diff --git a/cpp/src/arrow/compare_benchmark.cc b/cpp/src/arrow/compare_benchmark.cc index 4b26288d1b2..2699f90f69e 100644 --- a/cpp/src/arrow/compare_benchmark.cc +++ b/cpp/src/arrow/compare_benchmark.cc @@ -93,9 +93,7 @@ static void ArrayRangeEqualsFixedSizeBinary(benchmark::State& state) { RegressionArgs args(state, /*size_is_bytes=*/false); auto rng = random::RandomArrayGenerator(kSeed); - auto array = rng.FixedSizeBinary( - args.size, /*byte_width=*/8, /*min_byte=*/static_cast('A'), - /*max_byte=*/static_cast('z'), args.null_proportion); + auto array = rng.FixedSizeBinary(args.size, /*byte_width=*/8, args.null_proportion); BenchmarkArrayRangeEquals(array, state); } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index 71a4dace7c8..c2a27dfe434 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -148,9 +148,7 @@ struct TakeBenchmark { void FixedSizeBinary() { const auto byte_width = static_cast(state.range(kByteWidthRange)); - auto values = rand.FixedSizeBinary( - args.size, byte_width, /*min_byte=*/static_cast('A'), - /*max_byte=*/static_cast('z'), args.null_proportion); + auto values = rand.FixedSizeBinary(args.size, byte_width, args.null_proportion); Bench(values); state.counters["byte_width"] = byte_width; } @@ -172,9 +170,7 @@ struct TakeBenchmark { const auto byte_width = static_cast(state.range(kByteWidthRange)); auto chunked_array = GenChunkedArray(num_chunks, [this, byte_width](int64_t chunk_length) { - return rand.FixedSizeBinary( - chunk_length, byte_width, /*min_byte=*/static_cast('A'), - /*max_byte=*/static_cast('z'), args.null_proportion); + return rand.FixedSizeBinary(chunk_length, byte_width, args.null_proportion); }); BenchChunked(chunked_array, chunk_indices_too); state.counters["byte_width"] = byte_width; @@ -271,9 +267,8 @@ struct FilterBenchmark { void FixedSizeBinary() { const int32_t byte_width = static_cast(state.range(2)); const int64_t array_size = args.size / byte_width; - auto values = rand.FixedSizeBinary( - array_size, byte_width, /*min_byte=*/static_cast('A'), - /*max_byte=*/static_cast('z'), args.values_null_proportion); + auto values = + rand.FixedSizeBinary(array_size, byte_width, args.values_null_proportion); Bench(values); state.counters["byte_width"] = byte_width; } diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 23458b53614..59de09fff83 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -474,8 +474,8 @@ std::shared_ptr RandomArrayGenerator::StringWithRepeats( } std::shared_ptr RandomArrayGenerator::FixedSizeBinary( - int64_t size, int32_t byte_width, uint8_t min_byte, uint8_t max_byte, - double null_probability, int64_t alignment, MemoryPool* memory_pool) { + int64_t size, int32_t byte_width, double null_probability, uint8_t min_byte, + uint8_t max_byte, int64_t alignment, MemoryPool* memory_pool) { if (null_probability < 0 || null_probability > 1) { ABORT_NOT_OK(Status::Invalid("null_probability must be between 0 and 1")); } @@ -1084,9 +1084,10 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t case Type::type::FIXED_SIZE_BINARY: { auto byte_width = internal::checked_pointer_cast(field.type())->byte_width(); - return *FixedSizeBinary(length, byte_width, /*min_byte=*/static_cast('A'), - /*min_byte=*/static_cast('z'), null_probability, - alignment, memory_pool) + return *FixedSizeBinary(length, byte_width, null_probability, + /*min_byte=*/static_cast('A'), + /*min_byte=*/static_cast('z'), alignment, + memory_pool) ->View(field.type()); } @@ -1141,10 +1142,10 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t // type means it's not a (useful) composition of other generators GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType); case Type::type::INTERVAL_MONTH_DAY_NANO: { - return *FixedSizeBinary(length, /*byte_width=*/16, + return *FixedSizeBinary(length, /*byte_width=*/16, null_probability, /*min_byte=*/static_cast('A'), - /*min_byte=*/static_cast('z'), null_probability, - alignment, memory_pool) + /*min_byte=*/static_cast('z'), alignment, + memory_pool) ->View(month_day_nano_interval()); } diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index a567b697e07..9c0c5baae0f 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -433,19 +433,19 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// /// \param[in] size the size of the array to generate /// \param[in] byte_width the byte width of fixed-size binary items + /// \param[in] null_probability the probability of a value being null /// \param[in] min_byte the lower bound of each byte in the binary determined by the /// uniform distribution /// \param[in] max_byte the upper bound of each byte in the binary determined by the /// uniform distribution - /// \param[in] null_probability the probability of a value being null /// \param[in] alignment alignment for memory allocations (in bytes) /// \param[in] memory_pool memory pool to allocate memory from /// /// \return a generated Array std::shared_ptr FixedSizeBinary(int64_t size, int32_t byte_width, + double null_probability = 0, uint8_t min_byte = static_cast('A'), uint8_t max_byte = static_cast('z'), - double null_probability = 0, int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); From 70b19360eec58f27ca6c7bcf6dba07574b108810 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Fri, 9 Aug 2024 13:09:01 +0800 Subject: [PATCH 31/34] Add comments for non-trivial simd code --- cpp/src/arrow/acero/swiss_join_avx2.cc | 34 +++++++++++++++++-- .../compute/row/compare_internal_avx2.cc | 10 ++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index b02391dbb89..c3b932f5960 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -54,20 +54,26 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i varbinary_end_array_offset = _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. __m256i row_offset_lo = _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); __m256i row_offset_hi = _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); + // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 + // 64-bit row offsets. __m128i field_length_lo = _mm256_i64gather_epi32( reinterpret_cast(row_ptr_base), _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset), 1); __m128i field_length_hi = _mm256_i64gather_epi32( reinterpret_cast(row_ptr_base), _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset), 1); + // The final 8 32-bit field lengths, subtracting the field offset within row. __m256i field_length = _mm256_sub_epi32( _mm256_set_m128i(field_length_hi, field_length_lo), field_offset_within_row); process_8_values_fn(i * unroll, row_ptr_base, @@ -84,14 +90,21 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu auto row_ptr_base_i64 = reinterpret_cast(row_ptr_base); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. __m256i row_offset_lo = _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); + // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 + // 64-bit row offsets. __m256i row_offset_hi = _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); + // Prepare the lower/higher 4 64-bit end array offsets based on the lower/higher 4 + // 64-bit row offsets. __m256i end_array_offset_lo = _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset); __m256i end_array_offset_hi = @@ -124,9 +137,14 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu 0x4e); // Swapping low and high 128-bits field_length = _mm256_sub_epi32(field_length, field_offset_within_row); + field_offset_within_row_A = + _mm256_add_epi32(field_offset_within_row_A, alignment_padding); + field_offset_within_row_B = + _mm256_add_epi32(field_offset_within_row_B, alignment_padding); + process_8_values_fn(i * unroll, row_ptr_base, - _mm256_add_epi64(row_offset_lo, field_offset_within_row), - _mm256_add_epi64(row_offset_hi, field_offset_within_row), + _mm256_add_epi64(row_offset_lo, field_offset_within_row_A), + _mm256_add_epi64(row_offset_hi, field_offset_within_row_B), field_length); } } @@ -145,12 +163,19 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu // const uint8_t* row_ptr_base = rows.data(1); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); + // Widen the 32-bit row ids to 64-bit and store the lower/higher 4 of them into 2 + // 256-bit registers. __m256i row_id_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id)); __m256i row_id_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id, 1)); + // Calculate the lower/higher 4 64-bit row offsets based on the lower/higher 4 + // 64-bit row ids and the fixed field length. __m256i row_offset_lo = _mm256_mul_epi32(row_id_lo, field_length); __m256i row_offset_hi = _mm256_mul_epi32(row_id_hi, field_length); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. __m256i field_offset_lo = _mm256_add_epi64(row_offset_lo, field_offset_within_row); __m256i field_offset_hi = @@ -164,14 +189,19 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu const uint8_t* row_ptr_base = rows.data(2); const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. __m256i row_offset_lo = _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); __m256i row_offset_hi = _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. __m256i field_offset_lo = _mm256_add_epi64(row_offset_lo, field_offset_within_row); __m256i field_offset_hi = diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index a23545804fe..3a564474717 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -206,13 +206,19 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } + // Widen the 32-bit row ids to 64-bit and store the first/last 4 of them into 2 + // 256-bit registers. __m256i irow_right_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right)); __m256i irow_right_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1)); + // Calculate the lower/higher 4 64-bit row offsets based on the lower/higher 4 + // 64-bit row ids and the fixed length. __m256i offset_right_lo = _mm256_mul_epi32(irow_right_lo, _mm256_set1_epi64x(fixed_length)); __m256i offset_right_hi = _mm256_mul_epi32(irow_right_hi, _mm256_set1_epi64x(fixed_length)); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. offset_right_lo = _mm256_add_epi64(offset_right_lo, _mm256_set1_epi64x(offset_within_row)); offset_right_hi = @@ -247,12 +253,16 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( auto offsets_right_i64 = reinterpret_cast(offsets_right); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. __m256i offset_right_lo = _mm256_i32gather_epi64(offsets_right_i64, _mm256_castsi256_si128(irow_right), sizeof(RowTableImpl::offset_type)); __m256i offset_right_hi = _mm256_i32gather_epi64( offsets_right_i64, _mm256_extracti128_si256(irow_right, 1), sizeof(RowTableImpl::offset_type)); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. offset_right_lo = _mm256_add_epi64(offset_right_lo, _mm256_set1_epi64x(offset_within_row)); offset_right_hi = From f23f706d37ad2001ff7fffe3214577ae38fbc3dc Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 15 Aug 2024 00:35:13 +0800 Subject: [PATCH 32/34] Add comments about swiss join avx2 not wired --- cpp/src/arrow/acero/swiss_join_avx2.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index c3b932f5960..e6be49ae33d 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -23,6 +23,9 @@ namespace arrow { namespace acero { +// TODO(GH-43693): The functions in this file are not wired anywhere. We may consider +// actually utilizing them or removing them. + template int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, From 0082bca268626ffcdea3ae15617c2c02f50ccfa7 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Fri, 16 Aug 2024 18:14:27 +0800 Subject: [PATCH 33/34] Update doc string for compare columns to rows api --- cpp/src/arrow/compute/row/compare_internal.cc | 12 ++++++--- cpp/src/arrow/compute/row/compare_internal.h | 27 ++++++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 6f0cffdaeda..5e1a87b7952 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -337,7 +337,13 @@ void KeyCompare::CompareColumnsToRows( const RowTableImpl& rows, bool are_cols_in_encoding_order, uint8_t* out_match_bitvector_maybe_null) { if (num_rows_to_compare == 0) { - *out_num_rows = 0; + if (out_match_bitvector_maybe_null) { + DCHECK_EQ(out_num_rows, nullptr); + DCHECK_EQ(out_sel_left_maybe_same, nullptr); + bit_util::ClearBitmap(out_match_bitvector_maybe_null, 0, num_rows_to_compare); + } else { + *out_num_rows = 0; + } return; } @@ -443,8 +449,8 @@ void KeyCompare::CompareColumnsToRows( match_bytevector_A, match_bitvector); if (out_match_bitvector_maybe_null) { - ARROW_DCHECK(out_num_rows == nullptr); - ARROW_DCHECK(out_sel_left_maybe_same == nullptr); + DCHECK_EQ(out_num_rows, nullptr); + DCHECK_EQ(out_sel_left_maybe_same, nullptr); memcpy(out_match_bitvector_maybe_null, match_bitvector, bit_util::BytesForBits(num_rows_to_compare)); } else { diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index a5a109b0b51..29d7f859e59 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -42,9 +42,30 @@ class ARROW_EXPORT KeyCompare { /*extra=*/util::MiniBatch::kMiniBatchLength; } - // Returns a single 16-bit selection vector of rows that failed comparison. - // If there is input selection on the left, the resulting selection is a filtered image - // of input selection. + /// \brief Compare a batch of rows in columnar format to the specified rows in row + /// format. + /// + /// The comparison result is populated in either a 16-bit selection vector of rows that + /// failed comparison, or a match bitvector with 1 for matched rows and 0 otherwise. + /// + /// @param num_rows_to_compare The number of rows to compare. + /// @param sel_left_maybe_null Optional input selection vector on the left, the + /// comparison is only performed on the selected rows. Null if all rows in + /// `left_to_right_map` are to be compared. + /// @param left_to_right_map The mapping from the left to the right rows. Left row `i` + /// in `cols` is compared to right row `left_to_right_map[i]` in `row`. + /// @param ctx The light context needed for the comparison. + /// @param out_num_rows The number of rows that failed comparison. Must be null if + /// `out_match_bitvector_maybe_null` is not null. + /// @param out_sel_left_maybe_same The selection vector of rows that failed comparison. + /// Can be the same as `sel_left_maybe_null` for in-place update. Must be null if + /// `out_match_bitvector_maybe_null` is not null. + /// @param cols The left rows in columnar format to compare. + /// @param rows The right rows in row format to compare. + /// @param are_cols_in_encoding_order Whether the columns are in encoding order. + /// @param out_match_bitvector_maybe_null The optional output match bitvector, 1 for + /// matched rows and 0 otherwise. Won't be populated if `out_num_rows` and + /// `out_sel_left_maybe_same` are not null. static void CompareColumnsToRows( uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, uint32_t* out_num_rows, From ded3e676b6b20ddea81cc50366787b72a636fde0 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Fri, 16 Aug 2024 18:34:02 +0800 Subject: [PATCH 34/34] Static assert the size of offset type in row table at where assumes it --- cpp/src/arrow/acero/swiss_join_avx2.cc | 3 +++ cpp/src/arrow/compute/row/compare_internal_avx2.cc | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index e6be49ae33d..e42b0b40445 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -49,6 +49,9 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); const RowTableImpl::offset_type* row_offsets = rows.offsets(); + static_assert( + sizeof(RowTableImpl::offset_type) == sizeof(int64_t), + "RowArrayAccessor::Visit_avx2 only supports 64-bit RowTableImpl::offset_type"); if (varbinary_column_id == 0) { // Case 1: This is the first varbinary column diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 3a564474717..96eed6fc03a 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -251,6 +251,9 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } + static_assert(sizeof(RowTableImpl::offset_type) == sizeof(int64_t), + "KeyCompare::CompareBinaryColumnToRowHelper_avx2 only supports " + "64-bit RowTableImpl::offset_type"); auto offsets_right_i64 = reinterpret_cast(offsets_right); // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit @@ -415,6 +418,9 @@ inline uint64_t Compare8_Binary_avx2(uint32_t length, const uint8_t* left_base, if (use_selection) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(irow_left_array), irow_left); } + static_assert( + sizeof(RowTableImpl::offset_type) * 4 == sizeof(__m256i), + "Unexpected RowTableImpl::offset_type size in KeyCompare::Compare8_Binary_avx2"); _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right_lo); _mm256_storeu_si256(reinterpret_cast<__m256i*>(&offset_right_array[4]), offset_right_hi);