From e15d80bed8a6f4c0c06008f9b43ba4ef22023722 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 15 Jan 2025 01:05:22 +0800 Subject: [PATCH 01/23] Reproduce the payload overflow issue --- cpp/src/arrow/acero/hash_join_node_test.cc | 86 ++++++++++++++++++++++ cpp/src/arrow/acero/swiss_join_internal.h | 2 + 2 files changed, 88 insertions(+) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 7dbed7163da..587769187bb 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3448,5 +3448,91 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { num_batches_left * num_rows_per_batch_left * num_batches_right); } +TEST(HashJoin, GH44513) { + // // Minimal reproduce-able set 1. + // const int64_t num_large_rows = 360449051; + // const int64_t num_batches = 4; + // const int64_t num_batches_multiplier = 2; + + const int64_t num_large_rows = 360449051; + const int64_t num_batches = 1024; + const int64_t num_batches_multiplier = 2; + + auto small_schema = + schema({field("key0", int64()), field("key1", int64()), field("key2", int64())}); + auto large_schema = schema({field("key0", int64()), field("key1", int64()), + field("key2", int64()), field("payload", int64())}); + + const int64_t key0_match = static_cast(88506230299); + const int64_t key1_match = static_cast(16556030299); + const int64_t key2_match = 11240299; + const int64_t payload_match = 42; + + ASSERT_OK_AND_ASSIGN(auto small_key0_arr, + Constant(MakeScalar(key0_match))->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto small_key1_arr, + Constant(MakeScalar(key1_match))->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto small_key2_arr, + Constant(MakeScalar(key2_match))->Generate(1)); + ExecBatch small_batch({small_key0_arr, small_key1_arr, small_key2_arr}, 1); + + const int64_t seed = 42; + auto large_unmatch_key_arr = RandomArrayGenerator(seed).Int64( + num_large_rows / num_batches, key2_match + 1, + key2_match + 1 + 8); + // ASSERT_OK_AND_ASSIGN( + // auto large_unmatch_key_arr, + // gen::StepInt64(key2_match + 1, 1)->Generate(num_large_rows / num_batches)); + // ASSERT_OK_AND_ASSIGN( + // auto large_unmatch_key_arr, + // Constant(MakeScalar(key2_match + 1))->Generate(num_large_rows / num_batches)); + // num_large_rows / num_batches, key2_match + 1, + // std::numeric_limits::max()); + ASSERT_OK_AND_ASSIGN(auto large_unmatch_payload_arr, + MakeArrayOfNull(int64(), num_large_rows / num_batches)); + ExecBatch large_unmatch_batch({large_unmatch_key_arr, large_unmatch_key_arr, + large_unmatch_key_arr, large_unmatch_payload_arr}, + num_large_rows / num_batches); + + auto large_match_key0_arr = small_key0_arr; + auto large_match_key1_arr = small_key1_arr; + auto large_match_key2_arr = small_key2_arr; + ASSERT_OK_AND_ASSIGN(auto large_match_payload_arr, + Constant(MakeScalar(payload_match))->Generate(1)); + ExecBatch large_match_batch({large_match_key0_arr, large_match_key1_arr, + large_match_key2_arr, large_match_payload_arr}, + 1); + + auto small_batches = + BatchesWithSchema{std::vector{small_batch}, small_schema}; + auto large_batches = BatchesWithSchema{ + std::vector(num_batches * num_batches_multiplier, large_unmatch_batch), + large_schema}; + large_batches.batches.push_back(large_match_batch); + // auto large_batches = + // BatchesWithSchema{std::vector{large_match_batch}, large_schema}; + // for (int i = 0; i < num_batches * num_batches_multiplier; i++) { + // large_batches.batches.push_back(large_unmatch_batch); + // } + + { + Declaration small_source{ + "exec_batch_source", + ExecBatchSourceNodeOptions(small_batches.schema, small_batches.batches)}; + Declaration large_source{ + "exec_batch_source", + ExecBatchSourceNodeOptions(large_batches.schema, large_batches.batches)}; + + HashJoinNodeOptions join_opts(JoinType::INNER, + /*left_keys=*/{"key0", "key1", "key2"}, + /*right_keys=*/{"key0", "key1", "key2"}); + Declaration join{ + "hashjoin", {std::move(small_source), std::move(large_source)}, join_opts}; + + auto result = DeclarationToTable(std::move(join)).ValueOrDie(); + std::cout << result->ToString() << std::endl; + } +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index f2f3ac5b1bf..8965ade0ce3 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -108,6 +108,7 @@ class RowArrayAccessor { if (field_length == 0) { field_length = 1; } + // int64_t row_length = rows.metadata().fixed_length; uint32_t row_length = rows.metadata().fixed_length; bool is_fixed_length_row = rows.metadata().is_fixed_length; @@ -143,6 +144,7 @@ class RowArrayAccessor { static void VisitNulls(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { const uint8_t* null_masks = rows.null_masks(); + // int64_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); for (int i = 0; i < num_rows; ++i) { From 06bcc5e4e4044c9a5a9dd62fdd7e09df1ecf0426 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 22 Jan 2025 18:20:20 +0800 Subject: [PATCH 02/23] Refine tests --- cpp/src/arrow/acero/hash_join_node_test.cc | 167 +++++++++++---------- 1 file changed, 86 insertions(+), 81 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 3fc0521e33e..cad7c09fbdb 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3449,90 +3449,95 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { num_batches_left * num_rows_per_batch_left * num_batches_right); } -TEST(HashJoin, GH44513) { - // // Minimal reproduce-able set 1. - // const int64_t num_large_rows = 360449051; - // const int64_t num_batches = 4; - // const int64_t num_batches_multiplier = 2; - - const int64_t num_large_rows = 360449051; - const int64_t num_batches = 1024; - const int64_t num_batches_multiplier = 2; - - auto small_schema = - schema({field("key0", int64()), field("key1", int64()), field("key2", int64())}); - auto large_schema = schema({field("key0", int64()), field("key1", int64()), - field("key2", int64()), field("payload", int64())}); - - const int64_t key0_match = static_cast(88506230299); - const int64_t key1_match = static_cast(16556030299); - const int64_t key2_match = 11240299; - const int64_t payload_match = 42; - - ASSERT_OK_AND_ASSIGN(auto small_key0_arr, - Constant(MakeScalar(key0_match))->Generate(1)); - ASSERT_OK_AND_ASSIGN(auto small_key1_arr, - Constant(MakeScalar(key1_match))->Generate(1)); - ASSERT_OK_AND_ASSIGN(auto small_key2_arr, - Constant(MakeScalar(key2_match))->Generate(1)); - ExecBatch small_batch({small_key0_arr, small_key1_arr, small_key2_arr}, 1); +TEST(HashJoin, LARGE_MEMORY_TEST(BuildSidePayloadOver4GB)) { + const int64_t num_match_rows = 32; + const int64_t num_rows_per_match_batch = 32; + const int64_t num_match_batches = num_match_rows / num_rows_per_match_batch; + + const int64_t num_unmatch_rows_large = 720898048; + const int64_t num_rows_per_unmatch_batch_large = 352001; + const int64_t num_unmatch_batches_large = + num_unmatch_rows_large / num_rows_per_unmatch_batch_large; + + auto schema_small = schema({field("small_key0", int64()), field("small_key1", int64()), + field("small_key2", int64())}); + auto schema_large = + schema({field("large_key0", int64()), field("large_key1", int64()), + field("large_key2", int64()), field("large_payload", int64())}); + + const int64_t match_key0 = static_cast(88506230299); + const int64_t match_key1 = static_cast(16556030299); + const int64_t match_key2 = 11240299; + const int64_t match_payload = 42; + + // Match arrays of length num_rows_per_match_batch. + ASSERT_OK_AND_ASSIGN( + auto match_key0_arr, + Constant(MakeScalar(match_key0))->Generate(num_rows_per_match_batch)); + ASSERT_OK_AND_ASSIGN( + auto match_key1_arr, + Constant(MakeScalar(match_key1))->Generate(num_rows_per_match_batch)); + ASSERT_OK_AND_ASSIGN( + auto match_key2_arr, + Constant(MakeScalar(match_key2))->Generate(num_rows_per_match_batch)); + ASSERT_OK_AND_ASSIGN( + auto match_payload_arr, + Constant(MakeScalar(match_payload))->Generate(num_rows_per_match_batch)); + // Small batch. + ExecBatch batch_small({match_key0_arr, match_key1_arr, match_key2_arr}, + num_rows_per_match_batch); + + // Large unmatch batch. const int64_t seed = 42; - auto large_unmatch_key_arr = RandomArrayGenerator(seed).Int64( - num_large_rows / num_batches, key2_match + 1, - key2_match + 1 + 8); - // ASSERT_OK_AND_ASSIGN( - // auto large_unmatch_key_arr, - // gen::StepInt64(key2_match + 1, 1)->Generate(num_large_rows / num_batches)); - // ASSERT_OK_AND_ASSIGN( - // auto large_unmatch_key_arr, - // Constant(MakeScalar(key2_match + 1))->Generate(num_large_rows / num_batches)); - // num_large_rows / num_batches, key2_match + 1, - // std::numeric_limits::max()); - ASSERT_OK_AND_ASSIGN(auto large_unmatch_payload_arr, - MakeArrayOfNull(int64(), num_large_rows / num_batches)); - ExecBatch large_unmatch_batch({large_unmatch_key_arr, large_unmatch_key_arr, - large_unmatch_key_arr, large_unmatch_payload_arr}, - num_large_rows / num_batches); - - auto large_match_key0_arr = small_key0_arr; - auto large_match_key1_arr = small_key1_arr; - auto large_match_key2_arr = small_key2_arr; - ASSERT_OK_AND_ASSIGN(auto large_match_payload_arr, - Constant(MakeScalar(payload_match))->Generate(1)); - ExecBatch large_match_batch({large_match_key0_arr, large_match_key1_arr, - large_match_key2_arr, large_match_payload_arr}, - 1); - - auto small_batches = - BatchesWithSchema{std::vector{small_batch}, small_schema}; - auto large_batches = BatchesWithSchema{ - std::vector(num_batches * num_batches_multiplier, large_unmatch_batch), - large_schema}; - large_batches.batches.push_back(large_match_batch); - // auto large_batches = - // BatchesWithSchema{std::vector{large_match_batch}, large_schema}; - // for (int i = 0; i < num_batches * num_batches_multiplier; i++) { - // large_batches.batches.push_back(large_unmatch_batch); - // } + auto unmatch_key_arr_large = RandomArrayGenerator(seed).Int64( + num_rows_per_unmatch_batch_large, /*min=*/match_key2 + 1, + /*max=*/match_key2 + 1 + 8); + ASSERT_OK_AND_ASSIGN(auto unmatch_payload_arr_large, + MakeArrayOfNull(int64(), num_rows_per_unmatch_batch_large)); + ExecBatch unmatch_batch_large({unmatch_key_arr_large, unmatch_key_arr_large, + unmatch_key_arr_large, unmatch_payload_arr_large}, + num_rows_per_unmatch_batch_large); + // Large match batch. + ExecBatch match_batch_large( + {match_key0_arr, match_key1_arr, match_key2_arr, match_payload_arr}, + num_rows_per_match_batch); + + // Batches with schemas. + auto batches_small = BatchesWithSchema{ + std::vector(num_match_batches, batch_small), schema_small}; + auto batches_large = BatchesWithSchema{ + std::vector(num_unmatch_batches_large, unmatch_batch_large), + schema_large}; + for (int i = 0; i < num_match_batches; i++) { + batches_large.batches.push_back(match_batch_large); + } + + Declaration source_small{ + "exec_batch_source", + ExecBatchSourceNodeOptions(batches_small.schema, batches_small.batches)}; + Declaration source_large{ + "exec_batch_source", + ExecBatchSourceNodeOptions(batches_large.schema, batches_large.batches)}; + + HashJoinNodeOptions join_opts( + JoinType::INNER, + /*left_keys=*/{"small_key0", "small_key1", "small_key2"}, + /*right_keys=*/{"large_key0", "large_key1", "large_key2"}); + Declaration join{ + "hashjoin", {std::move(source_small), std::move(source_large)}, join_opts}; - { - Declaration small_source{ - "exec_batch_source", - ExecBatchSourceNodeOptions(small_batches.schema, small_batches.batches)}; - Declaration large_source{ - "exec_batch_source", - ExecBatchSourceNodeOptions(large_batches.schema, large_batches.batches)}; - - HashJoinNodeOptions join_opts(JoinType::INNER, - /*left_keys=*/{"key0", "key1", "key2"}, - /*right_keys=*/{"key0", "key1", "key2"}); - Declaration join{ - "hashjoin", {std::move(small_source), std::move(large_source)}, join_opts}; - - auto result = DeclarationToTable(std::move(join)).ValueOrDie(); - std::cout << result->ToString() << std::endl; - } + // Join should emit num_match_rows * num_match_rows rows. + ASSERT_OK_AND_ASSIGN(auto batches_result, DeclarationToExecBatches(std::move(join))); + Declaration result{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_result.schema), + std::move(batches_result.batches))}; + AssertRowCountEq(result, num_match_rows * num_match_rows); + + // The payload should all be match_payload. + auto predicate = equal(field_ref("large_payload"), literal(match_payload)); + Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; + AssertRowCountEq(std::move(filter), num_match_rows * num_match_rows); } } // namespace acero From 2cdd4c2c8775fcd6af65055894527e6268b93d6d Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 22 Jan 2025 19:25:32 +0800 Subject: [PATCH 03/23] Replace overflow-prone null mask access --- cpp/src/arrow/acero/swiss_join.cc | 12 ++++++++---- cpp/src/arrow/acero/swiss_join_avx2.cc | 2 +- cpp/src/arrow/acero/swiss_join_internal.h | 10 +++------- cpp/src/arrow/compute/row/compare_internal.cc | 10 ++-------- .../arrow/compute/row/compare_internal_avx2.cc | 4 ++-- cpp/src/arrow/compute/row/encode_internal.cc | 4 ++-- cpp/src/arrow/compute/row/row_internal.cc | 10 +++++++--- cpp/src/arrow/compute/row/row_internal.h | 16 ++++++++++++++-- 8 files changed, 39 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index fc3be1b462e..1be7af9fd1f 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -564,14 +564,18 @@ void RowArrayMerge::CopyNulls(RowTableImpl* target, const RowTableImpl& source, const int64_t* source_rows_permutation) { int64_t num_source_rows = source.length(); int num_bytes_per_row = target->metadata().null_masks_bytes_per_row; - uint8_t* target_nulls = target->null_masks() + num_bytes_per_row * first_target_row_id; + DCHECK_LE(first_target_row_id, std::numeric_limits::max()); + uint8_t* target_nulls = + target->null_masks(static_cast(first_target_row_id), /*col_pos=*/0); if (!source_rows_permutation) { - memcpy(target_nulls, source.null_masks(), num_bytes_per_row * num_source_rows); + memcpy(target_nulls, source.null_masks(/*row_id=*/0, /*col_pos=*/0), + num_bytes_per_row * num_source_rows); } else { - for (int64_t i = 0; i < num_source_rows; ++i) { + for (uint32_t i = 0; i < num_source_rows; ++i) { int64_t source_row_id = source_rows_permutation[i]; + DCHECK_LE(source_row_id, std::numeric_limits::max()); const uint8_t* source_nulls = - source.null_masks() + num_bytes_per_row * source_row_id; + source.null_masks(static_cast(source_row_id), /*col_pos=*/0); for (int64_t byte = 0; byte < num_bytes_per_row; ++byte) { *target_nulls++ = *source_nulls++; } diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 1d6b7eda6e6..4374b44c66b 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -237,7 +237,7 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, // constexpr int kUnroll = 8; - const uint8_t* null_masks = rows.null_masks(); + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); __m256i null_bits_per_row = _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row); __m256i pos_after_encoding = diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 8965ade0ce3..6eceefd8639 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -108,8 +108,8 @@ class RowArrayAccessor { if (field_length == 0) { field_length = 1; } - // int64_t row_length = rows.metadata().fixed_length; - uint32_t row_length = rows.metadata().fixed_length; + int64_t row_length = rows.metadata().fixed_length; + // uint32_t row_length = rows.metadata().fixed_length; bool is_fixed_length_row = rows.metadata().is_fixed_length; if (is_fixed_length_row) { @@ -143,14 +143,10 @@ class RowArrayAccessor { template static void VisitNulls(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { - const uint8_t* null_masks = rows.null_masks(); - // int64_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; - uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; - int64_t bit_id = row_id * null_mask_num_bytes * 8 + pos_after_encoding; - process_value_fn(i, bit_util::GetBit(null_masks, bit_id) ? 0xff : 0); + process_value_fn(i, rows.is_null(row_id, pos_after_encoding) ? 0xff : 0); } } diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 5e1a87b7952..72e5bb967de 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -55,13 +55,10 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com if (!col.data(0)) { // Remove rows from the result for which the column value is a null - const uint8_t* null_masks = rows.null_masks(); - uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; - int64_t bitid = irow_right * null_mask_num_bytes * 8 + null_bit_id; - match_bytevector[i] &= (bit_util::GetBit(null_masks, bitid) ? 0 : 0xff); + match_bytevector[i] &= (rows.is_null(irow_right, null_bit_id) ? 0 : 0xff); } } else if (!rows.has_any_nulls(ctx)) { // Remove rows from the result for which the column value on left side is @@ -74,15 +71,12 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com bit_util::GetBit(non_nulls, irow_left + col.bit_offset(0)) ? 0xff : 0; } } else { - const uint8_t* null_masks = rows.null_masks(); - uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; const uint8_t* non_nulls = col.data(0); ARROW_DCHECK(non_nulls); for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; - int64_t bitid_right = irow_right * null_mask_num_bytes * 8 + null_bit_id; - int right_null = bit_util::GetBit(null_masks, bitid_right) ? 0xff : 0; + int right_null = rows.is_null(irow_right, null_bit_id) ? 0xff : 0; int left_null = bit_util::GetBit(non_nulls, irow_left + col.bit_offset(0)) ? 0 : 0xff; match_bytevector[i] |= left_null & right_null; diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 9f6e1adfe21..9ac78fb9e4f 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -49,7 +49,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( if (!col.data(0)) { // Remove rows from the result for which the column value is a null - const uint8_t* null_masks = rows.null_masks(); + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; uint32_t num_processed = 0; @@ -117,7 +117,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( } return num_processed; } else { - const uint8_t* null_masks = rows.null_masks(); + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; const uint8_t* non_nulls = col.data(0); ARROW_DCHECK(non_nulls); diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 127d43021d6..6397554b08b 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -824,7 +824,7 @@ void EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, const RowTableI DCHECK(col.mutable_data(0) || col.metadata().is_null_type); } - const uint8_t* null_masks = rows.null_masks(); + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row; for (size_t col = 0; col < cols->size(); ++col) { if ((*cols)[col].metadata().is_null_type) { @@ -882,7 +882,7 @@ void EncoderVarBinary::EncodeSelected(uint32_t ivarbinary, RowTableImpl* rows, void EncoderNulls::EncodeSelected(RowTableImpl* rows, const std::vector& cols, uint32_t num_selected, const uint16_t* selection) { - uint8_t* null_masks = rows->null_masks(); + uint8_t* null_masks = rows->null_masks(/*row_id=*/0, /*col_pos=*/0); uint32_t null_mask_num_bytes = rows->metadata().null_masks_bytes_per_row; memset(null_masks, 0, null_mask_num_bytes * num_selected); for (size_t icol = 0; icol < cols.size(); ++icol) { diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index aa7e62add45..ed1d06be265 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -406,10 +406,14 @@ bool RowTableImpl::has_any_nulls(const LightContext* ctx) const { return true; } if (num_rows_for_has_any_nulls_ < num_rows_) { - auto size_per_row = metadata().null_masks_bytes_per_row; + DCHECK_LE(num_rows_for_has_any_nulls_, std::numeric_limits::max()); + int64_t num_bytes = + metadata().null_masks_bytes_per_row * (num_rows_ - num_rows_for_has_any_nulls_); + DCHECK_LE(num_bytes, std::numeric_limits::max()); has_any_nulls_ = !util::bit_util::are_all_bytes_zero( - ctx->hardware_flags, null_masks() + size_per_row * num_rows_for_has_any_nulls_, - static_cast(size_per_row * (num_rows_ - num_rows_for_has_any_nulls_))); + ctx->hardware_flags, + null_masks(static_cast(num_rows_for_has_any_nulls_), /*col_pos=*/0), + static_cast(num_bytes)); num_rows_for_has_any_nulls_ = num_rows_; } return has_any_nulls_; diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 3ab86fd1fc6..207f7b0e247 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -220,8 +220,20 @@ class ARROW_EXPORT RowTableImpl { offset_type* mutable_offsets() { return reinterpret_cast(mutable_data(1)); } - const uint8_t* null_masks() const { return null_masks_->data(); } - uint8_t* null_masks() { return null_masks_->mutable_data(); } + const uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) const { + return null_masks_->data() + + static_cast(row_id) * metadata_.null_masks_bytes_per_row + col_pos; + } + uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) { + return null_masks_->mutable_data() + + static_cast(row_id) * metadata_.null_masks_bytes_per_row + col_pos; + } + + bool is_null(uint32_t row_id, uint32_t col_pos) const { + return bit_util::GetBit( + null_masks_->data(), + static_cast(row_id) * metadata_.null_masks_bytes_per_row * 8 + col_pos); + } /// \brief True if there is a null value anywhere in the table /// From 7f0ea141cf4bb24f33556e9b27e4e700d4548eac Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 22 Jan 2025 20:32:22 +0800 Subject: [PATCH 04/23] Private buffer accessor and use dedicated interfaces --- cpp/src/arrow/acero/swiss_join.cc | 24 ++++++---- cpp/src/arrow/acero/swiss_join_avx2.cc | 7 +-- cpp/src/arrow/acero/swiss_join_internal.h | 10 ++-- cpp/src/arrow/compute/row/compare_internal.cc | 6 +-- .../compute/row/compare_internal_avx2.cc | 6 +-- cpp/src/arrow/compute/row/compare_test.cc | 6 +-- cpp/src/arrow/compute/row/encode_internal.cc | 40 +++++++--------- cpp/src/arrow/compute/row/encode_internal.h | 10 ++-- .../arrow/compute/row/encode_internal_avx2.cc | 15 +++--- cpp/src/arrow/compute/row/row_internal.h | 47 ++++++++++++++----- cpp/src/arrow/compute/row/row_test.cc | 13 ++--- 11 files changed, 100 insertions(+), 84 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 1be7af9fd1f..2cc0f870afc 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -477,14 +477,15 @@ void RowArrayMerge::CopyFixedLength(RowTableImpl* target, const RowTableImpl& so const int64_t* source_rows_permutation) { int64_t num_source_rows = source.length(); - int64_t fixed_length = target->metadata().fixed_length; + uint32_t fixed_length = target->metadata().fixed_length; // Permutation of source rows is optional. Without permutation all that is // needed is memcpy. // if (!source_rows_permutation) { - memcpy(target->mutable_data(1) + fixed_length * first_target_row_id, source.data(1), - fixed_length * num_source_rows); + DCHECK_LE(first_target_row_id, std::numeric_limits::max()); + memcpy(target->mutable_fixed_length_rows(static_cast(first_target_row_id)), + source.fixed_length_rows(/*row_id=*/0), fixed_length * num_source_rows); } else { // Row length must be a multiple of 64-bits due to enforced alignment. // Loop for each output row copying a fixed number of 64-bit words. @@ -494,10 +495,13 @@ void RowArrayMerge::CopyFixedLength(RowTableImpl* target, const RowTableImpl& so int64_t num_words_per_row = fixed_length / sizeof(uint64_t); for (int64_t i = 0; i < num_source_rows; ++i) { int64_t source_row_id = source_rows_permutation[i]; + DCHECK_LE(source_row_id, std::numeric_limits::max()); const uint64_t* source_row_ptr = reinterpret_cast( - source.data(1) + fixed_length * source_row_id); + source.fixed_length_rows(static_cast(source_row_id))); + int64_t target_row_id = first_target_row_id + i; + DCHECK_LE(target_row_id, std::numeric_limits::max()); uint64_t* target_row_ptr = reinterpret_cast( - target->mutable_data(1) + fixed_length * (first_target_row_id + i)); + target->mutable_fixed_length_rows(static_cast(target_row_id))); for (int64_t word = 0; word < num_words_per_row; ++word) { target_row_ptr[word] = source_row_ptr[word]; @@ -529,16 +533,16 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& // We can simply memcpy bytes of rows if their order has not changed. // - memcpy(target->mutable_data(2) + target_offsets[first_target_row_id], source.data(2), - source_offsets[num_source_rows] - source_offsets[0]); + memcpy(target->mutable_var_length_rows() + target_offsets[first_target_row_id], + source.var_length_rows(), source_offsets[num_source_rows] - source_offsets[0]); } else { int64_t target_row_offset = first_target_row_offset; - uint64_t* target_row_ptr = - reinterpret_cast(target->mutable_data(2) + target_row_offset); + uint64_t* target_row_ptr = reinterpret_cast( + target->mutable_var_length_rows() + target_row_offset); for (int64_t i = 0; i < num_source_rows; ++i) { int64_t source_row_id = source_rows_permutation[i]; const uint64_t* source_row_ptr = reinterpret_cast( - source.data(2) + source_offsets[source_row_id]); + source.var_length_rows() + source_offsets[source_row_id]); int64_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; // Though the row offset is 64-bit, the length of a single row must be 32-bit as // required by current row table implementation. diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 4374b44c66b..18c35cbb7ac 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -46,7 +46,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); - const uint8_t* row_ptr_base = rows.data(2); + const uint8_t* row_ptr_base = rows.var_length_rows(); const RowTableImpl::offset_type* row_offsets = rows.offsets(); auto row_offsets_i64 = reinterpret_cast(row_offsets); @@ -172,7 +172,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (is_fixed_length_row) { // Case 3: This is a fixed length column in fixed length row // - const uint8_t* row_ptr_base = rows.data(1); + const uint8_t* row_ptr_base = rows.fixed_length_rows(/*row_id=*/0); for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = @@ -197,7 +197,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu } else { // Case 4: This is a fixed length column in varying length row // - const uint8_t* row_ptr_base = rows.data(2); + const uint8_t* row_ptr_base = rows.var_length_rows(); const RowTableImpl::offset_type* row_offsets = rows.offsets(); auto row_offsets_i64 = reinterpret_cast(row_offsets); @@ -237,6 +237,7 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, // constexpr int kUnroll = 8; + // TODO: Fix this. const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); __m256i null_bits_per_row = _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row); diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 6eceefd8639..85f443b0323 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -72,7 +72,7 @@ class RowArrayAccessor { if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); - const uint8_t* row_ptr_base = rows.data(2); + const uint8_t* row_ptr_base = rows.var_length_rows(); const RowTableImpl::offset_type* row_offsets = rows.offsets(); uint32_t field_offset_within_row, field_length; @@ -108,23 +108,21 @@ class RowArrayAccessor { if (field_length == 0) { field_length = 1; } - int64_t row_length = rows.metadata().fixed_length; - // uint32_t row_length = rows.metadata().fixed_length; bool is_fixed_length_row = rows.metadata().is_fixed_length; if (is_fixed_length_row) { // Case 3: This is a fixed length column in a fixed length row // - const uint8_t* row_ptr_base = rows.data(1) + field_offset_within_row; for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_length * row_id; + const uint8_t* row_ptr = + rows.fixed_length_rows(row_id) + field_offset_within_row; process_value_fn(i, row_ptr, field_length); } } else { // Case 4: This is a fixed length column in a varying length row // - const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; + const uint8_t* row_ptr_base = rows.var_length_rows() + field_offset_within_row; const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 72e5bb967de..b7a01ea75ad 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -95,7 +95,7 @@ void KeyCompare::CompareBinaryColumnToRowHelper( if (is_fixed_length) { uint32_t fixed_length = rows.metadata().fixed_length; const uint8_t* rows_left = col.data(1); - const uint8_t* rows_right = rows.data(1); + const uint8_t* rows_right = rows.fixed_length_rows(/*row_id=*/0); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; // irow_right is used to index into row data so promote to the row offset type. @@ -107,7 +107,7 @@ void KeyCompare::CompareBinaryColumnToRowHelper( } else { const uint8_t* rows_left = col.data(1); const RowTableImpl::offset_type* offsets_right = rows.offsets(); - const uint8_t* rows_right = rows.data(2); + const uint8_t* rows_right = rows.var_length_rows(); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; @@ -240,7 +240,7 @@ void KeyCompare::CompareVarBinaryColumnToRowHelper( const uint32_t* offsets_left = col.offsets(); const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_left = col.data(2); - const uint8_t* rows_right = rows.data(2); + const uint8_t* rows_right = rows.var_length_rows(); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 9ac78fb9e4f..2d65cb9d90e 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -189,7 +189,7 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( if (is_fixed_length) { uint32_t fixed_length = rows.metadata().fixed_length; const uint8_t* rows_left = col.data(1); - const uint8_t* rows_right = rows.data(1); + const uint8_t* rows_right = rows.fixed_length_rows(/*row_id=*/0); constexpr uint32_t unroll = 8; __m256i irow_left = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { @@ -234,7 +234,7 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( } else { const uint8_t* rows_left = col.data(1); const RowTableImpl::offset_type* offsets_right = rows.offsets(); - const uint8_t* rows_right = rows.data(2); + const uint8_t* rows_right = rows.var_length_rows(); constexpr uint32_t unroll = 8; __m256i irow_left = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { @@ -554,7 +554,7 @@ void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( const uint32_t* offsets_left = col.offsets(); const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_left = col.data(2); - const uint8_t* rows_right = rows.data(2); + const uint8_t* rows_right = rows.var_length_rows(); for (uint32_t i = 0; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 5e8ee7c58a7..2b8f4d97561 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -327,7 +327,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { ASSERT_OK_AND_ASSIGN(RowTableImpl row_table_right, MakeRowTableFromExecBatch(batch_left)); // The row table must contain an offset buffer. - ASSERT_NE(row_table_right.data(2), NULLPTR); + ASSERT_NE(row_table_right.var_length_rows(), NULLPTR); // The whole point of this test. ASSERT_GT(row_table_right.offsets()[num_rows - 1], k2GB); @@ -387,7 +387,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), num_rows_row_table)); // The row table must not contain a third buffer. - ASSERT_EQ(row_table_right.data(2), NULLPTR); + ASSERT_EQ(row_table_right.var_length_rows(), NULLPTR); // The row data must be greater than 4GB. ASSERT_GT(row_table_right.buffer_size(1), k4GB); @@ -460,7 +460,7 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBVarLength)) { RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), num_rows_row_table)); // The row table must contain an offset buffer. - ASSERT_NE(row_table_right.data(2), NULLPTR); + ASSERT_NE(row_table_right.var_length_rows(), NULLPTR); // At least the last row should be located at over 4GB. ASSERT_GT(row_table_right.offsets()[num_rows_row_table - 1], k4GB); diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 6397554b08b..b0a57708f44 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -260,36 +260,32 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, col_prep.metadata().fixed_length == rows.metadata().fixed_length) { DCHECK_EQ(offset_within_row, 0); uint32_t row_size = rows.metadata().fixed_length; - memcpy(col_prep.mutable_data(1), rows.data(1) + start_row * row_size, - num_rows * row_size); + memcpy(col_prep.mutable_data(1), rows.fixed_length_rows(start_row), + static_cast(num_rows) * row_size); } else if (rows.metadata().is_fixed_length) { - uint32_t row_size = rows.metadata().fixed_length; - const uint8_t* row_base = - rows.data(1) + static_cast(start_row) * row_size; - row_base += offset_within_row; uint8_t* col_base = col_prep.mutable_data(1); switch (col_prep.metadata().fixed_length) { case 1: for (uint32_t i = 0; i < num_rows; ++i) { - col_base[i] = row_base[i * row_size]; + col_base[i] = *rows.fixed_length_rows(start_row + i); } break; case 2: for (uint32_t i = 0; i < num_rows; ++i) { reinterpret_cast(col_base)[i] = - *reinterpret_cast(row_base + i * row_size); + *reinterpret_cast(rows.fixed_length_rows(start_row + i)); } break; case 4: for (uint32_t i = 0; i < num_rows; ++i) { reinterpret_cast(col_base)[i] = - *reinterpret_cast(row_base + i * row_size); + *reinterpret_cast(rows.fixed_length_rows(start_row + i)); } break; case 8: for (uint32_t i = 0; i < num_rows; ++i) { reinterpret_cast(col_base)[i] = - *reinterpret_cast(row_base + i * row_size); + *reinterpret_cast(rows.fixed_length_rows(start_row + i)); } break; default: @@ -297,7 +293,7 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, } } else { const RowTableImpl::offset_type* row_offsets = rows.offsets() + start_row; - const uint8_t* row_base = rows.data(2); + const uint8_t* row_base = rows.var_length_rows(); row_base += offset_within_row; uint8_t* col_base = col_prep.mutable_data(1); switch (col_prep.metadata().fixed_length) { @@ -343,14 +339,14 @@ void EncoderBinary::EncodeSelectedImp(uint32_t offset_within_row, RowTableImpl* if (is_fixed_length) { uint32_t row_width = rows->metadata().fixed_length; const uint8_t* src_base = col.data(1); - uint8_t* dst = rows->mutable_data(1) + offset_within_row; + uint8_t* dst = rows->mutable_fixed_length_rows(/*row_id=*/0) + offset_within_row; for (uint32_t i = 0; i < num_selected; ++i) { copy_fn(dst, src_base, selection[i]); dst += row_width; } if (col.data(0)) { const uint8_t* non_null_bits = col.data(0); - uint8_t* dst = rows->mutable_data(1) + offset_within_row; + dst = rows->mutable_fixed_length_rows(/*row_id=*/0) + offset_within_row; for (uint32_t i = 0; i < num_selected; ++i) { bool is_null = !bit_util::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); if (is_null) { @@ -361,14 +357,14 @@ void EncoderBinary::EncodeSelectedImp(uint32_t offset_within_row, RowTableImpl* } } else { const uint8_t* src_base = col.data(1); - uint8_t* dst = rows->mutable_data(2) + offset_within_row; + uint8_t* dst = rows->mutable_var_length_rows() + offset_within_row; const RowTableImpl::offset_type* offsets = rows->offsets(); for (uint32_t i = 0; i < num_selected; ++i) { copy_fn(dst + offsets[i], src_base, selection[i]); } if (col.data(0)) { const uint8_t* non_null_bits = col.data(0); - uint8_t* dst = rows->mutable_data(2) + offset_within_row; + uint8_t* dst = rows->mutable_var_length_rows() + offset_within_row; const RowTableImpl::offset_type* offsets = rows->offsets(); for (uint32_t i = 0; i < num_selected; ++i) { bool is_null = !bit_util::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); @@ -584,16 +580,13 @@ void EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, uint8_t* dst_A = col1->mutable_data(1); uint8_t* dst_B = col2->mutable_data(1); - uint32_t fixed_length = rows.metadata().fixed_length; const RowTableImpl::offset_type* offsets; const uint8_t* src_base; if (is_row_fixed_length) { - src_base = rows.data(1) + - static_cast(start_row) * fixed_length + - offset_within_row; + src_base = rows.fixed_length_rows(start_row) + offset_within_row; offsets = nullptr; } else { - src_base = rows.data(2) + offset_within_row; + src_base = rows.var_length_rows() + offset_within_row; offsets = rows.offsets() + start_row; } @@ -601,6 +594,7 @@ void EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, using col2_type_const = typename std::add_const::type; if (is_row_fixed_length) { + uint32_t fixed_length = rows.metadata().fixed_length; const uint8_t* src = src_base + num_rows_to_skip * fixed_length; for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) { reinterpret_cast(dst_A)[i] = *reinterpret_cast(src); @@ -654,7 +648,7 @@ void EncoderOffsets::Decode(uint32_t start_row, uint32_t num_rows, for (uint32_t i = 0; i < num_rows; ++i) { // Find the beginning of cumulative lengths array for next row - const uint8_t* row = rows.data(2) + row_offsets[i]; + const uint8_t* row = rows.var_length_rows() + row_offsets[i]; const uint32_t* varbinary_ends = rows.metadata().varbinary_end_array(row); // Update the offset of each column @@ -728,7 +722,7 @@ void EncoderOffsets::EncodeSelectedImp(uint32_t ivarbinary, RowTableImpl* rows, const std::vector& cols, uint32_t num_selected, const uint16_t* selection) { const RowTableImpl::offset_type* row_offsets = rows->offsets(); - uint8_t* row_base = rows->mutable_data(2) + + uint8_t* row_base = rows->mutable_var_length_rows() + rows->metadata().varbinary_end_array_offset + ivarbinary * sizeof(uint32_t); const uint32_t* col_offsets = cols[ivarbinary].offsets(); @@ -853,7 +847,7 @@ void EncoderVarBinary::EncodeSelected(uint32_t ivarbinary, RowTableImpl* rows, const KeyColumnArray& cols, uint32_t num_selected, const uint16_t* selection) { const RowTableImpl::offset_type* row_offsets = rows->offsets(); - uint8_t* row_base = rows->mutable_data(2); + uint8_t* row_base = rows->mutable_var_length_rows(); const uint32_t* col_offsets = cols.offsets(); const uint8_t* col_base = cols.data(2); diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 37538fcc4b8..5ad82e0c8e7 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -164,11 +164,10 @@ class EncoderBinary { uint32_t col_width = col_const->metadata().fixed_length; if (is_row_fixed_length) { - uint32_t row_width = rows_const->metadata().fixed_length; for (uint32_t i = 0; i < num_rows; ++i) { const uint8_t* src; uint8_t* dst; - src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row; + src = rows_const->fixed_length_rows(start_row + i) + offset_within_row; dst = col_mutable_maybe_null->mutable_data(1) + col_width * i; copy_fn(dst, src, col_width); } @@ -177,7 +176,8 @@ class EncoderBinary { for (uint32_t i = 0; i < num_rows; ++i) { const uint8_t* src; uint8_t* dst; - src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row; + src = rows_const->var_length_rows() + row_offsets[start_row + i] + + offset_within_row; dst = col_mutable_maybe_null->mutable_data(1) + col_width * i; copy_fn(dst, src, col_width); } @@ -277,7 +277,7 @@ class EncoderVarBinary { col_offset_next = col_offsets[i + 1]; RowTableImpl::offset_type row_offset = row_offsets_for_batch[i]; - const uint8_t* row = rows_const->data(2) + row_offset; + const uint8_t* row = rows_const->var_length_rows() + row_offset; uint32_t offset_within_row; uint32_t length; @@ -293,7 +293,7 @@ class EncoderVarBinary { const uint8_t* src; uint8_t* dst; - src = rows_const->data(2) + row_offset; + src = rows_const->var_length_rows() + row_offset; dst = col_mutable_maybe_null->mutable_data(2) + col_offset; copy_fn(dst, src, length); } diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc b/cpp/src/arrow/compute/row/encode_internal_avx2.cc index d2e317deb89..da29eecf7c5 100644 --- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc @@ -75,14 +75,9 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows uint32_t fixed_length = rows.metadata().fixed_length; const RowTableImpl::offset_type* offsets; - const uint8_t* src_base; if (is_row_fixed_length) { - src_base = rows.data(1) + - static_cast(fixed_length) * start_row + - offset_within_row; offsets = nullptr; } else { - src_base = rows.data(2) + offset_within_row; offsets = rows.offsets() + start_row; } @@ -94,14 +89,15 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows for (uint32_t i = 0; i < num_rows / unroll; ++i) { const __m128i *src0, *src1, *src2, *src3; if (is_row_fixed_length) { - const uint8_t* src = src_base + (i * unroll) * fixed_length; + const uint8_t* src = + rows.fixed_length_rows(start_row + i * unroll) + offset_within_row; src0 = reinterpret_cast(src); src1 = reinterpret_cast(src + fixed_length); src2 = reinterpret_cast(src + fixed_length * 2); src3 = reinterpret_cast(src + fixed_length * 3); } else { + const uint8_t* src = rows.fixed_length_rows(/*row_id=*/0) + offset_within_row; const RowTableImpl::offset_type* row_offsets = offsets + i * unroll; - const uint8_t* src = src_base; src0 = reinterpret_cast(src + row_offsets[0]); src1 = reinterpret_cast(src + row_offsets[1]); src2 = reinterpret_cast(src + row_offsets[2]); @@ -127,7 +123,8 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows uint8_t buffer[64]; for (uint32_t i = 0; i < num_rows / unroll; ++i) { if (is_row_fixed_length) { - const uint8_t* src = src_base + (i * unroll) * fixed_length; + const uint8_t* src = + rows.fixed_length_rows(start_row + i * unroll) + offset_within_row; for (int j = 0; j < unroll; ++j) { if (col_width == 1) { reinterpret_cast(buffer)[j] = @@ -141,8 +138,8 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows } } } else { + const uint8_t* src = rows.fixed_length_rows(/*row_id=*/0) + offset_within_row; const RowTableImpl::offset_type* row_offsets = offsets + i * unroll; - const uint8_t* src = src_base; for (int j = 0; j < unroll; ++j) { if (col_width == 1) { reinterpret_cast(buffer)[j] = diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 207f7b0e247..a73e6044541 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -199,27 +199,32 @@ class ARROW_EXPORT RowTableImpl { const RowTableMetadata& metadata() const { return metadata_; } /// \brief The number of rows stored in the table int64_t length() const { return num_rows_; } - // Accessors into the table's buffers - const uint8_t* data(int i) const { - ARROW_DCHECK(i >= 0 && i < kMaxBuffers); - if (ARROW_PREDICT_TRUE(buffers_[i])) { - return buffers_[i]->data(); - } - return NULLPTR; + + const uint8_t* var_length_rows() const { + ARROW_DCHECK(!metadata_.is_fixed_length); + return data(2); } - uint8_t* mutable_data(int i) { - ARROW_DCHECK(i >= 0 && i < kMaxBuffers); - if (ARROW_PREDICT_TRUE(buffers_[i])) { - return buffers_[i]->mutable_data(); - } - return NULLPTR; + uint8_t* mutable_var_length_rows() { + ARROW_DCHECK(!metadata_.is_fixed_length); + return mutable_data(2); } + + const uint8_t* fixed_length_rows(uint32_t row_id) const { + ARROW_DCHECK(metadata_.is_fixed_length); + return data(1) + static_cast(row_id) * metadata_.fixed_length; + } + uint8_t* mutable_fixed_length_rows(uint32_t row_id) { + ARROW_DCHECK(metadata_.is_fixed_length); + return mutable_data(1) + static_cast(row_id) * metadata_.fixed_length; + } + const offset_type* offsets() const { return reinterpret_cast(data(1)); } offset_type* mutable_offsets() { return reinterpret_cast(mutable_data(1)); } + const uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) const { return null_masks_->data() + static_cast(row_id) * metadata_.null_masks_bytes_per_row + col_pos; @@ -249,6 +254,22 @@ class ARROW_EXPORT RowTableImpl { } private: + // Accessors into the table's buffers + const uint8_t* data(int i) const { + ARROW_DCHECK(i >= 0 && i < kMaxBuffers); + if (ARROW_PREDICT_TRUE(buffers_[i])) { + return buffers_[i]->data(); + } + return NULLPTR; + } + uint8_t* mutable_data(int i) { + ARROW_DCHECK(i >= 0 && i < kMaxBuffers); + if (ARROW_PREDICT_TRUE(buffers_[i])) { + return buffers_[i]->mutable_data(); + } + return NULLPTR; + } + /// \brief Resize the fixed length buffers to store `num_extra_rows` more rows. The /// fixed length buffers are buffers_[0] for null masks, buffers_[1] for row data if the /// row is fixed length, or for row offsets otherwise. diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 5057ce91b5b..0267adcde1e 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -92,9 +92,10 @@ TEST(RowTableMemoryConsumption, Encode) { ASSERT_OK_AND_ASSIGN(auto row_table, MakeRowTableFromColumn(col, num_rows, dt->byte_width(), /*string_alignment=*/0)); - ASSERT_NE(row_table.data(0), NULLPTR); - ASSERT_NE(row_table.data(1), NULLPTR); - ASSERT_EQ(row_table.data(2), NULLPTR); + ASSERT_NE(row_table.null_masks(/*row_id=*/0, /*col_pos=*/0), NULLPTR); + ASSERT_NE(row_table.fixed_length_rows(/*row_id=*/0), NULLPTR); + // TODO: May fail. + ASSERT_EQ(row_table.var_length_rows(), NULLPTR); int64_t actual_null_mask_size = num_rows * row_table.metadata().null_masks_bytes_per_row; @@ -113,9 +114,9 @@ TEST(RowTableMemoryConsumption, Encode) { SCOPED_TRACE("encoding var length column of " + std::to_string(num_rows) + " rows"); ASSERT_OK_AND_ASSIGN(auto row_table, MakeRowTableFromColumn(var_length_column, num_rows, 4, 4)); - ASSERT_NE(row_table.data(0), NULLPTR); - ASSERT_NE(row_table.data(1), NULLPTR); - ASSERT_NE(row_table.data(2), NULLPTR); + ASSERT_NE(row_table.null_masks(/*row_id=*/0, /*col_pos=*/0), NULLPTR); + ASSERT_NE(row_table.offsets(), NULLPTR); + ASSERT_NE(row_table.var_length_rows(), NULLPTR); int64_t actual_null_mask_size = num_rows * row_table.metadata().null_masks_bytes_per_row; From f2f3535fde361b4f7377cb0ce04655c7c743ddcd Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 01:03:32 +0800 Subject: [PATCH 05/23] Refine and fix --- .../arrow/compute/row/encode_internal_avx2.cc | 4 +- cpp/src/arrow/compute/row/row_internal.h | 49 ++++++++++--------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc b/cpp/src/arrow/compute/row/encode_internal_avx2.cc index da29eecf7c5..650d24b8efc 100644 --- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc @@ -96,7 +96,7 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows src2 = reinterpret_cast(src + fixed_length * 2); src3 = reinterpret_cast(src + fixed_length * 3); } else { - const uint8_t* src = rows.fixed_length_rows(/*row_id=*/0) + offset_within_row; + const uint8_t* src = rows.var_length_rows() + offset_within_row; const RowTableImpl::offset_type* row_offsets = offsets + i * unroll; src0 = reinterpret_cast(src + row_offsets[0]); src1 = reinterpret_cast(src + row_offsets[1]); @@ -138,7 +138,7 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows } } } else { - const uint8_t* src = rows.fixed_length_rows(/*row_id=*/0) + offset_within_row; + const uint8_t* src = rows.var_length_rows() + offset_within_row; const RowTableImpl::offset_type* row_offsets = offsets + i * unroll; for (int j = 0; j < unroll; ++j) { if (col_width == 1) { diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index a73e6044541..5d803b77fee 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -200,44 +200,45 @@ class ARROW_EXPORT RowTableImpl { /// \brief The number of rows stored in the table int64_t length() const { return num_rows_; } - const uint8_t* var_length_rows() const { - ARROW_DCHECK(!metadata_.is_fixed_length); - return data(2); + inline const uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) const { + return data(0) + static_cast(row_id) * metadata_.null_masks_bytes_per_row + + col_pos; } - uint8_t* mutable_var_length_rows() { - ARROW_DCHECK(!metadata_.is_fixed_length); - return mutable_data(2); + inline uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) { + return mutable_data(0) + + static_cast(row_id) * metadata_.null_masks_bytes_per_row + col_pos; + } + inline bool is_null(uint32_t row_id, uint32_t col_pos) const { + return bit_util::GetBit( + null_masks_->data(), + static_cast(row_id) * metadata_.null_masks_bytes_per_row * 8 + col_pos); } - const uint8_t* fixed_length_rows(uint32_t row_id) const { + inline const uint8_t* fixed_length_rows(uint32_t row_id) const { ARROW_DCHECK(metadata_.is_fixed_length); return data(1) + static_cast(row_id) * metadata_.fixed_length; } - uint8_t* mutable_fixed_length_rows(uint32_t row_id) { + inline uint8_t* mutable_fixed_length_rows(uint32_t row_id) { ARROW_DCHECK(metadata_.is_fixed_length); return mutable_data(1) + static_cast(row_id) * metadata_.fixed_length; } - const offset_type* offsets() const { + inline const offset_type* offsets() const { + ARROW_DCHECK(!metadata_.is_fixed_length); return reinterpret_cast(data(1)); } - offset_type* mutable_offsets() { + inline offset_type* mutable_offsets() { + ARROW_DCHECK(!metadata_.is_fixed_length); return reinterpret_cast(mutable_data(1)); } - const uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) const { - return null_masks_->data() + - static_cast(row_id) * metadata_.null_masks_bytes_per_row + col_pos; - } - uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) { - return null_masks_->mutable_data() + - static_cast(row_id) * metadata_.null_masks_bytes_per_row + col_pos; + inline const uint8_t* var_length_rows() const { + ARROW_DCHECK(!metadata_.is_fixed_length); + return data(2); } - - bool is_null(uint32_t row_id, uint32_t col_pos) const { - return bit_util::GetBit( - null_masks_->data(), - static_cast(row_id) * metadata_.null_masks_bytes_per_row * 8 + col_pos); + inline uint8_t* mutable_var_length_rows() { + ARROW_DCHECK(!metadata_.is_fixed_length); + return mutable_data(2); } /// \brief True if there is a null value anywhere in the table @@ -255,14 +256,14 @@ class ARROW_EXPORT RowTableImpl { private: // Accessors into the table's buffers - const uint8_t* data(int i) const { + inline const uint8_t* data(int i) const { ARROW_DCHECK(i >= 0 && i < kMaxBuffers); if (ARROW_PREDICT_TRUE(buffers_[i])) { return buffers_[i]->data(); } return NULLPTR; } - uint8_t* mutable_data(int i) { + inline uint8_t* mutable_data(int i) { ARROW_DCHECK(i >= 0 && i < kMaxBuffers); if (ARROW_PREDICT_TRUE(buffers_[i])) { return buffers_[i]->mutable_data(); From 9b1e9086acebc2867d08b497d727f14394894c9f Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 01:20:52 +0800 Subject: [PATCH 06/23] Fix avx2 visit null overflow --- cpp/src/arrow/acero/swiss_join_avx2.cc | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 18c35cbb7ac..1419a4aaa5e 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -237,22 +237,26 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, // constexpr int kUnroll = 8; - // TODO: Fix this. const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); __m256i null_bits_per_row = - _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row); + _mm256_set1_epi64x(8 * rows.metadata().null_masks_bytes_per_row); __m256i pos_after_encoding = - _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id)); + _mm256_set1_epi64x(rows.metadata().pos_after_encoding(column_id)); + __m256i bit_in_word = + _mm256_set1_epi32(1 << (rows.metadata().pos_after_encoding(column_id) & 7)); for (int i = 0; i < num_rows / kUnroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row); - bit_id = _mm256_add_epi32(bit_id, pos_after_encoding); - __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi32(bit_id, 3), 1); - __m256i bit_in_word = _mm256_sllv_epi32( - _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7))); - // `result` will contain one 32-bit word per tested null bit, either 0xffffffff if the - // null bit was set or 0 if it was unset. + __m256i row_id_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id)); + __m256i row_id_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id, 1)); + __m256i bit_id_lo = _mm256_mul_epi32(row_id_lo, null_bits_per_row); + __m256i bit_id_hi = _mm256_mul_epi32(row_id_hi, null_bits_per_row); + bit_id_lo = _mm256_add_epi64(bit_id_lo, pos_after_encoding); + bit_id_hi = _mm256_add_epi64(bit_id_hi, pos_after_encoding); + __m128i bytes_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_lo, 3), 1); + __m128i bytes_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_hi, 3), 1); + __m256i bytes = _mm256_set_m128i(bytes_hi, bytes_lo); __m256i result = _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word); // NB: Be careful about sign-extension when casting the return value of From c004237340a1be3110062dfcd1798a39ff22a24f Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 01:24:46 +0800 Subject: [PATCH 07/23] Remove useless assertion --- cpp/src/arrow/compute/row/row_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 0267adcde1e..7dbbf76fdc8 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -94,8 +94,6 @@ TEST(RowTableMemoryConsumption, Encode) { /*string_alignment=*/0)); ASSERT_NE(row_table.null_masks(/*row_id=*/0, /*col_pos=*/0), NULLPTR); ASSERT_NE(row_table.fixed_length_rows(/*row_id=*/0), NULLPTR); - // TODO: May fail. - ASSERT_EQ(row_table.var_length_rows(), NULLPTR); int64_t actual_null_mask_size = num_rows * row_table.metadata().null_masks_bytes_per_row; From 18d818893e943128347e5262994ed6545e38f515 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 11:08:44 +0800 Subject: [PATCH 08/23] Remove col_pos from null_masks() arguments --- cpp/src/arrow/acero/swiss_join.cc | 7 +++---- cpp/src/arrow/acero/swiss_join_avx2.cc | 2 +- cpp/src/arrow/compute/row/compare_internal_avx2.cc | 6 ++++-- cpp/src/arrow/compute/row/encode_internal.cc | 6 ++++-- cpp/src/arrow/compute/row/row_internal.cc | 2 +- cpp/src/arrow/compute/row/row_internal.h | 9 ++++----- cpp/src/arrow/compute/row/row_test.cc | 4 ++-- 7 files changed, 19 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 2cc0f870afc..ba9cb543a89 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -569,17 +569,16 @@ void RowArrayMerge::CopyNulls(RowTableImpl* target, const RowTableImpl& source, int64_t num_source_rows = source.length(); int num_bytes_per_row = target->metadata().null_masks_bytes_per_row; DCHECK_LE(first_target_row_id, std::numeric_limits::max()); - uint8_t* target_nulls = - target->null_masks(static_cast(first_target_row_id), /*col_pos=*/0); + uint8_t* target_nulls = target->null_masks(static_cast(first_target_row_id)); if (!source_rows_permutation) { - memcpy(target_nulls, source.null_masks(/*row_id=*/0, /*col_pos=*/0), + memcpy(target_nulls, source.null_masks(/*row_id=*/0), num_bytes_per_row * num_source_rows); } else { for (uint32_t i = 0; i < num_source_rows; ++i) { int64_t source_row_id = source_rows_permutation[i]; DCHECK_LE(source_row_id, std::numeric_limits::max()); const uint8_t* source_nulls = - source.null_masks(static_cast(source_row_id), /*col_pos=*/0); + source.null_masks(static_cast(source_row_id)); for (int64_t byte = 0; byte < num_bytes_per_row; ++byte) { *target_nulls++ = *source_nulls++; } diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 1419a4aaa5e..49afae8beed 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -237,7 +237,7 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, // constexpr int kUnroll = 8; - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); __m256i null_bits_per_row = _mm256_set1_epi64x(8 * rows.metadata().null_masks_bytes_per_row); __m256i pos_after_encoding = diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 2d65cb9d90e..effe5a5710a 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -49,7 +49,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( if (!col.data(0)) { // Remove rows from the result for which the column value is a null - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; uint32_t num_processed = 0; @@ -64,6 +64,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( irow_right = _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } + // TODO: Fix this. __m256i bitid = _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(null_mask_num_bytes * 8)); bitid = _mm256_add_epi32(bitid, _mm256_set1_epi32(null_bit_id)); @@ -117,7 +118,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( } return num_processed; } else { - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; const uint8_t* non_nulls = col.data(0); ARROW_DCHECK(non_nulls); @@ -147,6 +148,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( left_null = _mm256_cmpeq_epi32(_mm256_and_si256(left, bits), _mm256_setzero_si256()); } + // TODO: Fix this. __m256i bitid = _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(null_mask_num_bytes * 8)); bitid = _mm256_add_epi32(bitid, _mm256_set1_epi32(null_bit_id)); diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index b0a57708f44..a880416f6d9 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -818,7 +818,8 @@ void EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, const RowTableI DCHECK(col.mutable_data(0) || col.metadata().is_null_type); } - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0, /*col_pos=*/0); + // TODO: Fix this. + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row; for (size_t col = 0; col < cols->size(); ++col) { if ((*cols)[col].metadata().is_null_type) { @@ -876,7 +877,8 @@ void EncoderVarBinary::EncodeSelected(uint32_t ivarbinary, RowTableImpl* rows, void EncoderNulls::EncodeSelected(RowTableImpl* rows, const std::vector& cols, uint32_t num_selected, const uint16_t* selection) { - uint8_t* null_masks = rows->null_masks(/*row_id=*/0, /*col_pos=*/0); + // TODO: Fix this. + uint8_t* null_masks = rows->null_masks(/*row_id=*/0); uint32_t null_mask_num_bytes = rows->metadata().null_masks_bytes_per_row; memset(null_masks, 0, null_mask_num_bytes * num_selected); for (size_t icol = 0; icol < cols.size(); ++icol) { diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index ed1d06be265..492cc71ac49 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -412,7 +412,7 @@ bool RowTableImpl::has_any_nulls(const LightContext* ctx) const { DCHECK_LE(num_bytes, std::numeric_limits::max()); has_any_nulls_ = !util::bit_util::are_all_bytes_zero( ctx->hardware_flags, - null_masks(static_cast(num_rows_for_has_any_nulls_), /*col_pos=*/0), + null_masks(static_cast(num_rows_for_has_any_nulls_)), static_cast(num_bytes)); num_rows_for_has_any_nulls_ = num_rows_; } diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 5d803b77fee..85c8ad78bd8 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -200,13 +200,12 @@ class ARROW_EXPORT RowTableImpl { /// \brief The number of rows stored in the table int64_t length() const { return num_rows_; } - inline const uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) const { - return data(0) + static_cast(row_id) * metadata_.null_masks_bytes_per_row + - col_pos; + inline const uint8_t* null_masks(uint32_t row_id) const { + return data(0) + static_cast(row_id) * metadata_.null_masks_bytes_per_row; } - inline uint8_t* null_masks(uint32_t row_id, uint32_t col_pos) { + inline uint8_t* null_masks(uint32_t row_id) { return mutable_data(0) + - static_cast(row_id) * metadata_.null_masks_bytes_per_row + col_pos; + static_cast(row_id) * metadata_.null_masks_bytes_per_row; } inline bool is_null(uint32_t row_id, uint32_t col_pos) const { return bit_util::GetBit( diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 7dbbf76fdc8..49d8f2a9afe 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -92,7 +92,7 @@ TEST(RowTableMemoryConsumption, Encode) { ASSERT_OK_AND_ASSIGN(auto row_table, MakeRowTableFromColumn(col, num_rows, dt->byte_width(), /*string_alignment=*/0)); - ASSERT_NE(row_table.null_masks(/*row_id=*/0, /*col_pos=*/0), NULLPTR); + ASSERT_NE(row_table.null_masks(/*row_id=*/0), NULLPTR); ASSERT_NE(row_table.fixed_length_rows(/*row_id=*/0), NULLPTR); int64_t actual_null_mask_size = @@ -112,7 +112,7 @@ TEST(RowTableMemoryConsumption, Encode) { SCOPED_TRACE("encoding var length column of " + std::to_string(num_rows) + " rows"); ASSERT_OK_AND_ASSIGN(auto row_table, MakeRowTableFromColumn(var_length_column, num_rows, 4, 4)); - ASSERT_NE(row_table.null_masks(/*row_id=*/0, /*col_pos=*/0), NULLPTR); + ASSERT_NE(row_table.null_masks(/*row_id=*/0), NULLPTR); ASSERT_NE(row_table.offsets(), NULLPTR); ASSERT_NE(row_table.var_length_rows(), NULLPTR); From ba24a03e587e33b272eaf9900b7501bd5bbe9994 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 12:02:54 +0800 Subject: [PATCH 09/23] Fix compare avx2 using null masks --- .../compute/row/compare_internal_avx2.cc | 50 ++++++++++++------- cpp/src/arrow/compute/row/encode_internal.cc | 8 +-- cpp/src/arrow/compute/row/row_internal.h | 4 +- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index effe5a5710a..30d45bb34b0 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -46,6 +46,8 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( const uint32_t null_bit_id = ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); + __m256i pos_after_encoding = _mm256_set1_epi64x(null_bit_id); + __m256i bit_in_right = _mm256_set1_epi32(1 << (null_bit_id & 7)); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -64,15 +66,21 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( irow_right = _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } - // TODO: Fix this. - __m256i bitid = - _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(null_mask_num_bytes * 8)); - bitid = _mm256_add_epi32(bitid, _mm256_set1_epi32(null_bit_id)); - __m256i right = - _mm256_i32gather_epi32((const int*)null_masks, _mm256_srli_epi32(bitid, 3), 1); - right = _mm256_and_si256( - _mm256_set1_epi32(1), - _mm256_srlv_epi32(right, _mm256_and_si256(bitid, _mm256_set1_epi32(7)))); + __m256i irow_right_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right)); + __m256i irow_right_hi = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1)); + __m256i bit_id_lo = + _mm256_mul_epi32(irow_right_lo, _mm256_set1_epi64x(null_mask_num_bytes * 8)); + __m256i bit_id_hi = + _mm256_mul_epi32(irow_right_hi, _mm256_set1_epi64x(null_mask_num_bytes * 8)); + bit_id_lo = _mm256_add_epi64(bit_id_lo, pos_after_encoding); + bit_id_hi = _mm256_add_epi64(bit_id_hi, pos_after_encoding); + __m128i right_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_lo, 3), 1); + __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_hi, 3), 1); + __m256i right = _mm256_set_m128i(right_hi, right_lo); + right = _mm256_and_si256(right, bit_in_right); __m256i cmp = _mm256_cmpeq_epi32(right, _mm256_setzero_si256()); uint32_t result_lo = _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); @@ -148,15 +156,21 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( left_null = _mm256_cmpeq_epi32(_mm256_and_si256(left, bits), _mm256_setzero_si256()); } - // TODO: Fix this. - __m256i bitid = - _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(null_mask_num_bytes * 8)); - bitid = _mm256_add_epi32(bitid, _mm256_set1_epi32(null_bit_id)); - __m256i right = - _mm256_i32gather_epi32((const int*)null_masks, _mm256_srli_epi32(bitid, 3), 1); - right = _mm256_and_si256( - _mm256_set1_epi32(1), - _mm256_srlv_epi32(right, _mm256_and_si256(bitid, _mm256_set1_epi32(7)))); + __m256i irow_right_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right)); + __m256i irow_right_hi = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1)); + __m256i bit_id_lo = + _mm256_mul_epi32(irow_right_lo, _mm256_set1_epi64x(null_mask_num_bytes * 8)); + __m256i bit_id_hi = + _mm256_mul_epi32(irow_right_hi, _mm256_set1_epi64x(null_mask_num_bytes * 8)); + bit_id_lo = _mm256_add_epi64(bit_id_lo, pos_after_encoding); + bit_id_hi = _mm256_add_epi64(bit_id_hi, pos_after_encoding); + __m128i right_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_lo, 3), 1); + __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_hi, 3), 1); + __m256i right = _mm256_set_m128i(right_hi, right_lo); + right = _mm256_and_si256(right, bit_in_right); __m256i right_null = _mm256_cmpeq_epi32(right, _mm256_set1_epi32(1)); uint64_t left_null_64 = diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index a880416f6d9..3d89bce9c15 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -818,9 +818,6 @@ void EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, const RowTableI DCHECK(col.mutable_data(0) || col.metadata().is_null_type); } - // TODO: Fix this. - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); - uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row; for (size_t col = 0; col < cols->size(); ++col) { if ((*cols)[col].metadata().is_null_type) { continue; @@ -834,9 +831,7 @@ void EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, const RowTableI memset(non_nulls + 1, 0xff, bit_util::BytesForBits(num_rows - bits_in_first_byte)); } for (uint32_t row = 0; row < num_rows; ++row) { - uint32_t null_masks_bit_id = - (start_row + row) * null_masks_bytes_per_row * 8 + static_cast(col); - bool is_set = bit_util::GetBit(null_masks, null_masks_bit_id); + bool is_set = rows.is_null(start_row + row, static_cast(col)); if (is_set) { bit_util::ClearBit(non_nulls, bit_offset + row); } @@ -877,7 +872,6 @@ void EncoderVarBinary::EncodeSelected(uint32_t ivarbinary, RowTableImpl* rows, void EncoderNulls::EncodeSelected(RowTableImpl* rows, const std::vector& cols, uint32_t num_selected, const uint16_t* selection) { - // TODO: Fix this. uint8_t* null_masks = rows->null_masks(/*row_id=*/0); uint32_t null_mask_num_bytes = rows->metadata().null_masks_bytes_per_row; memset(null_masks, 0, null_mask_num_bytes * num_selected); diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 85c8ad78bd8..21f783f80cd 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -208,9 +208,7 @@ class ARROW_EXPORT RowTableImpl { static_cast(row_id) * metadata_.null_masks_bytes_per_row; } inline bool is_null(uint32_t row_id, uint32_t col_pos) const { - return bit_util::GetBit( - null_masks_->data(), - static_cast(row_id) * metadata_.null_masks_bytes_per_row * 8 + col_pos); + return bit_util::GetBit(null_masks(row_id), col_pos); } inline const uint8_t* fixed_length_rows(uint32_t row_id) const { From 22d6b1e53f5c9cb3083be3241ff16d327d833282 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 12:13:40 +0800 Subject: [PATCH 10/23] Refine tests --- cpp/src/arrow/acero/hash_join_node_test.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index cad7c09fbdb..fbe908f82fd 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3449,6 +3449,8 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { num_batches_left * num_rows_per_batch_left * num_batches_right); } +// GH-45334: The right side (the build side) payload column of the matching rows' are +// placed over 4GB, causing the index calculation overflow. TEST(HashJoin, LARGE_MEMORY_TEST(BuildSidePayloadOver4GB)) { const int64_t num_match_rows = 32; const int64_t num_rows_per_match_batch = 32; @@ -3465,8 +3467,8 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSidePayloadOver4GB)) { schema({field("large_key0", int64()), field("large_key1", int64()), field("large_key2", int64()), field("large_payload", int64())}); - const int64_t match_key0 = static_cast(88506230299); - const int64_t match_key1 = static_cast(16556030299); + const int64_t match_key0 = 88506230299LL; + const int64_t match_key1 = 16556030299LL; const int64_t match_key2 = 11240299; const int64_t match_payload = 42; From 88977532e38d35c7e862fffc18806c0274e66087 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 18:28:45 +0800 Subject: [PATCH 11/23] Refine test --- cpp/src/arrow/acero/hash_join_node_test.cc | 86 +++++++++++++--------- 1 file changed, 51 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index fbe908f82fd..e6c776810fe 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3449,11 +3449,15 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { num_batches_left * num_rows_per_batch_left * num_batches_right); } -// GH-45334: The right side (the build side) payload column of the matching rows' are -// placed over 4GB, causing the index calculation overflow. -TEST(HashJoin, LARGE_MEMORY_TEST(BuildSidePayloadOver4GB)) { - const int64_t num_match_rows = 32; - const int64_t num_rows_per_match_batch = 32; +// GH-45334: The row ids of the matching rows on the right side (the build side) are very +// big, causing the index calculation overflow. +TEST(HashJoin, BuildSideLargeRowIds) { + GTEST_SKIP() << "Test disabled due to excessively time and resource consuming, " + "for local debugging only."; + + // A fair amount of match rows to trigger both SIMD and non-SIMD code paths. + const int64_t num_match_rows = 35; + const int64_t num_rows_per_match_batch = 35; const int64_t num_match_batches = num_match_rows / num_rows_per_match_batch; const int64_t num_unmatch_rows_large = 720898048; @@ -3461,56 +3465,61 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSidePayloadOver4GB)) { const int64_t num_unmatch_batches_large = num_unmatch_rows_large / num_rows_per_unmatch_batch_large; - auto schema_small = schema({field("small_key0", int64()), field("small_key1", int64()), - field("small_key2", int64())}); + auto schema_small = + schema({field("small_key0", int64()), field("small_key1", int64()), + field("small_key2", int64()), field("small_payload", int64())}); auto schema_large = schema({field("large_key0", int64()), field("large_key1", int64()), field("large_key2", int64()), field("large_payload", int64())}); - const int64_t match_key0 = 88506230299LL; - const int64_t match_key1 = 16556030299LL; - const int64_t match_key2 = 11240299; + // A carefully chosen key value which hashes to 0xFFFFFFFC, making the match rows to be + // placed at higher address of the row table. + const int64_t match_key = 14036976; const int64_t match_payload = 42; // Match arrays of length num_rows_per_match_batch. ASSERT_OK_AND_ASSIGN( - auto match_key0_arr, - Constant(MakeScalar(match_key0))->Generate(num_rows_per_match_batch)); - ASSERT_OK_AND_ASSIGN( - auto match_key1_arr, - Constant(MakeScalar(match_key1))->Generate(num_rows_per_match_batch)); - ASSERT_OK_AND_ASSIGN( - auto match_key2_arr, - Constant(MakeScalar(match_key2))->Generate(num_rows_per_match_batch)); + auto match_key_arr, + Constant(MakeScalar(match_key))->Generate(num_rows_per_match_batch)); ASSERT_OK_AND_ASSIGN( auto match_payload_arr, Constant(MakeScalar(match_payload))->Generate(num_rows_per_match_batch)); + // Append 1 row of null to trigger null processing code paths. + ASSERT_OK_AND_ASSIGN(auto null_arr, MakeArrayOfNull(int64(), 1)); + ASSERT_OK_AND_ASSIGN(match_key_arr, Concatenate({match_key_arr, null_arr})); + ASSERT_OK_AND_ASSIGN(match_payload_arr, Concatenate({match_payload_arr, null_arr})); + // Match batch. + ExecBatch match_batch({match_key_arr, match_key_arr, match_key_arr, match_payload_arr}, + num_rows_per_match_batch + 1); // Small batch. - ExecBatch batch_small({match_key0_arr, match_key1_arr, match_key2_arr}, - num_rows_per_match_batch); + ExecBatch batch_small = match_batch; - // Large unmatch batch. + // Large unmatch batches. const int64_t seed = 42; - auto unmatch_key_arr_large = RandomArrayGenerator(seed).Int64( - num_rows_per_unmatch_batch_large, /*min=*/match_key2 + 1, - /*max=*/match_key2 + 1 + 8); + std::vector unmatch_batches_large; + unmatch_batches_large.reserve(num_unmatch_batches_large); ASSERT_OK_AND_ASSIGN(auto unmatch_payload_arr_large, MakeArrayOfNull(int64(), num_rows_per_unmatch_batch_large)); - ExecBatch unmatch_batch_large({unmatch_key_arr_large, unmatch_key_arr_large, - unmatch_key_arr_large, unmatch_payload_arr_large}, - num_rows_per_unmatch_batch_large); + int64_t unmatch_range_per_batch = + (std::numeric_limits::max() - match_key) / num_unmatch_batches_large; + for (int i = 0; i < num_unmatch_batches_large; ++i) { + auto unmatch_key_arr_large = RandomArrayGenerator(seed).Int64( + num_rows_per_unmatch_batch_large, + /*min=*/match_key + 1 + i * unmatch_range_per_batch, + /*max=*/match_key + 1 + (i + 1) * unmatch_range_per_batch); + unmatch_batches_large.push_back( + ExecBatch({unmatch_key_arr_large, unmatch_key_arr_large, unmatch_key_arr_large, + unmatch_payload_arr_large}, + num_rows_per_unmatch_batch_large)); + } // Large match batch. - ExecBatch match_batch_large( - {match_key0_arr, match_key1_arr, match_key2_arr, match_payload_arr}, - num_rows_per_match_batch); + ExecBatch match_batch_large = match_batch; // Batches with schemas. auto batches_small = BatchesWithSchema{ std::vector(num_match_batches, batch_small), schema_small}; - auto batches_large = BatchesWithSchema{ - std::vector(num_unmatch_batches_large, unmatch_batch_large), - schema_large}; + auto batches_large = BatchesWithSchema{std::move(unmatch_batches_large), schema_large}; for (int i = 0; i < num_match_batches; i++) { batches_large.batches.push_back(match_batch_large); } @@ -3536,8 +3545,15 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSidePayloadOver4GB)) { std::move(batches_result.batches))}; AssertRowCountEq(result, num_match_rows * num_match_rows); - // The payload should all be match_payload. - auto predicate = equal(field_ref("large_payload"), literal(match_payload)); + // All rows should be match_key/payload. + auto predicate = and_({equal(field_ref("small_key0"), literal(match_key)), + equal(field_ref("small_key1"), literal(match_key)), + equal(field_ref("small_key2"), literal(match_key)), + equal(field_ref("small_payload"), literal(match_payload)), + equal(field_ref("large_key0"), literal(match_key)), + equal(field_ref("large_key1"), literal(match_key)), + equal(field_ref("large_key2"), literal(match_key)), + equal(field_ref("large_payload"), literal(match_payload))}); Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; AssertRowCountEq(std::move(filter), num_match_rows * num_match_rows); } From ff4202b7def5c9f71fce446b1e7d18dfafed61d4 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 19:40:02 +0800 Subject: [PATCH 12/23] Enhance test --- cpp/src/arrow/acero/hash_join_node_test.cc | 30 ++++++++-------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index e6c776810fe..654fd59c45d 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3466,15 +3466,13 @@ TEST(HashJoin, BuildSideLargeRowIds) { num_unmatch_rows_large / num_rows_per_unmatch_batch_large; auto schema_small = - schema({field("small_key0", int64()), field("small_key1", int64()), - field("small_key2", int64()), field("small_payload", int64())}); + schema({field("small_key", int64()), field("small_payload", int64())}); auto schema_large = - schema({field("large_key0", int64()), field("large_key1", int64()), - field("large_key2", int64()), field("large_payload", int64())}); + schema({field("large_key", int64()), field("large_payload", int64())}); - // A carefully chosen key value which hashes to 0xFFFFFFFC, making the match rows to be + // A carefully chosen key value which hashes to 0xFFFFFFFE, making the match rows to be // placed at higher address of the row table. - const int64_t match_key = 14036976; + const int64_t match_key = 289339070; const int64_t match_payload = 42; // Match arrays of length num_rows_per_match_batch. @@ -3489,8 +3487,7 @@ TEST(HashJoin, BuildSideLargeRowIds) { ASSERT_OK_AND_ASSIGN(match_key_arr, Concatenate({match_key_arr, null_arr})); ASSERT_OK_AND_ASSIGN(match_payload_arr, Concatenate({match_payload_arr, null_arr})); // Match batch. - ExecBatch match_batch({match_key_arr, match_key_arr, match_key_arr, match_payload_arr}, - num_rows_per_match_batch + 1); + ExecBatch match_batch({match_key_arr, match_payload_arr}, num_rows_per_match_batch + 1); // Small batch. ExecBatch batch_small = match_batch; @@ -3509,8 +3506,7 @@ TEST(HashJoin, BuildSideLargeRowIds) { /*min=*/match_key + 1 + i * unmatch_range_per_batch, /*max=*/match_key + 1 + (i + 1) * unmatch_range_per_batch); unmatch_batches_large.push_back( - ExecBatch({unmatch_key_arr_large, unmatch_key_arr_large, unmatch_key_arr_large, - unmatch_payload_arr_large}, + ExecBatch({unmatch_key_arr_large, unmatch_payload_arr_large}, num_rows_per_unmatch_batch_large)); } // Large match batch. @@ -3531,10 +3527,8 @@ TEST(HashJoin, BuildSideLargeRowIds) { "exec_batch_source", ExecBatchSourceNodeOptions(batches_large.schema, batches_large.batches)}; - HashJoinNodeOptions join_opts( - JoinType::INNER, - /*left_keys=*/{"small_key0", "small_key1", "small_key2"}, - /*right_keys=*/{"large_key0", "large_key1", "large_key2"}); + HashJoinNodeOptions join_opts(JoinType::INNER, /*left_keys=*/{"small_key"}, + /*right_keys=*/{"large_key"}); Declaration join{ "hashjoin", {std::move(source_small), std::move(source_large)}, join_opts}; @@ -3546,13 +3540,9 @@ TEST(HashJoin, BuildSideLargeRowIds) { AssertRowCountEq(result, num_match_rows * num_match_rows); // All rows should be match_key/payload. - auto predicate = and_({equal(field_ref("small_key0"), literal(match_key)), - equal(field_ref("small_key1"), literal(match_key)), - equal(field_ref("small_key2"), literal(match_key)), + auto predicate = and_({equal(field_ref("small_key"), literal(match_key)), equal(field_ref("small_payload"), literal(match_payload)), - equal(field_ref("large_key0"), literal(match_key)), - equal(field_ref("large_key1"), literal(match_key)), - equal(field_ref("large_key2"), literal(match_key)), + equal(field_ref("large_key"), literal(match_key)), equal(field_ref("large_payload"), literal(match_payload))}); Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; AssertRowCountEq(std::move(filter), num_match_rows * num_match_rows); From 5e7f8631c1921ede4612dae98783996f1b9e2ab0 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 19:55:49 +0800 Subject: [PATCH 13/23] Fix --- cpp/src/arrow/compute/row/encode_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 3d89bce9c15..02af02ad014 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -267,7 +267,7 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, switch (col_prep.metadata().fixed_length) { case 1: for (uint32_t i = 0; i < num_rows; ++i) { - col_base[i] = *rows.fixed_length_rows(start_row + i); + col_base[i] = *rows.fixed_length_rows(start_row + i) + offset_within_row; } break; case 2: From c3b0ee7772db9944e59598fc95b991279b797d6e Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 20:12:10 +0800 Subject: [PATCH 14/23] Fix --- cpp/src/arrow/compute/key_hash_test.cc | 61 +++++++++++++++++++ .../compute/row/compare_internal_avx2.cc | 7 ++- cpp/src/arrow/testing/generator.cc | 32 ++++++++++ cpp/src/arrow/testing/generator.h | 3 + 4 files changed, 100 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index fdf6d212585..0eccdb688ce 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -24,6 +24,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/compute/key_hash_internal.h" +#include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" @@ -362,5 +363,65 @@ TEST(VectorHash, HashBatchTempStackUsage) { } } +TEST(VectorHash, HashMax) { + MemoryPool* pool = default_memory_pool(); + const int64_t start = 0xFFFFFFF; + const int64_t num_rows = 0xFFFFFFF; + + ASSERT_OK_AND_ASSIGN(auto arr, arrow::gen::Step64(start, 1)->Generate(num_rows)); + ExecBatch batch({arr}, num_rows); + // ExecBatch batch({arr, arr, arr}, num_rows); + + std::vector column_arrays; + ASSERT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); + + const auto hardware_flags_for_testing = HardwareFlagsForTesting(); + ASSERT_GT(hardware_flags_for_testing.size(), 0); + + std::vector hashes(num_rows); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, Hashing32::kHashBatchTempStackUsage)); + ASSERT_OK(Hashing32::HashBatch(batch, hashes.data(), column_arrays, + hardware_flags_for_testing[0], &stack, + /*start_rows=*/0, num_rows)); + + auto max_it = std::max_element(hashes.begin(), hashes.end()); + auto max_i = (max_it - hashes.begin()) + start; + auto max_hash = *max_it; + std::cout << "Max integer: " << max_i << std::endl; + std::cout << "Max hash: " << std::uppercase << std::hex << max_hash << std::endl; +} + +TEST(VectorHash, HashSmall) { + MemoryPool* pool = default_memory_pool(); + const int64_t start[3] = {88506230299LL, 16556030299LL, 11240299}; + const int64_t num_rows = 0x1; + + std::vector> arrs(3); + for (int i = 0; i < 3; ++i) { + ASSERT_OK_AND_ASSIGN(arrs[i], arrow::gen::Step64(start[i], 1)->Generate(num_rows)); + } + ExecBatch batch({arrs[0], arrs[1], arrs[2]}, num_rows); + + std::vector column_arrays; + ASSERT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); + + const auto hardware_flags_for_testing = HardwareFlagsForTesting(); + ASSERT_GT(hardware_flags_for_testing.size(), 0); + + std::vector hashes(num_rows); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, Hashing32::kHashBatchTempStackUsage)); + ASSERT_OK(Hashing32::HashBatch(batch, hashes.data(), column_arrays, + hardware_flags_for_testing[0], &stack, + /*start_rows=*/0, num_rows)); + + auto max_it = std::max_element(hashes.begin(), hashes.end()); + auto max_i = (max_it - hashes.begin()) + start; + auto max_hash = *max_it; + std::cout << "Max integer: " << max_i << std::endl; + std::cout << "Max hash: " << std::uppercase << std::hex << max_hash << std::endl; +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 30d45bb34b0..91274b715fd 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -47,7 +47,6 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( const uint32_t null_bit_id = ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); __m256i pos_after_encoding = _mm256_set1_epi64x(null_bit_id); - __m256i bit_in_right = _mm256_set1_epi32(1 << (null_bit_id & 7)); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -80,7 +79,8 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), _mm256_srli_epi64(bit_id_hi, 3), 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); - right = _mm256_and_si256(right, bit_in_right); + right = _mm256_and_si256(_mm256_set1_epi32(1), + _mm256_srli_epi32(right, null_bit_id & 7)); __m256i cmp = _mm256_cmpeq_epi32(right, _mm256_setzero_si256()); uint32_t result_lo = _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); @@ -170,7 +170,8 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), _mm256_srli_epi64(bit_id_hi, 3), 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); - right = _mm256_and_si256(right, bit_in_right); + right = _mm256_and_si256(_mm256_set1_epi32(1), + _mm256_srli_epi32(right, null_bit_id & 7)); __m256i right_null = _mm256_cmpeq_epi32(right, _mm256_set1_epi32(1)); uint64_t left_null_64 = diff --git a/cpp/src/arrow/testing/generator.cc b/cpp/src/arrow/testing/generator.cc index 5ea6a541e89..ccdb895432d 100644 --- a/cpp/src/arrow/testing/generator.cc +++ b/cpp/src/arrow/testing/generator.cc @@ -220,6 +220,34 @@ class ConstantGenerator : public ArrayGenerator { std::shared_ptr value_; }; +class StepGenerator64 : public ArrayGenerator { + public: + StepGenerator64(int64_t start, int64_t step) : start_(start), step_(step) {} + + template + Result> DoGenerate(int64_t num_rows) { + BuilderType builder; + ARROW_RETURN_NOT_OK(builder.Reserve(num_rows)); + CType val = start_; + for (int64_t i = 0; i < num_rows; i++) { + builder.UnsafeAppend(val); + val += step_; + } + start_ = val; + return builder.Finish(); + } + + Result> Generate(int64_t num_rows) override { + return DoGenerate(num_rows); + } + + std::shared_ptr type() const override { return int64(); } + + private: + int64_t start_; + int64_t step_; +}; + class StepGenerator : public ArrayGenerator { public: StepGenerator(uint32_t start, uint32_t step, bool signed_int) @@ -409,6 +437,10 @@ std::shared_ptr Step(uint32_t start, uint32_t step, bool signed_ return std::make_shared(start, step, signed_int); } +std::shared_ptr Step64(int64_t start, int64_t step) { + return std::make_shared(start, step); +} + std::shared_ptr Random(std::shared_ptr type) { return std::make_shared(std::move(type)); } diff --git a/cpp/src/arrow/testing/generator.h b/cpp/src/arrow/testing/generator.h index 4ec8845864b..008b251d71c 100644 --- a/cpp/src/arrow/testing/generator.h +++ b/cpp/src/arrow/testing/generator.h @@ -307,6 +307,9 @@ ARROW_TESTING_EXPORT std::shared_ptr Constant( ARROW_TESTING_EXPORT std::shared_ptr Step(uint32_t start = 0, uint32_t step = 1, bool signed_int = false); + +ARROW_TESTING_EXPORT std::shared_ptr Step64(int64_t start, int64_t step); + /// make a generator that returns a random value ARROW_TESTING_EXPORT std::shared_ptr Random( std::shared_ptr type); From b93af5b53f30dc9bb32c1958128eb82d5b61f0a4 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 20:16:22 +0800 Subject: [PATCH 15/23] Revert "Fix" This reverts commit c3b0ee7772db9944e59598fc95b991279b797d6e. --- cpp/src/arrow/compute/key_hash_test.cc | 61 ------------------- .../compute/row/compare_internal_avx2.cc | 7 +-- cpp/src/arrow/testing/generator.cc | 32 ---------- cpp/src/arrow/testing/generator.h | 3 - 4 files changed, 3 insertions(+), 100 deletions(-) diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index 0eccdb688ce..fdf6d212585 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -24,7 +24,6 @@ #include "arrow/array/builder_binary.h" #include "arrow/compute/key_hash_internal.h" -#include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" @@ -363,65 +362,5 @@ TEST(VectorHash, HashBatchTempStackUsage) { } } -TEST(VectorHash, HashMax) { - MemoryPool* pool = default_memory_pool(); - const int64_t start = 0xFFFFFFF; - const int64_t num_rows = 0xFFFFFFF; - - ASSERT_OK_AND_ASSIGN(auto arr, arrow::gen::Step64(start, 1)->Generate(num_rows)); - ExecBatch batch({arr}, num_rows); - // ExecBatch batch({arr, arr, arr}, num_rows); - - std::vector column_arrays; - ASSERT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); - - const auto hardware_flags_for_testing = HardwareFlagsForTesting(); - ASSERT_GT(hardware_flags_for_testing.size(), 0); - - std::vector hashes(num_rows); - TempVectorStack stack; - ASSERT_OK(stack.Init(pool, Hashing32::kHashBatchTempStackUsage)); - ASSERT_OK(Hashing32::HashBatch(batch, hashes.data(), column_arrays, - hardware_flags_for_testing[0], &stack, - /*start_rows=*/0, num_rows)); - - auto max_it = std::max_element(hashes.begin(), hashes.end()); - auto max_i = (max_it - hashes.begin()) + start; - auto max_hash = *max_it; - std::cout << "Max integer: " << max_i << std::endl; - std::cout << "Max hash: " << std::uppercase << std::hex << max_hash << std::endl; -} - -TEST(VectorHash, HashSmall) { - MemoryPool* pool = default_memory_pool(); - const int64_t start[3] = {88506230299LL, 16556030299LL, 11240299}; - const int64_t num_rows = 0x1; - - std::vector> arrs(3); - for (int i = 0; i < 3; ++i) { - ASSERT_OK_AND_ASSIGN(arrs[i], arrow::gen::Step64(start[i], 1)->Generate(num_rows)); - } - ExecBatch batch({arrs[0], arrs[1], arrs[2]}, num_rows); - - std::vector column_arrays; - ASSERT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); - - const auto hardware_flags_for_testing = HardwareFlagsForTesting(); - ASSERT_GT(hardware_flags_for_testing.size(), 0); - - std::vector hashes(num_rows); - TempVectorStack stack; - ASSERT_OK(stack.Init(pool, Hashing32::kHashBatchTempStackUsage)); - ASSERT_OK(Hashing32::HashBatch(batch, hashes.data(), column_arrays, - hardware_flags_for_testing[0], &stack, - /*start_rows=*/0, num_rows)); - - auto max_it = std::max_element(hashes.begin(), hashes.end()); - auto max_i = (max_it - hashes.begin()) + start; - auto max_hash = *max_it; - std::cout << "Max integer: " << max_i << std::endl; - std::cout << "Max hash: " << std::uppercase << std::hex << max_hash << std::endl; -} - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 91274b715fd..30d45bb34b0 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -47,6 +47,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( const uint32_t null_bit_id = ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); __m256i pos_after_encoding = _mm256_set1_epi64x(null_bit_id); + __m256i bit_in_right = _mm256_set1_epi32(1 << (null_bit_id & 7)); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -79,8 +80,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), _mm256_srli_epi64(bit_id_hi, 3), 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); - right = _mm256_and_si256(_mm256_set1_epi32(1), - _mm256_srli_epi32(right, null_bit_id & 7)); + right = _mm256_and_si256(right, bit_in_right); __m256i cmp = _mm256_cmpeq_epi32(right, _mm256_setzero_si256()); uint32_t result_lo = _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); @@ -170,8 +170,7 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), _mm256_srli_epi64(bit_id_hi, 3), 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); - right = _mm256_and_si256(_mm256_set1_epi32(1), - _mm256_srli_epi32(right, null_bit_id & 7)); + right = _mm256_and_si256(right, bit_in_right); __m256i right_null = _mm256_cmpeq_epi32(right, _mm256_set1_epi32(1)); uint64_t left_null_64 = diff --git a/cpp/src/arrow/testing/generator.cc b/cpp/src/arrow/testing/generator.cc index ccdb895432d..5ea6a541e89 100644 --- a/cpp/src/arrow/testing/generator.cc +++ b/cpp/src/arrow/testing/generator.cc @@ -220,34 +220,6 @@ class ConstantGenerator : public ArrayGenerator { std::shared_ptr value_; }; -class StepGenerator64 : public ArrayGenerator { - public: - StepGenerator64(int64_t start, int64_t step) : start_(start), step_(step) {} - - template - Result> DoGenerate(int64_t num_rows) { - BuilderType builder; - ARROW_RETURN_NOT_OK(builder.Reserve(num_rows)); - CType val = start_; - for (int64_t i = 0; i < num_rows; i++) { - builder.UnsafeAppend(val); - val += step_; - } - start_ = val; - return builder.Finish(); - } - - Result> Generate(int64_t num_rows) override { - return DoGenerate(num_rows); - } - - std::shared_ptr type() const override { return int64(); } - - private: - int64_t start_; - int64_t step_; -}; - class StepGenerator : public ArrayGenerator { public: StepGenerator(uint32_t start, uint32_t step, bool signed_int) @@ -437,10 +409,6 @@ std::shared_ptr Step(uint32_t start, uint32_t step, bool signed_ return std::make_shared(start, step, signed_int); } -std::shared_ptr Step64(int64_t start, int64_t step) { - return std::make_shared(start, step); -} - std::shared_ptr Random(std::shared_ptr type) { return std::make_shared(std::move(type)); } diff --git a/cpp/src/arrow/testing/generator.h b/cpp/src/arrow/testing/generator.h index 008b251d71c..4ec8845864b 100644 --- a/cpp/src/arrow/testing/generator.h +++ b/cpp/src/arrow/testing/generator.h @@ -307,9 +307,6 @@ ARROW_TESTING_EXPORT std::shared_ptr Constant( ARROW_TESTING_EXPORT std::shared_ptr Step(uint32_t start = 0, uint32_t step = 1, bool signed_int = false); - -ARROW_TESTING_EXPORT std::shared_ptr Step64(int64_t start, int64_t step); - /// make a generator that returns a random value ARROW_TESTING_EXPORT std::shared_ptr Random( std::shared_ptr type); From 5c0c85710a9e3e95ae20d7866f8c7ffd0dd69030 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 20:17:34 +0800 Subject: [PATCH 16/23] Fix --- cpp/src/arrow/compute/row/compare_internal_avx2.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 30d45bb34b0..91274b715fd 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -47,7 +47,6 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( const uint32_t null_bit_id = ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); __m256i pos_after_encoding = _mm256_set1_epi64x(null_bit_id); - __m256i bit_in_right = _mm256_set1_epi32(1 << (null_bit_id & 7)); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -80,7 +79,8 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), _mm256_srli_epi64(bit_id_hi, 3), 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); - right = _mm256_and_si256(right, bit_in_right); + right = _mm256_and_si256(_mm256_set1_epi32(1), + _mm256_srli_epi32(right, null_bit_id & 7)); __m256i cmp = _mm256_cmpeq_epi32(right, _mm256_setzero_si256()); uint32_t result_lo = _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); @@ -170,7 +170,8 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), _mm256_srli_epi64(bit_id_hi, 3), 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); - right = _mm256_and_si256(right, bit_in_right); + right = _mm256_and_si256(_mm256_set1_epi32(1), + _mm256_srli_epi32(right, null_bit_id & 7)); __m256i right_null = _mm256_cmpeq_epi32(right, _mm256_set1_epi32(1)); uint64_t left_null_64 = From efe3c98f0a9ad2895a85657dd9524da038cd79da Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 20:18:07 +0800 Subject: [PATCH 17/23] Remove already implied inline keywords --- cpp/src/arrow/compute/row/row_internal.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 21f783f80cd..0cc3b4e8c6d 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -200,40 +200,40 @@ class ARROW_EXPORT RowTableImpl { /// \brief The number of rows stored in the table int64_t length() const { return num_rows_; } - inline const uint8_t* null_masks(uint32_t row_id) const { + const uint8_t* null_masks(uint32_t row_id) const { return data(0) + static_cast(row_id) * metadata_.null_masks_bytes_per_row; } - inline uint8_t* null_masks(uint32_t row_id) { + uint8_t* null_masks(uint32_t row_id) { return mutable_data(0) + static_cast(row_id) * metadata_.null_masks_bytes_per_row; } - inline bool is_null(uint32_t row_id, uint32_t col_pos) const { + bool is_null(uint32_t row_id, uint32_t col_pos) const { return bit_util::GetBit(null_masks(row_id), col_pos); } - inline const uint8_t* fixed_length_rows(uint32_t row_id) const { + const uint8_t* fixed_length_rows(uint32_t row_id) const { ARROW_DCHECK(metadata_.is_fixed_length); return data(1) + static_cast(row_id) * metadata_.fixed_length; } - inline uint8_t* mutable_fixed_length_rows(uint32_t row_id) { + uint8_t* mutable_fixed_length_rows(uint32_t row_id) { ARROW_DCHECK(metadata_.is_fixed_length); return mutable_data(1) + static_cast(row_id) * metadata_.fixed_length; } - inline const offset_type* offsets() const { + const offset_type* offsets() const { ARROW_DCHECK(!metadata_.is_fixed_length); return reinterpret_cast(data(1)); } - inline offset_type* mutable_offsets() { + offset_type* mutable_offsets() { ARROW_DCHECK(!metadata_.is_fixed_length); return reinterpret_cast(mutable_data(1)); } - inline const uint8_t* var_length_rows() const { + const uint8_t* var_length_rows() const { ARROW_DCHECK(!metadata_.is_fixed_length); return data(2); } - inline uint8_t* mutable_var_length_rows() { + uint8_t* mutable_var_length_rows() { ARROW_DCHECK(!metadata_.is_fixed_length); return mutable_data(2); } @@ -253,14 +253,14 @@ class ARROW_EXPORT RowTableImpl { private: // Accessors into the table's buffers - inline const uint8_t* data(int i) const { + const uint8_t* data(int i) const { ARROW_DCHECK(i >= 0 && i < kMaxBuffers); if (ARROW_PREDICT_TRUE(buffers_[i])) { return buffers_[i]->data(); } return NULLPTR; } - inline uint8_t* mutable_data(int i) { + uint8_t* mutable_data(int i) { ARROW_DCHECK(i >= 0 && i < kMaxBuffers); if (ARROW_PREDICT_TRUE(buffers_[i])) { return buffers_[i]->mutable_data(); From 53dc951f50e4d907224eb8dfabe9750fc45378f9 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 23 Jan 2025 20:20:27 +0800 Subject: [PATCH 18/23] null_masks -> mutable_null_masks --- cpp/src/arrow/acero/swiss_join.cc | 3 ++- cpp/src/arrow/compute/row/encode_internal.cc | 2 +- cpp/src/arrow/compute/row/row_internal.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index ba9cb543a89..85e14ac469c 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -569,7 +569,8 @@ void RowArrayMerge::CopyNulls(RowTableImpl* target, const RowTableImpl& source, int64_t num_source_rows = source.length(); int num_bytes_per_row = target->metadata().null_masks_bytes_per_row; DCHECK_LE(first_target_row_id, std::numeric_limits::max()); - uint8_t* target_nulls = target->null_masks(static_cast(first_target_row_id)); + uint8_t* target_nulls = + target->mutable_null_masks(static_cast(first_target_row_id)); if (!source_rows_permutation) { memcpy(target_nulls, source.null_masks(/*row_id=*/0), num_bytes_per_row * num_source_rows); diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 02af02ad014..136e7760937 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -872,7 +872,7 @@ void EncoderVarBinary::EncodeSelected(uint32_t ivarbinary, RowTableImpl* rows, void EncoderNulls::EncodeSelected(RowTableImpl* rows, const std::vector& cols, uint32_t num_selected, const uint16_t* selection) { - uint8_t* null_masks = rows->null_masks(/*row_id=*/0); + uint8_t* null_masks = rows->mutable_null_masks(/*row_id=*/0); uint32_t null_mask_num_bytes = rows->metadata().null_masks_bytes_per_row; memset(null_masks, 0, null_mask_num_bytes * num_selected); for (size_t icol = 0; icol < cols.size(); ++icol) { diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 0cc3b4e8c6d..0919773a228 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -203,7 +203,7 @@ class ARROW_EXPORT RowTableImpl { const uint8_t* null_masks(uint32_t row_id) const { return data(0) + static_cast(row_id) * metadata_.null_masks_bytes_per_row; } - uint8_t* null_masks(uint32_t row_id) { + uint8_t* mutable_null_masks(uint32_t row_id) { return mutable_data(0) + static_cast(row_id) * metadata_.null_masks_bytes_per_row; } From 8e97d6b60c0d3569085927cf8359f43ffb3f1cd5 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Fri, 24 Jan 2025 00:09:30 +0800 Subject: [PATCH 19/23] Helper functions for get null bits from row table and 32/64b compare result to 8b --- cpp/src/arrow/acero/hash_join_node_test.cc | 4 +- .../compute/row/compare_internal_avx2.cc | 130 +++++++----------- 2 files changed, 53 insertions(+), 81 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 654fd59c45d..64359b973e5 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3452,8 +3452,8 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { // GH-45334: The row ids of the matching rows on the right side (the build side) are very // big, causing the index calculation overflow. TEST(HashJoin, BuildSideLargeRowIds) { - GTEST_SKIP() << "Test disabled due to excessively time and resource consuming, " - "for local debugging only."; + // GTEST_SKIP() << "Test disabled due to excessively time and resource consuming, " + // "for local debugging only."; // A fair amount of match rows to trigger both SIMD and non-SIMD code paths. const int64_t num_match_rows = 35; diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 91274b715fd..b0f954be865 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -23,6 +23,8 @@ namespace arrow { namespace compute { +namespace { + inline __m256i set_first_n_bytes_avx2(int n) { constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; @@ -34,6 +36,43 @@ inline __m256i set_first_n_bytes_avx2(int n) { kByteSequence16To23, kByteSequence24To31)); } +// Get null bits for 8 32-bit row ids in `row_id32` at `null_bit_id` as a vector of 32-bit +// integers. +inline __m256i GetNullBitInt32(const RowTableImpl& rows, uint32_t null_bit_id, + __m256i row_id32) { + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); + __m256i null_mask_num_bits = + _mm256_set1_epi64x(rows.metadata().null_masks_bytes_per_row * 8); + __m256i row_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id32)); + __m256i row_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id32, 1)); + __m256i bit_id_lo = _mm256_mul_epi32(row_lo, null_mask_num_bits); + __m256i bit_id_hi = _mm256_mul_epi32(row_hi, null_mask_num_bits); + bit_id_lo = _mm256_add_epi64(bit_id_lo, _mm256_set1_epi64x(null_bit_id)); + bit_id_hi = _mm256_add_epi64(bit_id_hi, _mm256_set1_epi64x(null_bit_id)); + __m128i right_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_lo, 3), 1); + __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_hi, 3), 1); + __m256i right = _mm256_set_m128i(right_hi, right_lo); + return _mm256_and_si256(_mm256_set1_epi32(1), + _mm256_srli_epi32(right, null_bit_id & 7)); +} + +// Convert 8 64-bit comparision results, each being 0 or -1, to 8 bytes. +inline uint64_t Cmp64To8(__m256i cmp64_lo, __m256i cmp64_hi) { + uint32_t cmp_lo = _mm256_movemask_epi8(cmp64_lo); + uint32_t cmp_hi = _mm256_movemask_epi8(cmp64_hi); + return cmp_lo | (static_cast(cmp_hi) << 32); +} + +// Convert 8 32-bit comparision results, each being 0 or -1, to 8 bytes. +inline uint64_t Cmp32To8(__m256i cmp32) { + return Cmp64To8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp32)), + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp32, 1))); +} + +} // namespace + template uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, @@ -46,13 +85,9 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( const uint32_t null_bit_id = ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); - __m256i pos_after_encoding = _mm256_set1_epi64x(null_bit_id); if (!col.data(0)) { // Remove rows from the result for which the column value is a null - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); - uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; - uint32_t num_processed = 0; constexpr uint32_t unroll = 8; for (uint32_t i = 0; i < num_rows_to_compare / unroll; ++i) { @@ -65,29 +100,9 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( irow_right = _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } - __m256i irow_right_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right)); - __m256i irow_right_hi = - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1)); - __m256i bit_id_lo = - _mm256_mul_epi32(irow_right_lo, _mm256_set1_epi64x(null_mask_num_bytes * 8)); - __m256i bit_id_hi = - _mm256_mul_epi32(irow_right_hi, _mm256_set1_epi64x(null_mask_num_bytes * 8)); - bit_id_lo = _mm256_add_epi64(bit_id_lo, pos_after_encoding); - bit_id_hi = _mm256_add_epi64(bit_id_hi, pos_after_encoding); - __m128i right_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi64(bit_id_lo, 3), 1); - __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi64(bit_id_hi, 3), 1); - __m256i right = _mm256_set_m128i(right_hi, right_lo); - right = _mm256_and_si256(_mm256_set1_epi32(1), - _mm256_srli_epi32(right, null_bit_id & 7)); + __m256i right = GetNullBitInt32(rows, null_bit_id, irow_right); __m256i cmp = _mm256_cmpeq_epi32(right, _mm256_setzero_si256()); - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); - uint32_t result_hi = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); - reinterpret_cast(match_bytevector)[i] &= - result_lo | (static_cast(result_hi) << 32); + reinterpret_cast(match_bytevector)[i] &= Cmp32To8(cmp); } num_processed = num_rows_to_compare / unroll * unroll; return num_processed; @@ -116,18 +131,11 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( __m256i bits = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); cmp = _mm256_cmpeq_epi32(_mm256_and_si256(left, bits), bits); } - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); - uint32_t result_hi = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); - reinterpret_cast(match_bytevector)[i] &= - result_lo | (static_cast(result_hi) << 32); - num_processed = num_rows_to_compare / unroll * unroll; + reinterpret_cast(match_bytevector)[i] &= Cmp32To8(cmp); } + num_processed = num_rows_to_compare / unroll * unroll; return num_processed; } else { - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); - uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; const uint8_t* non_nulls = col.data(0); ARROW_DCHECK(non_nulls); @@ -156,37 +164,11 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( left_null = _mm256_cmpeq_epi32(_mm256_and_si256(left, bits), _mm256_setzero_si256()); } - __m256i irow_right_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right)); - __m256i irow_right_hi = - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1)); - __m256i bit_id_lo = - _mm256_mul_epi32(irow_right_lo, _mm256_set1_epi64x(null_mask_num_bytes * 8)); - __m256i bit_id_hi = - _mm256_mul_epi32(irow_right_hi, _mm256_set1_epi64x(null_mask_num_bytes * 8)); - bit_id_lo = _mm256_add_epi64(bit_id_lo, pos_after_encoding); - bit_id_hi = _mm256_add_epi64(bit_id_hi, pos_after_encoding); - __m128i right_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi64(bit_id_lo, 3), 1); - __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi64(bit_id_hi, 3), 1); - __m256i right = _mm256_set_m128i(right_hi, right_lo); - right = _mm256_and_si256(_mm256_set1_epi32(1), - _mm256_srli_epi32(right, null_bit_id & 7)); + __m256i right = GetNullBitInt32(rows, null_bit_id, irow_right); __m256i right_null = _mm256_cmpeq_epi32(right, _mm256_set1_epi32(1)); - uint64_t left_null_64 = - static_cast(_mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_castsi256_si128(left_null)))) | - (static_cast(static_cast(_mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(left_null, 1))))) - << 32); - - uint64_t right_null_64 = - static_cast(_mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_castsi256_si128(right_null)))) | - (static_cast(static_cast(_mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(right_null, 1))))) - << 32); + uint64_t left_null_64 = Cmp32To8(left_null); + uint64_t right_null_64 = Cmp32To8(right_null); reinterpret_cast(match_bytevector)[i] |= left_null_64 & right_null_64; reinterpret_cast(match_bytevector)[i] &= ~(left_null_64 ^ right_null_64); @@ -338,12 +320,7 @@ inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* r __m256i cmp = _mm256_cmpeq_epi32(left, right); - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); - uint32_t result_hi = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); - - return result_lo | (static_cast(result_hi) << 32); + return Cmp32To8(cmp); } template @@ -389,12 +366,7 @@ inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_bas __m256i cmp = _mm256_cmpeq_epi32(left, right); - uint32_t result_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp))); - uint32_t result_hi = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp, 1))); - - return result_lo | (static_cast(result_hi) << 32); + return Cmp32To8(cmp); } template @@ -419,9 +391,9 @@ inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* rig reinterpret_cast(right_base); __m256i right_lo = _mm256_i64gather_epi64(right_base_i64, offset_right_lo, 1); __m256i right_hi = _mm256_i64gather_epi64(right_base_i64, offset_right_hi, 1); - uint32_t result_lo = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_lo, right_lo)); - uint32_t result_hi = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_hi, right_hi)); - return result_lo | (static_cast(result_hi) << 32); + __m256i cmp_lo = _mm256_cmpeq_epi64(left_lo, right_lo); + __m256i cmp_hi = _mm256_cmpeq_epi64(left_hi, right_hi); + return Cmp64To8(cmp_lo, cmp_hi); } template From c4d3959e1b936c6232b7278f108c88b8319fdba4 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Fri, 24 Jan 2025 00:12:00 +0800 Subject: [PATCH 20/23] Revert mis-commented gtest skip --- cpp/src/arrow/acero/hash_join_node_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 64359b973e5..654fd59c45d 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3452,8 +3452,8 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { // GH-45334: The row ids of the matching rows on the right side (the build side) are very // big, causing the index calculation overflow. TEST(HashJoin, BuildSideLargeRowIds) { - // GTEST_SKIP() << "Test disabled due to excessively time and resource consuming, " - // "for local debugging only."; + GTEST_SKIP() << "Test disabled due to excessively time and resource consuming, " + "for local debugging only."; // A fair amount of match rows to trigger both SIMD and non-SIMD code paths. const int64_t num_match_rows = 35; From d4c2af3b4605a251c81cf4702017854ecc0908c6 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Fri, 24 Jan 2025 01:17:53 +0800 Subject: [PATCH 21/23] Move GetNullBitInt32 to common header and reuse it in swiss join code --- cpp/src/arrow/acero/swiss_join_avx2.cc | 37 ++++---------- .../compute/row/compare_internal_avx2.cc | 23 +-------- .../compute/row/row_util_avx2_internal.h | 51 +++++++++++++++++++ 3 files changed, 61 insertions(+), 50 deletions(-) create mode 100644 cpp/src/arrow/compute/row/row_util_avx2_internal.h diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 49afae8beed..86d08870e58 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/acero/swiss_join_internal.h" +#include "arrow/compute/row/row_util_avx2_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/simd.h" @@ -237,36 +238,16 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, // constexpr int kUnroll = 8; - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); - __m256i null_bits_per_row = - _mm256_set1_epi64x(8 * rows.metadata().null_masks_bytes_per_row); - __m256i pos_after_encoding = - _mm256_set1_epi64x(rows.metadata().pos_after_encoding(column_id)); - __m256i bit_in_word = - _mm256_set1_epi32(1 << (rows.metadata().pos_after_encoding(column_id) & 7)); + uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); for (int i = 0; i < num_rows / kUnroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_id_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id)); - __m256i row_id_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id, 1)); - __m256i bit_id_lo = _mm256_mul_epi32(row_id_lo, null_bits_per_row); - __m256i bit_id_hi = _mm256_mul_epi32(row_id_hi, null_bits_per_row); - bit_id_lo = _mm256_add_epi64(bit_id_lo, pos_after_encoding); - bit_id_hi = _mm256_add_epi64(bit_id_hi, pos_after_encoding); - __m128i bytes_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi64(bit_id_lo, 3), 1); - __m128i bytes_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi64(bit_id_hi, 3), 1); - __m256i bytes = _mm256_set_m128i(bytes_hi, bytes_lo); - __m256i result = - _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word); - // NB: Be careful about sign-extension when casting the return value of - // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the - // higher bits of the following OR. - uint32_t null_bytes_lo = static_cast( - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result)))); - uint64_t null_bytes_hi = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))); - uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32); + __m256i null32 = GetNullBitInt32(rows, pos_after_encoding, row_id); + null32 = _mm256_cmpeq_epi32(null32, _mm256_set1_epi32(1)); + uint32_t null32_lo = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(null32))); + uint32_t null32_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(null32, 1))); + uint64_t null_bytes = null32_lo | (static_cast(null32_hi) << 32); process_8_values_fn(i * kUnroll, null_bytes); } diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index b0f954be865..ac335d31d59 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/compute/row/compare_internal.h" +#include "arrow/compute/row/row_util_avx2_internal.h" #include "arrow/compute/util.h" #include "arrow/util/bit_util.h" #include "arrow/util/simd.h" @@ -36,28 +37,6 @@ inline __m256i set_first_n_bytes_avx2(int n) { kByteSequence16To23, kByteSequence24To31)); } -// Get null bits for 8 32-bit row ids in `row_id32` at `null_bit_id` as a vector of 32-bit -// integers. -inline __m256i GetNullBitInt32(const RowTableImpl& rows, uint32_t null_bit_id, - __m256i row_id32) { - const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); - __m256i null_mask_num_bits = - _mm256_set1_epi64x(rows.metadata().null_masks_bytes_per_row * 8); - __m256i row_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id32)); - __m256i row_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id32, 1)); - __m256i bit_id_lo = _mm256_mul_epi32(row_lo, null_mask_num_bits); - __m256i bit_id_hi = _mm256_mul_epi32(row_hi, null_mask_num_bits); - bit_id_lo = _mm256_add_epi64(bit_id_lo, _mm256_set1_epi64x(null_bit_id)); - bit_id_hi = _mm256_add_epi64(bit_id_hi, _mm256_set1_epi64x(null_bit_id)); - __m128i right_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi64(bit_id_lo, 3), 1); - __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi64(bit_id_hi, 3), 1); - __m256i right = _mm256_set_m128i(right_hi, right_lo); - return _mm256_and_si256(_mm256_set1_epi32(1), - _mm256_srli_epi32(right, null_bit_id & 7)); -} - // Convert 8 64-bit comparision results, each being 0 or -1, to 8 bytes. inline uint64_t Cmp64To8(__m256i cmp64_lo, __m256i cmp64_hi) { uint32_t cmp_lo = _mm256_movemask_epi8(cmp64_lo); diff --git a/cpp/src/arrow/compute/row/row_util_avx2_internal.h b/cpp/src/arrow/compute/row/row_util_avx2_internal.h new file mode 100644 index 00000000000..3ec5fc3a67e --- /dev/null +++ b/cpp/src/arrow/compute/row/row_util_avx2_internal.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include "arrow/compute/row/row_internal.h" +#include "arrow/util/simd.h" + +#if !defined(ARROW_HAVE_AVX2) && !defined(ARROW_HAVE_AVX512) && \ + !defined(ARROW_HAVE_RUNTIME_AVX2) && !defined(ARROW_HAVE_RUNTIME_AVX512) +# error "This file should only be included when AVX2 or AVX512 is enabled" +#endif + +namespace arrow::compute { + +// Get null bits for 8 32-bit row ids in `row_id32` at `col_pos` as a vector of 32-bit +// integers. Note that the result integer is 0 if the corresponding column is not null, or +// 1 otherwise. +inline __m256i GetNullBitInt32(const RowTableImpl& rows, uint32_t col_pos, + __m256i row_id32) { + const uint8_t* null_masks = rows.null_masks(/*row_id=*/0); + __m256i null_mask_num_bits = + _mm256_set1_epi64x(rows.metadata().null_masks_bytes_per_row * 8); + __m256i row_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id32)); + __m256i row_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id32, 1)); + __m256i bit_id_lo = _mm256_mul_epi32(row_lo, null_mask_num_bits); + __m256i bit_id_hi = _mm256_mul_epi32(row_hi, null_mask_num_bits); + bit_id_lo = _mm256_add_epi64(bit_id_lo, _mm256_set1_epi64x(col_pos)); + bit_id_hi = _mm256_add_epi64(bit_id_hi, _mm256_set1_epi64x(col_pos)); + __m128i right_lo = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_lo, 3), 1); + __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast(null_masks), + _mm256_srli_epi64(bit_id_hi, 3), 1); + __m256i right = _mm256_set_m128i(right_hi, right_lo); + return _mm256_and_si256(_mm256_set1_epi32(1), _mm256_srli_epi32(right, col_pos & 7)); +} + +} // namespace arrow::compute From 8355c6879e947f78f0078a9f64b97f80fc304f1b Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Mon, 27 Jan 2025 03:20:48 +0800 Subject: [PATCH 22/23] Fix CI --- cpp/src/arrow/compute/row/encode_internal.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 136e7760937..0e2720a2866 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -267,25 +267,25 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, switch (col_prep.metadata().fixed_length) { case 1: for (uint32_t i = 0; i < num_rows; ++i) { - col_base[i] = *rows.fixed_length_rows(start_row + i) + offset_within_row; + col_base[i] = *(rows.fixed_length_rows(start_row + i) + offset_within_row); } break; case 2: for (uint32_t i = 0; i < num_rows; ++i) { - reinterpret_cast(col_base)[i] = - *reinterpret_cast(rows.fixed_length_rows(start_row + i)); + reinterpret_cast(col_base)[i] = *reinterpret_cast( + rows.fixed_length_rows(start_row + i) + offset_within_row); } break; case 4: for (uint32_t i = 0; i < num_rows; ++i) { - reinterpret_cast(col_base)[i] = - *reinterpret_cast(rows.fixed_length_rows(start_row + i)); + reinterpret_cast(col_base)[i] = *reinterpret_cast( + rows.fixed_length_rows(start_row + i + offset_within_row)); } break; case 8: for (uint32_t i = 0; i < num_rows; ++i) { - reinterpret_cast(col_base)[i] = - *reinterpret_cast(rows.fixed_length_rows(start_row + i)); + reinterpret_cast(col_base)[i] = *reinterpret_cast( + rows.fixed_length_rows(start_row + i) + offset_within_row); } break; default: From f7df7a4ff587791e8f650bea7ef9a8cc97e96a01 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Mon, 27 Jan 2025 22:22:09 +0800 Subject: [PATCH 23/23] Move Cmp32/64To8 to common header and reuse it in swiss join avx2 --- cpp/src/arrow/acero/swiss_join_avx2.cc | 6 +----- .../arrow/compute/row/compare_internal_avx2.cc | 17 ----------------- .../arrow/compute/row/row_util_avx2_internal.h | 13 +++++++++++++ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 86d08870e58..deeee2a4e11 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -243,11 +243,7 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); __m256i null32 = GetNullBitInt32(rows, pos_after_encoding, row_id); null32 = _mm256_cmpeq_epi32(null32, _mm256_set1_epi32(1)); - uint32_t null32_lo = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(null32))); - uint32_t null32_hi = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(null32, 1))); - uint64_t null_bytes = null32_lo | (static_cast(null32_hi) << 32); + uint64_t null_bytes = arrow::compute::Cmp32To8(null32); process_8_values_fn(i * kUnroll, null_bytes); } diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index ac335d31d59..8af84ac6b2f 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -24,8 +24,6 @@ namespace arrow { namespace compute { -namespace { - inline __m256i set_first_n_bytes_avx2(int n) { constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; @@ -37,21 +35,6 @@ inline __m256i set_first_n_bytes_avx2(int n) { kByteSequence16To23, kByteSequence24To31)); } -// Convert 8 64-bit comparision results, each being 0 or -1, to 8 bytes. -inline uint64_t Cmp64To8(__m256i cmp64_lo, __m256i cmp64_hi) { - uint32_t cmp_lo = _mm256_movemask_epi8(cmp64_lo); - uint32_t cmp_hi = _mm256_movemask_epi8(cmp64_hi); - return cmp_lo | (static_cast(cmp_hi) << 32); -} - -// Convert 8 32-bit comparision results, each being 0 or -1, to 8 bytes. -inline uint64_t Cmp32To8(__m256i cmp32) { - return Cmp64To8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp32)), - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp32, 1))); -} - -} // namespace - template uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, diff --git a/cpp/src/arrow/compute/row/row_util_avx2_internal.h b/cpp/src/arrow/compute/row/row_util_avx2_internal.h index 3ec5fc3a67e..a8fce7e0e86 100644 --- a/cpp/src/arrow/compute/row/row_util_avx2_internal.h +++ b/cpp/src/arrow/compute/row/row_util_avx2_internal.h @@ -26,6 +26,19 @@ namespace arrow::compute { +// Convert 8 64-bit comparision results, each being 0 or -1, to 8 bytes. +inline uint64_t Cmp64To8(__m256i cmp64_lo, __m256i cmp64_hi) { + uint32_t cmp_lo = _mm256_movemask_epi8(cmp64_lo); + uint32_t cmp_hi = _mm256_movemask_epi8(cmp64_hi); + return cmp_lo | (static_cast(cmp_hi) << 32); +} + +// Convert 8 32-bit comparision results, each being 0 or -1, to 8 bytes. +inline uint64_t Cmp32To8(__m256i cmp32) { + return Cmp64To8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(cmp32)), + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(cmp32, 1))); +} + // Get null bits for 8 32-bit row ids in `row_id32` at `col_pos` as a vector of 32-bit // integers. Note that the result integer is 0 if the corresponding column is not null, or // 1 otherwise.