From 7d87de648492ffc49e5ae98d070f599e390f4dbb Mon Sep 17 00:00:00 2001 From: zanmato Date: Sun, 31 Dec 2023 17:21:38 -0800 Subject: [PATCH 01/33] Sketch basic filter logic --- cpp/src/arrow/acero/swiss_join.cc | 164 +++++++++++++++++++--- cpp/src/arrow/acero/swiss_join_internal.h | 28 ++++ 2 files changed, 171 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 2f79ed299bb..6b4aecf1f4c 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1855,6 +1855,99 @@ bool JoinMatchIterator::GetNextBatch(int num_rows_max, int* out_num_rows, return (*out_num_rows) > 0; } +Status JoinResidualFilter::FilterMatchBitVector( + const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, + int bit_match, const uint8_t* match_bitvector, const uint32_t* key_ids, + bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack, + int* num_passing_ids, uint16_t* passing_batch_row_ids, + uint32_t* passing_key_ids_maybe_null) { + ARROW_DCHECK(filter_ != literal(true)); + *num_passing_ids = 0; + if (filter_.IsNullLiteral() || filter_ == literal(false)) { + return Status::OK(); + } + auto materialize_batch_ids_buf = + arrow::util::TempVectorHolder(temp_stack, minibatch_size_); + auto materialize_key_ids_buf = + arrow::util::TempVectorHolder(temp_stack, minibatch_size_); + auto materialize_payload_ids_buf = + arrow::util::TempVectorHolder(temp_stack, minibatch_size_); + + JoinMatchIterator match_iterator; + match_iterator.SetLookupResult(num_batch_rows, batch_start_row, match_bitvector, + key_ids, no_duplicate_keys, key_to_payload_); + int num_matches_next = 0; + uint32_t row_id_last = std::numeric_limits::max() + 1; + while (match_iterator.GetNextBatch(minibatch_size_, &num_matches_next, + materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data())) { + int num_filtered = 0; + 
RETURN_NOT_OK(FilterMatchRowIds( + keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data(), passing_key_ids_maybe_null, false, + temp_stack, &num_filtered)); + for (int ifiltered = 0; ifiltered < num_filtered; ++ifiltered) { + if (materialize_batch_ids_buf.mutable_data()[ifiltered] == row_id_last) { + continue; + } + passing_batch_row_ids[*num_passing_ids] = + materialize_batch_ids_buf.mutable_data()[ifiltered]; + if (passing_key_ids_maybe_null) { + passing_key_ids_maybe_null[*num_passing_ids] = + materialize_key_ids_buf.mutable_data()[ifiltered]; + } + row_id_last = materialize_batch_ids_buf.mutable_data()[ifiltered]; + ++(*num_passing_ids); + } + } + return Status::OK(); +} + +Status JoinResidualFilter::FilterMatchRowIds(const ExecBatch& keypayload_batch, + int num_batch_rows, uint16_t* batch_row_ids, + uint32_t* key_ids, uint32_t* payload_ids, + bool output_key_ids, bool output_payload_ids, + arrow::util::TempVectorStack* temp_stack, + int* num_passing_rows) { + ARROW_DCHECK(filter_ != literal(true)); + *num_passing_rows = 0; + if (filter_.IsNullLiteral() || filter_ == literal(false)) { + return Status::OK(); + } + ARROW_ASSIGN_OR_RAISE(Datum mask, EvalFilter()); + if (mask.is_scalar()) { + const auto& mask_scalar = mask.scalar_as(); + if (mask_scalar.is_valid && mask_scalar.value) { + *num_passing_rows = num_batch_rows; + return Status::OK(); + } else { + return Status::OK(); + } + } + ARROW_DCHECK_EQ(mask.array()->offset, 0); + ARROW_DCHECK_EQ(mask.array()->length, static_cast(num_batch_rows)); + const uint8_t* validity = + mask.array()->buffers[0] ? 
mask.array()->buffers[0]->data() : nullptr; + const uint8_t* comparisons = mask.array()->buffers[1]->data(); + for (int irow = 0; irow < num_batch_rows; ++irow) { + bool is_valid = !validity || bit_util::GetBit(validity, irow); + bool is_cmp_true = bit_util::GetBit(comparisons, irow); + if (is_valid && is_cmp_true) { + batch_row_ids[*num_passing_rows] = batch_row_ids[irow]; + if (output_key_ids) { + key_ids[*num_passing_rows] = key_ids[irow]; + } + if (output_payload_ids) { + payload_ids[*num_passing_rows] = payload_ids[irow]; + } + ++(*num_passing_rows); + } + } + return Status::OK(); +} + void JoinProbeProcessor::Init(int num_key_columns, JoinType join_type, SwissTableForJoin* hash_table, std::vector materialize, @@ -1893,6 +1986,8 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, auto hashes_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto match_bitvector_buf = arrow::util::TempVectorHolder( temp_stack, static_cast(bit_util::BytesForBits(minibatch_size))); + auto filtered_bitvector_buf = arrow::util::TempVectorHolder( + temp_stack, static_cast(bit_util::BytesForBits(minibatch_size))); auto key_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto materialize_batch_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); @@ -1923,33 +2018,48 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI || join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { int num_passing_ids = 0; - arrow::util::bit_util::bits_to_indexes( - (join_type_ == JoinType::LEFT_ANTI) ? 0 : 1, hardware_flags, - minibatch_size_next, match_bitvector_buf.mutable_data(), &num_passing_ids, - materialize_batch_ids_buf.mutable_data()); + int bit_match = join_type_ == JoinType::LEFT_ANTI ? 
0 : 1; + if (!residual_filter_) { + arrow::util::bit_util::bits_to_indexes( + bit_match, hardware_flags, minibatch_size_next, + match_bitvector_buf.mutable_data(), &num_passing_ids, + materialize_batch_ids_buf.mutable_data()); + if (join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { + // For right-semi, right-anti joins: collect key ids of passing rows. + // + for (int i = 0; i < num_passing_ids; ++i) { + uint16_t id = materialize_batch_ids_buf.mutable_data()[i]; + materialize_key_ids_buf.mutable_data()[i] = key_ids_buf.mutable_data()[id]; + } + } else { + // For left-semi, left-anti joins: add base batch row index. + // + for (int i = 0; i < num_passing_ids; ++i) { + materialize_batch_ids_buf.mutable_data()[i] += + static_cast(minibatch_start); + } + } + } else { + bool no_duplicate_keys = (hash_table_->key_to_payload() == nullptr); + RETURN_NOT_OK(residual_filter_->FilterMatchBitVector( + keypayload_batch, minibatch_start, minibatch_size_next, bit_match, + match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), + no_duplicate_keys, temp_stack, &num_passing_ids, + materialize_batch_ids_buf.mutable_data(), + join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI + ? materialize_key_ids_buf.mutable_data() + : NULLPTR)); + } - // For right-semi, right-anti joins: update has-match flags for the rows - // in hash table. - // if (join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { - for (int i = 0; i < num_passing_ids; ++i) { - uint16_t id = materialize_batch_ids_buf.mutable_data()[i]; - key_ids_buf.mutable_data()[i] = key_ids_buf.mutable_data()[id]; - } + // For right-semi, right-anti joins: update has-match flags for the rows + // in hash table. hash_table_->UpdateHasMatchForKeys(thread_id, num_passing_ids, - key_ids_buf.mutable_data()); + materialize_key_ids_buf.mutable_data()); } else { // For left-semi, left-anti joins: call materialize using match - // bit-vector. + // row ids. 
// - - // Add base batch row index. - // - for (int i = 0; i < num_passing_ids; ++i) { - materialize_batch_ids_buf.mutable_data()[i] += - static_cast(minibatch_start); - } - RETURN_NOT_OK(materialize_[thread_id]->AppendProbeOnly( keypayload_batch, num_passing_ids, materialize_batch_ids_buf.mutable_data(), [&](ExecBatch batch) { @@ -1972,6 +2082,15 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, materialize_batch_ids_buf.mutable_data(), materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data())) { + if (residual_filter_) { + RETURN_NOT_OK(residual_filter_->FilterMatchRowIds( + keypayload_batch, num_matches_next, + materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data(), true, + !(no_duplicate_keys || no_payload_columns), temp_stack, &num_matches_next)); + // TODO: Index to bit vector. + } const uint16_t* materialize_batch_ids = materialize_batch_ids_buf.mutable_data(); const uint32_t* materialize_key_ids = materialize_key_ids_buf.mutable_data(); const uint32_t* materialize_payload_ids = @@ -2003,6 +2122,9 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, // the other side of the join. // if (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER) { + if (residual_filter_) { + // TODO: and match bit vector. 
+ } int num_passing_ids = 0; arrow::util::bit_util::bits_to_indexes( /*bit_to_search=*/0, hardware_flags, minibatch_size_next, diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 6403b7a655e..76b6ea6e97d 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -736,6 +736,33 @@ class JoinMatchIterator { int current_match_for_row_; }; +class JoinResidualFilter { + public: + Status FilterMatchBitVector(const ExecBatch& keypayload_batch, int batch_start_row, + int num_batch_rows, int bit_match, + const uint8_t* match_bitvector, const uint32_t* key_ids, + bool no_duplicate_keys, + arrow::util::TempVectorStack* temp_stack, + int* num_passing_ids, uint16_t* passing_batch_row_ids, + uint32_t* passing_key_ids_maybe_null); + + Status FilterMatchRowIds(const ExecBatch& keypayload_batch, int num_batch_rows, + uint16_t* batch_row_ids, uint32_t* key_ids, + uint32_t* payload_ids, bool output_key_ids, + bool output_payload_ids, + arrow::util::TempVectorStack* temp_stack, + int* num_passing_rows); + + private: + Result EvalFilter() { return Datum(); } + + private: + // int64_t hardware_flags_; + int minibatch_size_; + Expression filter_; + const uint32_t* key_to_payload_; +}; + // Implements entire processing of a probe side exec batch, // provided the join hash table is already built and available. 
// @@ -760,6 +787,7 @@ class JoinProbeProcessor { JoinType join_type_; SwissTableForJoin* hash_table_; + JoinResidualFilter* residual_filter_; // One element per thread // std::vector materialize_; From cccf9e16e2dd0e2811ddf3550a78fee3f9e216ca Mon Sep 17 00:00:00 2001 From: zanmato Date: Mon, 1 Jan 2024 20:06:08 -0800 Subject: [PATCH 02/33] Implement materialize and evaluation for residual filter for swiss join --- cpp/src/arrow/acero/swiss_join.cc | 120 ++++++++++++++++++++-- cpp/src/arrow/acero/swiss_join_internal.h | 36 +++++-- 2 files changed, 140 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 6b4aecf1f4c..c730767bb0c 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1860,12 +1860,39 @@ Status JoinResidualFilter::FilterMatchBitVector( int bit_match, const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack, int* num_passing_ids, uint16_t* passing_batch_row_ids, - uint32_t* passing_key_ids_maybe_null) { + uint32_t* passing_key_ids_maybe_null) const { ARROW_DCHECK(filter_ != literal(true)); *num_passing_ids = 0; if (filter_.IsNullLiteral() || filter_ == literal(false)) { return Status::OK(); } + + if (build_filter_to_key_.empty() && build_filter_to_payload_.empty()) { + arrow::util::bit_util::bits_to_indexes(bit_match, hardware_flags_, num_batch_rows, + match_bitvector, num_passing_ids, + passing_batch_row_ids); + if (passing_key_ids_maybe_null) { + // Collect key ids of passing rows. + // + for (int i = 0; i < *num_passing_ids; ++i) { + uint16_t id = passing_batch_row_ids[i]; + passing_key_ids_maybe_null[i] = key_ids[id]; + } + } + + // Add base batch row index. 
+ // + for (int i = 0; i < *num_passing_ids; ++i) { + passing_batch_row_ids[i] += static_cast(batch_start_row); + } + + RETURN_NOT_OK(FilterMatchRowIds(keypayload_batch, *num_passing_ids, + passing_batch_row_ids, passing_key_ids_maybe_null, + NULLPTR, passing_key_ids_maybe_null, false, + temp_stack, num_passing_ids)); + return Status::OK(); + } + auto materialize_batch_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); auto materialize_key_ids_buf = @@ -1907,16 +1934,23 @@ Status JoinResidualFilter::FilterMatchBitVector( Status JoinResidualFilter::FilterMatchRowIds(const ExecBatch& keypayload_batch, int num_batch_rows, uint16_t* batch_row_ids, - uint32_t* key_ids, uint32_t* payload_ids, + uint32_t* key_ids_maybe_null, + uint32_t* payload_ids_maybe_null, bool output_key_ids, bool output_payload_ids, arrow::util::TempVectorStack* temp_stack, - int* num_passing_rows) { + int* num_passing_rows) const { ARROW_DCHECK(filter_ != literal(true)); + ARROW_DCHECK(!output_key_ids || key_ids_maybe_null); + ARROW_DCHECK(!output_payload_ids || payload_ids_maybe_null); + *num_passing_rows = 0; if (filter_.IsNullLiteral() || filter_ == literal(false)) { return Status::OK(); } - ARROW_ASSIGN_OR_RAISE(Datum mask, EvalFilter()); + + ARROW_ASSIGN_OR_RAISE(Datum mask, + EvalFilter(keypayload_batch, num_batch_rows, batch_row_ids, + key_ids_maybe_null, payload_ids_maybe_null)); if (mask.is_scalar()) { const auto& mask_scalar = mask.scalar_as(); if (mask_scalar.is_valid && mask_scalar.value) { @@ -1926,6 +1960,7 @@ Status JoinResidualFilter::FilterMatchRowIds(const ExecBatch& keypayload_batch, return Status::OK(); } } + ARROW_DCHECK_EQ(mask.array()->offset, 0); ARROW_DCHECK_EQ(mask.array()->length, static_cast(num_batch_rows)); const uint8_t* validity = @@ -1937,10 +1972,10 @@ Status JoinResidualFilter::FilterMatchRowIds(const ExecBatch& keypayload_batch, if (is_valid && is_cmp_true) { batch_row_ids[*num_passing_rows] = batch_row_ids[irow]; if (output_key_ids) { - 
key_ids[*num_passing_rows] = key_ids[irow]; + key_ids_maybe_null[*num_passing_rows] = key_ids_maybe_null[irow]; } if (output_payload_ids) { - payload_ids[*num_passing_rows] = payload_ids[irow]; + payload_ids_maybe_null[*num_passing_rows] = payload_ids_maybe_null[irow]; } ++(*num_passing_rows); } @@ -1948,6 +1983,71 @@ Status JoinResidualFilter::FilterMatchRowIds(const ExecBatch& keypayload_batch, return Status::OK(); } +Result JoinResidualFilter::EvalFilter( + const ExecBatch& keypayload_batch, int num_batch_rows, const uint16_t* batch_row_ids, + const uint32_t* key_ids_maybe_null, const uint32_t* payload_ids_maybe_null) const { + ARROW_DCHECK(!filter_.IsNullLiteral() && filter_ != literal(true) && + filter_ != literal(false)); + + ARROW_ASSIGN_OR_RAISE( + ExecBatch input, + MaterializeFilterInput(keypayload_batch, num_batch_rows, batch_row_ids, + key_ids_maybe_null, payload_ids_maybe_null)); + return ExecuteScalarExpression(filter_, input, ctx_->exec_context()); +} + +Result JoinResidualFilter::MaterializeFilterInput( + const ExecBatch& keypayload_batch, int num_batch_rows, const uint16_t* batch_row_ids, + const uint32_t* key_ids_maybe_null, const uint32_t* payload_ids_maybe_null) const { + ExecBatch out; + out.length = num_batch_rows; + out.values.resize(probe_filter_to_key_and_payload_.size() + + build_filter_to_key_.size() + build_filter_to_payload_.size()); + + if (probe_filter_to_key_and_payload_.size() > 0) { + ExecBatchBuilder probe_batch_builder; + RETURN_NOT_OK(probe_batch_builder.AppendSelected( + pool_, keypayload_batch, num_batch_rows, batch_row_ids, + static_cast(probe_filter_to_key_and_payload_.size()), + probe_filter_to_key_and_payload_.data())); + ExecBatch probe_batch = probe_batch_builder.Flush(); + ARROW_DCHECK(probe_batch.values.size() == probe_filter_to_key_and_payload_.size()); + for (size_t i = 0; i < probe_batch.values.size(); ++i) { + out.values[i] = std::move(probe_batch.values[i]); + } + } + + if (build_filter_to_key_.size() > 0) { + 
ARROW_DCHECK(key_ids_maybe_null); + for (size_t i = 0; i < build_filter_to_key_.size(); ++i) { + int key_idx = build_filter_to_key_[i]; + ResizableArrayData build_key; + build_key.Init(build_schemas_->data_type(HashJoinProjection::KEY, key_idx), pool_, + bit_util::Log2(num_batch_rows)); + RETURN_NOT_OK(build_keys_->DecodeSelected(&build_key, key_idx, num_batch_rows, + key_ids_maybe_null, pool_)); + out.values[probe_filter_to_key_and_payload_.size() + i] = build_key.array_data(); + } + } + + if (build_filter_to_payload_.size() > 0) { + ARROW_DCHECK(payload_ids_maybe_null); + for (size_t i = 0; i < build_filter_to_payload_.size(); ++i) { + int payload_idx = build_filter_to_payload_[i]; + ResizableArrayData build_payload; + build_payload.Init( + build_schemas_->data_type(HashJoinProjection::PAYLOAD, payload_idx), pool_, + bit_util::Log2(num_batch_rows)); + RETURN_NOT_OK(build_payloads_->DecodeSelected( + &build_payload, payload_idx, num_batch_rows, payload_ids_maybe_null, pool_)); + out.values[probe_filter_to_key_and_payload_.size() + build_filter_to_key_.size() + + i] = build_payload.array_data(); + } + } + + return out; +} + void JoinProbeProcessor::Init(int num_key_columns, JoinType join_type, SwissTableForJoin* hash_table, std::vector materialize, @@ -2019,7 +2119,7 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { int num_passing_ids = 0; int bit_match = join_type_ == JoinType::LEFT_ANTI ? 
0 : 1; - if (!residual_filter_) { + if (residual_filter_->IsTrivial()) { arrow::util::bit_util::bits_to_indexes( bit_match, hardware_flags, minibatch_size_next, match_bitvector_buf.mutable_data(), &num_passing_ids, @@ -2082,7 +2182,7 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, materialize_batch_ids_buf.mutable_data(), materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data())) { - if (residual_filter_) { + if (!residual_filter_->IsTrivial()) { RETURN_NOT_OK(residual_filter_->FilterMatchRowIds( keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), @@ -2302,7 +2402,8 @@ class SwissJoin : public HashJoinImpl { // const HashJoinProjectionMaps* schema = schema_[1]; bool reject_duplicate_keys = - join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI; + (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI) && + (schema->is_empty(HashJoinProjection::FILTER) == 0); bool no_payload = reject_duplicate_keys || schema->num_cols(HashJoinProjection::PAYLOAD) == 0; @@ -2646,6 +2747,7 @@ class SwissJoin : public HashJoinImpl { SwissTableForJoin hash_table_; JoinProbeProcessor probe_processor_; + JoinResidualFilter residual_filter_; SwissTableForJoinBuild hash_table_build_; AccumulationQueue build_side_batches_; diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 76b6ea6e97d..4d16b631070 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -738,29 +738,51 @@ class JoinMatchIterator { class JoinResidualFilter { public: + bool IsTrivial() const { return filter_ == literal(true); } + Status FilterMatchBitVector(const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, int bit_match, const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack, int* num_passing_ids, uint16_t* passing_batch_row_ids, - 
uint32_t* passing_key_ids_maybe_null); + uint32_t* passing_key_ids_maybe_null) const; Status FilterMatchRowIds(const ExecBatch& keypayload_batch, int num_batch_rows, - uint16_t* batch_row_ids, uint32_t* key_ids, - uint32_t* payload_ids, bool output_key_ids, + uint16_t* batch_row_ids, uint32_t* key_ids_maybe_null, + uint32_t* payload_ids_maybe_null, bool output_key_ids, bool output_payload_ids, arrow::util::TempVectorStack* temp_stack, - int* num_passing_rows); + int* num_passing_rows) const; private: - Result EvalFilter() { return Datum(); } + Result EvalFilter(const ExecBatch& keypayload_batch, int num_batch_rows, + const uint16_t* batch_row_ids, + const uint32_t* key_ids_maybe_null, + const uint32_t* payload_ids_maybe_null) const; + + Result MaterializeFilterInput(const ExecBatch& keypayload_batch, + int num_batch_rows, + const uint16_t* batch_row_ids, + const uint32_t* key_ids_maybe_null, + const uint32_t* payload_ids_maybe_null) const; private: - // int64_t hardware_flags_; + QueryContext* ctx_; + int64_t hardware_flags_; + MemoryPool* pool_; int minibatch_size_; + // const HashJoinProjectionMaps* probe_schemas_; + const HashJoinProjectionMaps* build_schemas_; Expression filter_; + + std::vector probe_filter_to_key_and_payload_; + std::vector build_filter_to_key_; + std::vector build_filter_to_payload_; + const uint32_t* key_to_payload_; + const RowArray* build_keys_; + const RowArray* build_payloads_; }; // Implements entire processing of a probe side exec batch, @@ -787,7 +809,7 @@ class JoinProbeProcessor { JoinType join_type_; SwissTableForJoin* hash_table_; - JoinResidualFilter* residual_filter_; + const JoinResidualFilter* residual_filter_; // One element per thread // std::vector materialize_; From 8bfb8d7ee9d4b2f1901bb5bcd4e2717f191bb284 Mon Sep 17 00:00:00 2001 From: zanmato Date: Tue, 2 Jan 2024 11:11:07 -0800 Subject: [PATCH 03/33] Finish impl --- cpp/src/arrow/acero/hash_join_node.cc | 6 +- cpp/src/arrow/acero/swiss_join.cc | 131 
+++++++++++++++++----- cpp/src/arrow/acero/swiss_join_internal.h | 26 +++-- 3 files changed, 122 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index 254dad361ff..c0179fd160e 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -740,13 +740,11 @@ class HashJoinNode : public ExecNode, public TracedNode { // Create hash join implementation object // SwissJoin does not support: // a) 64-bit string offsets - // b) residual predicates - // c) dictionaries + // b) dictionaries // bool use_swiss_join; #if ARROW_LITTLE_ENDIAN - use_swiss_join = (filter == literal(true)) && !schema_mgr->HasDictionaries() && - !schema_mgr->HasLargeBinary(); + use_swiss_join = !schema_mgr->HasDictionaries() && !schema_mgr->HasLargeBinary(); #else use_swiss_join = false; #endif diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index c730767bb0c..ad8e44c585e 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1855,6 +1855,68 @@ bool JoinMatchIterator::GetNextBatch(int num_rows_max, int* out_num_rows, return (*out_num_rows) > 0; } +void JoinResidualFilter::Init(Expression filter, int minibatch_size, QueryContext* ctx, + MemoryPool* pool, int64_t hardware_flags, + const HashJoinProjectionMaps* probe_schemas, + const HashJoinProjectionMaps* build_schemas) { + filter_ = std::move(filter); + if (filter_ == literal(true)) { + return; + } + + minibatch_size_ = minibatch_size; + ctx_ = ctx; + pool_ = pool; + hardware_flags_ = hardware_flags; + probe_schemas_ = probe_schemas; + build_schemas_ = build_schemas; + + { + probe_filter_to_key_and_payload_.resize( + probe_schemas_->num_cols(HashJoinProjection::FILTER)); + int num_key_cols = probe_schemas_->num_cols(HashJoinProjection::KEY); + auto to_key = + probe_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::KEY); + auto to_payload = + 
probe_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::PAYLOAD); + for (int i = 0; static_cast(i) < probe_filter_to_key_and_payload_.size(); + ++i) { + if (auto idx = to_key.get(i); idx != SchemaProjectionMap::kMissingField) { + probe_filter_to_key_and_payload_[i] = idx; + } else if (idx = to_payload.get(i); idx != SchemaProjectionMap::kMissingField) { + probe_filter_to_key_and_payload_[i] = idx + num_key_cols; + } else { + DCHECK(false); + } + } + } + + { + int num_columns = build_schemas_->num_cols(HashJoinProjection::FILTER); + auto to_key = + build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::KEY); + auto to_payload = + build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::PAYLOAD); + for (int i = 0; i < num_columns; ++i) { + if (to_key.get(i) != SchemaProjectionMap::kMissingField) { + num_build_keys_referred_++; + } else if (to_payload.get(i) != SchemaProjectionMap::kMissingField) { + num_build_payloads_referred_++; + } else { + DCHECK(false); + } + } + } +} + +void JoinResidualFilter::SetBuildSide(const RowArray* build_keys, + const RowArray* build_payloads, + const uint32_t* key_to_payload) { + build_keys_ = build_keys; + build_payloads_ = build_payloads; + key_to_payload_ = key_to_payload; +} + Status JoinResidualFilter::FilterMatchBitVector( const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, int bit_match, const uint8_t* match_bitvector, const uint32_t* key_ids, @@ -1867,7 +1929,10 @@ Status JoinResidualFilter::FilterMatchBitVector( return Status::OK(); } - if (build_filter_to_key_.empty() && build_filter_to_payload_.empty()) { + if (num_build_keys_referred_ == 0 && num_build_payloads_referred_ == 0) { + // If filter refers no column in right table, + // TODO + // arrow::util::bit_util::bits_to_indexes(bit_match, hardware_flags_, num_batch_rows, match_bitvector, num_passing_ids, passing_batch_row_ids); @@ -2001,8 +2066,8 @@ Result JoinResidualFilter::MaterializeFilterInput( const 
uint32_t* key_ids_maybe_null, const uint32_t* payload_ids_maybe_null) const { ExecBatch out; out.length = num_batch_rows; - out.values.resize(probe_filter_to_key_and_payload_.size() + - build_filter_to_key_.size() + build_filter_to_payload_.size()); + out.values.resize(probe_filter_to_key_and_payload_.size() + num_build_keys_referred_ + + num_build_payloads_referred_); if (probe_filter_to_key_and_payload_.size() > 0) { ExecBatchBuilder probe_batch_builder; @@ -2017,31 +2082,29 @@ Result JoinResidualFilter::MaterializeFilterInput( } } - if (build_filter_to_key_.size() > 0) { - ARROW_DCHECK(key_ids_maybe_null); - for (size_t i = 0; i < build_filter_to_key_.size(); ++i) { - int key_idx = build_filter_to_key_[i]; - ResizableArrayData build_key; - build_key.Init(build_schemas_->data_type(HashJoinProjection::KEY, key_idx), pool_, - bit_util::Log2(num_batch_rows)); - RETURN_NOT_OK(build_keys_->DecodeSelected(&build_key, key_idx, num_batch_rows, - key_ids_maybe_null, pool_)); - out.values[probe_filter_to_key_and_payload_.size() + i] = build_key.array_data(); - } - } - - if (build_filter_to_payload_.size() > 0) { - ARROW_DCHECK(payload_ids_maybe_null); - for (size_t i = 0; i < build_filter_to_payload_.size(); ++i) { - int payload_idx = build_filter_to_payload_[i]; - ResizableArrayData build_payload; - build_payload.Init( - build_schemas_->data_type(HashJoinProjection::PAYLOAD, payload_idx), pool_, - bit_util::Log2(num_batch_rows)); - RETURN_NOT_OK(build_payloads_->DecodeSelected( - &build_payload, payload_idx, num_batch_rows, payload_ids_maybe_null, pool_)); - out.values[probe_filter_to_key_and_payload_.size() + build_filter_to_key_.size() + - i] = build_payload.array_data(); + if (num_build_keys_referred_ > 0 || num_build_payloads_referred_ > 0) { + ARROW_DCHECK(num_build_keys_referred_ == 0 || key_ids_maybe_null); + ARROW_DCHECK(num_build_payloads_referred_ == 0 || payload_ids_maybe_null); + + int num_build_cols = build_schemas_->num_cols(HashJoinProjection::FILTER); + 
auto to_key = + build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::KEY); + auto to_payload = + build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::PAYLOAD); + for (int i = 0; i < num_build_cols; ++i) { + ResizableArrayData column_data; + column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i), pool_, + bit_util::Log2(num_batch_rows)); + if (auto idx = to_key.get(i); idx != SchemaProjectionMap::kMissingField) { + RETURN_NOT_OK(build_keys_->DecodeSelected(&column_data, idx, num_batch_rows, + key_ids_maybe_null, pool_)); + } else if (idx = to_payload.get(i); idx != SchemaProjectionMap::kMissingField) { + RETURN_NOT_OK(build_payloads_->DecodeSelected(&column_data, idx, num_batch_rows, + payload_ids_maybe_null, pool_)); + } else { + ARROW_DCHECK(false); + } + out.values[probe_filter_to_key_and_payload_.size() + i] = column_data.array_data(); } } @@ -2050,12 +2113,14 @@ Result JoinResidualFilter::MaterializeFilterInput( void JoinProbeProcessor::Init(int num_key_columns, JoinType join_type, SwissTableForJoin* hash_table, + JoinResidualFilter* residual_filter, std::vector materialize, const std::vector* cmp, OutputBatchFn output_batch_fn) { num_key_columns_ = num_key_columns; join_type_ = join_type; hash_table_ = hash_table; + residual_filter_ = residual_filter; materialize_.resize(materialize.size()); for (size_t i = 0; i < materialize.size(); ++i) { materialize_[i] = materialize[i]; @@ -2321,8 +2386,13 @@ class SwissJoin : public HashJoinImpl { materialize[i] = &local_states_[i].materialize; } + int minibatch_size = hash_table_.keys()->swiss_table()->minibatch_size(); + residual_filter_.Init(std::move(filter), minibatch_size, ctx_, pool_, hardware_flags_, + proj_map_left, proj_map_right); + probe_processor_.Init(proj_map_left->num_cols(HashJoinProjection::KEY), join_type_, - &hash_table_, materialize, &key_cmp_, output_batch_callback_); + &hash_table_, &residual_filter_, materialize, &key_cmp_, + output_batch_callback_); 
InitTaskGroups(); @@ -2525,6 +2595,9 @@ class SwissJoin : public HashJoinImpl { } hash_table_ready_.store(true); + residual_filter_.SetBuildSide(hash_table_.keys()->keys(), hash_table_.payloads(), + hash_table_.key_to_payload()); + return build_finished_callback_(thread_id); } diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 4d16b631070..6bcddfca9d8 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -738,6 +738,13 @@ class JoinMatchIterator { class JoinResidualFilter { public: + void Init(Expression filter, int minibatch_size, QueryContext* ctx, MemoryPool* pool, + int64_t hardware_flags, const HashJoinProjectionMaps* probe_schemas, + const HashJoinProjectionMaps* build_schemas); + + void SetBuildSide(const RowArray* build_keys, const RowArray* build_payloads, + const uint32_t* key_to_payload); + bool IsTrivial() const { return filter_ == literal(true); } Status FilterMatchBitVector(const ExecBatch& keypayload_batch, int batch_start_row, @@ -768,21 +775,23 @@ class JoinResidualFilter { const uint32_t* payload_ids_maybe_null) const; private: + Expression filter_; + int minibatch_size_; + QueryContext* ctx_; - int64_t hardware_flags_; MemoryPool* pool_; - int minibatch_size_; - // const HashJoinProjectionMaps* probe_schemas_; + int64_t hardware_flags_; + + const HashJoinProjectionMaps* probe_schemas_; const HashJoinProjectionMaps* build_schemas_; - Expression filter_; std::vector probe_filter_to_key_and_payload_; - std::vector build_filter_to_key_; - std::vector build_filter_to_payload_; + int num_build_keys_referred_ = 0; + int num_build_payloads_referred_ = 0; - const uint32_t* key_to_payload_; const RowArray* build_keys_; const RowArray* build_payloads_; + const uint32_t* key_to_payload_; }; // Implements entire processing of a probe side exec batch, @@ -793,6 +802,7 @@ class JoinProbeProcessor { using OutputBatchFn = std::function; void Init(int 
num_key_columns, JoinType join_type, SwissTableForJoin* hash_table, + JoinResidualFilter* residual_filter, std::vector materialize, const std::vector* cmp, OutputBatchFn output_batch_fn); Status OnNextBatch(int64_t thread_id, const ExecBatch& keypayload_batch, @@ -809,7 +819,7 @@ class JoinProbeProcessor { JoinType join_type_; SwissTableForJoin* hash_table_; - const JoinResidualFilter* residual_filter_; + JoinResidualFilter* residual_filter_; // One element per thread // std::vector materialize_; From 4322db59d7c811b2d4b0f659bbcb9db0595ef9a0 Mon Sep 17 00:00:00 2001 From: zanmato Date: Tue, 2 Jan 2024 14:36:34 -0800 Subject: [PATCH 04/33] Init residual filter in probe processor --- cpp/src/arrow/acero/swiss_join.cc | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index ad8e44c585e..b1937407252 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1930,8 +1930,8 @@ Status JoinResidualFilter::FilterMatchBitVector( } if (num_build_keys_referred_ == 0 && num_build_payloads_referred_ == 0) { - // If filter refers no column in right table, - // TODO + // If filter refers no column in the right table, then we can directly filter on the + // left rows without inner matching and materializing the right rows. 
// arrow::util::bit_util::bits_to_indexes(bit_match, hardware_flags_, num_batch_rows, match_bitvector, num_passing_ids, @@ -1951,10 +1951,11 @@ Status JoinResidualFilter::FilterMatchBitVector( passing_batch_row_ids[i] += static_cast(batch_start_row); } - RETURN_NOT_OK(FilterMatchRowIds(keypayload_batch, *num_passing_ids, - passing_batch_row_ids, passing_key_ids_maybe_null, - NULLPTR, passing_key_ids_maybe_null, false, - temp_stack, num_passing_ids)); + RETURN_NOT_OK( + FilterMatchRowIds(keypayload_batch, *num_passing_ids, passing_batch_row_ids, + passing_key_ids_maybe_null, /*payload_ids_maybe_null=*/NULLPTR, + passing_key_ids_maybe_null, + /*output_payload_ids=*/false, temp_stack, num_passing_ids)); return Status::OK(); } @@ -1969,6 +1970,7 @@ Status JoinResidualFilter::FilterMatchBitVector( match_iterator.SetLookupResult(num_batch_rows, batch_start_row, match_bitvector, key_ids, no_duplicate_keys, key_to_payload_); int num_matches_next = 0; + // Last row id passing the filter, used to filter out duplicate rows. 
uint32_t row_id_last = std::numeric_limits::max() + 1; while (match_iterator.GetNextBatch(minibatch_size_, &num_matches_next, materialize_batch_ids_buf.mutable_data(), @@ -1978,8 +1980,8 @@ Status JoinResidualFilter::FilterMatchBitVector( RETURN_NOT_OK(FilterMatchRowIds( keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data(), passing_key_ids_maybe_null, false, - temp_stack, &num_filtered)); + materialize_payload_ids_buf.mutable_data(), passing_key_ids_maybe_null, + /*output_payload_ids=*/false, temp_stack, &num_filtered)); for (int ifiltered = 0; ifiltered < num_filtered; ++ifiltered) { if (materialize_batch_ids_buf.mutable_data()[ifiltered] == row_id_last) { continue; @@ -2252,7 +2254,7 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data(), true, + materialize_payload_ids_buf.mutable_data(), /*output_payload_ids=*/true, !(no_duplicate_keys || no_payload_columns), temp_stack, &num_matches_next)); // TODO: Index to bit vector. 
} From f30a382e1bc0202fcfa36eab36042a65d674b6d9 Mon Sep 17 00:00:00 2001 From: zanmato Date: Wed, 3 Jan 2024 10:09:43 -0800 Subject: [PATCH 05/33] Add match bitvector update for left joins --- cpp/src/arrow/acero/swiss_join.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index b1937407252..6b29448cea5 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2256,7 +2256,12 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data(), /*output_payload_ids=*/true, !(no_duplicate_keys || no_payload_columns), temp_stack, &num_matches_next)); - // TODO: Index to bit vector. + std::memset(filtered_bitvector_buf.mutable_data(), 0, + bit_util::BytesForBits(minibatch_size_next)); + for (int i = 0; i < num_matches_next; ++i) { + int bit_idx = materialize_batch_ids_buf.mutable_data()[i] - minibatch_start; + bit_util::SetBitTo(filtered_bitvector_buf.mutable_data(), bit_idx, 1); + } } const uint16_t* materialize_batch_ids = materialize_batch_ids_buf.mutable_data(); const uint32_t* materialize_key_ids = materialize_key_ids_buf.mutable_data(); @@ -2289,8 +2294,11 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, // the other side of the join. // if (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER) { - if (residual_filter_) { - // TODO: and match bit vector. 
+ if (!residual_filter_->IsTrivial()) { + arrow::internal::BitmapAnd(match_bitvector_buf.mutable_data(), 0, + filtered_bitvector_buf.mutable_data(), 0, + minibatch_size_next, 0, + match_bitvector_buf.mutable_data()); } int num_passing_ids = 0; arrow::util::bit_util::bits_to_indexes( From ec304e45b88635f87c7a9939f1923d47af3207aa Mon Sep 17 00:00:00 2001 From: zanmato Date: Wed, 3 Jan 2024 10:50:40 -0800 Subject: [PATCH 06/33] Bug fix --- cpp/src/arrow/acero/swiss_join.cc | 5 +++-- cpp/src/arrow/acero/swiss_join_internal.h | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 6b29448cea5..786fc42f72f 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2483,9 +2483,10 @@ class SwissJoin : public HashJoinImpl { const HashJoinProjectionMaps* schema = schema_[1]; bool reject_duplicate_keys = (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI) && - (schema->is_empty(HashJoinProjection::FILTER) == 0); + residual_filter_.NumBuildPayloadsReferred() == 0; bool no_payload = - reject_duplicate_keys || schema->num_cols(HashJoinProjection::PAYLOAD) == 0; + reject_duplicate_keys || (schema->num_cols(HashJoinProjection::PAYLOAD) == 0 && + residual_filter_.NumBuildPayloadsReferred() == 0); std::vector key_types; for (int i = 0; i < schema->num_cols(HashJoinProjection::KEY); ++i) { diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 6bcddfca9d8..bb382c37548 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -746,6 +746,8 @@ class JoinResidualFilter { const uint32_t* key_to_payload); bool IsTrivial() const { return filter_ == literal(true); } + int NumBuildKeysReferred() const { return num_build_keys_referred_; } + int NumBuildPayloadsReferred() const { return num_build_payloads_referred_; } Status FilterMatchBitVector(const 
ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, int bit_match, From 2e496db829279f98abea5d96ba99e99a238a9e84 Mon Sep 17 00:00:00 2001 From: zanmato Date: Wed, 3 Jan 2024 11:39:09 -0800 Subject: [PATCH 07/33] Fix --- cpp/src/arrow/acero/swiss_join.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 786fc42f72f..3b012cc9da9 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2185,7 +2185,7 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI || join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { int num_passing_ids = 0; - int bit_match = join_type_ == JoinType::LEFT_ANTI ? 0 : 1; + int bit_match = (join_type_ == JoinType::LEFT_ANTI) ? 0 : 1; if (residual_filter_->IsTrivial()) { arrow::util::bit_util::bits_to_indexes( bit_match, hardware_flags, minibatch_size_next, From cbf1b5856ceeee258dde485a88938abc85adefc9 Mon Sep 17 00:00:00 2001 From: zanmato Date: Wed, 3 Jan 2024 18:08:37 -0800 Subject: [PATCH 08/33] Fix filter bitvector init timing --- cpp/src/arrow/acero/CMakeLists.txt | 7 +++++-- cpp/src/arrow/acero/swiss_join.cc | 6 ++++-- cpp/src/arrow/acero/swiss_join_test.cc | 26 ++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 cpp/src/arrow/acero/swiss_join_test.cc diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index b77d52a23ee..1889f65632b 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -170,8 +170,11 @@ add_arrow_acero_test(plan_test add_arrow_acero_test(source_node_test SOURCES source_node_test.cc test_nodes.cc) add_arrow_acero_test(fetch_node_test SOURCES fetch_node_test.cc test_nodes.cc) add_arrow_acero_test(order_by_node_test SOURCES order_by_node_test.cc test_nodes.cc) 
-add_arrow_acero_test(hash_join_node_test SOURCES hash_join_node_test.cc - bloom_filter_test.cc) +add_arrow_acero_test(hash_join_node_test + SOURCES + hash_join_node_test.cc + bloom_filter_test.cc + swiss_join_test.cc) add_arrow_acero_test(pivot_longer_node_test SOURCES pivot_longer_node_test.cc test_nodes.cc) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 3b012cc9da9..1c3a7b22756 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2244,6 +2244,10 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, match_iterator.SetLookupResult( minibatch_size_next, minibatch_start, match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), no_duplicate_keys, hash_table_->key_to_payload()); + if (!residual_filter_->IsTrivial()) { + std::memset(filtered_bitvector_buf.mutable_data(), 0, + bit_util::BytesForBits(minibatch_size_next)); + } int num_matches_next; while (match_iterator.GetNextBatch(minibatch_size, &num_matches_next, materialize_batch_ids_buf.mutable_data(), @@ -2256,8 +2260,6 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data(), /*output_payload_ids=*/true, !(no_duplicate_keys || no_payload_columns), temp_stack, &num_matches_next)); - std::memset(filtered_bitvector_buf.mutable_data(), 0, - bit_util::BytesForBits(minibatch_size_next)); for (int i = 0; i < num_matches_next; ++i) { int bit_idx = materialize_batch_ids_buf.mutable_data()[i] - minibatch_start; bit_util::SetBitTo(filtered_bitvector_buf.mutable_data(), bit_idx, 1); diff --git a/cpp/src/arrow/acero/swiss_join_test.cc b/cpp/src/arrow/acero/swiss_join_test.cc new file mode 100644 index 00000000000..e51af2d1594 --- /dev/null +++ b/cpp/src/arrow/acero/swiss_join_test.cc @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +namespace arrow { +namespace acero { + +TEST(SwissJoin, ResidualFilter) {} + +} // namespace acero +} // namespace arrow \ No newline at end of file From 5cf8edf06dadcaa9e10d0dde942281cf6b201ab0 Mon Sep 17 00:00:00 2001 From: zanmato Date: Wed, 3 Jan 2024 18:24:39 -0800 Subject: [PATCH 09/33] Refine --- cpp/src/arrow/acero/swiss_join.cc | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 1c3a7b22756..0002c0db659 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2153,8 +2153,6 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, auto hashes_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto match_bitvector_buf = arrow::util::TempVectorHolder( temp_stack, static_cast(bit_util::BytesForBits(minibatch_size))); - auto filtered_bitvector_buf = arrow::util::TempVectorHolder( - temp_stack, static_cast(bit_util::BytesForBits(minibatch_size))); auto key_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto materialize_batch_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); @@ -2162,6 +2160,8 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, 
arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto materialize_payload_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); + auto filtered_bitvector_buf = arrow::util::TempVectorHolder( + temp_stack, static_cast(bit_util::BytesForBits(minibatch_size))); for (int minibatch_start = 0; minibatch_start < num_rows;) { uint32_t minibatch_size_next = std::min(minibatch_size, num_rows - minibatch_start); @@ -2296,17 +2296,13 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, // the other side of the join. // if (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER) { - if (!residual_filter_->IsTrivial()) { - arrow::internal::BitmapAnd(match_bitvector_buf.mutable_data(), 0, - filtered_bitvector_buf.mutable_data(), 0, - minibatch_size_next, 0, - match_bitvector_buf.mutable_data()); - } int num_passing_ids = 0; + const uint8_t* match_bitvector = residual_filter_->IsTrivial() + ? match_bitvector_buf.mutable_data() + : filtered_bitvector_buf.mutable_data(); arrow::util::bit_util::bits_to_indexes( - /*bit_to_search=*/0, hardware_flags, minibatch_size_next, - match_bitvector_buf.mutable_data(), &num_passing_ids, - materialize_batch_ids_buf.mutable_data()); + /*bit_to_search=*/0, hardware_flags, minibatch_size_next, match_bitvector, + &num_passing_ids, materialize_batch_ids_buf.mutable_data()); // Add base batch row index. 
// From 8fc9f994524d0a5fca913151a438f5101d23dbff Mon Sep 17 00:00:00 2001 From: zanmato Date: Thu, 4 Jan 2024 14:21:08 -0800 Subject: [PATCH 10/33] Refine --- cpp/src/arrow/acero/swiss_join.cc | 41 ++++++++++++++--------- cpp/src/arrow/acero/swiss_join_internal.h | 2 +- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 0002c0db659..ddd4aebe8bd 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1789,9 +1789,9 @@ void JoinMatchIterator::SetLookupResult(int num_batch_rows, int start_batch_row, current_match_for_row_ = 0; } -bool JoinMatchIterator::GetNextBatch(int num_rows_max, int* out_num_rows, - uint16_t* batch_row_ids, uint32_t* key_ids, - uint32_t* payload_ids) { +bool JoinMatchIterator::GetNextBatch(int num_rows_max, uint32_t batch_row_id_to_skip, + int* out_num_rows, uint16_t* batch_row_ids, + uint32_t* key_ids, uint32_t* payload_ids) { *out_num_rows = 0; if (no_duplicate_keys_) { @@ -1816,7 +1816,9 @@ bool JoinMatchIterator::GetNextBatch(int num_rows_max, int* out_num_rows, // matches to output. // while (current_row_ < num_batch_rows_ && *out_num_rows < num_rows_max) { - if (!bit_util::GetBit(batch_has_match_, current_row_)) { + if (!bit_util::GetBit(batch_has_match_, current_row_) || + static_cast(start_batch_row_ + current_row_) == + batch_row_id_to_skip) { ++current_row_; current_match_for_row_ = 0; continue; @@ -1924,6 +1926,7 @@ Status JoinResidualFilter::FilterMatchBitVector( int* num_passing_ids, uint16_t* passing_batch_row_ids, uint32_t* passing_key_ids_maybe_null) const { ARROW_DCHECK(filter_ != literal(true)); + *num_passing_ids = 0; if (filter_.IsNullLiteral() || filter_ == literal(false)) { return Status::OK(); @@ -1972,7 +1975,7 @@ Status JoinResidualFilter::FilterMatchBitVector( int num_matches_next = 0; // Last row id passing the filter, used to filter out duplicate rows. 
uint32_t row_id_last = std::numeric_limits::max() + 1; - while (match_iterator.GetNextBatch(minibatch_size_, &num_matches_next, + while (match_iterator.GetNextBatch(minibatch_size_, row_id_last, &num_matches_next, materialize_batch_ids_buf.mutable_data(), materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data())) { @@ -1982,17 +1985,19 @@ Status JoinResidualFilter::FilterMatchBitVector( materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data(), passing_key_ids_maybe_null, /*output_payload_ids=*/false, temp_stack, &num_filtered)); + + // There may be multiple matches for a row in batch. Collect distinct row ids. + // for (int ifiltered = 0; ifiltered < num_filtered; ++ifiltered) { if (materialize_batch_ids_buf.mutable_data()[ifiltered] == row_id_last) { continue; } - passing_batch_row_ids[*num_passing_ids] = + row_id_last = passing_batch_row_ids[*num_passing_ids] = materialize_batch_ids_buf.mutable_data()[ifiltered]; if (passing_key_ids_maybe_null) { passing_key_ids_maybe_null[*num_passing_ids] = materialize_key_ids_buf.mutable_data()[ifiltered]; } - row_id_last = materialize_batch_ids_buf.mutable_data()[ifiltered]; ++(*num_passing_ids); } } @@ -2244,25 +2249,29 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, match_iterator.SetLookupResult( minibatch_size_next, minibatch_start, match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), no_duplicate_keys, hash_table_->key_to_payload()); - if (!residual_filter_->IsTrivial()) { + if (!residual_filter_->IsTrivial() && + (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER)) { std::memset(filtered_bitvector_buf.mutable_data(), 0, bit_util::BytesForBits(minibatch_size_next)); } int num_matches_next; - while (match_iterator.GetNextBatch(minibatch_size, &num_matches_next, - materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data())) { + while 
(match_iterator.GetNextBatch( + minibatch_size, std::numeric_limits::max() + 1, &num_matches_next, + materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data())) { if (!residual_filter_->IsTrivial()) { RETURN_NOT_OK(residual_filter_->FilterMatchRowIds( keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data(), /*output_payload_ids=*/true, + materialize_payload_ids_buf.mutable_data(), /*output_key_ids=*/true, !(no_duplicate_keys || no_payload_columns), temp_stack, &num_matches_next)); - for (int i = 0; i < num_matches_next; ++i) { - int bit_idx = materialize_batch_ids_buf.mutable_data()[i] - minibatch_start; - bit_util::SetBitTo(filtered_bitvector_buf.mutable_data(), bit_idx, 1); + if (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER) { + for (int i = 0; i < num_matches_next; ++i) { + int bit_idx = materialize_batch_ids_buf.mutable_data()[i] - minibatch_start; + bit_util::SetBitTo(filtered_bitvector_buf.mutable_data(), bit_idx, 1); + } } } const uint16_t* materialize_batch_ids = materialize_batch_ids_buf.mutable_data(); diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index bb382c37548..7ac80f692a9 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -714,7 +714,7 @@ class JoinMatchIterator { void SetLookupResult(int num_batch_rows, int start_batch_row, const uint8_t* batch_has_match, const uint32_t* key_ids, bool no_duplicate_keys, const uint32_t* key_to_payload); - bool GetNextBatch(int num_rows_max, int* out_num_rows, uint16_t* batch_row_ids, + bool GetNextBatch(int num_rows_max, uint32_t batch_row_id_to_skip, int* out_num_rows, uint16_t* batch_row_ids, uint32_t* key_ids, uint32_t* payload_ids); private: From 50136d8317f5a7a861c47262be2caa5077a00117 Mon Sep 17 
00:00:00 2001 From: zanmato Date: Sat, 6 Jan 2024 10:56:22 -0800 Subject: [PATCH 11/33] Fix many bugs --- cpp/src/arrow/acero/swiss_join.cc | 306 +++++++++++++--------- cpp/src/arrow/acero/swiss_join_internal.h | 62 +++-- 2 files changed, 217 insertions(+), 151 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index ddd4aebe8bd..25fe23d94cf 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1085,10 +1085,30 @@ void SwissTableForJoin::UpdateHasMatchForKeys(int64_t thread_id, int num_ids, if (num_ids == 0 || !bit_vector) { return; } + for (int ikey = 0; ikey < num_ids; ++ikey) { + // Mark row in hash table as having a match + // + uint32_t key_id = key_ids[ikey]; + uint32_t first_payload_for_key = key_to_payload() ? key_to_payload()[key_id] : key_id; + uint32_t last_payload_for_key = + key_to_payload() ? key_to_payload()[key_id + 1] - 1 : key_id; + for (uint32_t ipayload = first_payload_for_key; ipayload <= last_payload_for_key; + ++ipayload) { + bit_util::SetBit(bit_vector, ipayload); + } + } +} + +void SwissTableForJoin::UpdateHasMatchForPayloads(int64_t thread_id, int num_ids, + const uint32_t* payload_ids) { + uint8_t* bit_vector = local_has_match(thread_id); + if (num_ids == 0 || !bit_vector) { + return; + } for (int i = 0; i < num_ids; ++i) { // Mark row in hash table as having a match // - bit_util::SetBit(bit_vector, key_ids[i]); + bit_util::SetBit(bit_vector, payload_ids[i]); } } @@ -1123,29 +1143,6 @@ uint32_t SwissTableForJoin::payload_id_to_key_id(uint32_t payload_id) const { return static_cast(first_greater - entries) - 1; } -void SwissTableForJoin::payload_ids_to_key_ids(int num_rows, const uint32_t* payload_ids, - uint32_t* key_ids) const { - if (num_rows == 0) { - return; - } - if (no_duplicate_keys_) { - memcpy(key_ids, payload_ids, num_rows * sizeof(uint32_t)); - return; - } - - const uint32_t* entries = key_to_payload(); - uint32_t key_id = 
payload_id_to_key_id(payload_ids[0]); - key_ids[0] = key_id; - for (int i = 1; i < num_rows; ++i) { - ARROW_DCHECK(payload_ids[i] > payload_ids[i - 1]); - while (entries[key_id + 1] <= payload_ids[i]) { - ++key_id; - ARROW_DCHECK(key_id < num_keys()); - } - key_ids[i] = key_id; - } -} - Status SwissTableForJoinBuild::Init(SwissTableForJoin* target, int dop, int64_t num_rows, bool reject_duplicate_keys, bool no_payload, const std::vector& key_types, @@ -1789,9 +1786,9 @@ void JoinMatchIterator::SetLookupResult(int num_batch_rows, int start_batch_row, current_match_for_row_ = 0; } -bool JoinMatchIterator::GetNextBatch(int num_rows_max, uint32_t batch_row_id_to_skip, - int* out_num_rows, uint16_t* batch_row_ids, - uint32_t* key_ids, uint32_t* payload_ids) { +bool JoinMatchIterator::GetNextBatch(int num_rows_max, int* out_num_rows, + uint16_t* batch_row_ids, uint32_t* key_ids, + uint32_t* payload_ids, int row_id_to_skip) { *out_num_rows = 0; if (no_duplicate_keys_) { @@ -1817,8 +1814,7 @@ bool JoinMatchIterator::GetNextBatch(int num_rows_max, uint32_t batch_row_id_to_ // while (current_row_ < num_batch_rows_ && *out_num_rows < num_rows_max) { if (!bit_util::GetBit(batch_has_match_, current_row_) || - static_cast(start_batch_row_ + current_row_) == - batch_row_id_to_skip) { + current_row_ == row_id_to_skip) { ++current_row_; current_match_for_row_ = 0; continue; @@ -1857,8 +1853,8 @@ bool JoinMatchIterator::GetNextBatch(int num_rows_max, uint32_t batch_row_id_to_ return (*out_num_rows) > 0; } -void JoinResidualFilter::Init(Expression filter, int minibatch_size, QueryContext* ctx, - MemoryPool* pool, int64_t hardware_flags, +void JoinResidualFilter::Init(Expression filter, QueryContext* ctx, MemoryPool* pool, + int64_t hardware_flags, const HashJoinProjectionMaps* probe_schemas, const HashJoinProjectionMaps* build_schemas) { filter_ = std::move(filter); @@ -1866,7 +1862,6 @@ void JoinResidualFilter::Init(Expression filter, int minibatch_size, QueryContex return; } - 
minibatch_size_ = minibatch_size; ctx_ = ctx; pool_ = pool; hardware_flags_ = hardware_flags; @@ -1911,20 +1906,22 @@ void JoinResidualFilter::Init(Expression filter, int minibatch_size, QueryContex } } -void JoinResidualFilter::SetBuildSide(const RowArray* build_keys, +void JoinResidualFilter::SetBuildSide(int minibatch_size, const RowArray* build_keys, const RowArray* build_payloads, const uint32_t* key_to_payload) { + minibatch_size_ = minibatch_size; build_keys_ = build_keys; build_payloads_ = build_payloads; key_to_payload_ = key_to_payload; } -Status JoinResidualFilter::FilterMatchBitVector( - const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, - int bit_match, const uint8_t* match_bitvector, const uint32_t* key_ids, - bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack, - int* num_passing_ids, uint16_t* passing_batch_row_ids, - uint32_t* passing_key_ids_maybe_null) const { +Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, + int batch_start_row, int num_batch_rows, + const uint8_t* match_bitvector, + const uint32_t* key_ids, bool no_duplicate_keys, + arrow::util::TempVectorStack* temp_stack, + int* num_passing_ids, + uint16_t* passing_batch_row_ids) const { ARROW_DCHECK(filter_ != literal(true)); *num_passing_ids = 0; @@ -1936,17 +1933,9 @@ Status JoinResidualFilter::FilterMatchBitVector( // If filter refers no column in the right table, then we can directly filter on the // left rows without inner matching and materializing the right rows. // - arrow::util::bit_util::bits_to_indexes(bit_match, hardware_flags_, num_batch_rows, + arrow::util::bit_util::bits_to_indexes(1, hardware_flags_, num_batch_rows, match_bitvector, num_passing_ids, passing_batch_row_ids); - if (passing_key_ids_maybe_null) { - // Collect key ids of passing rows. 
- // - for (int i = 0; i < *num_passing_ids; ++i) { - uint16_t id = passing_batch_row_ids[i]; - passing_key_ids_maybe_null[i] = key_ids[id]; - } - } // Add base batch row index. // @@ -1954,11 +1943,9 @@ Status JoinResidualFilter::FilterMatchBitVector( passing_batch_row_ids[i] += static_cast(batch_start_row); } - RETURN_NOT_OK( - FilterMatchRowIds(keypayload_batch, *num_passing_ids, passing_batch_row_ids, - passing_key_ids_maybe_null, /*payload_ids_maybe_null=*/NULLPTR, - passing_key_ids_maybe_null, - /*output_payload_ids=*/false, temp_stack, num_passing_ids)); + RETURN_NOT_OK(FilterInner(keypayload_batch, *num_passing_ids, passing_batch_row_ids, + NULLPTR, NULLPTR, false, false, temp_stack, + num_passing_ids)); return Status::OK(); } @@ -1973,44 +1960,79 @@ Status JoinResidualFilter::FilterMatchBitVector( match_iterator.SetLookupResult(num_batch_rows, batch_start_row, match_bitvector, key_ids, no_duplicate_keys, key_to_payload_); int num_matches_next = 0; - // Last row id passing the filter, used to filter out duplicate rows. 
- uint32_t row_id_last = std::numeric_limits::max() + 1; - while (match_iterator.GetNextBatch(minibatch_size_, row_id_last, &num_matches_next, - materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data())) { + int row_id_to_skip = JoinMatchIterator::kInvalidRowId; + while (match_iterator.GetNextBatch( + minibatch_size_, &num_matches_next, materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data(), + row_id_to_skip)) { int num_filtered = 0; - RETURN_NOT_OK(FilterMatchRowIds( - keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data(), passing_key_ids_maybe_null, - /*output_payload_ids=*/false, temp_stack, &num_filtered)); - + RETURN_NOT_OK(FilterInner(keypayload_batch, num_matches_next, + materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data(), false, false, + temp_stack, &num_filtered)); // There may be multiple matches for a row in batch. Collect distinct row ids. 
// for (int ifiltered = 0; ifiltered < num_filtered; ++ifiltered) { - if (materialize_batch_ids_buf.mutable_data()[ifiltered] == row_id_last) { + if (materialize_batch_ids_buf.mutable_data()[ifiltered] == row_id_to_skip) { continue; } - row_id_last = passing_batch_row_ids[*num_passing_ids] = + row_id_to_skip = passing_batch_row_ids[*num_passing_ids] = materialize_batch_ids_buf.mutable_data()[ifiltered]; - if (passing_key_ids_maybe_null) { - passing_key_ids_maybe_null[*num_passing_ids] = - materialize_key_ids_buf.mutable_data()[ifiltered]; - } ++(*num_passing_ids); } } + + return Status::OK(); +} + +Status JoinResidualFilter::FilterRightSemi( + const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, + const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, + arrow::util::TempVectorStack* temp_stack, + OutputPayloadIdsCallback output_payload_ids) const { + ARROW_DCHECK(filter_ != literal(true)); + + if (filter_.IsNullLiteral() || filter_ == literal(false)) { + return Status::OK(); + } + + auto materialize_batch_ids_buf = + arrow::util::TempVectorHolder(temp_stack, minibatch_size_); + auto materialize_key_ids_buf = + arrow::util::TempVectorHolder(temp_stack, minibatch_size_); + auto materialize_payload_ids_buf = + arrow::util::TempVectorHolder(temp_stack, minibatch_size_); + + JoinMatchIterator match_iterator; + match_iterator.SetLookupResult(num_batch_rows, batch_start_row, match_bitvector, + key_ids, no_duplicate_keys, key_to_payload_); + int num_matches_next = 0; + while (match_iterator.GetNextBatch(minibatch_size_, &num_matches_next, + materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data())) { + int num_filtered = 0; + RETURN_NOT_OK(FilterInner( + keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data(), /*output_key_ids=*/false, + 
/*output_payload_ids=*/true, temp_stack, &num_filtered)); + // Output payload ids of passing rows. + // + output_payload_ids(num_filtered, materialize_payload_ids_buf.mutable_data()); + } + return Status::OK(); } -Status JoinResidualFilter::FilterMatchRowIds(const ExecBatch& keypayload_batch, - int num_batch_rows, uint16_t* batch_row_ids, - uint32_t* key_ids_maybe_null, - uint32_t* payload_ids_maybe_null, - bool output_key_ids, bool output_payload_ids, - arrow::util::TempVectorStack* temp_stack, - int* num_passing_rows) const { +Status JoinResidualFilter::FilterInner(const ExecBatch& keypayload_batch, + int num_batch_rows, uint16_t* batch_row_ids, + uint32_t* key_ids_maybe_null, + uint32_t* payload_ids_maybe_null, + bool output_key_ids, bool output_payload_ids, + arrow::util::TempVectorStack* temp_stack, + int* num_passing_rows) const { ARROW_DCHECK(filter_ != literal(true)); ARROW_DCHECK(!output_key_ids || key_ids_maybe_null); ARROW_DCHECK(!output_payload_ids || payload_ids_maybe_null); @@ -2052,6 +2074,7 @@ Status JoinResidualFilter::FilterMatchRowIds(const ExecBatch& keypayload_batch, ++(*num_passing_rows); } } + return Status::OK(); } @@ -2165,6 +2188,8 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto materialize_payload_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); + auto materialize_no_match_batch_ids_buf = + arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto filtered_bitvector_buf = arrow::util::TempVectorHolder( temp_stack, static_cast(bit_util::BytesForBits(minibatch_size))); @@ -2190,8 +2215,9 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI || join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { int num_passing_ids = 0; - int bit_match = (join_type_ == JoinType::LEFT_ANTI) ? 
0 : 1; + const uint16_t* materialize_batch_ids = materialize_batch_ids_buf.mutable_data(); if (residual_filter_->IsTrivial()) { + int bit_match = (join_type_ == JoinType::LEFT_ANTI) ? 0 : 1; arrow::util::bit_util::bits_to_indexes( bit_match, hardware_flags, minibatch_size_next, match_bitvector_buf.mutable_data(), &num_passing_ids, @@ -2203,6 +2229,10 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, uint16_t id = materialize_batch_ids_buf.mutable_data()[i]; materialize_key_ids_buf.mutable_data()[i] = key_ids_buf.mutable_data()[id]; } + // For right-semi, right-anti joins: update has-match flags for the rows + // in hash table. + hash_table_->UpdateHasMatchForKeys(thread_id, num_passing_ids, + materialize_key_ids_buf.mutable_data()); } else { // For left-semi, left-anti joins: add base batch row index. // @@ -2213,27 +2243,51 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, } } else { bool no_duplicate_keys = (hash_table_->key_to_payload() == nullptr); - RETURN_NOT_OK(residual_filter_->FilterMatchBitVector( - keypayload_batch, minibatch_start, minibatch_size_next, bit_match, - match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), - no_duplicate_keys, temp_stack, &num_passing_ids, - materialize_batch_ids_buf.mutable_data(), - join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI - ? materialize_key_ids_buf.mutable_data() - : NULLPTR)); + if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI) { + RETURN_NOT_OK(residual_filter_->FilterLeftSemi( + keypayload_batch, minibatch_start, minibatch_size_next, + match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), + no_duplicate_keys, temp_stack, &num_passing_ids, + materialize_batch_ids_buf.mutable_data())); + if (join_type_ == JoinType::LEFT_ANTI) { + // For left-anti join: collect no match row ids. 
+ // + int num_no_passing_ids = 0; + int imatch = 0; + for (int irow = minibatch_start; + irow < minibatch_start + static_cast(minibatch_size_next); ++irow) { + while (imatch < num_passing_ids && + materialize_batch_ids_buf.mutable_data()[imatch] < irow) { + ++imatch; + } + if (imatch == num_passing_ids || + materialize_batch_ids_buf.mutable_data()[imatch] != irow) { + materialize_no_match_batch_ids_buf.mutable_data()[num_no_passing_ids++] = + static_cast(irow); + } + } + num_passing_ids = num_no_passing_ids; + materialize_batch_ids = materialize_no_match_batch_ids_buf.mutable_data(); + } + } else { + auto update_has_match = [thread_id, this](int num_passing_ids, + const uint32_t* payload_ids) { + hash_table_->UpdateHasMatchForPayloads(thread_id, num_passing_ids, + payload_ids); + }; + RETURN_NOT_OK(residual_filter_->FilterRightSemi( + keypayload_batch, minibatch_start, minibatch_size_next, + match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), + no_duplicate_keys, temp_stack, update_has_match)); + } } - if (join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { - // For right-semi, right-anti joins: update has-match flags for the rows - // in hash table. - hash_table_->UpdateHasMatchForKeys(thread_id, num_passing_ids, - materialize_key_ids_buf.mutable_data()); - } else { + if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI) { // For left-semi, left-anti joins: call materialize using match // row ids. // RETURN_NOT_OK(materialize_[thread_id]->AppendProbeOnly( - keypayload_batch, num_passing_ids, materialize_batch_ids_buf.mutable_data(), + keypayload_batch, num_passing_ids, materialize_batch_ids, [&](ExecBatch batch) { return output_batch_fn_(thread_id, std::move(batch)); })); @@ -2244,7 +2298,6 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, // matches we use a helper class that implements enumerating all of them. 
// bool no_duplicate_keys = (hash_table_->key_to_payload() == nullptr); - bool no_payload_columns = (hash_table_->payloads() == nullptr); JoinMatchIterator match_iterator; match_iterator.SetLookupResult( minibatch_size_next, minibatch_start, match_bitvector_buf.mutable_data(), @@ -2255,18 +2308,17 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, bit_util::BytesForBits(minibatch_size_next)); } int num_matches_next; - while (match_iterator.GetNextBatch( - minibatch_size, std::numeric_limits::max() + 1, &num_matches_next, - materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data())) { + while (match_iterator.GetNextBatch(minibatch_size, &num_matches_next, + materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data())) { if (!residual_filter_->IsTrivial()) { - RETURN_NOT_OK(residual_filter_->FilterMatchRowIds( + RETURN_NOT_OK(residual_filter_->FilterInner( keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data(), /*output_key_ids=*/true, - !(no_duplicate_keys || no_payload_columns), temp_stack, &num_matches_next)); + !no_duplicate_keys, temp_stack, &num_matches_next)); if (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER) { for (int i = 0; i < num_matches_next; ++i) { int bit_idx = materialize_batch_ids_buf.mutable_data()[i] - minibatch_start; @@ -2277,16 +2329,15 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, const uint16_t* materialize_batch_ids = materialize_batch_ids_buf.mutable_data(); const uint32_t* materialize_key_ids = materialize_key_ids_buf.mutable_data(); const uint32_t* materialize_payload_ids = - no_duplicate_keys || no_payload_columns - ? materialize_key_ids_buf.mutable_data() - : materialize_payload_ids_buf.mutable_data(); + no_duplicate_keys ? 
materialize_key_ids_buf.mutable_data() + : materialize_payload_ids_buf.mutable_data(); // For right-outer, full-outer joins we need to update has-match flags // for the rows in hash table. // if (join_type_ == JoinType::RIGHT_OUTER || join_type_ == JoinType::FULL_OUTER) { - hash_table_->UpdateHasMatchForKeys(thread_id, num_matches_next, - materialize_key_ids); + hash_table_->UpdateHasMatchForPayloads(thread_id, num_matches_next, + materialize_payload_ids); } // Call materialize for resulting id tuples pointing to matching pairs @@ -2403,9 +2454,8 @@ class SwissJoin : public HashJoinImpl { materialize[i] = &local_states_[i].materialize; } - int minibatch_size = hash_table_.keys()->swiss_table()->minibatch_size(); - residual_filter_.Init(std::move(filter), minibatch_size, ctx_, pool_, hardware_flags_, - proj_map_left, proj_map_right); + residual_filter_.Init(std::move(filter), ctx_, pool_, hardware_flags_, proj_map_left, + proj_map_right); probe_processor_.Init(proj_map_left->num_cols(HashJoinProjection::KEY), join_type_, &hash_table_, &residual_filter_, materialize, &key_cmp_, @@ -2613,7 +2663,8 @@ class SwissJoin : public HashJoinImpl { } hash_table_ready_.store(true); - residual_filter_.SetBuildSide(hash_table_.keys()->keys(), hash_table_.payloads(), + residual_filter_.SetBuildSide(hash_table_.keys()->swiss_table()->minibatch_size(), + hash_table_.keys()->keys(), hash_table_.payloads(), hash_table_.key_to_payload()); return build_finished_callback_(thread_id); @@ -2678,24 +2729,25 @@ class SwissJoin : public HashJoinImpl { static_cast(mini_batch_start + mini_batch_size_next - 1)); int num_output_rows = 0; for (uint32_t key_id = first_key_id; key_id <= last_key_id; ++key_id) { - if (bit_util::GetBit(hash_table_.has_match(), key_id) == bit_to_output) { - uint32_t first_payload_for_key = - std::max(static_cast(mini_batch_start), - hash_table_.key_to_payload() ? 
hash_table_.key_to_payload()[key_id] - : key_id); - uint32_t last_payload_for_key = std::min( - static_cast(mini_batch_start + mini_batch_size_next - 1), - hash_table_.key_to_payload() ? hash_table_.key_to_payload()[key_id + 1] - 1 - : key_id); - uint32_t num_payloads_for_key = - last_payload_for_key - first_payload_for_key + 1; - for (uint32_t i = 0; i < num_payloads_for_key; ++i) { - key_ids_buf.mutable_data()[num_output_rows + i] = key_id; - payload_ids_buf.mutable_data()[num_output_rows + i] = - first_payload_for_key + i; + uint32_t first_payload_for_key = std::max( + static_cast(mini_batch_start), + hash_table_.key_to_payload() ? hash_table_.key_to_payload()[key_id] : key_id); + uint32_t last_payload_for_key = std::min( + static_cast(mini_batch_start + mini_batch_size_next - 1), + hash_table_.key_to_payload() ? hash_table_.key_to_payload()[key_id + 1] - 1 + : key_id); + uint32_t num_payloads_for_key = last_payload_for_key - first_payload_for_key + 1; + uint32_t num_payloads_match = 0; + for (uint32_t i = 0; i < num_payloads_for_key; ++i) { + uint32_t payload = first_payload_for_key + i; + if (bit_util::GetBit(hash_table_.has_match(), payload) == bit_to_output) { + key_ids_buf.mutable_data()[num_output_rows + num_payloads_match] = key_id; + payload_ids_buf.mutable_data()[num_output_rows + num_payloads_match] = + payload; + num_payloads_match++; } - num_output_rows += num_payloads_for_key; } + num_output_rows += num_payloads_match; } if (num_output_rows > 0) { diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 7ac80f692a9..5cc9aff0d62 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -367,7 +367,11 @@ class SwissTableForJoin { friend class SwissTableForJoinBuild; public: + // TODO void UpdateHasMatchForKeys(int64_t thread_id, int num_rows, const uint32_t* key_ids); + // TODO + void UpdateHasMatchForPayloads(int64_t thread_id, int num_rows, + const 
uint32_t* payload_ids); void MergeHasMatch(); const SwissTableWithKeys* keys() const { return &map_; } @@ -385,10 +389,6 @@ class SwissTableForJoin { } uint32_t payload_id_to_key_id(uint32_t payload_id) const; - // Input payload ids must form an increasing sequence. - // - void payload_ids_to_key_ids(int num_rows, const uint32_t* payload_ids, - uint32_t* key_ids) const; private: uint8_t* local_has_match(int64_t thread_id); @@ -397,8 +397,10 @@ class SwissTableForJoin { int dop_; struct ThreadLocalState { + // Bit-vector for keeping track of whether each payload in the hash table had a match std::vector has_match; }; + // Bit-vector for keeping track of whether each payload in the hash table had a match std::vector local_states_; std::vector has_match_; @@ -714,8 +716,15 @@ class JoinMatchIterator { void SetLookupResult(int num_batch_rows, int start_batch_row, const uint8_t* batch_has_match, const uint32_t* key_ids, bool no_duplicate_keys, const uint32_t* key_to_payload); - bool GetNextBatch(int num_rows_max, uint32_t batch_row_id_to_skip, int* out_num_rows, uint16_t* batch_row_ids, - uint32_t* key_ids, uint32_t* payload_ids); + // TODO: row_id_to_skip + bool GetNextBatch(int num_rows_max, int* out_num_rows, uint16_t* batch_row_ids, + uint32_t* key_ids, uint32_t* payload_ids, + int row_id_to_skip = kInvalidRowId); + + // The row id that will never exist in an ExecBatch. + // Used to indicate that there is no row to skip. 
+ // + static constexpr uint32_t kInvalidRowId = std::numeric_limits::max() + 1; private: int num_batch_rows_; @@ -738,31 +747,36 @@ class JoinMatchIterator { class JoinResidualFilter { public: - void Init(Expression filter, int minibatch_size, QueryContext* ctx, MemoryPool* pool, + void Init(Expression filter, QueryContext* ctx, MemoryPool* pool, int64_t hardware_flags, const HashJoinProjectionMaps* probe_schemas, const HashJoinProjectionMaps* build_schemas); - void SetBuildSide(const RowArray* build_keys, const RowArray* build_payloads, - const uint32_t* key_to_payload); + void SetBuildSide(int minibatch_size, const RowArray* build_keys, + const RowArray* build_payloads, const uint32_t* key_to_payload); bool IsTrivial() const { return filter_ == literal(true); } + int NumBuildKeysReferred() const { return num_build_keys_referred_; } int NumBuildPayloadsReferred() const { return num_build_payloads_referred_; } - Status FilterMatchBitVector(const ExecBatch& keypayload_batch, int batch_start_row, - int num_batch_rows, int bit_match, - const uint8_t* match_bitvector, const uint32_t* key_ids, - bool no_duplicate_keys, - arrow::util::TempVectorStack* temp_stack, - int* num_passing_ids, uint16_t* passing_batch_row_ids, - uint32_t* passing_key_ids_maybe_null) const; - - Status FilterMatchRowIds(const ExecBatch& keypayload_batch, int num_batch_rows, - uint16_t* batch_row_ids, uint32_t* key_ids_maybe_null, - uint32_t* payload_ids_maybe_null, bool output_key_ids, - bool output_payload_ids, - arrow::util::TempVectorStack* temp_stack, - int* num_passing_rows) const; + Status FilterLeftSemi(const ExecBatch& keypayload_batch, int batch_start_row, + int num_batch_rows, const uint8_t* match_bitvector, + const uint32_t* key_ids, bool no_duplicate_keys, + arrow::util::TempVectorStack* temp_stack, int* num_passing_ids, + uint16_t* passing_batch_row_ids) const; + + using OutputPayloadIdsCallback = std::function; + Status FilterRightSemi(const ExecBatch& keypayload_batch, int 
batch_start_row, + int num_batch_rows, const uint8_t* match_bitvector, + const uint32_t* key_ids, bool no_duplicate_keys, + arrow::util::TempVectorStack* temp_stack, + OutputPayloadIdsCallback output_payload_ids) const; + + Status FilterInner(const ExecBatch& keypayload_batch, int num_batch_rows, + uint16_t* batch_row_ids, uint32_t* key_ids_maybe_null, + uint32_t* payload_ids_maybe_null, bool output_key_ids, + bool output_payload_ids, arrow::util::TempVectorStack* temp_stack, + int* num_passing_rows) const; private: Result EvalFilter(const ExecBatch& keypayload_batch, int num_batch_rows, @@ -778,7 +792,6 @@ class JoinResidualFilter { private: Expression filter_; - int minibatch_size_; QueryContext* ctx_; MemoryPool* pool_; @@ -791,6 +804,7 @@ class JoinResidualFilter { int num_build_keys_referred_ = 0; int num_build_payloads_referred_ = 0; + int minibatch_size_; const RowArray* build_keys_; const RowArray* build_payloads_; const uint32_t* key_to_payload_; From 80b57654e1c15d3297c4bee2e1c314737cf171b3 Mon Sep 17 00:00:00 2001 From: zanmato Date: Sat, 6 Jan 2024 11:01:18 -0800 Subject: [PATCH 12/33] Revert cmake change --- cpp/src/arrow/acero/CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 1889f65632b..b0c39b9ff56 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -170,11 +170,7 @@ add_arrow_acero_test(plan_test add_arrow_acero_test(source_node_test SOURCES source_node_test.cc test_nodes.cc) add_arrow_acero_test(fetch_node_test SOURCES fetch_node_test.cc test_nodes.cc) add_arrow_acero_test(order_by_node_test SOURCES order_by_node_test.cc test_nodes.cc) -add_arrow_acero_test(hash_join_node_test - SOURCES - hash_join_node_test.cc - bloom_filter_test.cc - swiss_join_test.cc) +add_arrow_acero_test(hash_join_node_test SOURCES hash_join_node_test.cc bloom_filter_test.cc) add_arrow_acero_test(pivot_longer_node_test 
SOURCES pivot_longer_node_test.cc test_nodes.cc) From d29de580234f2935296b6ee596aa0aff2410a141 Mon Sep 17 00:00:00 2001 From: zanmato Date: Sat, 6 Jan 2024 11:02:09 -0800 Subject: [PATCH 13/33] Revert cmake change --- cpp/src/arrow/acero/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index b0c39b9ff56..b77d52a23ee 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -170,7 +170,8 @@ add_arrow_acero_test(plan_test add_arrow_acero_test(source_node_test SOURCES source_node_test.cc test_nodes.cc) add_arrow_acero_test(fetch_node_test SOURCES fetch_node_test.cc test_nodes.cc) add_arrow_acero_test(order_by_node_test SOURCES order_by_node_test.cc test_nodes.cc) -add_arrow_acero_test(hash_join_node_test SOURCES hash_join_node_test.cc bloom_filter_test.cc) +add_arrow_acero_test(hash_join_node_test SOURCES hash_join_node_test.cc + bloom_filter_test.cc) add_arrow_acero_test(pivot_longer_node_test SOURCES pivot_longer_node_test.cc test_nodes.cc) From 1d868522fde3ce10026b8ff854df483f43a6f17e Mon Sep 17 00:00:00 2001 From: zanmato Date: Sat, 6 Jan 2024 11:03:21 -0800 Subject: [PATCH 14/33] Remove file --- cpp/src/arrow/acero/swiss_join_test.cc | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 cpp/src/arrow/acero/swiss_join_test.cc diff --git a/cpp/src/arrow/acero/swiss_join_test.cc b/cpp/src/arrow/acero/swiss_join_test.cc deleted file mode 100644 index e51af2d1594..00000000000 --- a/cpp/src/arrow/acero/swiss_join_test.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -namespace arrow { -namespace acero { - -TEST(SwissJoin, ResidualFilter) {} - -} // namespace acero -} // namespace arrow \ No newline at end of file From c58d12596e25705815f89c5b5561ae24feed8634 Mon Sep 17 00:00:00 2001 From: zanmato Date: Sat, 6 Jan 2024 16:41:25 -0800 Subject: [PATCH 15/33] Add some comments --- cpp/src/arrow/acero/swiss_join.cc | 20 +++++++++++--------- cpp/src/arrow/acero/swiss_join_internal.h | 13 ++++++++++--- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 25fe23d94cf..6f6c72de01a 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1086,7 +1086,7 @@ void SwissTableForJoin::UpdateHasMatchForKeys(int64_t thread_id, int num_ids, return; } for (int ikey = 0; ikey < num_ids; ++ikey) { - // Mark row in hash table as having a match + // Mark all payloads corresponding to this key in hash table as having a match // uint32_t key_id = key_ids[ikey]; uint32_t first_payload_for_key = key_to_payload() ? 
key_to_payload()[key_id] : key_id; @@ -1106,7 +1106,7 @@ void SwissTableForJoin::UpdateHasMatchForPayloads(int64_t thread_id, int num_ids return; } for (int i = 0; i < num_ids; ++i) { - // Mark row in hash table as having a match + // Mark payload in hash table as having a match // bit_util::SetBit(bit_vector, payload_ids[i]); } @@ -1944,8 +1944,10 @@ Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, } RETURN_NOT_OK(FilterInner(keypayload_batch, *num_passing_ids, passing_batch_row_ids, - NULLPTR, NULLPTR, false, false, temp_stack, - num_passing_ids)); + /*payload_ids_maybe_null=*/NULLPTR, + /*payload_ids_maybe_null=*/NULLPTR, + /*output_payload_ids=*/false, /*output_payload_ids=*/false, + temp_stack, num_passing_ids)); return Status::OK(); } @@ -1966,11 +1968,11 @@ Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data(), row_id_to_skip)) { int num_filtered = 0; - RETURN_NOT_OK(FilterInner(keypayload_batch, num_matches_next, - materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data(), false, false, - temp_stack, &num_filtered)); + RETURN_NOT_OK(FilterInner( + keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data(), /*output_key_ids=*/false, + /*output_payload_ids=*/false, temp_stack, &num_filtered)); // There may be multiple matches for a row in batch. Collect distinct row ids. 
// for (int ifiltered = 0; ifiltered < num_filtered; ++ifiltered) { diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 5cc9aff0d62..40b2c5a9867 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -367,9 +367,11 @@ class SwissTableForJoin { friend class SwissTableForJoinBuild; public: - // TODO + // Mark all payloads corresponding to the given keys as having a match + // void UpdateHasMatchForKeys(int64_t thread_id, int num_rows, const uint32_t* key_ids); - // TODO + // Mark the given payloads as having a match + // void UpdateHasMatchForPayloads(int64_t thread_id, int num_rows, const uint32_t* payload_ids); void MergeHasMatch(); @@ -716,7 +718,12 @@ class JoinMatchIterator { void SetLookupResult(int num_batch_rows, int start_batch_row, const uint8_t* batch_has_match, const uint32_t* key_ids, bool no_duplicate_keys, const uint32_t* key_to_payload); - // TODO: row_id_to_skip + // Get the next batch of matching rows by outputting the batch row ids, key ids and + // payload ids. If row_id_to_skip is not kInvalidRowId, then the row with that id + // will be skipped. This is useful for left-anti and left-semi joins, where we can + // safely skip the remaining matches of a row that already had a match in the + // previous batch.
+ // bool GetNextBatch(int num_rows_max, int* out_num_rows, uint16_t* batch_row_ids, uint32_t* key_ids, uint32_t* payload_ids, int row_id_to_skip = kInvalidRowId); From dda56e3380428d5f86271f1c909172d7fe07cc1a Mon Sep 17 00:00:00 2001 From: zanmato Date: Sun, 7 Jan 2024 01:53:52 -0800 Subject: [PATCH 16/33] Refine structure --- cpp/src/arrow/acero/swiss_join.cc | 379 +++++++++++++--------- cpp/src/arrow/acero/swiss_join_internal.h | 45 ++- 2 files changed, 260 insertions(+), 164 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 6f6c72de01a..4a0ff7e67f3 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1853,6 +1853,23 @@ bool JoinMatchIterator::GetNextBatch(int num_rows_max, int* out_num_rows, return (*out_num_rows) > 0; } +namespace { + +void CollectPassingBatchIds(int passing_bit, int64_t hardware_flags, int batch_start_row, + int num_batch_rows, const uint8_t* match_bitvector, + int* num_passing_ids, uint16_t* passing_batch_row_ids) { + arrow::util::bit_util::bits_to_indexes(passing_bit, hardware_flags, num_batch_rows, + match_bitvector, num_passing_ids, + passing_batch_row_ids); + // Add base batch row index. 
+ // + for (int i = 0; i < *num_passing_ids; ++i) { + passing_batch_row_ids[i] += static_cast(batch_start_row); + } +} + +} // namespace + void JoinResidualFilter::Init(Expression filter, QueryContext* ctx, MemoryPool* pool, int64_t hardware_flags, const HashJoinProjectionMaps* probe_schemas, @@ -1922,7 +1939,11 @@ Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, arrow::util::TempVectorStack* temp_stack, int* num_passing_ids, uint16_t* passing_batch_row_ids) const { - ARROW_DCHECK(filter_ != literal(true)); + if (filter_ == literal(true)) { + CollectPassingBatchIds(1, hardware_flags_, batch_start_row, num_batch_rows, + match_bitvector, num_passing_ids, passing_batch_row_ids); + return Status::OK(); + } *num_passing_ids = 0; if (filter_.IsNullLiteral() || filter_ == literal(false)) { @@ -1933,29 +1954,22 @@ Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, // If filter refers no column in the right table, then we can directly filter on the // left rows without inner matching and materializing the right rows. // - arrow::util::bit_util::bits_to_indexes(1, hardware_flags_, num_batch_rows, - match_bitvector, num_passing_ids, - passing_batch_row_ids); - - // Add base batch row index. 
- // - for (int i = 0; i < *num_passing_ids; ++i) { - passing_batch_row_ids[i] += static_cast(batch_start_row); - } - - RETURN_NOT_OK(FilterInner(keypayload_batch, *num_passing_ids, passing_batch_row_ids, - /*payload_ids_maybe_null=*/NULLPTR, - /*payload_ids_maybe_null=*/NULLPTR, - /*output_payload_ids=*/false, /*output_payload_ids=*/false, - temp_stack, num_passing_ids)); + CollectPassingBatchIds(1, hardware_flags_, batch_start_row, num_batch_rows, + match_bitvector, num_passing_ids, passing_batch_row_ids); + RETURN_NOT_OK( + FilterOneBatch(keypayload_batch, *num_passing_ids, passing_batch_row_ids, + /*key_ids_maybe_null=*/NULLPTR, + /*payload_ids_maybe_null=*/NULLPTR, + /*output_key_ids=*/false, + /*output_payload_ids=*/false, temp_stack, num_passing_ids)); return Status::OK(); } - auto materialize_batch_ids_buf = + auto match_batch_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); - auto materialize_key_ids_buf = + auto match_key_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); - auto materialize_payload_ids_buf = + auto match_payload_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); JoinMatchIterator match_iterator; @@ -1964,23 +1978,62 @@ Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, int num_matches_next = 0; int row_id_to_skip = JoinMatchIterator::kInvalidRowId; while (match_iterator.GetNextBatch( - minibatch_size_, &num_matches_next, materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data(), + minibatch_size_, &num_matches_next, match_batch_ids_buf.mutable_data(), + match_key_ids_buf.mutable_data(), match_payload_ids_buf.mutable_data(), row_id_to_skip)) { int num_filtered = 0; - RETURN_NOT_OK(FilterInner( - keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data(),
/*output_key_ids=*/false, + RETURN_NOT_OK(FilterOneBatch( + keypayload_batch, num_matches_next, match_batch_ids_buf.mutable_data(), + match_key_ids_buf.mutable_data(), match_payload_ids_buf.mutable_data(), + /*output_key_ids=*/false, /*output_payload_ids=*/false, temp_stack, &num_filtered)); // There may be multiple matches for a row in batch. Collect distinct row ids. // for (int ifiltered = 0; ifiltered < num_filtered; ++ifiltered) { - if (materialize_batch_ids_buf.mutable_data()[ifiltered] == row_id_to_skip) { + if (match_batch_ids_buf.mutable_data()[ifiltered] == row_id_to_skip) { continue; } row_id_to_skip = passing_batch_row_ids[*num_passing_ids] = - materialize_batch_ids_buf.mutable_data()[ifiltered]; + match_batch_ids_buf.mutable_data()[ifiltered]; + ++(*num_passing_ids); + } + } + + return Status::OK(); +} + +Status JoinResidualFilter::FilterLeftAnti(const ExecBatch& keypayload_batch, + int batch_start_row, int num_batch_rows, + const uint8_t* match_bitvector, + const uint32_t* key_ids, bool no_duplicate_keys, + arrow::util::TempVectorStack* temp_stack, + int* num_passing_ids, + uint16_t* passing_batch_row_ids) const { + if (filter_ == literal(true)) { + CollectPassingBatchIds(0, hardware_flags_, batch_start_row, num_batch_rows, + match_bitvector, num_passing_ids, passing_batch_row_ids); + return Status::OK(); + } + + *num_passing_ids = 0; + int num_matching_ids = 0; + auto matching_batch_row_ids = + arrow::util::TempVectorHolder(temp_stack, num_batch_rows); + RETURN_NOT_OK(FilterLeftSemi(keypayload_batch, batch_start_row, num_batch_rows, + match_bitvector, key_ids, no_duplicate_keys, temp_stack, + &num_matching_ids, matching_batch_row_ids.mutable_data())); + + // Collect no match row ids. 
+ // + int imatch = 0; + for (int irow = batch_start_row; irow < batch_start_row + num_batch_rows; ++irow) { + while (imatch < num_matching_ids && + matching_batch_row_ids.mutable_data()[imatch] < irow) { + ++imatch; + } + if (imatch == num_matching_ids || + matching_batch_row_ids.mutable_data()[imatch] != irow) { + passing_batch_row_ids[*num_passing_ids] = static_cast(irow); ++(*num_passing_ids); } } @@ -1991,59 +2044,90 @@ Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, Status JoinResidualFilter::FilterRightSemi( const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, - arrow::util::TempVectorStack* temp_stack, - OutputPayloadIdsCallback output_payload_ids) const { - ARROW_DCHECK(filter_ != literal(true)); + arrow::util::TempVectorStack* temp_stack, OnMatchBatch on_match_batch) const { + ARROW_DCHECK(on_match_batch); if (filter_.IsNullLiteral() || filter_ == literal(false)) { return Status::OK(); } - auto materialize_batch_ids_buf = + int num_matching_ids = 0; + if (filter_ == literal(true)) { + auto match_relative_batch_ids_buf = + arrow::util::TempVectorHolder(temp_stack, num_batch_rows); + auto match_key_ids_buf = + arrow::util::TempVectorHolder(temp_stack, num_batch_rows); + + arrow::util::bit_util::bits_to_indexes(1, hardware_flags_, num_batch_rows, + match_bitvector, &num_matching_ids, + match_relative_batch_ids_buf.mutable_data()); + // Collect key ids of passing rows. 
+ // + for (int i = 0; i < num_matching_ids; ++i) { + uint16_t id = match_relative_batch_ids_buf.mutable_data()[i]; + match_key_ids_buf.mutable_data()[i] = key_ids[id]; + } + + on_match_batch(num_matching_ids, /*batch_row_ids=*/NULLPTR, + match_key_ids_buf.mutable_data(), + /*payload_ids=*/NULLPTR); + return Status::OK(); + } + + auto match_batch_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); - auto materialize_key_ids_buf = + auto match_key_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); - auto materialize_payload_ids_buf = + auto match_payload_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); JoinMatchIterator match_iterator; match_iterator.SetLookupResult(num_batch_rows, batch_start_row, match_bitvector, key_ids, no_duplicate_keys, key_to_payload_); - int num_matches_next = 0; - while (match_iterator.GetNextBatch(minibatch_size_, &num_matches_next, - materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data())) { + while (match_iterator.GetNextBatch( + minibatch_size_, &num_matching_ids, match_batch_ids_buf.mutable_data(), + match_key_ids_buf.mutable_data(), match_payload_ids_buf.mutable_data())) { int num_filtered = 0; - RETURN_NOT_OK(FilterInner( - keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data(), /*output_key_ids=*/false, - /*output_payload_ids=*/true, temp_stack, &num_filtered)); - // Output payload ids of passing rows. 
- // - output_payload_ids(num_filtered, materialize_payload_ids_buf.mutable_data()); + RETURN_NOT_OK(FilterOneBatch( + keypayload_batch, num_matching_ids, match_batch_ids_buf.mutable_data(), + match_key_ids_buf.mutable_data(), match_payload_ids_buf.mutable_data(), + /*output_key_ids=*/false, + /*output_payload_ids=*/true, temp_stack, &num_filtered, on_match_batch)); } return Status::OK(); } -Status JoinResidualFilter::FilterInner(const ExecBatch& keypayload_batch, - int num_batch_rows, uint16_t* batch_row_ids, - uint32_t* key_ids_maybe_null, - uint32_t* payload_ids_maybe_null, - bool output_key_ids, bool output_payload_ids, - arrow::util::TempVectorStack* temp_stack, - int* num_passing_rows) const { - ARROW_DCHECK(filter_ != literal(true)); - ARROW_DCHECK(!output_key_ids || key_ids_maybe_null); - ARROW_DCHECK(!output_payload_ids || payload_ids_maybe_null); +Status JoinResidualFilter::FilterInner( + const ExecBatch& keypayload_batch, int num_batch_rows, uint16_t* batch_row_ids, + uint32_t* key_ids, uint32_t* payload_ids_maybe_null, bool output_payload_ids, + arrow::util::TempVectorStack* temp_stack, int* num_passing_rows) const { + if (filter_ == literal(true)) { + *num_passing_rows = num_batch_rows; + return Status::OK(); + } *num_passing_rows = 0; if (filter_.IsNullLiteral() || filter_ == literal(false)) { return Status::OK(); } + return FilterOneBatch( + keypayload_batch, num_batch_rows, batch_row_ids, key_ids, payload_ids_maybe_null, + /*output_key_ids=*/true, output_payload_ids, temp_stack, num_passing_rows); +} + +Status JoinResidualFilter::FilterOneBatch( + const ExecBatch& keypayload_batch, int num_batch_rows, uint16_t* batch_row_ids, + uint32_t* key_ids_maybe_null, uint32_t* payload_ids_maybe_null, bool output_key_ids, + bool output_payload_ids, arrow::util::TempVectorStack* temp_stack, + int* num_passing_rows, OnMatchBatch on_match_batch) const { + ARROW_DCHECK(!filter_.IsNullLiteral() && filter_ != literal(true) && + filter_ != literal(false)); + 
ARROW_DCHECK(!output_key_ids || key_ids_maybe_null); + ARROW_DCHECK(!output_payload_ids || payload_ids_maybe_null); + + *num_passing_rows = 0; ARROW_ASSIGN_OR_RAISE(Datum mask, EvalFilter(keypayload_batch, num_batch_rows, batch_row_ids, key_ids_maybe_null, payload_ids_maybe_null)); @@ -2051,10 +2135,12 @@ Status JoinResidualFilter::FilterInner(const ExecBatch& keypayload_batch, const auto& mask_scalar = mask.scalar_as(); if (mask_scalar.is_valid && mask_scalar.value) { *num_passing_rows = num_batch_rows; - return Status::OK(); - } else { - return Status::OK(); } + if (on_match_batch) { + on_match_batch(*num_passing_rows, batch_row_ids, key_ids_maybe_null, + payload_ids_maybe_null); + } + return Status::OK(); } ARROW_DCHECK_EQ(mask.array()->offset, 0); @@ -2077,6 +2163,11 @@ Status JoinResidualFilter::FilterInner(const ExecBatch& keypayload_batch, } } + if (on_match_batch) { + on_match_batch(*num_passing_rows, batch_row_ids, key_ids_maybe_null, + payload_ids_maybe_null); + } + return Status::OK(); } @@ -2165,6 +2256,7 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, const ExecBatch& keypayload_batch, arrow::util::TempVectorStack* temp_stack, std::vector* temp_column_arrays) { + bool no_duplicate_keys = (hash_table_->key_to_payload() == nullptr); const SwissTable* swiss_table = hash_table_->keys()->swiss_table(); int64_t hardware_flags = swiss_table->hardware_flags(); int minibatch_size = swiss_table->minibatch_size(); @@ -2190,8 +2282,6 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto materialize_payload_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); - auto materialize_no_match_batch_ids_buf = - arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto filtered_bitvector_buf = arrow::util::TempVectorHolder( temp_stack, static_cast(bit_util::BytesForBits(minibatch_size))); @@ -2217,71 +2307,29 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, 
if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI || join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { int num_passing_ids = 0; - const uint16_t* materialize_batch_ids = materialize_batch_ids_buf.mutable_data(); - if (residual_filter_->IsTrivial()) { - int bit_match = (join_type_ == JoinType::LEFT_ANTI) ? 0 : 1; - arrow::util::bit_util::bits_to_indexes( - bit_match, hardware_flags, minibatch_size_next, - match_bitvector_buf.mutable_data(), &num_passing_ids, - materialize_batch_ids_buf.mutable_data()); - if (join_type_ == JoinType::RIGHT_SEMI || join_type_ == JoinType::RIGHT_ANTI) { - // For right-semi, right-anti joins: collect key ids of passing rows. - // - for (int i = 0; i < num_passing_ids; ++i) { - uint16_t id = materialize_batch_ids_buf.mutable_data()[i]; - materialize_key_ids_buf.mutable_data()[i] = key_ids_buf.mutable_data()[id]; - } - // For right-semi, right-anti joins: update has-match flags for the rows - // in hash table. - hash_table_->UpdateHasMatchForKeys(thread_id, num_passing_ids, - materialize_key_ids_buf.mutable_data()); - } else { - // For left-semi, left-anti joins: add base batch row index. 
- // - for (int i = 0; i < num_passing_ids; ++i) { - materialize_batch_ids_buf.mutable_data()[i] += - static_cast(minibatch_start); - } - } + if (join_type_ == JoinType::LEFT_SEMI) { + RETURN_NOT_OK(residual_filter_->FilterLeftSemi( + keypayload_batch, minibatch_start, minibatch_size_next, + match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), + no_duplicate_keys, temp_stack, &num_passing_ids, + materialize_batch_ids_buf.mutable_data())); + } else if (join_type_ == JoinType::LEFT_ANTI) { + RETURN_NOT_OK(residual_filter_->FilterLeftAnti( + keypayload_batch, minibatch_start, minibatch_size_next, + match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), + no_duplicate_keys, temp_stack, &num_passing_ids, + materialize_batch_ids_buf.mutable_data())); } else { - bool no_duplicate_keys = (hash_table_->key_to_payload() == nullptr); - if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI) { - RETURN_NOT_OK(residual_filter_->FilterLeftSemi( - keypayload_batch, minibatch_start, minibatch_size_next, - match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), - no_duplicate_keys, temp_stack, &num_passing_ids, - materialize_batch_ids_buf.mutable_data())); - if (join_type_ == JoinType::LEFT_ANTI) { - // For left-anti join: collect no match row ids. 
- // - int num_no_passing_ids = 0; - int imatch = 0; - for (int irow = minibatch_start; - irow < minibatch_start + static_cast(minibatch_size_next); ++irow) { - while (imatch < num_passing_ids && - materialize_batch_ids_buf.mutable_data()[imatch] < irow) { - ++imatch; - } - if (imatch == num_passing_ids || - materialize_batch_ids_buf.mutable_data()[imatch] != irow) { - materialize_no_match_batch_ids_buf.mutable_data()[num_no_passing_ids++] = - static_cast(irow); - } - } - num_passing_ids = num_no_passing_ids; - materialize_batch_ids = materialize_no_match_batch_ids_buf.mutable_data(); - } - } else { - auto update_has_match = [thread_id, this](int num_passing_ids, - const uint32_t* payload_ids) { - hash_table_->UpdateHasMatchForPayloads(thread_id, num_passing_ids, - payload_ids); - }; - RETURN_NOT_OK(residual_filter_->FilterRightSemi( - keypayload_batch, minibatch_start, minibatch_size_next, - match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), - no_duplicate_keys, temp_stack, update_has_match)); - } + RETURN_NOT_OK(residual_filter_->FilterRightSemi( + keypayload_batch, minibatch_start, minibatch_size_next, + match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), + no_duplicate_keys, temp_stack, + [thread_id, this](int num_passing_ids, const uint16_t*, + const uint32_t* key_ids_maybe_null, + const uint32_t* payload_ids_maybe_null) { + UpdateHasMatch(thread_id, num_passing_ids, key_ids_maybe_null, + payload_ids_maybe_null); + })); } if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI) { @@ -2289,7 +2337,7 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, // row ids. 
// RETURN_NOT_OK(materialize_[thread_id]->AppendProbeOnly( - keypayload_batch, num_passing_ids, materialize_batch_ids, + keypayload_batch, num_passing_ids, materialize_batch_ids_buf.mutable_data(), [&](ExecBatch batch) { return output_batch_fn_(thread_id, std::move(batch)); })); @@ -2299,47 +2347,49 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, // Since every hash table lookup for an input row might have multiple // matches we use a helper class that implements enumerating all of them. // - bool no_duplicate_keys = (hash_table_->key_to_payload() == nullptr); JoinMatchIterator match_iterator; match_iterator.SetLookupResult( minibatch_size_next, minibatch_start, match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), no_duplicate_keys, hash_table_->key_to_payload()); - if (!residual_filter_->IsTrivial() && - (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER)) { + int num_matches_next; + bool use_filtered_bitvector = + residual_filter_->NeedToUpdateMatchBitVector(join_type_); + // For filtered result, initialize match bit-vector to all zeros (no match). 
+ // + if (use_filtered_bitvector) { std::memset(filtered_bitvector_buf.mutable_data(), 0, bit_util::BytesForBits(minibatch_size_next)); } - int num_matches_next; while (match_iterator.GetNextBatch(minibatch_size, &num_matches_next, materialize_batch_ids_buf.mutable_data(), materialize_key_ids_buf.mutable_data(), materialize_payload_ids_buf.mutable_data())) { - if (!residual_filter_->IsTrivial()) { - RETURN_NOT_OK(residual_filter_->FilterInner( - keypayload_batch, num_matches_next, - materialize_batch_ids_buf.mutable_data(), - materialize_key_ids_buf.mutable_data(), - materialize_payload_ids_buf.mutable_data(), /*output_key_ids=*/true, - !no_duplicate_keys, temp_stack, &num_matches_next)); - if (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER) { - for (int i = 0; i < num_matches_next; ++i) { - int bit_idx = materialize_batch_ids_buf.mutable_data()[i] - minibatch_start; - bit_util::SetBitTo(filtered_bitvector_buf.mutable_data(), bit_idx, 1); - } - } - } + RETURN_NOT_OK(residual_filter_->FilterInner( + keypayload_batch, num_matches_next, materialize_batch_ids_buf.mutable_data(), + materialize_key_ids_buf.mutable_data(), + materialize_payload_ids_buf.mutable_data(), !no_duplicate_keys, temp_stack, + &num_matches_next)); + const uint16_t* materialize_batch_ids = materialize_batch_ids_buf.mutable_data(); const uint32_t* materialize_key_ids = materialize_key_ids_buf.mutable_data(); const uint32_t* materialize_payload_ids = no_duplicate_keys ? materialize_key_ids_buf.mutable_data() : materialize_payload_ids_buf.mutable_data(); + // For filtered result, update match bit-vector. + // + if (use_filtered_bitvector) { + UpdateMatchBitVector(minibatch_start, num_matches_next, + filtered_bitvector_buf.mutable_data(), num_matches_next, + materialize_batch_ids); + } + // For right-outer, full-outer joins we need to update has-match flags // for the rows in hash table. 
// if (join_type_ == JoinType::RIGHT_OUTER || join_type_ == JoinType::FULL_OUTER) { - hash_table_->UpdateHasMatchForPayloads(thread_id, num_matches_next, - materialize_payload_ids); + UpdateHasMatch(thread_id, num_matches_next, /*key_ids_maybe_null=*/NULLPTR, + materialize_payload_ids); } // Call materialize for resulting id tuples pointing to matching pairs @@ -2359,20 +2409,12 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, // if (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER) { int num_passing_ids = 0; - const uint8_t* match_bitvector = residual_filter_->IsTrivial() - ? match_bitvector_buf.mutable_data() - : filtered_bitvector_buf.mutable_data(); - arrow::util::bit_util::bits_to_indexes( - /*bit_to_search=*/0, hardware_flags, minibatch_size_next, match_bitvector, + CollectPassingBatchIds( + 0, hardware_flags, minibatch_start, minibatch_size_next, + use_filtered_bitvector ? filtered_bitvector_buf.mutable_data() + : match_bitvector_buf.mutable_data(), &num_passing_ids, materialize_batch_ids_buf.mutable_data()); - // Add base batch row index. 
- // - for (int i = 0; i < num_passing_ids; ++i) { - materialize_batch_ids_buf.mutable_data()[i] += - static_cast(minibatch_start); - } - RETURN_NOT_OK(materialize_[thread_id]->AppendProbeOnly( keypayload_batch, num_passing_ids, materialize_batch_ids_buf.mutable_data(), [&](ExecBatch batch) { @@ -2400,6 +2442,27 @@ Status JoinProbeProcessor::OnFinished() { return Status::OK(); } +void JoinProbeProcessor::UpdateHasMatch(int64_t thread_id, int num_rows, + const uint32_t* key_ids_maybe_null, + const uint32_t* payload_ids_maybe_null) { + ARROW_DCHECK(key_ids_maybe_null || payload_ids_maybe_null); + if (payload_ids_maybe_null) { + hash_table_->UpdateHasMatchForPayloads(thread_id, num_rows, payload_ids_maybe_null); + } else { + hash_table_->UpdateHasMatchForKeys(thread_id, num_rows, key_ids_maybe_null); + } +} + +void JoinProbeProcessor::UpdateMatchBitVector(int batch_start_row, int num_batch_rows, + uint8_t* match_bitvector, + int num_passing_rows, + const uint16_t* batch_ids) { + for (int i = 0; i < num_passing_rows; ++i) { + int bit_idx = batch_ids[i] - batch_start_row; + bit_util::SetBitTo(match_bitvector, bit_idx, 1); + } +} + class SwissJoin : public HashJoinImpl { public: Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 40b2c5a9867..f1da3c8b1a2 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -761,31 +761,49 @@ class JoinResidualFilter { void SetBuildSide(int minibatch_size, const RowArray* build_keys, const RowArray* build_payloads, const uint32_t* key_to_payload); - bool IsTrivial() const { return filter_ == literal(true); } + bool NeedToUpdateMatchBitVector(JoinType join_type) const { + return (join_type == JoinType::LEFT_OUTER || join_type == JoinType::FULL_OUTER) && + filter_ != literal(true); + } int NumBuildKeysReferred() const { return num_build_keys_referred_; } int 
NumBuildPayloadsReferred() const { return num_build_payloads_referred_; } + using OnMatchBatch = + std::function; + Status FilterLeftSemi(const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack, int* num_passing_ids, uint16_t* passing_batch_row_ids) const; - using OutputPayloadIdsCallback = std::function; + Status FilterLeftAnti(const ExecBatch& keypayload_batch, int batch_start_row, + int num_batch_rows, const uint8_t* match_bitvector, + const uint32_t* key_ids, bool no_duplicate_keys, + arrow::util::TempVectorStack* temp_stack, int* num_passing_ids, + uint16_t* passing_batch_row_ids) const; + Status FilterRightSemi(const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack, - OutputPayloadIdsCallback output_payload_ids) const; + OnMatchBatch on_match_batch) const; Status FilterInner(const ExecBatch& keypayload_batch, int num_batch_rows, - uint16_t* batch_row_ids, uint32_t* key_ids_maybe_null, - uint32_t* payload_ids_maybe_null, bool output_key_ids, - bool output_payload_ids, arrow::util::TempVectorStack* temp_stack, + uint16_t* batch_row_ids, uint32_t* key_ids, + uint32_t* payload_ids_maybe_null, bool output_payload_ids, + arrow::util::TempVectorStack* temp_stack, int* num_passing_rows) const; private: + Status FilterOneBatch(const ExecBatch& keypayload_batch, int num_batch_rows, + uint16_t* batch_row_ids, uint32_t* key_ids_maybe_null, + uint32_t* payload_ids_maybe_null, bool output_key_ids, + bool output_payload_ids, arrow::util::TempVectorStack* temp_stack, + int* num_passing_rows, OnMatchBatch on_match_batch = {}) const; + Result EvalFilter(const ExecBatch& keypayload_batch, int num_batch_rows, const uint16_t* batch_row_ids, const uint32_t* key_ids_maybe_null, @@ -837,6 +855,21 @@ 
class JoinProbeProcessor { // Status OnFinished(); + private: + // For right-* and full-outer joins: we need to update has-match flags + // for the rows in hash table. + // + void UpdateHasMatch(int64_t thread_id, int num_passing_ids, + const uint32_t* key_ids_maybe_null, + const uint32_t* payload_ids_maybe_null); + + // For left-outer and full-outer joins: we need to update match bit-vector if + // the residual filter is not a literal true. + // + void UpdateMatchBitVector(int batch_start_row, int num_batch_rows, + uint8_t* match_bitvector, int num_passing_rows, + const uint16_t* batch_ids); + private: int num_key_columns_; JoinType join_type_; From d000621f05419f08944dd20c327dad8dd6134db5 Mon Sep 17 00:00:00 2001 From: zanmato Date: Sun, 7 Jan 2024 16:41:53 -0800 Subject: [PATCH 17/33] Refine structure and add docs --- cpp/src/arrow/acero/swiss_join.cc | 238 ++++++++++------------ cpp/src/arrow/acero/swiss_join_internal.h | 105 ++++++---- 2 files changed, 180 insertions(+), 163 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 4a0ff7e67f3..8a3a3b9156e 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1086,7 +1086,7 @@ void SwissTableForJoin::UpdateHasMatchForKeys(int64_t thread_id, int num_ids, return; } for (int ikey = 0; ikey < num_ids; ++ikey) { - // Mark all payloads corresponding to this key in hash table as having a match + // Mark payloads corresponding to this key in hash table as having a match. // uint32_t key_id = key_ids[ikey]; uint32_t first_payload_for_key = key_to_payload() ? key_to_payload()[key_id] : key_id; @@ -1106,7 +1106,7 @@ void SwissTableForJoin::UpdateHasMatchForPayloads(int64_t thread_id, int num_ids return; } for (int i = 0; i < num_ids; ++i) { - // Mark payload in hash table as having a match + // Mark payload in hash table as having a match. 
// bit_util::SetBit(bit_vector, payload_ids[i]); } @@ -1855,6 +1855,10 @@ bool JoinMatchIterator::GetNextBatch(int num_rows_max, int* out_num_rows, namespace { +// Given match_bitvector identifies that there is a match for row[batch_start_row + i] in +// given input batch if bit match_bitvector[i] == passing_bit. Collect all the passing row +// ids according to the given match_bitvector. +// void CollectPassingBatchIds(int passing_bit, int64_t hardware_flags, int batch_start_row, int num_batch_rows, const uint8_t* match_bitvector, int* num_passing_ids, uint16_t* passing_batch_row_ids) { @@ -1873,17 +1877,15 @@ void CollectPassingBatchIds(int passing_bit, int64_t hardware_flags, int batch_s void JoinResidualFilter::Init(Expression filter, QueryContext* ctx, MemoryPool* pool, int64_t hardware_flags, const HashJoinProjectionMaps* probe_schemas, - const HashJoinProjectionMaps* build_schemas) { + const HashJoinProjectionMaps* build_schemas, + SwissTableForJoin* hash_table) { filter_ = std::move(filter); - if (filter_ == literal(true)) { - return; - } - ctx_ = ctx; pool_ = pool; hardware_flags_ = hardware_flags; probe_schemas_ = probe_schemas; build_schemas_ = build_schemas; + hash_table_ = hash_table; { probe_filter_to_key_and_payload_.resize( @@ -1923,13 +1925,25 @@ void JoinResidualFilter::Init(Expression filter, QueryContext* ctx, MemoryPool* } } -void JoinResidualFilter::SetBuildSide(int minibatch_size, const RowArray* build_keys, - const RowArray* build_payloads, - const uint32_t* key_to_payload) { - minibatch_size_ = minibatch_size; - build_keys_ = build_keys; - build_payloads_ = build_payloads; - key_to_payload_ = key_to_payload; +void JoinResidualFilter::OnBuildFinished() { + minibatch_size_ = hash_table_->keys()->swiss_table()->minibatch_size(); + build_keys_ = hash_table_->keys()->keys(); + build_payloads_ = hash_table_->payloads(); + key_to_payload_ = hash_table_->key_to_payload(); +} + +void JoinResidualFilter::InitFilterBitVector(int num_batch_rows, + 
uint8_t* filter_bitvector) { + std::memset(filter_bitvector, 0, bit_util::BytesForBits(num_batch_rows)); +} + +void JoinResidualFilter::UpdateFilterBitVector(int batch_start_row, int num_batch_rows, + const uint16_t* batch_row_ids, + uint8_t* filter_bitvector) { + for (int i = 0; i < num_batch_rows; ++i) { + int bit_idx = batch_row_ids[i] - batch_start_row; + bit_util::SetBitTo(filter_bitvector, bit_idx, 1); + } } Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, @@ -1956,45 +1970,50 @@ Status JoinResidualFilter::FilterLeftSemi(const ExecBatch& keypayload_batch, // CollectPassingBatchIds(1, hardware_flags_, batch_start_row, num_batch_rows, match_bitvector, num_passing_ids, passing_batch_row_ids); - RETURN_NOT_OK( - FilterOneBatch(keypayload_batch, *num_passing_ids, passing_batch_row_ids, - /*payload_ids_maybe_null=*/NULLPTR, - /*payload_ids_maybe_null=*/NULLPTR, - /*output_payload_ids=*/false, - /*output_payload_ids=*/false, temp_stack, num_passing_ids)); - return Status::OK(); + return FilterOneBatch(keypayload_batch, *num_passing_ids, passing_batch_row_ids, + /*key_ids_maybe_null=*/NULLPTR, + /*payload_ids_maybe_null=*/NULLPTR, + /*output_key_ids=*/false, + /*output_payload_ids=*/false, temp_stack, num_passing_ids); } - auto match_batch_ids_buf = + auto match_batch_row_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); auto match_key_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); auto match_payload_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); + // Inner matching is necessary for non-trivial filter. Only after evaluating the filter for + // all matches of the same row can we be sure that it's not passing (it could pass + // earlier though). 
+ // JoinMatchIterator match_iterator; match_iterator.SetLookupResult(num_batch_rows, batch_start_row, match_bitvector, key_ids, no_duplicate_keys, key_to_payload_); int num_matches_next = 0; - int row_id_to_skip = JoinMatchIterator::kInvalidRowId; - while (match_iterator.GetNextBatch( - minibatch_size_, &num_matches_next, match_batch_ids_buf.mutable_data(), - match_key_ids_buf.mutable_data(), match_payload_ids_buf.mutable_data(), - row_id_to_skip)) { - int num_filtered = 0; + // Used to not only collect distinct row ids, but also skip unnecessary matches in the + // next batch. + // + int row_id_last = JoinMatchIterator::kInvalidRowId; + while (match_iterator.GetNextBatch(minibatch_size_, &num_matches_next, + match_batch_row_ids_buf.mutable_data(), + match_key_ids_buf.mutable_data(), + match_payload_ids_buf.mutable_data(), row_id_last)) { + int num_passing = 0; RETURN_NOT_OK(FilterOneBatch( - keypayload_batch, num_matches_next, match_batch_ids_buf.mutable_data(), + keypayload_batch, num_matches_next, match_batch_row_ids_buf.mutable_data(), match_key_ids_buf.mutable_data(), match_payload_ids_buf.mutable_data(), /*output_key_ids=*/false, - /*output_payload_ids=*/false, temp_stack, &num_filtered)); - // There may be multiple matches for a row in batch. Collect distinct row ids. + /*output_payload_ids=*/false, temp_stack, &num_passing)); + // There may be multiple passing matches for a row in batch. Collect distinct row ids. 
// - for (int ifiltered = 0; ifiltered < num_filtered; ++ifiltered) { - if (match_batch_ids_buf.mutable_data()[ifiltered] == row_id_to_skip) { + for (int ipassing = 0; ipassing < num_passing; ++ipassing) { + if (match_batch_row_ids_buf.mutable_data()[ipassing] == row_id_last) { continue; } - row_id_to_skip = passing_batch_row_ids[*num_passing_ids] = - match_batch_ids_buf.mutable_data()[ifiltered]; + row_id_last = passing_batch_row_ids[*num_passing_ids] = + match_batch_row_ids_buf.mutable_data()[ipassing]; ++(*num_passing_ids); } } @@ -2015,24 +2034,27 @@ Status JoinResidualFilter::FilterLeftAnti(const ExecBatch& keypayload_batch, return Status::OK(); } + // Do FilterLeftSemi first. + // *num_passing_ids = 0; - int num_matching_ids = 0; - auto matching_batch_row_ids = + int num_semi_passing_ids = 0; + auto semi_passing_batch_row_ids = arrow::util::TempVectorHolder(temp_stack, num_batch_rows); RETURN_NOT_OK(FilterLeftSemi(keypayload_batch, batch_start_row, num_batch_rows, match_bitvector, key_ids, no_duplicate_keys, temp_stack, - &num_matching_ids, matching_batch_row_ids.mutable_data())); + &num_semi_passing_ids, + semi_passing_batch_row_ids.mutable_data())); - // Collect no match row ids. + // Then collect non-passing row ids of FilterLeftSemi. 
// - int imatch = 0; + int isemi = 0; for (int irow = batch_start_row; irow < batch_start_row + num_batch_rows; ++irow) { - while (imatch < num_matching_ids && - matching_batch_row_ids.mutable_data()[imatch] < irow) { - ++imatch; + while (isemi < num_semi_passing_ids && + semi_passing_batch_row_ids.mutable_data()[isemi] < irow) { + ++isemi; } - if (imatch == num_matching_ids || - matching_batch_row_ids.mutable_data()[imatch] != irow) { + if (isemi == num_semi_passing_ids || + semi_passing_batch_row_ids.mutable_data()[isemi] != irow) { passing_batch_row_ids[*num_passing_ids] = static_cast(irow); ++(*num_passing_ids); } @@ -2041,12 +2063,10 @@ Status JoinResidualFilter::FilterLeftAnti(const ExecBatch& keypayload_batch, return Status::OK(); } -Status JoinResidualFilter::FilterRightSemi( - const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, - const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, - arrow::util::TempVectorStack* temp_stack, OnMatchBatch on_match_batch) const { - ARROW_DCHECK(on_match_batch); - +Status JoinResidualFilter::FilterRightSemiAnti( + int64_t thread_id, const ExecBatch& keypayload_batch, int batch_start_row, + int num_batch_rows, const uint8_t* match_bitvector, const uint32_t* key_ids, + bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack) const { if (filter_.IsNullLiteral() || filter_ == literal(false)) { return Status::OK(); } @@ -2068,31 +2088,35 @@ Status JoinResidualFilter::FilterRightSemi( match_key_ids_buf.mutable_data()[i] = key_ids[id]; } - on_match_batch(num_matching_ids, /*batch_row_ids=*/NULLPTR, - match_key_ids_buf.mutable_data(), - /*payload_ids=*/NULLPTR); + hash_table_->UpdateHasMatchForKeys(thread_id, num_matching_ids, + match_key_ids_buf.mutable_data()); return Status::OK(); } - auto match_batch_ids_buf = + auto match_batch_row_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); auto match_key_ids_buf = arrow::util::TempVectorHolder(temp_stack, 
minibatch_size_); auto match_payload_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size_); + // Inner matching is necessary for non-trivial filter. Because even for the same row + // with same matching key, the filter results could vary for different payloads. + // JoinMatchIterator match_iterator; match_iterator.SetLookupResult(num_batch_rows, batch_start_row, match_bitvector, key_ids, no_duplicate_keys, key_to_payload_); while (match_iterator.GetNextBatch( - minibatch_size_, &num_matching_ids, match_batch_ids_buf.mutable_data(), + minibatch_size_, &num_matching_ids, match_batch_row_ids_buf.mutable_data(), match_key_ids_buf.mutable_data(), match_payload_ids_buf.mutable_data())) { int num_filtered = 0; RETURN_NOT_OK(FilterOneBatch( - keypayload_batch, num_matching_ids, match_batch_ids_buf.mutable_data(), + keypayload_batch, num_matching_ids, match_batch_row_ids_buf.mutable_data(), match_key_ids_buf.mutable_data(), match_payload_ids_buf.mutable_data(), /*output_key_ids=*/false, - /*output_payload_ids=*/true, temp_stack, &num_filtered, on_match_batch)); + /*output_payload_ids=*/true, temp_stack, &num_filtered)); + hash_table_->UpdateHasMatchForPayloads(thread_id, num_filtered, + match_payload_ids_buf.mutable_data()); } return Status::OK(); @@ -2117,11 +2141,14 @@ Status JoinResidualFilter::FilterInner( /*output_key_ids=*/true, output_payload_ids, temp_stack, num_passing_rows); } -Status JoinResidualFilter::FilterOneBatch( - const ExecBatch& keypayload_batch, int num_batch_rows, uint16_t* batch_row_ids, - uint32_t* key_ids_maybe_null, uint32_t* payload_ids_maybe_null, bool output_key_ids, - bool output_payload_ids, arrow::util::TempVectorStack* temp_stack, - int* num_passing_rows, OnMatchBatch on_match_batch) const { +Status JoinResidualFilter::FilterOneBatch(const ExecBatch& keypayload_batch, + int num_batch_rows, uint16_t* batch_row_ids, + uint32_t* key_ids_maybe_null, + uint32_t* payload_ids_maybe_null, + bool output_key_ids, bool 
output_payload_ids, + arrow::util::TempVectorStack* temp_stack, + int* num_passing_rows) const { + // Caller must do shortcuts for trivial filter. ARROW_DCHECK(!filter_.IsNullLiteral() && filter_ != literal(true) && filter_ != literal(false)); ARROW_DCHECK(!output_key_ids || key_ids_maybe_null); @@ -2136,10 +2163,6 @@ Status JoinResidualFilter::FilterOneBatch( if (mask_scalar.is_valid && mask_scalar.value) { *num_passing_rows = num_batch_rows; } - if (on_match_batch) { - on_match_batch(*num_passing_rows, batch_row_ids, key_ids_maybe_null, - payload_ids_maybe_null); - } return Status::OK(); } @@ -2163,11 +2186,6 @@ Status JoinResidualFilter::FilterOneBatch( } } - if (on_match_batch) { - on_match_batch(*num_passing_rows, batch_row_ids, key_ids_maybe_null, - payload_ids_maybe_null); - } - return Status::OK(); } @@ -2282,7 +2300,7 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, arrow::util::TempVectorHolder(temp_stack, minibatch_size); auto materialize_payload_ids_buf = arrow::util::TempVectorHolder(temp_stack, minibatch_size); - auto filtered_bitvector_buf = arrow::util::TempVectorHolder( + auto filter_bitvector_buf = arrow::util::TempVectorHolder( temp_stack, static_cast(bit_util::BytesForBits(minibatch_size))); for (int minibatch_start = 0; minibatch_start < num_rows;) { @@ -2320,16 +2338,10 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, no_duplicate_keys, temp_stack, &num_passing_ids, materialize_batch_ids_buf.mutable_data())); } else { - RETURN_NOT_OK(residual_filter_->FilterRightSemi( - keypayload_batch, minibatch_start, minibatch_size_next, + RETURN_NOT_OK(residual_filter_->FilterRightSemiAnti( + thread_id, keypayload_batch, minibatch_start, minibatch_size_next, match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), - no_duplicate_keys, temp_stack, - [thread_id, this](int num_passing_ids, const uint16_t*, - const uint32_t* key_ids_maybe_null, - const uint32_t* payload_ids_maybe_null) { - UpdateHasMatch(thread_id, 
num_passing_ids, key_ids_maybe_null, - payload_ids_maybe_null); - })); + no_duplicate_keys, temp_stack)); } if (join_type_ == JoinType::LEFT_SEMI || join_type_ == JoinType::LEFT_ANTI) { @@ -2352,13 +2364,10 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, minibatch_size_next, minibatch_start, match_bitvector_buf.mutable_data(), key_ids_buf.mutable_data(), no_duplicate_keys, hash_table_->key_to_payload()); int num_matches_next; - bool use_filtered_bitvector = - residual_filter_->NeedToUpdateMatchBitVector(join_type_); - // For filtered result, initialize match bit-vector to all zeros (no match). - // - if (use_filtered_bitvector) { - std::memset(filtered_bitvector_buf.mutable_data(), 0, - bit_util::BytesForBits(minibatch_size_next)); + bool use_filter_bitvector = residual_filter_->NeedFilterBitVector(join_type_); + if (use_filter_bitvector) { + residual_filter_->InitFilterBitVector(minibatch_size_next, + filter_bitvector_buf.mutable_data()); } while (match_iterator.GetNextBatch(minibatch_size, &num_matches_next, materialize_batch_ids_buf.mutable_data(), @@ -2376,20 +2385,20 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, no_duplicate_keys ? materialize_key_ids_buf.mutable_data() : materialize_payload_ids_buf.mutable_data(); - // For filtered result, update match bit-vector. + // For filtered result, update filter bit-vector. // - if (use_filtered_bitvector) { - UpdateMatchBitVector(minibatch_start, num_matches_next, - filtered_bitvector_buf.mutable_data(), num_matches_next, - materialize_batch_ids); + if (use_filter_bitvector) { + residual_filter_->UpdateFilterBitVector(minibatch_start, num_matches_next, + materialize_batch_ids, + filter_bitvector_buf.mutable_data()); } // For right-outer, full-outer joins we need to update has-match flags // for the rows in hash table. 
// if (join_type_ == JoinType::RIGHT_OUTER || join_type_ == JoinType::FULL_OUTER) { - UpdateHasMatch(thread_id, num_matches_next, /*key_ids_maybe_null=*/NULLPTR, - materialize_payload_ids); + hash_table_->UpdateHasMatchForPayloads(thread_id, num_matches_next, + materialize_payload_ids); } // Call materialize for resulting id tuples pointing to matching pairs @@ -2409,11 +2418,11 @@ Status JoinProbeProcessor::OnNextBatch(int64_t thread_id, // if (join_type_ == JoinType::LEFT_OUTER || join_type_ == JoinType::FULL_OUTER) { int num_passing_ids = 0; - CollectPassingBatchIds( - 0, hardware_flags, minibatch_start, minibatch_size_next, - use_filtered_bitvector ? filtered_bitvector_buf.mutable_data() - : match_bitvector_buf.mutable_data(), - &num_passing_ids, materialize_batch_ids_buf.mutable_data()); + CollectPassingBatchIds(0, hardware_flags, minibatch_start, minibatch_size_next, + use_filter_bitvector ? filter_bitvector_buf.mutable_data() + : match_bitvector_buf.mutable_data(), + &num_passing_ids, + materialize_batch_ids_buf.mutable_data()); RETURN_NOT_OK(materialize_[thread_id]->AppendProbeOnly( keypayload_batch, num_passing_ids, materialize_batch_ids_buf.mutable_data(), @@ -2442,27 +2451,6 @@ Status JoinProbeProcessor::OnFinished() { return Status::OK(); } -void JoinProbeProcessor::UpdateHasMatch(int64_t thread_id, int num_rows, - const uint32_t* key_ids_maybe_null, - const uint32_t* payload_ids_maybe_null) { - ARROW_DCHECK(key_ids_maybe_null || payload_ids_maybe_null); - if (payload_ids_maybe_null) { - hash_table_->UpdateHasMatchForPayloads(thread_id, num_rows, payload_ids_maybe_null); - } else { - hash_table_->UpdateHasMatchForKeys(thread_id, num_rows, key_ids_maybe_null); - } -} - -void JoinProbeProcessor::UpdateMatchBitVector(int batch_start_row, int num_batch_rows, - uint8_t* match_bitvector, - int num_passing_rows, - const uint16_t* batch_ids) { - for (int i = 0; i < num_passing_rows; ++i) { - int bit_idx = batch_ids[i] - batch_start_row; - 
bit_util::SetBitTo(match_bitvector, bit_idx, 1); - } -} - class SwissJoin : public HashJoinImpl { public: Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, @@ -2520,7 +2508,7 @@ class SwissJoin : public HashJoinImpl { } residual_filter_.Init(std::move(filter), ctx_, pool_, hardware_flags_, proj_map_left, - proj_map_right); + proj_map_right, &hash_table_); probe_processor_.Init(proj_map_left->num_cols(HashJoinProjection::KEY), join_type_, &hash_table_, &residual_filter_, materialize, &key_cmp_, @@ -2728,9 +2716,7 @@ class SwissJoin : public HashJoinImpl { } hash_table_ready_.store(true); - residual_filter_.SetBuildSide(hash_table_.keys()->swiss_table()->minibatch_size(), - hash_table_.keys()->keys(), hash_table_.payloads(), - hash_table_.key_to_payload()); + residual_filter_.OnBuildFinished(); return build_finished_callback_(thread_id); } diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index f1da3c8b1a2..1c305d3c162 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -367,10 +367,10 @@ class SwissTableForJoin { friend class SwissTableForJoinBuild; public: - // Update all payloads corresponding to the given keys as having a match + // Update all payloads corresponding to the given keys as having a match. // void UpdateHasMatchForKeys(int64_t thread_id, int num_rows, const uint32_t* key_ids); - // Update the given payloads as having a match + // Update the given payloads as having a match. // void UpdateHasMatchForPayloads(int64_t thread_id, int num_rows, const uint32_t* payload_ids); @@ -399,10 +399,10 @@ class SwissTableForJoin { int dop_; struct ThreadLocalState { - // Bit-vector for keeping track of whether each payload in the hash table had a match + // Bit-vector for keeping track of whether each payload in the hash table had a match. 
std::vector has_match; }; - // Bit-vector for keeping track of whether each payload in the hash table had a match + // Bit-vector for keeping track of whether each payload in the hash table had a match. std::vector local_states_; std::vector has_match_; @@ -728,8 +728,8 @@ class JoinMatchIterator { uint32_t* key_ids, uint32_t* payload_ids, int row_id_to_skip = kInvalidRowId); - // The row id that will never exist in an ExecBatch. - // Used to indicate that there is no row to skip. + // The row id that will never exist in an ExecBatch. Used to indicate that there is no + // row to skip. // static constexpr uint32_t kInvalidRowId = std::numeric_limits::max() + 1; @@ -752,45 +752,86 @@ class JoinMatchIterator { int current_match_for_row_; }; +// Implement the residual filter support used when processing the probe side exec batches. +// There are four filtering patterns, each with a corresponding public FilterXXX method: +// - LeftSemi and LeftAnti, each for its co-naming join type, opposite to each other. +// - RightSemiAnti for both right-semi and right-anti joins: they have the same filtering +// logic and differ only in the scanning phase. +// - Inner for inner joins and the inner part of outer joins: caller should take care of +// filtering the outer part. +// All the public Filter* methods have zero-cost shortcut for trivial filter. 
+// class JoinResidualFilter { public: void Init(Expression filter, QueryContext* ctx, MemoryPool* pool, int64_t hardware_flags, const HashJoinProjectionMaps* probe_schemas, - const HashJoinProjectionMaps* build_schemas); + const HashJoinProjectionMaps* build_schemas, SwissTableForJoin* hash_table); + + void OnBuildFinished(); - void SetBuildSide(int minibatch_size, const RowArray* build_keys, - const RowArray* build_payloads, const uint32_t* key_to_payload); + int NumBuildKeysReferred() const { return num_build_keys_referred_; } + int NumBuildPayloadsReferred() const { return num_build_payloads_referred_; } - bool NeedToUpdateMatchBitVector(JoinType join_type) const { + // Left-outer and full-outer joins can result in a different bit-vector than the one of + // probing the hash table if the residual filter is not a literal true. If so, caller + // should set up a bit-vector for filtering properly and call `UpdateFilterBitVector` + // accordingly. + // + bool NeedFilterBitVector(JoinType join_type) const { return (join_type == JoinType::LEFT_OUTER || join_type == JoinType::FULL_OUTER) && filter_ != literal(true); } - int NumBuildKeysReferred() const { return num_build_keys_referred_; } - int NumBuildPayloadsReferred() const { return num_build_payloads_referred_; } + // Init the bit-vector for filtering. Caller should make sure the bit-vector has enough + // size for a particular probe side batch. + // + void InitFilterBitVector(int num_batch_rows, uint8_t* filter_bitvector); - using OnMatchBatch = - std::function; + // Update the bit-vector for filtering according to the given batch row ids. + // + void UpdateFilterBitVector(int batch_start_row, int num_batch_rows, + const uint16_t* batch_row_ids, uint8_t* filter_bitvector); + // Left row is passing if filter evaluates true. Output all the passing row ids in + // the input batch. Like the left-semi join semantic, each passing row is output only + // once. + // Zero-overhead shortcut guarantee for trivial filter. 
+ // Status FilterLeftSemi(const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack, int* num_passing_ids, uint16_t* passing_batch_row_ids) const; + // Logically the opposite of FilterLeftSemi. Output all the passing row ids in the input + // batch. Like the left-anti join semantic, each passing row is output only once. + // Zero-overhead shortcut guarantee for trivial filter. + // Status FilterLeftAnti(const ExecBatch& keypayload_batch, int batch_start_row, int num_batch_rows, const uint8_t* match_bitvector, const uint32_t* key_ids, bool no_duplicate_keys, arrow::util::TempVectorStack* temp_stack, int* num_passing_ids, uint16_t* passing_batch_row_ids) const; - Status FilterRightSemi(const ExecBatch& keypayload_batch, int batch_start_row, - int num_batch_rows, const uint8_t* match_bitvector, - const uint32_t* key_ids, bool no_duplicate_keys, - arrow::util::TempVectorStack* temp_stack, - OnMatchBatch on_match_batch) const; - + // Right row is passing if filter evaluates true. Mark a match for all the passing + // payload ids in the hash table. This applies for both right-semi and right-anti joins: + // they differ in scanning phase. + // Zero-overhead shortcut guarantee for trivial filter. + // + Status FilterRightSemiAnti(int64_t thread_id, const ExecBatch& keypayload_batch, + int batch_start_row, int num_batch_rows, + const uint8_t* match_bitvector, const uint32_t* key_ids, + bool no_duplicate_keys, + arrow::util::TempVectorStack* temp_stack) const; + + // For a given batch of an inner match (an inner-join or the inner part of an + // outer-join), row is passing if filter evaluates true. Does not do any outer filtering + // because this method is usually called within an inner match loop, which doesn't have + // the full scope of outer join. This requires caller to handle the outer part properly. 
+ // All batch_row_ids, key_ids and payload_ids_maybe_null are input and output, this is + // for efficient shortcut. + // Zero-overhead shortcut guarantee for trivial filter. + // Status FilterInner(const ExecBatch& keypayload_batch, int num_batch_rows, uint16_t* batch_row_ids, uint32_t* key_ids, uint32_t* payload_ids_maybe_null, bool output_payload_ids, @@ -798,11 +839,15 @@ class JoinResidualFilter { int* num_passing_rows) const; private: + // Evaluates the filter for a given batch of matching rows, and outputs the passing + // rows. Always introduces overhead of materialization and evaluation, so caller must do + // shortcut properly for trivial filters. + // Status FilterOneBatch(const ExecBatch& keypayload_batch, int num_batch_rows, uint16_t* batch_row_ids, uint32_t* key_ids_maybe_null, uint32_t* payload_ids_maybe_null, bool output_key_ids, bool output_payload_ids, arrow::util::TempVectorStack* temp_stack, - int* num_passing_rows, OnMatchBatch on_match_batch = {}) const; + int* num_passing_rows) const; Result EvalFilter(const ExecBatch& keypayload_batch, int num_batch_rows, const uint16_t* batch_row_ids, @@ -825,6 +870,7 @@ class JoinResidualFilter { const HashJoinProjectionMaps* probe_schemas_; const HashJoinProjectionMaps* build_schemas_; + SwissTableForJoin* hash_table_; std::vector probe_filter_to_key_and_payload_; int num_build_keys_referred_ = 0; int num_build_payloads_referred_ = 0; @@ -855,21 +901,6 @@ class JoinProbeProcessor { // Status OnFinished(); - private: - // For right-* and full-outer joins: we need to update has-match flags - // for the rows in hash table. - // - void UpdateHasMatch(int64_t thread_id, int num_passing_ids, - const uint32_t* key_ids_maybe_null, - const uint32_t* payload_ids_maybe_null); - - // For left-outer and full-outer joins: we need to update match bit-vector if - // the residual filter is not a literal true. 
- // - void UpdateMatchBitVector(int batch_start_row, int num_batch_rows, - uint8_t* match_bitvector, int num_passing_rows, - const uint16_t* batch_ids); - private: int num_key_columns_; JoinType join_type_; From 595f96f95cd8d9a78587aeb14da4757d3c0132fe Mon Sep 17 00:00:00 2001 From: zanmato Date: Thu, 11 Jan 2024 21:02:08 +0800 Subject: [PATCH 18/33] WIP --- cpp/src/arrow/acero/hash_join_node_test.cc | 289 ++++++++++++++++----- 1 file changed, 224 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 58551f4eca0..87e06eeb4c6 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -1893,58 +1893,125 @@ TEST(HashJoin, CheckHashJoinNodeOptionsValidation) { } } -TEST(HashJoin, ResidualFilter) { - for (bool parallel : {false, true}) { - SCOPED_TRACE(parallel ? "parallel/merged" : "serial"); +class ResidualFilterCaseRunner { + public: + ResidualFilterCaseRunner(BatchesWithSchema left_input, BatchesWithSchema right_input) + : left_input_(std::move(left_input)), right_input_(std::move(right_input)) {} + + void Run(JoinType join_type, const std::vector& left_keys, + const std::vector& right_keys, Expression filter, + const std::vector& expected) const { + RunInternal(HashJoinNodeOptions{join_type, std::move(left_keys), + std::move(right_keys), std::move(filter)}, + expected); + } + + void Run(JoinType join_type, std::vector left_keys, + const std::vector right_keys, std::vector left_output, + const std::vector right_output, Expression filter, + const std::vector& expected) const { + RunInternal(HashJoinNodeOptions{join_type, std::move(left_keys), + std::move(right_keys), std::move(left_output), + std::move(right_output), std::move(filter)}, + expected); + } + + private: + void RunInternal(const HashJoinNodeOptions& options, + const std::vector& expected) const { + auto join_type_str = ToString(options.join_type); + auto join_cond_str = + 
JoinConditionString(options.left_keys, options.right_keys, options.filter); + auto output_str = OutputString(options.left_output, options.right_output); + for (bool parallel : {false, true}) { + auto parallel_str = parallel ? "parallel" : "serial"; + SCOPED_TRACE(join_type_str + " " + join_cond_str + " " + output_str + " " + + parallel_str); + + Declaration left{"source", + SourceNodeOptions{left_input_.schema, + left_input_.gen(parallel, /*slow=*/false)}}; + Declaration right{"source", + SourceNodeOptions{right_input_.schema, + right_input_.gen(parallel, /*slow=*/false)}}; + + Declaration join{"hashjoin", {std::move(left), std::move(right)}, options}; + + ASSERT_OK_AND_ASSIGN(auto result, + DeclarationToExecBatches(std::move(join), parallel)); + AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, expected); + } + } + + private: + BatchesWithSchema left_input_; + BatchesWithSchema right_input_; + + private: + static std::string JoinConditionString(const std::vector& left_keys, + const std::vector& right_keys, + const Expression& filter) { + ARROW_DCHECK(left_keys.size() > 0); + ARROW_DCHECK(left_keys.size() == right_keys.size()); + std::stringstream ss; + ss << "on ("; + for (size_t i = 0; i < left_keys.size(); ++i) { + ss << left_keys[i].ToString() << " = " << right_keys[i].ToString() << " and "; + } + ss << filter.ToString(); + ss << ")"; + return ss.str(); + } + + static std::string OutputString(const std::vector& left_output, + const std::vector& right_output) { + std::vector both_output; + std::copy(left_output.begin(), left_output.end(), std::back_inserter(both_output)); + std::copy(right_output.begin(), right_output.end(), std::back_inserter(both_output)); + std::stringstream ss; + ss << "output ("; + for (size_t i = 0; i < both_output.size(); ++i) { + if (i != 0) { + ss << ", "; + } + ss << left_output[i].ToString(); + } + ss << ")"; + return ss.str(); + } +}; - BatchesWithSchema input_left; - input_left.batches = {ExecBatchFromJSON({int32(), 
int32(), utf8()}, R"([ +TEST(HashJoin, ResidualFilter) { + BatchesWithSchema input_left; + input_left.batches = {ExecBatchFromJSON({int32(), int32(), utf8()}, R"([ [1, 6, "alpha"], [2, 5, "beta"], [3, 4, "alpha"] ])")}; - input_left.schema = - schema({field("l1", int32()), field("l2", int32()), field("l_str", utf8())}); + input_left.schema = + schema({field("l1", int32()), field("l2", int32()), field("l_str", utf8())}); - BatchesWithSchema input_right; - input_right.batches = {ExecBatchFromJSON({int32(), int32(), utf8()}, R"([ + BatchesWithSchema input_right; + input_right.batches = {ExecBatchFromJSON({int32(), int32(), utf8()}, R"([ [5, 11, "alpha"], [2, 12, "beta"], [4, 16, "alpha"] ])")}; - input_right.schema = - schema({field("r1", int32()), field("r2", int32()), field("r_str", utf8())}); - - Declaration left{ - "source", - SourceNodeOptions{input_left.schema, input_left.gen(parallel, /*slow=*/false)}}; - Declaration right{ - "source", - SourceNodeOptions{input_right.schema, input_right.gen(parallel, /*slow=*/false)}}; - - Expression mul = call("multiply", {field_ref("l1"), field_ref("l2")}); - Expression combination = call("add", {mul, field_ref("r1")}); - Expression residual_filter = less_equal(combination, field_ref("r2")); - - HashJoinNodeOptions join_opts{ - JoinType::FULL_OUTER, - /*left_keys=*/{"l_str"}, - /*right_keys=*/{"r_str"}, std::move(residual_filter), "l_", "r_"}; + input_right.schema = + schema({field("r1", int32()), field("r2", int32()), field("r_str", utf8())}); - Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + const ResidualFilterCaseRunner runner{std::move(input_left), std::move(input_right)}; - ASSERT_OK_AND_ASSIGN(auto result, - DeclarationToExecBatches(std::move(join), parallel)); + Expression mul = call("multiply", {field_ref("l1"), field_ref("l2")}); + Expression combination = call("add", {mul, field_ref("r1")}); + Expression filter = less_equal(combination, field_ref("r2")); - std::vector expected = { - 
ExecBatchFromJSON({int32(), int32(), utf8(), int32(), int32(), utf8()}, R"([ + runner.Run(JoinType::FULL_OUTER, {"l_str"}, {"r_str"}, std::move(filter), + {ExecBatchFromJSON({int32(), int32(), utf8(), int32(), int32(), utf8()}, R"([ [1, 6, "alpha", 4, 16, "alpha"], [1, 6, "alpha", 5, 11, "alpha"], [2, 5, "beta", 2, 12, "beta"], - [3, 4, "alpha", 4, 16, "alpha"]])")}; - - AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, expected); - } + [3, 4, "alpha", 4, 16, "alpha"]])")}); } TEST(HashJoin, TrivialResidualFilter) { @@ -1959,47 +2026,139 @@ TEST(HashJoin, TrivialResidualFilter) { std::vector expected_strings = {expected_true, expected_false}; std::vector filters = {always_true, always_false}; - for (size_t test_id = 0; test_id < 2; test_id++) { - for (bool parallel : {false, true}) { - SCOPED_TRACE(parallel ? "parallel/merged" : "serial"); - - BatchesWithSchema input_left; - input_left.batches = {ExecBatchFromJSON({int32(), utf8()}, R"([ + BatchesWithSchema input_left; + input_left.batches = {ExecBatchFromJSON({int32(), utf8()}, R"([ [1, "alpha"] ])")}; - input_left.schema = schema({field("l1", int32()), field("l_str", utf8())}); + input_left.schema = schema({field("l1", int32()), field("l_str", utf8())}); - BatchesWithSchema input_right; - input_right.batches = {ExecBatchFromJSON({int32(), utf8()}, R"([ + BatchesWithSchema input_right; + input_right.batches = {ExecBatchFromJSON({int32(), utf8()}, R"([ [1, "alpha"] ])")}; - input_right.schema = schema({field("r1", int32()), field("r_str", utf8())}); + input_right.schema = schema({field("r1", int32()), field("r_str", utf8())}); - auto exec_ctx = std::make_unique( - default_memory_pool(), - parallel ? 
arrow::internal::GetCpuThreadPool() : nullptr); + ResidualFilterCaseRunner runner{std::move(input_left), std::move(input_right)}; - Declaration left{ - "source", - SourceNodeOptions{input_left.schema, input_left.gen(parallel, /*slow=*/false)}}; - Declaration right{"source", - SourceNodeOptions{input_right.schema, - input_right.gen(parallel, /*slow=*/false)}}; + for (size_t test_id = 0; test_id < 2; test_id++) { + runner.Run(JoinType::INNER, {"l_str"}, {"r_str"}, filters[test_id], + {ExecBatchFromJSON({int32(), utf8(), int32(), utf8()}, + expected_strings[test_id])}); + } +} - HashJoinNodeOptions join_opts{ - JoinType::INNER, - /*left_keys=*/{"l_str"}, - /*right_keys=*/{"r_str"}, filters[test_id], "l_", "r_"}; +TEST(HashJoin, FineGrainedResidualFilter) { + struct JoinSchema { + std::shared_ptr left, right; - Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + struct Projector { + std::shared_ptr left, right; + std::vector left_output, right_output; - ASSERT_OK_AND_ASSIGN(auto result, - DeclarationToExecBatches(std::move(join), parallel)); + std::vector LeftOutput() const { + std::vector output; + for (int i : left_output) { + output.push_back(FieldRef(i)); + }; + return output; + } - std::vector expected = {ExecBatchFromJSON( - {int32(), utf8(), int32(), utf8()}, expected_strings[test_id])}; + std::vector RightOutput() const { + std::vector output; + for (int i : right_output) { + output.push_back(FieldRef(i)); + }; + return output; + } - AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, expected); + ExecBatch Project(const ExecBatch& batch) const { + std::vector values; + for (int i : left_output) { + values.push_back(batch[i]); + } + for (int i : right_output) { + values.push_back(batch[left_output.size() + i]); + } + return {std::move(values), batch.length}; + } + }; + + Projector GetProjector(std::vector left_output, std::vector right_output) { + return Projector{left, right, std::move(left_output), 
std::move(right_output)}; + } + }; + + BatchesWithSchema left; + left.batches = {ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["left_only", null, "payload"], + ["left_only", 0, "payload"], + ["left_only", 42, "payload"], + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])")}; + left.schema = schema( + {field("l_key", utf8()), field("l_filter", int32()), field("l_payload", utf8())}); + + BatchesWithSchema right; + right.batches = {ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"], + ["right_only", null, "payload"], + ["right_only", 0, "payload"], + ["right_only", 42, "payload"]])")}; + right.schema = schema( + {field("r_key", utf8()), field("r_filter", int32()), field("r_payload", utf8())}); + + const ResidualFilterCaseRunner runner{std::move(left), std::move(right)}; + JoinSchema join_schema{left.schema, right.schema}; + std::vector projectors{ + join_schema.GetProjector({0, 1, 2}, {0, 1, 2}), // Output all. + join_schema.GetProjector({0, 1}, {0, 1}), // Output key columns only. + join_schema.GetProjector({0, 1, 2}, {0, 1}), // Output left payload only. + join_schema.GetProjector({0, 1}, {0, 1, 2}), // Output right payload only. + join_schema.GetProjector({0, 1, 2}, {0, 1, 2})}; // Output all.}; + + { + // Literal true. + Expression filter = literal(true); + { + // Inner join. 
+ auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, + R"([ + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + { + // for (const auto& projector : projectors) { + // runner.Run(JoinType::INNER, {"l_key", "l_filter"}, {"r_key", "r_filter"}, + // projector.LeftOutput(), projector.RightOutput(), filter, + // {projector.Project(expected)}); + // } + // Output all. + // Output all. + runner.Run(JoinType::INNER, {"l_key", "l_filter"}, {"r_key", "r_filter"}, filter, + {ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, + R"([ + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])")}); + } } } } From a687bb0a9365f83ff0a693b067fe8c7aca4ce38f Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 00:27:26 +0800 Subject: [PATCH 19/33] Some tests --- cpp/src/arrow/acero/hash_join_node_test.cc | 314 +++++++++++++++------ 1 file changed, 230 insertions(+), 84 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 87e06eeb4c6..8fb7ec0d887 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -1939,7 +1939,7 @@ class ResidualFilterCaseRunner { ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(join), parallel)); - AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, expected); + AssertExecBatchesEqualIgnoringOrder(result.schema, expected, result.batches); } } @@ -1974,7 +1974,7 @@ class ResidualFilterCaseRunner { if (i != 0) { ss << ", "; } - ss << left_output[i].ToString(); + ss << both_output[i].ToString(); } ss << ")"; return ss.str(); @@ -1984,19 +1984,17 @@ 
class ResidualFilterCaseRunner { TEST(HashJoin, ResidualFilter) { BatchesWithSchema input_left; input_left.batches = {ExecBatchFromJSON({int32(), int32(), utf8()}, R"([ - [1, 6, "alpha"], - [2, 5, "beta"], - [3, 4, "alpha"] - ])")}; + [1, 6, "alpha"], + [2, 5, "beta"], + [3, 4, "alpha"]])")}; input_left.schema = schema({field("l1", int32()), field("l2", int32()), field("l_str", utf8())}); BatchesWithSchema input_right; input_right.batches = {ExecBatchFromJSON({int32(), int32(), utf8()}, R"([ - [5, 11, "alpha"], - [2, 12, "beta"], - [4, 16, "alpha"] - ])")}; + [5, 11, "alpha"], + [2, 12, "beta"], + [4, 16, "alpha"]])")}; input_right.schema = schema({field("r1", int32()), field("r2", int32()), field("r_str", utf8())}); @@ -2008,10 +2006,10 @@ TEST(HashJoin, ResidualFilter) { runner.Run(JoinType::FULL_OUTER, {"l_str"}, {"r_str"}, std::move(filter), {ExecBatchFromJSON({int32(), int32(), utf8(), int32(), int32(), utf8()}, R"([ - [1, 6, "alpha", 4, 16, "alpha"], - [1, 6, "alpha", 5, 11, "alpha"], - [2, 5, "beta", 2, 12, "beta"], - [3, 4, "alpha", 4, 16, "alpha"]])")}); + [1, 6, "alpha", 4, 16, "alpha"], + [1, 6, "alpha", 5, 11, "alpha"], + [2, 5, "beta", 2, 12, "beta"], + [3, 4, "alpha", 4, 16, "alpha"]])")}); } TEST(HashJoin, TrivialResidualFilter) { @@ -2028,14 +2026,12 @@ TEST(HashJoin, TrivialResidualFilter) { BatchesWithSchema input_left; input_left.batches = {ExecBatchFromJSON({int32(), utf8()}, R"([ - [1, "alpha"] - ])")}; + [1, "alpha"]])")}; input_left.schema = schema({field("l1", int32()), field("l_str", utf8())}); BatchesWithSchema input_right; input_right.batches = {ExecBatchFromJSON({int32(), utf8()}, R"([ - [1, "alpha"] - ])")}; + [1, "alpha"]])")}; input_right.schema = schema({field("r1", int32()), field("r_str", utf8())}); ResidualFilterCaseRunner runner{std::move(input_left), std::move(input_right)}; @@ -2055,29 +2051,41 @@ TEST(HashJoin, FineGrainedResidualFilter) { std::shared_ptr left, right; std::vector left_output, right_output; - std::vector 
LeftOutput() const { - std::vector output; - for (int i : left_output) { - output.push_back(FieldRef(i)); - }; + std::vector LeftOutput(JoinType join_type) const { + if (join_type == JoinType::RIGHT_SEMI || join_type == JoinType::RIGHT_ANTI) { + return {}; + } + std::vector output(left_output.size()); + std::transform(left_output.begin(), left_output.end(), output.begin(), + [](int i) { return i; }); return output; } - std::vector RightOutput() const { - std::vector output; - for (int i : right_output) { - output.push_back(FieldRef(i)); - }; + std::vector RightOutput(JoinType join_type) const { + if (join_type == JoinType::LEFT_SEMI || join_type == JoinType::LEFT_ANTI) { + return {}; + } + std::vector output(right_output.size()); + std::transform(right_output.begin(), right_output.end(), output.begin(), + [](int i) { return i; }); return output; } - ExecBatch Project(const ExecBatch& batch) const { + ExecBatch Project(JoinType join_type, const ExecBatch& batch) const { std::vector values; - for (int i : left_output) { - values.push_back(batch[i]); + if (join_type != JoinType::RIGHT_SEMI && join_type != JoinType::RIGHT_ANTI) { + for (int i : left_output) { + values.push_back(batch[i]); + } } - for (int i : right_output) { - values.push_back(batch[left_output.size() + i]); + if (join_type != JoinType::LEFT_SEMI && join_type != JoinType::LEFT_ANTI) { + int left_size = + join_type == JoinType::RIGHT_SEMI || join_type == JoinType::RIGHT_ANTI + ? 
0 + : left->num_fields(); + for (int i : right_output) { + values.push_back(batch[left_size + i]); + } } return {std::move(values), batch.length}; } @@ -2090,74 +2098,212 @@ TEST(HashJoin, FineGrainedResidualFilter) { BatchesWithSchema left; left.batches = {ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["left_only", null, "payload"], - ["left_only", 0, "payload"], - ["left_only", 42, "payload"], - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])")}; + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["left_only", null, "payload"], + ["left_only", 0, "payload"], + ["left_only", 42, "payload"], + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])")}; left.schema = schema( {field("l_key", utf8()), field("l_filter", int32()), field("l_payload", utf8())}); BatchesWithSchema right; right.batches = {ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"], - ["right_only", null, "payload"], - ["right_only", 0, "payload"], - ["right_only", 42, "payload"]])")}; + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"], + ["right_only", null, "payload"], + ["right_only", 0, "payload"], + ["right_only", 42, "payload"]])")}; right.schema = schema( {field("r_key", utf8()), field("r_filter", int32()), field("r_payload", 
utf8())}); - const ResidualFilterCaseRunner runner{std::move(left), std::move(right)}; JoinSchema join_schema{left.schema, right.schema}; std::vector projectors{ - join_schema.GetProjector({0, 1, 2}, {0, 1, 2}), // Output all. - join_schema.GetProjector({0, 1}, {0, 1}), // Output key columns only. - join_schema.GetProjector({0, 1, 2}, {0, 1}), // Output left payload only. - join_schema.GetProjector({0, 1}, {0, 1, 2}), // Output right payload only. - join_schema.GetProjector({0, 1, 2}, {0, 1, 2})}; // Output all.}; + join_schema.GetProjector({0, 1, 2}, {0, 1, 2}), // Output all. + join_schema.GetProjector({0}, {0}), // Output key columns only. + join_schema.GetProjector({1}, {1}), // Output filter columns only. + join_schema.GetProjector({2}, {2})}; // Output payload columns only. + + const ResidualFilterCaseRunner runner{std::move(left), std::move(right)}; { // Literal true. Expression filter = literal(true); + std::vector left_keys{"l_key", "l_filter"}, right_keys{"r_key", "r_filter"}; { // Inner join. + JoinType join_type = JoinType::INNER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left outer join. 
+ JoinType join_type = JoinType::LEFT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right outer join. + JoinType join_type = JoinType::RIGHT_OUTER; auto expected = - ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, - R"([ - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); - { - // for (const auto& projector : projectors) { - // runner.Run(JoinType::INNER, {"l_key", "l_filter"}, {"r_key", "r_filter"}, - // projector.LeftOutput(), projector.RightOutput(), filter, - // {projector.Project(expected)}); - // } - // Output all. - // Output all. 
- runner.Run(JoinType::INNER, {"l_key", "l_filter"}, {"r_key", "r_filter"}, filter, - {ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, - R"([ - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])")}); + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Full outer join. 
+ JoinType join_type = JoinType::FULL_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left semi join. + JoinType join_type = JoinType::LEFT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left anti join. 
+ JoinType join_type = JoinType::LEFT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["left_only", null, "payload"], + ["left_only", 0, "payload"], + ["left_only", 42, "payload"], + ["both1", null, "payload"], + ["both2", null, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right semi join. + JoinType join_type = JoinType::RIGHT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right anti join. 
+ JoinType join_type = JoinType::RIGHT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["both1", null, "payload"], + ["both2", null, "payload"], + ["right_only", null, "payload"], + ["right_only", 0, "payload"], + ["right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); } } } From 8f9db832036c3f6895f8b819552fb5940683ae21 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 00:49:39 +0800 Subject: [PATCH 20/33] Literal false and null --- cpp/src/arrow/acero/hash_join_node_test.cc | 171 +++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 8fb7ec0d887..eef09dbb012 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -2307,6 +2307,177 @@ TEST(HashJoin, FineGrainedResidualFilter) { } } } + + { + // Literal false and null. + for (Expression filter : {literal(false), literal(NullScalar())}) { + std::vector left_keys{"l_key", "l_filter"}, + right_keys{"r_key", "r_filter"}; + { + // Inner join. + JoinType join_type = JoinType::INNER; + auto expected = ExecBatchFromJSON( + {utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left outer join. 
+ JoinType join_type = JoinType::LEFT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both1", 0, "payload", null, null, null], + ["both1", 42, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both2", 0, "payload", null, null, null], + ["both2", 42, "payload", null, null, null]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right outer join. + JoinType join_type = JoinType::RIGHT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both1", 0, "payload"], + [null, null, null, "both1", 42, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "both2", 0, "payload"], + [null, null, null, "both2", 42, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Full outer join. 
+ JoinType join_type = JoinType::FULL_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both1", 0, "payload", null, null, null], + ["both1", 42, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both2", 0, "payload", null, null, null], + ["both2", 42, "payload", null, null, null], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both1", 0, "payload"], + [null, null, null, "both1", 42, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "both2", 0, "payload"], + [null, null, null, "both2", 42, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left semi join. + JoinType join_type = JoinType::LEFT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left anti join. 
+ JoinType join_type = JoinType::LEFT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["left_only", null, "payload"], + ["left_only", 0, "payload"], + ["left_only", 42, "payload"], + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right semi join. + JoinType join_type = JoinType::RIGHT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right anti join. 
+ JoinType join_type = JoinType::RIGHT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"], + ["right_only", null, "payload"], + ["right_only", 0, "payload"], + ["right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + } + } } HashJoinNodeOptions GenerateHashJoinNodeOptions(Random64Bit& rng, int num_left_cols, From 183bf86e4759326f8f3c114bd9b429ebe45fde91 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 01:01:03 +0800 Subject: [PATCH 21/33] Scalar true, false and null --- cpp/src/arrow/acero/hash_join_node_test.cc | 308 +++++++++++---------- 1 file changed, 156 insertions(+), 152 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index eef09dbb012..efc77aec734 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -2140,177 +2140,181 @@ TEST(HashJoin, FineGrainedResidualFilter) { const ResidualFilterCaseRunner runner{std::move(left), std::move(right)}; { - // Literal true. - Expression filter = literal(true); - std::vector left_keys{"l_key", "l_filter"}, right_keys{"r_key", "r_filter"}; - { - // Inner join. 
- JoinType join_type = JoinType::INNER; - auto expected = - ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); - for (const auto& projector : projectors) { - runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), - projector.RightOutput(join_type), filter, - {projector.Project(join_type, expected)}); + // Literal true and scalar true. + for (Expression filter : {literal(true), equal(literal(1), literal(1))}) { + std::vector left_keys{"l_key", "l_filter"}, + right_keys{"r_key", "r_filter"}; + { + // Inner join. + JoinType join_type = JoinType::INNER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } } - } - { - // Left outer join. 
- JoinType join_type = JoinType::LEFT_OUTER; - auto expected = - ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); - for (const auto& projector : projectors) { - runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), - projector.RightOutput(join_type), filter, - {projector.Project(join_type, expected)}); + { + // Left outer join. + JoinType join_type = JoinType::LEFT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } } - } - { - // Right outer join. 
- JoinType join_type = JoinType::RIGHT_OUTER; - auto expected = - ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); - for (const auto& projector : projectors) { - runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), - projector.RightOutput(join_type), filter, - {projector.Project(join_type, expected)}); + { + // Right outer join. + JoinType join_type = JoinType::RIGHT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } } - } - { - // Full outer join. 
- JoinType join_type = JoinType::FULL_OUTER; - auto expected = - ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); - for (const auto& projector : projectors) { - runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), - projector.RightOutput(join_type), filter, - {projector.Project(join_type, expected)}); + { + // Full outer join. 
+ JoinType join_type = JoinType::FULL_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } } - } - { - // Left semi join. - JoinType join_type = JoinType::LEFT_SEMI; - auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); - for (const auto& projector : projectors) { - runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), - projector.RightOutput(join_type), filter, - {projector.Project(join_type, expected)}); + { + // Left semi join. 
+ JoinType join_type = JoinType::LEFT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } } - } - { - // Left anti join. - JoinType join_type = JoinType::LEFT_ANTI; - auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["left_only", null, "payload"], - ["left_only", 0, "payload"], - ["left_only", 42, "payload"], - ["both1", null, "payload"], - ["both2", null, "payload"]])"); - for (const auto& projector : projectors) { - runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), - projector.RightOutput(join_type), filter, - {projector.Project(join_type, expected)}); + { + // Left anti join. + JoinType join_type = JoinType::LEFT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["left_only", null, "payload"], + ["left_only", 0, "payload"], + ["left_only", 42, "payload"], + ["both1", null, "payload"], + ["both2", null, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } } - } - { - // Right semi join. 
- JoinType join_type = JoinType::RIGHT_SEMI; - auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); - for (const auto& projector : projectors) { - runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), - projector.RightOutput(join_type), filter, - {projector.Project(join_type, expected)}); + { + // Right semi join. + JoinType join_type = JoinType::RIGHT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } } - } - { - // Right anti join. - JoinType join_type = JoinType::RIGHT_ANTI; - auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["both1", null, "payload"], - ["both2", null, "payload"], - ["right_only", null, "payload"], - ["right_only", 0, "payload"], - ["right_only", 42, "payload"]])"); - for (const auto& projector : projectors) { - runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), - projector.RightOutput(join_type), filter, - {projector.Project(join_type, expected)}); + { + // Right anti join. 
+ JoinType join_type = JoinType::RIGHT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["both1", null, "payload"], + ["both2", null, "payload"], + ["right_only", null, "payload"], + ["right_only", 0, "payload"], + ["right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } } } } { - // Literal false and null. - for (Expression filter : {literal(false), literal(NullScalar())}) { + // Literal false, null, and scalar false, null. + for (Expression filter : + {literal(false), literal(NullScalar()), equal(literal(0), literal(1)), + equal(literal(1), literal(NullScalar()))}) { std::vector left_keys{"l_key", "l_filter"}, right_keys{"r_key", "r_filter"}; { From a1ade8e7dd68616e78750d3c3c8654b0f0b420b3 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 10:16:56 +0800 Subject: [PATCH 22/33] More test --- cpp/src/arrow/acero/hash_join_node_test.cc | 531 +++++++++++++++++++++ 1 file changed, 531 insertions(+) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index efc77aec734..eee22ce1b3a 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -2482,6 +2482,537 @@ TEST(HashJoin, FineGrainedResidualFilter) { } } } + + { + // Non-trivial filters referring left columns only. + for (Expression filter : {equal(field_ref("l_filter"), literal(42)), + not_equal(literal(0), field_ref("l_filter"))}) { + std::vector left_keys{"l_key"}, right_keys{"r_key"}; + { + // Inner join. 
+ JoinType join_type = JoinType::INNER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", 42, "payload", "both1", null, "payload"], + ["both1", 42, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 42, "payload", "both2", null, "payload"], + ["both2", 42, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left outer join. + JoinType join_type = JoinType::LEFT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both1", 0, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both2", 0, "payload", null, null, null], + ["both1", 42, "payload", "both1", null, "payload"], + ["both1", 42, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 42, "payload", "both2", null, "payload"], + ["both2", 42, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right outer join. 
+ JoinType join_type = JoinType::RIGHT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", 42, "payload", "both1", null, "payload"], + ["both1", 42, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 42, "payload", "both2", null, "payload"], + ["both2", 42, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Full outer join. + JoinType join_type = JoinType::FULL_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both1", 0, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both2", 0, "payload", null, null, null], + ["both1", 42, "payload", "both1", null, "payload"], + ["both1", 42, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 42, "payload", "both2", null, "payload"], + ["both2", 42, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, 
"payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left semi join. + JoinType join_type = JoinType::LEFT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", 42, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left anti join. + JoinType join_type = JoinType::LEFT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["left_only", null, "payload"], + ["left_only", 0, "payload"], + ["left_only", 42, "payload"], + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right semi join. 
+ JoinType join_type = JoinType::RIGHT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right anti join. + JoinType join_type = JoinType::RIGHT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["right_only", null, "payload"], + ["right_only", 0, "payload"], + ["right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + } + } + + { + // Non-trivial filters referring right columns only. + for (Expression filter : {equal(field_ref("r_filter"), literal(42)), + not_equal(literal(0), field_ref("r_filter"))}) { + std::vector left_keys{"l_key"}, right_keys{"r_key"}; + { + // Inner join. 
+ JoinType join_type = JoinType::INNER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", null, "payload", "both1", 42, "payload"], + ["both1", 0, "payload", "both1", 42, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", null, "payload", "both2", 42, "payload"], + ["both2", 0, "payload", "both2", 42, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left outer join. + JoinType join_type = JoinType::LEFT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", "both1", 42, "payload"], + ["both1", 0, "payload", "both1", 42, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", null, "payload", "both2", 42, "payload"], + ["both2", 0, "payload", "both2", 42, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right outer join. 
+ JoinType join_type = JoinType::RIGHT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", null, "payload", "both1", 42, "payload"], + ["both1", 0, "payload", "both1", 42, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", null, "payload", "both2", 42, "payload"], + ["both2", 0, "payload", "both2", 42, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both1", 0, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "both2", 0, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Full outer join. 
+ JoinType join_type = JoinType::FULL_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", "both1", 42, "payload"], + ["both1", 0, "payload", "both1", 42, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", null, "payload", "both2", 42, "payload"], + ["both2", 0, "payload", "both2", 42, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both1", 0, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "both2", 0, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left semi join. + JoinType join_type = JoinType::LEFT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left anti join. 
+ JoinType join_type = JoinType::LEFT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["left_only", null, "payload"], + ["left_only", 0, "payload"], + ["left_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right semi join. + JoinType join_type = JoinType::RIGHT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", 42, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right anti join. + JoinType join_type = JoinType::RIGHT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["both1", null, "payload"], + ["both1", 0, "payload"], + ["both2", null, "payload"], + ["both2", 0, "payload"], + ["right_only", null, "payload"], + ["right_only", 0, "payload"], + ["right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + } + } + + { + // Non-trivial filters referring both left and right columns. + for (Expression filter : + {equal(field_ref("l_filter"), field_ref("r_filter")), + equal(call("subtract", {field_ref("l_filter"), field_ref("r_filter")}), + literal(0))}) { + std::vector left_keys{"l_key"}, right_keys{"r_key"}; + { + // Inner join. 
+ JoinType join_type = JoinType::INNER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left outer join. + JoinType join_type = JoinType::LEFT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right outer join. 
+ JoinType join_type = JoinType::RIGHT_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, "right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Full outer join. + JoinType join_type = JoinType::FULL_OUTER; + auto expected = + ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ + [null, null, "payload", null, null, null], + [null, 0, "payload", null, null, null], + [null, 42, "payload", null, null, null], + ["left_only", null, "payload", null, null, null], + ["left_only", 0, "payload", null, null, null], + ["left_only", 42, "payload", null, null, null], + ["both1", null, "payload", null, null, null], + ["both2", null, "payload", null, null, null], + ["both1", 0, "payload", "both1", 0, "payload"], + ["both1", 42, "payload", "both1", 42, "payload"], + ["both2", 0, "payload", "both2", 0, "payload"], + ["both2", 42, "payload", "both2", 42, "payload"], + [null, null, null, null, null, "payload"], + [null, null, null, null, 0, "payload"], + [null, null, null, null, 42, "payload"], + [null, null, null, "both1", null, "payload"], + [null, null, null, "both2", null, "payload"], + [null, null, null, "right_only", null, "payload"], + [null, null, null, 
"right_only", 0, "payload"], + [null, null, null, "right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left semi join. + JoinType join_type = JoinType::LEFT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Left anti join. + JoinType join_type = JoinType::LEFT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["left_only", null, "payload"], + ["left_only", 0, "payload"], + ["left_only", 42, "payload"], + ["both1", null, "payload"], + ["both2", null, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right semi join. + JoinType join_type = JoinType::RIGHT_SEMI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + ["both1", 0, "payload"], + ["both1", 42, "payload"], + ["both2", 0, "payload"], + ["both2", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + + { + // Right anti join. 
+ JoinType join_type = JoinType::RIGHT_ANTI; + auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ + [null, null, "payload"], + [null, 0, "payload"], + [null, 42, "payload"], + ["both1", null, "payload"], + ["both2", null, "payload"], + ["right_only", null, "payload"], + ["right_only", 0, "payload"], + ["right_only", 42, "payload"]])"); + for (const auto& projector : projectors) { + runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), + projector.RightOutput(join_type), filter, + {projector.Project(join_type, expected)}); + } + } + } + } } HashJoinNodeOptions GenerateHashJoinNodeOptions(Random64Bit& rng, int num_left_cols, From 941712027e020136dfaf1a78b31edb147eb79ac8 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 10:30:44 +0800 Subject: [PATCH 23/33] Fix windows build --- cpp/src/arrow/acero/hash_join_node_test.cc | 24 +++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index eee22ce1b3a..5957bd82059 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -1919,7 +1919,7 @@ class ResidualFilterCaseRunner { private: void RunInternal(const HashJoinNodeOptions& options, const std::vector& expected) const { - auto join_type_str = ToString(options.join_type); + auto join_type_str = JoinTypeString(options.join_type); auto join_cond_str = JoinConditionString(options.left_keys, options.right_keys, options.filter); auto output_str = OutputString(options.left_output, options.right_output); @@ -1948,6 +1948,28 @@ class ResidualFilterCaseRunner { BatchesWithSchema right_input_; private: + static std::string JoinTypeString(JoinType t) { + switch (t) { + case JoinType::LEFT_SEMI: + return "LEFT_SEMI"; + case JoinType::RIGHT_SEMI: + return "RIGHT_SEMI"; + case JoinType::LEFT_ANTI: + return "LEFT_ANTI"; + case JoinType::RIGHT_ANTI: + return "RIGHT_ANTI"; 
+ case JoinType::INNER: + return "INNER"; + case JoinType::LEFT_OUTER: + return "LEFT_OUTER"; + case JoinType::RIGHT_OUTER: + return "RIGHT_OUTER"; + case JoinType::FULL_OUTER: + return "FULL_OUTER"; + } + ARROW_DCHECK(false); + } + static std::string JoinConditionString(const std::vector& left_keys, const std::vector& right_keys, const Expression& filter) { From e45be434e2840d3abc7cf532fd23a50ed341a019 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 10:44:57 +0800 Subject: [PATCH 24/33] Fix build issue --- cpp/src/arrow/acero/hash_join_node_test.cc | 1 + cpp/src/arrow/acero/swiss_join.cc | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 5957bd82059..36997e0eb37 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -1968,6 +1968,7 @@ class ResidualFilterCaseRunner { return "FULL_OUTER"; } ARROW_DCHECK(false); + return ""; } static std::string JoinConditionString(const std::vector& left_keys, diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 8a3a3b9156e..5e55a59ee3f 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1902,7 +1902,7 @@ void JoinResidualFilter::Init(Expression filter, QueryContext* ctx, MemoryPool* } else if (idx = to_payload.get(i); idx != SchemaProjectionMap::kMissingField) { probe_filter_to_key_and_payload_[i] = idx + num_key_cols; } else { - DCHECK(false); + ARROW_DCHECK(false); } } } @@ -1919,7 +1919,7 @@ void JoinResidualFilter::Init(Expression filter, QueryContext* ctx, MemoryPool* } else if (to_payload.get(i) != SchemaProjectionMap::kMissingField) { num_build_payloads_referred_++; } else { - DCHECK(false); + ARROW_DCHECK(false); } } } From 8e50c18950ce8c20acd927e2e5260fe0a19c3173 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 10:54:31 +0800 Subject: [PATCH 25/33] 
Fix comment --- cpp/src/arrow/acero/swiss_join_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 1c305d3c162..aa36a611092 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -774,7 +774,7 @@ class JoinResidualFilter { // Left-outer and full-outer joins can result in a different bit-vector than the one of // probing the hash table if the residual filter is not a literal true. If so, caller - // should setup a bit-vector for filtering properly and call `UpdateMatchBitVector` + // should setup a bit-vector for filtering properly and call `UpdateFilterBitVector` // accordingly. // bool NeedFilterBitVector(JoinType join_type) const { From 04f3d199072d35230084859b541e9e2abef0f044 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 12:01:52 +0800 Subject: [PATCH 26/33] Fix ubsan --- cpp/src/arrow/acero/swiss_join.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 5e55a59ee3f..e7a3ccea8fe 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1578,6 +1578,10 @@ Status JoinResultMaterialize::AppendProbeOnly(const ExecBatch& key_and_payload, int num_rows_to_append, const uint16_t* row_ids, int* num_rows_appended) { + if (num_rows_to_append) { + *num_rows_appended = 0; + return Status::OK(); + } num_rows_to_append = std::min(ExecBatchBuilder::num_rows_max() - num_rows_, num_rows_to_append); if (HasProbeOutput()) { @@ -1604,6 +1608,10 @@ Status JoinResultMaterialize::AppendBuildOnly(int num_rows_to_append, const uint32_t* key_ids, const uint32_t* payload_ids, int* num_rows_appended) { + if (num_rows_to_append) { + *num_rows_appended = 0; + return Status::OK(); + } num_rows_to_append = std::min(ExecBatchBuilder::num_rows_max() - num_rows_, num_rows_to_append); if 
(HasProbeOutput()) { @@ -1631,6 +1639,10 @@ Status JoinResultMaterialize::Append(const ExecBatch& key_and_payload, int num_rows_to_append, const uint16_t* row_ids, const uint32_t* key_ids, const uint32_t* payload_ids, int* num_rows_appended) { + if (num_rows_to_append) { + *num_rows_appended = 0; + return Status::OK(); + } num_rows_to_append = std::min(ExecBatchBuilder::num_rows_max() - num_rows_, num_rows_to_append); if (HasProbeOutput()) { From ed1dd4b580c21dfdfa79c83e594739da86727e91 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Fri, 12 Jan 2024 12:16:40 +0800 Subject: [PATCH 27/33] Fix bug --- cpp/src/arrow/acero/swiss_join.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index e7a3ccea8fe..68b0e37b01a 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1578,7 +1578,7 @@ Status JoinResultMaterialize::AppendProbeOnly(const ExecBatch& key_and_payload, int num_rows_to_append, const uint16_t* row_ids, int* num_rows_appended) { - if (num_rows_to_append) { + if (num_rows_to_append == 0) { *num_rows_appended = 0; return Status::OK(); } @@ -1608,7 +1608,7 @@ Status JoinResultMaterialize::AppendBuildOnly(int num_rows_to_append, const uint32_t* key_ids, const uint32_t* payload_ids, int* num_rows_appended) { - if (num_rows_to_append) { + if (num_rows_to_append == 0) { *num_rows_appended = 0; return Status::OK(); } @@ -1639,7 +1639,7 @@ Status JoinResultMaterialize::Append(const ExecBatch& key_and_payload, int num_rows_to_append, const uint16_t* row_ids, const uint32_t* key_ids, const uint32_t* payload_ids, int* num_rows_appended) { - if (num_rows_to_append) { + if (num_rows_to_append == 0) { *num_rows_appended = 0; return Status::OK(); } From 245a6a69fe34aac5422afc8accac4591b6d8caa7 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Tue, 23 Jan 2024 15:54:14 +0800 Subject: [PATCH 28/33] Minor fix --- 
cpp/src/arrow/acero/hash_join_node_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 36997e0eb37..902ac213969 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -1898,17 +1898,17 @@ class ResidualFilterCaseRunner { ResidualFilterCaseRunner(BatchesWithSchema left_input, BatchesWithSchema right_input) : left_input_(std::move(left_input)), right_input_(std::move(right_input)) {} - void Run(JoinType join_type, const std::vector& left_keys, - const std::vector& right_keys, Expression filter, - const std::vector& expected) const { + void Run(JoinType join_type, std::vector left_keys, + std::vector right_keys, Expression filter, + std::vector expected) const { RunInternal(HashJoinNodeOptions{join_type, std::move(left_keys), std::move(right_keys), std::move(filter)}, expected); } void Run(JoinType join_type, std::vector left_keys, - const std::vector right_keys, std::vector left_output, - const std::vector right_output, Expression filter, + std::vector right_keys, std::vector left_output, + std::vector right_output, Expression filter, const std::vector& expected) const { RunInternal(HashJoinNodeOptions{join_type, std::move(left_keys), std::move(right_keys), std::move(left_output), @@ -2007,9 +2007,9 @@ class ResidualFilterCaseRunner { TEST(HashJoin, ResidualFilter) { BatchesWithSchema input_left; input_left.batches = {ExecBatchFromJSON({int32(), int32(), utf8()}, R"([ - [1, 6, "alpha"], - [2, 5, "beta"], - [3, 4, "alpha"]])")}; + [1, 6, "alpha"], + [2, 5, "beta"], + [3, 4, "alpha"]])")}; input_left.schema = schema({field("l1", int32()), field("l2", int32()), field("l_str", utf8())}); From d651fd9ceee2b1b91e308b332375c2651cca3e3a Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Tue, 23 Jan 2024 15:59:11 +0800 Subject: [PATCH 29/33] Minor fix --- cpp/src/arrow/acero/hash_join_node_test.cc 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 902ac213969..a2423383da8 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -1900,7 +1900,7 @@ class ResidualFilterCaseRunner { void Run(JoinType join_type, std::vector left_keys, std::vector right_keys, Expression filter, - std::vector expected) const { + const std::vector& expected) const { RunInternal(HashJoinNodeOptions{join_type, std::move(left_keys), std::move(right_keys), std::move(filter)}, expected); From 7f14432eebd06182fbf035f567e810b52e044ab3 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Tue, 23 Jan 2024 16:29:42 +0800 Subject: [PATCH 30/33] Minor fix --- cpp/src/arrow/acero/hash_join_node_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index a2423383da8..97d11f51e45 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -1925,8 +1925,8 @@ class ResidualFilterCaseRunner { auto output_str = OutputString(options.left_output, options.right_output); for (bool parallel : {false, true}) { auto parallel_str = parallel ? 
"parallel" : "serial"; - SCOPED_TRACE(join_type_str + " " + join_cond_str + " " + output_str + " " + - parallel_str); + ARROW_SCOPED_TRACE(join_type_str + " " + join_cond_str + " " + output_str + " " + + parallel_str); Declaration left{"source", SourceNodeOptions{left_input_.schema, From 7a49012f3812d11a9cde133d6f301bc9a8d5aac9 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 24 Jan 2024 11:42:40 +0800 Subject: [PATCH 31/33] Add benchmark (#2) * Add benchmark * Fix benchmark * Fix benchmark --- cpp/src/arrow/acero/hash_join_benchmark.cc | 197 ++++++++++++++++++++- 1 file changed, 192 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index 9be4bed6065..72ab0651050 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -51,6 +51,10 @@ struct BenchmarkSettings { double null_percentage = 0.0; double cardinality = 1.0; // Proportion of distinct keys in build side double selectivity = 1.0; // Probability of a match for a given row + int var_length_min = 2; // Minimal length of any var length types + int var_length_max = 20; // Maximum length of any var length types + + Expression residual_filter = literal(true); }; class JoinBenchmark { @@ -79,8 +83,8 @@ class JoinBenchmark { build_metadata["null_probability"] = std::to_string(settings.null_percentage); build_metadata["min"] = std::to_string(min_build_value); build_metadata["max"] = std::to_string(max_build_value); - build_metadata["min_length"] = "2"; - build_metadata["max_length"] = "20"; + build_metadata["min_length"] = settings.var_length_min; + build_metadata["max_length"] = settings.var_length_max; std::unordered_map probe_metadata; probe_metadata["null_probability"] = std::to_string(settings.null_percentage); @@ -126,10 +130,9 @@ class JoinBenchmark { stats_.num_probe_rows = settings.num_probe_batches * settings.batch_size; schema_mgr_ = std::make_unique(); - Expression filter = 
literal(true); DCHECK_OK(schema_mgr_->Init(settings.join_type, *l_batches_with_schema.schema, left_keys, *r_batches_with_schema.schema, right_keys, - filter, "l_", "r_")); + settings.residual_filter, "l_", "r_")); if (settings.use_basic_implementation) { join_ = *HashJoinImpl::MakeBasic(); @@ -158,7 +161,7 @@ class JoinBenchmark { DCHECK_OK(join_->Init( &ctx_, settings.join_type, settings.num_threads, &(schema_mgr_->proj_maps[0]), - &(schema_mgr_->proj_maps[1]), std::move(key_cmp), std::move(filter), + &(schema_mgr_->proj_maps[1]), std::move(key_cmp), settings.residual_filter, std::move(register_task_group_callback), std::move(start_task_group_callback), [](int64_t, ExecBatch) { return Status::OK(); }, [](int64_t) { return Status::OK(); })); @@ -308,6 +311,60 @@ static void BM_HashJoinBasic_NullPercentage(benchmark::State& st) { HashJoinBasicBenchmarkImpl(st, settings); } + +template +static void BM_HashJoinBasic_TrivialResidualFilter(benchmark::State& st, + JoinType join_type, + Expression residual_filter, + Args&&...) { + BenchmarkSettings settings; + settings.join_type = join_type; + settings.build_payload_types = {binary()}; + settings.probe_payload_types = {binary()}; + + settings.use_basic_implementation = st.range(0); + + settings.num_build_batches = 1024; + settings.num_probe_batches = 1024; + + // Let payload column length from 1 to 100. + settings.var_length_min = 1; + settings.var_length_max = 100; + + settings.residual_filter = std::move(residual_filter); + + HashJoinBasicBenchmarkImpl(st, settings); +} + +template +static void BM_HashJoinBasic_ComplexResidualFilter(benchmark::State& st, + JoinType join_type, Args&&...) { + BenchmarkSettings settings; + settings.join_type = join_type; + settings.build_payload_types = {binary()}; + settings.probe_payload_types = {binary()}; + + settings.use_basic_implementation = st.range(0); + + settings.num_build_batches = 1024; + settings.num_probe_batches = 1024; + + // Let payload column length from 1 to 100. 
+ settings.var_length_min = 1; + settings.var_length_max = 100; + + // Create filter referring payload columns from both sides. + // binary_length(probe_payload) + binary_length(build_payload) <= 2 * selectivity + settings.selectivity = static_cast(st.range(1)) / 100.0; + using arrow::compute::call; + using arrow::compute::field_ref; + settings.residual_filter = + call("less_equal", {call("plus", {call("binary_length", {field_ref("lp0")}), + call("binary_length", {field_ref("rp0")})}), + literal(2 * settings.selectivity)}); + + HashJoinBasicBenchmarkImpl(st, settings); +} #endif std::vector hashtable_krows = benchmark::CreateRange(1, 4096, 8); @@ -435,6 +492,136 @@ BENCHMARK(BM_HashJoinBasic_BuildParallelism) BENCHMARK(BM_HashJoinBasic_NullPercentage) ->ArgNames({"Null Percentage"}) ->DenseRange(0, 100, 10); + +std::string use_basic_argname = "Use basic"; +std::vector use_basic_arg = benchmark::CreateDenseRange(0, 1, 1); + +std::vector trivial_residual_filter_argnames = {use_basic_argname}; +std::vector> trivial_residual_filter_args = {use_basic_arg}; + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Inner/Literal(true)", + JoinType::INNER, literal(true)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Left Semi/Literal(true)", + JoinType::LEFT_SEMI, literal(true)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Right Semi/Literal(true)", + JoinType::RIGHT_SEMI, literal(true)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Left Anti/Literal(true)", + JoinType::LEFT_ANTI, literal(true)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + 
+BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Right Anti/Literal(true)", + JoinType::RIGHT_ANTI, literal(true)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Left Outer/Literal(true)", + JoinType::LEFT_OUTER, literal(true)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Right Outer/Literal(true)", + JoinType::RIGHT_OUTER, literal(true)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Full Outer/Literal(true)", + JoinType::FULL_OUTER, literal(true)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Inner/Literal(false)", + JoinType::INNER, literal(false)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Left Semi/Literal(false)", + JoinType::LEFT_SEMI, literal(false)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Right Semi/Literal(false)", + JoinType::RIGHT_SEMI, literal(false)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Left Anti/Literal(false)", + JoinType::LEFT_ANTI, literal(false)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Right Anti/Literal(false)", + JoinType::RIGHT_ANTI, literal(false)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + 
+BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Left Outer/Literal(false)", + JoinType::LEFT_OUTER, literal(false)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Right Outer/Literal(false)", + JoinType::RIGHT_OUTER, literal(false)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_TrivialResidualFilter, "Full Outer/Literal(false)", + JoinType::FULL_OUTER, literal(false)) + ->ArgNames(trivial_residual_filter_argnames) + ->ArgsProduct(trivial_residual_filter_args); + +std::vector complex_residual_filter_argnames = {use_basic_argname, + "Selectivity"}; +std::vector> complex_residual_filter_args = { + use_basic_arg, benchmark::CreateDenseRange(0, 100, 20)}; + +BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Inner", JoinType::INNER) + ->ArgNames(complex_residual_filter_argnames) + ->ArgsProduct(complex_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Left Semi", + JoinType::LEFT_SEMI) + ->ArgNames(complex_residual_filter_argnames) + ->ArgsProduct(complex_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Right Semi", + JoinType::RIGHT_SEMI) + ->ArgNames(complex_residual_filter_argnames) + ->ArgsProduct(complex_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Left Anti", + JoinType::LEFT_ANTI) + ->ArgNames(complex_residual_filter_argnames) + ->ArgsProduct(complex_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Right Anti", + JoinType::RIGHT_ANTI) + ->ArgNames(complex_residual_filter_argnames) + ->ArgsProduct(complex_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Left Outer", + JoinType::LEFT_OUTER) + ->ArgNames(complex_residual_filter_argnames) + 
->ArgsProduct(complex_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Right Outer", + JoinType::RIGHT_OUTER) + ->ArgNames(complex_residual_filter_argnames) + ->ArgsProduct(complex_residual_filter_args); + +BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Full Outer", + JoinType::FULL_OUTER) + ->ArgNames(complex_residual_filter_argnames) + ->ArgsProduct(complex_residual_filter_args); #else BENCHMARK_CAPTURE(BM_HashJoinBasic_KeyTypes, "{int32}", {int32()}) From 533c23860b88638a6d805dbeccfe29c6a85f01f9 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Wed, 24 Jan 2024 11:54:58 +0800 Subject: [PATCH 32/33] Fix lint --- cpp/src/arrow/acero/hash_join_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index 72ab0651050..993c0b9a705 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -493,7 +493,7 @@ BENCHMARK(BM_HashJoinBasic_NullPercentage) ->ArgNames({"Null Percentage"}) ->DenseRange(0, 100, 10); -std::string use_basic_argname = "Use basic"; +const char* use_basic_argname = "Use basic"; std::vector use_basic_arg = benchmark::CreateDenseRange(0, 1, 1); std::vector trivial_residual_filter_argnames = {use_basic_argname}; From 1070f4df3e4eb88533392e31e1d22698a9ad07d5 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 29 Feb 2024 01:25:25 +0800 Subject: [PATCH 33/33] Address comments --- cpp/src/arrow/acero/hash_join_node_test.cc | 805 +++++++++++---------- 1 file changed, 403 insertions(+), 402 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 97d11f51e45..63969d9a3ed 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -1989,8 +1989,9 @@ class ResidualFilterCaseRunner { static std::string OutputString(const std::vector& left_output, 
const std::vector& right_output) { std::vector both_output; - std::copy(left_output.begin(), left_output.end(), std::back_inserter(both_output)); - std::copy(right_output.begin(), right_output.end(), std::back_inserter(both_output)); + both_output.reserve(left_output.size() + right_output.size()); + both_output.insert(both_output.end(), left_output.begin(), left_output.end()); + both_output.insert(both_output.end(), right_output.begin(), right_output.end()); std::stringstream ss; ss << "output ("; for (size_t i = 0; i < both_output.size(); ++i) { @@ -2121,35 +2122,35 @@ TEST(HashJoin, FineGrainedResidualFilter) { BatchesWithSchema left; left.batches = {ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["left_only", null, "payload"], - ["left_only", 0, "payload"], - ["left_only", 42, "payload"], - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])")}; + [null, null, "l_payload"], + [null, 0, "l_payload"], + [null, 42, "l_payload"], + ["left_only", null, "l_payload"], + ["left_only", 0, "l_payload"], + ["left_only", 42, "l_payload"], + ["both1", null, "l_payload"], + ["both1", 0, "l_payload"], + ["both1", 42, "l_payload"], + ["both2", null, "l_payload"], + ["both2", 0, "l_payload"], + ["both2", 42, "l_payload"]])")}; left.schema = schema( {field("l_key", utf8()), field("l_filter", int32()), field("l_payload", utf8())}); BatchesWithSchema right; right.batches = {ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"], - ["right_only", null, "payload"], - ["right_only", 0, "payload"], - ["right_only", 42, "payload"]])")}; + [null, null, 
"r_payload"], + [null, 0, "r_payload"], + [null, 42, "r_payload"], + ["both1", null, "r_payload"], + ["both1", 0, "r_payload"], + ["both1", 42, "r_payload"], + ["both2", null, "r_payload"], + ["both2", 0, "r_payload"], + ["both2", 42, "r_payload"], + ["right_only", null, "r_payload"], + ["right_only", 0, "r_payload"], + ["right_only", 42, "r_payload"]])")}; right.schema = schema( {field("r_key", utf8()), field("r_filter", int32()), field("r_payload", utf8())}); @@ -2172,10 +2173,10 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::INNER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); + ["both1", 0, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2188,18 +2189,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::LEFT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - 
["both2", 42, "payload", "both2", 42, "payload"]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", null, null, null], + ["both2", null, "l_payload", null, null, null], + ["both1", 0, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2212,18 +2213,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::RIGHT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + ["both1", 0, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, 
null, "both1", null, "r_payload"], + [null, null, null, "both2", null, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2236,26 +2237,26 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::FULL_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", null, null, null], + ["both2", null, "l_payload", null, null, null], + ["both1", 0, "l_payload", 
"both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "both1", null, "r_payload"], + [null, null, null, "both2", null, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2267,10 +2268,10 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left semi join. JoinType join_type = JoinType::LEFT_SEMI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); + ["both1", 0, "l_payload"], + ["both1", 42, "l_payload"], + ["both2", 0, "l_payload"], + ["both2", 42, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2282,14 +2283,14 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left anti join. 
JoinType join_type = JoinType::LEFT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["left_only", null, "payload"], - ["left_only", 0, "payload"], - ["left_only", 42, "payload"], - ["both1", null, "payload"], - ["both2", null, "payload"]])"); + [null, null, "l_payload"], + [null, 0, "l_payload"], + [null, 42, "l_payload"], + ["left_only", null, "l_payload"], + ["left_only", 0, "l_payload"], + ["left_only", 42, "l_payload"], + ["both1", null, "l_payload"], + ["both2", null, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2301,10 +2302,10 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right semi join. JoinType join_type = JoinType::RIGHT_SEMI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); + ["both1", 0, "r_payload"], + ["both1", 42, "r_payload"], + ["both2", 0, "r_payload"], + ["both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2316,14 +2317,14 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right anti join. 
JoinType join_type = JoinType::RIGHT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["both1", null, "payload"], - ["both2", null, "payload"], - ["right_only", null, "payload"], - ["right_only", 0, "payload"], - ["right_only", 42, "payload"]])"); + [null, null, "r_payload"], + [null, 0, "r_payload"], + [null, 42, "r_payload"], + ["both1", null, "r_payload"], + ["both2", null, "r_payload"], + ["right_only", null, "r_payload"], + ["right_only", 0, "r_payload"], + ["right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2357,18 +2358,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::LEFT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both1", 0, "payload", null, null, null], - ["both1", 42, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both2", 0, "payload", null, null, null], - ["both2", 42, "payload", null, null, null]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", null, null, null], + ["both1", 0, "l_payload", null, null, null], + ["both1", 42, "l_payload", null, null, null], + ["both2", null, 
"l_payload", null, null, null], + ["both2", 0, "l_payload", null, null, null], + ["both2", 42, "l_payload", null, null, null]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2381,18 +2382,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::RIGHT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both1", 0, "payload"], - [null, null, null, "both1", 42, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "both2", 0, "payload"], - [null, null, null, "both2", 42, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "both1", null, "r_payload"], + [null, null, null, "both1", 0, "r_payload"], + [null, null, null, "both1", 42, "r_payload"], + [null, null, null, "both2", null, "r_payload"], + [null, null, null, "both2", 0, "r_payload"], + [null, null, null, "both2", 42, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2405,30 +2406,30 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::FULL_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), 
int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both1", 0, "payload", null, null, null], - ["both1", 42, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both2", 0, "payload", null, null, null], - ["both2", 42, "payload", null, null, null], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both1", 0, "payload"], - [null, null, null, "both1", 42, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "both2", 0, "payload"], - [null, null, null, "both2", 42, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", null, null, null], + ["both1", 0, "l_payload", null, null, null], + ["both1", 42, "l_payload", null, null, null], + ["both2", null, "l_payload", null, null, null], + ["both2", 0, "l_payload", null, null, null], + ["both2", 42, "l_payload", null, null, null], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "both1", null, "r_payload"], + [null, null, null, "both1", 0, "r_payload"], + [null, null, null, "both1", 42, 
"r_payload"], + [null, null, null, "both2", null, "r_payload"], + [null, null, null, "both2", 0, "r_payload"], + [null, null, null, "both2", 42, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2451,18 +2452,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left anti join. JoinType join_type = JoinType::LEFT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["left_only", null, "payload"], - ["left_only", 0, "payload"], - ["left_only", 42, "payload"], - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); + [null, null, "l_payload"], + [null, 0, "l_payload"], + [null, 42, "l_payload"], + ["left_only", null, "l_payload"], + ["left_only", 0, "l_payload"], + ["left_only", 42, "l_payload"], + ["both1", null, "l_payload"], + ["both1", 0, "l_payload"], + ["both1", 42, "l_payload"], + ["both2", null, "l_payload"], + ["both2", 0, "l_payload"], + ["both2", 42, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2485,18 +2486,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right anti join. 
JoinType join_type = JoinType::RIGHT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"], - ["right_only", null, "payload"], - ["right_only", 0, "payload"], - ["right_only", 42, "payload"]])"); + [null, null, "r_payload"], + [null, 0, "r_payload"], + [null, 42, "r_payload"], + ["both1", null, "r_payload"], + ["both1", 0, "r_payload"], + ["both1", 42, "r_payload"], + ["both2", null, "r_payload"], + ["both2", 0, "r_payload"], + ["both2", 42, "r_payload"], + ["right_only", null, "r_payload"], + ["right_only", 0, "r_payload"], + ["right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2516,12 +2517,12 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::INNER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", 42, "payload", "both1", null, "payload"], - ["both1", 42, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 42, "payload", "both2", null, "payload"], - ["both2", 42, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); + ["both1", 42, "l_payload", "both1", null, "r_payload"], + ["both1", 42, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 42, "l_payload", "both2", null, "r_payload"], + ["both2", 42, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), 
projector.RightOutput(join_type), filter, @@ -2534,22 +2535,22 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::LEFT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both1", 0, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both2", 0, "payload", null, null, null], - ["both1", 42, "payload", "both1", null, "payload"], - ["both1", 42, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 42, "payload", "both2", null, "payload"], - ["both2", 42, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", null, null, null], + ["both1", 0, "l_payload", null, null, null], + ["both2", null, "l_payload", null, null, null], + ["both2", 0, "l_payload", null, null, null], + ["both1", 42, "l_payload", "both1", null, "r_payload"], + ["both1", 42, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 42, "l_payload", "both2", null, "r_payload"], + ["both2", 42, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), 
projector.RightOutput(join_type), filter, @@ -2562,18 +2563,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::RIGHT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", 42, "payload", "both1", null, "payload"], - ["both1", 42, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 42, "payload", "both2", null, "payload"], - ["both2", 42, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + ["both1", 42, "l_payload", "both1", null, "r_payload"], + ["both1", 42, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 42, "l_payload", "both2", null, "r_payload"], + ["both2", 42, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2586,28 +2587,28 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::FULL_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", 
null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both1", 0, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both2", 0, "payload", null, null, null], - ["both1", 42, "payload", "both1", null, "payload"], - ["both1", 42, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 42, "payload", "both2", null, "payload"], - ["both2", 42, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", null, null, null], + ["both1", 0, "l_payload", null, null, null], + ["both2", null, "l_payload", null, null, null], + ["both2", 0, "l_payload", null, null, null], + ["both1", 42, "l_payload", "both1", null, "r_payload"], + ["both1", 42, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 42, "l_payload", "both2", null, "r_payload"], + ["both2", 42, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, 
null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2619,8 +2620,8 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left semi join. JoinType join_type = JoinType::LEFT_SEMI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", 42, "payload"], - ["both2", 42, "payload"]])"); + ["both1", 42, "l_payload"], + ["both2", 42, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2632,16 +2633,16 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left anti join. JoinType join_type = JoinType::LEFT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["left_only", null, "payload"], - ["left_only", 0, "payload"], - ["left_only", 42, "payload"], - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"]])"); + [null, null, "l_payload"], + [null, 0, "l_payload"], + [null, 42, "l_payload"], + ["left_only", null, "l_payload"], + ["left_only", 0, "l_payload"], + ["left_only", 42, "l_payload"], + ["both1", null, "l_payload"], + ["both1", 0, "l_payload"], + ["both2", null, "l_payload"], + ["both2", 0, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2653,12 +2654,12 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right semi join. 
JoinType join_type = JoinType::RIGHT_SEMI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); + ["both1", null, "r_payload"], + ["both1", 0, "r_payload"], + ["both1", 42, "r_payload"], + ["both2", null, "r_payload"], + ["both2", 0, "r_payload"], + ["both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2670,12 +2671,12 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right anti join. JoinType join_type = JoinType::RIGHT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["right_only", null, "payload"], - ["right_only", 0, "payload"], - ["right_only", 42, "payload"]])"); + [null, null, "r_payload"], + [null, 0, "r_payload"], + [null, 42, "r_payload"], + ["right_only", null, "r_payload"], + ["right_only", 0, "r_payload"], + ["right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2695,12 +2696,12 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::INNER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", null, "payload", "both1", 42, "payload"], - ["both1", 0, "payload", "both1", 42, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", null, "payload", "both2", 42, "payload"], - ["both2", 0, "payload", "both2", 42, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); + ["both1", null, "l_payload", "both1", 42, "r_payload"], + ["both1", 0, "l_payload", "both1", 42, "r_payload"], + 
["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", null, "l_payload", "both2", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 42, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2713,18 +2714,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::LEFT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", "both1", 42, "payload"], - ["both1", 0, "payload", "both1", 42, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", null, "payload", "both2", 42, "payload"], - ["both2", 0, "payload", "both2", 42, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", "both1", 42, "r_payload"], + ["both1", 0, "l_payload", "both1", 42, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", null, "l_payload", "both2", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 42, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ 
-2737,22 +2738,22 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::RIGHT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", null, "payload", "both1", 42, "payload"], - ["both1", 0, "payload", "both1", 42, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", null, "payload", "both2", 42, "payload"], - ["both2", 0, "payload", "both2", 42, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both1", 0, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "both2", 0, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + ["both1", null, "l_payload", "both1", 42, "r_payload"], + ["both1", 0, "l_payload", "both1", 42, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", null, "l_payload", "both2", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 42, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"], + [null, null, null, "both1", null, "r_payload"], + [null, null, null, "both1", 0, "r_payload"], + [null, null, null, "both2", null, "r_payload"], + [null, null, null, "both2", 0, "r_payload"], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2765,28 
+2766,28 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::FULL_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", "both1", 42, "payload"], - ["both1", 0, "payload", "both1", 42, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", null, "payload", "both2", 42, "payload"], - ["both2", 0, "payload", "both2", 42, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both1", 0, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "both2", 0, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", "both1", 42, "r_payload"], + ["both1", 0, "l_payload", "both1", 42, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", null, "l_payload", "both2", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 42, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"], + [null, null, null, "both1", null, "r_payload"], + [null, null, null, "both1", 0, "r_payload"], + [null, 
null, null, "both2", null, "r_payload"], + [null, null, null, "both2", 0, "r_payload"], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2798,12 +2799,12 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left semi join. JoinType join_type = JoinType::LEFT_SEMI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); + ["both1", null, "l_payload"], + ["both1", 0, "l_payload"], + ["both1", 42, "l_payload"], + ["both2", null, "l_payload"], + ["both2", 0, "l_payload"], + ["both2", 42, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2815,12 +2816,12 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left anti join. 
JoinType join_type = JoinType::LEFT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["left_only", null, "payload"], - ["left_only", 0, "payload"], - ["left_only", 42, "payload"]])"); + [null, null, "l_payload"], + [null, 0, "l_payload"], + [null, 42, "l_payload"], + ["left_only", null, "l_payload"], + ["left_only", 0, "l_payload"], + ["left_only", 42, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2832,8 +2833,8 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right semi join. JoinType join_type = JoinType::RIGHT_SEMI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", 42, "payload"], - ["both2", 42, "payload"]])"); + ["both1", 42, "r_payload"], + ["both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2845,16 +2846,16 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right anti join. 
JoinType join_type = JoinType::RIGHT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["both1", null, "payload"], - ["both1", 0, "payload"], - ["both2", null, "payload"], - ["both2", 0, "payload"], - ["right_only", null, "payload"], - ["right_only", 0, "payload"], - ["right_only", 42, "payload"]])"); + [null, null, "r_payload"], + [null, 0, "r_payload"], + [null, 42, "r_payload"], + ["both1", null, "r_payload"], + ["both1", 0, "r_payload"], + ["both2", null, "r_payload"], + ["both2", 0, "r_payload"], + ["right_only", null, "r_payload"], + ["right_only", 0, "r_payload"], + ["right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2876,10 +2877,10 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::INNER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); + ["both1", 0, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2892,18 +2893,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::LEFT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, 
null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"]])"); + [null, null, "l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", null, null, null], + ["both2", null, "l_payload", null, null, null], + ["both1", 0, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2916,18 +2917,18 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::RIGHT_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, 
"right_only", 42, "payload"]])"); + ["both1", 0, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "both1", null, "r_payload"], + [null, null, null, "both2", null, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2940,26 +2941,26 @@ TEST(HashJoin, FineGrainedResidualFilter) { JoinType join_type = JoinType::FULL_OUTER; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8(), utf8(), int32(), utf8()}, R"([ - [null, null, "payload", null, null, null], - [null, 0, "payload", null, null, null], - [null, 42, "payload", null, null, null], - ["left_only", null, "payload", null, null, null], - ["left_only", 0, "payload", null, null, null], - ["left_only", 42, "payload", null, null, null], - ["both1", null, "payload", null, null, null], - ["both2", null, "payload", null, null, null], - ["both1", 0, "payload", "both1", 0, "payload"], - ["both1", 42, "payload", "both1", 42, "payload"], - ["both2", 0, "payload", "both2", 0, "payload"], - ["both2", 42, "payload", "both2", 42, "payload"], - [null, null, null, null, null, "payload"], - [null, null, null, null, 0, "payload"], - [null, null, null, null, 42, "payload"], - [null, null, null, "both1", null, "payload"], - [null, null, null, "both2", null, "payload"], - [null, null, null, "right_only", null, "payload"], - [null, null, null, "right_only", 0, "payload"], - [null, null, null, "right_only", 42, "payload"]])"); + [null, null, 
"l_payload", null, null, null], + [null, 0, "l_payload", null, null, null], + [null, 42, "l_payload", null, null, null], + ["left_only", null, "l_payload", null, null, null], + ["left_only", 0, "l_payload", null, null, null], + ["left_only", 42, "l_payload", null, null, null], + ["both1", null, "l_payload", null, null, null], + ["both2", null, "l_payload", null, null, null], + ["both1", 0, "l_payload", "both1", 0, "r_payload"], + ["both1", 42, "l_payload", "both1", 42, "r_payload"], + ["both2", 0, "l_payload", "both2", 0, "r_payload"], + ["both2", 42, "l_payload", "both2", 42, "r_payload"], + [null, null, null, null, null, "r_payload"], + [null, null, null, null, 0, "r_payload"], + [null, null, null, null, 42, "r_payload"], + [null, null, null, "both1", null, "r_payload"], + [null, null, null, "both2", null, "r_payload"], + [null, null, null, "right_only", null, "r_payload"], + [null, null, null, "right_only", 0, "r_payload"], + [null, null, null, "right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2971,10 +2972,10 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left semi join. JoinType join_type = JoinType::LEFT_SEMI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); + ["both1", 0, "l_payload"], + ["both1", 42, "l_payload"], + ["both2", 0, "l_payload"], + ["both2", 42, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -2986,14 +2987,14 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Left anti join. 
JoinType join_type = JoinType::LEFT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["left_only", null, "payload"], - ["left_only", 0, "payload"], - ["left_only", 42, "payload"], - ["both1", null, "payload"], - ["both2", null, "payload"]])"); + [null, null, "l_payload"], + [null, 0, "l_payload"], + [null, 42, "l_payload"], + ["left_only", null, "l_payload"], + ["left_only", 0, "l_payload"], + ["left_only", 42, "l_payload"], + ["both1", null, "l_payload"], + ["both2", null, "l_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -3005,10 +3006,10 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right semi join. JoinType join_type = JoinType::RIGHT_SEMI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - ["both1", 0, "payload"], - ["both1", 42, "payload"], - ["both2", 0, "payload"], - ["both2", 42, "payload"]])"); + ["both1", 0, "r_payload"], + ["both1", 42, "r_payload"], + ["both2", 0, "r_payload"], + ["both2", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter, @@ -3020,14 +3021,14 @@ TEST(HashJoin, FineGrainedResidualFilter) { // Right anti join. 
JoinType join_type = JoinType::RIGHT_ANTI; auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ - [null, null, "payload"], - [null, 0, "payload"], - [null, 42, "payload"], - ["both1", null, "payload"], - ["both2", null, "payload"], - ["right_only", null, "payload"], - ["right_only", 0, "payload"], - ["right_only", 42, "payload"]])"); + [null, null, "r_payload"], + [null, 0, "r_payload"], + [null, 42, "r_payload"], + ["both1", null, "r_payload"], + ["both2", null, "r_payload"], + ["right_only", null, "r_payload"], + ["right_only", 0, "r_payload"], + ["right_only", 42, "r_payload"]])"); for (const auto& projector : projectors) { runner.Run(join_type, left_keys, right_keys, projector.LeftOutput(join_type), projector.RightOutput(join_type), filter,