diff --git a/c_glib/test/test-is-in.rb b/c_glib/test/test-is-in.rb index 5b1b36041ed..ba44075d6b3 100644 --- a/c_glib/test/test-is-in.rb +++ b/c_glib/test/test-is-in.rb @@ -29,20 +29,20 @@ def test_no_null def test_null_in_left left = build_int16_array([1, 0, nil, 2]) right = build_int16_array([2, 0, 3]) - assert_equal(build_boolean_array([false, true, nil, true]), + assert_equal(build_boolean_array([false, true, false, true]), left.is_in(right)) end def test_null_in_right left = build_int16_array([1, 0, 1, 2]) - right = build_int16_array([2, 0, nil, 2, 0]) + right = build_int16_array([2, 0, nil]) assert_equal(build_boolean_array([false, true, false, true]), left.is_in(right)) end def test_null_in_both left = build_int16_array([1, 0, nil, 2]) - right = build_int16_array([2, 0, nil, 2, 0, nil]) + right = build_int16_array([2, 0, nil]) assert_equal(build_boolean_array([false, true, true, true]), left.is_in(right)) end @@ -52,8 +52,8 @@ def test_null_in_both def test_no_null left = build_int16_array([1, 0, 1, 2]) chunks = [ - build_int16_array([1, 0]), - build_int16_array([1, 0, 3]) + build_int16_array([1, 4]), + build_int16_array([3, 0]) ] right = Arrow::ChunkedArray.new(chunks) assert_equal(build_boolean_array([true, true, true, false]), @@ -63,19 +63,19 @@ def test_no_null def test_null_in_left left = build_int16_array([1, 0, nil, 2]) chunks = [ - build_int16_array([2, 0, 3]), - build_int16_array([3, 0, 2, 2]) + build_int16_array([2, 0]), + build_int16_array([3, 4]) ] right = Arrow::ChunkedArray.new(chunks) - assert_equal(build_boolean_array([false, true, nil, true]), + assert_equal(build_boolean_array([false, true, false, true]), left.is_in_chunked_array(right)) end def test_null_in_right left = build_int16_array([1, 0, 1, 2]) chunks = [ - build_int16_array([2, 0, nil, 2, 0]), - build_int16_array([2, 3, nil]) + build_int16_array([2, 0]), + build_int16_array([3, nil]) ] right = Arrow::ChunkedArray.new(chunks) assert_equal(build_boolean_array([false, true, false, true]), @@ -85,8 +85,8 @@ def test_null_in_right def test_null_in_both left = build_int16_array([1, 0, nil, 2]) chunks = [ - build_int16_array([2, 0, nil, 2, 0, nil]), - build_int16_array([2, 3, nil]) + build_int16_array([2, 0]), + build_int16_array([3, nil]) ] right = Arrow::ChunkedArray.new(chunks) assert_equal(build_boolean_array([false, true, true, true]), diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 84918f0e44b..671c8246378 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -57,30 +57,36 @@ SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked") // Set-related operations static Result ExecSetLookup(const std::string& func_name, const Datum& data, - const Datum& value_set, bool add_nulls_to_hash_table, - ExecContext* ctx) { - if (!value_set.is_arraylike()) { + const SetLookupOptions& options, ExecContext* ctx) { + if (!options.value_set.is_arraylike()) { return Status::Invalid("Set lookup value set must be Array or ChunkedArray"); } - if (value_set.length() > 0 && !data.type()->Equals(value_set.type())) { + if (options.value_set.length() > 0 && !data.type()->Equals(options.value_set.type())) { std::stringstream ss; ss << "Array type didn't match type of values set: " << data.type()->ToString() - << " vs " << value_set.type()->ToString(); + << " vs " << options.value_set.type()->ToString(); return Status::Invalid(ss.str()); } - SetLookupOptions options(value_set, !add_nulls_to_hash_table); return CallFunction(func_name, {data}, &options, ctx); } +Result IsIn(const Datum& values, const SetLookupOptions& options, + ExecContext* ctx) { + return ExecSetLookup("is_in", values, options, ctx); +} + Result IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx) { - return ExecSetLookup("is_in", values, value_set, - /*add_nulls_to_hash_table=*/false, ctx); + return ExecSetLookup("is_in", values, SetLookupOptions{value_set}, ctx); +} + +Result IndexIn(const Datum& values, const SetLookupOptions& options, + ExecContext* ctx) { + return ExecSetLookup("index_in", values, options, ctx); } Result IndexIn(const Datum& values, const Datum& value_set, ExecContext* ctx) { - return ExecSetLookup("index_in", values, value_set, - /*add_nulls_to_hash_table=*/true, ctx); + return ExecSetLookup("index_in", values, SetLookupOptions{value_set}, ctx); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 9d3d0cb745d..da219b88793 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -290,17 +290,19 @@ Result KleeneAndNot(const Datum& left, const Datum& right, /// \brief IsIn returns true for each element of `values` that is contained in /// `value_set` /// -/// If null occurs in left, if null count in right is not 0, -/// it returns true, else returns null. +/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls. /// /// \param[in] values array-like input to look up in value_set -/// \param[in] value_set either Array or ChunkedArray +/// \param[in] options SetLookupOptions /// \param[in] ctx the function execution context, optional /// \return the resulting datum /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT +Result IsIn(const Datum& values, const SetLookupOptions& options, + ExecContext* ctx = NULLPTR); +ARROW_EXPORT Result IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx = NULLPTR); @@ -312,19 +314,19 @@ Result IsIn(const Datum& values, const Datum& value_set, /// For example given values = [99, 42, 3, null] and /// value_set = [3, 3, 99], the output will be = [1, null, 0, null] /// -/// Note: Null in the values is considered to match -/// a null in the value_set array. For example given -/// values = [99, 42, 3, null] and value_set = [3, 99, null], -/// the output will be = [1, null, 0, 2] +/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls. /// /// \param[in] values array-like input -/// \param[in] value_set either Array or ChunkedArray +/// \param[in] options SetLookupOptions /// \param[in] ctx the function execution context, optional /// \return the resulting datum /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT +Result IndexIn(const Datum& values, const SetLookupOptions& options, + ExecContext* ctx = NULLPTR); +ARROW_EXPORT Result IndexIn(const Datum& values, const Datum& value_set, ExecContext* ctx = NULLPTR); diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index 722f3173a34..93fa34c1694 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -36,53 +36,59 @@ namespace { template struct SetLookupState : public KernelState { - explicit SetLookupState(MemoryPool* pool) - : lookup_table(pool, 0), lookup_null_count(0) {} + explicit SetLookupState(const SetLookupOptions& options, MemoryPool* pool) + : options(options), lookup_table(pool, 0) {} - Status Init(const SetLookupOptions& options) { + Status Init() { + if (options.value_set.kind() == Datum::ARRAY) { + RETURN_NOT_OK(AddArrayValueSet(*options.value_set.array())); + } else if (options.value_set.kind() == Datum::CHUNKED_ARRAY) { + const ChunkedArray& value_set = *options.value_set.chunked_array(); + for (const std::shared_ptr& chunk : value_set.chunks()) { + RETURN_NOT_OK(AddArrayValueSet(*chunk->data())); + } + } else { + return Status::Invalid("value_set should be an array or chunked array"); + } + if (lookup_table.size() != options.value_set.length()) { + return Status::NotImplemented("duplicate values in value_set"); + } + value_set_has_null = (lookup_table.GetNull() >= 0); + return Status::OK(); + } + + Status AddArrayValueSet(const ArrayData& data) { using T = typename GetViewType::T; auto visit_valid = [&](T v) { int32_t unused_memo_index; return lookup_table.GetOrInsert(v, &unused_memo_index); }; auto visit_null = [&]() { - if (!options.skip_nulls) { - lookup_table.GetOrInsertNull(); - } + lookup_table.GetOrInsertNull(); return Status::OK(); }; - if (options.value_set.kind() == Datum::ARRAY) { - const std::shared_ptr& value_set = options.value_set.array(); - this->lookup_null_count += value_set->GetNullCount(); - return VisitArrayDataInline(*value_set, std::move(visit_valid), - std::move(visit_null)); - } else { - const ChunkedArray& value_set = *options.value_set.chunked_array(); - for (const std::shared_ptr& chunk : value_set.chunks()) { - this->lookup_null_count += chunk->null_count(); - RETURN_NOT_OK(VisitArrayDataInline(*chunk->data(), std::move(visit_valid), - std::move(visit_null))); - } - return Status::OK(); - } + + return VisitArrayDataInline(data, visit_valid, visit_null); } using MemoTable = typename HashTraits::MemoTableType; + const SetLookupOptions& options; MemoTable lookup_table; - int64_t lookup_null_count; - int64_t null_index = -1; + bool value_set_has_null; }; template <> struct SetLookupState : public KernelState { - explicit SetLookupState(MemoryPool*) {} + explicit SetLookupState(const SetLookupOptions& options, MemoryPool*) + : options(options) {} - Status Init(const SetLookupOptions& options) { - this->lookup_null_count = options.value_set.null_count(); + Status Init() { + this->value_set_has_null = (options.value_set.length() > 0); return Status::OK(); } - int64_t lookup_null_count; + const SetLookupOptions& options; + bool value_set_has_null; }; // TODO: Put this concept somewhere reusable @@ -125,8 +131,8 @@ struct InitStateVisitor { "Attempted to call a set lookup function without SetLookupOptions"); } using StateType = SetLookupState; - result.reset(new StateType(ctx->exec_context()->memory_pool())); - return static_cast(result.get())->Init(*options); + result.reset(new StateType(*options, ctx->exec_context()->memory_pool())); + return static_cast(result.get())->Init(); } Status Visit(const DataType&) { return Init(); } @@ -174,16 +180,18 @@ struct IndexInVisitor { IndexInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out) : ctx(ctx), data(data), out(out), builder(ctx->exec_context()->memory_pool()) {} - Status Visit(const DataType&) { + Status Visit(const DataType& type) { + DCHECK_EQ(type.id(), Type::NA); const auto& state = checked_cast&>(*ctx->state()); if (data.length != 0) { - if (state.lookup_null_count == 0) { - RETURN_NOT_OK(this->builder.AppendNulls(data.length)); - } else { + // skip_nulls is honored for consistency with other types + if (state.value_set_has_null && !state.options.skip_nulls) { RETURN_NOT_OK(this->builder.Reserve(data.length)); for (int64_t i = 0; i < data.length; ++i) { this->builder.UnsafeAppend(0); } + } else { + RETURN_NOT_OK(this->builder.AppendNulls(data.length)); } } return Status::OK(); @@ -195,7 +203,7 @@ struct IndexInVisitor { const auto& state = checked_cast&>(*ctx->state()); - int32_t null_index = state.lookup_table.GetNull(); + int32_t null_index = state.options.skip_nulls ? -1 : state.lookup_table.GetNull(); RETURN_NOT_OK(this->builder.Reserve(data.length)); VisitArrayDataInline( data, @@ -261,7 +269,7 @@ void ExecIndexIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // ---------------------------------------------------------------------- -// IsIn writes the results into a preallocated binary data bitmap +// IsIn writes the results into a preallocated boolean data bitmap struct IsInVisitor { KernelContext* ctx; const ArrayData& data; @@ -270,12 +278,12 @@ struct IsInVisitor { IsInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out) : ctx(ctx), data(data), out(out) {} - Status Visit(const DataType&) { + Status Visit(const DataType& type) { + DCHECK_EQ(type.id(), Type::NA); const auto& state = checked_cast&>(*ctx->state()); ArrayData* output = out->mutable_array(); - if (state.lookup_null_count > 0) { - BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), output->offset, - output->length, true); + // skip_nulls is honored for consistency with other types + if (state.value_set_has_null && !state.options.skip_nulls) { BitUtil::SetBitsTo(output->buffers[1]->mutable_data(), output->offset, output->length, true); } else { @@ -291,13 +299,6 @@ struct IsInVisitor { const auto& state = checked_cast&>(*ctx->state()); ArrayData* output = out->mutable_array(); - if (this->data.GetNullCount() > 0 && state.lookup_null_count > 0) { - // If there were nulls in the value set, set the whole validity bitmap to - // true - output->null_count = 0; - BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), output->offset, - output->length, true); - } FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(), output->offset, output->length); VisitArrayDataInline( @@ -311,7 +312,11 @@ struct IsInVisitor { writer.Next(); }, [&]() { - writer.Set(); + if (!state.options.skip_nulls && state.lookup_table.GetNull() != -1) { + writer.Set(); + } else { + writer.Clear(); + } writer.Next(); }); writer.Finish(); @@ -412,8 +417,10 @@ class IndexInMetaBinary : public MetaFunction { const FunctionDoc is_in_doc{ "Find each element in a set of values", ("For each element in `values`, return true if it is found in a given\n" - "set of values. The set of values to look for must be given in\n" - "SetLookupOptions."), + "set of values, false otherwise.\n" + "The set of values to look for must be given in SetLookupOptions.\n" + "By default, nulls are matched against the value set, this can be\n" + "changed in SetLookupOptions."), {"values"}, "SetLookupOptions"}; @@ -421,24 +428,26 @@ const FunctionDoc index_in_doc{ "Return index of each element in a set of values", ("For each element in `values`, return its index in a given set of\n" "values, or null if it is not found there.\n" - "The set of values to look for must be given in SetLookupOptions."), + "The set of values to look for must be given in SetLookupOptions.\n" + "By default, nulls are matched against the value set, this can be\n" + "changed in SetLookupOptions."), {"values"}, "SetLookupOptions"}; } // namespace void RegisterScalarSetLookup(FunctionRegistry* registry) { - // IsIn always writes into preallocated memory + // IsIn writes its boolean output into preallocated memory { ScalarKernel isin_base; isin_base.init = InitSetLookup; isin_base.exec = TrivialScalarUnaryAsArraysExec(ExecIsIn); + isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL; auto is_in = std::make_shared("is_in", Arity::Unary(), &is_in_doc); AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get()); isin_base.signature = KernelSignature::Make({null()}, boolean()); - isin_base.null_handling = NullHandling::COMPUTED_PREALLOCATE; DCHECK_OK(is_in->AddKernel(isin_base)); DCHECK_OK(registry->AddFunction(is_in)); diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 9fae29b8fa6..40907da5a62 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -30,19 +30,16 @@ #include "arrow/array/array_base.h" #include "arrow/array/builder_binary.h" -#include "arrow/array/builder_decimal.h" #include "arrow/array/builder_primitive.h" #include "arrow/chunked_array.h" #include "arrow/compute/api.h" #include "arrow/compute/kernels/test_util.h" -#include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/testing/gtest_compat.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/type_traits.h" -#include "arrow/util/decimal.h" namespace arrow { namespace compute { @@ -50,33 +47,39 @@ namespace compute { // ---------------------------------------------------------------------- // IsIn tests -template ::c_type> -void CheckIsIn(const std::shared_ptr& type, const std::vector& in_values, - const std::vector& in_is_valid, - const std::vector& member_set_values, - const std::vector& member_set_is_valid, - const std::vector& out_values, - const std::vector& out_is_valid) { - std::shared_ptr input = _MakeArray(type, in_values, in_is_valid); - std::shared_ptr member_set = - _MakeArray(type, member_set_values, member_set_is_valid); - std::shared_ptr expected = - _MakeArray(boolean(), out_values, out_is_valid); - - ASSERT_OK_AND_ASSIGN(Datum datum_out, IsIn(input, member_set)); - std::shared_ptr result = datum_out.make_array(); - ASSERT_OK(result->ValidateFull()); - AssertArraysEqual(*expected, *result, /*verbose=*/true); +void CheckIsIn(const std::shared_ptr& type, const std::string& input_json, + const std::string& value_set_json, const std::string& expected_json, + bool skip_nulls = false) { + auto input = ArrayFromJSON(type, input_json); + auto value_set = ArrayFromJSON(type, value_set_json); + auto expected = ArrayFromJSON(boolean(), expected_json); + + ASSERT_OK_AND_ASSIGN(Datum actual_datum, + IsIn(input, SetLookupOptions(value_set, skip_nulls))); + std::shared_ptr actual = actual_datum.make_array(); + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); +} + +void CheckIsInChunked(const std::shared_ptr& input, + const std::shared_ptr& value_set, + const std::shared_ptr& expected, + bool skip_nulls = false) { + ASSERT_OK_AND_ASSIGN(Datum actual_datum, + IsIn(input, SetLookupOptions(value_set, skip_nulls))); + auto actual = actual_datum.chunked_array(); + ASSERT_OK(actual->ValidateFull()); + AssertChunkedEqual(*expected, *actual); } class TestIsInKernel : public ::testing::Test {}; TEST_F(TestIsInKernel, CallBinary) { - auto haystack = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8]"); - auto needles = ArrayFromJSON(int8(), "[2, 3, 5, 7]"); - ASSERT_RAISES(Invalid, CallFunction("is_in", {haystack, needles})); + auto input = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8]"); + auto value_set = ArrayFromJSON(int8(), "[2, 3, 5, 7]"); + ASSERT_RAISES(Invalid, CallFunction("is_in", {input, value_set})); - ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("is_in_meta_binary", {haystack, needles})); + ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("is_in_meta_binary", {input, value_set})); auto expected = ArrayFromJSON(boolean(), ("[false, false, true, true, false," "true, false, true, false]")); AssertArraysEqual(*expected, *out.make_array()); @@ -88,555 +91,388 @@ class TestIsInKernelPrimitive : public ::testing::Test {}; template class TestIsInKernelBinary : public ::testing::Test {}; -typedef ::testing::Types - PrimitiveDictionaries; +using PrimitiveTypes = ::testing::Types; -TYPED_TEST_SUITE(TestIsInKernelPrimitive, PrimitiveDictionaries); +TYPED_TEST_SUITE(TestIsInKernelPrimitive, PrimitiveTypes); TYPED_TEST(TestIsInKernelPrimitive, IsIn) { - using T = typename TypeParam::c_type; auto type = TypeTraits::type_singleton(); // No Nulls - CheckIsIn(type, {2, 1, 2, 1, 2, 3}, {true, true, true, true, true, true}, - {2, 1, 2, 3}, {true, true, true, true, true}, - {true, true, true, true, true, true}, {}); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]"); // Nulls in left array - CheckIsIn(type, {2, 1, 2, 1, 2, 3}, - {false, false, false, false, false, false}, {2, 1, 2, 1, 3}, {}, - {false, false, false, false, false, false}, - {false, false, false, false, false, false}); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", + /*skip_nulls=*/false); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", + /*skip_nulls=*/true); // Nulls in right array - CheckIsIn(type, {2, 1, 2, 1, 2, 3}, {}, {2, 1, 2, 3}, - {false, false, false, false}, - {false, false, false, false, false, false}, {}); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", + /*skip_nulls=*/false); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", + /*skip_nulls=*/true); // Nulls in both the arrays - CheckIsIn( - type, {2, 1, 2, 3}, {false, false, false, false}, {2, 1, 2, 1, 2, 3, 3}, - {false, false, false, false, false, false, false}, {true, true, true, true}, {}); - - // No Match - CheckIsIn( - type, {2, 1, 7, 3, 8}, {true, false, true, true, true}, {2, 1, 2, 1, 6, 3, 3}, - {true, false, true, false, true, true, true}, {true, true, false, true, false}, {}); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[true, true, true, false, true]", + /*skip_nulls=*/false); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", + "[false, true, true, false, true]", /*skip_nulls=*/true); // Empty Arrays - CheckIsIn(type, {}, {}, {}, {}, {}, {}); + CheckIsIn(type, "[]", "[]", "[]"); } -TYPED_TEST(TestIsInKernelPrimitive, PrimitiveResizeTable) { - using T = typename TypeParam::c_type; - - const int64_t kTotalValues = std::min(INT16_MAX, 1UL << sizeof(T) / 2); - const int64_t kRepeats = 5; +TEST_F(TestIsInKernel, NullType) { + auto type = null(); - std::vector values; - std::vector member_set; - std::vector expected; - for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { - const auto val = static_cast(i % kTotalValues); - values.push_back(val); - member_set.push_back(val); - expected.push_back(static_cast(true)); - } + CheckIsIn(type, "[null, null, null]", "[null]", "[true, true, true]"); + CheckIsIn(type, "[null, null, null]", "[]", "[false, false, false]"); + CheckIsIn(type, "[]", "[]", "[]"); - auto type = TypeTraits::type_singleton(); - CheckIsIn(type, values, {}, member_set, {}, expected, {}); + CheckIsIn(type, "[null, null]", "[null]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[]", "[false, false]", /*skip_nulls=*/true); } -TEST_F(TestIsInKernel, IsInNull) { - CheckIsIn(null(), {0, 0, 0}, {false, false, false}, {0, 0, 0}, - {false, false, false}, {true, true, true}, {}); - - CheckIsIn(null(), {NULL, NULL, NULL}, {}, - {NULL, NULL, NULL, NULL}, {}, {true, true, true}, - {}); - - CheckIsIn(null(), {nullptr, nullptr, nullptr}, {}, {nullptr}, - {}, {true, true, true}, {}); - - // Empty left array - CheckIsIn(null(), {}, {}, {nullptr, nullptr, nullptr}, {}, {}, - {}); - - // Empty right array - CheckIsIn(null(), {nullptr, nullptr, nullptr}, {}, {}, {}, - {false, false, false}, {false, false, false}); - - // Empty arrays - CheckIsIn(null(), {}, {}, {}, {}, {}, {}); -} - -TEST_F(TestIsInKernel, IsInTimeTimestamp) { - CheckIsIn( - time32(TimeUnit::SECOND), {2, 1, 5, 1}, {true, false, true, true}, {2, 1, 2, 1}, - {true, false, true, true}, {true, true, false, true}, {}); - - // Right array has no Nulls - CheckIsIn(time32(TimeUnit::SECOND), {2, 1, 5, 1}, - {true, false, true, true}, {2, 1, 1}, {true, true, true}, - {true, false, false, true}, {true, false, true, true}); - - // No match - CheckIsIn(time32(TimeUnit::SECOND), {3, 5, 5, 3}, - {true, false, true, true}, {2, 1, 2, 1, 2}, - {true, true, true, true, true}, - {false, false, false, false}, {true, false, true, true}); - - // Empty arrays - CheckIsIn(time32(TimeUnit::SECOND), {}, {}, {}, {}, {}, {}); - - CheckIsIn(time64(TimeUnit::NANO), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1, 1}, - {true, false, true}, {true, true, true, true}, {}); - - CheckIsIn( - timestamp(TimeUnit::NANO), {2, 1, 2, 1}, {true, false, true, true}, {2, 1, 2, 1}, - {true, false, true, true}, {true, true, true, true}, {}); - - // Empty left array - CheckIsIn(timestamp(TimeUnit::NANO), {}, {}, {2, 1, 2, 1}, - {true, false, true, true}, {}, {}); - - // Empty right array - CheckIsIn( - timestamp(TimeUnit::NANO), {2, 1, 2, 1}, {true, false, true, true}, {}, {}, - {false, false, false, false}, {true, false, true, true}); - - // Both array have Nulls - CheckIsIn(time32(TimeUnit::SECOND), {2, 1, 2, 1}, - {false, false, false, false}, {2, 1}, {false, false}, - {true, true, true, true}, {}); +TEST_F(TestIsInKernel, TimeTimestamp) { + for (const auto& type : + {time32(TimeUnit::SECOND), time64(TimeUnit::NANO), timestamp(TimeUnit::MICRO)}) { + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, true, false, true, true]", /*skip_nulls=*/false); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, false, false, true, true]", /*skip_nulls=*/true); + } } -TEST_F(TestIsInKernel, IsInBoolean) { - CheckIsIn(boolean(), {false, true, false, true}, - {true, false, true, true}, {true, false, true}, - {false, true, true}, {true, true, true, true}, {}); +TEST_F(TestIsInKernel, Boolean) { + auto type = boolean(); - CheckIsIn( - boolean(), {false, true, false, true}, {true, false, true, true}, - {false, true, false, true, false}, {true, true, false, true, false}, - {true, true, true, true}, {}); - - // No Nulls - CheckIsIn(boolean(), {true, true, false, true}, {}, {false, true}, - {}, {true, true, true, true}, {}); - - CheckIsIn(boolean(), {false, true, false, true}, {}, - {true, true, true, true}, {}, {false, true, false, true}, - {}); - - // No match - CheckIsIn(boolean(), {true, true, true, true}, {}, - {false, false, false, false, false}, {}, - {false, false, false, false}, {}); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, false, false, true]", /*skip_nulls=*/false); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, false, false, true]", /*skip_nulls=*/true); - // Nulls in left array - CheckIsIn( - boolean(), {false, true, false, true}, {false, false, false, false}, {true, true}, - {}, {false, false, false, false}, {false, false, false, false}); - - // Nulls in right array - CheckIsIn( - boolean(), {true, true, false, true}, {}, {true, true, false, true, true}, - {false, false, false, false, false}, {false, false, false, false}, {}); - - // Both array have Nulls - CheckIsIn(boolean(), {false, true, false, true}, - {false, false, false, false}, {true, true, true, true}, - {false, false, false, false}, {true, true, true, true}, - {}); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[false, true, true, false, true]", /*skip_nulls=*/false); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[false, true, false, false, true]", /*skip_nulls=*/true); } TYPED_TEST_SUITE(TestIsInKernelBinary, BinaryTypes); -TYPED_TEST(TestIsInKernelBinary, IsInBinary) { +TYPED_TEST(TestIsInKernelBinary, Binary) { auto type = TypeTraits::type_singleton(); - CheckIsIn(type, {"test", "", "test2", "test"}, - {true, false, true, true}, {"test", "", "test2"}, - {true, false, true}, {true, true, true, true}, {}); - - // No match - CheckIsIn( - type, {"test", "", "test2", "test"}, {true, false, true, true}, - {"test3", "test4", "test3", "test4"}, {true, true, true, true}, - {false, false, false, false}, {true, false, true, true}); - - // Nulls in left array - CheckIsIn( - type, {"test", "", "test2", "test"}, {false, false, false, false}, - {"test", "test2", "test"}, {true, true, true}, {false, false, false, false}, - {false, false, false, false}); - - // Nulls in right array - CheckIsIn( - type, {"test", "test2", "test"}, {true, true, true}, {"test", "", "test2", "test"}, - {false, false, false, false}, {false, false, false}, {}); - - // Both array have Nulls - CheckIsIn( - type, {"test", "", "test2", "test"}, {false, false, false, false}, - {"test", "", "test2", "test"}, {false, false, false, false}, - {true, true, true, true}, {}); - // Empty arrays - CheckIsIn(type, {}, {}, {}, {}, {}, {}); - - // Empty left array - CheckIsIn(type, {}, {}, {"test", "", "test2", "test"}, - {true, false, true, false}, {}, {}); - - // Empty right array - CheckIsIn( - type, {"test", "", "test2", "test"}, {true, false, true, true}, {}, {}, - {false, false, false, false}, {true, false, true, true}); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, false, true]", + /*skip_nulls=*/false); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, false, true]", + /*skip_nulls=*/true); + + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, false, true, true]", + /*skip_nulls=*/false); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, false, false, true]", + /*skip_nulls=*/true); } -TEST_F(TestIsInKernel, BinaryResizeTable) { - const int32_t kTotalValues = 10000; -#if !defined(ARROW_VALGRIND) - const int32_t kRepeats = 10; -#else - // Mitigate Valgrind's slowness - const int32_t kRepeats = 3; -#endif - - std::vector values; - std::vector member_set; - std::vector expected; - char buf[20] = "test"; - - for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { - int32_t index = i % kTotalValues; - - ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); - values.emplace_back(buf); - member_set.emplace_back(buf); - expected.push_back(true); - } - - CheckIsIn(binary(), values, {}, member_set, {}, expected, {}); - - CheckIsIn(utf8(), values, {}, member_set, {}, expected, {}); +TEST_F(TestIsInKernel, FixedSizeBinary) { + auto type = fixed_size_binary(3); + + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, false, true]", + /*skip_nulls=*/false); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, false, true]", + /*skip_nulls=*/true); + + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, false, true, true]", + /*skip_nulls=*/false); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, false, false, true]", + /*skip_nulls=*/true); } -TEST_F(TestIsInKernel, IsInFixedSizeBinary) { - CheckIsIn( - fixed_size_binary(5), {"bbbbb", "", "aaaaa", "ccccc"}, {true, false, true, true}, - {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, {true, false, true, true, true}, - {true, true, true, true}, {}); - - // Nulls in left - CheckIsIn( - fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, - {false, false, false, false, false}, {"bbbbb", "aabbb", "bbbbb", "aaaaa", "ccccc"}, - {true, true, true, true, true}, {false, false, false, false, false}, - {false, false, false, false, false}); - - // Nulls in right - CheckIsIn( - fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, - {true, false, true, true, true}, {"bbbbb", "", "bbbbb"}, {false, false, false}, - {false, true, false, false, false}, {}); - - // Both array have Nulls - CheckIsIn( - fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, - {false, false, false, false, false}, {"", "", "bbbbb", "aaaaa"}, - {false, false, false, false}, {true, true, true, true, true}, {}); - - // No match - CheckIsIn( - fixed_size_binary(5), {"bbbbc", "bbbbc", "aaaad", "cccca"}, - {true, true, true, true}, {"bbbbb", "", "bbbbb", "aaaaa", "ddddd"}, - {true, false, true, true, true}, {false, false, false, false}, {}); - - // Empty left array - CheckIsIn(fixed_size_binary(5), {}, {}, - {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, - {true, false, true, true, true}, {}, {}); - - // Empty right array - CheckIsIn( - fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, - {true, false, true, true, true}, {}, {}, {false, false, false, false, false}, - {true, false, true, true, true}); - - // Empty arrays - CheckIsIn(fixed_size_binary(0), {}, {}, {}, {}, {}, - {}); +TEST_F(TestIsInKernel, Decimal) { + auto type = decimal(3, 1); + + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, false, true]", + /*skip_nulls=*/false); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, false, true]", + /*skip_nulls=*/true); + + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, false, true, true, true]", + /*skip_nulls=*/false); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, false, true, false, true]", + /*skip_nulls=*/true); } -TEST_F(TestIsInKernel, IsInDecimal) { - std::vector input{12, 12, 11, 12}; - std::vector member_set{12, 12, 11, 12}; - std::vector expected{true, true, true, true}; - - CheckIsIn(decimal(2, 0), input, {true, false, true, true}, - member_set, {true, false, true, true}, expected, - {}); -} -TEST_F(TestIsInKernel, IsInChunkedArrayInvoke) { - std::vector values1 = {"foo", "bar", "foo"}; - std::vector values2 = {"bar", "baz", "quuux", "foo"}; - std::vector values3 = {"foo", "bar", "foo"}; - std::vector values4 = {"bar", "baz", "barr", "foo"}; - - auto type = utf8(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {true, true, true, false}); - auto a3 = _MakeArray(type, values3, {}); - auto a4 = _MakeArray(type, values4, {}); - - ArrayVector array1 = {a1, a2}; - auto carr = std::make_shared(array1); - ArrayVector array2 = {a3, a4}; - auto member_set = std::make_shared(array2); - - auto i1 = _MakeArray(boolean(), {true, true, true}, {}); - auto i2 = _MakeArray(boolean(), {true, true, false, false}, - {true, true, true, false}); - - ArrayVector expected = {i1, i2}; - auto expected_carr = std::make_shared(expected); - - ASSERT_OK_AND_ASSIGN(Datum encoded_out, IsIn(carr, member_set)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); - - AssertChunkedEquivalent(*expected_carr, *encoded_out.chunked_array()); +TEST_F(TestIsInKernel, ChunkedArrayInvoke) { + auto input = ChunkedArrayFromJSON( + utf8(), {R"(["abc", "def", "", "abc", "jkl"])", R"(["def", null, "abc", "zzz"])"}); + // No null in value_set + auto value_set = ChunkedArrayFromJSON(utf8(), {R"(["", "def"])", R"(["abc"])"}); + auto expected = ChunkedArrayFromJSON( + boolean(), {"[true, true, true, true, false]", "[true, false, true, false]"}); + + CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + + value_set = ChunkedArrayFromJSON(utf8(), {R"(["", "def"])", R"([null])"}); + expected = ChunkedArrayFromJSON( + boolean(), {"[false, true, true, false, false]", "[true, true, false, false]"}); + CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); + expected = ChunkedArrayFromJSON( + boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"}); + CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); } + // ---------------------------------------------------------------------- // IndexIn tests -class TestMatchKernel : public ::testing::Test { +class TestIndexInKernel : public ::testing::Test { public: - void CheckMatch(const std::shared_ptr& type, const std::string& haystack_json, - const std::string& needles_json, const std::string& expected_json) { - std::shared_ptr haystack = ArrayFromJSON(type, haystack_json); - std::shared_ptr needles = ArrayFromJSON(type, needles_json); + void CheckIndexIn(const std::shared_ptr& type, const std::string& input_json, + const std::string& value_set_json, const std::string& expected_json, + bool skip_nulls = false) { + std::shared_ptr input = ArrayFromJSON(type, input_json); + std::shared_ptr value_set = ArrayFromJSON(type, value_set_json); std::shared_ptr expected = ArrayFromJSON(int32(), expected_json); - ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(haystack, needles)); + SetLookupOptions options(value_set, skip_nulls); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, options)); std::shared_ptr actual = actual_datum.make_array(); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual, /*verbose=*/true); } + + void CheckIndexInChunked(const std::shared_ptr& input, + const std::shared_ptr& value_set, + const std::shared_ptr& expected, + bool skip_nulls) { + ASSERT_OK_AND_ASSIGN(Datum actual, + IndexIn(input, SetLookupOptions(value_set, skip_nulls))); + ASSERT_EQ(Datum::CHUNKED_ARRAY, actual.kind()); + ASSERT_OK(actual.chunked_array()->ValidateFull()); + AssertChunkedEqual(*expected, *actual.chunked_array()); + } }; -TEST_F(TestMatchKernel, CallBinary) { - auto haystack = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"); - auto needles = ArrayFromJSON(int8(), "[2, 3, 5, 7]"); - ASSERT_RAISES(Invalid, CallFunction("index_in", {haystack, needles})); +TEST_F(TestIndexInKernel, CallBinary) { + auto input = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"); + auto value_set = ArrayFromJSON(int8(), "[2, 3, 5, 7]"); + ASSERT_RAISES(Invalid, CallFunction("index_in", {input, value_set})); ASSERT_OK_AND_ASSIGN(Datum out, - CallFunction("index_in_meta_binary", {haystack, needles})); + CallFunction("index_in_meta_binary", {input, value_set})); auto expected = ArrayFromJSON(int32(), ("[null, null, 0, 1, null, 2, null, 3, null," " null, null]")); AssertArraysEqual(*expected, *out.make_array()); } template -class TestMatchKernelPrimitive : public TestMatchKernel {}; +class TestIndexInKernelPrimitive : public TestIndexInKernel {}; using PrimitiveDictionaries = ::testing::Types; -TYPED_TEST_SUITE(TestMatchKernelPrimitive, PrimitiveDictionaries); +TYPED_TEST_SUITE(TestIndexInKernelPrimitive, PrimitiveDictionaries); -TYPED_TEST(TestMatchKernelPrimitive, IndexIn) { +TYPED_TEST(TestIndexInKernelPrimitive, IndexIn) { auto type = TypeTraits::type_singleton(); // No Nulls - this->CheckMatch(type, - /* haystack= */ "[2, 1, 2, 1, 2, 3]", - /* needles= */ "[2, 1, 2, 3]", - /* expected= */ "[0, 1, 0, 1, 0, 2]"); + this->CheckIndexIn(type, + /* input= */ "[2, 1, 2, 1, 2, 3]", + /* value_set= */ "[2, 1, 3]", + /* expected= */ "[0, 1, 0, 1, 0, 2]"); // Haystack array all null - this->CheckMatch(type, - /* haystack= */ "[null, null, null, null, null, null]", - /* needles= */ "[2, 1, 3]", - /* expected= */ "[null, null, null, null, null, null]"); + this->CheckIndexIn(type, + /* input= */ "[null, null, null, null, null, null]", + /* value_set= */ "[2, 1, 3]", + /* expected= */ "[null, null, null, null, null, null]"); // Needles array all null - this->CheckMatch(type, - /* haystack= */ "[2, 1, 2, 1, 2, 3]", - /* needles= */ "[null, null, null, null]", - /* expected= */ "[null, null, null, null, null, null]"); + this->CheckIndexIn(type, + /* input= */ "[2, 1, 2, 1, 2, 3]", + /* value_set= */ "[null]", + /* expected= */ "[null, null, null, null, null, null]"); // Both arrays all null - this->CheckMatch(type, - /* haystack= */ "[null, null, null, null]", - /* needles= */ "[null, null]", - /* expected= */ "[0, 0, 0, 0]"); + this->CheckIndexIn(type, + /* input= */ "[null, null, null, null]", + /* value_set= */ "[null]", + /* expected= */ "[0, 0, 0, 0]"); // No Match - this->CheckMatch(type, - /* haystack= */ "[2, null, 7, 3, 8]", - /* needles= */ "[2, null, 2, null, 6, 3, 3]", - /* expected= */ "[0, 1, null, 3, null]"); + this->CheckIndexIn(type, + /* input= */ "[2, null, 7, 3, 8]", + /* value_set= */ "[2, null, 6, 3]", + /* expected= */ "[0, 1, null, 3, null]"); // Empty Arrays - this->CheckMatch(type, "[]", "[]", "[]"); + this->CheckIndexIn(type, "[]", "[]", "[]"); } -TYPED_TEST(TestMatchKernelPrimitive, PrimitiveResizeTable) { - using T = typename TypeParam::c_type; - - const int64_t kTotalValues = std::min(INT16_MAX, 1UL << sizeof(T) / 2); - const int64_t kRepeats = 5; - - Int32Builder expected_builder; - NumericBuilder haystack_builder; - ASSERT_OK(expected_builder.Resize(kTotalValues * kRepeats)); - ASSERT_OK(haystack_builder.Resize(kTotalValues * kRepeats)); - - for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { - const auto index = i % kTotalValues; - - haystack_builder.UnsafeAppend(static_cast(index)); - expected_builder.UnsafeAppend(static_cast(index)); - } - - std::shared_ptr haystack, needles, expected; - ASSERT_OK(haystack_builder.Finish(&haystack)); - needles = haystack; - ASSERT_OK(expected_builder.Finish(&expected)); +TYPED_TEST(TestIndexInKernelPrimitive, SkipNulls) { + auto type = TypeTraits::type_singleton(); - ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(haystack, needles)); - std::shared_ptr actual = actual_datum.make_array(); - ASSERT_ARRAYS_EQUAL(*expected, *actual); + // No nulls in value_set + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 3]", + /*expected=*/"[null, 0, null, 1, null]", + /*skip_nulls=*/false); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 3]", + /*expected=*/"[null, 0, null, 1, null]", + /*skip_nulls=*/true); + + // Nulls in value_set + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, null, 3]", + /*expected=*/"[null, 0, null, 2, 1]", + /*skip_nulls=*/false); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, null, 3]", + /*expected=*/"[null, 0, null, 2, null]", + /*skip_nulls=*/true); } -TEST_F(TestMatchKernel, MatchNull) { - CheckMatch(null(), "[null, null, null]", "[null, null]", "[0, 0, 0]"); - - CheckMatch(null(), "[null, null, null]", "[]", "[null, null, null]"); +TEST_F(TestIndexInKernel, NullType) { + CheckIndexIn(null(), "[null, null, null]", "[null]", "[0, 0, 0]"); + CheckIndexIn(null(), "[null, null, null]", "[]", "[null, null, null]"); + CheckIndexIn(null(), "[]", "[null, null]", "[]"); + CheckIndexIn(null(), "[]", "[]", "[]"); - CheckMatch(null(), "[]", "[null, null]", "[]"); - - CheckMatch(null(), "[]", "[]", "[]"); + CheckIndexIn(null(), "[null, null]", "[null]", "[null, null]", /*skip_nulls=*/true); + CheckIndexIn(null(), "[null, null]", "[]", "[null, null]", /*skip_nulls=*/true); } -TEST_F(TestMatchKernel, MatchTimeTimestamp) { - CheckMatch(time32(TimeUnit::SECOND), - /* haystack= */ "[1, null, 5, 1, 2]", - /* needles= */ "[2, 1, null, 1]", - /* expected= */ "[1, 2, null, 1, 0]"); +TEST_F(TestIndexInKernel, TimeTimestamp) { + CheckIndexIn(time32(TimeUnit::SECOND), + /* input= */ "[1, null, 5, 1, 2]", + /* value_set= */ "[2, 1, null]", + /* expected= */ "[1, 2, null, 1, 0]"); // Needles array has no nulls - CheckMatch(time32(TimeUnit::SECOND), - /* haystack= */ "[2, null, 5, 1]", - /* needles= */ "[2, 1, 1]", - /* expected= */ "[0, null, null, 1]"); + CheckIndexIn(time32(TimeUnit::SECOND), + /* input= */ "[2, null, 5, 1]", + /* value_set= */ "[2, 1]", + /* expected= */ "[0, null, null, 1]"); // No match - CheckMatch(time32(TimeUnit::SECOND), "[3, null, 5, 3]", "[2, 1, 2, 1, 2]", - "[null, null, null, null]"); + CheckIndexIn(time32(TimeUnit::SECOND), "[3, null, 5, 3]", "[2, 1]", + "[null, null, null, null]"); // Empty arrays - CheckMatch(time32(TimeUnit::SECOND), "[]", "[]", "[]"); + CheckIndexIn(time32(TimeUnit::SECOND), "[]", "[]", "[]"); - CheckMatch(time64(TimeUnit::NANO), "[2, null, 2, 1]", "[2, null, 1]", "[0, 1, 0, 2]"); + CheckIndexIn(time64(TimeUnit::NANO), "[2, null, 2, 1]", "[2, null, 1]", "[0, 1, 0, 2]"); - CheckMatch(timestamp(TimeUnit::NANO), "[2, null, 2, 1]", "[2, null, 2, 1]", - "[0, 1, 0, 2]"); + CheckIndexIn(timestamp(TimeUnit::NANO), "[2, null, 2, 1]", "[2, null, 1]", + "[0, 1, 0, 2]"); - // Empty haystack array - CheckMatch(timestamp(TimeUnit::NANO), "[]", "[2, null, 2, 1]", "[]"); + // Empty input array + CheckIndexIn(timestamp(TimeUnit::NANO), "[]", "[2, null, 1]", "[]"); - // Empty needles array - CheckMatch(timestamp(TimeUnit::NANO), "[2, null, 2, 1]", "[]", - "[null, null, null, null]"); + // Empty value_set array + CheckIndexIn(timestamp(TimeUnit::NANO), "[2, null, 1]", "[]", "[null, null, null]"); // Both array are all null - CheckMatch(time32(TimeUnit::SECOND), "[null, null, null, null]", "[null, null]", - "[0, 0, 0, 0]"); + CheckIndexIn(time32(TimeUnit::SECOND), "[null, null, null, null]", "[null]", + "[0, 0, 0, 0]"); } -TEST_F(TestMatchKernel, MatchBoolean) { - CheckMatch(boolean(), - /* haystack= */ "[false, null, false, true]", - /* needles= */ "[null, false, true]", - /* expected= */ "[1, 0, 1, 2]"); +TEST_F(TestIndexInKernel, Boolean) { + CheckIndexIn(boolean(), + /* input= */ "[false, null, false, true]", + /* value_set= */ "[null, false, true]", + /* expected= */ "[1, 0, 1, 2]"); - CheckMatch(boolean(), "[false, null, false, true]", "[false, true, null, true, null]", - "[0, 2, 0, 1]"); + CheckIndexIn(boolean(), "[false, null, false, true]", "[false, true, null]", + "[0, 2, 0, 1]"); // No Nulls - CheckMatch(boolean(), "[true, true, false, true]", "[false, true]", "[1, 1, 0, 1]"); + CheckIndexIn(boolean(), "[true, true, false, true]", "[false, true]", "[1, 1, 0, 1]"); - CheckMatch(boolean(), "[false, true, false, true]", "[true, true, true, true]", - "[null, 0, null, 0]"); + CheckIndexIn(boolean(), "[false, true, false, true]", "[true]", "[null, 0, null, 0]"); // No match - CheckMatch(boolean(), "[true, true, true, true]", "[false, false, false]", - "[null, null, null, null]"); + CheckIndexIn(boolean(), "[true, true, true, true]", "[false]", + "[null, null, null, null]"); - // Nulls in haystack array - CheckMatch(boolean(), "[null, null, null, null]", "[true, true]", - "[null, null, null, null]"); + // Nulls in input array + CheckIndexIn(boolean(), "[null, null, null, null]", "[true]", + "[null, null, null, null]"); - // Nulls in needles array - CheckMatch(boolean(), "[true, true, false, true]", - "[null, null, null, null, null, null]", "[null, null, null, null]"); + // Nulls in value_set array + CheckIndexIn(boolean(), "[true, true, false, true]", "[null]", + "[null, null, null, null]"); // Both array have Nulls - CheckMatch(boolean(), "[null, null, null, null]", "[null, null, null, null]", - "[0, 0, 0, 0]"); + CheckIndexIn(boolean(), "[null, null, null, null]", "[null]", "[0, 0, 0, 0]"); } template -class TestMatchKernelBinary : public TestMatchKernel {}; +class TestIndexInKernelBinary : public TestIndexInKernel {}; -TYPED_TEST_SUITE(TestMatchKernelBinary, BinaryTypes); +TYPED_TEST_SUITE(TestIndexInKernelBinary, BinaryTypes); -TYPED_TEST(TestMatchKernelBinary, MatchBinary) { +TYPED_TEST(TestIndexInKernelBinary, Binary) { auto type = TypeTraits::type_singleton(); - this->CheckMatch(type, R"(["foo", null, "bar", "foo"])", R"(["foo", null, "bar"])", - R"([0, 1, 2, 0])"); + this->CheckIndexIn(type, R"(["foo", null, "bar", "foo"])", R"(["foo", null, "bar"])", + R"([0, 1, 2, 0])"); // No match - this->CheckMatch(type, - /* haystack= */ R"(["foo", null, "bar", "foo"])", - /* needles= */ R"(["baz", "bazzz", "baz", "bazzz"])", - /* expected= */ R"([null, null, null, null])"); + this->CheckIndexIn(type, + /* input= */ R"(["foo", null, "bar", "foo"])", + /* value_set= */ R"(["baz", "bazzz"])", + /* expected= */ R"([null, null, null, null])"); - // Nulls in haystack array - this->CheckMatch(type, - /* haystack= */ R"([null, null, null, null])", - /* needles= */ R"(["foo", "bar", "foo"])", - /* expected= */ R"([null, null, null, null])"); + // Nulls in input array + this->CheckIndexIn(type, + /* input= */ R"([null, null, null, null])", + /* value_set= */ R"(["foo", "bar"])", + /* expected= */ R"([null, null, null, null])"); - // Nulls in needles array - this->CheckMatch(type, R"(["foo", "bar", "foo"])", R"([null, null, null])", - R"([null, null, null])"); + // Nulls in value_set array + this->CheckIndexIn(type, R"(["foo", "bar", "foo"])", R"([null])", + R"([null, null, null])"); // Both array have Nulls - this->CheckMatch(type, - /* haystack= */ R"([null, null, null, null])", - /* needles= */ R"([null, null, null, null])", - /* expected= */ R"([0, 0, 0, 0])"); + this->CheckIndexIn(type, + /* input= */ R"([null, null, null, null])", + /* value_set= */ R"([null])", + /* expected= */ R"([0, 0, 0, 0])"); // Empty arrays - this->CheckMatch(type, R"([])", R"([])", R"([])"); + this->CheckIndexIn(type, R"([])", R"([])", R"([])"); - // Empty haystack array - this->CheckMatch(type, R"([])", R"(["foo", null, "bar", null])", "[]"); + // Empty input array + this->CheckIndexIn(type, R"([])", R"(["foo", null, "bar"])", "[]"); - // Empty needles array - this->CheckMatch(type, R"(["foo", null, "bar", "foo"])", "[]", - R"([null, null, null, null])"); + // Empty value_set array + this->CheckIndexIn(type, R"(["foo", null, "bar", "foo"])", "[]", + R"([null, null, null, null])"); } -TEST_F(TestMatchKernel, BinaryResizeTable) { +TEST_F(TestIndexInKernel, BinaryResizeTable) { const int32_t kTotalValues = 10000; #if !defined(ARROW_VALGRIND) const int32_t kRepeats = 10; @@ -648,10 +484,10 @@ TEST_F(TestMatchKernel, BinaryResizeTable) { const int32_t kBufSize = 20; Int32Builder expected_builder; - StringBuilder haystack_builder; + StringBuilder input_builder; ASSERT_OK(expected_builder.Resize(kTotalValues * kRepeats)); - ASSERT_OK(haystack_builder.Resize(kTotalValues * kRepeats)); - ASSERT_OK(haystack_builder.ReserveData(kBufSize * kTotalValues * kRepeats)); + ASSERT_OK(input_builder.Resize(kTotalValues * kRepeats)); + ASSERT_OK(input_builder.ReserveData(kBufSize * kTotalValues * kRepeats)); for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { int32_t index = i % kTotalValues; @@ -659,101 +495,96 @@ TEST_F(TestMatchKernel, BinaryResizeTable) { char buf[kBufSize] = "test"; ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); - haystack_builder.UnsafeAppend(util::string_view(buf)); + input_builder.UnsafeAppend(util::string_view(buf)); expected_builder.UnsafeAppend(index); } - std::shared_ptr haystack, needles, expected; - ASSERT_OK(haystack_builder.Finish(&haystack)); - needles = haystack; + std::shared_ptr input, value_set, expected; + ASSERT_OK(input_builder.Finish(&input)); + value_set = input->Slice(0, kTotalValues); ASSERT_OK(expected_builder.Finish(&expected)); - ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(haystack, needles)); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, value_set)); std::shared_ptr actual = actual_datum.make_array(); ASSERT_ARRAYS_EQUAL(*expected, *actual); } -TEST_F(TestMatchKernel, MatchFixedSizeBinary) { - CheckMatch(fixed_size_binary(5), - /* haystack= */ R"(["bbbbb", null, "aaaaa", "ccccc"])", - /* needles= */ R"(["bbbbb", null, "bbbbb", "aaaaa", "ccccc"])", - /* expected= */ R"([0, 1, 2, 3])"); - - // Nulls in haystack - CheckMatch(fixed_size_binary(5), - /* haystack= */ R"([null, null, null, null, null])", - /* needles= */ R"(["bbbbb", "aabbb", "bbbbb", "aaaaa", "ccccc"])", - /* expected= */ R"([null, null, null, null, null])"); - - // Nulls in needles - CheckMatch(fixed_size_binary(5), - /* haystack= */ R"(["bbbbb", null, "bbbbb", "aaaaa", "ccccc"])", - /* needles= */ R"([null, null, null])", - /* expected= */ R"([null, 0, null, null, null])"); - - // Both array have Nulls - CheckMatch(fixed_size_binary(5), - /* haystack= */ R"([null, null, null, null, null])", - /* needles= */ R"([null, null, null, null])", - /* expected= */ R"([0, 0, 0, 0, 0])"); - - // No match - CheckMatch(fixed_size_binary(5), - /* haystack= */ R"(["bbbbc", "bbbbc", "aaaad", "cccca"])", - /* needles= */ R"(["bbbbb", null, "bbbbb", "aaaaa", "ddddd"])", - /* expected= */ R"([null, null, null, null])"); - - // Empty haystack array - CheckMatch(fixed_size_binary(5), R"([])", - R"(["bbbbb", null, "bbbbb", "aaaaa", "ccccc"])", R"([])"); - - // Empty needles array - CheckMatch(fixed_size_binary(5), R"(["bbbbb", null, "bbbbb", "aaaaa", "ccccc"])", - R"([])", R"([null, null, null, null, null])"); +TEST_F(TestIndexInKernel, FixedSizeBinary) { + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", null, "bbb", "ccc"])", + /*expected=*/R"([2, 1, null, 0, 3, 0])"); + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", null, "bbb", "ccc"])", + /*expected=*/R"([2, null, null, 0, 3, 0])", + /*skip_nulls=*/true); + + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", "bbb", "ccc"])", + /*expected=*/R"([1, null, null, 0, 2, 0])"); + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", "bbb", "ccc"])", + /*expected=*/R"([1, null, null, 0, 2, 0])", + /*skip_nulls=*/true); + + // Empty input array + CheckIndexIn(fixed_size_binary(5), R"([])", R"(["bbbbb", null, "aaaaa", "ccccc"])", + R"([])"); + + // Empty value_set array + CheckIndexIn(fixed_size_binary(5), R"(["bbbbb", null, "bbbbb"])", R"([])", + R"([null, null, null])"); // Empty arrays - CheckMatch(fixed_size_binary(0), R"([])", R"([])", R"([])"); + CheckIndexIn(fixed_size_binary(0), R"([])", R"([])", R"([])"); } -TEST_F(TestMatchKernel, MatchDecimal) { - std::vector input{12, 12, 11, 12}; - std::vector member_set{12, 12, 11, 12}; - std::vector expected{0, 1, 2, 0}; - - CheckMatch(decimal(2, 0), - /* haystack= */ R"(["12", null, "11", "12"])", - /* needles= */ R"(["12", null, "11", "12"])", - /* expected= */ R"([0, 1, 2, 0])"); +TEST_F(TestIndexInKernel, Decimal) { + auto type = decimal(2, 0); + + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, "11", "12"])", + /*expected=*/R"([2, 0, 1, 2, null])", + /*skip_nulls=*/false); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, "11", "12"])", + /*expected=*/R"([2, null, 1, 2, null])", + /*skip_nulls=*/true); + + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"(["11", "12"])", + /*expected=*/R"([1, null, 0, 1, null])", + /*skip_nulls=*/false); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"(["11", "12"])", + /*expected=*/R"([1, null, 0, 1, null])", + /*skip_nulls=*/true); } -TEST_F(TestMatchKernel, MatchChunkedArrayInvoke) { - std::vector values1 = {"foo", "bar", "foo"}; - std::vector values2 = {"bar", "baz", "quuux", "foo"}; - std::vector values3 = {"foo", "bar", "foo"}; - std::vector values4 = {"bar", "baz", "barr", "foo"}; - - auto type = utf8(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {true, true, true, false}); - auto a3 = _MakeArray(type, values3, {}); - auto a4 = _MakeArray(type, values4, {}); - - ArrayVector array1 = {a1, a2}; - auto carr = std::make_shared(array1); - ArrayVector array2 = {a3, a4}; - auto member_set = std::make_shared(array2); - - auto i1 = _MakeArray(int32(), {0, 1, 0}, {}); - auto i2 = - _MakeArray(int32(), {1, 2, 2, 2}, {true, true, false, false}); - - ArrayVector expected = {i1, i2}; - auto expected_carr = std::make_shared(expected); - - ASSERT_OK_AND_ASSIGN(Datum encoded_out, IndexIn(carr, Datum(member_set))); - ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); - - AssertChunkedEquivalent(*expected_carr, *encoded_out.chunked_array()); +TEST_F(TestIndexInKernel, ChunkedArrayInvoke) { + auto input = ChunkedArrayFromJSON(utf8(), {R"(["abc", "def", "ghi", "abc", "jkl"])", + R"(["def", null, "abc", "zzz"])"}); + // No null in value_set + auto value_set = ChunkedArrayFromJSON(utf8(), {R"(["ghi", "def"])", R"(["abc"])"}); + auto expected = + ChunkedArrayFromJSON(int32(), {"[2, 1, 0, 2, null]", "[1, null, 2, null]"}); + + CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true); + + // Null in value_set + value_set = ChunkedArrayFromJSON(utf8(), {R"(["ghi", "def"])", R"([null, "abc"])"}); + expected = ChunkedArrayFromJSON(int32(), {"[3, 1, 0, 3, null]", "[1, 2, 3, null]"}); + CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false); + expected = ChunkedArrayFromJSON(int32(), {"[3, 1, 0, 3, null]", "[1, null, 3, null]"}); + CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true); } } // namespace compute diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 3772ab24c91..b4a8cb19517 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -755,7 +755,7 @@ cdef class _SetLookupOptions(FunctionOptions): cdef const CFunctionOptions* get_options(self) except NULL: return self.set_lookup_options.get() - def _set_options(self, value_set, c_bool skip_null): + def _set_options(self, value_set, c_bool skip_nulls): if isinstance(value_set, Array): self.valset.reset(new CDatum(( value_set).sp_array)) elif isinstance(value_set, ChunkedArray): @@ -768,13 +768,13 @@ cdef class _SetLookupOptions(FunctionOptions): raise ValueError('"{}" is not a valid value_set'.format(value_set)) self.set_lookup_options.reset( - new CSetLookupOptions(deref(self.valset), skip_null) + new CSetLookupOptions(deref(self.valset), skip_nulls) ) class SetLookupOptions(_SetLookupOptions): - def __init__(self, *, value_set, skip_null=False): - self._set_options(value_set, skip_null) + def __init__(self, *, value_set, skip_nulls=False): + self._set_options(value_set, skip_nulls) cdef class _StrptimeOptions(FunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index f1e0509c122..1b641034836 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1109,3 +1109,36 @@ def test_sort_indices_table(): with pytest.raises(ValueError, match="not a valid order"): pc.sort_indices(table, sort_keys=[("a", "nonscending")]) + + +def test_is_in(): + arr = pa.array([1, 2, None, 1, 2, 3]) + + result = pc.is_in(arr, value_set=pa.array([1, 3, None])) + assert result.to_pylist() == [True, False, True, True, False, True] + + result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True) + assert result.to_pylist() == [True, False, False, True, False, True] + + result = pc.is_in(arr, value_set=pa.array([1, 3])) + assert result.to_pylist() == [True, False, False, True, False, True] + + result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) + assert result.to_pylist() == [True, False, False, True, False, True] + + +def test_index_in(): + arr = pa.array([1, 2, None, 1, 2, 3]) + + result = pc.index_in(arr, value_set=pa.array([1, 3, None])) + assert result.to_pylist() == [0, None, 2, 0, None, 1] + + result = pc.index_in(arr, value_set=pa.array([1, 3, None]), + skip_nulls=True) + assert result.to_pylist() == [0, None, None, 0, None, 1] + + result = pc.index_in(arr, value_set=pa.array([1, 3])) + assert result.to_pylist() == [0, None, None, 0, None, 1] + + result = pc.index_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) + assert result.to_pylist() == [0, None, None, 0, None, 1] diff --git a/ruby/red-arrow/test/test-array.rb b/ruby/red-arrow/test/test-array.rb index b2c9d5fb417..3a4c8ee6baa 100644 --- a/ruby/red-arrow/test/test-array.rb +++ b/ruby/red-arrow/test/test-array.rb @@ -160,8 +160,8 @@ def setup test("Arrow::ChunkedArray") do chunks = [ - Arrow::Int16Array.new([1, 0]), - Arrow::Int16Array.new([1, 0, 3]) + Arrow::Int16Array.new([1, 4]), + Arrow::Int16Array.new([0, 3]) ] right = Arrow::ChunkedArray.new(chunks) assert_equal(Arrow::BooleanArray.new([true, true, true, false]),