diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f247510b6f0..3a4d89e8e31 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -254,6 +254,26 @@ struct EnumTraits } }; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "MapLookupOptions::Occurrence"; } + static std::string value_name(compute::MapLookupOptions::Occurrence value) { + switch (value) { + case compute::MapLookupOptions::Occurrence::FIRST: + return "FIRST"; + case compute::MapLookupOptions::Occurrence::LAST: + return "LAST"; + case compute::MapLookupOptions::Occurrence::ALL: + return "ALL"; + } + return ""; + } +}; + } // namespace internal namespace compute { @@ -287,6 +307,9 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType( DataMember("field_names", &MakeStructOptions::field_names), DataMember("field_nullability", &MakeStructOptions::field_nullability), DataMember("field_metadata", &MakeStructOptions::field_metadata)); +static auto kMapLookupOptionsType = GetFunctionOptionsType( + DataMember("occurrence", &MapLookupOptions::occurrence), + DataMember("query_key", &MapLookupOptions::query_key)); static auto kMatchSubstringOptionsType = GetFunctionOptionsType( DataMember("pattern", &MatchSubstringOptions::pattern), DataMember("ignore_case", &MatchSubstringOptions::ignore_case)); @@ -344,6 +367,7 @@ static auto kRandomOptionsType = GetFunctionOptionsType( DataMember("length", &RandomOptions::length), DataMember("initializer", &RandomOptions::initializer), DataMember("seed", &RandomOptions::seed)); + } // namespace } // namespace internal @@ -399,6 +423,15 @@ MakeStructOptions::MakeStructOptions(std::vector n) MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector()) {} constexpr char MakeStructOptions::kTypeName[]; +MapLookupOptions::MapLookupOptions(std::shared_ptr query_key, + Occurrence occurrence) + : FunctionOptions(internal::kMapLookupOptionsType), 
+ query_key(std::move(query_key)), + occurrence(occurrence) {} +MapLookupOptions::MapLookupOptions() + : MapLookupOptions(std::make_shared(), Occurrence::FIRST) {} +constexpr char MapLookupOptions::kTypeName[]; + MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case) : FunctionOptions(internal::kMatchSubstringOptionsType), pattern(std::move(pattern)), @@ -554,6 +587,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kMapLookupOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kNullOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType)); @@ -785,5 +819,13 @@ Result Week(const Datum& arg, WeekOptions options, ExecContext* ctx) { return CallFunction("week", {arg}, &options, ctx); } +// ---------------------------------------------------------------------- +// Structural transforms +Result MapLookup(const Datum& arg, MapLookupOptions options, ExecContext* ctx) { + return CallFunction("map_lookup", {arg}, &options, ctx); +} + +// ---------------------------------------------------------------------- + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 45dd6b79fc2..8537183c369 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -470,6 +470,30 @@ class ARROW_EXPORT RandomOptions : public FunctionOptions { uint64_t seed; }; +/// Options for map_lookup function +class ARROW_EXPORT MapLookupOptions : public FunctionOptions { + public: + enum Occurrence { + /// Return the first matching value + FIRST, + /// Return the last matching value + LAST, + /// Return all matching 
values + ALL + }; + + explicit MapLookupOptions(std::shared_ptr query_key, Occurrence occurrence); + MapLookupOptions(); + + constexpr static char const kTypeName[] = "MapLookupOptions"; + + /// The key to lookup in the map + std::shared_ptr query_key; + + /// Whether to return the first, last, or all matching values + Occurrence occurrence; +}; + /// @} /// \brief Get the absolute value of a value. @@ -1350,5 +1374,20 @@ ARROW_EXPORT Result AssumeTimezone(const Datum& values, AssumeTimezoneOptions options, ExecContext* ctx = NULLPTR); +/// \brief Finds either the FIRST, LAST, or ALL items with a key that matches the given +/// query key in a map. +/// +/// Returns an array of items for FIRST and LAST, and an array of list of items for ALL. +/// +/// \param[in] map to look in +/// \param[in] options to pass a query key and choose which matching keys to return +/// (FIRST, LAST or ALL) +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result MapLookup(const Datum& map, MapLookupOptions options, + ExecContext* ctx = NULLPTR); } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index d444d78c6d7..cf8a7e08b03 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -18,6 +18,7 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include "arrow/array/builder_nested.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" #include "arrow/result.h" @@ -428,6 +429,261 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray", "specified through MakeStructOptions."), {"*args"}, "MakeStructOptions"}; +template +struct MapLookupFunctor { + static Result GetOneMatchingIndex(const Array& keys, + const Scalar& query_key_scalar, + const bool* 
from_back) { + int64_t match_index = -1; + RETURN_NOT_OK( + FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) -> Status { + match_index = index; + if (*from_back) { + return Status::OK(); + } else { + return Status::Cancelled("Found key match for FIRST"); + } + })); + + return match_index; + } + + template + static Status FindMatchingIndices(const Array& keys, const Scalar& query_key_scalar, + FoundItem callback) { + const auto query_key = UnboxScalar::Unbox(query_key_scalar); + int64_t index = 0; + Status status = VisitArrayValuesInline( + *keys.data(), + [&](decltype(query_key) key) -> Status { + if (key == query_key) { + return callback(index++); + } + ++index; + return Status::OK(); + }, + [&]() -> Status { + ++index; + return Status::OK(); + }); + if (!status.ok() && !status.IsCancelled()) { + return status; + } + return Status::OK(); + } + + static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& options = OptionsWrapper::Get(ctx); + const auto& query_key = options.query_key; + const auto& occurrence = options.occurrence; + const MapArray map_array(batch[0].array()); + + std::unique_ptr builder; + if (occurrence == MapLookupOptions::Occurrence::ALL) { + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), + list(map_array.map_type()->item_type()), &builder)); + auto list_builder = checked_cast(builder.get()); + auto value_builder = list_builder->value_builder(); + + for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); + ++map_array_idx) { + if (!map_array.IsValid(map_array_idx)) { + RETURN_NOT_OK(list_builder->AppendNull()); + continue; + } + + auto map = map_array.value_slice(map_array_idx); + auto keys = checked_cast(*map).field(0); + auto items = checked_cast(*map).field(1); + bool found_at_least_one_key = false; + RETURN_NOT_OK( + FindMatchingIndices(*keys, *query_key, [&](int64_t index) -> Status { + if (!found_at_least_one_key) RETURN_NOT_OK(list_builder->Append(true)); + 
found_at_least_one_key = true; + RETURN_NOT_OK(value_builder->AppendArraySlice(*items->data(), index, 1)); + return Status::OK(); + })); + if (!found_at_least_one_key) { + RETURN_NOT_OK(list_builder->AppendNull()); + } + } + ARROW_ASSIGN_OR_RAISE(auto result, list_builder->Finish()); + out->value = result->data(); + } else { /* occurrence == FIRST || LAST */ + RETURN_NOT_OK( + MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); + RETURN_NOT_OK(builder->Reserve(batch.length)); + for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); + ++map_array_idx) { + if (!map_array.IsValid(map_array_idx)) { + RETURN_NOT_OK(builder->AppendNull()); + continue; + } + + auto map = map_array.value_slice(map_array_idx); + auto keys = checked_cast(*map).field(0); + auto items = checked_cast(*map).field(1); + bool from_back = (occurrence == MapLookupOptions::LAST); + ARROW_ASSIGN_OR_RAISE(int64_t key_match_idx, + GetOneMatchingIndex(*keys, *query_key, &from_back)); + + if (key_match_idx != -1) { + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_match_idx, 1)); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + out->value = result->data(); + } + + return Status::OK(); + } + + static Status ExecMapScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& options = OptionsWrapper::Get(ctx); + const auto& query_key = options.query_key; + const auto& occurrence = options.occurrence; + + std::shared_ptr item_type = + checked_cast(*batch[0].type()).item_type(); + const auto& map_scalar = batch[0].scalar_as(); + + if (ARROW_PREDICT_FALSE(!map_scalar.is_valid)) { + if (options.occurrence == MapLookupOptions::Occurrence::ALL) { + out->value = MakeNullScalar(list(item_type)); + } else { + out->value = MakeNullScalar(item_type); + } + return Status::OK(); + } + + const auto& struct_array = checked_cast(*map_scalar.value); + const std::shared_ptr keys = 
struct_array.field(0); + const std::shared_ptr items = struct_array.field(1); + + if (occurrence == MapLookupOptions::Occurrence::ALL) { + bool found_at_least_one_key = false; + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); + + RETURN_NOT_OK(FindMatchingIndices(*keys, *query_key, [&](int64_t index) -> Status { + found_at_least_one_key = true; + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), index, 1)); + return Status::OK(); + })); + if (!found_at_least_one_key) { + out->value = MakeNullScalar(list(items->type())); + } else { + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + ARROW_ASSIGN_OR_RAISE(out->value, MakeScalar(list(items->type()), result)); + } + } else { /* occurrence == FIRST || LAST */ + bool from_back = (occurrence == MapLookupOptions::LAST); + + ARROW_ASSIGN_OR_RAISE(int64_t key_match_idx, + GetOneMatchingIndex(*keys, *query_key, &from_back)); + if (key_match_idx != -1) { + ARROW_ASSIGN_OR_RAISE(out->value, items->GetScalar(key_match_idx)); + } else { + out->value = MakeNullScalar(items->type()); + } + } + return Status::OK(); + } +}; + +Result ResolveMapLookupType(KernelContext* ctx, + const std::vector& descrs) { + const auto& options = OptionsWrapper::Get(ctx); + std::shared_ptr type = descrs.front().type; + std::shared_ptr item_type = checked_cast(*type).item_type(); + std::shared_ptr key_type = checked_cast(*type).key_type(); + + if (!options.query_key) { + return Status::Invalid("map_lookup: query_key can't be empty."); + } else if (!options.query_key->is_valid) { + return Status::Invalid("map_lookup: query_key can't be null."); + } else if (!options.query_key->type->Equals(key_type)) { + return Status::TypeError( + "map_lookup: query_key type and Map key_type don't match. 
Expected " + "type: ", + *key_type, ", but got type: ", *options.query_key->type); + } + + if (options.occurrence == MapLookupOptions::Occurrence::ALL) { + return ValueDescr(list(item_type), descrs.front().shape); + } else { /* occurrence == FIRST || LAST */ + return ValueDescr(item_type, descrs.front().shape); + } +} + +struct ResolveMapLookup { + KernelContext* ctx; + const ExecBatch& batch; + Datum* out; + + template + Status Execute() { + if (batch[0].kind() == Datum::SCALAR) { + return MapLookupFunctor::ExecMapScalar(ctx, batch, out); + } + return MapLookupFunctor::ExecMapArray(ctx, batch, out); + } + + template + enable_if_physical_integer Visit(const KeyType& type) { + return Execute(); + } + + template + enable_if_decimal Visit(const KeyType& type) { + return Execute(); + } + + template + enable_if_base_binary Visit(const KeyType& type) { + return Execute(); + } + + template + enable_if_boolean Visit(const KeyType& type) { + return Execute(); + } + + Status Visit(const FixedSizeBinaryType& key) { return Execute(); } + + Status Visit(const MonthDayNanoIntervalType& key) { + return Execute(); + } + + Status Visit(const DataType& type) { + return Status::TypeError("Got unsupported type: ", type.ToString()); + } + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + ResolveMapLookup visitor{ctx, batch, out}; + return VisitTypeInline(*checked_cast(*batch[0].type()).key_type(), + &visitor); + } +}; + +void AddMapLookupKernels(ScalarFunction* func) { + ScalarKernel kernel({InputType(Type::MAP)}, OutputType(ResolveMapLookupType), + ResolveMapLookup::Exec, OptionsWrapper::Init); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +const FunctionDoc map_lookup_doc{ + "Find the items corresponding to a given key in a Map", + ("For a given query key (passed via MapLookupOptions), extract\n" + "either the FIRST, LAST or 
ALL items from a Map that have\n" + "matching keys."), + {"container"}, + "MapLookupOptions", + /*options_required=*/true}; } // namespace @@ -453,6 +709,11 @@ void RegisterScalarNested(FunctionRegistry* registry) { AddStructFieldKernels(struct_field.get()); DCHECK_OK(registry->AddFunction(std::move(struct_field))); + auto map_lookup = + std::make_shared("map_lookup", Arity::Unary(), &map_lookup_doc); + AddMapLookupKernels(map_lookup.get()); + DCHECK_OK(registry->AddFunction(std::move(map_lookup))); + static MakeStructOptions kDefaultMakeStructOptions; auto make_struct_function = std::make_shared( "make_struct", Arity::VarArgs(), &make_struct_doc, &kDefaultMakeStructOptions); diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 4640e1e2216..c35c8f35028 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -225,6 +225,350 @@ TEST(TestScalarNested, StructField) { } } +void CheckMapLookupWithDifferentOptions(const std::shared_ptr& map, + const std::shared_ptr& query_key, + const std::shared_ptr& expected_all, + const std::shared_ptr& expected_first, + const std::shared_ptr& expected_last) { + MapLookupOptions all_matches(query_key, MapLookupOptions::ALL); + MapLookupOptions first_matches(query_key, MapLookupOptions::FIRST); + MapLookupOptions last_matches(query_key, MapLookupOptions::LAST); + + CheckScalar("map_lookup", {map}, expected_all, &all_matches); + CheckScalar("map_lookup", {map}, expected_first, &first_matches); + CheckScalar("map_lookup", {map}, expected_last, &last_matches); +} + +class TestMapLookupKernel : public ::testing::Test {}; + +TEST_F(TestMapLookupKernel, BooleanKey) { + auto true_scalar = ScalarFromJSON(boolean(), R"(true)"); + auto map_type = map(boolean(), int32()); + const char* input = R"( + [ + [ + [true, 99], [false, 1], [false, 2], [true, null], [false, 5], + [true, 8] + ], + null, + [ + [false, 
null], [true, 67], [false, 101], [false, 1], [false, null], + [false, 9], [true, 80] + ], + [], + [ + [false, 1], [false, 2], [false, 3], [false, 4] + ], + [ + [true, 9], [true, 2], [true, 5], [true, 8] + ] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 5, false); + + auto expected_all = ArrayFromJSON(list(int32()), R"( + [[99, null, 8], null, [67, 80], null, null, null ])"); + auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); + auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); + + CheckMapLookupWithDifferentOptions(map_array_tweaked, true_scalar, expected_all, + expected_first, expected_last); +} + +TEST_F(TestMapLookupKernel, MonthDayNanoIntervalKeys) { + auto key_type = month_day_nano_interval(); + auto map_type = map(key_type, utf8()); + auto key_scalar = ScalarFromJSON(month_day_nano_interval(), R"([1, 2, -3])"); + const char* input = R"( + [ + [ + [[-9, -10, 11], "zero"], [[1, 2, -3], "first_one"], [[11, -12, 0], "two"], + [[1, 2, -3], null], [[-7, -8, -9], "three"], [[1, 2, -3], "second_one"], + [[1, 2, -3], "last_one"] + ], + null, + [ + [[-5, 6, 7], "zero_hero"], [[15, 16, 2], "almost_six"], + [[1, 2, -3], "the_dumb_one"], [[-7, -8, -9], "eleven"], + [[1, 2, -3], "the_chosen_one"], [[-5, 6, 7], "meaning of life"], + [[1, 2, -3], "just_one"], [[1, 2, -3], "no more ones!"] + ], + [ + [[-5, 6, 7], "this"], [[-13, 14, -1], "has"], [[11, -12, 0], "no"], + [[15, 16, 2], "keys"] + ], + [ + [[1, 2, -3], "this"], [[1, 2, -3], "should"], [[1, 2, -3], "also"], + [[1, 2, -3], "be"], [[1, 2, -3], "null"] + ], + [] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 4, false); + + auto expected_first = + ArrayFromJSON(utf8(), R"(["first_one", null, "the_dumb_one", null, null, null])"); + auto expected_last = + ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null, 
null, null])"); + auto expected_all = ArrayFromJSON(list(utf8()), + R"([ + ["first_one", null, "second_one", "last_one"], + null, + ["the_dumb_one", "the_chosen_one", "just_one", "no more ones!"], + null, + null, + null + ] + )"); + + CheckMapLookupWithDifferentOptions(map_array_tweaked, key_scalar, expected_all, + expected_first, expected_last); +} + +TEST_F(TestMapLookupKernel, FixedSizeBinary) { + auto key_type = fixed_size_binary(6); + auto map_type = map(key_type, int32()); + auto sheesh_scalar = ScalarFromJSON(key_type, R"("sheesh")"); + const char* input = R"( + [ + [ + ["sheesh", 99], ["yooloo", 1], ["yaaaay", 2], ["sheesh", null], ["no way", 5], + ["sheesh", 8] + ], + null, + [ + ["hmm,mm", null], ["sheesh", 67], ["snaccc", 101], ["awwwww", 1], ["dapdap", null], + ["yooloo", 9], ["sheesh", 80] + ], + [], + [ + ["nopeno", 1], ["nonono", 2], ["sheess", 3], ["here!!", 4] + ], + [ + ["sheesh", 9], ["sheesh", 2], ["sheesh", 5], ["sheesh", 8] + ] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 5, false); + + auto expected_all = ArrayFromJSON(list(int32()), R"( + [[99, null, 8], null, [67, 80], null, null, null ])"); + auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); + auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); + + CheckMapLookupWithDifferentOptions(map_array_tweaked, sheesh_scalar, expected_all, + expected_first, expected_last); +} + +TEST_F(TestMapLookupKernel, Errors) { + auto map_type = map(int32(), utf8()); + const char* input = R"( + [ + [ + [0, "zero"], [1, "first one"], [2, "two"], [1, null], [3, "three"], [1, "second one"], + [1, "last one"] + ], + null, + [ + [0, "zero hero"], [9, "almost six"], [1, "the dumb one"], [7, "eleven"], + [1, "the chosen one"], [42, "meaning of life?"], [1, "just_one"], + [1, "no more ones!"] + ], + [ + [4, "this"], [6, "has"], [8, "no"], [2, "ones"] + ], + [ + [1, "this"], [1, "should"], 
[1, "also"], [1, "be"], [1, "null"] + ], + [] + ])"; + auto map_array = ArrayFromJSON(map_type, input); + auto query_key_int16 = MakeScalar(int16(), 1).ValueOrDie(); + FieldVector fields = {field("a", int32()), field("b", utf8()), + field("c", struct_({ + field("d", int64()), + field("e", float64()), + }))}; + auto unsupported_scalar = ScalarFromJSON(struct_(fields), R"([1, "a", [10, 10.0]])"); + + MapLookupOptions unsupported(unsupported_scalar, MapLookupOptions::FIRST); + MapLookupOptions all(query_key_int16, MapLookupOptions::ALL); + MapLookupOptions first(query_key_int16, MapLookupOptions::FIRST); + MapLookupOptions last(query_key_int16, MapLookupOptions::LAST); + MapLookupOptions empty_key(nullptr, MapLookupOptions::FIRST); + MapLookupOptions null_key(MakeNullScalar(int32()), MapLookupOptions::FIRST); + + for (auto option : {unsupported, all, first, last}) { + ASSERT_RAISES(TypeError, CallFunction("map_lookup", {map_array}, &option)); + } + + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("key can't be empty"), + CallFunction("map_lookup", {map_array}, &empty_key)); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("key can't be null"), + CallFunction("map_lookup", {map_array}, &null_key)); +} + +template +class TestMapLookupIntegralKeys : public ::testing ::Test { + protected: + std::shared_ptr type_singleton() const { + std::shared_ptr type = default_type_instance(); + return map(type, utf8()); + } +}; + +TYPED_TEST_SUITE(TestMapLookupIntegralKeys, PhysicalIntegralArrowTypes); + +TYPED_TEST(TestMapLookupIntegralKeys, StringItems) { + auto map_type = this->type_singleton(); + const char* input = R"( + [ + [ + [0, "zero"], [1, "first_one"], [2, "two"], [1, null], [3, "three"], [1, "second_one"], + [1, "last_one"] + ], + null, + [ + [0, "zero_hero"], [9, "almost_six"], [1, "the_dumb_one"], [7, "eleven"], + [1, "the_chosen_one"], [42, "meaning of life?"], [1, "just_one"], + [1, "no more ones!"] + ], + [ + [4, "this"], [6, "has"], [8, 
"no"], [2, "ones"] + ], + [ + [1, "this"], [1, "should"], [1, "also"], [1, "be"], [1, "null"] + ], + [] + ])"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 4, false); + + auto expected_all = ArrayFromJSON(list(utf8()), R"( + [ + ["first_one", null, "second_one", "last_one"], + null, + ["the_dumb_one", "the_chosen_one", "just_one", "no more ones!"], + null, + null, + null + ])"); + auto expected_first = + ArrayFromJSON(utf8(), R"(["first_one", null, "the_dumb_one", null, null, null])"); + auto expected_last = + ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null, null, null])"); + + CheckMapLookupWithDifferentOptions( + map_array_tweaked, MakeScalar(default_type_instance(), 1).ValueOrDie(), + expected_all, expected_first, expected_last); +} +template +class TestMapLookupDecimalKeys : public ::testing ::Test { + protected: + std::shared_ptr type_singleton() const { + return std::make_shared(/*precision=*/5, + /*scale=*/4); + } +}; + +TYPED_TEST_SUITE(TestMapLookupDecimalKeys, DecimalArrowTypes); + +TYPED_TEST(TestMapLookupDecimalKeys, StringItems) { + auto type = this->type_singleton(); + auto map_type = map(type, utf8()); + auto key_scalar = DecimalScalarFromJSON(type, R"("1.2345")"); + const char* input = R"( + [ + [ + ["0.8923", "zero"], ["1.2345", "first_one"], ["2.7001", "two"], + ["1.2345", null], ["3.2234", "three"], ["1.2345", "second_one"], + ["1.2345", "last_one"] + ], + null, + [ + ["0.0012", "zero_hero"], ["9.0093", "almost_six"], ["1.2345", "the_dumb_one"], + ["7.6587", "eleven"], ["1.2345", "the_chosen_one"], ["4.2000", "meaning of life"], + ["1.2345", "just_one"], ["1.2345", "no more ones!"] + ], + [ + ["4.8794", "this"], ["6.2345", "has"], ["8.6649", "no"], ["0.0122", "ones"] + ], + [ + ["1.2345", "this"], ["1.2345", "should"], ["1.2345", "also"], ["1.2345", "be"], ["1.2345", "null"] + ], + [] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked 
= TweakValidityBit(map_array, 4, false); + + auto expected_first = + ArrayFromJSON(utf8(), R"(["first_one", null, "the_dumb_one", null, null, null])"); + auto expected_last = + ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null, null, null])"); + auto expected_all = ArrayFromJSON(list(utf8()), + R"([ + ["first_one", null, "second_one", "last_one"], + null, + ["the_dumb_one", "the_chosen_one", "just_one", "no more ones!"], + null, + null, + null + ] + )"); + CheckMapLookupWithDifferentOptions(map_array_tweaked, key_scalar, expected_all, + expected_first, expected_last); +} + +template +class TestMapLookupBinaryKeys : public ::testing ::Test { + protected: + std::shared_ptr type_singleton() const { + return TypeTraits::type_singleton(); + } +}; + +TYPED_TEST_SUITE(TestMapLookupBinaryKeys, BaseBinaryArrowTypes); + +TYPED_TEST(TestMapLookupBinaryKeys, IntegralItems) { + auto key_type = this->type_singleton(); + auto sheesh_scalar = ScalarFromJSON(key_type, R"("sheesh")"); + auto map_type = map(key_type, int32()); + const char* input = R"( + [ + [ + ["sheesh", 99], ["yolo", 1], ["yay", 2], ["sheesh", null], ["no way!", 5], + ["sheesh", 8] + ], + null, + [ + ["hmm", null], ["sheesh", 67], ["snacc", 101], ["awesome", 1], ["dap", null], + ["yolo", 9], ["sheesh", 80] + ], + [], + [ + ["nope", 1], ["no", 2], ["sheeshes", 3], ["here!", 4] + ], + [ + ["sheesh", 9], ["sheesh", 2], ["sheesh", 5], ["sheesh", 8] + ] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 5, false); + + auto expected_all = ArrayFromJSON(list(int32()), R"( + [[99, null, 8], null, [67, 80], null, null, null ])"); + auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); + auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); + + CheckMapLookupWithDifferentOptions(map_array_tweaked, sheesh_scalar, expected_all, + expected_first, expected_last); +} + struct { Result 
operator()(std::vector args) { return CallFunction("make_struct", args); diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index f958bd8d398..b84205ce7b6 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1648,7 +1648,9 @@ Structural transforms +---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ | list_parent_indices | Unary | List-like | Int64 | | \(3) | +---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ -| struct_field | Unary | Struct or Union | Computed | :struct:`StructFieldOptions` | \(4) | +| map_lookup | Unary | Map | Computed | :struct:`MapLookupOptions` | \(4) | ++---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ +| struct_field | Unary | Struct or Union | Computed | :struct:`StructFieldOptions` | \(5) | +---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ * \(1) Output is an array of the same length as the input list array. The @@ -1662,7 +1664,12 @@ Structural transforms in the list array is appended to the output. Nulls in the parent list array are discarded. -* \(4) Extract a child value based on a sequence of indices passed in +* \(4) Extract either the ``FIRST``, ``LAST`` or ``ALL`` items from a + map whose keys match the given query key passed via options. + The output type is an Array of items for the ``FIRST``/``LAST`` options + and an Array of List of items for the ``ALL`` option. + +* \(5) Extract a child value based on a sequence of indices passed in the options. The validity bitmap of the result will be the intersection of all intermediate validity bitmaps. For example, for an array with type ``struct