From e4d04ef94f8d309ddb6dd8fbe614281e647ecc17 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Sun, 16 Jan 2022 00:51:40 +0530 Subject: [PATCH 01/24] MapArray basic rough outline. --- cpp/src/arrow/compute/api_scalar.cc | 33 +++++++ cpp/src/arrow/compute/api_scalar.h | 24 +++++ .../arrow/compute/kernels/scalar_nested.cc | 97 +++++++++++++++++++ .../compute/kernels/scalar_nested_test.cc | 24 +++++ 4 files changed, 178 insertions(+) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f247510b6f0..4adade45623 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -254,6 +254,26 @@ struct EnumTraits } }; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "MapArrayLookupOptions::Occurence"; } + static std::string value_name(compute::MapArrayLookupOptions::Occurence value) { + switch (value) { + case compute::MapArrayLookupOptions::Occurence::First: + return "First"; + case compute::MapArrayLookupOptions::Occurence::Last: + return "Last"; + case compute::MapArrayLookupOptions::Occurence::All: + return "All"; + } + return ""; + } +}; + } // namespace internal namespace compute { @@ -344,6 +364,9 @@ static auto kRandomOptionsType = GetFunctionOptionsType( DataMember("length", &RandomOptions::length), DataMember("initializer", &RandomOptions::initializer), DataMember("seed", &RandomOptions::seed)); +static auto kMapArrayLookupOptionsType = GetFunctionOptionsType( + DataMember("occurence", &MapArrayLookupOptions::occurence), + DataMember("query_key", &MapArrayLookupOptions::query_key)); } // namespace } // namespace internal @@ -545,6 +568,15 @@ RandomOptions::RandomOptions(int64_t length, Initializer initializer, uint64_t s RandomOptions::RandomOptions() : RandomOptions(0, SystemRandom, 0) {} constexpr char RandomOptions::kTypeName[]; +MapArrayLookupOptions::MapArrayLookupOptions(std::shared_ptr query_key, + Occurence occurence) + : FunctionOptions(internal::kMapArrayLookupOptionsType), + query_key(query_key), + occurence(occurence) {} +MapArrayLookupOptions::MapArrayLookupOptions() + : MapArrayLookupOptions(std::make_shared(), Occurence::First) {} +constexpr char MapArrayLookupOptions::kTypeName[]; + namespace internal { void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType)); @@ -573,6 +605,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kUtf8NormalizeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kWeekOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRandomOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kMapArrayLookupOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 45dd6b79fc2..026f430fd28 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -470,6 +470,30 @@ class ARROW_EXPORT RandomOptions : public FunctionOptions { uint64_t seed; }; +/// Options for map_array_lookup function +class ARROW_EXPORT MapArrayLookupOptions : public FunctionOptions { + public: + enum Occurence { + /// Return the first matching value + First, + /// Return the last matching value + Last, + /// Return all matching values + All + }; + + MapArrayLookupOptions(std::shared_ptr query_key, Occurence occurence = All); + MapArrayLookupOptions(); + + constexpr static char const kTypeName[] = "MapArrayLookupOptions"; + + /// The key to lookup in the map + std::shared_ptr query_key; + + /// Whether to return the first, last, or all matching values + Occurence occurence; +}; + /// @} /// \brief Get the absolute value of a value. diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index d444d78c6d7..147e04bbea8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -17,6 +17,7 @@ // Vector kernels involving nested types +#include #include "arrow/array/array_base.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" @@ -429,6 +430,97 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray", {"*args"}, "MakeStructOptions"}; +struct MapArrayLookupFunctor { + static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& options = OptionsWrapper::Get(ctx); + + MapArray map_array(batch[0].array()); + + // Offset differences will tell the number of Strcut = {key, value} pairs + // present in the current list. + // const std::shared_ptr offsets = map_array.value_offsets(); + + std::shared_ptr keys = map_array.keys(); + std::shared_ptr items = map_array.items(); + + const auto& query_key = options.query_key; + const auto& occurence = options.occurence; + + std::unique_ptr builder; + RETURN_NOT_OK( + MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); + + int32_t last_key_idx_checked = 0; + + // aka, number of {key, value} pairs in the current map + int32_t list_struct_len; + bool found_one_key = false; + for (int32_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { + // Number of Struct('s) = {key, value} in the list at the current index + list_struct_len = map_array.value_length(map_array_idx); + for (int32_t key_idx_to_check = last_key_idx_checked; + key_idx_to_check < last_key_idx_checked + list_struct_len; + ++key_idx_to_check) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, + keys->GetScalar(key_idx_to_check)); + if (key->Equals(*query_key)) { + std::cout << "Key being checked: " << key->ToString() << "\n"; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr item, + items->GetScalar(key_idx_to_check)); + std::cout << "Value at key: " << item->ToString() << "\n"; + ARROW_ASSIGN_OR_RAISE(auto value, + item->CastTo(map_array.map_type()->item_type())); + + std::cout << "Item being appended: " << value->ToString() << "\n"; + RETURN_NOT_OK(builder->AppendScalar(*value)); + + if (occurence == MapArrayLookupOptions::First) { + found_one_key = true; + break; + } + } + } + if (found_one_key && occurence == MapArrayLookupOptions::First) break; + + // new index from where to start checking + last_key_idx_checked += list_struct_len; + } + // For now, handling 'Last' and 'All' occurence options as same + // TODO: Handle 'Last' option. + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + out->value = result->data(); + return Status::OK(); + } +}; + +Result ResolveMapArrayLookupType(KernelContext* ctx, + const std::vector& descrs) { + std::shared_ptr type = descrs.front().type; + std::shared_ptr value_type; + std::shared_ptr key_type; + if (type->id() == Type::MAP) { + std::cout << "map type found!\n"; + key_type = type->field(0)->type()->field(0)->type(); + value_type = type->field(0)->type()->field(1)->type(); + + std::cout << "Value type: " << value_type->ToString() << "\n"; + } + return ValueDescr(value_type, descrs.front().shape); +} + +void AddMapArrayLookupKernels(ScalarFunction* func) { + ScalarKernel kernel( + {InputType(Type::MAP, ValueDescr::ARRAY)}, OutputType(ResolveMapArrayLookupType), + MapArrayLookupFunctor::ExecMapArray, OptionsWrapper::Init); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +const FunctionDoc map_array_lookup_doc( + "Find the items corresponding to a given key in a MapArray", ("More doc"), + {"container"}, "MapArrayLookupOptions"); + } // namespace void RegisterScalarNested(FunctionRegistry* registry) { @@ -453,6 +545,11 @@ void RegisterScalarNested(FunctionRegistry* registry) { AddStructFieldKernels(struct_field.get()); DCHECK_OK(registry->AddFunction(std::move(struct_field))); + auto map_array_lookup = std::make_shared( + "map_array_lookup", Arity::Unary(), &map_array_lookup_doc); + AddMapArrayLookupKernels(map_array_lookup.get()); + DCHECK_OK(registry->AddFunction(std::move(map_array_lookup))); + static MakeStructOptions kDefaultMakeStructOptions; auto make_struct_function = std::make_shared( "make_struct", Arity::VarArgs(), &make_struct_doc, &kDefaultMakeStructOptions); diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 4640e1e2216..5feda78ea8b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -225,6 +225,30 @@ TEST(TestScalarNested, StructField) { } } +TEST(TestScalarNested, MapArrayLookup) { + MapArrayLookupOptions foo_all(MakeScalar("foo"), MapArrayLookupOptions::All); + MapArrayLookupOptions foo_first(MakeScalar("foo"), MapArrayLookupOptions::First); + auto type = map(utf8(), int32()); + + auto keys = ArrayFromJSON(utf8(), R"([ + "foo", "bar", "hello", "foo", "lesgo", "whatnow", + "nothing", "hat", "foo", "sorry", "dip", "foo" + ])"); + auto items = ArrayFromJSON(int16(), R"([ + 99, 1, 2, 3, 5, 8, + null, null, 101, 1, null, 22 + ])"); + auto offsets = ArrayFromJSON(int32(), "[0, 6, 6, 12, 12]")->data()->buffers[1]; + auto null_bitmap = ArrayFromJSON(boolean(), "[1, 0, 1, 1]")->data()->buffers[1]; + + MapArray map_array(type, 4, offsets, keys, items, null_bitmap, 1, 0); + + CheckScalar("map_array_lookup", {map_array}, ArrayFromJSON(int32(), "[99, 3, 101, 22]"), + &foo_all); + CheckScalar("map_array_lookup", {map_array}, ArrayFromJSON(int32(), "[99]"), + &foo_first); +} + struct { Result operator()(std::vector args) { return CallFunction("make_struct", args); From bbfad8e27187b42689444d75eb627a4e2d1dcc07 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Sun, 16 Jan 2022 19:01:09 +0530 Subject: [PATCH 02/24] Use non-recursive scalar check for the time being --- cpp/src/arrow/compute/kernels/scalar_nested_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 5feda78ea8b..56ac64b54d0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -243,10 +243,10 @@ TEST(TestScalarNested, MapArrayLookup) { MapArray map_array(type, 4, offsets, keys, items, null_bitmap, 1, 0); - CheckScalar("map_array_lookup", {map_array}, ArrayFromJSON(int32(), "[99, 3, 101, 22]"), - &foo_all); - CheckScalar("map_array_lookup", {map_array}, ArrayFromJSON(int32(), "[99]"), - &foo_first); + CheckScalarNonRecursive("map_array_lookup", {map_array}, + ArrayFromJSON(int32(), "[99, 3, 101, 22]"), &foo_all); + CheckScalarNonRecursive("map_array_lookup", {map_array}, ArrayFromJSON(int32(), "[99]"), + &foo_first); } struct { From c0697290a9999b05d5a06f5b6d072bc3b89c3dc2 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Tue, 18 Jan 2022 17:59:16 +0530 Subject: [PATCH 03/24] Second pass: correct basic implementation. --- cpp/src/arrow/compute/api_scalar.cc | 51 +-- cpp/src/arrow/compute/api_scalar.h | 12 +- .../arrow/compute/kernels/scalar_nested.cc | 290 ++++++++++++++---- .../compute/kernels/scalar_nested_test.cc | 64 ++-- 4 files changed, 310 insertions(+), 107 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 4adade45623..295d1ae8787 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -255,19 +255,19 @@ struct EnumTraits }; template <> -struct EnumTraits - : BasicEnumTraits { - static std::string name() { return "MapArrayLookupOptions::Occurence"; } - static std::string value_name(compute::MapArrayLookupOptions::Occurence value) { +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "MapArrayLookupOptions::Occurrence"; } + static std::string value_name(compute::MapArrayLookupOptions::Occurrence value) { switch (value) { - case compute::MapArrayLookupOptions::Occurence::First: - return "First"; - case compute::MapArrayLookupOptions::Occurence::Last: - return "Last"; - case compute::MapArrayLookupOptions::Occurence::All: + case compute::MapArrayLookupOptions::Occurrence::FIRST: + return "FIRST"; + case compute::MapArrayLookupOptions::Occurrence::LAST: + return "LAST"; + case compute::MapArrayLookupOptions::Occurrence::ALL: return "All"; } return ""; @@ -307,6 +307,9 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType( DataMember("field_names", &MakeStructOptions::field_names), DataMember("field_nullability", &MakeStructOptions::field_nullability), DataMember("field_metadata", &MakeStructOptions::field_metadata)); +static auto kMapArrayLookupOptionsType = GetFunctionOptionsType( + DataMember("occurrence", &MapArrayLookupOptions::occurrence), + DataMember("query_key", &MapArrayLookupOptions::query_key)); static auto kMatchSubstringOptionsType = GetFunctionOptionsType( DataMember("pattern", &MatchSubstringOptions::pattern), DataMember("ignore_case", &MatchSubstringOptions::ignore_case)); @@ -364,9 +367,7 @@ static auto kRandomOptionsType = GetFunctionOptionsType( DataMember("length", &RandomOptions::length), DataMember("initializer", &RandomOptions::initializer), DataMember("seed", &RandomOptions::seed)); -static auto kMapArrayLookupOptionsType = GetFunctionOptionsType( - DataMember("occurence", &MapArrayLookupOptions::occurence), - DataMember("query_key", &MapArrayLookupOptions::query_key)); + } // namespace } // namespace internal @@ -422,6 +423,15 @@ MakeStructOptions::MakeStructOptions(std::vector n) MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector()) {} constexpr char MakeStructOptions::kTypeName[]; +MapArrayLookupOptions::MapArrayLookupOptions(std::shared_ptr query_key, + Occurrence occurrence) + : FunctionOptions(internal::kMapArrayLookupOptionsType), + query_key(std::move(query_key)), + occurrence(occurrence) {} +MapArrayLookupOptions::MapArrayLookupOptions() + : MapArrayLookupOptions(std::make_shared(), Occurrence::FIRST) {} +constexpr char MapArrayLookupOptions::kTypeName[]; + MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case) : FunctionOptions(internal::kMatchSubstringOptionsType), pattern(std::move(pattern)), @@ -568,15 +578,6 @@ RandomOptions::RandomOptions(int64_t length, Initializer initializer, uint64_t s RandomOptions::RandomOptions() : RandomOptions(0, SystemRandom, 0) {} constexpr char RandomOptions::kTypeName[]; -MapArrayLookupOptions::MapArrayLookupOptions(std::shared_ptr query_key, - Occurence occurence) - : FunctionOptions(internal::kMapArrayLookupOptionsType), - query_key(query_key), - occurence(occurence) {} -MapArrayLookupOptions::MapArrayLookupOptions() - : MapArrayLookupOptions(std::make_shared(), Occurence::First) {} -constexpr char MapArrayLookupOptions::kTypeName[]; - namespace internal { void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType)); @@ -586,6 +587,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kMapArrayLookupOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kNullOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType)); @@ -605,7 +607,6 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kUtf8NormalizeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kWeekOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRandomOptionsType)); - DCHECK_OK(registry->AddFunctionOptionsType(kMapArrayLookupOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 026f430fd28..bc05ba22b44 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -473,16 +473,16 @@ class ARROW_EXPORT RandomOptions : public FunctionOptions { /// Options for map_array_lookup function class ARROW_EXPORT MapArrayLookupOptions : public FunctionOptions { public: - enum Occurence { + enum Occurrence { /// Return the first matching value - First, + FIRST, /// Return the last matching value - Last, + LAST, /// Return all matching values - All + ALL }; - MapArrayLookupOptions(std::shared_ptr query_key, Occurence occurence = All); + MapArrayLookupOptions(std::shared_ptr query_key, Occurrence occurrence = ALL); MapArrayLookupOptions(); constexpr static char const kTypeName[] = "MapArrayLookupOptions"; @@ -491,7 +491,7 @@ class ARROW_EXPORT MapArrayLookupOptions : public FunctionOptions { std::shared_ptr query_key; /// Whether to return the first, last, or all matching values - Occurence occurence; + Occurrence occurrence; }; /// @} diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 147e04bbea8..6943649acc8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -17,7 +17,6 @@ // Vector kernels involving nested types -#include #include "arrow/array/array_base.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" @@ -434,92 +433,269 @@ struct MapArrayLookupFunctor { static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = OptionsWrapper::Get(ctx); - MapArray map_array(batch[0].array()); - - // Offset differences will tell the number of Strcut = {key, value} pairs - // present in the current list. - // const std::shared_ptr offsets = map_array.value_offsets(); + const MapArray map_array(batch[0].array()); std::shared_ptr keys = map_array.keys(); std::shared_ptr items = map_array.items(); const auto& query_key = options.query_key; - const auto& occurence = options.occurence; + const auto& occurence = options.occurrence; - std::unique_ptr builder; - RETURN_NOT_OK( - MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); - - int32_t last_key_idx_checked = 0; - - // aka, number of {key, value} pairs in the current map - int32_t list_struct_len; - bool found_one_key = false; - for (int32_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { - // Number of Struct('s) = {key, value} in the list at the current index - list_struct_len = map_array.value_length(map_array_idx); - for (int32_t key_idx_to_check = last_key_idx_checked; - key_idx_to_check < last_key_idx_checked + list_struct_len; - ++key_idx_to_check) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, - keys->GetScalar(key_idx_to_check)); - if (key->Equals(*query_key)) { - std::cout << "Key being checked: " << key->ToString() << "\n"; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr item, - items->GetScalar(key_idx_to_check)); - std::cout << "Value at key: " << item->ToString() << "\n"; - ARROW_ASSIGN_OR_RAISE(auto value, - item->CastTo(map_array.map_type()->item_type())); + if (occurence == MapArrayLookupOptions::Occurrence::FIRST) { + std::unique_ptr builder; + RETURN_NOT_OK( + MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); + + int32_t last_key_idx_checked = 0; - std::cout << "Item being appended: " << value->ToString() << "\n"; - RETURN_NOT_OK(builder->AppendScalar(*value)); + // aka, number of {key, value} pairs in the current map + int32_t list_struct_len; + bool found_one_key; - if (occurence == MapArrayLookupOptions::First) { + for (int32_t map_array_idx = 0; map_array_idx < map_array.length(); + ++map_array_idx) { + // Number of Struct('s) = {key, value} in the list at the current index + list_struct_len = map_array.value_length(map_array_idx); + found_one_key = false; + + for (int32_t key_idx_to_check = last_key_idx_checked; + key_idx_to_check < last_key_idx_checked + list_struct_len; + ++key_idx_to_check) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, + keys->GetScalar(key_idx_to_check)); + + if (key->Equals(*query_key)) { found_one_key = true; + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_idx_to_check, 1)); break; } } + if (!found_one_key) { + RETURN_NOT_OK(builder->AppendNull()); + } + // new index from where to start checking + last_key_idx_checked += list_struct_len; + } + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + out->value = result->data(); + + } else if (occurence == MapArrayLookupOptions::Occurrence::LAST) { + std::unique_ptr builder; + RETURN_NOT_OK( + MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); + + int32_t last_key_idx_checked = 0; + + // aka, number of {key, value} pairs in the current map + int32_t list_struct_len; + int32_t last_key_idx_match; + + for (int32_t map_array_idx = 0; map_array_idx < map_array.length(); + ++map_array_idx) { + // Number of Struct('s) = {key, value} in the list at the current index + list_struct_len = map_array.value_length(map_array_idx); + last_key_idx_match = -1; + + for (int32_t key_idx_to_check = last_key_idx_checked; + key_idx_to_check < last_key_idx_checked + list_struct_len; + ++key_idx_to_check) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, + keys->GetScalar(key_idx_to_check)); + + if (key->Equals(*query_key)) { + last_key_idx_match = key_idx_to_check; + } + } + if (last_key_idx_match == -1) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), last_key_idx_match, 1)); + } + // new index from where to start checking + last_key_idx_checked += list_struct_len; + } + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + out->value = result->data(); + + } else /* occurrence == MapArrayLookupOptions::Occurrence::All) */ { + std::unique_ptr builder; + std::unique_ptr list_builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), + list(map_array.map_type()->item_type()), &builder)); + + int32_t last_key_idx_checked = 0; + + // aka, number of {key, value} pairs in the current map + int32_t list_struct_len; + bool found_one_key; + + for (int32_t map_array_idx = 0; map_array_idx < map_array.length(); + ++map_array_idx) { + // Number of Struct('s) = {key, value} in the list at the current index + list_struct_len = map_array.value_length(map_array_idx); + found_one_key = false; + + if (list_struct_len > 0) { + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), + &list_builder)); + } + + for (int32_t key_idx_to_check = last_key_idx_checked; + key_idx_to_check < last_key_idx_checked + list_struct_len; + ++key_idx_to_check) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, + keys->GetScalar(key_idx_to_check)); + + if (key->Equals(*query_key)) { + found_one_key = true; + RETURN_NOT_OK( + list_builder->AppendArraySlice(*items->data(), key_idx_to_check, 1)); + } + } + if (!found_one_key) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + ARROW_ASSIGN_OR_RAISE(auto list_result, list_builder->Finish()); + RETURN_NOT_OK(builder->AppendScalar(ListScalar(list_result))); + } + list_builder->Reset(); + + // new index from where to start checking + last_key_idx_checked += list_struct_len; } - if (found_one_key && occurence == MapArrayLookupOptions::First) break; + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + out->value = result->data(); + } + return Status::OK(); + } + + static Status ExecMapScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& options = OptionsWrapper::Get(ctx); + + const auto& map_scalar = batch[0].scalar_as(); + const auto& struct_array = checked_cast(*map_scalar.value); + const std::shared_ptr keys = struct_array.field(0); + const std::shared_ptr items = struct_array.field(1); - // new index from where to start checking - last_key_idx_checked += list_struct_len; + if (ARROW_PREDICT_FALSE(!map_scalar.is_valid)) { + if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { + out->value = MakeNullScalar(list(items->type())); + } else { + out->value = MakeNullScalar(items->type()); + } + return Status::OK(); } - // For now, handling 'Last' and 'All' occurence options as same - // TODO: Handle 'Last' option. - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - out->value = result->data(); + + const std::shared_ptr query_key = options.query_key; + const auto& occurrence = options.occurrence; + + // for debugging + ARROW_LOG(WARNING) << keys->ToString(); + ARROW_LOG(WARNING) << items->ToString(); + ARROW_LOG(WARNING) << "Input: " << struct_array.ToString(); + + if (occurrence == MapArrayLookupOptions::Occurrence::FIRST) { + for (int32_t idx = 0; idx < struct_array.length(); ++idx) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); + + if (key->Equals(*query_key)) { + ARROW_ASSIGN_OR_RAISE(auto result, items->GetScalar(idx)); + out->value = result; + return Status::OK(); + } + } + out->value = MakeNullScalar(items->type()); + } + + else if (occurrence == MapArrayLookupOptions::Occurrence::LAST) { + int32_t last_key_idx_match = -1; + + for (int32_t idx = 0; idx < struct_array.length(); ++idx) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); + + if (key->Equals(*query_key)) { + last_key_idx_match = idx; + } + } + if (last_key_idx_match == -1) { + out->value = MakeNullScalar(items->type()); + } else { + ARROW_ASSIGN_OR_RAISE(auto result, items->GetScalar(last_key_idx_match)); + + ARROW_LOG(WARNING) << result->ToString() << " Type: " << *result->type; + + out->value = result; + } + + } else /* occurrence == MapArrayLookupOptions::Occurrence::ALL) */ { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); + + bool found_one_key = false; + for (int32_t idx = 0; idx < struct_array.length(); ++idx) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); + + if (key->Equals(*query_key)) { + found_one_key = true; + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), idx, 1)); + } + } + if (!found_one_key) { + out->value = MakeNullScalar(list(items->type())); + } else { + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + + ARROW_ASSIGN_OR_RAISE(auto scalar_list, MakeScalar(list(items->type()), result)); + ARROW_LOG(WARNING) << "MapArrayLookup Scalar result: " << result->ToString() + << " Type: " << *scalar_list->type; + out->value = scalar_list; + } + } + return Status::OK(); } }; Result ResolveMapArrayLookupType(KernelContext* ctx, const std::vector& descrs) { + const auto& options = OptionsWrapper::Get(ctx); std::shared_ptr type = descrs.front().type; - std::shared_ptr value_type; - std::shared_ptr key_type; - if (type->id() == Type::MAP) { - std::cout << "map type found!\n"; - key_type = type->field(0)->type()->field(0)->type(); - value_type = type->field(0)->type()->field(1)->type(); - - std::cout << "Value type: " << value_type->ToString() << "\n"; + std::shared_ptr item_type = checked_cast(*type).item_type(); + std::shared_ptr key_type = checked_cast(*type).key_type(); + + if (!options.query_key->type->Equals(key_type)) { + return Status::TypeError( + "map_array_lookup: query_key type and MapArray key_type don't match. Expected " + "type: ", + *item_type, ", but got type: ", *options.query_key->type); + } + + if (options.occurrence == MapArrayLookupOptions::Occurrence::FIRST || + options.occurrence == MapArrayLookupOptions::Occurrence::LAST) { + ARROW_LOG(WARNING) << ValueDescr(item_type, descrs.front().shape).ToString(); + return ValueDescr(item_type, descrs.front().shape); + } else /* occurrence == MapArrayLookupOptions::Occurrence::ALL) */ { + ARROW_LOG(WARNING) << ValueDescr(list(item_type), descrs.front().shape).ToString(); + return ValueDescr(list(item_type), descrs.front().shape); } - return ValueDescr(value_type, descrs.front().shape); } void AddMapArrayLookupKernels(ScalarFunction* func) { - ScalarKernel kernel( - {InputType(Type::MAP, ValueDescr::ARRAY)}, OutputType(ResolveMapArrayLookupType), - MapArrayLookupFunctor::ExecMapArray, OptionsWrapper::Init); - kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(std::move(kernel))); + for (const auto shape : {ValueDescr::ARRAY, ValueDescr::SCALAR}) { + ScalarKernel kernel({InputType(Type::MAP, shape)}, + OutputType(ResolveMapArrayLookupType), + shape == ValueDescr::ARRAY ? MapArrayLookupFunctor::ExecMapArray + : MapArrayLookupFunctor::ExecMapScalar, + OptionsWrapper::Init); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(std::move(kernel))); + } } const FunctionDoc map_array_lookup_doc( "Find the items corresponding to a given key in a MapArray", ("More doc"), - {"container"}, "MapArrayLookupOptions"); + {"container"}, "MapArrayLookupOptions", /*options_required=*/true); } // namespace diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 56ac64b54d0..bcce56f1b43 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -225,28 +225,54 @@ TEST(TestScalarNested, StructField) { } } -TEST(TestScalarNested, MapArrayLookup) { - MapArrayLookupOptions foo_all(MakeScalar("foo"), MapArrayLookupOptions::All); - MapArrayLookupOptions foo_first(MakeScalar("foo"), MapArrayLookupOptions::First); - auto type = map(utf8(), int32()); +TEST(TestScalarNested, MapArrayLookupNonRecursive) { + MapArrayLookupOptions foo_all(MakeScalar("foo"), MapArrayLookupOptions::ALL); + MapArrayLookupOptions foo_first(MakeScalar("foo"), MapArrayLookupOptions::FIRST); + MapArrayLookupOptions foo_last(MakeScalar("foo"), MapArrayLookupOptions::LAST); - auto keys = ArrayFromJSON(utf8(), R"([ - "foo", "bar", "hello", "foo", "lesgo", "whatnow", - "nothing", "hat", "foo", "sorry", "dip", "foo" - ])"); - auto items = ArrayFromJSON(int16(), R"([ - 99, 1, 2, 3, 5, 8, - null, null, 101, 1, null, 22 - ])"); - auto offsets = ArrayFromJSON(int32(), "[0, 6, 6, 12, 12]")->data()->buffers[1]; - auto null_bitmap = ArrayFromJSON(boolean(), "[1, 0, 1, 1]")->data()->buffers[1]; + auto type = map(utf8(), int32()); + const char* input = R"( +[ + [["foo", 99], ["bar", 1], ["hello", 2], ["foo", 3], ["lesgo", 5], ["whatnow", 8]], + null, + [["nothing", null], ["hat", null], ["foo", 101], ["sorry", 1], ["dip", null], ["foo", 22]], + [] + ] +)"; + auto map_array = ArrayFromJSON(type, input); + + CheckScalarNonRecursive( + "map_array_lookup", {map_array}, + ArrayFromJSON(list(int32()), "[[99, 3], null, [101, 22], null]"), &foo_all); + CheckScalarNonRecursive("map_array_lookup", {map_array}, + ArrayFromJSON(int32(), "[99, null, 101, null]"), &foo_first); + CheckScalarNonRecursive("map_array_lookup", {map_array}, + ArrayFromJSON(int32(), "[3, null, 22, null]"), &foo_last); +} - MapArray map_array(type, 4, offsets, keys, items, null_bitmap, 1, 0); +TEST(TestScalarNested, MapArrayLookup) { + MapArrayLookupOptions foo_all(MakeScalar("foo"), MapArrayLookupOptions::ALL); + MapArrayLookupOptions foo_first(MakeScalar("foo"), MapArrayLookupOptions::FIRST); + MapArrayLookupOptions foo_last(MakeScalar("foo"), MapArrayLookupOptions::LAST); - CheckScalarNonRecursive("map_array_lookup", {map_array}, - ArrayFromJSON(int32(), "[99, 3, 101, 22]"), &foo_all); - CheckScalarNonRecursive("map_array_lookup", {map_array}, ArrayFromJSON(int32(), "[99]"), - &foo_first); + auto type = map(utf8(), int32()); + const char* input = R"( +[ + [["foo", 99], ["bar", 1], ["hello", 2], ["foo", 3], ["lesgo", 5], ["whatnow", 8]], + null, + [["nothing", null], ["hat", null], ["foo", 101], ["sorry", 1], ["dip", null], + ["foo", 22]], + [] + ] +)"; + auto map_array = ArrayFromJSON(type, input); + + CheckScalar("map_array_lookup", {map_array}, + ArrayFromJSON(list(int32()), "[[99, 3], null, [101, 22], null]"), &foo_all); + CheckScalar("map_array_lookup", {map_array}, + ArrayFromJSON(int32(), "[99, null, 101, null]"), &foo_first); + CheckScalar("map_array_lookup", {map_array}, + ArrayFromJSON(int32(), "[3, null, 22, null]"), &foo_last); } struct { From 25587861ff885c6628f5d8dfa94827a1502b7a37 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Wed, 19 Jan 2022 18:47:07 +0530 Subject: [PATCH 04/24] Use helper function and offsets instead --- cpp/src/arrow/compute/api_scalar.cc | 2 +- .../arrow/compute/kernels/scalar_nested.cc | 255 +++++++----------- .../compute/kernels/scalar_nested_test.cc | 127 ++++++--- 3 files changed, 196 insertions(+), 188 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 295d1ae8787..d7b8a01bbb6 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -268,7 +268,7 @@ struct EnumTraits case compute::MapArrayLookupOptions::Occurrence::LAST: return "LAST"; case compute::MapArrayLookupOptions::Occurrence::ALL: - return "All"; + return "ALL"; } return ""; } diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 6943649acc8..b6bcf0375eb 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -430,6 +430,25 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray", "MakeStructOptions"}; struct MapArrayLookupFunctor { + static Result FindOneMapValueIndex(const Array& keys, const Scalar& query_key, + const int32_t start, const int32_t end, + const bool from_back = false) { + if (!from_back) { + for (int32_t idx = start; idx < end; ++idx) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys.GetScalar(idx)); + + if (key->Equals(query_key)) return idx; + } + } else { + for (int32_t idx = end - 1; idx >= start; --idx) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys.GetScalar(idx)); + + if (key->Equals(query_key)) return idx; + } + } + return -1; + } + static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = OptionsWrapper::Get(ctx); @@ -437,131 +456,79 @@ struct MapArrayLookupFunctor { std::shared_ptr keys = map_array.keys(); std::shared_ptr items = map_array.items(); + auto offsets = std::dynamic_pointer_cast(map_array.offsets()); const auto& query_key = options.query_key; - const auto& occurence = options.occurrence; + const auto& occurrence = options.occurrence; - if (occurence == MapArrayLookupOptions::Occurrence::FIRST) { + if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { std::unique_ptr builder; - RETURN_NOT_OK( - MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); - - int32_t last_key_idx_checked = 0; - - // aka, number of {key, value} pairs in the current map - int32_t list_struct_len; - bool found_one_key; + std::unique_ptr list_builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), + list(map_array.map_type()->item_type()), &builder)); - for (int32_t map_array_idx = 0; map_array_idx < map_array.length(); + for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { - // Number of Struct('s) = {key, value} in the list at the current index - list_struct_len = map_array.value_length(map_array_idx); - found_one_key = false; - - for (int32_t key_idx_to_check = last_key_idx_checked; - key_idx_to_check < last_key_idx_checked + list_struct_len; - ++key_idx_to_check) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, - keys->GetScalar(key_idx_to_check)); - - if (key->Equals(*query_key)) { - found_one_key = true; - RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_idx_to_check, 1)); - break; - } - } - if (!found_one_key) { + if (!map_array.IsValid(map_array_idx)) { RETURN_NOT_OK(builder->AppendNull()); + continue; } - // new index from where to start checking - last_key_idx_checked += list_struct_len; - } - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - out->value = result->data(); - } else if (occurence == MapArrayLookupOptions::Occurrence::LAST) { - std::unique_ptr builder; - RETURN_NOT_OK( - MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); + int32_t start = offsets->Value(map_array_idx); + int32_t end = offsets->Value(map_array_idx + 1); + bool found_atleast_one_key = false; - int32_t last_key_idx_checked = 0; - - // aka, number of {key, value} pairs in the current map - int32_t list_struct_len; - int32_t last_key_idx_match; - - for (int32_t map_array_idx = 0; map_array_idx < map_array.length(); - ++map_array_idx) { - // Number of Struct('s) = {key, value} in the list at the current index - list_struct_len = map_array.value_length(map_array_idx); - last_key_idx_match = -1; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), + &list_builder)); - for (int32_t key_idx_to_check = last_key_idx_checked; - key_idx_to_check < last_key_idx_checked + list_struct_len; + for (int32_t key_idx_to_check = start; key_idx_to_check < end; ++key_idx_to_check) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(key_idx_to_check)); if (key->Equals(*query_key)) { - last_key_idx_match = key_idx_to_check; + found_atleast_one_key = true; + RETURN_NOT_OK( + list_builder->AppendArraySlice(*items->data(), key_idx_to_check, 1)); } } - if (last_key_idx_match == -1) { + + if (!found_atleast_one_key) { RETURN_NOT_OK(builder->AppendNull()); } else { - RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), last_key_idx_match, 1)); + ARROW_ASSIGN_OR_RAISE(auto list_result, list_builder->Finish()); + RETURN_NOT_OK(builder->AppendScalar(ListScalar(list_result))); } - // new index from where to start checking - last_key_idx_checked += list_struct_len; + + list_builder->Reset(); } ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); out->value = result->data(); - } else /* occurrence == MapArrayLookupOptions::Occurrence::All) */ { + } else { /* occurrence == FIRST || LAST */ std::unique_ptr builder; - std::unique_ptr list_builder; - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), - list(map_array.map_type()->item_type()), &builder)); - - int32_t last_key_idx_checked = 0; - - // aka, number of {key, value} pairs in the current map - int32_t list_struct_len; - bool found_one_key; + RETURN_NOT_OK( + MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); - for (int32_t map_array_idx = 0; map_array_idx < map_array.length(); + for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { - // Number of Struct('s) = {key, value} in the list at the current index - list_struct_len = map_array.value_length(map_array_idx); - found_one_key = false; - - if (list_struct_len > 0) { - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), - &list_builder)); + if (!map_array.IsValid(map_array_idx)) { + RETURN_NOT_OK(builder->AppendNull()); + continue; } - for (int32_t key_idx_to_check = last_key_idx_checked; - key_idx_to_check < last_key_idx_checked + list_struct_len; - ++key_idx_to_check) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, - keys->GetScalar(key_idx_to_check)); + int32_t start = offsets->Value(map_array_idx); + int32_t end = offsets->Value(map_array_idx + 1); + bool from_back = (occurrence == MapArrayLookupOptions::LAST); - if (key->Equals(*query_key)) { - found_one_key = true; - RETURN_NOT_OK( - list_builder->AppendArraySlice(*items->data(), key_idx_to_check, 1)); - } - } - if (!found_one_key) { - RETURN_NOT_OK(builder->AppendNull()); + ARROW_ASSIGN_OR_RAISE( + int32_t key_match_idx, + FindOneMapValueIndex(*keys, *query_key, start, end, from_back)); + if (key_match_idx != -1) { + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_match_idx, 1)); } else { - ARROW_ASSIGN_OR_RAISE(auto list_result, list_builder->Finish()); - RETURN_NOT_OK(builder->AppendScalar(ListScalar(list_result))); + RETURN_NOT_OK(builder->AppendNull()); } - list_builder->Reset(); - - // new index from where to start checking - last_key_idx_checked += list_struct_len; } ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); out->value = result->data(); @@ -571,84 +538,57 @@ struct MapArrayLookupFunctor { static Status ExecMapScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = OptionsWrapper::Get(ctx); + const std::shared_ptr& query_key = options.query_key; + const auto& occurrence = options.occurrence; + std::shared_ptr item_type = + checked_cast(*batch[0].type()).item_type(); const auto& map_scalar = batch[0].scalar_as(); - const auto& struct_array = checked_cast(*map_scalar.value); - const std::shared_ptr keys = struct_array.field(0); - const std::shared_ptr items = struct_array.field(1); if (ARROW_PREDICT_FALSE(!map_scalar.is_valid)) { if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { - out->value = MakeNullScalar(list(items->type())); + out->value = MakeNullScalar(list(item_type)); } else { - out->value = MakeNullScalar(items->type()); + out->value = MakeNullScalar(item_type); } return Status::OK(); } - const std::shared_ptr query_key = options.query_key; - const auto& occurrence = options.occurrence; - - // for debugging - ARROW_LOG(WARNING) << keys->ToString(); - ARROW_LOG(WARNING) << items->ToString(); - ARROW_LOG(WARNING) << "Input: " << struct_array.ToString(); - - if (occurrence == MapArrayLookupOptions::Occurrence::FIRST) { - for (int32_t idx = 0; idx < struct_array.length(); ++idx) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); - - if (key->Equals(*query_key)) { - ARROW_ASSIGN_OR_RAISE(auto result, items->GetScalar(idx)); - out->value = result; - return Status::OK(); - } - } - out->value = MakeNullScalar(items->type()); - } - - else if (occurrence == MapArrayLookupOptions::Occurrence::LAST) { - int32_t last_key_idx_match = -1; - - for (int32_t idx = 0; idx < struct_array.length(); ++idx) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); - - if (key->Equals(*query_key)) { - last_key_idx_match = idx; - } - } - if (last_key_idx_match == -1) { - out->value = MakeNullScalar(items->type()); - } else { - ARROW_ASSIGN_OR_RAISE(auto result, items->GetScalar(last_key_idx_match)); - - ARROW_LOG(WARNING) << result->ToString() << " Type: " << *result->type; - - out->value = result; - } + const auto& struct_array = checked_cast(*map_scalar.value); + const std::shared_ptr keys = struct_array.field(0); + const std::shared_ptr items = struct_array.field(1); - } else /* occurrence == MapArrayLookupOptions::Occurrence::ALL) */ { + if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); - bool found_one_key = false; - for (int32_t idx = 0; idx < struct_array.length(); ++idx) { + bool found_atleast_one_key = false; + for (int64_t idx = 0; idx < struct_array.length(); ++idx) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); if (key->Equals(*query_key)) { - found_one_key = true; + found_atleast_one_key = true; RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), idx, 1)); } } - if (!found_one_key) { + if (!found_atleast_one_key) { out->value = MakeNullScalar(list(items->type())); } else { ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + ARROW_ASSIGN_OR_RAISE(out->value, MakeScalar(list(items->type()), result)); + } + } + + else { /* occurrence == FIRST || LAST */ + bool from_back = (occurrence == MapArrayLookupOptions::LAST); - ARROW_ASSIGN_OR_RAISE(auto scalar_list, MakeScalar(list(items->type()), result)); - ARROW_LOG(WARNING) << "MapArrayLookup Scalar result: " << result->ToString() - << " Type: " << *scalar_list->type; - out->value = scalar_list; + ARROW_ASSIGN_OR_RAISE( + int32_t key_match_idx, + FindOneMapValueIndex(*keys, *query_key, 0, struct_array.length(), from_back)); + if (key_match_idx != -1) { + ARROW_ASSIGN_OR_RAISE(out->value, items->GetScalar(key_match_idx)); + } else { + out->value = MakeNullScalar(items->type()); } } @@ -663,20 +603,18 @@ Result ResolveMapArrayLookupType(KernelContext* ctx, std::shared_ptr item_type = checked_cast(*type).item_type(); std::shared_ptr key_type = checked_cast(*type).key_type(); - if (!options.query_key->type->Equals(key_type)) { + if (!options.query_key || !options.query_key->type || + !options.query_key->type->Equals(key_type)) { return Status::TypeError( "map_array_lookup: query_key type and MapArray key_type don't match. Expected " "type: ", *item_type, ", but got type: ", *options.query_key->type); } - if (options.occurrence == MapArrayLookupOptions::Occurrence::FIRST || - options.occurrence == MapArrayLookupOptions::Occurrence::LAST) { - ARROW_LOG(WARNING) << ValueDescr(item_type, descrs.front().shape).ToString(); - return ValueDescr(item_type, descrs.front().shape); - } else /* occurrence == MapArrayLookupOptions::Occurrence::ALL) */ { - ARROW_LOG(WARNING) << ValueDescr(list(item_type), descrs.front().shape).ToString(); + if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { return ValueDescr(list(item_type), descrs.front().shape); + } else /* occurrence == FIRST || LAST */ { + return ValueDescr(item_type, descrs.front().shape); } } @@ -693,9 +631,14 @@ void AddMapArrayLookupKernels(ScalarFunction* func) { } } -const FunctionDoc map_array_lookup_doc( - "Find the items corresponding to a given key in a MapArray", ("More doc"), - {"container"}, "MapArrayLookupOptions", /*options_required=*/true); +const FunctionDoc map_array_lookup_doc{ + "Find the items corresponding to a given key in a MapArray", + ("For a given query key (passed via MapArrayLookupOptions), extract\n" + "either the FIRST, LAST or ALL items from a MapArray that have\n" + "matching keys."), + {"container"}, + "MapArrayLookupOptions", + /*options_required=*/true}; } // namespace diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index bcce56f1b43..d1d8d485761 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -225,54 +225,119 @@ TEST(TestScalarNested, StructField) { } } -TEST(TestScalarNested, MapArrayLookupNonRecursive) { +TEST(TestScalarNested, MapArrayLookup) { MapArrayLookupOptions foo_all(MakeScalar("foo"), MapArrayLookupOptions::ALL); MapArrayLookupOptions foo_first(MakeScalar("foo"), MapArrayLookupOptions::FIRST); MapArrayLookupOptions foo_last(MakeScalar("foo"), MapArrayLookupOptions::LAST); auto type = map(utf8(), int32()); const char* input = R"( -[ - [["foo", 99], ["bar", 1], ["hello", 2], ["foo", 3], ["lesgo", 5], ["whatnow", 8]], - null, - [["nothing", null], ["hat", null], ["foo", 101], ["sorry", 1], ["dip", null], ["foo", 22]], - [] - ] -)"; + [ + [["foo", 99], ["bar", 1], ["hello", 2], ["foo", 3], ["lesgo", 5], ["whatnow", 8]], + null, + [["nothing", null], ["hat", null], ["foo", 101], ["sorry", 1], ["dip", null], + ["foo", 22]], + [] + ] + )"; auto map_array = ArrayFromJSON(type, input); - CheckScalarNonRecursive( - "map_array_lookup", {map_array}, - ArrayFromJSON(list(int32()), "[[99, 3], null, [101, 22], null]"), &foo_all); - CheckScalarNonRecursive("map_array_lookup", {map_array}, - ArrayFromJSON(int32(), "[99, null, 101, null]"), &foo_first); - CheckScalarNonRecursive("map_array_lookup", {map_array}, - ArrayFromJSON(int32(), "[3, null, 22, null]"), &foo_last); + CheckScalar("map_array_lookup", {map_array}, + ArrayFromJSON(list(int32()), "[[99, 3], null, [101, 22], null]"), &foo_all); + CheckScalar("map_array_lookup", {map_array}, + ArrayFromJSON(int32(), "[99, null, 101, null]"), &foo_first); + CheckScalar("map_array_lookup", {map_array}, + ArrayFromJSON(int32(), "[3, null, 22, null]"), &foo_last); } -TEST(TestScalarNested, MapArrayLookup) { +TEST(TestScalarNested, MapArrayLookupNested) { + auto type = map(utf8(), map(int16(), int16())); + const char* input = R"( + [ + [ + [ + "just", + [[0, 0], [1, 1]] + ], + [ + "random", + [[2, 2], [3, 3]] + ], + [ + "foo", + [[4, 4], [5, 5]] + ], + [ + "values", + [[6, 6], [7, 7]] + ], + [ + "foo", + [[8, 8], [9, 9]] + ], + [ + "point", + [[10, 10], [11, 11]] + ], + [ + "foo", + [[12, 12], [13, 13]] + ] + ], + null, + [ + [ + "yet", + [[0, 1], [1, 2]] + ], + [ + "more", + [[2, 3], [3, 4]] + ], + [ + "foo", + [[4, 5], [5, 6]] + ], + [ + "random", + [[6, 7], [7, 8]] + ], + [ + "foo", + [[8, 9], [9, 10]] + ], + [ + "values", + [[10, 11], [11, 12]] + ], + [ + "foo", + [[12, 13], [13, 14]] + ] + ], + [] + ] + )"; + auto map_array = ArrayFromJSON(type, input); + MapArrayLookupOptions foo_all(MakeScalar("foo"), MapArrayLookupOptions::ALL); MapArrayLookupOptions foo_first(MakeScalar("foo"), MapArrayLookupOptions::FIRST); MapArrayLookupOptions foo_last(MakeScalar("foo"), MapArrayLookupOptions::LAST); - auto type = map(utf8(), int32()); - const char* input = R"( -[ - [["foo", 99], ["bar", 1], ["hello", 2], ["foo", 3], ["lesgo", 5], ["whatnow", 8]], - null, - [["nothing", null], ["hat", null], ["foo", 101], ["sorry", 1], ["dip", null], - ["foo", 22]], - [] - ] -)"; - auto map_array = ArrayFromJSON(type, input); + auto foo_all_output = ArrayFromJSON( + list(map(int16(), int16())), + "[ [[[4, 4], [5, 5]], [[8, 8], [9, 9]], [[12, 12], [13, 13]]], null, [[[4, 5], [5, " + "6]], [[8, 9], [9, 10]], [[12, 13], [13, 14]]], null ]"); + CheckScalar("map_array_lookup", {map_array}, foo_all_output, &foo_all); CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(list(int32()), "[[99, 3], null, [101, 22], null]"), &foo_all); - CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(int32(), "[99, null, 101, null]"), &foo_first); + ArrayFromJSON(map(int16(), int16()), + "[ [[4, 4], [5, 5]], null, [[4, 5], [5, 6]], null ]"), + &foo_first); CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(int32(), "[3, null, 22, null]"), &foo_last); + ArrayFromJSON(map(int16(), int16()), + "[ [[12, 12], [13, 13]], null, [[12, 13], [13, 14]], null ]"), + &foo_last); } struct { From b35acf38966d328e50cd8d9bd1567ed4e565c12a Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Thu, 20 Jan 2022 18:32:51 +0530 Subject: [PATCH 05/24] Add basic templated test --- .../arrow/compute/kernels/scalar_nested.cc | 22 ++++----- .../compute/kernels/scalar_nested_test.cc | 46 +++++++++++++++++++ 2 files changed, 57 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index b6bcf0375eb..40584ca38a8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -430,17 +430,17 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray", "MakeStructOptions"}; struct MapArrayLookupFunctor { - static Result FindOneMapValueIndex(const Array& keys, const Scalar& query_key, - const int32_t start, const int32_t end, + static Result FindOneMapValueIndex(const Array& keys, const Scalar& query_key, + const int64_t start, const int64_t end, const bool from_back = false) { if (!from_back) { - for (int32_t idx = start; idx < end; ++idx) { + for (int64_t idx = start; idx < end; ++idx) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys.GetScalar(idx)); if (key->Equals(query_key)) return idx; } } else { - for (int32_t idx = end - 1; idx >= start; --idx) { + for (int64_t idx = end - 1; idx >= start; --idx) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys.GetScalar(idx)); if (key->Equals(query_key)) return idx; @@ -474,14 +474,14 @@ struct MapArrayLookupFunctor { continue; } - int32_t start = offsets->Value(map_array_idx); - int32_t end = offsets->Value(map_array_idx + 1); + int64_t start = offsets->Value(map_array_idx); + int64_t end = offsets->Value(map_array_idx + 1); bool found_atleast_one_key = false; RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &list_builder)); - for (int32_t key_idx_to_check = start; key_idx_to_check < end; + for (int64_t key_idx_to_check = start; key_idx_to_check < end; ++key_idx_to_check) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(key_idx_to_check)); @@ -517,12 +517,12 @@ struct MapArrayLookupFunctor { continue; } - int32_t start = offsets->Value(map_array_idx); - int32_t end = offsets->Value(map_array_idx + 1); + int64_t start = offsets->Value(map_array_idx); + int64_t end = offsets->Value(map_array_idx + 1); bool from_back = (occurrence == MapArrayLookupOptions::LAST); ARROW_ASSIGN_OR_RAISE( - int32_t key_match_idx, + int64_t key_match_idx, FindOneMapValueIndex(*keys, *query_key, start, end, from_back)); if (key_match_idx != -1) { RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_match_idx, 1)); @@ -583,7 +583,7 @@ struct MapArrayLookupFunctor { bool from_back = (occurrence == MapArrayLookupOptions::LAST); ARROW_ASSIGN_OR_RAISE( - int32_t key_match_idx, + int64_t key_match_idx, FindOneMapValueIndex(*keys, *query_key, 0, struct_array.length(), from_back)); if (key_match_idx != -1) { ARROW_ASSIGN_OR_RAISE(out->value, items->GetScalar(key_match_idx)); diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index d1d8d485761..bbdb4a37dd8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -340,6 +340,52 @@ TEST(TestScalarNested, MapArrayLookupNested) { &foo_last); } +template +class TestMapArrayLookupIntegralKeys : public ::testing ::Test {}; + +TYPED_TEST_SUITE(TestMapArrayLookupIntegralKeys, IntegralArrowTypes); + +TYPED_TEST(TestMapArrayLookupIntegralKeys, StringItems) { + auto type = default_type_instance(); + + auto one_scalar = MakeScalar(type, 1).ValueOrDie(); + MapArrayLookupOptions one_all(one_scalar, MapArrayLookupOptions::ALL); + MapArrayLookupOptions one_first(one_scalar, MapArrayLookupOptions::FIRST); + MapArrayLookupOptions one_last(one_scalar, MapArrayLookupOptions::LAST); + + auto map_type = map(type, utf8()); + const char* input = R"( + [ + [ + [0, "zero"], [1, "first_one"], [2, "two"], [3, "three"], [1, "second_one"], + [1, "last_one"] + ], + null, + [ + [0, "zero_hero"], [9, "almost_six"], [1, "the_dumb_one"], [7, "eleven"], + [1, "the_chosen_one"], [42, "meaning of life?"], [1, "just_one"], + [1, "no more ones!"] + ], + [] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + CheckScalar("map_array_lookup", {map_array}, + ArrayFromJSON(utf8(), R"(["first_one", null, "the_dumb_one", null])"), + &one_first); + CheckScalar("map_array_lookup", {map_array}, + ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null])"), + &one_last); + CheckScalar("map_array_lookup", {map_array}, ArrayFromJSON(list(utf8()), R"([ + ["first_one", "second_one", "last_one"], + null, + ["the_dumb_one", "the_chosen_one", "just_one", "no more ones!"], + null + ] + )"), + &one_all); +} + struct { Result operator()(std::vector args) { return CallFunction("make_struct", args); From f0473fabf6cc98b9f159a96c8f6a3f3a6eeb0088 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Fri, 21 Jan 2022 18:56:35 +0530 Subject: [PATCH 06/24] Even more templated tests --- .../compute/kernels/scalar_nested_test.cc | 243 +++++++++++++----- 1 file changed, 185 insertions(+), 58 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index bbdb4a37dd8..53a56ccb5da 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -225,32 +225,41 @@ TEST(TestScalarNested, StructField) { } } -TEST(TestScalarNested, MapArrayLookup) { - MapArrayLookupOptions foo_all(MakeScalar("foo"), MapArrayLookupOptions::ALL); - MapArrayLookupOptions foo_first(MakeScalar("foo"), MapArrayLookupOptions::FIRST); - MapArrayLookupOptions foo_last(MakeScalar("foo"), MapArrayLookupOptions::LAST); +void CheckMapArrayLookupWithDifferentOptions( + const std::shared_ptr& map, const std::shared_ptr& query_key, + const std::shared_ptr& expected_all, + const std::shared_ptr& expected_first, + const std::shared_ptr& expected_last) { + MapArrayLookupOptions all_matches(query_key, MapArrayLookupOptions::ALL); + MapArrayLookupOptions first_matches(query_key, MapArrayLookupOptions::FIRST); + MapArrayLookupOptions last_matches(query_key, MapArrayLookupOptions::LAST); + + CheckScalar("map_array_lookup", {map}, expected_all, &all_matches); + CheckScalar("map_array_lookup", {map}, expected_first, &first_matches); + CheckScalar("map_array_lookup", {map}, expected_last, &last_matches); +} + +class TestMapArrayLookupKernel : public ::testing::Test {}; +TEST_F(TestMapArrayLookupKernel, Basic) { auto type = map(utf8(), int32()); const char* input = R"( [ - [["foo", 99], ["bar", 1], ["hello", 2], ["foo", 3], ["lesgo", 5], ["whatnow", 8]], + [["foo", 99], ["bar", 1], ["hello", 2], ["foo", 3], ["lets go", 5], ["what now?", 8]], null, [["nothing", null], ["hat", null], ["foo", 101], ["sorry", 1], ["dip", null], ["foo", 22]], [] - ] - )"; + ])"; auto map_array = ArrayFromJSON(type, input); - - CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(list(int32()), "[[99, 3], null, [101, 22], null]"), &foo_all); - CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(int32(), "[99, null, 101, null]"), &foo_first); - CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(int32(), "[3, null, 22, null]"), &foo_last); + CheckMapArrayLookupWithDifferentOptions( + map_array, MakeScalar("foo"), + ArrayFromJSON(list(int32()), R"([[99, 3], null, [101, 22], null])"), + ArrayFromJSON(int32(), R"([99, null, 101, null])"), + ArrayFromJSON(int32(), R"([3, null, 22, null])")); } -TEST(TestScalarNested, MapArrayLookupNested) { +TEST_F(TestMapArrayLookupKernel, NestedItems) { auto type = map(utf8(), map(int16(), int16())); const char* input = R"( [ @@ -318,46 +327,51 @@ TEST(TestScalarNested, MapArrayLookupNested) { [] ] )"; - auto map_array = ArrayFromJSON(type, input); - - MapArrayLookupOptions foo_all(MakeScalar("foo"), MapArrayLookupOptions::ALL); - MapArrayLookupOptions foo_first(MakeScalar("foo"), MapArrayLookupOptions::FIRST); - MapArrayLookupOptions foo_last(MakeScalar("foo"), MapArrayLookupOptions::LAST); - - auto foo_all_output = ArrayFromJSON( - list(map(int16(), int16())), - "[ [[[4, 4], [5, 5]], [[8, 8], [9, 9]], [[12, 12], [13, 13]]], null, [[[4, 5], [5, " - "6]], [[8, 9], [9, 10]], [[12, 13], [13, 14]]], null ]"); - - CheckScalar("map_array_lookup", {map_array}, foo_all_output, &foo_all); - CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(map(int16(), int16()), - "[ [[4, 4], [5, 5]], null, [[4, 5], [5, 6]], null ]"), - &foo_first); - CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(map(int16(), int16()), - "[ [[12, 12], [13, 13]], null, [[12, 13], [13, 14]], null ]"), - &foo_last); + const auto map_array = ArrayFromJSON(type, input); + const auto expected_all = ArrayFromJSON(list(map(int16(), int16())), R"( + [ + [ + [[4, 4], [5, 5]], [[8, 8], [9, 9]], + [[12, 12], [13, 13]] + ], + null, + [ + [[4, 5], [5, 6]], [[8, 9], [9, 10]], + [[12, 13], [13, 14]] + ], + null + ])"); + const auto expected_first = ArrayFromJSON(map(int16(), int16()), R"( + [ + [[4, 4], [5, 5]], + null, + [[4, 5], [5, 6]], + null + ])"); + const auto expected_last = ArrayFromJSON(map(int16(), int16()), R"( + [ + [[12, 12], [13, 13]], + null, + [[12, 13], [13, 14]], + null + ])"); + CheckMapArrayLookupWithDifferentOptions(map_array, MakeScalar("foo"), expected_all, + expected_first, expected_last); } -template +template class TestMapArrayLookupIntegralKeys : public ::testing ::Test {}; -TYPED_TEST_SUITE(TestMapArrayLookupIntegralKeys, IntegralArrowTypes); +TYPED_TEST_SUITE(TestMapArrayLookupIntegralKeys, PhysicalIntegralArrowTypes); TYPED_TEST(TestMapArrayLookupIntegralKeys, StringItems) { - auto type = default_type_instance(); - - auto one_scalar = MakeScalar(type, 1).ValueOrDie(); - MapArrayLookupOptions one_all(one_scalar, MapArrayLookupOptions::ALL); - MapArrayLookupOptions one_first(one_scalar, MapArrayLookupOptions::FIRST); - MapArrayLookupOptions one_last(one_scalar, MapArrayLookupOptions::LAST); + std::shared_ptr type = default_type_instance(); auto map_type = map(type, utf8()); const char* input = R"( [ [ - [0, "zero"], [1, "first_one"], [2, "two"], [3, "three"], [1, "second_one"], + [0, "zero"], [1, "first_one"], [2, "two"], [1, null], [3, "three"], [1, "second_one"], [1, "last_one"] ], null, @@ -366,24 +380,137 @@ TYPED_TEST(TestMapArrayLookupIntegralKeys, StringItems) { [1, "the_chosen_one"], [42, "meaning of life?"], [1, "just_one"], [1, "no more ones!"] ], + [ + [4, "this"], [6, "has"], [8, "no"], [2, "ones"] + ], + [ + [1, "this"], [1, "should"], [1, "also"], [1, "be"], [1, "null"] + ], + [] + ])"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 4, false); + + auto expected_all = ArrayFromJSON(list(utf8()), R"( + [ + ["first_one", null, "second_one", "last_one"], + null, + ["the_dumb_one", "the_chosen_one", "just_one", "no more ones!"], + null, + null, + null + ])"); + auto expected_first = + ArrayFromJSON(utf8(), R"(["first_one", null, "the_dumb_one", null, null, null])"); + auto expected_last = + ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null, null, null])"); + + CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, + MakeScalar(type, 1).ValueOrDie(), expected_all, + expected_first, expected_last); +} +template +class TestMapArrayLookupDecimalKeys : public ::testing ::Test { + protected: + std::shared_ptr GetType() { + return std::make_shared(/*precision=*/5, + /*scale=*/4); + } +}; + +TYPED_TEST_SUITE(TestMapArrayLookupDecimalKeys, DecimalArrowTypes); + +TYPED_TEST(TestMapArrayLookupDecimalKeys, StringItems) { + std::shared_ptr type = this->GetType(); + auto key_scalar = DecimalScalarFromJSON(type, R"("1.2345")"); + + auto map_type = map(type, utf8()); + const char* input = R"( + [ + [ + ["0.8923", "zero"], ["1.2345", "first_one"], ["2.7001", "two"], + ["1.2345", null], ["3.2234", "three"], ["1.2345", "second_one"], + ["1.2345", "last_one"] + ], + null, + [ + ["0.0012", "zero_hero"], ["9.0093", "almost_six"], ["1.2345", "the_dumb_one"], + ["7.6587", "eleven"], ["1.2345", "the_chosen_one"], ["4.2000", "meaning of life"], + ["1.2345", "just_one"], ["1.2345", "no more ones!"] + ], + [ + ["4.8794", "this"], ["6.2345", "has"], ["8.6649", "no"], ["0.0122", "ones"] + ], + [ + ["1.2345", "this"], ["1.2345", "should"], ["1.2345", "also"], ["1.2345", "be"], ["1.2345", "null"] + ], [] ] )"; auto map_array = ArrayFromJSON(map_type, input); - CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(utf8(), R"(["first_one", null, "the_dumb_one", null])"), - &one_first); - CheckScalar("map_array_lookup", {map_array}, - ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null])"), - &one_last); - CheckScalar("map_array_lookup", {map_array}, ArrayFromJSON(list(utf8()), R"([ - ["first_one", "second_one", "last_one"], - null, - ["the_dumb_one", "the_chosen_one", "just_one", "no more ones!"], - null - ] - )"), - &one_all); + auto map_array_tweaked = TweakValidityBit(map_array, 4, false); + + auto expected_first = + ArrayFromJSON(utf8(), R"(["first_one", null, "the_dumb_one", null, null, null])"); + auto expected_last = + ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null, null, null])"); + auto expected_all = ArrayFromJSON(list(utf8()), + R"([ + ["first_one", null, "second_one", "last_one"], + null, + ["the_dumb_one", "the_chosen_one", "just_one", "no more ones!"], + null, + null, + null + ] + )"); + CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, key_scalar, expected_all, + expected_first, expected_last); +} + +template +class TestMapArrayLookupBinaryKeys : public ::testing ::Test { + protected: + std::shared_ptr GetType() { return TypeTraits::type_singleton(); } +}; + +TYPED_TEST_SUITE(TestMapArrayLookupBinaryKeys, BaseBinaryArrowTypes); + +TYPED_TEST(TestMapArrayLookupBinaryKeys, IntegralItems) { + auto key_type = this->GetType(); + auto sheesh_scalar = ScalarFromJSON(key_type, R"("sheesh")"); + + auto map_type = map(key_type, int32()); + const char* input = R"( + [ + [ + ["sheesh", 99], ["yolo", 1], ["yay", 2], ["sheesh", null], ["no way!", 5], + ["sheesh", 8] + ], + null, + [ + ["hmm", null], ["sheesh", 67], ["snacc", 101], ["awesome", 1], ["dap", null], + ["yolo", 9], ["sheesh", 80] + ], + [], + [ + ["nope", 1], ["no", 2], ["sheeshes", 3], ["here!", 4] + ], + [ + ["sheesh", 9], ["sheesh", 2], ["sheesh", 5], ["sheesh", 8] + ] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 5, false); + + auto expected_all = ArrayFromJSON(list(int32()), R"( + [[99, null, 8], null, [67, 80], null, null, null ])"); + auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); + auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); + + CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, sheesh_scalar, expected_all, + expected_first, expected_last); } struct { From 6e371b8221899656826160dc7e74113bc61a0dbd Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Fri, 21 Jan 2022 22:51:05 +0530 Subject: [PATCH 07/24] Try and refactor MapArrayLookup kernels --- .../arrow/compute/kernels/scalar_nested.cc | 166 +++++++----------- 1 file changed, 59 insertions(+), 107 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 40584ca38a8..02cd79ce162 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -449,97 +449,86 @@ struct MapArrayLookupFunctor { return -1; } - static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + static Result> GetScalarOutput(KernelContext* ctx, + const MapScalar map_scalar) { const auto& options = OptionsWrapper::Get(ctx); + const std::shared_ptr& query_key = options.query_key; + const auto& occurrence = options.occurrence; - const MapArray map_array(batch[0].array()); - - std::shared_ptr keys = map_array.keys(); - std::shared_ptr items = map_array.items(); - auto offsets = std::dynamic_pointer_cast(map_array.offsets()); + const auto& struct_array = checked_cast(*map_scalar.value); + const std::shared_ptr keys = struct_array.field(0); + const std::shared_ptr items = struct_array.field(1); - const auto& query_key = options.query_key; - const auto& occurrence = options.occurrence; + std::shared_ptr output; if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { std::unique_ptr builder; - std::unique_ptr list_builder; - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), - list(map_array.map_type()->item_type()), &builder)); - - for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); - ++map_array_idx) { - if (!map_array.IsValid(map_array_idx)) { - RETURN_NOT_OK(builder->AppendNull()); - continue; - } - - int64_t start = offsets->Value(map_array_idx); - int64_t end = offsets->Value(map_array_idx + 1); - bool found_atleast_one_key = false; - - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), - &list_builder)); + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); - for (int64_t key_idx_to_check = start; key_idx_to_check < end; - ++key_idx_to_check) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, - keys->GetScalar(key_idx_to_check)); + bool found_at_least_one_key = false; + for (int64_t idx = 0; idx < struct_array.length(); ++idx) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); - if (key->Equals(*query_key)) { - found_atleast_one_key = true; - RETURN_NOT_OK( - list_builder->AppendArraySlice(*items->data(), key_idx_to_check, 1)); - } + if (key->Equals(*query_key)) { + found_at_least_one_key = true; + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), idx, 1)); } + } + if (!found_at_least_one_key) { + output = MakeNullScalar(list(items->type())); + } else { + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + ARROW_ASSIGN_OR_RAISE(output, MakeScalar(list(items->type()), result)); + } + } - if (!found_atleast_one_key) { - RETURN_NOT_OK(builder->AppendNull()); - } else { - ARROW_ASSIGN_OR_RAISE(auto list_result, list_builder->Finish()); - RETURN_NOT_OK(builder->AppendScalar(ListScalar(list_result))); - } + else { /* occurrence == FIRST || LAST */ + bool from_back = (occurrence == MapArrayLookupOptions::LAST); - list_builder->Reset(); + ARROW_ASSIGN_OR_RAISE( + int64_t key_match_idx, + FindOneMapValueIndex(*keys, *query_key, 0, struct_array.length(), from_back)); + if (key_match_idx != -1) { + ARROW_ASSIGN_OR_RAISE(output, items->GetScalar(key_match_idx)); + } else { + output = MakeNullScalar(items->type()); } - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - out->value = result->data(); + } + return output; + } - } else { /* occurrence == FIRST || LAST */ - std::unique_ptr builder; - RETURN_NOT_OK( - MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); + static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& options = OptionsWrapper::Get(ctx); - for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); - ++map_array_idx) { - if (!map_array.IsValid(map_array_idx)) { - RETURN_NOT_OK(builder->AppendNull()); - continue; - } + const MapArray map_array(batch[0].array()); - int64_t start = offsets->Value(map_array_idx); - int64_t end = offsets->Value(map_array_idx + 1); - bool from_back = (occurrence == MapArrayLookupOptions::LAST); - - ARROW_ASSIGN_OR_RAISE( - int64_t key_match_idx, - FindOneMapValueIndex(*keys, *query_key, start, end, from_back)); - if (key_match_idx != -1) { - RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_match_idx, 1)); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } + std::unique_ptr builder; + if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), + list(map_array.map_type()->item_type()), &builder)); + } else { + RETURN_NOT_OK( + MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); + } + for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { + if (!map_array.IsValid(map_array_idx)) { + RETURN_NOT_OK(builder->AppendNull()); + continue; + } else { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + map_array.GetScalar(map_array_idx)); + auto map_scalar = std::static_pointer_cast(scalar); + ARROW_ASSIGN_OR_RAISE(auto scalar_output, GetScalarOutput(ctx, *map_scalar)); + RETURN_NOT_OK(builder->AppendScalar(*scalar_output)); } - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - out->value = result->data(); } + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + out->value = result->data(); return Status::OK(); } static Status ExecMapScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = OptionsWrapper::Get(ctx); - const std::shared_ptr& query_key = options.query_key; - const auto& occurrence = options.occurrence; std::shared_ptr item_type = checked_cast(*batch[0].type()).item_type(); @@ -554,44 +543,7 @@ struct MapArrayLookupFunctor { return Status::OK(); } - const auto& struct_array = checked_cast(*map_scalar.value); - const std::shared_ptr keys = struct_array.field(0); - const std::shared_ptr items = struct_array.field(1); - - if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); - - bool found_atleast_one_key = false; - for (int64_t idx = 0; idx < struct_array.length(); ++idx) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); - - if (key->Equals(*query_key)) { - found_atleast_one_key = true; - RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), idx, 1)); - } - } - if (!found_atleast_one_key) { - out->value = MakeNullScalar(list(items->type())); - } else { - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - ARROW_ASSIGN_OR_RAISE(out->value, MakeScalar(list(items->type()), result)); - } - } - - else { /* occurrence == FIRST || LAST */ - bool from_back = (occurrence == MapArrayLookupOptions::LAST); - - ARROW_ASSIGN_OR_RAISE( - int64_t key_match_idx, - FindOneMapValueIndex(*keys, *query_key, 0, struct_array.length(), from_back)); - if (key_match_idx != -1) { - ARROW_ASSIGN_OR_RAISE(out->value, items->GetScalar(key_match_idx)); - } else { - out->value = MakeNullScalar(items->type()); - } - } - + ARROW_ASSIGN_OR_RAISE(out->value, GetScalarOutput(ctx, map_scalar)); return Status::OK(); } }; From 012ff35038b4f78077296f19670df9e75b5bda61 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Tue, 25 Jan 2022 18:14:43 +0530 Subject: [PATCH 08/24] Pass references to helper to reduce (un)/boxing --- .../arrow/compute/kernels/scalar_nested.cc | 68 +++++++++---------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 02cd79ce162..c67c1e0c3b5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -449,52 +449,42 @@ struct MapArrayLookupFunctor { return -1; } - static Result> GetScalarOutput(KernelContext* ctx, - const MapScalar map_scalar) { - const auto& options = OptionsWrapper::Get(ctx); - const std::shared_ptr& query_key = options.query_key; - const auto& occurrence = options.occurrence; - - const auto& struct_array = checked_cast(*map_scalar.value); - const std::shared_ptr keys = struct_array.field(0); - const std::shared_ptr items = struct_array.field(1); - - std::shared_ptr output; - - if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { + static Status SetScalarOutput(const Array& keys, const Array& items, KernelContext* ctx, + const MapArrayLookupOptions& options, + std::shared_ptr& out) { + const Scalar& query_key = *options.query_key; + if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder)); bool found_at_least_one_key = false; - for (int64_t idx = 0; idx < struct_array.length(); ++idx) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys->GetScalar(idx)); + for (int64_t idx = 0; idx < keys.length(); ++idx) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys.GetScalar(idx)); - if (key->Equals(*query_key)) { + if (key->Equals(query_key)) { found_at_least_one_key = true; - RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), idx, 1)); + RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), idx, 1)); } } if (!found_at_least_one_key) { - output = MakeNullScalar(list(items->type())); + out = MakeNullScalar(list(items.type())); } else { ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - ARROW_ASSIGN_OR_RAISE(output, MakeScalar(list(items->type()), result)); + ARROW_ASSIGN_OR_RAISE(out, MakeScalar(list(items.type()), result)); } - } - - else { /* occurrence == FIRST || LAST */ - bool from_back = (occurrence == MapArrayLookupOptions::LAST); + } else { /* occurrence == FIRST || LAST */ + bool from_back = (options.occurrence == MapArrayLookupOptions::LAST); ARROW_ASSIGN_OR_RAISE( int64_t key_match_idx, - FindOneMapValueIndex(*keys, *query_key, 0, struct_array.length(), from_back)); + FindOneMapValueIndex(keys, query_key, 0, keys.length(), from_back)); if (key_match_idx != -1) { - ARROW_ASSIGN_OR_RAISE(output, items->GetScalar(key_match_idx)); + ARROW_ASSIGN_OR_RAISE(out, items.GetScalar(key_match_idx)); } else { - output = MakeNullScalar(items->type()); + out = MakeNullScalar(items.type()); } } - return output; + return Status::OK(); } static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { @@ -510,16 +500,18 @@ struct MapArrayLookupFunctor { RETURN_NOT_OK( MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); } + for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { if (!map_array.IsValid(map_array_idx)) { RETURN_NOT_OK(builder->AppendNull()); continue; } else { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - map_array.GetScalar(map_array_idx)); - auto map_scalar = std::static_pointer_cast(scalar); - ARROW_ASSIGN_OR_RAISE(auto scalar_output, GetScalarOutput(ctx, *map_scalar)); - RETURN_NOT_OK(builder->AppendScalar(*scalar_output)); + auto map = map_array.value_slice(map_array_idx); + auto keys = checked_cast(*map).field(0); + auto items = checked_cast(*map).field(1); + std::shared_ptr output; + RETURN_NOT_OK(SetScalarOutput(*keys, *items, ctx, options, output)); + RETURN_NOT_OK(builder->AppendScalar(*output)); } } ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); @@ -543,7 +535,13 @@ struct MapArrayLookupFunctor { return Status::OK(); } - ARROW_ASSIGN_OR_RAISE(out->value, GetScalarOutput(ctx, map_scalar)); + const auto& struct_array = checked_cast(*map_scalar.value); + const std::shared_ptr keys = struct_array.field(0); + const std::shared_ptr items = struct_array.field(1); + std::shared_ptr output; + + RETURN_NOT_OK(SetScalarOutput(*keys, *items, ctx, options, output)); + out->value = output; return Status::OK(); } }; @@ -565,7 +563,7 @@ Result ResolveMapArrayLookupType(KernelContext* ctx, if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { return ValueDescr(list(item_type), descrs.front().shape); - } else /* occurrence == FIRST || LAST */ { + } else { /* occurrence == FIRST || LAST */ return ValueDescr(item_type, descrs.front().shape); } } From 8f287a2db0d794a2a2b43c0ded3617dc0126157d Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Wed, 26 Jan 2022 02:36:19 +0530 Subject: [PATCH 09/24] Use offsets and refactor for ALL --- .../arrow/compute/kernels/scalar_nested.cc | 147 +++++++++++------- .../compute/kernels/scalar_nested_test.cc | 28 ++-- 2 files changed, 110 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index c67c1e0c3b5..71b58019b1d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -449,81 +449,98 @@ struct MapArrayLookupFunctor { return -1; } - static Status SetScalarOutput(const Array& keys, const Array& items, KernelContext* ctx, - const MapArrayLookupOptions& options, - std::shared_ptr& out) { - const Scalar& query_key = *options.query_key; - if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder)); - - bool found_at_least_one_key = false; - for (int64_t idx = 0; idx < keys.length(); ++idx) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys.GetScalar(idx)); - - if (key->Equals(query_key)) { - found_at_least_one_key = true; - RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), idx, 1)); - } - } - if (!found_at_least_one_key) { - out = MakeNullScalar(list(items.type())); - } else { - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - ARROW_ASSIGN_OR_RAISE(out, MakeScalar(list(items.type()), result)); - } - } else { /* occurrence == FIRST || LAST */ - bool from_back = (options.occurrence == MapArrayLookupOptions::LAST); - - ARROW_ASSIGN_OR_RAISE( - int64_t key_match_idx, - FindOneMapValueIndex(keys, query_key, 0, keys.length(), from_back)); - if (key_match_idx != -1) { - ARROW_ASSIGN_OR_RAISE(out, items.GetScalar(key_match_idx)); - } else { - out = MakeNullScalar(items.type()); + static Result> GetBuiltArray( + const Array& keys, const Array& items, const Scalar& query_key, + bool& found_at_least_one_key, const int64_t& start, const int64_t& end, + KernelContext* ctx) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder)); + for (int64_t key_idx_to_check = start; key_idx_to_check < end; ++key_idx_to_check) { + ARROW_ASSIGN_OR_RAISE(auto key, keys.GetScalar(key_idx_to_check)); + if (key->Equals(query_key)) { + found_at_least_one_key = true; + RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), key_idx_to_check, 1)); } } - return Status::OK(); + return std::move(builder); } static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = OptionsWrapper::Get(ctx); - + const auto& query_key = options.query_key; + const auto& occurrence = options.occurrence; const MapArray map_array(batch[0].array()); + std::shared_ptr keys = map_array.keys(); + std::shared_ptr items = map_array.items(); + auto offsets = std::dynamic_pointer_cast(map_array.offsets()); + std::unique_ptr builder; - if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { + if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(map_array.map_type()->item_type()), &builder)); + + for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); + ++map_array_idx) { + if (!map_array.IsValid(map_array_idx)) { + RETURN_NOT_OK(builder->AppendNull()); + continue; + } + + int64_t start = offsets->Value(map_array_idx); + int64_t end = offsets->Value(map_array_idx + 1); + std::unique_ptr list_builder; + bool found_at_least_one_key = false; + ARROW_ASSIGN_OR_RAISE( + list_builder, GetBuiltArray(*keys, *items, *query_key, found_at_least_one_key, + start, end, ctx)); + if (!found_at_least_one_key) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + ARROW_ASSIGN_OR_RAISE(auto list_result, list_builder->Finish()); + RETURN_NOT_OK(builder->AppendScalar(ListScalar(list_result))); + } + list_builder->Reset(); + } + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + out->value = result->data(); } else { RETURN_NOT_OK( MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); - } - for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { - if (!map_array.IsValid(map_array_idx)) { - RETURN_NOT_OK(builder->AppendNull()); - continue; - } else { - auto map = map_array.value_slice(map_array_idx); - auto keys = checked_cast(*map).field(0); - auto items = checked_cast(*map).field(1); - std::shared_ptr output; - RETURN_NOT_OK(SetScalarOutput(*keys, *items, ctx, options, output)); - RETURN_NOT_OK(builder->AppendScalar(*output)); + for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); + ++map_array_idx) { + if (!map_array.IsValid(map_array_idx)) { + RETURN_NOT_OK(builder->AppendNull()); + continue; + } + int64_t start = offsets->Value(map_array_idx); + int64_t end = offsets->Value(map_array_idx + 1); + bool from_back = (occurrence == MapArrayLookupOptions::LAST); + + ARROW_ASSIGN_OR_RAISE( + int64_t key_match_idx, + FindOneMapValueIndex(*keys, *query_key, start, end, from_back)); + if (key_match_idx != -1) { + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_match_idx, 1)); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } } + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + out->value = result->data(); } - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - out->value = result->data(); + return Status::OK(); } static Status ExecMapScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = OptionsWrapper::Get(ctx); + const auto& query_key = options.query_key; + const auto& occurrence = options.occurrence; + std::shared_ptr item_type = checked_cast(*batch[0].type()).item_type(); - const auto& map_scalar = batch[0].scalar_as(); if (ARROW_PREDICT_FALSE(!map_scalar.is_valid)) { @@ -538,10 +555,32 @@ struct MapArrayLookupFunctor { const auto& struct_array = checked_cast(*map_scalar.value); const std::shared_ptr keys = struct_array.field(0); const std::shared_ptr items = struct_array.field(1); - std::shared_ptr output; - RETURN_NOT_OK(SetScalarOutput(*keys, *items, ctx, options, output)); - out->value = output; + if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { + bool found_at_least_one_key = false; + std::unique_ptr builder; + ARROW_ASSIGN_OR_RAISE( + builder, GetBuiltArray(*keys, *items, *query_key, found_at_least_one_key, 0, + struct_array.length(), ctx)); + + if (!found_at_least_one_key) { + out->value = MakeNullScalar(list(items->type())); + } else { + ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + ARROW_ASSIGN_OR_RAISE(out->value, MakeScalar(list(items->type()), result)); + } + } else { /* occurrence == FIRST || LAST */ + bool from_back = (occurrence == MapArrayLookupOptions::LAST); + + ARROW_ASSIGN_OR_RAISE( + int64_t key_match_idx, + FindOneMapValueIndex(*keys, *query_key, 0, struct_array.length(), from_back)); + if (key_match_idx != -1) { + ARROW_ASSIGN_OR_RAISE(out->value, items->GetScalar(key_match_idx)); + } else { + out->value = MakeNullScalar(items->type()); + } + } return Status::OK(); } }; diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 53a56ccb5da..9ad40ce56b4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -360,14 +360,18 @@ TEST_F(TestMapArrayLookupKernel, NestedItems) { } template -class TestMapArrayLookupIntegralKeys : public ::testing ::Test {}; +class TestMapArrayLookupIntegralKeys : public ::testing ::Test { + protected: + std::shared_ptr type_singleton() const { + std::shared_ptr type = default_type_instance(); + return map(type, utf8()); + } +}; TYPED_TEST_SUITE(TestMapArrayLookupIntegralKeys, PhysicalIntegralArrowTypes); TYPED_TEST(TestMapArrayLookupIntegralKeys, StringItems) { - std::shared_ptr type = default_type_instance(); - - auto map_type = map(type, utf8()); + auto map_type = this->type_singleton(); const char* input = R"( [ [ @@ -405,14 +409,14 @@ TYPED_TEST(TestMapArrayLookupIntegralKeys, StringItems) { auto expected_last = ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null, null, null])"); - CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, - MakeScalar(type, 1).ValueOrDie(), expected_all, - expected_first, expected_last); + CheckMapArrayLookupWithDifferentOptions( + map_array_tweaked, MakeScalar(default_type_instance(), 1).ValueOrDie(), + expected_all, expected_first, expected_last); } template class TestMapArrayLookupDecimalKeys : public ::testing ::Test { protected: - std::shared_ptr GetType() { + std::shared_ptr type_singleton() const { return std::make_shared(/*precision=*/5, /*scale=*/4); } @@ -421,7 +425,7 @@ class TestMapArrayLookupDecimalKeys : public ::testing ::Test { TYPED_TEST_SUITE(TestMapArrayLookupDecimalKeys, DecimalArrowTypes); TYPED_TEST(TestMapArrayLookupDecimalKeys, StringItems) { - std::shared_ptr type = this->GetType(); + std::shared_ptr type = this->type_singleton(); auto key_scalar = DecimalScalarFromJSON(type, R"("1.2345")"); auto map_type = map(type, utf8()); @@ -471,13 +475,15 @@ TYPED_TEST(TestMapArrayLookupDecimalKeys, StringItems) { template class TestMapArrayLookupBinaryKeys : public ::testing ::Test { protected: - std::shared_ptr GetType() { return TypeTraits::type_singleton(); } + std::shared_ptr type_singleton() const { + return TypeTraits::type_singleton(); + } }; TYPED_TEST_SUITE(TestMapArrayLookupBinaryKeys, BaseBinaryArrowTypes); TYPED_TEST(TestMapArrayLookupBinaryKeys, IntegralItems) { - auto key_type = this->GetType(); + auto key_type = this->type_singleton(); auto sheesh_scalar = ScalarFromJSON(key_type, R"("sheesh")"); auto map_type = map(key_type, int32()); From dadc4bf96ef6b366f0ace9e306ae631fbd2848d2 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Wed, 26 Jan 2022 20:25:46 +0530 Subject: [PATCH 10/24] Template kernel --- cpp/src/arrow/compute/api_scalar.h | 3 +- .../arrow/compute/kernels/scalar_nested.cc | 140 ++++++++++++++---- .../compute/kernels/scalar_nested_test.cc | 36 +++++ 3 files changed, 150 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index bc05ba22b44..ebeb128b029 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -482,7 +482,8 @@ class ARROW_EXPORT MapArrayLookupOptions : public FunctionOptions { ALL }; - MapArrayLookupOptions(std::shared_ptr query_key, Occurrence occurrence = ALL); + explicit MapArrayLookupOptions(std::shared_ptr query_key, + Occurrence occurrence = ALL); MapArrayLookupOptions(); constexpr static char const kTypeName[] = "MapArrayLookupOptions"; diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 71b58019b1d..f3fcb74a97e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -428,40 +428,82 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray", "specified through MakeStructOptions."), {"*args"}, "MakeStructOptions"}; - +template struct MapArrayLookupFunctor { - static Result FindOneMapValueIndex(const Array& keys, const Scalar& query_key, + static Result FindOneMapValueIndex(const Array& keys, + const Scalar& query_key_scalar, const int64_t start, const int64_t end, const bool from_back = false) { - if (!from_back) { - for (int64_t idx = start; idx < end; ++idx) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys.GetScalar(idx)); - - if (key->Equals(query_key)) return idx; - } - } else { - for (int64_t idx = end - 1; idx >= start; --idx) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr key, keys.GetScalar(idx)); + const auto query_key = UnboxScalar::Unbox(query_key_scalar); + int64_t index = 0; + int64_t match_idx = -1; + ARROW_UNUSED(VisitArrayValuesInline( + *keys.data(), + [&](decltype(query_key) key) -> Status { + if (index < start) { + ++index; + return Status::OK(); + } else if (index < end) { + if (key == query_key) { + if (!from_back) { + match_idx = index; + return Status::Cancelled("Found first matching key"); + } else { + match_idx = index; + } + } + ++index; + return Status::OK(); + } else { + return Status::Cancelled("End reached"); + } + }, + [&]() -> Status { + if (index < end) { + ++index; + return Status::OK(); + } else { + return Status::Cancelled("End reached"); + } + })); - if (key->Equals(query_key)) return idx; - } - } - return -1; + return match_idx; } static Result> GetBuiltArray( - const Array& keys, const Array& items, const Scalar& query_key, + const Array& keys, const Array& items, const Scalar& query_key_scalar, bool& found_at_least_one_key, const int64_t& start, const int64_t& end, KernelContext* ctx) { std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder)); - for (int64_t key_idx_to_check = start; key_idx_to_check < end; ++key_idx_to_check) { - ARROW_ASSIGN_OR_RAISE(auto key, keys.GetScalar(key_idx_to_check)); - if (key->Equals(query_key)) { - found_at_least_one_key = true; - RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), key_idx_to_check, 1)); - } - } + const auto query_key = UnboxScalar::Unbox(query_key_scalar); + int64_t index = 0; + ARROW_UNUSED(VisitArrayValuesInline( + *keys.data(), + [&](decltype(query_key) key) -> Status { + if (index < start) { + ++index; + return Status::OK(); + } else if (index < end) { + if (key == query_key) { + found_at_least_one_key = true; + RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), index, 1)); + } + ++index; + return Status::OK(); + } else { + return Status::Cancelled("End reached"); + } + }, + [&]() -> Status { + if (index < end) { + ++index; + return Status::OK(); + } else { + return Status::Cancelled("End reached"); + } + })); + return std::move(builder); } @@ -607,13 +649,55 @@ Result ResolveMapArrayLookupType(KernelContext* ctx, } } +struct ResolveMapArrayLookup { + KernelContext* ctx; + const ExecBatch& batch; + Datum* out; + + template + Status Execute() { + if (batch[0].kind() == Datum::SCALAR) { + return MapArrayLookupFunctor::ExecMapScalar(ctx, batch, out); + } + return MapArrayLookupFunctor::ExecMapArray(ctx, batch, out); + } + + template + enable_if_physical_integer Visit(const KeyType& type) { + return Execute(); + } + + template + enable_if_decimal Visit(const KeyType& type) { + return Execute(); + } + + template + enable_if_base_binary Visit(const KeyType& type) { + return Execute(); + } + + template + enable_if_boolean Visit(const KeyType& type) { + return Execute(); + } + + Status Visit(const DataType& type) { + return Status::TypeError("Got unsupported type: ", type.ToString()); + } + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + ResolveMapArrayLookup visitor{ctx, batch, out}; + return VisitTypeInline(*checked_cast(*batch[0].type()).key_type(), + &visitor); + } +}; + void AddMapArrayLookupKernels(ScalarFunction* func) { for (const auto shape : {ValueDescr::ARRAY, ValueDescr::SCALAR}) { - ScalarKernel kernel({InputType(Type::MAP, shape)}, - OutputType(ResolveMapArrayLookupType), - shape == ValueDescr::ARRAY ? MapArrayLookupFunctor::ExecMapArray - : MapArrayLookupFunctor::ExecMapScalar, - OptionsWrapper::Init); + ScalarKernel kernel( + {InputType(Type::MAP, shape)}, OutputType(ResolveMapArrayLookupType), + ResolveMapArrayLookup::Exec, OptionsWrapper::Init); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; DCHECK_OK(func->AddKernel(std::move(kernel))); diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 9ad40ce56b4..e5e777e2607 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -359,6 +359,42 @@ TEST_F(TestMapArrayLookupKernel, NestedItems) { expected_first, expected_last); } +TEST_F(TestMapArrayLookupKernel, BooleanKey) { + auto true_scalar = ScalarFromJSON(boolean(), R"(true)"); + auto map_type = map(boolean(), int32()); + const char* input = R"( + [ + [ + [true, 99], [false, 1], [false, 2], [true, null], [false, 5], + [true, 8] + ], + null, + [ + [false, null], [true, 67], [false, 101], [false, 1], [false, null], + [false, 9], [true, 80] + ], + [], + [ + [false, 1], [false, 2], [false, 3], [false, 4] + ], + [ + [true, 9], [true, 2], [true, 5], [true, 8] + ] + ] + )"; + + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 5, false); + + auto expected_all = ArrayFromJSON(list(int32()), R"( + [[99, null, 8], null, [67, 80], null, null, null ])"); + auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); + auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); + + CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, true_scalar, expected_all, + expected_first, expected_last); +} + template class TestMapArrayLookupIntegralKeys : public ::testing ::Test { protected: From fbe0e1ff918eb117a8f301827792b9b1cd9e4e7b Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Thu, 27 Jan 2022 16:17:30 +0530 Subject: [PATCH 11/24] Handle more types and refactor --- .../arrow/compute/kernels/scalar_nested.cc | 143 ++++++++---------- .../compute/kernels/scalar_nested_test.cc | 111 ++++++++++++-- 2 files changed, 162 insertions(+), 92 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index f3fcb74a97e..4819d4e5143 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -430,81 +430,56 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray", "MakeStructOptions"}; template struct MapArrayLookupFunctor { - static Result FindOneMapValueIndex(const Array& keys, - const Scalar& query_key_scalar, - const int64_t start, const int64_t end, - const bool from_back = false) { - const auto query_key = UnboxScalar::Unbox(query_key_scalar); - int64_t index = 0; - int64_t match_idx = -1; - ARROW_UNUSED(VisitArrayValuesInline( - *keys.data(), - [&](decltype(query_key) key) -> Status { - if (index < start) { - ++index; - return Status::OK(); - } else if (index < end) { - if (key == query_key) { - if (!from_back) { - match_idx = index; - return Status::Cancelled("Found first matching key"); - } else { - match_idx = index; - } - } - ++index; - return Status::OK(); - } else { - return Status::Cancelled("End reached"); - } - }, - [&]() -> Status { - if (index < end) { - ++index; + static Result GetOneMatchingIndex(const Array& keys, + const Scalar& query_key_scalar, + const bool& from_back) { + int64_t match_index = -1; + RETURN_NOT_OK( + FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) -> Status { + match_index = index; + if (from_back) { return Status::OK(); } else { - return Status::Cancelled("End reached"); + return Status::Cancelled("Found key match for FIRST"); } })); - return match_idx; + return match_index; } - static Result> GetBuiltArray( - const Array& keys, const Array& items, const Scalar& query_key_scalar, - bool& found_at_least_one_key, const int64_t& start, const int64_t& end, - KernelContext* ctx) { - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder)); + static Status BuildListOfItemsArray(const Array& keys, const Array& items, + const Scalar& query_key_scalar, + bool& found_at_least_one_key, + ArrayBuilder* builder) { + RETURN_NOT_OK( + FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) -> Status { + found_at_least_one_key = true; + RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), index, 1)); + return Status::OK(); + })); + return Status::OK(); + } + + template + static Status FindMatchingIndices(const Array& keys, const Scalar& query_key_scalar, + FoundItem callback) { const auto query_key = UnboxScalar::Unbox(query_key_scalar); int64_t index = 0; ARROW_UNUSED(VisitArrayValuesInline( *keys.data(), [&](decltype(query_key) key) -> Status { - if (index < start) { - ++index; - return Status::OK(); - } else if (index < end) { - if (key == query_key) { - found_at_least_one_key = true; - RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), index, 1)); - } - ++index; - return Status::OK(); - } else { - return Status::Cancelled("End reached"); + Status to_return = Status::OK(); + if (key == query_key) { + to_return = callback(index); } + ++index; + return to_return; }, [&]() -> Status { - if (index < end) { - ++index; - return Status::OK(); - } else { - return Status::Cancelled("End reached"); - } + ++index; + return Status::OK(); })); - - return std::move(builder); + return Status::OK(); } static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { @@ -513,10 +488,6 @@ struct MapArrayLookupFunctor { const auto& occurrence = options.occurrence; const MapArray map_array(batch[0].array()); - std::shared_ptr keys = map_array.keys(); - std::shared_ptr items = map_array.items(); - auto offsets = std::dynamic_pointer_cast(map_array.offsets()); - std::unique_ptr builder; if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), @@ -529,13 +500,16 @@ struct MapArrayLookupFunctor { continue; } - int64_t start = offsets->Value(map_array_idx); - int64_t end = offsets->Value(map_array_idx + 1); - std::unique_ptr list_builder; + auto map = map_array.value_slice(map_array_idx); + auto keys = checked_cast(*map).field(0); + auto items = checked_cast(*map).field(1); bool found_at_least_one_key = false; - ARROW_ASSIGN_OR_RAISE( - list_builder, GetBuiltArray(*keys, *items, *query_key, found_at_least_one_key, - start, end, ctx)); + std::unique_ptr list_builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), + &list_builder)); + + RETURN_NOT_OK(BuildListOfItemsArray(*keys, *items, *query_key, + found_at_least_one_key, list_builder.get())); if (!found_at_least_one_key) { RETURN_NOT_OK(builder->AppendNull()); } else { @@ -556,13 +530,14 @@ struct MapArrayLookupFunctor { RETURN_NOT_OK(builder->AppendNull()); continue; } - int64_t start = offsets->Value(map_array_idx); - int64_t end = offsets->Value(map_array_idx + 1); + + auto map = map_array.value_slice(map_array_idx); + auto keys = checked_cast(*map).field(0); + auto items = checked_cast(*map).field(1); bool from_back = (occurrence == MapArrayLookupOptions::LAST); + ARROW_ASSIGN_OR_RAISE(int64_t key_match_idx, + GetOneMatchingIndex(*keys, *query_key, from_back)); - ARROW_ASSIGN_OR_RAISE( - int64_t key_match_idx, - FindOneMapValueIndex(*keys, *query_key, start, end, from_back)); if (key_match_idx != -1) { RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_match_idx, 1)); } else { @@ -601,9 +576,9 @@ struct MapArrayLookupFunctor { if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { bool found_at_least_one_key = false; std::unique_ptr builder; - ARROW_ASSIGN_OR_RAISE( - builder, GetBuiltArray(*keys, *items, *query_key, found_at_least_one_key, 0, - struct_array.length(), ctx)); + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); + RETURN_NOT_OK(BuildListOfItemsArray(*keys, *items, *query_key, + found_at_least_one_key, builder.get())); if (!found_at_least_one_key) { out->value = MakeNullScalar(list(items->type())); @@ -614,9 +589,8 @@ struct MapArrayLookupFunctor { } else { /* occurrence == FIRST || LAST */ bool from_back = (occurrence == MapArrayLookupOptions::LAST); - ARROW_ASSIGN_OR_RAISE( - int64_t key_match_idx, - FindOneMapValueIndex(*keys, *query_key, 0, struct_array.length(), from_back)); + ARROW_ASSIGN_OR_RAISE(int64_t key_match_idx, + GetOneMatchingIndex(*keys, *query_key, from_back)); if (key_match_idx != -1) { ARROW_ASSIGN_OR_RAISE(out->value, items->GetScalar(key_match_idx)); } else { @@ -682,6 +656,15 @@ struct ResolveMapArrayLookup { return Execute(); } + template + enable_if_same Visit(const KeyType& key) { + return Execute(); + } + + Status Visit(const MonthDayNanoIntervalType& key) { + return Execute(); + } + Status Visit(const DataType& type) { return Status::TypeError("Got unsupported type: ", type.ToString()); } diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index e5e777e2607..4ec0df32eca 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -252,6 +252,7 @@ TEST_F(TestMapArrayLookupKernel, Basic) { [] ])"; auto map_array = ArrayFromJSON(type, input); + CheckMapArrayLookupWithDifferentOptions( map_array, MakeScalar("foo"), ArrayFromJSON(list(int32()), R"([[99, 3], null, [101, 22], null])"), @@ -328,6 +329,7 @@ TEST_F(TestMapArrayLookupKernel, NestedItems) { ] )"; const auto map_array = ArrayFromJSON(type, input); + const auto expected_all = ArrayFromJSON(list(map(int16(), int16())), R"( [ [ @@ -355,6 +357,7 @@ TEST_F(TestMapArrayLookupKernel, NestedItems) { [[12, 13], [13, 14]], null ])"); + CheckMapArrayLookupWithDifferentOptions(map_array, MakeScalar("foo"), expected_all, expected_first, expected_last); } @@ -362,27 +365,113 @@ TEST_F(TestMapArrayLookupKernel, NestedItems) { TEST_F(TestMapArrayLookupKernel, BooleanKey) { auto true_scalar = ScalarFromJSON(boolean(), R"(true)"); auto map_type = map(boolean(), int32()); + const char* input = R"( + [ + [ + [true, 99], [false, 1], [false, 2], [true, null], [false, 5], + [true, 8] + ], + null, + [ + [false, null], [true, 67], [false, 101], [false, 1], [false, null], + [false, 9], [true, 80] + ], + [], + [ + [false, 1], [false, 2], [false, 3], [false, 4] + ], + [ + [true, 9], [true, 2], [true, 5], [true, 8] + ] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 5, false); + + auto expected_all = ArrayFromJSON(list(int32()), R"( + [[99, null, 8], null, [67, 80], null, null, null ])"); + auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); + auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); + + CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, true_scalar, expected_all, + expected_first, expected_last); +} + +TEST_F(TestMapArrayLookupKernel, MonthDayNanoIntervalKeys) { + auto key_type = month_day_nano_interval(); + auto map_type = map(key_type, utf8()); + auto key_scalar = ScalarFromJSON(month_day_nano_interval(), R"([1, 2, -3])"); + const char* input = R"( + [ + [ + [[-9, -10, 11], "zero"], [[1, 2, -3], "first_one"], [[11, -12, 0], "two"], + [[1, 2, -3], null], [[-7, -8, -9], "three"], [[1, 2, -3], "second_one"], + [[1, 2, -3], "last_one"] + ], + null, + [ + [[-5, 6, 7], "zero_hero"], [[15, 16, 2], "almost_six"], + [[1, 2, -3], "the_dumb_one"], [[-7, -8, -9], "eleven"], + [[1, 2, -3], "the_chosen_one"], [[-5, 6, 7], "meaning of life"], + [[1, 2, -3], "just_one"], [[1, 2, -3], "no more ones!"] + ], + [ + [[-5, 6, 7], "this"], [[-13, 14, -1], "has"], [[11, -12, 0], "no"], + [[15, 16, 2], "keys"] + ], + [ + [[1, 2, -3], "this"], [[1, 2, -3], "should"], [[1, 2, -3], "also"], + [[1, 2, -3], "be"], [[1, 2, -3], "null"] + ], + [] + ] + )"; + auto map_array = ArrayFromJSON(map_type, input); + auto map_array_tweaked = TweakValidityBit(map_array, 4, false); + + auto expected_first = + ArrayFromJSON(utf8(), R"(["first_one", null, "the_dumb_one", null, null, null])"); + auto expected_last = + ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null, null, null])"); + auto expected_all = ArrayFromJSON(list(utf8()), + R"([ + ["first_one", null, "second_one", "last_one"], + null, + ["the_dumb_one", "the_chosen_one", "just_one", "no more ones!"], + null, + null, + null + ] + )"); + + CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, key_scalar, expected_all, + expected_first, expected_last); +} + +TEST_F(TestMapArrayLookupKernel, FixedSizeBinary) { + auto key_type = fixed_size_binary(6); + auto map_type = map(key_type, int32()); + auto sheesh_scalar = ScalarFromJSON(key_type, R"("sheesh")"); const char* input = R"( [ [ - [true, 99], [false, 1], [false, 2], [true, null], [false, 5], - [true, 8] + ["sheesh", 99], ["yooloo", 1], ["yaaaay", 2], ["sheesh", null], ["no way", 5], + ["sheesh", 8] ], null, [ - [false, null], [true, 67], [false, 101], [false, 1], [false, null], - [false, 9], [true, 80] + ["hmm,mm", null], ["sheesh", 67], ["snaccc", 101], ["awwwww", 1], ["dapdap", null], + ["yooloo", 9], ["sheesh", 80] ], [], [ - [false, 1], [false, 2], [false, 3], [false, 4] + ["nopeno", 1], ["nonono", 2], ["sheess", 3], ["here!!", 4] ], [ - [true, 9], [true, 2], [true, 5], [true, 8] + ["sheesh", 9], ["sheesh", 2], ["sheesh", 5], ["sheesh", 8] ] ] )"; - auto map_array = ArrayFromJSON(map_type, input); auto map_array_tweaked = TweakValidityBit(map_array, 5, false); @@ -391,7 +480,7 @@ TEST_F(TestMapArrayLookupKernel, BooleanKey) { auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); - CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, true_scalar, expected_all, + CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, sheesh_scalar, expected_all, expected_first, expected_last); } @@ -461,10 +550,9 @@ class TestMapArrayLookupDecimalKeys : public ::testing ::Test { TYPED_TEST_SUITE(TestMapArrayLookupDecimalKeys, DecimalArrowTypes); TYPED_TEST(TestMapArrayLookupDecimalKeys, StringItems) { - std::shared_ptr type = this->type_singleton(); - auto key_scalar = DecimalScalarFromJSON(type, R"("1.2345")"); - + auto type = this->type_singleton(); auto map_type = map(type, utf8()); + auto key_scalar = DecimalScalarFromJSON(type, R"("1.2345")"); const char* input = R"( [ [ @@ -521,7 +609,6 @@ TYPED_TEST_SUITE(TestMapArrayLookupBinaryKeys, BaseBinaryArrowTypes); TYPED_TEST(TestMapArrayLookupBinaryKeys, IntegralItems) { auto key_type = this->type_singleton(); auto sheesh_scalar = ScalarFromJSON(key_type, R"("sheesh")"); - auto map_type = map(key_type, int32()); const char* input = R"( [ From 10956954b132076c309006ed8be4ec453f079000 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Thu, 27 Jan 2022 19:09:51 +0530 Subject: [PATCH 12/24] Add autocomplete helper --- cpp/src/arrow/compute/api_scalar.cc | 7 +++++++ cpp/src/arrow/compute/api_scalar.h | 15 +++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index d7b8a01bbb6..285a41f1b8f 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -819,5 +819,12 @@ Result Week(const Datum& arg, WeekOptions options, ExecContext* ctx) { return CallFunction("week", {arg}, &options, ctx); } +// ---------------------------------------------------------------------- + +Result MapArrayLookup(const Datum& arg, MapArrayLookupOptions options, + ExecContext* ctx) { + return CallFunction("map_array_lookup", {arg}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index ebeb128b029..d1ed11ffe0d 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -1375,5 +1375,20 @@ ARROW_EXPORT Result AssumeTimezone(const Datum& values, AssumeTimezoneOptions options, ExecContext* ctx = NULLPTR); +/// \brief Finds either the FIRST, LAST, or ALL items with a key that matches the given +/// query key in a map array. +/// +/// Returns an array of items for FIRST and LAST, and an array of list of items for ALL. +/// +/// \param[in] map_array to look in +/// \param[in] options to pass a query key and choose which matching keys to return +/// (FIRST, LAST or ALL) +/// \param[in] ctx the function execution context, optional +/// +/// \return the resulting datum +/// \note API not yet finalized +ARROW_EXPORT Result MapArrayLookup(const Datum& map_array, + MapArrayLookupOptions options, + ExecContext* ctx = NULLPTR); } // namespace compute } // namespace arrow From a1a019aa505b96e379144aa4eca806366f070a4a Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Fri, 28 Jan 2022 15:01:33 +0530 Subject: [PATCH 13/24] Add docs --- cpp/src/arrow/compute/api_scalar.cc | 4 +- .../arrow/compute/kernels/scalar_nested.cc | 32 +++++++------- .../compute/kernels/scalar_nested_test.cc | 42 +++++++++++++++++++ docs/source/cpp/compute.rst | 29 ++++++++----- docs/source/python/api/compute.rst | 2 + 5 files changed, 80 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 285a41f1b8f..955a2a55987 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -820,11 +820,13 @@ Result Week(const Datum& arg, WeekOptions options, ExecContext* ctx) { } // ---------------------------------------------------------------------- - +// Structural transforms Result MapArrayLookup(const Datum& arg, MapArrayLookupOptions options, ExecContext* ctx) { return CallFunction("map_array_lookup", {arg}, &options, ctx); } +// ---------------------------------------------------------------------- + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 4819d4e5143..ddcf0413819 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -432,12 +432,12 @@ template struct MapArrayLookupFunctor { static Result GetOneMatchingIndex(const Array& keys, const Scalar& query_key_scalar, - const bool& from_back) { + const bool* from_back) { int64_t match_index = -1; RETURN_NOT_OK( FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) -> Status { match_index = index; - if (from_back) { + if (*from_back) { return Status::OK(); } else { return Status::Cancelled("Found key match for FIRST"); @@ -447,13 +447,12 @@ struct MapArrayLookupFunctor { return match_index; } - static Status BuildListOfItemsArray(const Array& keys, const Array& items, - const Scalar& query_key_scalar, - bool& found_at_least_one_key, - ArrayBuilder* builder) { + static Status BuildItemsArray(const Array& keys, const Array& items, + const Scalar& query_key_scalar, + bool* found_at_least_one_key, ArrayBuilder* builder) { RETURN_NOT_OK( FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) -> Status { - found_at_least_one_key = true; + *found_at_least_one_key = true; RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), index, 1)); return Status::OK(); })); @@ -468,12 +467,11 @@ struct MapArrayLookupFunctor { ARROW_UNUSED(VisitArrayValuesInline( *keys.data(), [&](decltype(query_key) key) -> Status { - Status to_return = Status::OK(); if (key == query_key) { - to_return = callback(index); + return callback(index++); } ++index; - return to_return; + return Status::OK(); }, [&]() -> Status { ++index; @@ -508,8 +506,8 @@ struct MapArrayLookupFunctor { RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &list_builder)); - RETURN_NOT_OK(BuildListOfItemsArray(*keys, *items, *query_key, - found_at_least_one_key, list_builder.get())); + RETURN_NOT_OK(BuildItemsArray(*keys, *items, *query_key, &found_at_least_one_key, + list_builder.get())); if (!found_at_least_one_key) { RETURN_NOT_OK(builder->AppendNull()); } else { @@ -536,7 +534,7 @@ struct MapArrayLookupFunctor { auto items = checked_cast(*map).field(1); bool from_back = (occurrence == MapArrayLookupOptions::LAST); ARROW_ASSIGN_OR_RAISE(int64_t key_match_idx, - GetOneMatchingIndex(*keys, *query_key, from_back)); + GetOneMatchingIndex(*keys, *query_key, &from_back)); if (key_match_idx != -1) { RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), key_match_idx, 1)); @@ -577,8 +575,8 @@ struct MapArrayLookupFunctor { bool found_at_least_one_key = false; std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); - RETURN_NOT_OK(BuildListOfItemsArray(*keys, *items, *query_key, - found_at_least_one_key, builder.get())); + RETURN_NOT_OK(BuildItemsArray(*keys, *items, *query_key, &found_at_least_one_key, + builder.get())); if (!found_at_least_one_key) { out->value = MakeNullScalar(list(items->type())); @@ -590,7 +588,7 @@ struct MapArrayLookupFunctor { bool from_back = (occurrence == MapArrayLookupOptions::LAST); ARROW_ASSIGN_OR_RAISE(int64_t key_match_idx, - GetOneMatchingIndex(*keys, *query_key, from_back)); + GetOneMatchingIndex(*keys, *query_key, &from_back)); if (key_match_idx != -1) { ARROW_ASSIGN_OR_RAISE(out->value, items->GetScalar(key_match_idx)); } else { @@ -613,7 +611,7 @@ Result ResolveMapArrayLookupType(KernelContext* ctx, return Status::TypeError( "map_array_lookup: query_key type and MapArray key_type don't match. Expected " "type: ", - *item_type, ", but got type: ", *options.query_key->type); + *key_type, ", but got type: ", *options.query_key->type); } if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 4ec0df32eca..698426e2f6f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -484,6 +484,48 @@ TEST_F(TestMapArrayLookupKernel, FixedSizeBinary) { expected_first, expected_last); } +TEST_F(TestMapArrayLookupKernel, Errors) { + auto map_type = map(int32(), utf8()); + const char* input = R"( + [ + [ + [0, "zero"], [1, "first one"], [2, "two"], [1, null], [3, "three"], [1, "second one"], + [1, "last one"] + ], + null, + [ + [0, "zero hero"], [9, "almost six"], [1, "the dumb one"], [7, "eleven"], + [1, "the chosen one"], [42, "meaning of life?"], [1, "just_one"], + [1, "no more ones!"] + ], + [ + [4, "this"], [6, "has"], [8, "no"], [2, "ones"] + ], + [ + [1, "this"], [1, "should"], [1, "also"], [1, "be"], [1, "null"] + ], + [] + ])"; + auto map_array = ArrayFromJSON(map_type, input); + auto query_key_int16 = MakeScalar(int16(), 1).ValueOrDie(); + FieldVector fields = {field("a", int32()), field("b", utf8()), + field("c", struct_({ + field("d", int64()), + field("e", float64()), + }))}; + auto unsupported_scalar = ScalarFromJSON(struct_(fields), R"([1, "a", [10, 10.0]])"); + + MapArrayLookupOptions unsupported(unsupported_scalar); + MapArrayLookupOptions all(query_key_int16, MapArrayLookupOptions::ALL); + MapArrayLookupOptions first(query_key_int16, MapArrayLookupOptions::FIRST); + MapArrayLookupOptions last(query_key_int16, MapArrayLookupOptions::LAST); + MapArrayLookupOptions null_key; + + for (auto option : {all, first, last, null_key, unsupported}) { + ASSERT_RAISES(TypeError, CallFunction("map_array_lookup", {map_array}, &option)); + } +} + template class TestMapArrayLookupIntegralKeys : public ::testing ::Test { protected: diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index f958bd8d398..f1b82f45cc5 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1639,17 +1639,19 @@ in the respective option classes. Structural transforms ~~~~~~~~~~~~~~~~~~~~~ -+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=====================+============+=====================================+==================+==============================+========+ -| list_element | Binary | List-like (Arg 0), Integral (Arg 1) | List value type | | \(1) | -+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ -| list_flatten | Unary | List-like | List value type | | \(2) | -+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ -| list_parent_indices | Unary | List-like | Int64 | | \(3) | -+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ -| struct_field | Unary | Struct or Union | Computed | :struct:`StructFieldOptions` | \(4) | -+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ ++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=====================+============+=====================================+==================+=================================+========+ +| list_element | Binary | List-like (Arg 0), Integral (Arg 1) | List value type | | \(1) | ++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ +| list_flatten | Unary | List-like | List value type | | \(2) | ++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ +| list_parent_indices | Unary | List-like | Int64 | | \(3) | ++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ +| struct_field | Unary | Struct or Union | Computed | :struct:`StructFieldOptions` | \(4) | ++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ +| map_array_lookup | Unary | MapArray | Computed | :struct:`MapArrayLookupOptions` | \(5) | ++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ * \(1) Output is an array of the same length as the input list array. The output values are the values at the specified index of each child list. @@ -1685,6 +1687,11 @@ Structural transforms index *n* and the type code at index *n* is 2. * The indices ``2`` and ``7`` are invalid. +* \(5) Extract either the ``FIRST``, ``LAST`` or ``ALL`` items from a + map array whose key match the given query key passed via options. + The output type is an Array of items for the ``FIRST``/``LAST`` options + and an Array of List of items for the ``ALL`` option. + These functions create a copy of the first input with some elements replaced, based on the remaining inputs. diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index db817ac3d78..ab9e1497eef 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -482,6 +482,7 @@ Structural Transforms list_parent_indices list_value_length make_struct + map_array_lookup replace_with_mask struct_field @@ -504,6 +505,7 @@ Compute Options IndexOptions JoinOptions MakeStructOptions + MapArrayLookupOptions MatchSubstringOptions ModeOptions NullOptions From ecded80fb070b9ed34812e7ddfa19a9ad4bb0fbf Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Fri, 28 Jan 2022 19:45:48 +0530 Subject: [PATCH 14/24] Handle null keys --- cpp/src/arrow/compute/kernels/scalar_nested.cc | 7 +++++-- cpp/src/arrow/compute/kernels/scalar_nested_test.cc | 13 +++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index ddcf0413819..52eefb1f3bb 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -606,8 +606,11 @@ Result ResolveMapArrayLookupType(KernelContext* ctx, std::shared_ptr item_type = checked_cast(*type).item_type(); std::shared_ptr key_type = checked_cast(*type).key_type(); - if (!options.query_key || !options.query_key->type || - !options.query_key->type->Equals(key_type)) { + if (!options.query_key) { + return Status::TypeError("map_array_lookup: query_key can't be empty."); + } else if (!options.query_key->is_valid) { + return Status::TypeError("map_array_lookup: query_key can't be null."); + } else if (!options.query_key->type || !options.query_key->type->Equals(key_type)) { return Status::TypeError( "map_array_lookup: query_key type and MapArray key_type don't match. Expected " "type: ", diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 698426e2f6f..02f249c34cc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -24,6 +24,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" namespace arrow { namespace compute { @@ -519,11 +520,19 @@ TEST_F(TestMapArrayLookupKernel, Errors) { MapArrayLookupOptions all(query_key_int16, MapArrayLookupOptions::ALL); MapArrayLookupOptions first(query_key_int16, MapArrayLookupOptions::FIRST); MapArrayLookupOptions last(query_key_int16, MapArrayLookupOptions::LAST); - MapArrayLookupOptions null_key; + MapArrayLookupOptions empty_key(nullptr); + MapArrayLookupOptions null_key(MakeNullScalar(int32())); - for (auto option : {all, first, last, null_key, unsupported}) { + for (auto option : {unsupported, all, first, last, empty_key, null_key}) { ASSERT_RAISES(TypeError, CallFunction("map_array_lookup", {map_array}, &option)); } + + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, ::testing::HasSubstr("key can't be empty"), + CallFunction("map_array_lookup", {map_array}, &empty_key)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, ::testing::HasSubstr("key can't be null"), + CallFunction("map_array_lookup", {map_array}, &null_key)); } template From 87a60860911d67075b82b1ea0a075d9a859e02d4 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Fri, 28 Jan 2022 19:51:01 +0530 Subject: [PATCH 15/24] Remove unused header --- cpp/src/arrow/compute/kernels/scalar_nested.cc | 2 +- cpp/src/arrow/compute/kernels/scalar_nested_test.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 52eefb1f3bb..f04ca7b5e0d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -518,7 +518,7 @@ struct MapArrayLookupFunctor { } ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); out->value = result->data(); - } else { + } else { /* occurrence == FIRST || LAST */ RETURN_NOT_OK( MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 02f249c34cc..1d43adc2f77 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -24,7 +24,6 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/util/key_value_metadata.h" -#include "arrow/util/logging.h" namespace arrow { namespace compute { From 8704599c73aacfb60363637e8b30b5eee8c00d76 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Mon, 31 Jan 2022 10:57:17 +0530 Subject: [PATCH 16/24] Use ListBuilder --- .../arrow/compute/kernels/scalar_nested.cc | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index f04ca7b5e0d..e76010bc7a0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -23,6 +23,7 @@ #include "arrow/result.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bitmap_generate.h" +#include "arrow/array/builder_nested.h" namespace arrow { namespace compute { @@ -464,7 +465,7 @@ struct MapArrayLookupFunctor { FoundItem callback) { const auto query_key = UnboxScalar::Unbox(query_key_scalar); int64_t index = 0; - ARROW_UNUSED(VisitArrayValuesInline( + Status status = VisitArrayValuesInline( *keys.data(), [&](decltype(query_key) key) -> Status { if (key == query_key) { @@ -476,7 +477,10 @@ struct MapArrayLookupFunctor { [&]() -> Status { ++index; return Status::OK(); - })); + }); + if (!status.ok() && !status.IsCancelled()) { + return status; + } return Status::OK(); } @@ -490,11 +494,13 @@ struct MapArrayLookupFunctor { if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(map_array.map_type()->item_type()), &builder)); + auto list_builder = checked_cast(builder.get()); + auto value_builder = list_builder->value_builder(); for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { if (!map_array.IsValid(map_array_idx)) { - RETURN_NOT_OK(builder->AppendNull()); + RETURN_NOT_OK(list_builder->AppendNull()); continue; } @@ -502,26 +508,23 @@ struct MapArrayLookupFunctor { auto keys = checked_cast(*map).field(0); auto items = checked_cast(*map).field(1); bool found_at_least_one_key = false; - std::unique_ptr list_builder; - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), - &list_builder)); - - RETURN_NOT_OK(BuildItemsArray(*keys, *items, *query_key, &found_at_least_one_key, - list_builder.get())); + RETURN_NOT_OK( + FindMatchingIndices(*keys, *query_key, [&](int64_t index) -> Status { + if (!found_at_least_one_key) RETURN_NOT_OK(list_builder->Append(true)); + found_at_least_one_key = true; + RETURN_NOT_OK(value_builder->AppendArraySlice(*items->data(), index, 1)); + return Status::OK(); + })); if (!found_at_least_one_key) { - RETURN_NOT_OK(builder->AppendNull()); - } else { - ARROW_ASSIGN_OR_RAISE(auto list_result, list_builder->Finish()); - RETURN_NOT_OK(builder->AppendScalar(ListScalar(list_result))); + RETURN_NOT_OK(list_builder->AppendNull()); } - list_builder->Reset(); } - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); + ARROW_ASSIGN_OR_RAISE(auto result, list_builder->Finish()); out->value = result->data(); } else { /* occurrence == FIRST || LAST */ RETURN_NOT_OK( MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(), &builder)); - + RETURN_NOT_OK(builder->Reserve(batch.length)); for (int64_t map_array_idx = 0; map_array_idx < map_array.length(); ++map_array_idx) { if (!map_array.IsValid(map_array_idx)) { From 1302a47f2b20a357a864a6a9a6ec849819af965a Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Mon, 31 Jan 2022 22:44:12 +0530 Subject: [PATCH 17/24] Add python bindings --- cpp/src/arrow/compute/api_scalar.h | 2 +- .../arrow/compute/kernels/scalar_nested.cc | 2 +- .../compute/kernels/scalar_nested_test.cc | 6 +-- python/pyarrow/_compute.pyx | 37 +++++++++++++++++++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 13 +++++++ python/pyarrow/tests/test_compute.py | 18 +++++++++ 7 files changed, 74 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index d1ed11ffe0d..b0cf97cc19c 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -483,7 +483,7 @@ class ARROW_EXPORT MapArrayLookupOptions : public FunctionOptions { }; explicit MapArrayLookupOptions(std::shared_ptr query_key, - Occurrence occurrence = ALL); + Occurrence occurrence); MapArrayLookupOptions(); constexpr static char const kTypeName[] = "MapArrayLookupOptions"; diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index e76010bc7a0..6f3c8498ce8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -18,12 +18,12 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include "arrow/array/builder_nested.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" #include "arrow/result.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bitmap_generate.h" -#include "arrow/array/builder_nested.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 1d43adc2f77..35166bbe0aa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -515,12 +515,12 @@ TEST_F(TestMapArrayLookupKernel, Errors) { }))}; auto unsupported_scalar = ScalarFromJSON(struct_(fields), R"([1, "a", [10, 10.0]])"); - MapArrayLookupOptions unsupported(unsupported_scalar); + MapArrayLookupOptions unsupported(unsupported_scalar, MapArrayLookupOptions::FIRST); MapArrayLookupOptions all(query_key_int16, MapArrayLookupOptions::ALL); MapArrayLookupOptions first(query_key_int16, MapArrayLookupOptions::FIRST); MapArrayLookupOptions last(query_key_int16, MapArrayLookupOptions::LAST); - MapArrayLookupOptions empty_key(nullptr); - MapArrayLookupOptions null_key(MakeNullScalar(int32())); + MapArrayLookupOptions empty_key(nullptr, MapArrayLookupOptions::FIRST); + MapArrayLookupOptions null_key(MakeNullScalar(int32()), MapArrayLookupOptions::FIRST); for (auto option : {unsupported, all, first, last, empty_key, null_key}) { ASSERT_RAISES(TypeError, CallFunction("map_array_lookup", {map_array}, &option)); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 1ec96d33a02..25be4d0b162 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1333,6 +1333,43 @@ class IndexOptions(_IndexOptions): self._set_options(value) +cdef class _MapArrayLookupOptions(FunctionOptions): + _occurrence_map = { + "ALL": CMapArrayLookupOccurrence_ALL, + "FIRST": CMapArrayLookupOccurrence_FIRST, + "LAST": CMapArrayLookupOccurrence_LAST, + } + + def _set_options(self, scalar, occurrence): + try: + self.wrapped.reset( + new CMapArrayLookupOptions( + pyarrow_unwrap_scalar(scalar), + self._occurrence_map[occurrence] + ) + ) + except KeyError: + _raise_invalid_function_option(occurrence, + "Should either be FIRST, LAST or ALL") + + +class MapArrayLookupOptions(_MapArrayLookupOptions): + """ + Options for the `map_array_lookup` function. + + Parameters + ---------- + query_key : Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the MapArray + Accepted values are "FIRST", "LAST", "ALL". + """ + + def __init__(self, scalar, occurrence): + self._set_options(scalar, occurrence) + + cdef class _ModeOptions(FunctionOptions): def _set_options(self, n, skip_nulls, min_count): self.wrapped.reset(new CModeOptions(n, skip_nulls, min_count)) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 1cbf062f948..cbe7c24532b 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -41,6 +41,7 @@ IndexOptions, JoinOptions, MakeStructOptions, + MapArrayLookupOptions, MatchSubstringOptions, ModeOptions, NullOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 4bf8e56331c..5efd7a783b4 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2147,6 +2147,19 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CIndexOptions(shared_ptr[CScalar] value) shared_ptr[CScalar] value + cdef enum CMapArrayLookupOccurrence \ + "arrow::compute::MapArrayLookupOptions::Occurrence": + CMapArrayLookupOccurrence_ALL "arrow::compute::MapArrayLookupOptions::ALL" + CMapArrayLookupOccurrence_FIRST "arrow::compute::MapArrayLookupOptions::FIRST" + CMapArrayLookupOccurrence_LAST "arrow::compute::MapArrayLookupOptions::LAST" + + cdef cppclass CMapArrayLookupOptions \ + "arrow::compute::MapArrayLookupOptions"(CFunctionOptions): + CMapArrayLookupOptions(shared_ptr[CScalar] query_key, + CMapArrayLookupOccurrence occurrence) + CMapArrayLookupOccurrence occurrence + shared_ptr[CScalar] query_key + cdef cppclass CMakeStructOptions \ "arrow::compute::MakeStructOptions"(CFunctionOptions): CMakeStructOptions(vector[c_string] n, diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e97b2f53eb6..38ed1df70c0 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -141,6 +141,7 @@ def test_option_class_equality(): field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}), pa.KeyValueMetadata({"b": "2"})]), + pc.MapArrayLookupOptions(pa.scalar(1), "FIRST"), pc.MatchSubstringOptions("pattern"), pc.ModeOptions(), pc.NullOptions(), @@ -2470,6 +2471,23 @@ def test_make_struct(): pc.make_struct(field_names=['one', 'two']) +def test_map_array_lookup(): + ty = pa.map_(pa.utf8(), pa.int32()) + arr = pa.array([[('one', 1), ('two', 2)], [('none', 3)], + [], [('one', 5), ('one', 7)], None], type=ty) + result_first = pa.array([1, None, None, 5, None], type=pa.int32()) + result_last = pa.array([1, None, None, 7, None], type=pa.int32()) + result_all = pa.array([[1], None, None, [5, 7], None], + type=pa.list_(pa.int32())) + + assert pc.map_array_lookup(arr, pa.scalar( + 'one', type=pa.utf8()), 'FIRST') == result_first + assert pc.map_array_lookup(arr, pa.scalar( + 'one', type=pa.utf8()), 'LAST') == result_last + assert pc.map_array_lookup(arr, pa.scalar( + 'one', type=pa.utf8()), 'ALL') == result_all + + def test_struct_fields_options(): a = pa.array([4, 5, 6], type=pa.int64()) b = pa.array(["bar", None, ""]) From 926899407f0f21589627ac12edc819ed0ce99b80 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Mon, 31 Jan 2022 22:51:52 +0530 Subject: [PATCH 18/24] Rearrange docs --- docs/source/cpp/compute.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index f1b82f45cc5..6d639d2d263 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1648,9 +1648,9 @@ Structural transforms +---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ | list_parent_indices | Unary | List-like | Int64 | | \(3) | +---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ -| struct_field | Unary | Struct or Union | Computed | :struct:`StructFieldOptions` | \(4) | +| map_array_lookup | Unary | Map | Computed | :struct:`MapArrayLookupOptions` | \(4) | +---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ -| map_array_lookup | Unary | MapArray | Computed | :struct:`MapArrayLookupOptions` | \(5) | +| struct_field | Unary | Struct or Union | Computed | :struct:`StructFieldOptions` | \(5) | +---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ * \(1) Output is an array of the same length as the input list array. The @@ -1664,7 +1664,12 @@ Structural transforms in the list array is appended to the output. Nulls in the parent list array are discarded. -* \(4) Extract a child value based on a sequence of indices passed in +* \(4) Extract either the ``FIRST``, ``LAST`` or ``ALL`` items from a + map array whose key match the given query key passed via options. + The output type is an Array of items for the ``FIRST``/``LAST`` options + and an Array of List of items for the ``ALL`` option. + +* \(5) Extract a child value based on a sequence of indices passed in the options. The validity bitmap of the result will be the intersection of all intermediate validity bitmaps. For example, for an array with type ``struct Date: Mon, 31 Jan 2022 23:05:07 +0530 Subject: [PATCH 19/24] Linting --- python/pyarrow/_compute.pyx | 2 +- python/pyarrow/includes/libarrow.pxd | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 25be4d0b162..2aa48fe1497 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1356,7 +1356,7 @@ cdef class _MapArrayLookupOptions(FunctionOptions): class MapArrayLookupOptions(_MapArrayLookupOptions): """ Options for the `map_array_lookup` function. - + Parameters ---------- query_key : Scalar diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 5efd7a783b4..dc843a11141 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2148,10 +2148,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: shared_ptr[CScalar] value cdef enum CMapArrayLookupOccurrence \ - "arrow::compute::MapArrayLookupOptions::Occurrence": - CMapArrayLookupOccurrence_ALL "arrow::compute::MapArrayLookupOptions::ALL" - CMapArrayLookupOccurrence_FIRST "arrow::compute::MapArrayLookupOptions::FIRST" - CMapArrayLookupOccurrence_LAST "arrow::compute::MapArrayLookupOptions::LAST" + "arrow::compute::MapArrayLookupOptions::Occurrence": + CMapArrayLookupOccurrence_ALL "arrow::compute::MapArrayLookupOptions::ALL" + CMapArrayLookupOccurrence_FIRST "arrow::compute::MapArrayLookupOptions::FIRST" + CMapArrayLookupOccurrence_LAST "arrow::compute::MapArrayLookupOptions::LAST" cdef cppclass CMapArrayLookupOptions \ "arrow::compute::MapArrayLookupOptions"(CFunctionOptions): From 3891afde21cebb3a09b18121f2bef2c2e039ece4 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Wed, 2 Feb 2022 13:08:26 +0530 Subject: [PATCH 20/24] Update bindings and inline helper --- cpp/src/arrow/compute/api_scalar.h | 3 +- .../arrow/compute/kernels/scalar_nested.cc | 37 +++++++------------ .../compute/kernels/scalar_nested_test.cc | 6 +-- python/pyarrow/_compute.pyx | 18 ++++----- python/pyarrow/tests/test_compute.py | 8 ++-- 5 files changed, 31 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index b0cf97cc19c..5dd24dec3a6 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -1384,8 +1384,9 @@ ARROW_EXPORT Result AssumeTimezone(const Datum& values, /// \param[in] options to pass a query key and choose which matching keys to return /// (FIRST, LAST or ALL) /// \param[in] ctx the function execution context, optional -/// /// \return the resulting datum +/// +/// \since 8.0.0 /// \note API not yet finalized ARROW_EXPORT Result MapArrayLookup(const Datum& map_array, MapArrayLookupOptions options, diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 6f3c8498ce8..9cce4ccebe8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -448,18 +448,6 @@ struct MapArrayLookupFunctor { return match_index; } - static Status BuildItemsArray(const Array& keys, const Array& items, - const Scalar& query_key_scalar, - bool* found_at_least_one_key, ArrayBuilder* builder) { - RETURN_NOT_OK( - FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) -> Status { - *found_at_least_one_key = true; - RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), index, 1)); - return Status::OK(); - })); - return Status::OK(); - } - template static Status FindMatchingIndices(const Array& keys, const Scalar& query_key_scalar, FoundItem callback) { @@ -578,9 +566,12 @@ struct MapArrayLookupFunctor { bool found_at_least_one_key = false; std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); - RETURN_NOT_OK(BuildItemsArray(*keys, *items, *query_key, &found_at_least_one_key, - builder.get())); + RETURN_NOT_OK(FindMatchingIndices(*keys, *query_key, [&](int64_t index) -> Status { + found_at_least_one_key = true; + RETURN_NOT_OK(builder->AppendArraySlice(*items->data(), index, 1)); + return Status::OK(); + })); if (!found_at_least_one_key) { out->value = MakeNullScalar(list(items->type())); } else { @@ -610,9 +601,9 @@ Result ResolveMapArrayLookupType(KernelContext* ctx, std::shared_ptr key_type = checked_cast(*type).key_type(); if (!options.query_key) { - return Status::TypeError("map_array_lookup: query_key can't be empty."); + return Status::Invalid("map_array_lookup: query_key can't be empty."); } else if (!options.query_key->is_valid) { - return Status::TypeError("map_array_lookup: query_key can't be null."); + return Status::Invalid("map_array_lookup: query_key can't be null."); } else if (!options.query_key->type || !options.query_key->type->Equals(key_type)) { return Status::TypeError( "map_array_lookup: query_key type and MapArray key_type don't match. Expected " @@ -681,14 +672,12 @@ struct ResolveMapArrayLookup { }; void AddMapArrayLookupKernels(ScalarFunction* func) { - for (const auto shape : {ValueDescr::ARRAY, ValueDescr::SCALAR}) { - ScalarKernel kernel( - {InputType(Type::MAP, shape)}, OutputType(ResolveMapArrayLookupType), - ResolveMapArrayLookup::Exec, OptionsWrapper::Init); - kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(std::move(kernel))); - } + ScalarKernel kernel({InputType(Type::MAP)}, OutputType(ResolveMapArrayLookupType), + ResolveMapArrayLookup::Exec, + OptionsWrapper::Init); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(std::move(kernel))); } const FunctionDoc map_array_lookup_doc{ diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 35166bbe0aa..85b2a4a5108 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -522,15 +522,15 @@ TEST_F(TestMapArrayLookupKernel, Errors) { MapArrayLookupOptions empty_key(nullptr, MapArrayLookupOptions::FIRST); MapArrayLookupOptions null_key(MakeNullScalar(int32()), MapArrayLookupOptions::FIRST); - for (auto option : {unsupported, all, first, last, empty_key, null_key}) { + for (auto option : {unsupported, all, first, last}) { ASSERT_RAISES(TypeError, CallFunction("map_array_lookup", {map_array}, &option)); } EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, ::testing::HasSubstr("key can't be empty"), + Invalid, ::testing::HasSubstr("key can't be empty"), CallFunction("map_array_lookup", {map_array}, &empty_key)); EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, ::testing::HasSubstr("key can't be null"), + Invalid, ::testing::HasSubstr("key can't be null"), CallFunction("map_array_lookup", {map_array}, &null_key)); } diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 2aa48fe1497..9ddbd259783 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1335,22 +1335,22 @@ class IndexOptions(_IndexOptions): cdef class _MapArrayLookupOptions(FunctionOptions): _occurrence_map = { - "ALL": CMapArrayLookupOccurrence_ALL, - "FIRST": CMapArrayLookupOccurrence_FIRST, - "LAST": CMapArrayLookupOccurrence_LAST, + "all": CMapArrayLookupOccurrence_ALL, + "first": CMapArrayLookupOccurrence_FIRST, + "last": CMapArrayLookupOccurrence_LAST, } - def _set_options(self, scalar, occurrence): + def _set_options(self, query_key, occurrence): try: self.wrapped.reset( new CMapArrayLookupOptions( - pyarrow_unwrap_scalar(scalar), + pyarrow_unwrap_scalar(query_key), self._occurrence_map[occurrence] ) ) except KeyError: _raise_invalid_function_option(occurrence, - "Should either be FIRST, LAST or ALL") + "Should either be first, last, or all") class MapArrayLookupOptions(_MapArrayLookupOptions): @@ -1363,11 +1363,11 @@ class MapArrayLookupOptions(_MapArrayLookupOptions): The key to search for. occurrence : str The occurrence(s) to return from the MapArray - Accepted values are "FIRST", "LAST", "ALL". + Accepted values are "first", "last", or "all". """ - def __init__(self, scalar, occurrence): - self._set_options(scalar, occurrence) + def __init__(self, query_key, occurrence): + self._set_options(query_key, occurrence) cdef class _ModeOptions(FunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 38ed1df70c0..7b80d1da296 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -141,7 +141,7 @@ def test_option_class_equality(): field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}), pa.KeyValueMetadata({"b": "2"})]), - pc.MapArrayLookupOptions(pa.scalar(1), "FIRST"), + pc.MapArrayLookupOptions(pa.scalar(1), "first"), pc.MatchSubstringOptions("pattern"), pc.ModeOptions(), pc.NullOptions(), @@ -2481,11 +2481,11 @@ def test_map_array_lookup(): type=pa.list_(pa.int32())) assert pc.map_array_lookup(arr, pa.scalar( - 'one', type=pa.utf8()), 'FIRST') == result_first + 'one', type=pa.utf8()), 'first') == result_first assert pc.map_array_lookup(arr, pa.scalar( - 'one', type=pa.utf8()), 'LAST') == result_last + 'one', type=pa.utf8()), 'last') == result_last assert pc.map_array_lookup(arr, pa.scalar( - 'one', type=pa.utf8()), 'ALL') == result_all + 'one', type=pa.utf8()), 'all') == result_all def test_struct_fields_options(): From a4164d469252adafb8ecf1c9ee59440ab5252598 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Wed, 2 Feb 2022 14:15:56 +0530 Subject: [PATCH 21/24] Rename to MapLookup --- cpp/src/arrow/compute/api_scalar.cc | 45 ++++---- cpp/src/arrow/compute/api_scalar.h | 20 ++-- .../arrow/compute/kernels/scalar_nested.cc | 63 +++++----- .../compute/kernels/scalar_nested_test.cc | 108 +++++++++--------- docs/source/cpp/compute.rst | 28 ++--- docs/source/python/api/compute.rst | 4 +- python/pyarrow/_compute.pyx | 16 +-- python/pyarrow/compute.py | 2 +- python/pyarrow/includes/libarrow.pxd | 22 ++-- python/pyarrow/tests/test_compute.py | 10 +- 10 files changed, 156 insertions(+), 162 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 955a2a55987..3a4d89e8e31 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -255,19 +255,19 @@ struct EnumTraits }; template <> -struct EnumTraits - : BasicEnumTraits { - static std::string name() { return "MapArrayLookupOptions::Occurrence"; } - static std::string value_name(compute::MapArrayLookupOptions::Occurrence value) { +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "MapLookupOptions::Occurrence"; } + static std::string value_name(compute::MapLookupOptions::Occurrence value) { switch (value) { - case compute::MapArrayLookupOptions::Occurrence::FIRST: + case compute::MapLookupOptions::Occurrence::FIRST: return "FIRST"; - case compute::MapArrayLookupOptions::Occurrence::LAST: + case compute::MapLookupOptions::Occurrence::LAST: return "LAST"; - case compute::MapArrayLookupOptions::Occurrence::ALL: + case compute::MapLookupOptions::Occurrence::ALL: return "ALL"; } return ""; @@ -307,9 +307,9 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType( DataMember("field_names", &MakeStructOptions::field_names), DataMember("field_nullability", &MakeStructOptions::field_nullability), DataMember("field_metadata", &MakeStructOptions::field_metadata)); -static auto kMapArrayLookupOptionsType = GetFunctionOptionsType( - DataMember("occurrence", &MapArrayLookupOptions::occurrence), - DataMember("query_key", &MapArrayLookupOptions::query_key)); +static auto kMapLookupOptionsType = GetFunctionOptionsType( + DataMember("occurrence", &MapLookupOptions::occurrence), + DataMember("query_key", &MapLookupOptions::query_key)); static auto kMatchSubstringOptionsType = GetFunctionOptionsType( DataMember("pattern", &MatchSubstringOptions::pattern), DataMember("ignore_case", &MatchSubstringOptions::ignore_case)); @@ -423,14 +423,14 @@ MakeStructOptions::MakeStructOptions(std::vector n) MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector()) {} constexpr char MakeStructOptions::kTypeName[]; -MapArrayLookupOptions::MapArrayLookupOptions(std::shared_ptr query_key, - Occurrence occurrence) - : FunctionOptions(internal::kMapArrayLookupOptionsType), +MapLookupOptions::MapLookupOptions(std::shared_ptr query_key, + Occurrence occurrence) + : FunctionOptions(internal::kMapLookupOptionsType), query_key(std::move(query_key)), occurrence(occurrence) {} -MapArrayLookupOptions::MapArrayLookupOptions() - : MapArrayLookupOptions(std::make_shared(), Occurrence::FIRST) {} -constexpr char MapArrayLookupOptions::kTypeName[]; +MapLookupOptions::MapLookupOptions() + : MapLookupOptions(std::make_shared(), Occurrence::FIRST) {} +constexpr char MapLookupOptions::kTypeName[]; MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case) : FunctionOptions(internal::kMatchSubstringOptionsType), @@ -587,7 +587,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); - DCHECK_OK(registry->AddFunctionOptionsType(kMapArrayLookupOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kMapLookupOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kNullOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType)); @@ -821,9 +821,8 @@ Result Week(const Datum& arg, WeekOptions options, ExecContext* ctx) { // ---------------------------------------------------------------------- // Structural transforms -Result MapArrayLookup(const Datum& arg, MapArrayLookupOptions options, - ExecContext* ctx) { - return CallFunction("map_array_lookup", {arg}, &options, ctx); +Result MapLookup(const Datum& arg, MapLookupOptions options, ExecContext* ctx) { + return CallFunction("map_lookup", {arg}, &options, ctx); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 5dd24dec3a6..8537183c369 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -470,8 +470,8 @@ class ARROW_EXPORT RandomOptions : public FunctionOptions { uint64_t seed; }; -/// Options for map_array_lookup function -class ARROW_EXPORT MapArrayLookupOptions : public FunctionOptions { +/// Options for map_lookup function +class ARROW_EXPORT MapLookupOptions : public FunctionOptions { public: enum Occurrence { /// Return the first matching value @@ -482,11 +482,10 @@ class ARROW_EXPORT MapArrayLookupOptions : public FunctionOptions { ALL }; - explicit MapArrayLookupOptions(std::shared_ptr query_key, - Occurrence occurrence); - MapArrayLookupOptions(); + explicit MapLookupOptions(std::shared_ptr query_key, Occurrence occurrence); + MapLookupOptions(); - constexpr static char const kTypeName[] = "MapArrayLookupOptions"; + constexpr static char const kTypeName[] = "MapLookupOptions"; /// The key to lookup in the map std::shared_ptr query_key; @@ -1376,11 +1375,11 @@ ARROW_EXPORT Result AssumeTimezone(const Datum& values, ExecContext* ctx = NULLPTR); /// \brief Finds either the FIRST, LAST, or ALL items with a key that matches the given -/// query key in a map array. +/// query key in a map. /// /// Returns an array of items for FIRST and LAST, and an array of list of items for ALL. /// -/// \param[in] map_array to look in +/// \param[in] map to look in /// \param[in] options to pass a query key and choose which matching keys to return /// (FIRST, LAST or ALL) /// \param[in] ctx the function execution context, optional @@ -1388,8 +1387,7 @@ ARROW_EXPORT Result AssumeTimezone(const Datum& values, /// /// \since 8.0.0 /// \note API not yet finalized -ARROW_EXPORT Result MapArrayLookup(const Datum& map_array, - MapArrayLookupOptions options, - ExecContext* ctx = NULLPTR); +ARROW_EXPORT Result MapLookup(const Datum& map, MapLookupOptions options, + ExecContext* ctx = NULLPTR); } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 9cce4ccebe8..0288ee26ac2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -430,7 +430,7 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray", {"*args"}, "MakeStructOptions"}; template -struct MapArrayLookupFunctor { +struct MapLookupFunctor { static Result GetOneMatchingIndex(const Array& keys, const Scalar& query_key_scalar, const bool* from_back) { @@ -473,13 +473,13 @@ struct MapArrayLookupFunctor { } static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& options = OptionsWrapper::Get(ctx); + const auto& options = OptionsWrapper::Get(ctx); const auto& query_key = options.query_key; const auto& occurrence = options.occurrence; const MapArray map_array(batch[0].array()); std::unique_ptr builder; - if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { + if (occurrence == MapLookupOptions::Occurrence::ALL) { RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(map_array.map_type()->item_type()), &builder)); auto list_builder = checked_cast(builder.get()); @@ -523,7 +523,7 @@ struct MapArrayLookupFunctor { auto map = map_array.value_slice(map_array_idx); auto keys = checked_cast(*map).field(0); auto items = checked_cast(*map).field(1); - bool from_back = (occurrence == MapArrayLookupOptions::LAST); + bool from_back = (occurrence == MapLookupOptions::LAST); ARROW_ASSIGN_OR_RAISE(int64_t key_match_idx, GetOneMatchingIndex(*keys, *query_key, &from_back)); @@ -541,7 +541,7 @@ struct MapArrayLookupFunctor { } static Status ExecMapScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& options = OptionsWrapper::Get(ctx); + const auto& options = OptionsWrapper::Get(ctx); const auto& query_key = options.query_key; const auto& occurrence = options.occurrence; @@ -550,7 +550,7 @@ struct MapArrayLookupFunctor { const auto& map_scalar = batch[0].scalar_as(); if (ARROW_PREDICT_FALSE(!map_scalar.is_valid)) { - if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { + if (options.occurrence == MapLookupOptions::Occurrence::ALL) { out->value = MakeNullScalar(list(item_type)); } else { out->value = MakeNullScalar(item_type); @@ -562,7 +562,7 @@ struct MapArrayLookupFunctor { const std::shared_ptr keys = struct_array.field(0); const std::shared_ptr items = struct_array.field(1); - if (occurrence == MapArrayLookupOptions::Occurrence::ALL) { + if (occurrence == MapLookupOptions::Occurrence::ALL) { bool found_at_least_one_key = false; std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items->type(), &builder)); @@ -579,7 +579,7 @@ struct MapArrayLookupFunctor { ARROW_ASSIGN_OR_RAISE(out->value, MakeScalar(list(items->type()), result)); } } else { /* occurrence == FIRST || LAST */ - bool from_back = (occurrence == MapArrayLookupOptions::LAST); + bool from_back = (occurrence == MapLookupOptions::LAST); ARROW_ASSIGN_OR_RAISE(int64_t key_match_idx, GetOneMatchingIndex(*keys, *query_key, &from_back)); @@ -593,32 +593,32 @@ struct MapArrayLookupFunctor { } }; -Result ResolveMapArrayLookupType(KernelContext* ctx, - const std::vector& descrs) { - const auto& options = OptionsWrapper::Get(ctx); +Result ResolveMapLookupType(KernelContext* ctx, + const std::vector& descrs) { + const auto& options = OptionsWrapper::Get(ctx); std::shared_ptr type = descrs.front().type; std::shared_ptr item_type = checked_cast(*type).item_type(); std::shared_ptr key_type = checked_cast(*type).key_type(); if (!options.query_key) { - return Status::Invalid("map_array_lookup: query_key can't be empty."); + return Status::Invalid("map_lookup: query_key can't be empty."); } else if (!options.query_key->is_valid) { - return Status::Invalid("map_array_lookup: query_key can't be null."); + return Status::Invalid("map_lookup: query_key can't be null."); } else if (!options.query_key->type || !options.query_key->type->Equals(key_type)) { return Status::TypeError( - "map_array_lookup: query_key type and MapArray key_type don't match. Expected " + "map_lookup: query_key type and Map key_type don't match. Expected " "type: ", *key_type, ", but got type: ", *options.query_key->type); } - if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) { + if (options.occurrence == MapLookupOptions::Occurrence::ALL) { return ValueDescr(list(item_type), descrs.front().shape); } else { /* occurrence == FIRST || LAST */ return ValueDescr(item_type, descrs.front().shape); } } -struct ResolveMapArrayLookup { +struct ResolveMapLookup { KernelContext* ctx; const ExecBatch& batch; Datum* out; @@ -626,9 +626,9 @@ struct ResolveMapArrayLookup { template Status Execute() { if (batch[0].kind() == Datum::SCALAR) { - return MapArrayLookupFunctor::ExecMapScalar(ctx, batch, out); + return MapLookupFunctor::ExecMapScalar(ctx, batch, out); } - return MapArrayLookupFunctor::ExecMapArray(ctx, batch, out); + return MapLookupFunctor::ExecMapArray(ctx, batch, out); } template @@ -665,28 +665,27 @@ struct ResolveMapArrayLookup { } static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ResolveMapArrayLookup visitor{ctx, batch, out}; + ResolveMapLookup visitor{ctx, batch, out}; return VisitTypeInline(*checked_cast(*batch[0].type()).key_type(), &visitor); } }; -void AddMapArrayLookupKernels(ScalarFunction* func) { - ScalarKernel kernel({InputType(Type::MAP)}, OutputType(ResolveMapArrayLookupType), - ResolveMapArrayLookup::Exec, - OptionsWrapper::Init); +void AddMapLookupKernels(ScalarFunction* func) { + ScalarKernel kernel({InputType(Type::MAP)}, OutputType(ResolveMapLookupType), + ResolveMapLookup::Exec, OptionsWrapper::Init); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; DCHECK_OK(func->AddKernel(std::move(kernel))); } -const FunctionDoc map_array_lookup_doc{ - "Find the items corresponding to a given key in a MapArray", - ("For a given query key (passed via MapArrayLookupOptions), extract\n" - "either the FIRST, LAST or ALL items from a MapArray that have\n" +const FunctionDoc map_lookup_doc{ + "Find the items corresponding to a given key in a Map", + ("For a given query key (passed via MapLookupOptions), extract\n" + "either the FIRST, LAST or ALL items from a Map that have\n" "matching keys."), {"container"}, - "MapArrayLookupOptions", + "MapLookupOptions", /*options_required=*/true}; } // namespace @@ -713,10 +712,10 @@ void RegisterScalarNested(FunctionRegistry* registry) { AddStructFieldKernels(struct_field.get()); DCHECK_OK(registry->AddFunction(std::move(struct_field))); - auto map_array_lookup = std::make_shared( - "map_array_lookup", Arity::Unary(), &map_array_lookup_doc); - AddMapArrayLookupKernels(map_array_lookup.get()); - DCHECK_OK(registry->AddFunction(std::move(map_array_lookup))); + auto map_lookup = + std::make_shared("map_lookup", Arity::Unary(), &map_lookup_doc); + AddMapLookupKernels(map_lookup.get()); + DCHECK_OK(registry->AddFunction(std::move(map_lookup))); static MakeStructOptions kDefaultMakeStructOptions; auto make_struct_function = std::make_shared( diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 85b2a4a5108..050d070358a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -225,23 +225,23 @@ TEST(TestScalarNested, StructField) { } } -void CheckMapArrayLookupWithDifferentOptions( - const std::shared_ptr& map, const std::shared_ptr& query_key, - const std::shared_ptr& expected_all, - const std::shared_ptr& expected_first, - const std::shared_ptr& expected_last) { - MapArrayLookupOptions all_matches(query_key, MapArrayLookupOptions::ALL); - MapArrayLookupOptions first_matches(query_key, MapArrayLookupOptions::FIRST); - MapArrayLookupOptions last_matches(query_key, MapArrayLookupOptions::LAST); - - CheckScalar("map_array_lookup", {map}, expected_all, &all_matches); - CheckScalar("map_array_lookup", {map}, expected_first, &first_matches); - CheckScalar("map_array_lookup", {map}, expected_last, &last_matches); +void CheckMapLookupWithDifferentOptions(const std::shared_ptr& map, + const std::shared_ptr& query_key, + const std::shared_ptr& expected_all, + const std::shared_ptr& expected_first, + const std::shared_ptr& expected_last) { + MapLookupOptions all_matches(query_key, MapLookupOptions::ALL); + MapLookupOptions first_matches(query_key, MapLookupOptions::FIRST); + MapLookupOptions last_matches(query_key, MapLookupOptions::LAST); + + CheckScalar("map_lookup", {map}, expected_all, &all_matches); + CheckScalar("map_lookup", {map}, expected_first, &first_matches); + CheckScalar("map_lookup", {map}, expected_last, &last_matches); } -class TestMapArrayLookupKernel : public ::testing::Test {}; +class TestMapLookupKernel : public ::testing::Test {}; -TEST_F(TestMapArrayLookupKernel, Basic) { +TEST_F(TestMapLookupKernel, Basic) { auto type = map(utf8(), int32()); const char* input = R"( [ @@ -253,14 +253,14 @@ TEST_F(TestMapArrayLookupKernel, Basic) { ])"; auto map_array = ArrayFromJSON(type, input); - CheckMapArrayLookupWithDifferentOptions( + CheckMapLookupWithDifferentOptions( map_array, MakeScalar("foo"), ArrayFromJSON(list(int32()), R"([[99, 3], null, [101, 22], null])"), ArrayFromJSON(int32(), R"([99, null, 101, null])"), ArrayFromJSON(int32(), R"([3, null, 22, null])")); } -TEST_F(TestMapArrayLookupKernel, NestedItems) { +TEST_F(TestMapLookupKernel, NestedItems) { auto type = map(utf8(), map(int16(), int16())); const char* input = R"( [ @@ -358,11 +358,11 @@ TEST_F(TestMapArrayLookupKernel, NestedItems) { null ])"); - CheckMapArrayLookupWithDifferentOptions(map_array, MakeScalar("foo"), expected_all, - expected_first, expected_last); + CheckMapLookupWithDifferentOptions(map_array, MakeScalar("foo"), expected_all, + expected_first, expected_last); } -TEST_F(TestMapArrayLookupKernel, BooleanKey) { +TEST_F(TestMapLookupKernel, BooleanKey) { auto true_scalar = ScalarFromJSON(boolean(), R"(true)"); auto map_type = map(boolean(), int32()); const char* input = R"( @@ -393,11 +393,11 @@ TEST_F(TestMapArrayLookupKernel, BooleanKey) { auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); - CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, true_scalar, expected_all, - expected_first, expected_last); + CheckMapLookupWithDifferentOptions(map_array_tweaked, true_scalar, expected_all, + expected_first, expected_last); } -TEST_F(TestMapArrayLookupKernel, MonthDayNanoIntervalKeys) { +TEST_F(TestMapLookupKernel, MonthDayNanoIntervalKeys) { auto key_type = month_day_nano_interval(); auto map_type = map(key_type, utf8()); auto key_scalar = ScalarFromJSON(month_day_nano_interval(), R"([1, 2, -3])"); @@ -444,11 +444,11 @@ TEST_F(TestMapArrayLookupKernel, MonthDayNanoIntervalKeys) { ] )"); - CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, key_scalar, expected_all, - expected_first, expected_last); + CheckMapLookupWithDifferentOptions(map_array_tweaked, key_scalar, expected_all, + expected_first, expected_last); } -TEST_F(TestMapArrayLookupKernel, FixedSizeBinary) { +TEST_F(TestMapLookupKernel, FixedSizeBinary) { auto key_type = fixed_size_binary(6); auto map_type = map(key_type, int32()); auto sheesh_scalar = ScalarFromJSON(key_type, R"("sheesh")"); @@ -480,11 +480,11 @@ TEST_F(TestMapArrayLookupKernel, FixedSizeBinary) { auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); - CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, sheesh_scalar, expected_all, - expected_first, expected_last); + CheckMapLookupWithDifferentOptions(map_array_tweaked, sheesh_scalar, expected_all, + expected_first, expected_last); } -TEST_F(TestMapArrayLookupKernel, Errors) { +TEST_F(TestMapLookupKernel, Errors) { auto map_type = map(int32(), utf8()); const char* input = R"( [ @@ -515,27 +515,25 @@ TEST_F(TestMapArrayLookupKernel, Errors) { }))}; auto unsupported_scalar = ScalarFromJSON(struct_(fields), R"([1, "a", [10, 10.0]])"); - MapArrayLookupOptions unsupported(unsupported_scalar, MapArrayLookupOptions::FIRST); - MapArrayLookupOptions all(query_key_int16, MapArrayLookupOptions::ALL); - MapArrayLookupOptions first(query_key_int16, MapArrayLookupOptions::FIRST); - MapArrayLookupOptions last(query_key_int16, MapArrayLookupOptions::LAST); - MapArrayLookupOptions empty_key(nullptr, MapArrayLookupOptions::FIRST); - MapArrayLookupOptions null_key(MakeNullScalar(int32()), MapArrayLookupOptions::FIRST); + MapLookupOptions unsupported(unsupported_scalar, MapLookupOptions::FIRST); + MapLookupOptions all(query_key_int16, MapLookupOptions::ALL); + MapLookupOptions first(query_key_int16, MapLookupOptions::FIRST); + MapLookupOptions last(query_key_int16, MapLookupOptions::LAST); + MapLookupOptions empty_key(nullptr, MapLookupOptions::FIRST); + MapLookupOptions null_key(MakeNullScalar(int32()), MapLookupOptions::FIRST); for (auto option : {unsupported, all, first, last}) { - ASSERT_RAISES(TypeError, CallFunction("map_array_lookup", {map_array}, &option)); + ASSERT_RAISES(TypeError, CallFunction("map_lookup", {map_array}, &option)); } - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("key can't be empty"), - CallFunction("map_array_lookup", {map_array}, &empty_key)); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("key can't be null"), - CallFunction("map_array_lookup", {map_array}, &null_key)); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("key can't be empty"), + CallFunction("map_lookup", {map_array}, &empty_key)); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("key can't be null"), + CallFunction("map_lookup", {map_array}, &null_key)); } template -class TestMapArrayLookupIntegralKeys : public ::testing ::Test { +class TestMapLookupIntegralKeys : public ::testing ::Test { protected: std::shared_ptr type_singleton() const { std::shared_ptr type = default_type_instance(); @@ -543,9 +541,9 @@ class TestMapArrayLookupIntegralKeys : public ::testing ::Test { } }; -TYPED_TEST_SUITE(TestMapArrayLookupIntegralKeys, PhysicalIntegralArrowTypes); +TYPED_TEST_SUITE(TestMapLookupIntegralKeys, PhysicalIntegralArrowTypes); -TYPED_TEST(TestMapArrayLookupIntegralKeys, StringItems) { +TYPED_TEST(TestMapLookupIntegralKeys, StringItems) { auto map_type = this->type_singleton(); const char* input = R"( [ @@ -584,12 +582,12 @@ TYPED_TEST(TestMapArrayLookupIntegralKeys, StringItems) { auto expected_last = ArrayFromJSON(utf8(), R"(["last_one", null, "no more ones!", null, null, null])"); - CheckMapArrayLookupWithDifferentOptions( + CheckMapLookupWithDifferentOptions( map_array_tweaked, MakeScalar(default_type_instance(), 1).ValueOrDie(), expected_all, expected_first, expected_last); } template -class TestMapArrayLookupDecimalKeys : public ::testing ::Test { +class TestMapLookupDecimalKeys : public ::testing ::Test { protected: std::shared_ptr type_singleton() const { return std::make_shared(/*precision=*/5, @@ -597,9 +595,9 @@ class TestMapArrayLookupDecimalKeys : public ::testing ::Test { } }; -TYPED_TEST_SUITE(TestMapArrayLookupDecimalKeys, DecimalArrowTypes); +TYPED_TEST_SUITE(TestMapLookupDecimalKeys, DecimalArrowTypes); -TYPED_TEST(TestMapArrayLookupDecimalKeys, StringItems) { +TYPED_TEST(TestMapLookupDecimalKeys, StringItems) { auto type = this->type_singleton(); auto map_type = map(type, utf8()); auto key_scalar = DecimalScalarFromJSON(type, R"("1.2345")"); @@ -642,21 +640,21 @@ TYPED_TEST(TestMapArrayLookupDecimalKeys, StringItems) { null ] )"); - CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, key_scalar, expected_all, - expected_first, expected_last); + CheckMapLookupWithDifferentOptions(map_array_tweaked, key_scalar, expected_all, + expected_first, expected_last); } template -class TestMapArrayLookupBinaryKeys : public ::testing ::Test { +class TestMapLookupBinaryKeys : public ::testing ::Test { protected: std::shared_ptr type_singleton() const { return TypeTraits::type_singleton(); } }; -TYPED_TEST_SUITE(TestMapArrayLookupBinaryKeys, BaseBinaryArrowTypes); +TYPED_TEST_SUITE(TestMapLookupBinaryKeys, BaseBinaryArrowTypes); -TYPED_TEST(TestMapArrayLookupBinaryKeys, IntegralItems) { +TYPED_TEST(TestMapLookupBinaryKeys, IntegralItems) { auto key_type = this->type_singleton(); auto sheesh_scalar = ScalarFromJSON(key_type, R"("sheesh")"); auto map_type = map(key_type, int32()); @@ -688,8 +686,8 @@ TYPED_TEST(TestMapArrayLookupBinaryKeys, IntegralItems) { auto expected_first = ArrayFromJSON(int32(), "[99, null, 67, null, null, null]"); auto expected_last = ArrayFromJSON(int32(), "[8, null, 80, null, null, null]"); - CheckMapArrayLookupWithDifferentOptions(map_array_tweaked, sheesh_scalar, expected_all, - expected_first, expected_last); + CheckMapLookupWithDifferentOptions(map_array_tweaked, sheesh_scalar, expected_all, + expected_first, expected_last); } struct { diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 6d639d2d263..b84205ce7b6 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1639,19 +1639,19 @@ in the respective option classes. Structural transforms ~~~~~~~~~~~~~~~~~~~~~ -+---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=====================+============+=====================================+==================+=================================+========+ -| list_element | Binary | List-like (Arg 0), Integral (Arg 1) | List value type | | \(1) | -+---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ -| list_flatten | Unary | List-like | List value type | | \(2) | -+---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ -| list_parent_indices | Unary | List-like | Int64 | | \(3) | -+---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ -| map_array_lookup | Unary | Map | Computed | :struct:`MapArrayLookupOptions` | \(4) | -+---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ -| struct_field | Unary | Struct or Union | Computed | :struct:`StructFieldOptions` | \(5) | -+---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+ ++---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=====================+============+=====================================+==================+==============================+========+ +| list_element | Binary | List-like (Arg 0), Integral (Arg 1) | List value type | | \(1) | ++---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ +| list_flatten | Unary | List-like | List value type | | \(2) | ++---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ +| list_parent_indices | Unary | List-like | Int64 | | \(3) | ++---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ +| map_lookup | Unary | Map | Computed | :struct:`MapLookupOptions` | \(4) | ++---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ +| struct_field | Unary | Struct or Union | Computed | :struct:`StructFieldOptions` | \(5) | ++---------------------+------------+-------------------------------------+------------------+------------------------------+--------+ * \(1) Output is an array of the same length as the input list array. The output values are the values at the specified index of each child list. @@ -1665,7 +1665,7 @@ Structural transforms are discarded. * \(4) Extract either the ``FIRST``, ``LAST`` or ``ALL`` items from a - map array whose key match the given query key passed via options. + map whose key match the given query key passed via options. The output type is an Array of items for the ``FIRST``/``LAST`` options and an Array of List of items for the ``ALL`` option. diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index ab9e1497eef..b6ba414c5bb 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -482,7 +482,7 @@ Structural Transforms list_parent_indices list_value_length make_struct - map_array_lookup + map_lookup replace_with_mask struct_field @@ -505,7 +505,7 @@ Compute Options IndexOptions JoinOptions MakeStructOptions - MapArrayLookupOptions + MapLookupOptions MatchSubstringOptions ModeOptions NullOptions diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 9ddbd259783..34437b08f1b 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1333,17 +1333,17 @@ class IndexOptions(_IndexOptions): self._set_options(value) -cdef class _MapArrayLookupOptions(FunctionOptions): +cdef class _MapLookupOptions(FunctionOptions): _occurrence_map = { - "all": CMapArrayLookupOccurrence_ALL, - "first": CMapArrayLookupOccurrence_FIRST, - "last": CMapArrayLookupOccurrence_LAST, + "all": CMapLookupOccurrence_ALL, + "first": CMapLookupOccurrence_FIRST, + "last": CMapLookupOccurrence_LAST, } def _set_options(self, query_key, occurrence): try: self.wrapped.reset( - new CMapArrayLookupOptions( + new CMapLookupOptions( pyarrow_unwrap_scalar(query_key), self._occurrence_map[occurrence] ) @@ -1353,16 +1353,16 @@ cdef class _MapArrayLookupOptions(FunctionOptions): "Should either be first, last, or all") -class MapArrayLookupOptions(_MapArrayLookupOptions): +class MapLookupOptions(_MapLookupOptions): """ - Options for the `map_array_lookup` function. + Options for the `map_lookup` function. Parameters ---------- query_key : Scalar The key to search for. occurrence : str - The occurrence(s) to return from the MapArray + The occurrence(s) to return from the Map Accepted values are "first", "last", or "all". """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index cbe7c24532b..ea1ea92dbaf 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -41,7 +41,7 @@ IndexOptions, JoinOptions, MakeStructOptions, - MapArrayLookupOptions, + MapLookupOptions, MatchSubstringOptions, ModeOptions, NullOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index dc843a11141..2f1f0f00c2c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2147,17 +2147,17 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CIndexOptions(shared_ptr[CScalar] value) shared_ptr[CScalar] value - cdef enum CMapArrayLookupOccurrence \ - "arrow::compute::MapArrayLookupOptions::Occurrence": - CMapArrayLookupOccurrence_ALL "arrow::compute::MapArrayLookupOptions::ALL" - CMapArrayLookupOccurrence_FIRST "arrow::compute::MapArrayLookupOptions::FIRST" - CMapArrayLookupOccurrence_LAST "arrow::compute::MapArrayLookupOptions::LAST" - - cdef cppclass CMapArrayLookupOptions \ - "arrow::compute::MapArrayLookupOptions"(CFunctionOptions): - CMapArrayLookupOptions(shared_ptr[CScalar] query_key, - CMapArrayLookupOccurrence occurrence) - CMapArrayLookupOccurrence occurrence + cdef enum CMapLookupOccurrence \ + "arrow::compute::MapLookupOptions::Occurrence": + CMapLookupOccurrence_ALL "arrow::compute::MapLookupOptions::ALL" + CMapLookupOccurrence_FIRST "arrow::compute::MapLookupOptions::FIRST" + CMapLookupOccurrence_LAST "arrow::compute::MapLookupOptions::LAST" + + cdef cppclass CMapLookupOptions \ + "arrow::compute::MapLookupOptions"(CFunctionOptions): + CMapLookupOptions(shared_ptr[CScalar] query_key, + CMapLookupOccurrence occurrence) + CMapLookupOccurrence occurrence shared_ptr[CScalar] query_key cdef cppclass CMakeStructOptions \ diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 7b80d1da296..1f170f078ed 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -141,7 +141,7 @@ def test_option_class_equality(): field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}), pa.KeyValueMetadata({"b": "2"})]), - pc.MapArrayLookupOptions(pa.scalar(1), "first"), + pc.MapLookupOptions(pa.scalar(1), "first"), pc.MatchSubstringOptions("pattern"), pc.ModeOptions(), pc.NullOptions(), @@ -2471,7 +2471,7 @@ def test_make_struct(): pc.make_struct(field_names=['one', 'two']) -def test_map_array_lookup(): +def test_map_lookup(): ty = pa.map_(pa.utf8(), pa.int32()) arr = pa.array([[('one', 1), ('two', 2)], [('none', 3)], [], [('one', 5), ('one', 7)], None], type=ty) @@ -2480,11 +2480,11 @@ def test_map_array_lookup(): result_all = pa.array([[1], None, None, [5, 7], None], type=pa.list_(pa.int32())) - assert pc.map_array_lookup(arr, pa.scalar( + assert pc.map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'first') == result_first - assert pc.map_array_lookup(arr, pa.scalar( + assert pc.map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'last') == result_last - assert pc.map_array_lookup(arr, pa.scalar( + assert pc.map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'all') == result_all From 313ce8e7101607495b86555a9c2876625eb3300e Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Wed, 2 Feb 2022 15:04:17 +0530 Subject: [PATCH 22/24] Tiny lint --- python/pyarrow/includes/libarrow.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2f1f0f00c2c..daffad2ddac 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2156,7 +2156,7 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CMapLookupOptions \ "arrow::compute::MapLookupOptions"(CFunctionOptions): CMapLookupOptions(shared_ptr[CScalar] query_key, - CMapLookupOccurrence occurrence) + CMapLookupOccurrence occurrence) CMapLookupOccurrence occurrence shared_ptr[CScalar] query_key From a8c5af599a979439faa81abc7bed10312ddbc743 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Wed, 2 Feb 2022 23:09:45 +0530 Subject: [PATCH 23/24] Update type resolver --- cpp/src/arrow/compute/kernels/scalar_nested.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 0288ee26ac2..cf8a7e08b03 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -604,7 +604,7 @@ Result ResolveMapLookupType(KernelContext* ctx, return Status::Invalid("map_lookup: query_key can't be empty."); } else if (!options.query_key->is_valid) { return Status::Invalid("map_lookup: query_key can't be null."); - } else if (!options.query_key->type || !options.query_key->type->Equals(key_type)) { + } else if (!options.query_key->type->Equals(key_type)) { return Status::TypeError( "map_lookup: query_key type and Map key_type don't match. Expected " "type: ", @@ -651,10 +651,7 @@ struct ResolveMapLookup { return Execute(); } - template - enable_if_same Visit(const KeyType& key) { - return Execute(); - } + Status Visit(const FixedSizeBinaryType& key) { return Execute(); } Status Visit(const MonthDayNanoIntervalType& key) { return Execute(); From 92f86109f3c82207ee5db354f7c37680b313b24d Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Thu, 3 Feb 2022 13:43:00 +0530 Subject: [PATCH 24/24] Remove unnecessary tests --- .../compute/kernels/scalar_nested_test.cc | 121 ------------------ 1 file changed, 121 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 050d070358a..c35c8f35028 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -241,127 +241,6 @@ void CheckMapLookupWithDifferentOptions(const std::shared_ptr& map, class TestMapLookupKernel : public ::testing::Test {}; -TEST_F(TestMapLookupKernel, Basic) { - auto type = map(utf8(), int32()); - const char* input = R"( - [ - [["foo", 99], ["bar", 1], ["hello", 2], ["foo", 3], ["lets go", 5], ["what now?", 8]], - null, - [["nothing", null], ["hat", null], ["foo", 101], ["sorry", 1], ["dip", null], - ["foo", 22]], - [] - ])"; - auto map_array = ArrayFromJSON(type, input); - - CheckMapLookupWithDifferentOptions( - map_array, MakeScalar("foo"), - ArrayFromJSON(list(int32()), R"([[99, 3], null, [101, 22], null])"), - ArrayFromJSON(int32(), R"([99, null, 101, null])"), - ArrayFromJSON(int32(), R"([3, null, 22, null])")); -} - -TEST_F(TestMapLookupKernel, NestedItems) { - auto type = map(utf8(), map(int16(), int16())); - const char* input = R"( - [ - [ - [ - "just", - [[0, 0], [1, 1]] - ], - [ - "random", - [[2, 2], [3, 3]] - ], - [ - "foo", - [[4, 4], [5, 5]] - ], - [ - "values", - [[6, 6], [7, 7]] - ], - [ - "foo", - [[8, 8], [9, 9]] - ], - [ - "point", - [[10, 10], [11, 11]] - ], - [ - "foo", - [[12, 12], [13, 13]] - ] - ], - null, - [ - [ - "yet", - [[0, 1], [1, 2]] - ], - [ - "more", - [[2, 3], [3, 4]] - ], - [ - "foo", - [[4, 5], [5, 6]] - ], - [ - "random", - [[6, 7], [7, 8]] - ], - [ - "foo", - [[8, 9], [9, 10]] - ], - [ - "values", - [[10, 11], [11, 12]] - ], - [ - "foo", - [[12, 13], [13, 14]] - ] - ], - [] - ] - )"; - const auto map_array = ArrayFromJSON(type, input); - - const auto expected_all = ArrayFromJSON(list(map(int16(), int16())), R"( - [ - [ - [[4, 4], [5, 5]], [[8, 8], [9, 9]], - [[12, 12], [13, 13]] - ], - null, - [ - [[4, 5], [5, 6]], [[8, 9], [9, 10]], - [[12, 13], [13, 14]] - ], - null - ])"); - const auto expected_first = ArrayFromJSON(map(int16(), int16()), R"( - [ - [[4, 4], [5, 5]], - null, - [[4, 5], [5, 6]], - null - ])"); - const auto expected_last = ArrayFromJSON(map(int16(), int16()), R"( - [ - [[12, 12], [13, 13]], - null, - [[12, 13], [13, 14]], - null - ])"); - - CheckMapLookupWithDifferentOptions(map_array, MakeScalar("foo"), expected_all, - expected_first, expected_last); -} - TEST_F(TestMapLookupKernel, BooleanKey) { auto true_scalar = ScalarFromJSON(boolean(), R"(true)"); auto map_type = map(boolean(), int32());