From 7a5dfae41cb365cb333f9d28b161cccdb8f11951 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui Date: Thu, 18 Apr 2024 22:53:35 +0800 Subject: [PATCH 1/7] 1. Expose recursive flatten for logical lists on list_flatten kernel function 2. Support [Large]ListView for some kernel functions: list_flatten, list_value_length, list_element 3. Support recursive flatten for pyarrow bindings and simplify [Large]ListView's pyarrow bindings 4. Refactor vector_nested_test.cc to better support [Large]ListView types. --- cpp/src/arrow/compute/api_vector.cc | 7 + cpp/src/arrow/compute/api_vector.h | 12 + .../arrow/compute/kernels/codegen_internal.cc | 10 +- .../arrow/compute/kernels/scalar_nested.cc | 49 +++- .../compute/kernels/scalar_nested_test.cc | 17 +- .../arrow/compute/kernels/vector_nested.cc | 50 +++- .../compute/kernels/vector_nested_test.cc | 187 +++++++++++---- python/pyarrow/_compute.pyx | 20 ++ python/pyarrow/array.pxi | 215 +++++++----------- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 + python/pyarrow/tests/test_array.py | 14 +- python/pyarrow/tests/test_compute.py | 1 + 13 files changed, 381 insertions(+), 207 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index d47ee42ebf2..6bf0fac429f 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -153,6 +153,8 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("tiebreaker", &RankOptions::tiebreaker)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); +static auto kListFlattenOptionsType = GetFunctionOptionsType( + DataMember("recursively", &ListFlattenOptions::recursively)); } // namespace } // namespace internal @@ -224,6 +226,10 @@ PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; 
+ListFlattenOptions::ListFlattenOptions(bool recursively) + : FunctionOptions(internal::kListFlattenOptionsType), recursively(recursively) {} +constexpr char ListFlattenOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -237,6 +243,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 919572f16ee..ff88203ed22 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -245,6 +245,18 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { int64_t periods = 1; }; +/// \brief Options for list_flatten function +class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { + public: + explicit ListFlattenOptions(bool recursively = false); + static constexpr char const kTypeName[] = "ListFlattenOptions"; + static ListFlattenOptions Defaults() { return ListFlattenOptions(); } + + /// Control the version of 'Flatten' that keeps recursively flattening + /// until an array of non-list values is reached. 
+ bool recursively = false; +}; + /// @} /// \brief Filter with a boolean selection filter diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 00a833742f9..abde2175de8 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -57,8 +57,14 @@ Result LastType(KernelContext*, const std::vector& types } Result ListValuesType(KernelContext*, const std::vector& args) { - const auto& list_type = checked_cast(*args[0].type); - return list_type.value_type().get(); + auto list_type = checked_cast(args[0].type); + auto value_type = list_type->value_type().get(); + for (auto value_kind = value_type->id(); + is_list(value_kind) || is_list_view(value_kind); value_kind = value_type->id()) { + list_type = checked_cast(list_type->value_type().get()); + value_type = list_type->value_type().get(); + } + return value_type; } void EnsureDictionaryDecoded(std::vector* types) { diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 733ab9c0dc2..35320e285b9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -39,12 +39,26 @@ namespace { template Status ListValueLength(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& arr = batch[0].array; + const auto kind = arr.type->id(); ArraySpan* out_arr = out->array_span_mutable(); auto out_values = out_arr->GetValues(1); - const offset_type* offsets = arr.GetValues(1); - // Offsets are always well-defined and monotonic, even for null values - for (int64_t i = 0; i < arr.length; ++i) { - *out_values++ = offsets[i + 1] - offsets[i]; + if (is_list_view(kind)) { + // [Large]ListView's buffer layout: + // buffer1 : valid bitmap + // buffer2 : elements' start offset in current array + // buffer3 : elements' size + // + // It's unnecessary to calculate according offsets. 
+ const auto* sizes = arr.GetValues(2); + for (int64_t i = 0; i < arr.length; i++) { + *out_values++ = sizes[i]; + } + } else { + const offset_type* offsets = arr.GetValues(1); + // Offsets are always well-defined and monotonic, even for null values + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = offsets[i + 1] - offsets[i]; + } } return Status::OK(); } @@ -59,6 +73,24 @@ Status FixedSizeListValueLength(KernelContext* ctx, const ExecSpan& batch, return Status::OK(); } +template +void AddListValueLengthKernel(ScalarFunction* func, + const std::shared_ptr& out_type) { + auto in_type = {InputType(InListType::type_id)}; + ScalarKernel kernel(in_type, out_type, ListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddListValueLengthKernels(ScalarFunction* func) { + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + + DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), + FixedSizeListValueLength)); +} + const FunctionDoc list_value_length_doc{ "Compute list lengths", ("`lists` must have a list-like type.\n" @@ -399,6 +431,8 @@ void AddListElementKernels(ScalarFunction* func) { void AddListElementKernels(ScalarFunction* func) { AddListElementKernels(func); AddListElementKernels(func); + AddListElementKernels(func); + AddListElementKernels(func); AddListElementKernels(func); } @@ -824,12 +858,7 @@ const FunctionDoc map_lookup_doc{ void RegisterScalarNested(FunctionRegistry* registry) { auto list_value_length = std::make_shared( "list_value_length", Arity::Unary(), list_value_length_doc); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(), - ListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), - FixedSizeListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(), - ListValueLength)); + 
AddListValueLengthKernels(list_value_length.get()); DCHECK_OK(registry->AddFunction(std::move(list_value_length))); auto list_element = diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b..32bea824695 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -30,11 +30,21 @@ namespace arrow { namespace compute { static std::shared_ptr GetOffsetType(const DataType& type) { - return type.id() == Type::LIST ? int32() : int64(); + switch (type.id()) { + case Type::LIST: + case Type::LIST_VIEW: + return int32(); + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return int64(); + default: + Unreachable("Unexpected type"); + } } TEST(TestScalarNested, ListValueLength) { - for (auto ty : {list(int32()), large_list(int32())}) { + for (auto ty : {list(int32()), large_list(int32()), list_view(int32()), + large_list_view(int32())}) { CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]", GetOffsetType(*ty), "[3, null, 2, 0]"); } @@ -47,7 +57,8 @@ TEST(TestScalarNested, ListValueLength) { TEST(TestScalarNested, ListElementNonFixedListWithNulls) { auto sample = "[[7, 5, 81], [6, null, 4, 7, 8], [3, 12, 2, 0], [1, 9], null]"; for (auto ty : NumericTypes()) { - for (auto list_type : {list(ty), large_list(ty)}) { + for (auto list_type : + {list(ty), large_list(ty), list_view(ty), large_list_view(ty)}) { auto input = ArrayFromJSON(list_type, sample); auto null_input = ArrayFromJSON(list_type, "[null]"); for (auto index_type : IntTypes()) { diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 08930e589f7..53ceec41d6a 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -18,6 +18,7 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include 
"arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/visit_type_inline.h" @@ -29,8 +30,16 @@ namespace { template Status ListFlatten(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + auto recursively = OptionsWrapper::Get(ctx).recursively; typename TypeTraits::ArrayType list_array(batch[0].array.ToArrayData()); - ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool())); + + std::shared_ptr result; + if (!recursively) { + ARROW_ASSIGN_OR_RAISE(result, list_array.Flatten(ctx->memory_pool())); + } else { + ARROW_ASSIGN_OR_RAISE(result, list_array.FlattenRecursively(ctx->memory_pool())); + } + out->value = std::move(result->data()); return Status::OK(); } @@ -70,6 +79,10 @@ struct ListParentIndicesArray { Status Visit(const LargeListType& type) { return VisitList(type); } + Status Visit(const ListViewType& type) { return VisitList(type); } + + Status Visit(const LargeListViewType& type) { return VisitList(type); } + Status Visit(const FixedSizeListType& type) { using offset_type = typename FixedSizeListType::offset_type; const offset_type slot_length = type.list_size(); @@ -110,7 +123,7 @@ const FunctionDoc list_flatten_doc( ("`lists` must have a list-like type.\n" "Return an array with the top list level flattened.\n" "Top-level null values in `lists` do not emit anything in the input."), - {"lists"}); + {"lists"}, "ListFlattenOptions"); const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", @@ -153,17 +166,34 @@ class ListParentIndicesFunction : public MetaFunction { } }; +const ListFlattenOptions* GetDefaultListFlattenOptions() { + static const auto kDefaultListFlattenOptions = ListFlattenOptions::Defaults(); + return &kDefaultListFlattenOptions; +} + +template +void AddBaseListFlattenKernels(VectorFunction* func) { + auto in_type = {InputType(InListType::type_id)}; + auto out_type = OutputType(ListValuesType); + VectorKernel 
kernel(in_type, out_type, ListFlatten, + OptionsWrapper::Init); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddBaseListFlattenKernels(VectorFunction* func) { + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); +} + } // namespace void RegisterVectorNested(FunctionRegistry* registry) { - auto flatten = - std::make_shared("list_flatten", Arity::Unary(), list_flatten_doc); - DCHECK_OK(flatten->AddKernel({Type::LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::FIXED_SIZE_LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::LARGE_LIST}, OutputType(ListValuesType), - ListFlatten)); + auto flatten = std::make_shared( + "list_flatten", Arity::Unary(), list_flatten_doc, GetDefaultListFlattenOptions()); + AddBaseListFlattenKernels(flatten.get()); DCHECK_OK(registry->AddFunction(std::move(flatten))); DCHECK_OK(registry->AddFunction(std::make_shared())); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index eef1b6835ff..a03883d145a 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -19,6 +19,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/result.h" #include "arrow/testing/gtest_util.h" @@ -29,41 +30,143 @@ namespace compute { using arrow::internal::checked_cast; -TEST(TestVectorNested, ListFlatten) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]"); - auto expected = ArrayFromJSON(int16(), "[0, null, 1, 2, 3]"); +using ListAndListViewTypes = + ::testing::Types; + +// 
---------------------------------------------------------------------- +// [Large]List and [Large]ListView tests +template +class TestVectorLogicalList : public ::testing::Test { + public: + using TypeClass = T; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + } + + public: + void TestListFlatten() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], []]"); + auto expected = ArrayFromJSON(value_type_, "[0, null, 1, 2, 3]"); CheckVectorUnary("list_flatten", input, expected); // Construct a list with a non-empty null slot auto tweaked = TweakValidityBit(input, 0, false); - expected = ArrayFromJSON(int16(), "[2, 3]"); + expected = ArrayFromJSON(value_type_, "[2, 3]"); CheckVectorUnary("list_flatten", tweaked, expected); } -} -TEST(TestVectorNested, ListFlattenNulls) { - const auto ty = list(int32()); - auto input = ArrayFromJSON(ty, "[null, null]"); - auto expected = ArrayFromJSON(int32(), "[]"); - CheckVectorUnary("list_flatten", input, expected); -} + void TestListFlattenNulls() { + value_type_ = int32(); + type_ = std::make_shared(value_type_); + auto input = ArrayFromJSON(type_, "[null, null]"); + auto expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected); + } -TEST(TestVectorNested, ListFlattenChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - ARROW_SCOPED_TRACE(ty->ToString()); - auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], []]"}); - auto expected = ChunkedArrayFromJSON(int16(), {"[0, null, 1]", "[2, 3]"}); + void TestListFlattenChunkedArray() { + ARROW_SCOPED_TRACE(type_->ToString()); + auto input = ChunkedArrayFromJSON(type_, {"[[0, null, 1], null]", "[[2, 3], []]"}); + auto expected = ChunkedArrayFromJSON(value_type_, {"[0, null, 1]", "[2, 3]"}); CheckVectorUnary("list_flatten", input, expected); ARROW_SCOPED_TRACE("empty"); - input = ChunkedArrayFromJSON(ty, {}); - expected = 
ChunkedArrayFromJSON(int16(), {}); + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(value_type_, {}); CheckVectorUnary("list_flatten", input, expected); } + + void TestListFlattenRecursively() { + auto inner_type = std::make_shared(value_type_); + type_ = std::make_shared(inner_type); + + ListFlattenOptions opts; + opts.recursively = true; + + // List types with two nested level: list> + auto input = ArrayFromJSON(type_, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])"); + auto expected = ArrayFromJSON(value_type_, "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // Empty nested list should flatten until non-list type is reached + input = ArrayFromJSON(type_, R"([null])"); + expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // List types with three nested level: list>> + type_ = std::make_shared(std::make_shared(fixed_size_list(value_type_, 2))); + input = ArrayFromJSON(type_, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])"); + expected = ArrayFromJSON(value_type_, "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + } + + void TestListParentIndices() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); + auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", input, expected); + + // Construct a list with a non-empty null slot + input = ArrayFromJSON(type_, "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); + auto tweaked = TweakValidityBit(input, 1, false); + expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", tweaked, expected); + } + + void TestListParentIndicesChunkedArray() { + auto input = + ChunkedArrayFromJSON(type_, {"[[0, null, 1], 
null]", "[[2, 3], [], [4, 5]]"}); + auto expected = ChunkedArrayFromJSON(int64(), {"[0, 0, 0]", "[2, 2, 4, 4]"}); + CheckVectorUnary("list_parent_indices", input, expected); + + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(int64(), {}); + CheckVectorUnary("list_parent_indices", input, expected); + } + + protected: + std::shared_ptr type_; + std::shared_ptr value_type_; +}; + +TYPED_TEST_SUITE(TestVectorLogicalList, ListAndListViewTypes); + +TYPED_TEST(TestVectorLogicalList, ListFlatten) { this->TestListFlatten(); } + +TYPED_TEST(TestVectorLogicalList, ListFlattenNulls) { this->TestListFlattenNulls(); } + +TYPED_TEST(TestVectorLogicalList, ListFlattenChunkedArray) { + this->TestListFlattenChunkedArray(); } -TEST(TestVectorNested, ListFlattenFixedSizeList) { +TYPED_TEST(TestVectorLogicalList, ListFlattenRecursively) { + this->TestListFlattenRecursively(); +} + +TYPED_TEST(TestVectorLogicalList, ListParentIndices) { this->TestListParentIndices(); } + +TYPED_TEST(TestVectorLogicalList, ListParentIndicesChunkedArray) { + this->TestListParentIndicesChunkedArray(); +} + +TEST(TestVectorFixedSizeList, ListFlattenFixedSizeList) { for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) { const auto& out_ty = checked_cast(*ty).value_type(); { @@ -85,43 +188,29 @@ TEST(TestVectorNested, ListFlattenFixedSizeList) { } } -TEST(TestVectorNested, ListFlattenFixedSizeListNulls) { +TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListNulls) { const auto ty = fixed_size_list(int32(), 1); auto input = ArrayFromJSON(ty, "[null, null]"); auto expected = ArrayFromJSON(int32(), "[]"); CheckVectorUnary("list_flatten", input, expected); } -TEST(TestVectorNested, ListParentIndices) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); - - auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); - CheckVectorUnary("list_parent_indices", input, expected); 
- } - - // Construct a list with a non-empty null slot - auto input = ArrayFromJSON(list(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); - auto tweaked = TweakValidityBit(input, 1, false); - auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); - CheckVectorUnary("list_parent_indices", tweaked, expected); -} - -TEST(TestVectorNested, ListParentIndicesChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = - ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); - - auto expected = ChunkedArrayFromJSON(int64(), {"[0, 0, 0]", "[2, 2, 4, 4]"}); - CheckVectorUnary("list_parent_indices", input, expected); - - input = ChunkedArrayFromJSON(ty, {}); - expected = ChunkedArrayFromJSON(int64(), {}); - CheckVectorUnary("list_parent_indices", input, expected); - } +TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListRecursively) { + ListFlattenOptions opts; + opts.recursively = true; + + auto inner_type = fixed_size_list(int32(), 2); + auto type = fixed_size_list(inner_type, 2); + auto input = ArrayFromJSON(type, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])"); + auto expected = ArrayFromJSON(int32(), "[0, 1, null, 3, 7, null, 2, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); } -TEST(TestVectorNested, ListParentIndicesFixedSizeList) { +TEST(TestVectorFixedSizeList, ListParentIndicesFixedSizeList) { for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) { { auto input = ArrayFromJSON(ty, "[[0, null], null, [1, 2], [3, 4], [null, 5]]"); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a267d535994..d0fc93fc9af 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2035,6 +2035,26 @@ class PairwiseOptions(_PairwiseOptions): self._set_options(period) +cdef class _ListFlattenOptions(FunctionOptions): + def _set_options(self, recursively): + self.wrapped.reset(new 
CListFlattenOptions(recursively)) + + +class ListFlattenOptions(_ListFlattenOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursively : bool, defalut false + When true, do list flatten recursively until an array of + non-list values is reached. + """ + + def __init__(self, recursively=False): + self._set_options(recursively) + + cdef class _ArraySortOptions(FunctionOptions): def _set_options(self, order, null_placement): self.wrapped.reset(new CArraySortOptions( diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 60fc09ea861..2c762cc2c6c 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2141,22 +2141,99 @@ cdef class Decimal256Array(FixedSizeBinaryArray): cdef class BaseListArray(Array): - def flatten(self): + def flatten(self, recursively=False): """ - Unnest this ListArray/LargeListArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. + Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray + according to 'recursively'. Note that this method is different from ``self.values`` in that it takes care of the slicing offset as well as null elements backed by non-empty sub-lists. + Parameters + ---------- + recursively : bool, defalut false, optional + When true, flatten this logical list-array recursively until an + array of non-list values is reached. 
+ When false, flatten this logical list-array by one level + Returns ------- result : Array + + Examples + -------- + + Basic logical list-array's flatten + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + + If an logical list-array is nested with multi-level, the array will + be flattened recursively until an array of non-list values is reached + if we enable recursively=True. + + >>> array = pa.array([ + None, + [ + [1, None, 2], + None, + [3, 4] + ], + [], + [ + [], + [5, 6], + None + ], + [ + [7, 8] + ] + ], type=pa.list_(pa.list_(pa.int64()))) + >>> array.flatten(True) + + [ + 1, + None, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ] """ - return _pc().list_flatten(self) + options = _pc().ListFlattenOptions(recursively) + return _pc().list_flatten(self, options) def value_parent_indices(self): """ @@ -2527,7 +2604,7 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a list view data type. """ @@ -2747,69 +2824,8 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this ListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. 
- - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a large list view data type. @@ -3037,67 +3053,6 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this LargeListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. 
- - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - cdef class MapArray(ListArray): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 205ab393b8b..83612f66d21 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -44,6 +44,7 @@ IndexOptions, JoinOptions, ListSliceOptions, + ListFlattenOptions, MakeStructOptions, MapLookupOptions, MatchSubstringOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6dae45ab80b..180a0e116bb 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2589,6 +2589,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CPairwiseOptions(int64_t period) int64_t period + cdef cppclass CListFlattenOptions\ + "arrow::compute::ListFlattenOptions"(CFunctionOptions): + CListFlattenOptions(bool recursively) + bool recursively + cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): CArraySortOptions(CSortOrder, CNullPlacement) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 156d58326b9..83e35d7b603 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2704,7 +2704,8 @@ def test_invalid_tensor_construction(): @pytest.mark.parametrize(('offset_type', 'list_type_factory'), - 
[(pa.int32(), pa.list_), (pa.int64(), pa.large_list)]) + [(pa.int32(), pa.list_), (pa.int64(), pa.large_list), + (pa.int32(), pa.list_view), (pa.int64(), pa.large_list_view)]) def test_list_array_flatten(offset_type, list_type_factory): typ2 = list_type_factory( list_type_factory( @@ -2757,12 +2758,15 @@ def test_list_array_flatten(offset_type, list_type_factory): assert arr1.values.equals(arr0) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) @pytest.mark.parametrize('list_type', [ pa.list_(pa.int32()), pa.list_(pa.int32(), list_size=2), - pa.large_list(pa.int32())]) + pa.large_list(pa.int32()), + pa.list_view(pa.int32()), + pa.large_list_view(pa.int32())]) def test_list_value_parent_indices(list_type): arr = pa.array( [ @@ -2778,7 +2782,9 @@ def test_list_value_parent_indices(list_type): @pytest.mark.parametrize(('offset_type', 'list_type'), [(pa.int32(), pa.list_(pa.int32())), (pa.int32(), pa.list_(pa.int32(), list_size=2)), - (pa.int64(), pa.large_list(pa.int32()))]) + (pa.int64(), pa.large_list(pa.int32())), + (pa.int32(), pa.list_view(pa.int32())), + (pa.int64(), pa.large_list_view(pa.int32()))]) def test_list_value_lengths(offset_type, list_type): # FixedSizeListArray needs fixed list sizes @@ -2876,6 +2882,7 @@ def test_fixed_size_list_array_flatten(): assert arr0.type.equals(typ0) assert arr1.flatten().equals(arr0) assert arr2.flatten().flatten().equals(arr0) + assert arr2.flatten(True).equals(arr0) def test_fixed_size_list_array_flatten_with_slice(): @@ -3844,6 +3851,7 @@ def test_list_view_flatten(list_array_type, list_type_factory, offset_type): assert arr2.values.equals(arr1) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) # test out of order offsets values = [1, 2, 3, 4] diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 98cbd920b50..0453dbf34e6 100644 --- 
a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -152,6 +152,7 @@ def test_option_class_equality(): pc.IndexOptions(pa.scalar(1)), pc.JoinOptions(), pc.ListSliceOptions(0, -1, 1, True), + pc.ListFlattenOptions(recursively=False), pc.MakeStructOptions(["field", "names"], field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}), From 4a9c94befbfcffe50df03a050c954b55fa65275d Mon Sep 17 00:00:00 2001 From: ZhangHuiGui Date: Thu, 18 Apr 2024 23:59:39 +0800 Subject: [PATCH 2/7] fix --- .../arrow/compute/kernels/vector_nested.cc | 4 -- .../compute/kernels/vector_nested_test.cc | 60 +++++++++---------- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 53ceec41d6a..89861ade5e0 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -79,10 +79,6 @@ struct ListParentIndicesArray { Status Visit(const LargeListType& type) { return VisitList(type); } - Status Visit(const ListViewType& type) { return VisitList(type); } - - Status Visit(const LargeListViewType& type) { return VisitList(type); } - Status Visit(const FixedSizeListType& type) { using offset_type = typename FixedSizeListType::offset_type; const offset_type slot_length = type.list_size(); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index a03883d145a..b95d88026a0 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -118,29 +118,6 @@ class TestVectorLogicalList : public ::testing::Test { CheckVectorUnary("list_flatten", input, expected, &opts); } - void TestListParentIndices() { - auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); - auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); - 
CheckVectorUnary("list_parent_indices", input, expected); - - // Construct a list with a non-empty null slot - input = ArrayFromJSON(type_, "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); - auto tweaked = TweakValidityBit(input, 1, false); - expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); - CheckVectorUnary("list_parent_indices", tweaked, expected); - } - - void TestListParentIndicesChunkedArray() { - auto input = - ChunkedArrayFromJSON(type_, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); - auto expected = ChunkedArrayFromJSON(int64(), {"[0, 0, 0]", "[2, 2, 4, 4]"}); - CheckVectorUnary("list_parent_indices", input, expected); - - input = ChunkedArrayFromJSON(type_, {}); - expected = ChunkedArrayFromJSON(int64(), {}); - CheckVectorUnary("list_parent_indices", input, expected); - } - protected: std::shared_ptr type_; std::shared_ptr value_type_; @@ -160,12 +137,6 @@ TYPED_TEST(TestVectorLogicalList, ListFlattenRecursively) { this->TestListFlattenRecursively(); } -TYPED_TEST(TestVectorLogicalList, ListParentIndices) { this->TestListParentIndices(); } - -TYPED_TEST(TestVectorLogicalList, ListParentIndicesChunkedArray) { - this->TestListParentIndicesChunkedArray(); -} - TEST(TestVectorFixedSizeList, ListFlattenFixedSizeList) { for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) { const auto& out_ty = checked_cast(*ty).value_type(); @@ -210,7 +181,36 @@ TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListRecursively) { CheckVectorUnary("list_flatten", input, expected, &opts); } -TEST(TestVectorFixedSizeList, ListParentIndicesFixedSizeList) { +TEST(TestVectorNested, ListParentIndices) { + for (auto ty : {list(int16()), large_list(int16())}) { + auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); + + auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", input, expected); + } + + // Construct a list with a non-empty null slot + auto input = 
ArrayFromJSON(list(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); + auto tweaked = TweakValidityBit(input, 1, false); + auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", tweaked, expected); +} + +TEST(TestVectorNested, ListParentIndicesChunkedArray) { + for (auto ty : {list(int16()), large_list(int16())}) { + auto input = + ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); + + auto expected = ChunkedArrayFromJSON(int64(), {"[0, 0, 0]", "[2, 2, 4, 4]"}); + CheckVectorUnary("list_parent_indices", input, expected); + + input = ChunkedArrayFromJSON(ty, {}); + expected = ChunkedArrayFromJSON(int64(), {}); + CheckVectorUnary("list_parent_indices", input, expected); + } +} + +TEST(TestVectorNested, ListParentIndicesFixedSizeList) { for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) { { auto input = ArrayFromJSON(ty, "[[0, null], null, [1, 2], [3, 4], [null, 5]]"); From 6fdd0ee5343b5d7bc91839eb6dcd866f50ac9c37 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui Date: Fri, 19 Apr 2024 17:49:58 +0800 Subject: [PATCH 3/7] fix comments and pyarrow's compile failure --- cpp/src/arrow/compute/api_vector.cc | 6 ++--- cpp/src/arrow/compute/api_vector.h | 6 ++--- .../arrow/compute/kernels/codegen_internal.cc | 3 ++- .../arrow/compute/kernels/codegen_internal.h | 3 ++- .../arrow/compute/kernels/scalar_nested.cc | 26 +++++++++---------- .../arrow/compute/kernels/vector_nested.cc | 18 ++++++------- .../compute/kernels/vector_nested_test.cc | 10 +++---- python/pyarrow/includes/libarrow.pxd | 4 +-- python/pyarrow/lib.pxd | 4 +-- 9 files changed, 41 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 6bf0fac429f..f0d5c0fcc3d 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -154,7 +154,7 @@ static auto kRankOptionsType = GetFunctionOptionsType( static auto 
kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); static auto kListFlattenOptionsType = GetFunctionOptionsType( - DataMember("recursively", &ListFlattenOptions::recursively)); + DataMember("recursive", &ListFlattenOptions::recursive)); } // namespace } // namespace internal @@ -226,8 +226,8 @@ PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; -ListFlattenOptions::ListFlattenOptions(bool recursively) - : FunctionOptions(internal::kListFlattenOptionsType), recursively(recursively) {} +ListFlattenOptions::ListFlattenOptions(bool recursive) + : FunctionOptions(internal::kListFlattenOptionsType), recursive(recursive) {} constexpr char ListFlattenOptions::kTypeName[]; namespace internal { diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index ff88203ed22..6d19a8dafc7 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -252,9 +252,9 @@ class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { static constexpr char const kTypeName[] = "ListFlattenOptions"; static ListFlattenOptions Defaults() { return ListFlattenOptions(); } - /// Control the version of 'Flatten' that keeps recursively flattening - /// until an array of non-list values is reached. - bool recursively = false; + /// \brief If true, the list is flattened recursively until a non-list + /// array is formed. 
+ bool recursive = false; }; /// @} diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index abde2175de8..570e7ea01e8 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -56,7 +56,8 @@ Result LastType(KernelContext*, const std::vector& types return types.back(); } -Result ListValuesType(KernelContext*, const std::vector& args) { +Result ListValuesType(KernelContext* ctx, + const std::vector& args) { auto list_type = checked_cast(args[0].type); auto value_type = list_type->value_type().get(); for (auto value_kind = value_type->id(); diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 097ee1de45b..9e46a21887f 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -423,7 +423,8 @@ static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& ar Result FirstType(KernelContext*, const std::vector& types); Result LastType(KernelContext*, const std::vector& types); -Result ListValuesType(KernelContext*, const std::vector& types); +Result ListValuesType(KernelContext* ctx, + const std::vector& types); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 35320e285b9..b99f065a0b1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -23,6 +23,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/type_fwd.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" @@ -39,19 +40,12 @@ 
namespace { template Status ListValueLength(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& arr = batch[0].array; - const auto kind = arr.type->id(); ArraySpan* out_arr = out->array_span_mutable(); auto out_values = out_arr->GetValues(1); - if (is_list_view(kind)) { - // [Large]ListView's buffer layout: - // buffer1 : valid bitmap - // buffer2 : elements' start offset in current array - // buffer3 : elements' size - // - // It's unnecessary to calculate according offsets. + if (is_list_view(*arr.type)) { const auto* sizes = arr.GetValues(2); - for (int64_t i = 0; i < arr.length; i++) { - *out_values++ = sizes[i]; + if (arr.length > 0) { + memcpy(out_values, sizes, arr.length * sizeof(offset_type)); } } else { const offset_type* offsets = arr.GetValues(1); @@ -81,14 +75,20 @@ void AddListValueLengthKernel(ScalarFunction* func, DCHECK_OK(func->AddKernel(std::move(kernel))); } +template <> +void AddListValueLengthKernel( + ScalarFunction* func, const std::shared_ptr& out_type) { + auto in_type = {InputType(Type::FIXED_SIZE_LIST)}; + ScalarKernel kernel(in_type, out_type, FixedSizeListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + void AddListValueLengthKernels(ScalarFunction* func) { AddListValueLengthKernel(func, int32()); AddListValueLengthKernel(func, int64()); AddListValueLengthKernel(func, int32()); AddListValueLengthKernel(func, int64()); - - DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), - FixedSizeListValueLength)); + AddListValueLengthKernel(func, int32()); } const FunctionDoc list_value_length_doc{ diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 89861ade5e0..721b1b1cfe4 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -30,15 +30,12 @@ namespace { template Status ListFlatten(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - auto 
recursively = OptionsWrapper::Get(ctx).recursively; + auto recursive = OptionsWrapper::Get(ctx).recursive; typename TypeTraits::ArrayType list_array(batch[0].array.ToArrayData()); - std::shared_ptr result; - if (!recursively) { - ARROW_ASSIGN_OR_RAISE(result, list_array.Flatten(ctx->memory_pool())); - } else { - ARROW_ASSIGN_OR_RAISE(result, list_array.FlattenRecursively(ctx->memory_pool())); - } + auto pool = ctx->memory_pool(); + ARROW_ASSIGN_OR_RAISE(auto result, (recursive ? list_array.FlattenRecursively(pool) + : list_array.Flatten(pool))); out->value = std::move(result->data()); return Status::OK(); @@ -116,8 +113,11 @@ struct ListParentIndicesArray { const FunctionDoc list_flatten_doc( "Flatten list values", - ("`lists` must have a list-like type.\n" - "Return an array with the top list level flattened.\n" + ("`lists` must have a logical list type like `[Large]ListType`, \n" + "`[Large]ListViewType` and `FixedSizeListType`. \n" + "Whether to flatten the top list level or the bottom list level \n" + "will be decided based on the `recursive` option specified in \n" + ":struct:`ListFlattenOptions`. 
\n" "Top-level null values in `lists` do not emit anything in the input."), {"lists"}, "ListFlattenOptions"); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index b95d88026a0..78d3fdea8d0 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -82,7 +82,7 @@ class TestVectorLogicalList : public ::testing::Test { type_ = std::make_shared(inner_type); ListFlattenOptions opts; - opts.recursively = true; + opts.recursive = true; // List types with two nested level: list> auto input = ArrayFromJSON(type_, R"([ @@ -168,7 +168,7 @@ TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListNulls) { TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListRecursively) { ListFlattenOptions opts; - opts.recursively = true; + opts.recursive = true; auto inner_type = fixed_size_list(int32(), 2); auto type = fixed_size_list(inner_type, 2); @@ -181,7 +181,7 @@ TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListRecursively) { CheckVectorUnary("list_flatten", input, expected, &opts); } -TEST(TestVectorNested, ListParentIndices) { +TEST(TestVectorListParentIndices, BasicListArray) { for (auto ty : {list(int16()), large_list(int16())}) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); @@ -196,7 +196,7 @@ TEST(TestVectorNested, ListParentIndices) { CheckVectorUnary("list_parent_indices", tweaked, expected); } -TEST(TestVectorNested, ListParentIndicesChunkedArray) { +TEST(TestVectorListParentIndices, BasicListChunkedArray) { for (auto ty : {list(int16()), large_list(int16())}) { auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); @@ -210,7 +210,7 @@ TEST(TestVectorNested, ListParentIndicesChunkedArray) { } } -TEST(TestVectorNested, ListParentIndicesFixedSizeList) { +TEST(TestVectorListParentIndices, FixedSizeListArray) { for (auto ty : {fixed_size_list(int16(), 2), 
fixed_size_list(uint32(), 2)}) { { auto input = ArrayFromJSON(ty, "[[0, null], null, [1, 2], [3, 4], [null, 5]]"); diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 180a0e116bb..4ef897eddd8 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2591,8 +2591,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CListFlattenOptions\ "arrow::compute::ListFlattenOptions"(CFunctionOptions): - CListFlattenOptions(bool recursively) - bool recursively + CListFlattenOptions(c_bool recursively) + c_bool recursively cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index b1187a77c2a..bfd266a807c 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -437,11 +437,11 @@ cdef class LargeListArray(BaseListArray): pass -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): pass -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): pass From 81a909fbeb93beac1039ce07f48587f12d8e0e1c Mon Sep 17 00:00:00 2001 From: ZhangHuiGui Date: Sat, 20 Apr 2024 12:09:42 +0800 Subject: [PATCH 4/7] fix --- cpp/src/arrow/compute/kernels/codegen_internal.cc | 8 ++++++++ python/pyarrow/_compute.pyx | 10 +++++----- python/pyarrow/array.pxi | 14 +++++++------- python/pyarrow/includes/libarrow.pxd | 4 ++-- python/pyarrow/tests/test_array.py | 7 ++----- python/pyarrow/tests/test_compute.py | 2 +- 6 files changed, 25 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 570e7ea01e8..0fd9cae7a8d 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/compute/api_vector.h" #include "arrow/type_fwd.h" namespace 
arrow { @@ -60,6 +61,13 @@ Result ListValuesType(KernelContext* ctx, const std::vector& args) { auto list_type = checked_cast(args[0].type); auto value_type = list_type->value_type().get(); + + auto recursive = + ctx->state() ? OptionsWrapper::Get(ctx).recursive : false; + if (!recursive) { + return value_type; + } + for (auto value_kind = value_type->id(); is_list(value_kind) || is_list_view(value_kind); value_kind = value_type->id()) { list_type = checked_cast(list_type->value_type().get()); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index d0fc93fc9af..60eb195b78f 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2036,8 +2036,8 @@ class PairwiseOptions(_PairwiseOptions): cdef class _ListFlattenOptions(FunctionOptions): - def _set_options(self, recursively): - self.wrapped.reset(new CListFlattenOptions(recursively)) + def _set_options(self, recursive): + self.wrapped.reset(new CListFlattenOptions(recursive)) class ListFlattenOptions(_ListFlattenOptions): @@ -2046,13 +2046,13 @@ class ListFlattenOptions(_ListFlattenOptions): Parameters ---------- - recursively : bool, defalut false + recursive : bool, defalut false When true, do list flatten recursively until an array of non-list values is reached. """ - def __init__(self, recursively=False): - self._set_options(recursively) + def __init__(self, recursive=False): + self._set_options(recursive) cdef class _ArraySortOptions(FunctionOptions): diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 2c762cc2c6c..cc6e24f381c 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2141,10 +2141,10 @@ cdef class Decimal256Array(FixedSizeBinaryArray): cdef class BaseListArray(Array): - def flatten(self, recursively=False): + def flatten(self, recursive=False): """ Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray - according to 'recursively'. + according to 'recursive'. 
Note that this method is different from ``self.values`` in that it takes care of the slicing offset as well as null elements backed @@ -2152,8 +2152,8 @@ cdef class BaseListArray(Array): Parameters ---------- - recursively : bool, defalut false, optional - When true, flatten this logical list-array recursively until an + recursive : bool, defalut false, optional + When true, flatten this logical list-array recursivey until an array of non-list values is reached. When false, flatten this logical list-array by one level @@ -2199,7 +2199,7 @@ cdef class BaseListArray(Array): If an logical list-array is nested with multi-level, the array will be flattened recursively until an array of non-list values is reached - if we enable recursively=True. + if we enable recursive=True. >>> array = pa.array([ None, @@ -2232,8 +2232,8 @@ cdef class BaseListArray(Array): 8 ] """ - options = _pc().ListFlattenOptions(recursively) - return _pc().list_flatten(self, options) + options = _pc().ListFlattenOptions(recursive) + return _pc().list_flatten(self, options=options) def value_parent_indices(self): """ diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 4ef897eddd8..f461513e8b3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2591,8 +2591,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CListFlattenOptions\ "arrow::compute::ListFlattenOptions"(CFunctionOptions): - CListFlattenOptions(c_bool recursively) - c_bool recursively + CListFlattenOptions(c_bool recursive) + c_bool recursive cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 83e35d7b603..5fce0513f67 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2704,8 +2704,7 @@ def test_invalid_tensor_construction(): 
@pytest.mark.parametrize(('offset_type', 'list_type_factory'), - [(pa.int32(), pa.list_), (pa.int64(), pa.large_list), - (pa.int32(), pa.list_view), (pa.int64(), pa.large_list_view)]) + [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)]) def test_list_array_flatten(offset_type, list_type_factory): typ2 = list_type_factory( list_type_factory( @@ -2764,9 +2763,7 @@ def test_list_array_flatten(offset_type, list_type_factory): @pytest.mark.parametrize('list_type', [ pa.list_(pa.int32()), pa.list_(pa.int32(), list_size=2), - pa.large_list(pa.int32()), - pa.list_view(pa.int32()), - pa.large_list_view(pa.int32())]) + pa.large_list(pa.int32())]) def test_list_value_parent_indices(list_type): arr = pa.array( [ diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 0453dbf34e6..17cc546f834 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -152,7 +152,7 @@ def test_option_class_equality(): pc.IndexOptions(pa.scalar(1)), pc.JoinOptions(), pc.ListSliceOptions(0, -1, 1, True), - pc.ListFlattenOptions(recursively=False), + pc.ListFlattenOptions(recursive=False), pc.MakeStructOptions(["field", "names"], field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}), From 402c2253a57f22a064274ab1dd8f372fe08c2be5 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui Date: Sat, 20 Apr 2024 16:43:17 +0800 Subject: [PATCH 5/7] fix doctest --- python/pyarrow/array.pxi | 50 ++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index cc6e24f381c..ce04cb87de0 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2202,34 +2202,34 @@ cdef class BaseListArray(Array): if we enable recursive=True. >>> array = pa.array([ - None, - [ - [1, None, 2], - None, - [3, 4] - ], - [], - [ - [], - [5, 6], - None - ], - [ - [7, 8] - ] - ], type=pa.list_(pa.list_(pa.int64()))) + ... None, + ... 
[ + ... [1, None, 2], + ... None, + ... [3, 4] + ... ], + ... [], + ... [ + ... [], + ... [5, 6], + ... None + ... ], + ... [ + ... [7, 8] + ... ] + ... ], type=pa.list_(pa.list_(pa.int64()))) >>> array.flatten(True) [ - 1, - None, - 2, - 3, - 4, - 5, - 6, - 7, - 8 + 1, + null, + 2, + 3, + 4, + 5, + 6, + 7, + 8 ] """ options = _pc().ListFlattenOptions(recursive) From ce9947ab2875a0a88d690c81d718f53198f543fe Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <2689496754@qq.com> Date: Tue, 30 Apr 2024 03:54:19 +0800 Subject: [PATCH 6/7] fix typo and grammar --- cpp/src/arrow/compute/api_vector.h | 2 +- .../arrow/compute/kernels/vector_nested.cc | 14 +++++---- .../compute/kernels/vector_nested_test.cc | 30 ++++++++++--------- python/pyarrow/_compute.pyx | 6 ++-- python/pyarrow/array.pxi | 14 ++++----- 5 files changed, 35 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 6d19a8dafc7..e5bcc373296 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -248,7 +248,7 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { /// \brief Options for list_flatten function class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { public: - explicit ListFlattenOptions(bool recursively = false); + explicit ListFlattenOptions(bool recursive = false); static constexpr char const kTypeName[] = "ListFlattenOptions"; static ListFlattenOptions Defaults() { return ListFlattenOptions(); } diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 721b1b1cfe4..8c77c261c6a 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -113,12 +113,14 @@ struct ListParentIndicesArray { const FunctionDoc list_flatten_doc( "Flatten list values", - ("`lists` must have a logical list type like `[Large]ListType`, \n" - "`[Large]ListViewType` and `FixedSizeListType`. 
\n" - "Whether to flatten the top list level or the bottom list level \n" - "will be decided based on the `recursive` option specified in \n" - ":struct:`ListFlattenOptions`. \n" - "Top-level null values in `lists` do not emit anything in the input."), + ("`lists` must have a list-like type (lists, list-views, and\n" + "fixed-size lists).\n" + "Return an array with the top list level flattened unless\n" + "`recursive` is set to true in ListFlattenOptions. When that\n" + "is that case, flattening happens recursively until a non-list\n" + "array is formed.\n" + "\n" + "Null list values do not emit anything to the output."), {"lists"}, "ListFlattenOptions"); const FunctionDoc list_parent_indices_doc( diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index 78d3fdea8d0..56604ebd16c 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -36,7 +36,7 @@ using ListAndListViewTypes = // ---------------------------------------------------------------------- // [Large]List and [Large]ListView tests template -class TestVectorLogicalList : public ::testing::Test { +class TestVectorNestedSpecialized : public ::testing::Test { public: using TypeClass = T; @@ -84,7 +84,7 @@ class TestVectorLogicalList : public ::testing::Test { ListFlattenOptions opts; opts.recursive = true; - // List types with two nested level: list> + // List types with two nesting levels: list> auto input = ArrayFromJSON(type_, R"([ [[0, 1, 2], null, [3, null]], [null], @@ -98,7 +98,7 @@ class TestVectorLogicalList : public ::testing::Test { expected = ArrayFromJSON(value_type_, "[]"); CheckVectorUnary("list_flatten", input, expected, &opts); - // List types with three nested level: list>> + // List types with three nesting levels: list>> type_ = std::make_shared(std::make_shared(fixed_size_list(value_type_, 2))); input = ArrayFromJSON(type_, R"([ [ @@ -123,21 +123,23 @@ 
class TestVectorLogicalList : public ::testing::Test { std::shared_ptr value_type_; }; -TYPED_TEST_SUITE(TestVectorLogicalList, ListAndListViewTypes); +TYPED_TEST_SUITE(TestVectorNestedSpecialized, ListAndListViewTypes); -TYPED_TEST(TestVectorLogicalList, ListFlatten) { this->TestListFlatten(); } +TYPED_TEST(TestVectorNestedSpecialized, ListFlatten) { this->TestListFlatten(); } -TYPED_TEST(TestVectorLogicalList, ListFlattenNulls) { this->TestListFlattenNulls(); } +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenNulls) { + this->TestListFlattenNulls(); +} -TYPED_TEST(TestVectorLogicalList, ListFlattenChunkedArray) { +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenChunkedArray) { this->TestListFlattenChunkedArray(); } -TYPED_TEST(TestVectorLogicalList, ListFlattenRecursively) { +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenRecursively) { this->TestListFlattenRecursively(); } -TEST(TestVectorFixedSizeList, ListFlattenFixedSizeList) { +TEST(TestVectorNested, ListFlattenFixedSizeList) { for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) { const auto& out_ty = checked_cast(*ty).value_type(); { @@ -159,14 +161,14 @@ TEST(TestVectorFixedSizeList, ListFlattenFixedSizeList) { } } -TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListNulls) { +TEST(TestVectorNested, ListFlattenFixedSizeListNulls) { const auto ty = fixed_size_list(int32(), 1); auto input = ArrayFromJSON(ty, "[null, null]"); auto expected = ArrayFromJSON(int32(), "[]"); CheckVectorUnary("list_flatten", input, expected); } -TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListRecursively) { +TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { ListFlattenOptions opts; opts.recursive = true; @@ -181,7 +183,7 @@ TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListRecursively) { CheckVectorUnary("list_flatten", input, expected, &opts); } -TEST(TestVectorListParentIndices, BasicListArray) { +TEST(TestVectorNested, ListParentIndices) { for (auto ty : {list(int16()), 
large_list(int16())}) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); @@ -196,7 +198,7 @@ TEST(TestVectorListParentIndices, BasicListArray) { CheckVectorUnary("list_parent_indices", tweaked, expected); } -TEST(TestVectorListParentIndices, BasicListChunkedArray) { +TEST(TestVectorNested, ListParentIndicesChunkedArray) { for (auto ty : {list(int16()), large_list(int16())}) { auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); @@ -210,7 +212,7 @@ TEST(TestVectorListParentIndices, BasicListChunkedArray) { } } -TEST(TestVectorListParentIndices, FixedSizeListArray) { +TEST(TestVectorNested, ListParentIndicesFixedSizeList) { for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) { { auto input = ArrayFromJSON(ty, "[[0, null], null, [1, 2], [3, 4], [null, 5]]"); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 60eb195b78f..44a3d5e7407 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2046,9 +2046,9 @@ class ListFlattenOptions(_ListFlattenOptions): Parameters ---------- - recursive : bool, defalut false - When true, do list flatten recursively until an array of - non-list values is reached. + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. """ def __init__(self, recursive=False): diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ce04cb87de0..6a11b19ffcd 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2152,10 +2152,11 @@ cdef class BaseListArray(Array): Parameters ---------- - recursive : bool, defalut false, optional - When true, flatten this logical list-array recursivey until an - array of non-list values is reached. 
- When false, flatten this logical list-array by one level + recursive : bool, default False, optional + When True, flatten this logical list-array recursively until an + array of non-list values is formed. + + When False, flatten only the top level. Returns ------- @@ -2197,9 +2198,8 @@ cdef class BaseListArray(Array): 2 ] - If an logical list-array is nested with multi-level, the array will - be flattened recursively until an array of non-list values is reached - if we enable recursive=True. + When recursive=True, nested list arrays are flattened recursively + until an array of non-list values is formed. >>> array = pa.array([ ... None, From 4a08a141baf074a981a12e9b7d4e54c387a43ab6 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <2689496754@qq.com> Date: Tue, 30 Apr 2024 20:36:41 +0800 Subject: [PATCH 7/7] add a test case --- python/pyarrow/tests/test_array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 5fce0513f67..6a190957879 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2879,6 +2879,7 @@ def test_fixed_size_list_array_flatten(): assert arr0.type.equals(typ0) assert arr1.flatten().equals(arr0) assert arr2.flatten().flatten().equals(arr0) + assert arr2.flatten().equals(arr1) assert arr2.flatten(True).equals(arr0)