diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index d47ee42ebf2..f0d5c0fcc3d 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -153,6 +153,8 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("tiebreaker", &RankOptions::tiebreaker)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); +static auto kListFlattenOptionsType = GetFunctionOptionsType( + DataMember("recursive", &ListFlattenOptions::recursive)); } // namespace } // namespace internal @@ -224,6 +226,10 @@ PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; +ListFlattenOptions::ListFlattenOptions(bool recursive) + : FunctionOptions(internal::kListFlattenOptionsType), recursive(recursive) {} +constexpr char ListFlattenOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -237,6 +243,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 919572f16ee..e5bcc373296 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -245,6 +245,18 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { int64_t periods = 1; }; +/// \brief Options for list_flatten function +class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { + public: + explicit ListFlattenOptions(bool recursive = false); + static constexpr char const kTypeName[] = "ListFlattenOptions"; + static ListFlattenOptions Defaults() { return ListFlattenOptions(); } + + /// \brief If true, the list is flattened recursively until a non-list + /// array is formed. + bool recursive = false; +}; + /// @} /// \brief Filter with a boolean selection filter diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 00a833742f9..0fd9cae7a8d 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/compute/api_vector.h" #include "arrow/type_fwd.h" namespace arrow { @@ -56,9 +57,23 @@ Result LastType(KernelContext*, const std::vector& types return types.back(); } -Result ListValuesType(KernelContext*, const std::vector& args) { - const auto& list_type = checked_cast(*args[0].type); - return list_type.value_type().get(); +Result ListValuesType(KernelContext* ctx, + const std::vector& args) { + auto list_type = checked_cast(args[0].type); + auto value_type = list_type->value_type().get(); + + auto recursive = + ctx->state() ? OptionsWrapper::Get(ctx).recursive : false; + if (!recursive) { + return value_type; + } + + for (auto value_kind = value_type->id(); + is_list(value_kind) || is_list_view(value_kind); value_kind = value_type->id()) { + list_type = checked_cast(list_type->value_type().get()); + value_type = list_type->value_type().get(); + } + return value_type; } void EnsureDictionaryDecoded(std::vector* types) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 097ee1de45b..9e46a21887f 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -423,7 +423,8 @@ static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& ar Result FirstType(KernelContext*, const std::vector& types); Result LastType(KernelContext*, const std::vector& types); -Result ListValuesType(KernelContext*, const std::vector& types); +Result ListValuesType(KernelContext* ctx, + const std::vector& types); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 733ab9c0dc2..b99f065a0b1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -23,6 +23,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/type_fwd.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" @@ -41,10 +42,17 @@ Status ListValueLength(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou const ArraySpan& arr = batch[0].array; ArraySpan* out_arr = out->array_span_mutable(); auto out_values = out_arr->GetValues(1); - const offset_type* offsets = arr.GetValues(1); - // Offsets are always well-defined and monotonic, even for null values - for (int64_t i = 0; i < arr.length; ++i) { - *out_values++ = offsets[i + 1] - offsets[i]; + if (is_list_view(*arr.type)) { + const auto* sizes = arr.GetValues(2); + if (arr.length > 0) { + memcpy(out_values, sizes, arr.length * sizeof(offset_type)); + } + } else { + const offset_type* offsets = arr.GetValues(1); + // Offsets are always well-defined and monotonic, even for null values + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = offsets[i + 1] - offsets[i]; + } } return Status::OK(); } @@ -59,6 +67,30 @@ Status FixedSizeListValueLength(KernelContext* ctx, const ExecSpan& batch, return Status::OK(); } +template +void AddListValueLengthKernel(ScalarFunction* func, + const std::shared_ptr& out_type) { + auto in_type = {InputType(InListType::type_id)}; + ScalarKernel kernel(in_type, out_type, ListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +template <> +void AddListValueLengthKernel( + ScalarFunction* func, const std::shared_ptr& out_type) { + auto in_type = {InputType(Type::FIXED_SIZE_LIST)}; + ScalarKernel kernel(in_type, out_type, FixedSizeListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddListValueLengthKernels(ScalarFunction* func) { + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); +} + const FunctionDoc list_value_length_doc{ "Compute list lengths", ("`lists` must have a list-like type.\n" @@ -399,6 +431,8 @@ void AddListElementKernels(ScalarFunction* func) { void AddListElementKernels(ScalarFunction* func) { AddListElementKernels(func); AddListElementKernels(func); + AddListElementKernels(func); + AddListElementKernels(func); AddListElementKernels(func); } @@ -824,12 +858,7 @@ const FunctionDoc map_lookup_doc{ void RegisterScalarNested(FunctionRegistry* registry) { auto list_value_length = std::make_shared( "list_value_length", Arity::Unary(), list_value_length_doc); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(), - ListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), - FixedSizeListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(), - ListValueLength)); + AddListValueLengthKernels(list_value_length.get()); DCHECK_OK(registry->AddFunction(std::move(list_value_length))); auto list_element = diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b..32bea824695 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -30,11 +30,21 @@ namespace arrow { namespace compute { static std::shared_ptr GetOffsetType(const DataType& type) { - return type.id() == Type::LIST ? int32() : int64(); + switch (type.id()) { + case Type::LIST: + case Type::LIST_VIEW: + return int32(); + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return int64(); + default: + Unreachable("Unexpected type"); + } } TEST(TestScalarNested, ListValueLength) { - for (auto ty : {list(int32()), large_list(int32())}) { + for (auto ty : {list(int32()), large_list(int32()), list_view(int32()), + large_list_view(int32())}) { CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]", GetOffsetType(*ty), "[3, null, 2, 0]"); } @@ -47,7 +57,8 @@ TEST(TestScalarNested, ListValueLength) { TEST(TestScalarNested, ListElementNonFixedListWithNulls) { auto sample = "[[7, 5, 81], [6, null, 4, 7, 8], [3, 12, 2, 0], [1, 9], null]"; for (auto ty : NumericTypes()) { - for (auto list_type : {list(ty), large_list(ty)}) { + for (auto list_type : + {list(ty), large_list(ty), list_view(ty), large_list_view(ty)}) { auto input = ArrayFromJSON(list_type, sample); auto null_input = ArrayFromJSON(list_type, "[null]"); for (auto index_type : IntTypes()) { diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 08930e589f7..8c77c261c6a 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -18,6 +18,7 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/visit_type_inline.h" @@ -29,8 +30,13 @@ namespace { template Status ListFlatten(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + auto recursive = OptionsWrapper::Get(ctx).recursive; typename TypeTraits::ArrayType list_array(batch[0].array.ToArrayData()); - ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool())); + + auto pool = ctx->memory_pool(); + ARROW_ASSIGN_OR_RAISE(auto result, (recursive ? list_array.FlattenRecursively(pool) + : list_array.Flatten(pool))); + out->value = std::move(result->data()); return Status::OK(); } @@ -107,10 +113,15 @@ struct ListParentIndicesArray { const FunctionDoc list_flatten_doc( "Flatten list values", - ("`lists` must have a list-like type.\n" - "Return an array with the top list level flattened.\n" - "Top-level null values in `lists` do not emit anything in the input."), - {"lists"}); + ("`lists` must have a list-like type (lists, list-views, and\n" + "fixed-size lists).\n" + "Return an array with the top list level flattened unless\n" + "`recursive` is set to true in ListFlattenOptions. When that\n" + "is that case, flattening happens recursively until a non-list\n" + "array is formed.\n" + "\n" + "Null list values do not emit anything to the output."), + {"lists"}, "ListFlattenOptions"); const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", @@ -153,17 +164,34 @@ class ListParentIndicesFunction : public MetaFunction { } }; +const ListFlattenOptions* GetDefaultListFlattenOptions() { + static const auto kDefaultListFlattenOptions = ListFlattenOptions::Defaults(); + return &kDefaultListFlattenOptions; +} + +template +void AddBaseListFlattenKernels(VectorFunction* func) { + auto in_type = {InputType(InListType::type_id)}; + auto out_type = OutputType(ListValuesType); + VectorKernel kernel(in_type, out_type, ListFlatten, + OptionsWrapper::Init); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddBaseListFlattenKernels(VectorFunction* func) { + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); +} + } // namespace void RegisterVectorNested(FunctionRegistry* registry) { - auto flatten = - std::make_shared("list_flatten", Arity::Unary(), list_flatten_doc); - DCHECK_OK(flatten->AddKernel({Type::LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::FIXED_SIZE_LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::LARGE_LIST}, OutputType(ListValuesType), - ListFlatten)); + auto flatten = std::make_shared( + "list_flatten", Arity::Unary(), list_flatten_doc, GetDefaultListFlattenOptions()); + AddBaseListFlattenKernels(flatten.get()); DCHECK_OK(registry->AddFunction(std::move(flatten))); DCHECK_OK(registry->AddFunction(std::make_shared())); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index eef1b6835ff..56604ebd16c 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -19,6 +19,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/result.h" #include "arrow/testing/gtest_util.h" @@ -29,38 +30,113 @@ namespace compute { using arrow::internal::checked_cast; -TEST(TestVectorNested, ListFlatten) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]"); - auto expected = ArrayFromJSON(int16(), "[0, null, 1, 2, 3]"); +using ListAndListViewTypes = + ::testing::Types; + +// ---------------------------------------------------------------------- +// [Large]List and [Large]ListView tests +template +class TestVectorNestedSpecialized : public ::testing::Test { + public: + using TypeClass = T; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + } + + public: + void TestListFlatten() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], []]"); + auto expected = ArrayFromJSON(value_type_, "[0, null, 1, 2, 3]"); CheckVectorUnary("list_flatten", input, expected); // Construct a list with a non-empty null slot auto tweaked = TweakValidityBit(input, 0, false); - expected = ArrayFromJSON(int16(), "[2, 3]"); + expected = ArrayFromJSON(value_type_, "[2, 3]"); CheckVectorUnary("list_flatten", tweaked, expected); } -} -TEST(TestVectorNested, ListFlattenNulls) { - const auto ty = list(int32()); - auto input = ArrayFromJSON(ty, "[null, null]"); - auto expected = ArrayFromJSON(int32(), "[]"); - CheckVectorUnary("list_flatten", input, expected); -} + void TestListFlattenNulls() { + value_type_ = int32(); + type_ = std::make_shared(value_type_); + auto input = ArrayFromJSON(type_, "[null, null]"); + auto expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected); + } -TEST(TestVectorNested, ListFlattenChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - ARROW_SCOPED_TRACE(ty->ToString()); - auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], []]"}); - auto expected = ChunkedArrayFromJSON(int16(), {"[0, null, 1]", "[2, 3]"}); + void TestListFlattenChunkedArray() { + ARROW_SCOPED_TRACE(type_->ToString()); + auto input = ChunkedArrayFromJSON(type_, {"[[0, null, 1], null]", "[[2, 3], []]"}); + auto expected = ChunkedArrayFromJSON(value_type_, {"[0, null, 1]", "[2, 3]"}); CheckVectorUnary("list_flatten", input, expected); ARROW_SCOPED_TRACE("empty"); - input = ChunkedArrayFromJSON(ty, {}); - expected = ChunkedArrayFromJSON(int16(), {}); + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(value_type_, {}); CheckVectorUnary("list_flatten", input, expected); } + + void TestListFlattenRecursively() { + auto inner_type = std::make_shared(value_type_); + type_ = std::make_shared(inner_type); + + ListFlattenOptions opts; + opts.recursive = true; + + // List types with two nesting levels: list> + auto input = ArrayFromJSON(type_, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])"); + auto expected = ArrayFromJSON(value_type_, "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // Empty nested list should flatten until non-list type is reached + input = ArrayFromJSON(type_, R"([null])"); + expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // List types with three nesting levels: list>> + type_ = std::make_shared(std::make_shared(fixed_size_list(value_type_, 2))); + input = ArrayFromJSON(type_, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])"); + expected = ArrayFromJSON(value_type_, "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + } + + protected: + std::shared_ptr type_; + std::shared_ptr value_type_; +}; + +TYPED_TEST_SUITE(TestVectorNestedSpecialized, ListAndListViewTypes); + +TYPED_TEST(TestVectorNestedSpecialized, ListFlatten) { this->TestListFlatten(); } + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenNulls) { + this->TestListFlattenNulls(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenChunkedArray) { + this->TestListFlattenChunkedArray(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenRecursively) { + this->TestListFlattenRecursively(); } TEST(TestVectorNested, ListFlattenFixedSizeList) { @@ -92,6 +168,21 @@ TEST(TestVectorNested, ListFlattenFixedSizeListNulls) { CheckVectorUnary("list_flatten", input, expected); } +TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { + ListFlattenOptions opts; + opts.recursive = true; + + auto inner_type = fixed_size_list(int32(), 2); + auto type = fixed_size_list(inner_type, 2); + auto input = ArrayFromJSON(type, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])"); + auto expected = ArrayFromJSON(int32(), "[0, 1, null, 3, 7, null, 2, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); +} + TEST(TestVectorNested, ListParentIndices) { for (auto ty : {list(int16()), large_list(int16())}) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a267d535994..44a3d5e7407 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2035,6 +2035,26 @@ class PairwiseOptions(_PairwiseOptions): self._set_options(period) +cdef class _ListFlattenOptions(FunctionOptions): + def _set_options(self, recursive): + self.wrapped.reset(new CListFlattenOptions(recursive)) + + +class ListFlattenOptions(_ListFlattenOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + + def __init__(self, recursive=False): + self._set_options(recursive) + + cdef class _ArraySortOptions(FunctionOptions): def _set_options(self, order, null_placement): self.wrapped.reset(new CArraySortOptions( diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 60fc09ea861..6a11b19ffcd 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2141,22 +2141,99 @@ cdef class Decimal256Array(FixedSizeBinaryArray): cdef class BaseListArray(Array): - def flatten(self): + def flatten(self, recursive=False): """ - Unnest this ListArray/LargeListArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. + Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray + according to 'recursive'. Note that this method is different from ``self.values`` in that it takes care of the slicing offset as well as null elements backed by non-empty sub-lists. + Parameters + ---------- + recursive : bool, default False, optional + When True, flatten this logical list-array recursively until an + array of non-list values is formed. + + When False, flatten only the top level. + Returns ------- result : Array + + Examples + -------- + + Basic logical list-array's flatten + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + + When recursive=True, nested list arrays are flattened recursively + until an array of non-list values is formed. + + >>> array = pa.array([ + ... None, + ... [ + ... [1, None, 2], + ... None, + ... [3, 4] + ... ], + ... [], + ... [ + ... [], + ... [5, 6], + ... None + ... ], + ... [ + ... [7, 8] + ... ] + ... ], type=pa.list_(pa.list_(pa.int64()))) + >>> array.flatten(True) + + [ + 1, + null, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ] """ - return _pc().list_flatten(self) + options = _pc().ListFlattenOptions(recursive) + return _pc().list_flatten(self, options=options) def value_parent_indices(self): """ @@ -2527,7 +2604,7 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a list view data type. """ @@ -2747,69 +2824,8 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this ListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - - -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a large list view data type. @@ -3037,67 +3053,6 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this LargeListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - cdef class MapArray(ListArray): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 205ab393b8b..83612f66d21 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -44,6 +44,7 @@ IndexOptions, JoinOptions, ListSliceOptions, + ListFlattenOptions, MakeStructOptions, MapLookupOptions, MatchSubstringOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6dae45ab80b..f461513e8b3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2589,6 +2589,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CPairwiseOptions(int64_t period) int64_t period + cdef cppclass CListFlattenOptions\ + "arrow::compute::ListFlattenOptions"(CFunctionOptions): + CListFlattenOptions(c_bool recursive) + c_bool recursive + cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): CArraySortOptions(CSortOrder, CNullPlacement) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index b1187a77c2a..bfd266a807c 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -437,11 +437,11 @@ cdef class LargeListArray(BaseListArray): pass -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): pass -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): pass diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 156d58326b9..6a190957879 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2757,6 +2757,7 @@ def test_list_array_flatten(offset_type, list_type_factory): assert arr1.values.equals(arr0) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) @pytest.mark.parametrize('list_type', [ @@ -2778,7 +2779,9 @@ def test_list_value_parent_indices(list_type): @pytest.mark.parametrize(('offset_type', 'list_type'), [(pa.int32(), pa.list_(pa.int32())), (pa.int32(), pa.list_(pa.int32(), list_size=2)), - (pa.int64(), pa.large_list(pa.int32()))]) + (pa.int64(), pa.large_list(pa.int32())), + (pa.int32(), pa.list_view(pa.int32())), + (pa.int64(), pa.large_list_view(pa.int32()))]) def test_list_value_lengths(offset_type, list_type): # FixedSizeListArray needs fixed list sizes @@ -2876,6 +2879,8 @@ def test_fixed_size_list_array_flatten(): assert arr0.type.equals(typ0) assert arr1.flatten().equals(arr0) assert arr2.flatten().flatten().equals(arr0) + assert arr2.flatten().equals(arr1) + assert arr2.flatten(True).equals(arr0) def test_fixed_size_list_array_flatten_with_slice(): @@ -3844,6 +3849,7 @@ def test_list_view_flatten(list_array_type, list_type_factory, offset_type): assert arr2.values.equals(arr1) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) # test out of order offsets values = [1, 2, 3, 4] diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 98cbd920b50..17cc546f834 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -152,6 +152,7 @@ def test_option_class_equality(): pc.IndexOptions(pa.scalar(1)), pc.JoinOptions(), pc.ListSliceOptions(0, -1, 1, True), + pc.ListFlattenOptions(recursive=False), pc.MakeStructOptions(["field", "names"], field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}),