diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 956c509cdaa..d4d5c89346c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -364,6 +364,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_validity.cc compute/kernels/util_internal.cc compute/kernels/vector_hash.cc + compute/kernels/vector_nested.cc compute/kernels/vector_selection.cc compute/kernels/vector_sort.cc) endif() diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 44627252aa2..64debf8537a 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -40,6 +40,7 @@ add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute") add_arrow_compute_test(vector_test SOURCES vector_hash_test.cc + vector_nested_test.cc vector_selection_test.cc vector_sort_test.cc test_util.cc) diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc new file mode 100644 index 00000000000..e35bc58547c --- /dev/null +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Vector kernels involving nested types + +#include "arrow/array/array_base.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/result.h" + +namespace arrow { +namespace compute { +namespace internal { + +template +void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + typename TypeTraits::ArrayType list_array(batch[0].array()); + Result> result = list_array.Flatten(ctx->memory_pool()); + if (!result.ok()) { + ctx->SetStatus(result.status()); + return; + } + out->value = (*result)->data(); +} + +static Result ValuesType(KernelContext*, + const std::vector& args) { + const auto& list_type = checked_cast(*args[0].type); + return ValueDescr::Array(list_type.value_type()); +} + +void RegisterVectorNested(FunctionRegistry* registry) { + auto flatten = std::make_shared("list_flatten", Arity::Unary()); + DCHECK_OK(flatten->AddKernel({InputType(Type::LIST)}, OutputType(ValuesType), + ListFlatten)); + DCHECK_OK(flatten->AddKernel({InputType(Type::LARGE_LIST)}, OutputType(ValuesType), + ListFlatten)); + DCHECK_OK(registry->AddFunction(std::move(flatten))); +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc new file mode 100644 index 00000000000..61c2b777262 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/api.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/result.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { +namespace compute { + +TEST(TestVectorNested, ListFlatten) { + for (auto ty : {list(int32()), large_list(int32())}) { + auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]"); + auto expected = ArrayFromJSON(int32(), "[0, null, 1, 2, 3]"); + ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_flatten", {input})); + AssertArraysEqual(*expected, *out.make_array()); + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc index 416a3912cd3..061c01c3c45 100644 --- a/cpp/src/arrow/compute/registry.cc +++ b/cpp/src/arrow/compute/registry.cc @@ -112,6 +112,7 @@ static std::unique_ptr CreateBuiltInRegistry() { // Vector functions RegisterVectorHash(registry.get()); RegisterVectorSelection(registry.get()); + RegisterVectorNested(registry.get()); RegisterVectorSort(registry.get()); return registry; diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h index 93598f70078..5d22162517d 100644 --- a/cpp/src/arrow/compute/registry_internal.h +++ b/cpp/src/arrow/compute/registry_internal.h @@ -36,6 +36,7 @@ void RegisterScalarValidity(FunctionRegistry* registry); // Vector functions void RegisterVectorHash(FunctionRegistry* registry); void RegisterVectorSelection(FunctionRegistry* registry); +void RegisterVectorNested(FunctionRegistry* registry); void RegisterVectorSort(FunctionRegistry* registry); // Aggregate functions diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f5478f93944..6abf4b70c9a 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1342,8 +1342,27 @@ cdef class Decimal128Array(FixedSizeBinaryArray): Concrete class for Arrow arrays of decimal128 data type. """ +cdef class BaseListArray(Array): -cdef class ListArray(Array): + def flatten(self): + """ + Unnest this ListArray/LargeListArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values()`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Returns + ------- + result : Array + """ + return _pc().list_flatten(self) + + +cdef class ListArray(BaseListArray): """ Concrete class for Arrow arrays of a list data type. """ @@ -1389,39 +1408,8 @@ cdef class ListArray(Array): """ return pyarrow_wrap_array(( self.ap).offsets()) - def flatten(self, MemoryPool memory_pool=None): - """ - Unnest this ListArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values()`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - Parameters - ---------- - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool - - Returns - ------- - result : Array - """ - cdef: - shared_ptr[CArray] c_result_array - CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - CListArray* arr = self.ap - - with nogil: - c_result_array = GetResultValue(arr.Flatten(cpool)) - - return pyarrow_wrap_array(c_result_array) - - -cdef class LargeListArray(Array): +cdef class LargeListArray(BaseListArray): """ Concrete class for Arrow arrays of a large list data type. @@ -1470,37 +1458,6 @@ cdef class LargeListArray(Array): """ return pyarrow_wrap_array(( self.ap).offsets()) - def flatten(self, MemoryPool memory_pool=None): - """ - Unnest this LargeListArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values()`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool. - - Returns - ------- - result : Array - """ - cdef: - shared_ptr[CArray] c_result_array - CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - CLargeListArray* arr = self.ap - - with nogil: - c_result_array = GetResultValue(arr.Flatten(cpool)) - - return pyarrow_wrap_array(c_result_array) - cdef class MapArray(Array): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index a1c7b9bd690..0652ab3c519 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -104,6 +104,8 @@ def func(left, right): is_valid = _simple_unary_function('is_valid') is_null = _simple_unary_function('is_null') +list_flatten = _simple_unary_function('list_flatten') + add = _simple_binary_function('add') subtract = _simple_binary_function('subtract') multiply = _simple_binary_function('multiply') diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 93c43ca99b4..aa2500ff2ee 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -519,7 +519,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int32_t value_length(int i) shared_ptr[CArray] values() shared_ptr[CArray] offsets() - CResult[shared_ptr[CArray]] Flatten(CMemoryPool* memory_pool) shared_ptr[CDataType] value_type() cdef cppclass CLargeListArray" arrow::LargeListArray"(CArray): @@ -531,7 +530,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int64_t value_length(int i) shared_ptr[CArray] values() shared_ptr[CArray] offsets() - CResult[shared_ptr[CArray]] Flatten(CMemoryPool* memory_pool) shared_ptr[CDataType] value_type() cdef cppclass CFixedSizeListArray" arrow::FixedSizeListArray"(CArray): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 68a22b484bb..50e437c2960 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -428,11 +428,15 @@ cdef class StructArray(Array): pass -cdef class ListArray(Array): +cdef class BaseListArray(Array): pass -cdef class LargeListArray(Array): +cdef class ListArray(BaseListArray): + pass + + +cdef class LargeListArray(BaseListArray): pass