Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ if(ARROW_COMPUTE)
compute/kernels/scalar_cast_string.cc
compute/kernels/scalar_cast_temporal.cc
compute/kernels/scalar_compare.cc
compute/kernels/scalar_nested.cc
compute/kernels/scalar_set_lookup.cc
compute/kernels/scalar_string.cc
compute/kernels/scalar_validity.cc
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ add_arrow_compute_test(scalar_test
scalar_boolean_test.cc
scalar_cast_test.cc
scalar_compare_test.cc
scalar_nested_test.cc
scalar_set_lookup_test.cc
scalar_string_test.cc
scalar_validity_test.cc
Expand Down
69 changes: 69 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_nested.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Vector kernels involving nested types

#include "arrow/array/array_base.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/result.h"
#include "arrow/visitor_inline.h"

namespace arrow {
namespace compute {
namespace internal {
namespace {

template <typename Type, typename offset_type = typename Type::offset_type>
void ListValueLengths(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
using ScalarType = typename TypeTraits<Type>::ScalarType;
using OffsetScalarType = typename TypeTraits<Type>::OffsetScalarType;

if (batch[0].kind() == Datum::ARRAY) {
typename TypeTraits<Type>::ArrayType list(batch[0].array());
ArrayData* out_arr = out->mutable_array();
auto out_values = out_arr->GetMutableValues<offset_type>(1);
const offset_type* offsets = list.raw_value_offsets();
::arrow::internal::detail::VisitBitBlocksVoid(
list.data()->buffers[0], list.offset(), list.length(),
[&](int64_t position) {
*out_values++ = offsets[position + 1] - offsets[position];
},
[&]() { *out_values++ = 0; });
} else {
const auto& arg0 = batch[0].scalar_as<ScalarType>();
if (arg0.is_valid) {
checked_cast<OffsetScalarType*>(out->scalar().get())->value =
static_cast<offset_type>(arg0.value->length());
}
}
}

} // namespace

void RegisterScalarNested(FunctionRegistry* registry) {
auto list_value_lengths =
std::make_shared<ScalarFunction>("list_value_lengths", Arity::Unary());
DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LIST)}, int32(),
ListValueLengths<ListType>));
DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LARGE_LIST)}, int64(),
ListValueLengths<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(list_value_lengths)));
}

} // namespace internal
} // namespace compute
} // namespace arrow
40 changes: 40 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_nested_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>

#include "arrow/compute/api.h"
#include "arrow/compute/kernels/test_util.h"
#include "arrow/result.h"
#include "arrow/testing/gtest_util.h"

namespace arrow {
namespace compute {

static std::shared_ptr<DataType> GetOffsetType(const DataType& type) {
return type.id() == Type::LIST ? int32() : int64();
}

TEST(TestScalarNested, ListValueLengths) {
for (auto ty : {list(int32()), large_list(int32())}) {
CheckScalarUnary("list_value_lengths", ty, "[[0, null, 1], null, [2, 3], []]",
GetOffsetType(*ty), "[3, null, 2, 0]");
}
}

} // namespace compute
} // namespace arrow
43 changes: 38 additions & 5 deletions cpp/src/arrow/compute/kernels/vector_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
namespace arrow {
namespace compute {
namespace internal {
namespace {

template <typename Type>
void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
Expand All @@ -36,19 +37,51 @@ void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
out->value = (*result)->data();
}

static Result<ValueDescr> ValuesType(KernelContext*,
const std::vector<ValueDescr>& args) {
template <typename Type, typename offset_type = typename Type::offset_type>
void ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
typename TypeTraits<Type>::ArrayType list(batch[0].array());
ArrayData* out_arr = out->mutable_array();

const offset_type* offsets = list.raw_value_offsets();
offset_type values_length = offsets[list.length()] - offsets[0];

out_arr->length = values_length;
out_arr->null_count = 0;
KERNEL_ASSIGN_OR_RAISE(out_arr->buffers[1], ctx,
ctx->Allocate(values_length * sizeof(offset_type)));
auto out_indices = reinterpret_cast<offset_type*>(out_arr->buffers[1]->mutable_data());
for (int64_t i = 0; i < list.length(); ++i) {
// Note: In most cases, null slots are empty, but when they are non-empty
// we write out the indices so make sure they are accounted for. This
// behavior could be changed if needed in the future.
for (offset_type j = offsets[i]; j < offsets[i + 1]; ++j) {
*out_indices++ = static_cast<offset_type>(i);
}
}
}

Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>& args) {
const auto& list_type = checked_cast<const BaseListType&>(*args[0].type);
return ValueDescr::Array(list_type.value_type());
}

} // namespace

void RegisterVectorNested(FunctionRegistry* registry) {
auto flatten = std::make_shared<VectorFunction>("list_flatten", Arity::Unary());
DCHECK_OK(flatten->AddKernel({InputType(Type::LIST)}, OutputType(ValuesType),
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, OutputType(ValuesType),
ListFlatten<ListType>));
DCHECK_OK(flatten->AddKernel({InputType(Type::LARGE_LIST)}, OutputType(ValuesType),
ListFlatten<LargeListType>));
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
OutputType(ValuesType), ListFlatten<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(flatten)));

auto list_parent_indices =
std::make_shared<VectorFunction>("list_parent_indices", Arity::Unary());
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LIST)}, int32(),
ListParentIndices<ListType>));
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LARGE_LIST)}, int64(),
ListParentIndices<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(list_parent_indices)));
}

} // namespace internal
Expand Down
20 changes: 20 additions & 0 deletions cpp/src/arrow/compute/kernels/vector_nested_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,25 @@ TEST(TestVectorNested, ListFlatten) {
}
}

TEST(TestVectorNested, ListParentIndices) {
for (auto ty : {list(int32()), large_list(int32())}) {
auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]");

auto out_ty = ty->id() == Type::LIST ? int32() : int64();
auto expected = ArrayFromJSON(out_ty, "[0, 0, 0, 2, 2, 4, 4]");
ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", {input}));
AssertArraysEqual(*expected, *out.make_array());
}

// Construct a list with non-empty null slots
auto input = ArrayFromJSON(list(int32()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]");
std::shared_ptr<ArrayData> data = input->data()->Copy();
data->buffers[0] =
(ArrayFromJSON(boolean(), "[true, false, true, true, true]")->data()->buffers[1]);
auto expected = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]");
ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", {data}));
AssertArraysEqual(*expected, *out.make_array());
}

} // namespace compute
} // namespace arrow
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ static std::unique_ptr<FunctionRegistry> CreateBuiltInRegistry() {
RegisterScalarBoolean(registry.get());
RegisterScalarCast(registry.get());
RegisterScalarComparison(registry.get());
RegisterScalarNested(registry.get());
RegisterScalarSetLookup(registry.get());
RegisterScalarStringAscii(registry.get());
RegisterScalarValidity(registry.get());
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/registry_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ void RegisterScalarArithmetic(FunctionRegistry* registry);
void RegisterScalarBoolean(FunctionRegistry* registry);
void RegisterScalarCast(FunctionRegistry* registry);
void RegisterScalarComparison(FunctionRegistry* registry);
void RegisterScalarNested(FunctionRegistry* registry);
void RegisterScalarSetLookup(FunctionRegistry* registry);
void RegisterScalarStringAscii(FunctionRegistry* registry);
void RegisterScalarValidity(FunctionRegistry* registry);
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/type_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ struct TypeTraits<ListType> {
using OffsetType = Int32Type;
using OffsetArrayType = Int32Array;
using OffsetBuilderType = Int32Builder;
using OffsetScalarType = Int32Scalar;
constexpr static bool is_parameter_free = false;
};

Expand All @@ -318,6 +319,7 @@ struct TypeTraits<LargeListType> {
using OffsetType = Int64Type;
using OffsetArrayType = Int64Array;
using OffsetBuilderType = Int64Builder;
using OffsetScalarType = Int64Scalar;
constexpr static bool is_parameter_free = false;
};

Expand Down
41 changes: 41 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1361,6 +1361,47 @@ cdef class BaseListArray(Array):
"""
return _pc().list_flatten(self)

def value_parent_indices(self):
"""
Return array of same length as list child values array where each
output value is the index of the parent list array slot containing each
child value.
Examples
--------
>>> arr = pa.array([[1, 2, 3], [], None, [4]],
... type=pa.list_(pa.int32()))
>>> arr.value_parent_indices()
<pyarrow.lib.Int32Array object at 0x7efc5db958a0>
[
0,
0,
0,
3
]
"""
return _pc().list_parent_indices(self)

def value_lengths(self):
"""
Return integers array with values equal to the respective length of
each list element. Null list values are null in the output.
Examples
--------
>>> arr = pa.array([[1, 2, 3], [], None, [4]],
... type=pa.list_(pa.int32()))
>>> arr.value_lengths()
<pyarrow.lib.Int32Array object at 0x7efc5db95910>
[
3,
0,
null,
1
]
"""
return _pc().list_value_lengths(self)


cdef class ListArray(BaseListArray):
"""
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ def func(left, right):
is_null = _simple_unary_function('is_null')

list_flatten = _simple_unary_function('list_flatten')
list_parent_indices = _simple_unary_function('list_parent_indices')
list_value_lengths = _simple_unary_function('list_value_lengths')

add = _simple_binary_function('add')
subtract = _simple_binary_function('subtract')
Expand Down
28 changes: 28 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2109,6 +2109,34 @@ def test_list_array_flatten(offset_type, list_type_factory):
assert arr2.values.values.equals(arr0)


@pytest.mark.parametrize(('offset_type', 'list_type_factory'),
[(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
def test_list_value_parent_indices(offset_type, list_type_factory):
arr = pa.array(
[
[0, 1, 2],
None,
[],
[3, 4]
], type=list_type_factory(pa.int32()))
expected = pa.array([0, 0, 0, 3, 3], type=offset_type)
assert arr.value_parent_indices().equals(expected)


@pytest.mark.parametrize(('offset_type', 'list_type_factory'),
[(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
def test_list_value_lengths(offset_type, list_type_factory):
arr = pa.array(
[
[0, 1, 2],
None,
[],
[3, 4]
], type=list_type_factory(pa.int32()))
expected = pa.array([3, None, 0, 2], type=offset_type)
assert arr.value_lengths().equals(expected)


@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
def test_list_array_flatten_non_canonical(list_type_factory):
# Non-canonical list array (null elements backed by non-empty sublists)
Expand Down