From 0fa40b1f19b6ac024949b36e7bce393eb989906b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 3 Jul 2020 09:55:44 -0500 Subject: [PATCH 1/5] Implement list_value_lengths and list_parent_indices functions --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/compute/kernels/CMakeLists.txt | 1 + .../arrow/compute/kernels/vector_nested.cc | 44 ++++++++++++++++--- .../compute/kernels/vector_nested_test.cc | 20 +++++++++ cpp/src/arrow/compute/registry.cc | 1 + cpp/src/arrow/compute/registry_internal.h | 1 + cpp/src/arrow/type_traits.h | 2 + python/pyarrow/array.pxi | 29 ++++++++++++ python/pyarrow/compute.py | 2 + python/pyarrow/tests/test_array.py | 28 ++++++++++++ 10 files changed, 124 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 345f9b92fee..2682471ceb9 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -359,6 +359,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_cast_string.cc compute/kernels/scalar_cast_temporal.cc compute/kernels/scalar_compare.cc + compute/kernels/scalar_nested.cc compute/kernels/scalar_set_lookup.cc compute/kernels/scalar_string.cc compute/kernels/scalar_validity.cc diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 64debf8537a..e693a4176ab 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -24,6 +24,7 @@ add_arrow_compute_test(scalar_test scalar_boolean_test.cc scalar_cast_test.cc scalar_compare_test.cc + scalar_nested_test.cc scalar_set_lookup_test.cc scalar_string_test.cc scalar_validity_test.cc diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index e35bc58547c..e0bd4f0dc79 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -24,6 +24,7 @@ namespace arrow { namespace compute { namespace internal { +namespace { template void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) { @@ -36,19 +37,52 @@ void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) { out->value = (*result)->data(); } -static Result ValuesType(KernelContext*, - const std::vector& args) { +template +void ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + typename TypeTraits::ArrayType list(batch[0].array()); + ArrayData* out_arr = out->mutable_array(); + + const offset_type* offsets = list.raw_value_offsets(); + offset_type values_length = offsets[list.length()] - offsets[0]; + + out_arr->length = values_length; + out_arr->null_count = 0; + KERNEL_RETURN_IF_ERROR( + ctx, + (ctx->Allocate(values_length * sizeof(offset_type)).Value(&out_arr->buffers[1]))); + auto out_indices = reinterpret_cast(out_arr->buffers[1]->mutable_data()); + for (int64_t i = 0; i < list.length(); ++i) { + // Note: In most cases, null slots are empty, but when they are non-empty + // we write out the indices so make sure they are accounted for. This + // behavior could be changed if needed in the future. + for (offset_type j = offsets[i]; j < offsets[i + 1]; ++j) { + *out_indices++ = i; + } + } +} + +Result ValuesType(KernelContext*, const std::vector& args) { const auto& list_type = checked_cast(*args[0].type); return ValueDescr::Array(list_type.value_type()); } +} // namespace + void RegisterVectorNested(FunctionRegistry* registry) { auto flatten = std::make_shared("list_flatten", Arity::Unary()); - DCHECK_OK(flatten->AddKernel({InputType(Type::LIST)}, OutputType(ValuesType), + DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, OutputType(ValuesType), ListFlatten)); - DCHECK_OK(flatten->AddKernel({InputType(Type::LARGE_LIST)}, OutputType(ValuesType), - ListFlatten)); + DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)}, + OutputType(ValuesType), ListFlatten)); DCHECK_OK(registry->AddFunction(std::move(flatten))); + + auto list_parent_indices = + std::make_shared("list_parent_indices", Arity::Unary()); + DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LIST)}, int32(), + ListParentIndices)); + DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LARGE_LIST)}, int64(), + ListParentIndices)); + DCHECK_OK(registry->AddFunction(std::move(list_parent_indices))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index 61c2b777262..906772a0693 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -34,5 +34,25 @@ TEST(TestVectorNested, ListFlatten) { } } +TEST(TestVectorNested, ListParentIndices) { + for (auto ty : {list(int32()), large_list(int32())}) { + auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); + + auto out_ty = ty->id() == Type::LIST ? int32() : int64(); + auto expected = ArrayFromJSON(out_ty, "[0, 0, 0, 2, 2, 4, 4]"); + ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", {input})); + AssertArraysEqual(*expected, *out.make_array()); + } + + // Construct a list with non-empty null slots + auto input = ArrayFromJSON(list(int32()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); + std::shared_ptr data = input->data()->Copy(); + data->buffers[0] = + (ArrayFromJSON(boolean(), "[true, false, true, true, true]")->data()->buffers[1]); + auto expected = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); + ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", {data})); + AssertArraysEqual(*expected, *out.make_array()); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc index 061c01c3c45..4b1adf3465d 100644 --- a/cpp/src/arrow/compute/registry.cc +++ b/cpp/src/arrow/compute/registry.cc @@ -102,6 +102,7 @@ static std::unique_ptr CreateBuiltInRegistry() { RegisterScalarBoolean(registry.get()); RegisterScalarCast(registry.get()); RegisterScalarComparison(registry.get()); + RegisterScalarNested(registry.get()); RegisterScalarSetLookup(registry.get()); RegisterScalarStringAscii(registry.get()); RegisterScalarValidity(registry.get()); diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h index 5d22162517d..1e1a7b093e6 100644 --- a/cpp/src/arrow/compute/registry_internal.h +++ b/cpp/src/arrow/compute/registry_internal.h @@ -29,6 +29,7 @@ void RegisterScalarArithmetic(FunctionRegistry* registry); void RegisterScalarBoolean(FunctionRegistry* registry); void RegisterScalarCast(FunctionRegistry* registry); void RegisterScalarComparison(FunctionRegistry* registry); +void RegisterScalarNested(FunctionRegistry* registry); void RegisterScalarSetLookup(FunctionRegistry* registry); void RegisterScalarStringAscii(FunctionRegistry* registry); void RegisterScalarValidity(FunctionRegistry* registry); diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index dbe584426e8..eea4e86d27c 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -307,6 +307,7 @@ struct TypeTraits { using OffsetType = Int32Type; using OffsetArrayType = Int32Array; using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; constexpr static bool is_parameter_free = false; }; @@ -318,6 +319,7 @@ struct TypeTraits { using OffsetType = Int64Type; using OffsetArrayType = Int64Array; using OffsetBuilderType = Int64Builder; + using OffsetScalarType = Int64Scalar; constexpr static bool is_parameter_free = false; }; diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5f8c851ee17..160dbaa97a9 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1361,6 +1361,35 @@ cdef class BaseListArray(Array): """ return _pc().list_flatten(self) + def value_parent_indices(self): + """ + Return array of same length as list child values array where each + output value is the index of the parent list array slot containing each + child value. + + Examples + -------- + >>> arr = pa.array([[1, 2, 3], [], None, [4]], + ... type=pa.list_(pa.int32())) + >>> arr.value_parent_indices() + [0, 0, 0, 3] + """ + return _pc().list_parent_indices(self) + + def value_lengths(self): + """ + Return integers array with values equal to the respective length of + each list element. Null list values are null in the output. + + Examples + -------- + >>> arr = pa.array([[1, 2, 3], [], None, [4]], + ... type=pa.list_(pa.int32())) + >>> arr.value_lengths() + [3, 0, null, 4] + """ + return _pc().list_value_lengths(self) + cdef class ListArray(BaseListArray): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index babab27e58f..ae7dae86d18 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -107,6 +107,8 @@ def func(left, right): is_null = _simple_unary_function('is_null') list_flatten = _simple_unary_function('list_flatten') +list_parent_indices = _simple_unary_function('list_parent_indices') +list_value_lengths = _simple_unary_function('list_value_lengths') add = _simple_binary_function('add') subtract = _simple_binary_function('subtract') diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 18c9a1bcb8c..bd63ba76a43 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2109,6 +2109,34 @@ def test_list_array_flatten(offset_type, list_type_factory): assert arr2.values.values.equals(arr0) +@pytest.mark.parametrize(('offset_type', 'list_type_factory'), + [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)]) +def test_list_value_parent_indices(offset_type, list_type_factory): + arr = pa.array( + [ + [0, 1, 2], + None, + [], + [3, 4] + ], type=list_type_factory(pa.int32())) + expected = pa.array([0, 0, 0, 3, 3], type=offset_type) + assert arr.value_parent_indices().equals(expected) + + +@pytest.mark.parametrize(('offset_type', 'list_type_factory'), + [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)]) +def test_list_value_lengths(offset_type, list_type_factory): + arr = pa.array( + [ + [0, 1, 2], + None, + [], + [3, 4] + ], type=list_type_factory(pa.int32())) + expected = pa.array([3, None, 0, 2], type=offset_type) + assert arr.value_lengths().equals(expected) + + @pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list]) def test_list_array_flatten_non_canonical(list_type_factory): # Non-canonical list array (null elements backed by non-empty sublists) From 5d8a3319465acb51c0cd80828bfd599b0cd31ca9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 3 Jul 2020 09:58:22 -0500 Subject: [PATCH 2/5] Update docstrings --- python/pyarrow/array.pxi | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 160dbaa97a9..50398b7c338 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1372,7 +1372,13 @@ cdef class BaseListArray(Array): >>> arr = pa.array([[1, 2, 3], [], None, [4]], ... type=pa.list_(pa.int32())) >>> arr.value_parent_indices() - [0, 0, 0, 3] + + [ + 0, + 0, + 0, + 3 + ] """ return _pc().list_parent_indices(self) @@ -1386,7 +1392,13 @@ cdef class BaseListArray(Array): >>> arr = pa.array([[1, 2, 3], [], None, [4]], ... type=pa.list_(pa.int32())) >>> arr.value_lengths() - [3, 0, null, 4] + + [ + 3, + 0, + null, + 1 + ] """ return _pc().list_value_lengths(self) From cbe6744a282d4ed2acca5814e2d8524792e13d7d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 3 Jul 2020 10:02:01 -0500 Subject: [PATCH 3/5] Check in missing files --- .../arrow/compute/kernels/scalar_nested.cc | 69 +++++++++++++++++++ .../compute/kernels/scalar_nested_test.cc | 40 +++++++++++ 2 files changed, 109 insertions(+) create mode 100644 cpp/src/arrow/compute/kernels/scalar_nested.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_nested_test.cc diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc new file mode 100644 index 00000000000..7c61aa3b476 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Vector kernels involving nested types + +#include "arrow/array/array_base.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/result.h" +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace compute { +namespace internal { +namespace { + +template +void ListValueLengths(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + using ScalarType = typename TypeTraits::ScalarType; + using OffsetScalarType = typename TypeTraits::OffsetScalarType; + + if (batch[0].kind() == Datum::ARRAY) { + typename TypeTraits::ArrayType list(batch[0].array()); + ArrayData* out_arr = out->mutable_array(); + auto out_values = out_arr->GetMutableValues(1); + const offset_type* offsets = list.raw_value_offsets(); + ::arrow::internal::detail::VisitBitBlocksVoid( + list.data()->buffers[0], list.offset(), list.length(), + [&](int64_t position) { + *out_values++ = offsets[position + 1] - offsets[position]; + }, + [&]() { *out_values++ = 0; }); + } else { + const auto& arg0 = batch[0].scalar_as(); + if (arg0.is_valid) { + checked_cast(out->scalar().get())->value = + static_cast(arg0.value->length()); + } + } +} + +} // namespace + +void RegisterScalarNested(FunctionRegistry* registry) { + auto list_value_lengths = + std::make_shared("list_value_lengths", Arity::Unary()); + DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LIST)}, int32(), + ListValueLengths)); + DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LARGE_LIST)}, int64(), + ListValueLengths)); + DCHECK_OK(registry->AddFunction(std::move(list_value_lengths))); +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc new file mode 100644 index 00000000000..4657c41a407 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/api.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/result.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { +namespace compute { + +static std::shared_ptr GetOffsetType(const DataType& type) { + return type.id() == Type::LIST ? int32() : int64(); +} + +TEST(TestScalarNested, ListValueLengths) { + for (auto ty : {list(int32()), large_list(int32())}) { + CheckScalarUnary("list_value_lengths", ty, "[[0, null, 1], null, [2, 3], []]", + GetOffsetType(*ty), "[3, null, 2, 0]"); + } +} + +} // namespace compute +} // namespace arrow From b5b9238f9104b7d489e6761cc9172b145e8e6450 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 4 Jul 2020 16:50:56 -0500 Subject: [PATCH 4/5] Fix Windows integer narrowing warning --- cpp/src/arrow/compute/kernels/vector_nested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index e0bd4f0dc79..30b2ca55c15 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -56,7 +56,7 @@ void ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // we write out the indices so make sure they are accounted for. This // behavior could be changed if needed in the future. for (offset_type j = offsets[i]; j < offsets[i + 1]; ++j) { - *out_indices++ = i; + *out_indices++ = static_cast(i); } } } From aacb398306016f78fe567dac9fa34a0317c2dbb5 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 7 Jul 2020 11:27:18 +0200 Subject: [PATCH 5/5] KERNEL_ASSIGN_OR_RAISE --- cpp/src/arrow/compute/kernels/vector_nested.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 30b2ca55c15..eeb58d3824e 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -47,9 +47,8 @@ void ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) { out_arr->length = values_length; out_arr->null_count = 0; - KERNEL_RETURN_IF_ERROR( - ctx, - (ctx->Allocate(values_length * sizeof(offset_type)).Value(&out_arr->buffers[1]))); + KERNEL_ASSIGN_OR_RAISE(out_arr->buffers[1], ctx, + ctx->Allocate(values_length * sizeof(offset_type))); auto out_indices = reinterpret_cast(out_arr->buffers[1]->mutable_data()); for (int64_t i = 0; i < list.length(); ++i) { // Note: In most cases, null slots are empty, but when they are non-empty