From 13a1969d2de9c772ab7477f1f2d297e5f111a959 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 21 May 2019 17:01:47 -0400 Subject: [PATCH 01/29] initial mask kernel impl --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/compute/kernels/CMakeLists.txt | 1 + cpp/src/arrow/compute/kernels/mask-test.cc | 145 ++++++++++++ cpp/src/arrow/compute/kernels/mask.cc | 230 +++++++++++++++++++ cpp/src/arrow/compute/kernels/mask.h | 82 +++++++ cpp/submodules/parquet-testing | 2 +- 6 files changed, 460 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/compute/kernels/mask-test.cc create mode 100644 cpp/src/arrow/compute/kernels/mask.cc create mode 100644 cpp/src/arrow/compute/kernels/mask.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index c989f855a5b..771a0f72d91 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -174,6 +174,7 @@ if(ARROW_COMPUTE) compute/kernels/count.cc compute/kernels/filter.cc compute/kernels/hash.cc + compute/kernels/mask.cc compute/kernels/mean.cc compute/kernels/sum.cc compute/kernels/take.cc diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 6c386c9b39c..8b223d8e536 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -20,6 +20,7 @@ arrow_install_all_headers("arrow/compute/kernels") add_arrow_test(boolean-test PREFIX "arrow-compute") add_arrow_test(cast-test PREFIX "arrow-compute") add_arrow_test(hash-test PREFIX "arrow-compute") +add_arrow_test(mask-test PREFIX "arrow-compute") add_arrow_test(take-test PREFIX "arrow-compute") add_arrow_test(util-internal-test PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/mask-test.cc b/cpp/src/arrow/compute/kernels/mask-test.cc new file mode 100644 index 00000000000..365b135c7b7 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/mask-test.cc @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// returnGegarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/compute/context.h" +#include "arrow/compute/kernels/mask.h" +#include "arrow/compute/test-util.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/testing/util.h" + +namespace arrow { +namespace compute { + +using util::string_view; + +template +class TestMaskKernel : public ComputeFixture, public TestBase { + protected: + void AssertMaskArrays(const std::shared_ptr& values, + const std::shared_ptr& mask, MaskOptions options, + const std::shared_ptr& expected) { + std::shared_ptr actual; + ASSERT_OK(arrow::compute::Mask(&this->ctx_, *values, *mask, options, &actual)); + AssertArraysEqual(*expected, *actual); + } + void AssertMask(const std::shared_ptr& type, const std::string& values, + const std::string& mask, MaskOptions options, + const std::string& expected) { + std::shared_ptr actual; + ASSERT_OK(this->Mask(type, values, mask, options, &actual)); + AssertArraysEqual(*ArrayFromJSON(type, expected), *actual); + } + Status Mask(const std::shared_ptr& type, const std::string& values, + const std::string& mask, MaskOptions options, std::shared_ptr* out) { + return arrow::compute::Mask(&this->ctx_, *ArrayFromJSON(type, values), + *ArrayFromJSON(boolean(), mask), options, out); + } +}; + +class TestMaskKernelWithNull : public TestMaskKernel { + protected: + void AssertMask(const std::string& values, const std::string& mask, MaskOptions options, + const std::string& expected) { + TestMaskKernel::AssertMask(utf8(), values, mask, options, expected); + } +}; + +TEST_F(TestMaskKernelWithNull, MaskNull) { + MaskOptions options; + this->AssertMask("[null, null, null]", "[0, 1, 0]", options, "[null]"); + this->AssertMask("[null, null, null]", "[1, 1, 0]", options, "[null, null]"); +} + +class TestMaskKernelWithBoolean : public TestMaskKernel { + protected: + void AssertMask(const std::string& values, const std::string& mask, MaskOptions options, + const std::string& expected) { + TestMaskKernel::AssertMask(boolean(), values, mask, options, expected); + } +}; + +TEST_F(TestMaskKernelWithBoolean, MaskBoolean) { + MaskOptions options; + this->AssertMask("[true, false, true]", "[0, 1, 0]", options, "[false]"); + this->AssertMask("[null, false, true]", "[0, 1, 0]", options, "[false]"); + this->AssertMask("[true, false, true]", "[null, 1, 0]", options, "[null, false]"); +} + +template +class TestMaskKernelWithNumeric : public TestMaskKernel { + protected: + void AssertMask(const std::string& values, const std::string& mask, MaskOptions options, + const std::string& expected) { + TestMaskKernel::AssertMask(type_singleton(), values, mask, options, + expected); + } + std::shared_ptr type_singleton() { + return TypeTraits::type_singleton(); + } +}; + +TYPED_TEST_CASE(TestMaskKernelWithNumeric, NumericArrowTypes); +TYPED_TEST(TestMaskKernelWithNumeric, MaskNumeric) { + MaskOptions options; + this->AssertMask("[7, 8, 9]", "[0, 1, 0]", options, "[8]"); + this->AssertMask("[null, 8, 9]", "[0, 1, 0]", options, "[8]"); + this->AssertMask("[7, 8, 9]", "[null, 1, 0]", options, "[null, 8]"); +} + +class TestMaskKernelWithString : public TestMaskKernel { + protected: + void AssertMask(const std::string& values, const std::string& mask, MaskOptions options, + const std::string& expected) { + TestMaskKernel::AssertMask(utf8(), values, mask, options, expected); + } + void AssertMaskDictionary(const std::string& dictionary_values, + const std::string& dictionary_mask, const std::string& mask, + MaskOptions options, const std::string& expected_mask) { + auto dict = ArrayFromJSON(utf8(), dictionary_values); + auto type = dictionary(int8(), utf8()); + std::shared_ptr values, actual, expected; + ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_mask), + dict, &values)); + ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_mask), + dict, &expected)); + auto take_mask = ArrayFromJSON(boolean(), mask); + this->AssertMaskArrays(values, take_mask, options, expected); + } +}; + +TEST_F(TestMaskKernelWithString, MaskString) { + MaskOptions options; + this->AssertMask(R"(["a", "b", "c"])", "[0, 1, 0]", options, R"(["b"])"); + this->AssertMask(R"([null, "b", "c"])", "[0, 1, 0]", options, R"(["b"])"); + this->AssertMask(R"(["a", "b", "c"])", "[null, 1, 0]", options, R"([null, "b"])"); +} + +TEST_F(TestMaskKernelWithString, MaskDictionary) { + MaskOptions options; + auto dict = R"(["a", "b", "c", "d", "e"])"; + this->AssertMaskDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", options, "[4]"); + this->AssertMaskDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", options, "[4]"); + this->AssertMaskDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", options, "[null, 4]"); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/mask.cc b/cpp/src/arrow/compute/kernels/mask.cc new file mode 100644 index 00000000000..3a74deb08a8 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/mask.cc @@ -0,0 +1,230 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// returnGegarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/builder.h" +#include "arrow/compute/context.h" +#include "arrow/compute/kernels/mask.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace compute { + +Status Mask(FunctionContext* context, const Array& values, const Array& mask, + const MaskOptions& options, std::shared_ptr* out) { + Datum out_datum; + RETURN_NOT_OK( + Mask(context, Datum(values.data()), Datum(mask.data()), options, &out_datum)); + *out = out_datum.make_array(); + return Status::OK(); +} + +Status Mask(FunctionContext* context, const Datum& values, const Datum& mask, + const MaskOptions& options, Datum* out) { + MaskKernel kernel(values.type(), options); + RETURN_NOT_OK(kernel.Call(context, values, mask, out)); + return Status::OK(); +} + +struct MaskParameters { + FunctionContext* context; + std::shared_ptr values, mask; + MaskOptions options; + std::shared_ptr* out; +}; + +template +static Status UnsafeAppend(Builder* builder, Scalar&& value) { + builder->UnsafeAppend(std::forward(value)); + return Status::OK(); +} + +static Status UnsafeAppend(BinaryBuilder* builder, util::string_view value) { + RETURN_NOT_OK(builder->ReserveData(static_cast(value.size()))); + builder->UnsafeAppend(value); + return Status::OK(); +} + +static Status UnsafeAppend(StringBuilder* builder, util::string_view value) { + RETURN_NOT_OK(builder->ReserveData(static_cast(value.size()))); + builder->UnsafeAppend(value); + return Status::OK(); +} + +// TODO(bkietz) this can be optimized +static int64_t OutputSize(const BooleanArray& mask) { + auto offset = mask.offset(); + auto length = mask.length(); + internal::BitmapReader mask_data(mask.data()->buffers[1]->data(), offset, length); + int64_t size = 0; + for (auto i = offset; i < offset + length; ++i) { + if (mask.IsNull(i) || mask_data.IsSet()) { + ++size; + } + mask_data.Next(); + } + return size; +} + +template +Status MaskImpl(FunctionContext*, const ValueArray& values, const BooleanArray& mask, + OutBuilder* builder) { + auto offset = mask.offset(); + auto length = mask.length(); + internal::BitmapReader mask_data(mask.data()->buffers[1]->data(), offset, length); + for (int64_t i = 0; i < mask.length(); mask_data.Next(), ++i) { + if (!WholeMaskValid && mask.IsNull(i)) { + builder->UnsafeAppendNull(); + continue; + } + if (mask_data.IsNotSet()) { + continue; + } + if (!AllValuesValid && values.IsNull(i)) { + builder->UnsafeAppendNull(); + continue; + } + RETURN_NOT_OK(UnsafeAppend(builder, values.GetView(i))); + } + return Status::OK(); +} + +template +Status UnpackMaskNullCount(FunctionContext* context, const ValueArray& values, + const MaskArray& mask, OutBuilder* builder) { + if (mask.null_count() == 0) { + return MaskImpl(context, values, mask, builder); + } + return MaskImpl(context, values, mask, builder); +} + +template +Status UnpackValuesNullCount(FunctionContext* context, const ValueArray& values, + const MaskArray& mask, OutBuilder* builder) { + if (values.null_count() == 0) { + return UnpackMaskNullCount(context, values, mask, builder); + } + return UnpackMaskNullCount(context, values, mask, builder); +} + +template +using ArrayType = typename TypeTraits::ArrayType; + +template +struct UnpackValues { + template + Status Visit(const ValueType&) { + using OutBuilder = typename TypeTraits::BuilderType; + auto&& mask = static_cast&>(*params_.mask); + auto&& values = static_cast&>(*params_.values); + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(params_.context->memory_pool(), values.type(), &builder)); + RETURN_NOT_OK(builder->Reserve(OutputSize(mask))); + RETURN_NOT_OK(UnpackValuesNullCount(params_.context, values, mask, + static_cast(builder.get()))); + return builder->Finish(params_.out); + } + + Status Visit(const NullType& t) { + auto&& mask = static_cast&>(*params_.mask); + params_.out->reset(new NullArray(OutputSize(mask))); + return Status::OK(); + } + + Status Visit(const DictionaryType& t) { + std::shared_ptr masked_indices; + const auto& values = internal::checked_cast(*params_.values); + { + // To take from a dictionary, apply the current kernel to the dictionary's + // mask. (Use UnpackValues since MaskType is already unpacked) + MaskParameters params = params_; + params.values = values.indices(); + params.out = &masked_indices; + UnpackValues unpack = {params}; + RETURN_NOT_OK(VisitTypeInline(*t.index_type(), &unpack)); + } + // create output dictionary from taken mask + *params_.out = std::make_shared(values.type(), masked_indices, + values.dictionary()); + return Status::OK(); + } + + Status Visit(const ExtensionType& t) { + // XXX can we just take from its storage? + return Status::NotImplemented("gathering values of type ", t); + } + + Status Visit(const UnionType& t) { + return Status::NotImplemented("gathering values of type ", t); + } + + Status Visit(const ListType& t) { + return Status::NotImplemented("gathering values of type ", t); + } + + Status Visit(const FixedSizeListType& t) { + return Status::NotImplemented("gathering values of type ", t); + } + + Status Visit(const StructType& t) { + return Status::NotImplemented("gathering values of type ", t); + } + + const MaskParameters& params_; +}; + +struct UnpackMask { + Status Visit(const BooleanType&) { + UnpackValues unpack = {params_}; + return VisitTypeInline(*params_.values->type(), &unpack); + } + + Status Visit(const DataType& other) { + return Status::TypeError("mask type not supported: ", other); + } + + const MaskParameters& params_; +}; + +Status MaskKernel::Call(FunctionContext* ctx, const Datum& values, const Datum& mask, + Datum* out) { + if (!values.is_array() || !mask.is_array()) { + return Status::Invalid("MaskKernel expects array values and mask"); + } + std::shared_ptr out_array; + MaskParameters params; + params.context = ctx; + params.values = values.make_array(); + params.mask = mask.make_array(); + params.options = options_; + params.out = &out_array; + UnpackMask unpack = {params}; + RETURN_NOT_OK(VisitTypeInline(*mask.type(), &unpack)); + *out = Datum(out_array); + return Status::OK(); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/mask.h b/cpp/src/arrow/compute/kernels/mask.h new file mode 100644 index 00000000000..6866f180462 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/mask.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/kernel.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; + +namespace compute { + +class FunctionContext; + +struct ARROW_EXPORT MaskOptions {}; + +/// \brief Mask an array with a boolean selection filter +/// +/// The output array will be populated with values from the input at positions +/// where the selection filter is not 0. Nulls in the filter will result in nulls +/// in the output. +/// +/// For example given values = ["a", "b", "c", null, "e", "f"] and +/// filter = [0, 1, 1, 0, null, 1], the output will be +/// = ["b", "c", null, "f"] +/// +/// \param[in] context the FunctionContext +/// \param[in] values array from which to take +/// \param[in] mask indicates which values should be masked out +/// \param[in] options options +/// \param[out] out resulting array +ARROW_EXPORT +Status Mask(FunctionContext* context, const Array& values, const Array& mask, + const MaskOptions& options, std::shared_ptr* out); + +/// \brief Mask an array with a boolean selection filter +/// +/// \param[in] context the FunctionContext +/// \param[in] values datum from which to take +/// \param[in] mask indicates which values should be masked out +/// \param[in] options options +/// \param[out] out resulting datum +ARROW_EXPORT +Status Mask(FunctionContext* context, const Datum& values, const Datum& indices, + const MaskOptions& options, Datum* out); + +/// \brief BinaryKernel implementing Mask operation +class ARROW_EXPORT MaskKernel : public BinaryKernel { + public: + explicit MaskKernel(const std::shared_ptr& type, MaskOptions options = {}) + : type_(type), options_(options) {} + + Status Call(FunctionContext* ctx, const Datum& values, const Datum& indices, + Datum* out) override; + + std::shared_ptr out_type() const override { return type_; } + + private: + std::shared_ptr type_; + MaskOptions options_; +}; +} // namespace compute +} // namespace arrow diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 2fc3ade4ccb..bb7b6abbb3f 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 2fc3ade4ccbf17271194df0b1549bc6733204314 +Subproject commit bb7b6abbb3fbeff845646364a4286142127be04c From db444242e0eca1ddce1816280baa356573568a8b Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 22 May 2019 10:58:19 -0400 Subject: [PATCH 02/29] remove empty MaskOptions --- cpp/src/arrow/compute/kernels/mask-test.cc | 67 ++++++++++------------ cpp/src/arrow/compute/kernels/mask.cc | 11 ++-- cpp/src/arrow/compute/kernels/mask.h | 12 +--- 3 files changed, 37 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/mask-test.cc b/cpp/src/arrow/compute/kernels/mask-test.cc index 365b135c7b7..dbb59577585 100644 --- a/cpp/src/arrow/compute/kernels/mask-test.cc +++ b/cpp/src/arrow/compute/kernels/mask-test.cc @@ -35,62 +35,58 @@ template class TestMaskKernel : public ComputeFixture, public TestBase { protected: void AssertMaskArrays(const std::shared_ptr& values, - const std::shared_ptr& mask, MaskOptions options, + const std::shared_ptr& mask, const std::shared_ptr& expected) { std::shared_ptr actual; - ASSERT_OK(arrow::compute::Mask(&this->ctx_, *values, *mask, options, &actual)); + ASSERT_OK(arrow::compute::Mask(&this->ctx_, *values, *mask, &actual)); AssertArraysEqual(*expected, *actual); } void AssertMask(const std::shared_ptr& type, const std::string& values, - const std::string& mask, MaskOptions options, - const std::string& expected) { + const std::string& mask, const std::string& expected) { std::shared_ptr actual; - ASSERT_OK(this->Mask(type, values, mask, options, &actual)); + ASSERT_OK(this->Mask(type, values, mask, &actual)); AssertArraysEqual(*ArrayFromJSON(type, expected), *actual); } Status Mask(const std::shared_ptr& type, const std::string& values, - const std::string& mask, MaskOptions options, std::shared_ptr* out) { + const std::string& mask, std::shared_ptr* out) { return arrow::compute::Mask(&this->ctx_, *ArrayFromJSON(type, values), - *ArrayFromJSON(boolean(), mask), options, out); + *ArrayFromJSON(boolean(), mask), out); } }; class TestMaskKernelWithNull : public TestMaskKernel { protected: - void AssertMask(const std::string& values, const std::string& mask, MaskOptions options, + void AssertMask(const std::string& values, const std::string& mask, const std::string& expected) { - TestMaskKernel::AssertMask(utf8(), values, mask, options, expected); + TestMaskKernel::AssertMask(utf8(), values, mask, expected); } }; TEST_F(TestMaskKernelWithNull, MaskNull) { - MaskOptions options; - this->AssertMask("[null, null, null]", "[0, 1, 0]", options, "[null]"); - this->AssertMask("[null, null, null]", "[1, 1, 0]", options, "[null, null]"); + this->AssertMask("[null, null, null]", "[0, 1, 0]", "[null]"); + this->AssertMask("[null, null, null]", "[1, 1, 0]", "[null, null]"); } class TestMaskKernelWithBoolean : public TestMaskKernel { protected: - void AssertMask(const std::string& values, const std::string& mask, MaskOptions options, + void AssertMask(const std::string& values, const std::string& mask, const std::string& expected) { - TestMaskKernel::AssertMask(boolean(), values, mask, options, expected); + TestMaskKernel::AssertMask(boolean(), values, mask, expected); } }; TEST_F(TestMaskKernelWithBoolean, MaskBoolean) { - MaskOptions options; - this->AssertMask("[true, false, true]", "[0, 1, 0]", options, "[false]"); - this->AssertMask("[null, false, true]", "[0, 1, 0]", options, "[false]"); - this->AssertMask("[true, false, true]", "[null, 1, 0]", options, "[null, false]"); + this->AssertMask("[true, false, true]", "[0, 1, 0]", "[false]"); + this->AssertMask("[null, false, true]", "[0, 1, 0]", "[false]"); + this->AssertMask("[true, false, true]", "[null, 1, 0]", "[null, false]"); } template class TestMaskKernelWithNumeric : public TestMaskKernel { protected: - void AssertMask(const std::string& values, const std::string& mask, MaskOptions options, + void AssertMask(const std::string& values, const std::string& mask, const std::string& expected) { - TestMaskKernel::AssertMask(type_singleton(), values, mask, options, - expected); + TestMaskKernel::AssertMask(type_singleton(), values, mask, expected); } std::shared_ptr type_singleton() { return TypeTraits::type_singleton(); @@ -99,21 +95,20 @@ class TestMaskKernelWithNumeric : public TestMaskKernel { TYPED_TEST_CASE(TestMaskKernelWithNumeric, NumericArrowTypes); TYPED_TEST(TestMaskKernelWithNumeric, MaskNumeric) { - MaskOptions options; - this->AssertMask("[7, 8, 9]", "[0, 1, 0]", options, "[8]"); - this->AssertMask("[null, 8, 9]", "[0, 1, 0]", options, "[8]"); - this->AssertMask("[7, 8, 9]", "[null, 1, 0]", options, "[null, 8]"); + this->AssertMask("[7, 8, 9]", "[0, 1, 0]", "[8]"); + this->AssertMask("[null, 8, 9]", "[0, 1, 0]", "[8]"); + this->AssertMask("[7, 8, 9]", "[null, 1, 0]", "[null, 8]"); } class TestMaskKernelWithString : public TestMaskKernel { protected: - void AssertMask(const std::string& values, const std::string& mask, MaskOptions options, + void AssertMask(const std::string& values, const std::string& mask, const std::string& expected) { - TestMaskKernel::AssertMask(utf8(), values, mask, options, expected); + TestMaskKernel::AssertMask(utf8(), values, mask, expected); } void AssertMaskDictionary(const std::string& dictionary_values, const std::string& dictionary_mask, const std::string& mask, - MaskOptions options, const std::string& expected_mask) { + const std::string& expected_mask) { auto dict = ArrayFromJSON(utf8(), dictionary_values); auto type = dictionary(int8(), utf8()); std::shared_ptr values, actual, expected; @@ -122,23 +117,21 @@ class TestMaskKernelWithString : public TestMaskKernel { ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_mask), dict, &expected)); auto take_mask = ArrayFromJSON(boolean(), mask); - this->AssertMaskArrays(values, take_mask, options, expected); + this->AssertMaskArrays(values, take_mask, expected); } }; TEST_F(TestMaskKernelWithString, MaskString) { - MaskOptions options; - this->AssertMask(R"(["a", "b", "c"])", "[0, 1, 0]", options, R"(["b"])"); - this->AssertMask(R"([null, "b", "c"])", "[0, 1, 0]", options, R"(["b"])"); - this->AssertMask(R"(["a", "b", "c"])", "[null, 1, 0]", options, R"([null, "b"])"); + this->AssertMask(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["b"])"); + this->AssertMask(R"([null, "b", "c"])", "[0, 1, 0]", R"(["b"])"); + this->AssertMask(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])"); } TEST_F(TestMaskKernelWithString, MaskDictionary) { - MaskOptions options; auto dict = R"(["a", "b", "c", "d", "e"])"; - this->AssertMaskDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", options, "[4]"); - this->AssertMaskDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", options, "[4]"); - this->AssertMaskDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", options, "[null, 4]"); + this->AssertMaskDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[4]"); + this->AssertMaskDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[4]"); + this->AssertMaskDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4]"); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/mask.cc b/cpp/src/arrow/compute/kernels/mask.cc index 3a74deb08a8..59a13e01a0a 100644 --- a/cpp/src/arrow/compute/kernels/mask.cc +++ b/cpp/src/arrow/compute/kernels/mask.cc @@ -31,17 +31,16 @@ namespace arrow { namespace compute { Status Mask(FunctionContext* context, const Array& values, const Array& mask, - const MaskOptions& options, std::shared_ptr* out) { + std::shared_ptr* out) { Datum out_datum; - RETURN_NOT_OK( - Mask(context, Datum(values.data()), Datum(mask.data()), options, &out_datum)); + RETURN_NOT_OK(Mask(context, Datum(values.data()), Datum(mask.data()), &out_datum)); *out = out_datum.make_array(); return Status::OK(); } Status Mask(FunctionContext* context, const Datum& values, const Datum& mask, - const MaskOptions& options, Datum* out) { - MaskKernel kernel(values.type(), options); + Datum* out) { + MaskKernel kernel(values.type()); RETURN_NOT_OK(kernel.Call(context, values, mask, out)); return Status::OK(); } @@ -49,7 +48,6 @@ Status Mask(FunctionContext* context, const Datum& values, const Datum& mask, struct MaskParameters { FunctionContext* context; std::shared_ptr values, mask; - MaskOptions options; std::shared_ptr* out; }; @@ -218,7 +216,6 @@ Status MaskKernel::Call(FunctionContext* ctx, const Datum& values, const Datum& params.context = ctx; params.values = values.make_array(); params.mask = mask.make_array(); - params.options = options_; params.out = &out_array; UnpackMask unpack = {params}; RETURN_NOT_OK(VisitTypeInline(*mask.type(), &unpack)); diff --git a/cpp/src/arrow/compute/kernels/mask.h b/cpp/src/arrow/compute/kernels/mask.h index 6866f180462..1bd7ef0bb95 100644 --- a/cpp/src/arrow/compute/kernels/mask.h +++ b/cpp/src/arrow/compute/kernels/mask.h @@ -31,8 +31,6 @@ namespace compute { class FunctionContext; -struct ARROW_EXPORT MaskOptions {}; - /// \brief Mask an array with a boolean selection filter /// /// The output array will be populated with values from the input at positions @@ -46,28 +44,25 @@ struct ARROW_EXPORT MaskOptions {}; /// \param[in] context the FunctionContext /// \param[in] values array from which to take /// \param[in] mask indicates which values should be masked out -/// \param[in] options options /// \param[out] out resulting array ARROW_EXPORT Status Mask(FunctionContext* context, const Array& values, const Array& mask, - const MaskOptions& options, std::shared_ptr* out); + std::shared_ptr* out); /// \brief Mask an array with a boolean selection filter /// /// \param[in] context the FunctionContext /// \param[in] values datum from which to take /// \param[in] mask indicates which values should be masked out -/// \param[in] options options /// \param[out] out resulting datum ARROW_EXPORT Status Mask(FunctionContext* context, const Datum& values, const Datum& indices, - const MaskOptions& options, Datum* out); + Datum* out); /// \brief BinaryKernel implementing Mask operation class ARROW_EXPORT MaskKernel : public BinaryKernel { public: - explicit MaskKernel(const std::shared_ptr& type, MaskOptions options = {}) - : type_(type), options_(options) {} + explicit MaskKernel(const std::shared_ptr& type) : type_(type) {} Status Call(FunctionContext* ctx, const Datum& values, const Datum& indices, Datum* out) override; @@ -76,7 +71,6 @@ class ARROW_EXPORT MaskKernel : public BinaryKernel { private: std::shared_ptr type_; - MaskOptions options_; }; } // namespace compute } // namespace arrow From c953dca1b7d0866e90611e92e344b39af6cf01ff Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 22 May 2019 11:02:49 -0400 Subject: [PATCH 03/29] remove empty TakeOptions --- cpp/src/arrow/compute/kernels/take-test.cc | 84 ++++++++++------------ cpp/src/arrow/compute/kernels/take.cc | 11 ++- cpp/src/arrow/compute/kernels/take.h | 12 +--- 3 files changed, 44 insertions(+), 63 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/take-test.cc b/cpp/src/arrow/compute/kernels/take-test.cc index b3de04d8cc7..769cd5e3043 100644 --- a/cpp/src/arrow/compute/kernels/take-test.cc +++ b/cpp/src/arrow/compute/kernels/take-test.cc @@ -35,81 +35,75 @@ template class TestTakeKernel : public ComputeFixture, public TestBase { protected: void AssertTakeArrays(const std::shared_ptr& values, - const std::shared_ptr& indices, TakeOptions options, + const std::shared_ptr& indices, const std::shared_ptr& expected) { std::shared_ptr actual; - ASSERT_OK(arrow::compute::Take(&this->ctx_, *values, *indices, options, &actual)); + ASSERT_OK(arrow::compute::Take(&this->ctx_, *values, *indices, &actual)); AssertArraysEqual(*expected, *actual); } void AssertTake(const std::shared_ptr& type, const std::string& values, - const std::string& indices, TakeOptions options, - const std::string& expected) { + const std::string& indices, const std::string& expected) { std::shared_ptr actual; for (auto index_type : {int8(), uint32()}) { - ASSERT_OK(this->Take(type, values, index_type, indices, options, &actual)); + ASSERT_OK(this->Take(type, values, index_type, indices, &actual)); AssertArraysEqual(*ArrayFromJSON(type, expected), *actual); } } Status Take(const std::shared_ptr& type, const std::string& values, const std::shared_ptr& index_type, const std::string& indices, - TakeOptions options, std::shared_ptr* out) { + std::shared_ptr* out) { return arrow::compute::Take(&this->ctx_, *ArrayFromJSON(type, values), - *ArrayFromJSON(index_type, indices), options, out); + *ArrayFromJSON(index_type, indices), out); } }; class TestTakeKernelWithNull : public TestTakeKernel { protected: void AssertTake(const std::string& values, const std::string& indices, - TakeOptions options, const std::string& expected) { - TestTakeKernel::AssertTake(utf8(), values, indices, options, expected); + const std::string& expected) { + TestTakeKernel::AssertTake(utf8(), values, indices, expected); } }; TEST_F(TestTakeKernelWithNull, TakeNull) { - TakeOptions options; - this->AssertTake("[null, null, null]", "[0, 1, 0]", options, "[null, null, null]"); + this->AssertTake("[null, null, null]", "[0, 1, 0]", "[null, null, null]"); std::shared_ptr arr; - ASSERT_RAISES(IndexError, this->Take(null(), "[null, null, null]", int8(), "[0, 9, 0]", - options, &arr)); + ASSERT_RAISES(IndexError, + this->Take(null(), "[null, null, null]", int8(), "[0, 9, 0]", &arr)); } TEST_F(TestTakeKernelWithNull, InvalidIndexType) { - TakeOptions options; std::shared_ptr arr; ASSERT_RAISES(TypeError, this->Take(null(), "[null, null, null]", float32(), - "[0.0, 1.0, 0.1]", options, &arr)); + "[0.0, 1.0, 0.1]", &arr)); } class TestTakeKernelWithBoolean : public TestTakeKernel { protected: void AssertTake(const std::string& values, const std::string& indices, - TakeOptions options, const std::string& expected) { - TestTakeKernel::AssertTake(boolean(), values, indices, options, - expected); + const std::string& expected) { + TestTakeKernel::AssertTake(boolean(), values, indices, expected); } }; TEST_F(TestTakeKernelWithBoolean, TakeBoolean) { - TakeOptions options; - this->AssertTake("[true, false, true]", "[0, 1, 0]", options, "[true, false, true]"); - this->AssertTake("[null, false, true]", "[0, 1, 0]", options, "[null, false, null]"); - this->AssertTake("[true, false, true]", "[null, 1, 0]", options, "[null, false, true]"); + this->AssertTake("[true, false, true]", "[0, 1, 0]", "[true, false, true]"); + this->AssertTake("[null, false, true]", "[0, 1, 0]", "[null, false, null]"); + this->AssertTake("[true, false, true]", "[null, 1, 0]", "[null, false, true]"); std::shared_ptr arr; - ASSERT_RAISES(IndexError, this->Take(boolean(), "[true, false, true]", int8(), - "[0, 9, 0]", options, &arr)); + ASSERT_RAISES(IndexError, + this->Take(boolean(), "[true, false, true]", int8(), "[0, 9, 0]", &arr)); } template class TestTakeKernelWithNumeric : public TestTakeKernel { protected: void AssertTake(const std::string& values, const std::string& indices, - TakeOptions options, const std::string& expected) { - TestTakeKernel::AssertTake(type_singleton(), values, indices, options, - expected); + const std::string& expected) { + TestTakeKernel::AssertTake(type_singleton(), values, indices, expected); } std::shared_ptr type_singleton() { return TypeTraits::type_singleton(); @@ -118,25 +112,24 @@ class TestTakeKernelWithNumeric : public TestTakeKernel { TYPED_TEST_CASE(TestTakeKernelWithNumeric, NumericArrowTypes); TYPED_TEST(TestTakeKernelWithNumeric, TakeNumeric) { - TakeOptions options; - this->AssertTake("[7, 8, 9]", "[0, 1, 0]", options, "[7, 8, 7]"); - this->AssertTake("[null, 8, 9]", "[0, 1, 0]", options, "[null, 8, null]"); - this->AssertTake("[7, 8, 9]", "[null, 1, 0]", options, "[null, 8, 7]"); + this->AssertTake("[7, 8, 9]", "[0, 1, 0]", "[7, 8, 7]"); + this->AssertTake("[null, 8, 9]", "[0, 1, 0]", "[null, 8, null]"); + this->AssertTake("[7, 8, 9]", "[null, 1, 0]", "[null, 8, 7]"); std::shared_ptr arr; ASSERT_RAISES(IndexError, this->Take(this->type_singleton(), "[7, 8, 9]", int8(), - "[0, 9, 0]", options, &arr)); + "[0, 9, 0]", &arr)); } class TestTakeKernelWithString : public TestTakeKernel { protected: void AssertTake(const std::string& values, const std::string& indices, - TakeOptions options, const std::string& expected) { - TestTakeKernel::AssertTake(utf8(), values, indices, options, expected); + const std::string& expected) { + TestTakeKernel::AssertTake(utf8(), values, indices, expected); } void AssertTakeDictionary(const std::string& dictionary_values, const std::string& dictionary_indices, - const std::string& indices, TakeOptions options, + const std::string& indices, const std::string& expected_indices) { auto dict = ArrayFromJSON(utf8(), dictionary_values); auto type = dictionary(int8(), utf8()); @@ -146,28 +139,25 @@ class TestTakeKernelWithString : public TestTakeKernel { ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_indices), dict, &expected)); auto take_indices = ArrayFromJSON(int8(), indices); - this->AssertTakeArrays(values, take_indices, options, expected); + this->AssertTakeArrays(values, take_indices, expected); } }; TEST_F(TestTakeKernelWithString, TakeString) { - TakeOptions options; - this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", options, R"(["a", "b", "a"])"); - this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", options, "[null, \"b\", null]"); - this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", options, R"([null, "b", "a"])"); + this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])"); + this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]"); + this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])"); std::shared_ptr arr; - ASSERT_RAISES(IndexError, this->Take(utf8(), R"(["a", "b", "c"])", int8(), "[0, 9, 0]", - options, &arr)); + ASSERT_RAISES(IndexError, + this->Take(utf8(), R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr)); } TEST_F(TestTakeKernelWithString, TakeDictionary) { - TakeOptions options; auto dict = R"(["a", "b", "c", "d", "e"])"; - this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", options, "[3, 4, 3]"); - this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", options, - "[null, 4, null]"); - this->AssertTakeDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", options, "[null, 4, 3]"); + this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]"); + this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[null, 4, null]"); + this->AssertTakeDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4, 3]"); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/take.cc b/cpp/src/arrow/compute/kernels/take.cc index 9af2c0cab11..6addcd7444d 100644 --- a/cpp/src/arrow/compute/kernels/take.cc +++ b/cpp/src/arrow/compute/kernels/take.cc @@ -29,17 +29,16 @@ namespace arrow { namespace compute { Status Take(FunctionContext* context, const Array& values, const Array& indices, - const TakeOptions& options, std::shared_ptr* out) { + std::shared_ptr* out) { Datum out_datum; - RETURN_NOT_OK( - Take(context, Datum(values.data()), Datum(indices.data()), options, &out_datum)); + RETURN_NOT_OK(Take(context, Datum(values.data()), Datum(indices.data()), &out_datum)); *out = out_datum.make_array(); return Status::OK(); } Status Take(FunctionContext* context, const Datum& values, const Datum& indices, - const TakeOptions& options, Datum* out) { - TakeKernel kernel(values.type(), options); + Datum* out) { + TakeKernel kernel(values.type()); RETURN_NOT_OK(kernel.Call(context, values, indices, out)); return Status::OK(); } @@ -47,7 +46,6 @@ Status Take(FunctionContext* context, const Datum& values, const Datum& indices, struct TakeParameters { FunctionContext* context; std::shared_ptr values, indices; - TakeOptions options; std::shared_ptr* out; }; @@ -215,7 +213,6 @@ Status TakeKernel::Call(FunctionContext* ctx, const Datum& values, const Datum& params.context = ctx; params.values = values.make_array(); params.indices = indices.make_array(); - params.options = options_; params.out = &out_array; UnpackIndices unpack = {params}; RETURN_NOT_OK(VisitTypeInline(*indices.type(), &unpack)); diff --git a/cpp/src/arrow/compute/kernels/take.h b/cpp/src/arrow/compute/kernels/take.h index 3aa5ed5eedf..1e89bd11a22 100644 --- a/cpp/src/arrow/compute/kernels/take.h +++ b/cpp/src/arrow/compute/kernels/take.h @@ -31,8 +31,6 @@ namespace compute { class FunctionContext; -struct ARROW_EXPORT TakeOptions {}; - /// \brief Take from an array of values at indices in another array /// /// The output array will be of the same type as the input values @@ -47,28 +45,25 @@ struct ARROW_EXPORT TakeOptions {}; /// \param[in] context the FunctionContext /// \param[in] values array from which to take /// \param[in] indices which values to take -/// \param[in] options options /// \param[out] out resulting array ARROW_EXPORT Status Take(FunctionContext* context, const Array& values, const Array& indices, - const TakeOptions& options, std::shared_ptr* out); + std::shared_ptr* out); /// \brief Take from an array of values at indices in another array /// /// \param[in] context the FunctionContext /// \param[in] values datum from which to take /// \param[in] indices which values to take -/// \param[in] options options /// \param[out] out resulting datum ARROW_EXPORT Status Take(FunctionContext* context, const Datum& values, const Datum& indices, - const TakeOptions& options, Datum* out); + Datum* out); /// \brief BinaryKernel implementing Take operation class ARROW_EXPORT TakeKernel : public BinaryKernel { public: - explicit TakeKernel(const std::shared_ptr& type, TakeOptions options = {}) - : type_(type), options_(options) {} + explicit TakeKernel(const std::shared_ptr& type) : type_(type) {} Status Call(FunctionContext* ctx, const Datum& values, const Datum& indices, Datum* out) override; @@ -77,7 +72,6 @@ class ARROW_EXPORT TakeKernel : public BinaryKernel { private: std::shared_ptr type_; - TakeOptions options_; }; } // namespace compute } // namespace arrow From 223a8605e61a61390408865bf539dbe11f93eb2d Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 22 May 2019 11:04:27 -0400 Subject: [PATCH 04/29] fix typo --- cpp/src/arrow/compute/kernels/mask.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/mask.cc b/cpp/src/arrow/compute/kernels/mask.cc index 59a13e01a0a..472c50af9df 100644 --- a/cpp/src/arrow/compute/kernels/mask.cc +++ b/cpp/src/arrow/compute/kernels/mask.cc @@ -156,7 +156,7 @@ struct UnpackValues { const auto& values = internal::checked_cast(*params_.values); { // To take from a dictionary, apply the current kernel to the dictionary's - // mask. (Use UnpackValues since MaskType is already unpacked) + // indices. (Use UnpackValues since MaskType is already unpacked) MaskParameters params = params_; params.values = values.indices(); params.out = &masked_indices; From d5c9c14f636cbf4451af9ad6a29045415fc3d7de Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 22 May 2019 11:06:26 -0400 Subject: [PATCH 05/29] use checked_cast --- cpp/src/arrow/compute/kernels/mask.cc | 10 ++++++---- cpp/src/arrow/compute/kernels/take.cc | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/mask.cc b/cpp/src/arrow/compute/kernels/mask.cc index 472c50af9df..2aaa9a9a622 100644 --- a/cpp/src/arrow/compute/kernels/mask.cc +++ b/cpp/src/arrow/compute/kernels/mask.cc @@ -30,6 +30,8 @@ namespace arrow { namespace compute { +using internal::checked_cast; + Status Mask(FunctionContext* context, const Array& values, const Array& mask, std::shared_ptr* out) { Datum out_datum; @@ -135,18 +137,18 @@ struct UnpackValues { template Status Visit(const ValueType&) { using OutBuilder = typename TypeTraits::BuilderType; - auto&& mask = static_cast&>(*params_.mask); - auto&& values = static_cast&>(*params_.values); + const auto& mask = checked_cast&>(*params_.mask); + const auto& values = checked_cast&>(*params_.values); std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(params_.context->memory_pool(), values.type(), &builder)); RETURN_NOT_OK(builder->Reserve(OutputSize(mask))); RETURN_NOT_OK(UnpackValuesNullCount(params_.context, values, mask, - static_cast(builder.get()))); + checked_cast(builder.get()))); return builder->Finish(params_.out); } Status Visit(const NullType& t) { - auto&& mask = static_cast&>(*params_.mask); + const auto& mask = checked_cast&>(*params_.mask); params_.out->reset(new NullArray(OutputSize(mask))); return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/take.cc b/cpp/src/arrow/compute/kernels/take.cc index 6addcd7444d..b77715abb6f 100644 --- a/cpp/src/arrow/compute/kernels/take.cc +++ b/cpp/src/arrow/compute/kernels/take.cc @@ -28,6 +28,8 @@ namespace arrow { namespace compute { +using internal::checked_cast; + Status Take(FunctionContext* context, const Array& values, const Array& indices, std::shared_ptr* out) { Datum out_datum; @@ -117,20 +119,20 @@ struct UnpackValues { Status Visit(const ValueType&) { using ValueArrayRef = const typename TypeTraits::ArrayType&; using OutBuilder = typename TypeTraits::BuilderType; - IndexArrayRef indices = static_cast(*params_.indices); - ValueArrayRef values = static_cast(*params_.values); + IndexArrayRef indices = checked_cast(*params_.indices); + ValueArrayRef values = checked_cast(*params_.values); std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(params_.context->memory_pool(), values.type(), &builder)); RETURN_NOT_OK(builder->Reserve(indices.length())); RETURN_NOT_OK(UnpackValuesNullCount(params_.context, values, indices, - static_cast(builder.get()))); + checked_cast(builder.get()))); return builder->Finish(params_.out); } Status Visit(const NullType& t) { auto indices_length = params_.indices->length(); if (indices_length != 0) { - auto indices = static_cast(*params_.indices).raw_values(); + auto indices = checked_cast(*params_.indices).raw_values(); auto minmax = std::minmax_element(indices, indices + indices_length); auto min = static_cast(*minmax.first); auto max = static_cast(*minmax.second); From a54741ea2272f77bfb4ac5b45f702f0b4bd4c210 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 22 May 2019 11:08:45 -0400 Subject: [PATCH 06/29] add some tests with empty masks/take indices --- cpp/src/arrow/compute/kernels/mask-test.cc | 1 + cpp/src/arrow/compute/kernels/mask.cc | 3 +++ cpp/src/arrow/compute/kernels/take-test.cc | 1 + 3 files changed, 5 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/mask-test.cc b/cpp/src/arrow/compute/kernels/mask-test.cc index dbb59577585..5672407694e 100644 --- a/cpp/src/arrow/compute/kernels/mask-test.cc +++ b/cpp/src/arrow/compute/kernels/mask-test.cc @@ -98,6 +98,7 @@ TYPED_TEST(TestMaskKernelWithNumeric, MaskNumeric) { this->AssertMask("[7, 8, 9]", "[0, 1, 0]", "[8]"); this->AssertMask("[null, 8, 9]", "[0, 1, 0]", "[8]"); this->AssertMask("[7, 8, 9]", "[null, 1, 0]", "[null, 8]"); + this->AssertMask("[]", "[]", "[]"); } class TestMaskKernelWithString : public TestMaskKernel { diff --git a/cpp/src/arrow/compute/kernels/mask.cc b/cpp/src/arrow/compute/kernels/mask.cc index 2aaa9a9a622..7041ec86b66 100644 --- a/cpp/src/arrow/compute/kernels/mask.cc +++ b/cpp/src/arrow/compute/kernels/mask.cc @@ -218,6 +218,9 @@ Status MaskKernel::Call(FunctionContext* ctx, const Datum& values, const Datum& params.context = ctx; params.values = values.make_array(); params.mask = mask.make_array(); + if (params.values->length() != params.mask->length()) { + return Status::Invalid("Mask is not the same size as values"); + } params.out = &out_array; UnpackMask unpack = {params}; RETURN_NOT_OK(VisitTypeInline(*mask.type(), &unpack)); diff --git a/cpp/src/arrow/compute/kernels/take-test.cc b/cpp/src/arrow/compute/kernels/take-test.cc index 769cd5e3043..1fb5c307179 100644 --- a/cpp/src/arrow/compute/kernels/take-test.cc +++ b/cpp/src/arrow/compute/kernels/take-test.cc @@ -115,6 +115,7 @@ TYPED_TEST(TestTakeKernelWithNumeric, TakeNumeric) { this->AssertTake("[7, 8, 9]", "[0, 1, 0]", "[7, 8, 7]"); this->AssertTake("[null, 8, 9]", "[0, 1, 0]", "[null, 8, null]"); this->AssertTake("[7, 8, 9]", "[null, 1, 0]", "[null, 8, 7]"); + this->AssertTake("[null, 8, 9]", "[]", "[]"); std::shared_ptr arr; ASSERT_RAISES(IndexError, this->Take(this->type_singleton(), "[7, 8, 9]", int8(), From 4c8ce6d9faf6288674426c4618148ae72cccf3a4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 22 May 2019 11:14:45 -0400 Subject: [PATCH 07/29] revert submodule --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index bb7b6abbb3f..2fc3ade4ccb 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit bb7b6abbb3fbeff845646364a4286142127be04c +Subproject commit 2fc3ade4ccbf17271194df0b1549bc6733204314 From 4b24ca33039729161b451b4381d2fcfb697e498e Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 22 May 2019 13:09:36 -0400 Subject: [PATCH 08/29] revert removal of TakeOptions --- cpp/src/arrow/compute/kernels/take-test.cc | 86 ++++++++++++---------- cpp/src/arrow/compute/kernels/take.cc | 11 ++- cpp/src/arrow/compute/kernels/take.h | 12 ++- 3 files changed, 64 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/take-test.cc b/cpp/src/arrow/compute/kernels/take-test.cc index 1fb5c307179..c61aedab5b9 100644 --- a/cpp/src/arrow/compute/kernels/take-test.cc +++ b/cpp/src/arrow/compute/kernels/take-test.cc @@ -35,75 +35,81 @@ template class TestTakeKernel : public ComputeFixture, public TestBase { protected: void AssertTakeArrays(const std::shared_ptr& values, - const std::shared_ptr& indices, + const std::shared_ptr& indices, TakeOptions options, const std::shared_ptr& expected) { std::shared_ptr actual; - ASSERT_OK(arrow::compute::Take(&this->ctx_, *values, *indices, &actual)); + ASSERT_OK(arrow::compute::Take(&this->ctx_, *values, *indices, options, &actual)); AssertArraysEqual(*expected, *actual); } void AssertTake(const std::shared_ptr& type, const std::string& values, - const std::string& indices, const std::string& expected) { + const std::string& indices, TakeOptions options, + const std::string& expected) { std::shared_ptr actual; for (auto index_type : {int8(), uint32()}) { - ASSERT_OK(this->Take(type, values, index_type, indices, &actual)); + ASSERT_OK(this->Take(type, values, index_type, indices, options, &actual)); AssertArraysEqual(*ArrayFromJSON(type, expected), *actual); } } Status Take(const std::shared_ptr& type, const std::string& values, const std::shared_ptr& index_type, const std::string& indices, - std::shared_ptr* out) { + TakeOptions options, std::shared_ptr* out) { return arrow::compute::Take(&this->ctx_, *ArrayFromJSON(type, values), - *ArrayFromJSON(index_type, indices), out); + *ArrayFromJSON(index_type, indices), options, out); } }; class TestTakeKernelWithNull : public TestTakeKernel { protected: void AssertTake(const std::string& values, const std::string& indices, - const std::string& expected) { - TestTakeKernel::AssertTake(utf8(), values, indices, expected); + TakeOptions options, const std::string& expected) { + TestTakeKernel::AssertTake(utf8(), values, indices, options, expected); } }; TEST_F(TestTakeKernelWithNull, TakeNull) { - this->AssertTake("[null, null, null]", "[0, 1, 0]", "[null, null, null]"); + TakeOptions options; + this->AssertTake("[null, null, null]", "[0, 1, 0]", options, "[null, null, null]"); std::shared_ptr arr; - ASSERT_RAISES(IndexError, - this->Take(null(), "[null, null, null]", int8(), "[0, 9, 0]", &arr)); + ASSERT_RAISES(IndexError, this->Take(null(), "[null, null, null]", int8(), "[0, 9, 0]", + options, &arr)); } TEST_F(TestTakeKernelWithNull, InvalidIndexType) { + TakeOptions options; std::shared_ptr arr; ASSERT_RAISES(TypeError, this->Take(null(), "[null, null, null]", float32(), - "[0.0, 1.0, 0.1]", &arr)); + "[0.0, 1.0, 0.1]", options, &arr)); } class TestTakeKernelWithBoolean : public TestTakeKernel { protected: void AssertTake(const std::string& values, const std::string& indices, - const std::string& expected) { - TestTakeKernel::AssertTake(boolean(), values, indices, expected); + TakeOptions options, const std::string& expected) { + TestTakeKernel::AssertTake(boolean(), values, indices, options, + expected); } }; TEST_F(TestTakeKernelWithBoolean, TakeBoolean) { - this->AssertTake("[true, false, true]", "[0, 1, 0]", "[true, false, true]"); - this->AssertTake("[null, false, true]", "[0, 1, 0]", "[null, false, null]"); - this->AssertTake("[true, false, true]", "[null, 1, 0]", "[null, false, true]"); + TakeOptions options; + this->AssertTake("[true, false, true]", "[0, 1, 0]", options, "[true, false, true]"); + this->AssertTake("[null, false, true]", "[0, 1, 0]", options, "[null, false, null]"); + this->AssertTake("[true, false, true]", "[null, 1, 0]", options, "[null, false, true]"); std::shared_ptr arr; - ASSERT_RAISES(IndexError, - this->Take(boolean(), "[true, false, true]", int8(), "[0, 9, 0]", &arr)); + ASSERT_RAISES(IndexError, this->Take(boolean(), "[true, false, true]", int8(), + "[0, 9, 0]", options, &arr)); } template class TestTakeKernelWithNumeric : public TestTakeKernel { protected: void AssertTake(const std::string& values, const std::string& indices, - const std::string& expected) { - TestTakeKernel::AssertTake(type_singleton(), values, indices, expected); + TakeOptions options, const std::string& expected) { + TestTakeKernel::AssertTake(type_singleton(), values, indices, options, + expected); } std::shared_ptr type_singleton() { return TypeTraits::type_singleton(); @@ -112,25 +118,26 @@ class TestTakeKernelWithNumeric : public TestTakeKernel { TYPED_TEST_CASE(TestTakeKernelWithNumeric, NumericArrowTypes); TYPED_TEST(TestTakeKernelWithNumeric, TakeNumeric) { - this->AssertTake("[7, 8, 9]", "[0, 1, 0]", "[7, 8, 7]"); - this->AssertTake("[null, 8, 9]", "[0, 1, 0]", "[null, 8, null]"); - this->AssertTake("[7, 8, 9]", "[null, 1, 0]", "[null, 8, 7]"); - this->AssertTake("[null, 8, 9]", "[]", "[]"); + TakeOptions options; + this->AssertTake("[7, 8, 9]", "[0, 1, 0]", options, "[7, 8, 7]"); + this->AssertTake("[null, 8, 9]", "[0, 1, 0]", options, "[null, 8, null]"); + this->AssertTake("[7, 8, 9]", "[null, 1, 0]", options, "[null, 8, 7]"); + this->AssertTake("[null, 8, 9]", "[]", options, "[]"); std::shared_ptr arr; ASSERT_RAISES(IndexError, this->Take(this->type_singleton(), "[7, 8, 9]", int8(), - "[0, 9, 0]", &arr)); + "[0, 9, 0]", options, &arr)); } class TestTakeKernelWithString : public TestTakeKernel { protected: void AssertTake(const std::string& values, const std::string& indices, - const std::string& expected) { - TestTakeKernel::AssertTake(utf8(), values, indices, expected); + TakeOptions options, const std::string& expected) { + TestTakeKernel::AssertTake(utf8(), values, indices, options, expected); } void AssertTakeDictionary(const std::string& dictionary_values, const std::string& dictionary_indices, - const std::string& indices, + const std::string& indices, TakeOptions options, const std::string& expected_indices) { auto dict = ArrayFromJSON(utf8(), dictionary_values); auto type = dictionary(int8(), utf8()); @@ -140,25 +147,28 @@ class TestTakeKernelWithString : public TestTakeKernel { ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_indices), dict, &expected)); auto take_indices = ArrayFromJSON(int8(), indices); - this->AssertTakeArrays(values, take_indices, expected); + this->AssertTakeArrays(values, take_indices, options, expected); } }; TEST_F(TestTakeKernelWithString, TakeString) { - this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])"); - this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]"); - this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])"); + TakeOptions options; + this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", options, R"(["a", "b", "a"])"); + this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", options, "[null, \"b\", null]"); + this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", options, R"([null, "b", "a"])"); std::shared_ptr arr; - ASSERT_RAISES(IndexError, - this->Take(utf8(), R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr)); + ASSERT_RAISES(IndexError, this->Take(utf8(), R"(["a", "b", "c"])", int8(), "[0, 9, 0]", + options, &arr)); } TEST_F(TestTakeKernelWithString, TakeDictionary) { + TakeOptions options; auto dict = R"(["a", "b", "c", "d", "e"])"; - this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]"); - this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[null, 4, null]"); - this->AssertTakeDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4, 3]"); + this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", options, "[3, 4, 3]"); + this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", options, + "[null, 4, null]"); + this->AssertTakeDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", options, "[null, 4, 3]"); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/take.cc b/cpp/src/arrow/compute/kernels/take.cc index b77715abb6f..17b054099ea 100644 --- a/cpp/src/arrow/compute/kernels/take.cc +++ b/cpp/src/arrow/compute/kernels/take.cc @@ -31,16 +31,17 @@ namespace compute { using internal::checked_cast; Status Take(FunctionContext* context, const Array& values, const Array& indices, - std::shared_ptr* out) { + const TakeOptions& options, std::shared_ptr* out) { Datum out_datum; - RETURN_NOT_OK(Take(context, Datum(values.data()), Datum(indices.data()), &out_datum)); + RETURN_NOT_OK( + Take(context, Datum(values.data()), Datum(indices.data()), options, &out_datum)); *out = out_datum.make_array(); return Status::OK(); } Status Take(FunctionContext* context, const Datum& values, const Datum& indices, - Datum* out) { - TakeKernel kernel(values.type()); + const TakeOptions& options, Datum* out) { + TakeKernel kernel(values.type(), options); RETURN_NOT_OK(kernel.Call(context, values, indices, out)); return Status::OK(); } @@ -48,6 +49,7 @@ Status Take(FunctionContext* context, const Datum& values, const Datum& indices, struct TakeParameters { FunctionContext* context; std::shared_ptr values, indices; + TakeOptions options; std::shared_ptr* out; }; @@ -215,6 +217,7 @@ Status TakeKernel::Call(FunctionContext* ctx, const Datum& values, const Datum& params.context = ctx; params.values = values.make_array(); params.indices = indices.make_array(); + params.options = options_; params.out = &out_array; UnpackIndices unpack = {params}; RETURN_NOT_OK(VisitTypeInline(*indices.type(), &unpack)); diff --git a/cpp/src/arrow/compute/kernels/take.h b/cpp/src/arrow/compute/kernels/take.h index 1e89bd11a22..3aa5ed5eedf 100644 --- a/cpp/src/arrow/compute/kernels/take.h +++ b/cpp/src/arrow/compute/kernels/take.h @@ -31,6 +31,8 @@ namespace compute { class FunctionContext; +struct ARROW_EXPORT TakeOptions {}; + /// \brief Take from an array of values at indices in another array /// /// The output array will be of the same type as the input values @@ -45,25 +47,28 @@ class FunctionContext; /// \param[in] context the FunctionContext /// \param[in] values array from which to take /// \param[in] indices which values to take +/// \param[in] options options /// \param[out] out resulting array ARROW_EXPORT Status Take(FunctionContext* context, const Array& values, const Array& indices, - std::shared_ptr* out); + const TakeOptions& options, std::shared_ptr* out); /// \brief Take from an array of values at indices in another array /// /// \param[in] context the FunctionContext /// \param[in] values datum from which to take /// \param[in] indices which values to take +/// \param[in] options options /// \param[out] out resulting datum ARROW_EXPORT Status Take(FunctionContext* context, const Datum& values, const Datum& indices, - Datum* out); + const TakeOptions& options, Datum* out); /// \brief BinaryKernel implementing Take operation class ARROW_EXPORT TakeKernel : public BinaryKernel { public: - explicit TakeKernel(const std::shared_ptr& type) : type_(type) {} + explicit TakeKernel(const std::shared_ptr& type, TakeOptions options = {}) + : type_(type), options_(options) {} Status Call(FunctionContext* ctx, const Datum& values, const Datum& indices, Datum* out) override; @@ -72,6 +77,7 @@ class ARROW_EXPORT TakeKernel : public BinaryKernel { private: std::shared_ptr type_; + TakeOptions options_; }; } // namespace compute } // namespace arrow From edf2eb15df91894582d7afc173e503c65a6cc406 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 10 Jun 2019 11:55:22 -0400 Subject: [PATCH 09/29] rename FilterFunction -> CompareFunction --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/compute/kernels/CMakeLists.txt | 6 +- ...lter-benchmark.cc => compare-benchmark.cc} | 0 .../{filter-test.cc => compare-test.cc} | 1 - cpp/src/arrow/compute/kernels/compare.cc | 91 +++++++++++------- cpp/src/arrow/compute/kernels/compare.h | 70 +++++++++++++- cpp/src/arrow/compute/kernels/filter.cc | 59 ------------ cpp/src/arrow/compute/kernels/filter.h | 96 ------------------- cpp/src/arrow/compute/kernels/mask.h | 5 +- 9 files changed, 129 insertions(+), 200 deletions(-) rename cpp/src/arrow/compute/kernels/{filter-benchmark.cc => compare-benchmark.cc} (100%) rename cpp/src/arrow/compute/kernels/{filter-test.cc => compare-test.cc} (99%) delete mode 100644 cpp/src/arrow/compute/kernels/filter.cc delete mode 100644 cpp/src/arrow/compute/kernels/filter.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 771a0f72d91..56a1b135c0c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -172,7 +172,6 @@ if(ARROW_COMPUTE) compute/kernels/cast.cc compute/kernels/compare.cc compute/kernels/count.cc - compute/kernels/filter.cc compute/kernels/hash.cc compute/kernels/mask.cc compute/kernels/mean.cc diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 8b223d8e536..248f990fa02 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -28,6 +28,6 @@ add_arrow_test(util-internal-test PREFIX "arrow-compute") add_arrow_test(aggregate-test PREFIX "arrow-compute") add_arrow_benchmark(aggregate-benchmark PREFIX "arrow-compute") -# Filters -add_arrow_test(filter-test PREFIX "arrow-compute") -add_arrow_benchmark(filter-benchmark PREFIX "arrow-compute") +# Comparison +add_arrow_test(compare-test PREFIX "arrow-compute") +add_arrow_benchmark(compare-benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/filter-benchmark.cc b/cpp/src/arrow/compute/kernels/compare-benchmark.cc similarity index 100% rename from cpp/src/arrow/compute/kernels/filter-benchmark.cc rename to cpp/src/arrow/compute/kernels/compare-benchmark.cc diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/compare-test.cc similarity index 99% rename from cpp/src/arrow/compute/kernels/filter-test.cc rename to cpp/src/arrow/compute/kernels/compare-test.cc index 1c8967cf7ac..1a6339c57d8 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/compare-test.cc @@ -26,7 +26,6 @@ #include "arrow/array.h" #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/compare.h" -#include "arrow/compute/kernels/filter.h" #include "arrow/compute/test-util.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/compute/kernels/compare.cc b/cpp/src/arrow/compute/kernels/compare.cc index 040793f4e65..f5fab7d0fe2 100644 --- a/cpp/src/arrow/compute/kernels/compare.cc +++ b/cpp/src/arrow/compute/kernels/compare.cc @@ -19,7 +19,6 @@ #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" -#include "arrow/compute/kernels/filter.h" #include "arrow/compute/kernels/util-internal.h" #include "arrow/util/bit-util.h" #include "arrow/util/logging.h" @@ -28,8 +27,35 @@ namespace arrow { namespace compute { -class FunctionContext; -struct Datum; +std::shared_ptr CompareBinaryKernel::out_type() const { + return compare_function_->out_type(); +} + +Status CompareBinaryKernel::Call(FunctionContext* ctx, const Datum& left, + const Datum& right, Datum* out) { + DCHECK(left.type()->Equals(right.type())); + + auto lk = left.kind(); + auto rk = right.kind(); + auto out_array = out->array(); + + if (lk == Datum::ARRAY && rk == Datum::SCALAR) { + auto array = left.array(); + auto scalar = right.scalar(); + return compare_function_->Compare(*array, *scalar, &out_array); + } else if (lk == Datum::SCALAR && rk == Datum::ARRAY) { + auto scalar = left.scalar(); + auto array = right.array(); + auto out_array = out->array(); + return compare_function_->Compare(*scalar, *array, &out_array); + } else if (lk == Datum::ARRAY && rk == Datum::ARRAY) { + auto lhs = left.array(); + auto rhs = right.array(); + return compare_function_->Compare(*lhs, *rhs, &out_array); + } + + return Status::Invalid("Invalid datum signature for CompareBinaryKernel"); +} template ::ScalarType, @@ -76,13 +102,14 @@ static Status CompareArrayArray(const ArrayData& lhs, const ArrayData& rhs, } template -class CompareFunction final : public FilterFunction { +class CompareFunctionImpl final : public CompareFunction { + using ArrayType = typename TypeTraits::ArrayType; using ScalarType = typename TypeTraits::ScalarType; public: - explicit CompareFunction(FunctionContext* ctx) : ctx_(ctx) {} + explicit CompareFunctionImpl(FunctionContext* ctx) : ctx_(ctx) {} - Status Filter(const ArrayData& array, const Scalar& scalar, ArrayData* output) const { + Status Compare(const ArrayData& array, const Scalar& scalar, ArrayData* output) const { // Caller must cast DCHECK(array.type->Equals(scalar.type)); // Output must be a boolean array @@ -103,7 +130,7 @@ class CompareFunction final : public FilterFunction { array, static_cast(scalar), bitmap_result); } - Status Filter(const Scalar& scalar, const ArrayData& array, ArrayData* output) const { + Status Compare(const Scalar& scalar, const ArrayData& array, ArrayData* output) const { // Caller must cast DCHECK(array.type->Equals(scalar.type)); // Output must be a boolean array @@ -124,7 +151,7 @@ class CompareFunction final : public FilterFunction { array, bitmap_result); } - Status Filter(const ArrayData& lhs, const ArrayData& rhs, ArrayData* output) const { + Status Compare(const ArrayData& lhs, const ArrayData& rhs, ArrayData* output) const { // Caller must cast DCHECK(lhs.type->Equals(rhs.type)); // Output must be a boolean array @@ -146,13 +173,13 @@ class CompareFunction final : public FilterFunction { }; template -static inline std::shared_ptr MakeCompareFunctionTypeOp( +static inline std::shared_ptr MakeCompareFunctionTypeOp( FunctionContext* ctx) { - return std::make_shared>(ctx); + return std::make_shared>(ctx); } template -static inline std::shared_ptr MakeCompareFilterFunctionType( +static inline std::shared_ptr MakeCompareFunctionType( FunctionContext* ctx, struct CompareOptions options) { switch (options.op) { case CompareOperator::EQUAL: @@ -172,40 +199,40 @@ static inline std::shared_ptr MakeCompareFilterFunctionType( return nullptr; } -std::shared_ptr MakeCompareFilterFunction(FunctionContext* ctx, - const DataType& type, - struct CompareOptions options) { +std::shared_ptr MakeCompareFunction(FunctionContext* ctx, + const DataType& type, + struct CompareOptions options) { switch (type.id()) { case UInt8Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case Int8Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case UInt16Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case Int16Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case UInt32Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case Int32Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case UInt64Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case Int64Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case FloatType::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case DoubleType::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case Date32Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case Date64Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case TimestampType::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case Time32Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); case Time64Type::type_id: - return MakeCompareFilterFunctionType(ctx, options); + return MakeCompareFunctionType(ctx, options); default: return nullptr; } @@ -219,15 +246,15 @@ Status Compare(FunctionContext* context, const Datum& left, const Datum& right, auto type = left.type(); DCHECK(type->Equals(right.type())); // Requires that both types are equal. - auto fn = MakeCompareFilterFunction(context, *type, options); + auto fn = MakeCompareFunction(context, *type, options); if (fn == nullptr) { return Status::NotImplemented("Compare not implemented for type ", type->ToString()); } - FilterBinaryKernel filter_kernel(fn); + CompareBinaryKernel filter_kernel(fn); detail::PrimitiveAllocatingBinaryKernel kernel(&filter_kernel); - const int64_t length = FilterBinaryKernel::out_length(left, right); + const int64_t length = CompareBinaryKernel::out_length(left, right); out->value = ArrayData::Make(filter_kernel.out_type(), length); return kernel.Call(context, left, right, out); diff --git a/cpp/src/arrow/compute/kernels/compare.h b/cpp/src/arrow/compute/kernels/compare.h index a1924512916..b4c9612ae8c 100644 --- a/cpp/src/arrow/compute/kernels/compare.h +++ b/cpp/src/arrow/compute/kernels/compare.h @@ -19,6 +19,7 @@ #include +#include "arrow/compute/kernel.h" #include "arrow/util/visibility.h" namespace arrow { @@ -31,9 +32,68 @@ class Status; namespace compute { struct Datum; -class FilterFunction; class FunctionContext; +/// CompareFunction is an interface for Comparisons +/// +/// Comparisons take an array and emits a selection vector. The selection vector +/// is given in the form of a bitmask as a BooleanArray result. +class ARROW_EXPORT CompareFunction { + public: + /// Compare an array with a scalar argument. + virtual Status Compare(const ArrayData& array, const Scalar& scalar, + ArrayData* output) const = 0; + + Status Compare(const ArrayData& array, const Scalar& scalar, + std::shared_ptr* output) { + return Compare(array, scalar, output->get()); + } + + virtual Status Compare(const Scalar& scalar, const ArrayData& array, + ArrayData* output) const = 0; + + Status Compare(const Scalar& scalar, const ArrayData& array, + std::shared_ptr* output) { + return Compare(scalar, array, output->get()); + } + + /// Compare an array with an array argument. + virtual Status Compare(const ArrayData& lhs, const ArrayData& rhs, + ArrayData* output) const = 0; + + Status Compare(const ArrayData& lhs, const ArrayData& rhs, + std::shared_ptr* output) { + return Compare(lhs, rhs, output->get()); + } + + /// By default, CompareFunction emits a result bitmap. + virtual std::shared_ptr out_type() const { return boolean(); } + + virtual ~CompareFunction() {} +}; + +/// \brief BinaryKernel bound to a select function +class ARROW_EXPORT CompareBinaryKernel : public BinaryKernel { + public: + explicit CompareBinaryKernel(std::shared_ptr& select) + : compare_function_(select) {} + + Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, + Datum* out) override; + + static int64_t out_length(const Datum& left, const Datum& right) { + if (left.kind() == Datum::ARRAY) return left.length(); + if (right.kind() == Datum::ARRAY) return right.length(); + + return 0; + } + + std::shared_ptr out_type() const override; + + private: + std::shared_ptr compare_function_; +}; + enum CompareOperator { EQUAL, NOT_EQUAL, @@ -82,7 +142,7 @@ struct CompareOptions { enum CompareOperator op; }; -/// \brief Return a Compare FilterFunction +/// \brief Return a Compare CompareFunction /// /// \param[in] context FunctionContext passing context information /// \param[in] type required to specialize the kernel @@ -91,9 +151,9 @@ struct CompareOptions { /// \since 0.14.0 /// \note API not yet finalized ARROW_EXPORT -std::shared_ptr MakeCompareFilterFunction(FunctionContext* context, - const DataType& type, - struct CompareOptions options); +std::shared_ptr MakeCompareFunction(FunctionContext* context, + const DataType& type, + struct CompareOptions options); /// \brief Compare a numeric array with a scalar. /// diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc deleted file mode 100644 index 1cbf0dc0992..00000000000 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/compute/kernels/filter.h" - -#include "arrow/array.h" -#include "arrow/compute/kernel.h" -#include "arrow/util/logging.h" - -namespace arrow { - -namespace compute { - -std::shared_ptr FilterBinaryKernel::out_type() const { - return filter_function_->out_type(); -} - -Status FilterBinaryKernel::Call(FunctionContext* ctx, const Datum& left, - const Datum& right, Datum* out) { - DCHECK(left.type()->Equals(right.type())); - - auto lk = left.kind(); - auto rk = right.kind(); - auto out_array = out->array(); - - if (lk == Datum::ARRAY && rk == Datum::SCALAR) { - auto array = left.array(); - auto scalar = right.scalar(); - return filter_function_->Filter(*array, *scalar, &out_array); - } else if (lk == Datum::SCALAR && rk == Datum::ARRAY) { - auto scalar = left.scalar(); - auto array = right.array(); - auto out_array = out->array(); - return filter_function_->Filter(*scalar, *array, &out_array); - } else if (lk == Datum::ARRAY && rk == Datum::ARRAY) { - auto lhs = left.array(); - auto rhs = right.array(); - return filter_function_->Filter(*lhs, *rhs, &out_array); - } - - return Status::Invalid("Invalid datum signature for FilterBinaryKernel"); -} - -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/filter.h b/cpp/src/arrow/compute/kernels/filter.h deleted file mode 100644 index 3b28bc9391a..00000000000 --- a/cpp/src/arrow/compute/kernels/filter.h +++ /dev/null @@ -1,96 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/compute/kernel.h" - -namespace arrow { - -class Array; -struct Scalar; -class Status; - -namespace compute { - -class FunctionContext; -struct Datum; - -/// FilterFunction is an interface for Filters -/// -/// Filters takes an array and emits a selection vector. The selection vector -/// is given in the form of a bitmask as a BooleanArray result. -class ARROW_EXPORT FilterFunction { - public: - /// Filter an array with a scalar argument. - virtual Status Filter(const ArrayData& array, const Scalar& scalar, - ArrayData* output) const = 0; - - Status Filter(const ArrayData& array, const Scalar& scalar, - std::shared_ptr* output) { - return Filter(array, scalar, output->get()); - } - - virtual Status Filter(const Scalar& scalar, const ArrayData& array, - ArrayData* output) const = 0; - - Status Filter(const Scalar& scalar, const ArrayData& array, - std::shared_ptr* output) { - return Filter(scalar, array, output->get()); - } - - /// Filter an array with an array argument. - virtual Status Filter(const ArrayData& lhs, const ArrayData& rhs, - ArrayData* output) const = 0; - - Status Filter(const ArrayData& lhs, const ArrayData& rhs, - std::shared_ptr* output) { - return Filter(lhs, rhs, output->get()); - } - - /// By default, FilterFunction emits a result bitmap. - virtual std::shared_ptr out_type() const { return boolean(); } - - virtual ~FilterFunction() {} -}; - -/// \brief BinaryKernel bound to a filter function -class ARROW_EXPORT FilterBinaryKernel : public BinaryKernel { - public: - explicit FilterBinaryKernel(std::shared_ptr& filter) - : filter_function_(filter) {} - - Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, - Datum* out) override; - - static int64_t out_length(const Datum& left, const Datum& right) { - if (left.kind() == Datum::ARRAY) return left.length(); - if (right.kind() == Datum::ARRAY) return right.length(); - - return 0; - } - - std::shared_ptr out_type() const override; - - private: - std::shared_ptr filter_function_; -}; - -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/mask.h b/cpp/src/arrow/compute/kernels/mask.h index 1bd7ef0bb95..caca67168c0 100644 --- a/cpp/src/arrow/compute/kernels/mask.h +++ b/cpp/src/arrow/compute/kernels/mask.h @@ -56,15 +56,14 @@ Status Mask(FunctionContext* context, const Array& values, const Array& mask, /// \param[in] mask indicates which values should be masked out /// \param[out] out resulting datum ARROW_EXPORT -Status Mask(FunctionContext* context, const Datum& values, const Datum& indices, - Datum* out); +Status Mask(FunctionContext* context, const Datum& values, const Datum& mask, Datum* out); /// \brief BinaryKernel implementing Mask operation class ARROW_EXPORT MaskKernel : public BinaryKernel { public: explicit MaskKernel(const std::shared_ptr& type) : type_(type) {} - Status Call(FunctionContext* ctx, const Datum& values, const Datum& indices, + Status Call(FunctionContext* ctx, const Datum& values, const Datum& mask, Datum* out) override; std::shared_ptr out_type() const override { return type_; } From 0f29ab27d97be37d5a578fdf9b4145ac8fdfed18 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 10 Jun 2019 12:36:58 -0400 Subject: [PATCH 10/29] rename Mask -> Filter --- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/compute/kernels/CMakeLists.txt | 2 +- cpp/src/arrow/compute/kernels/filter-test.cc | 140 ++++++++++++++++++ .../compute/kernels/{mask.cc => filter.cc} | 124 ++++++++-------- .../compute/kernels/{mask.h => filter.h} | 23 +-- cpp/src/arrow/compute/kernels/mask-test.cc | 139 ----------------- 6 files changed, 216 insertions(+), 214 deletions(-) create mode 100644 cpp/src/arrow/compute/kernels/filter-test.cc rename cpp/src/arrow/compute/kernels/{mask.cc => filter.cc} (58%) rename cpp/src/arrow/compute/kernels/{mask.h => filter.h} (73%) delete mode 100644 cpp/src/arrow/compute/kernels/mask-test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 56a1b135c0c..1666d178fac 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -173,7 +173,7 @@ if(ARROW_COMPUTE) compute/kernels/compare.cc compute/kernels/count.cc compute/kernels/hash.cc - compute/kernels/mask.cc + compute/kernels/filter.cc compute/kernels/mean.cc compute/kernels/sum.cc compute/kernels/take.cc diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 248f990fa02..82b94491307 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -20,7 +20,7 @@ arrow_install_all_headers("arrow/compute/kernels") add_arrow_test(boolean-test PREFIX "arrow-compute") add_arrow_test(cast-test PREFIX "arrow-compute") add_arrow_test(hash-test PREFIX "arrow-compute") -add_arrow_test(mask-test PREFIX "arrow-compute") +add_arrow_test(filter-test PREFIX "arrow-compute") add_arrow_test(take-test PREFIX "arrow-compute") add_arrow_test(util-internal-test PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc new file mode 100644 index 00000000000..bd664630325 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// returnGegarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/compute/context.h" +#include "arrow/compute/kernels/filter.h" +#include "arrow/compute/test-util.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/testing/util.h" + +namespace arrow { +namespace compute { + +using util::string_view; + +template +class TestFilterKernel : public ComputeFixture, public TestBase { + protected: + void AssertFilterArrays(const std::shared_ptr& values, + const std::shared_ptr& filter, + const std::shared_ptr& expected) { + std::shared_ptr actual; + ASSERT_OK(arrow::compute::Filter(&this->ctx_, *values, *filter, &actual)); + AssertArraysEqual(*expected, *actual); + } + void AssertFilter(const std::shared_ptr& type, const std::string& values, + const std::string& filter, const std::string& expected) { + std::shared_ptr actual; + ASSERT_OK(this->Filter(type, values, filter, &actual)); + AssertArraysEqual(*ArrayFromJSON(type, expected), *actual); + } + Status Filter(const std::shared_ptr& type, const std::string& values, + const std::string& filter, std::shared_ptr* out) { + return arrow::compute::Filter(&this->ctx_, *ArrayFromJSON(type, values), + *ArrayFromJSON(boolean(), filter), out); + } +}; + +class TestFilterKernelWithNull : public TestFilterKernel { + protected: + void AssertFilter(const std::string& values, const std::string& filter, + const std::string& expected) { + TestFilterKernel::AssertFilter(utf8(), values, filter, expected); + } +}; + +TEST_F(TestFilterKernelWithNull, FilterNull) { + this->AssertFilter("[null, null, null]", "[0, 1, 0]", "[null]"); + this->AssertFilter("[null, null, null]", "[1, 1, 0]", "[null, null]"); +} + +class TestFilterKernelWithBoolean : public TestFilterKernel { + protected: + void AssertFilter(const std::string& values, const std::string& filter, + const std::string& expected) { + TestFilterKernel::AssertFilter(boolean(), values, filter, expected); + } +}; + +TEST_F(TestFilterKernelWithBoolean, FilterBoolean) { + this->AssertFilter("[true, false, true]", "[0, 1, 0]", "[false]"); + this->AssertFilter("[null, false, true]", "[0, 1, 0]", "[false]"); + this->AssertFilter("[true, false, true]", "[null, 1, 0]", "[null, false]"); +} + +template +class TestFilterKernelWithNumeric : public TestFilterKernel { + protected: + void AssertFilter(const std::string& values, const std::string& filter, + const std::string& expected) { + TestFilterKernel::AssertFilter(type_singleton(), values, filter, expected); + } + std::shared_ptr type_singleton() { + return TypeTraits::type_singleton(); + } +}; + +TYPED_TEST_CASE(TestFilterKernelWithNumeric, NumericArrowTypes); +TYPED_TEST(TestFilterKernelWithNumeric, FilterNumeric) { + this->AssertFilter("[7, 8, 9]", "[0, 1, 0]", "[8]"); + this->AssertFilter("[null, 8, 9]", "[0, 1, 0]", "[8]"); + this->AssertFilter("[7, 8, 9]", "[null, 1, 0]", "[null, 8]"); + this->AssertFilter("[]", "[]", "[]"); +} + +class TestFilterKernelWithString : public TestFilterKernel { + protected: + void AssertFilter(const std::string& values, const std::string& filter, + const std::string& expected) { + TestFilterKernel::AssertFilter(utf8(), values, filter, expected); + } + void AssertFilterDictionary(const std::string& dictionary_values, + const std::string& dictionary_filter, + const std::string& filter, + const std::string& expected_filter) { + auto dict = ArrayFromJSON(utf8(), dictionary_values); + auto type = dictionary(int8(), utf8()); + std::shared_ptr values, actual, expected; + ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_filter), + dict, &values)); + ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_filter), + dict, &expected)); + auto take_filter = ArrayFromJSON(boolean(), filter); + this->AssertFilterArrays(values, take_filter, expected); + } +}; + +TEST_F(TestFilterKernelWithString, FilterString) { + this->AssertFilter(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["b"])"); + this->AssertFilter(R"([null, "b", "c"])", "[0, 1, 0]", R"(["b"])"); + this->AssertFilter(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])"); +} + +TEST_F(TestFilterKernelWithString, FilterDictionary) { + auto dict = R"(["a", "b", "c", "d", "e"])"; + this->AssertFilterDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[4]"); + this->AssertFilterDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[4]"); + this->AssertFilterDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4]"); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/mask.cc b/cpp/src/arrow/compute/kernels/filter.cc similarity index 58% rename from cpp/src/arrow/compute/kernels/mask.cc rename to cpp/src/arrow/compute/kernels/filter.cc index 7041ec86b66..235539bae8b 100644 --- a/cpp/src/arrow/compute/kernels/mask.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -21,7 +21,7 @@ #include "arrow/builder.h" #include "arrow/compute/context.h" -#include "arrow/compute/kernels/mask.h" +#include "arrow/compute/kernels/filter.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -32,24 +32,24 @@ namespace compute { using internal::checked_cast; -Status Mask(FunctionContext* context, const Array& values, const Array& mask, - std::shared_ptr* out) { +Status Filter(FunctionContext* context, const Array& values, const Array& filter, + std::shared_ptr* out) { Datum out_datum; - RETURN_NOT_OK(Mask(context, Datum(values.data()), Datum(mask.data()), &out_datum)); + RETURN_NOT_OK(Filter(context, Datum(values.data()), Datum(filter.data()), &out_datum)); *out = out_datum.make_array(); return Status::OK(); } -Status Mask(FunctionContext* context, const Datum& values, const Datum& mask, - Datum* out) { - MaskKernel kernel(values.type()); - RETURN_NOT_OK(kernel.Call(context, values, mask, out)); +Status Filter(FunctionContext* context, const Datum& values, const Datum& filter, + Datum* out) { + FilterKernel kernel(values.type()); + RETURN_NOT_OK(kernel.Call(context, values, filter, out)); return Status::OK(); } -struct MaskParameters { +struct FilterParameters { FunctionContext* context; - std::shared_ptr values, mask; + std::shared_ptr values, filter; std::shared_ptr* out; }; @@ -72,33 +72,33 @@ static Status UnsafeAppend(StringBuilder* builder, util::string_view value) { } // TODO(bkietz) this can be optimized -static int64_t OutputSize(const BooleanArray& mask) { - auto offset = mask.offset(); - auto length = mask.length(); - internal::BitmapReader mask_data(mask.data()->buffers[1]->data(), offset, length); +static int64_t OutputSize(const BooleanArray& filter) { + auto offset = filter.offset(); + auto length = filter.length(); + internal::BitmapReader filter_data(filter.data()->buffers[1]->data(), offset, length); int64_t size = 0; for (auto i = offset; i < offset + length; ++i) { - if (mask.IsNull(i) || mask_data.IsSet()) { + if (filter.IsNull(i) || filter_data.IsSet()) { ++size; } - mask_data.Next(); + filter_data.Next(); } return size; } -template -Status MaskImpl(FunctionContext*, const ValueArray& values, const BooleanArray& mask, - OutBuilder* builder) { - auto offset = mask.offset(); - auto length = mask.length(); - internal::BitmapReader mask_data(mask.data()->buffers[1]->data(), offset, length); - for (int64_t i = 0; i < mask.length(); mask_data.Next(), ++i) { - if (!WholeMaskValid && mask.IsNull(i)) { +Status FilterImpl(FunctionContext*, const ValueArray& values, const BooleanArray& filter, + OutBuilder* builder) { + auto offset = filter.offset(); + auto length = filter.length(); + internal::BitmapReader filter_data(filter.data()->buffers[1]->data(), offset, length); + for (int64_t i = 0; i < filter.length(); filter_data.Next(), ++i) { + if (!WholeFilterValid && filter.IsNull(i)) { builder->UnsafeAppendNull(); continue; } - if (mask_data.IsNotSet()) { + if (filter_data.IsNotSet()) { continue; } if (!AllValuesValid && values.IsNull(i)) { @@ -110,63 +110,63 @@ Status MaskImpl(FunctionContext*, const ValueArray& values, const BooleanArray& return Status::OK(); } -template -Status UnpackMaskNullCount(FunctionContext* context, const ValueArray& values, - const MaskArray& mask, OutBuilder* builder) { - if (mask.null_count() == 0) { - return MaskImpl(context, values, mask, builder); +Status UnpackFilterNullCount(FunctionContext* context, const ValueArray& values, + const FilterArray& filter, OutBuilder* builder) { + if (filter.null_count() == 0) { + return FilterImpl(context, values, filter, builder); } - return MaskImpl(context, values, mask, builder); + return FilterImpl(context, values, filter, builder); } -template +template Status UnpackValuesNullCount(FunctionContext* context, const ValueArray& values, - const MaskArray& mask, OutBuilder* builder) { + const FilterArray& filter, OutBuilder* builder) { if (values.null_count() == 0) { - return UnpackMaskNullCount(context, values, mask, builder); + return UnpackFilterNullCount(context, values, filter, builder); } - return UnpackMaskNullCount(context, values, mask, builder); + return UnpackFilterNullCount(context, values, filter, builder); } template using ArrayType = typename TypeTraits::ArrayType; -template +template struct UnpackValues { template Status Visit(const ValueType&) { using OutBuilder = typename TypeTraits::BuilderType; - const auto& mask = checked_cast&>(*params_.mask); + const auto& filter = checked_cast&>(*params_.filter); const auto& values = checked_cast&>(*params_.values); std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(params_.context->memory_pool(), values.type(), &builder)); - RETURN_NOT_OK(builder->Reserve(OutputSize(mask))); - RETURN_NOT_OK(UnpackValuesNullCount(params_.context, values, mask, + RETURN_NOT_OK(builder->Reserve(OutputSize(filter))); + RETURN_NOT_OK(UnpackValuesNullCount(params_.context, values, filter, checked_cast(builder.get()))); return builder->Finish(params_.out); } Status Visit(const NullType& t) { - const auto& mask = checked_cast&>(*params_.mask); - params_.out->reset(new NullArray(OutputSize(mask))); + const auto& filter = checked_cast&>(*params_.filter); + params_.out->reset(new NullArray(OutputSize(filter))); return Status::OK(); } Status Visit(const DictionaryType& t) { - std::shared_ptr masked_indices; + std::shared_ptr filtered_indices; const auto& values = internal::checked_cast(*params_.values); { // To take from a dictionary, apply the current kernel to the dictionary's - // indices. (Use UnpackValues since MaskType is already unpacked) - MaskParameters params = params_; + // indices. (Use UnpackValues since FilterType is already unpacked) + FilterParameters params = params_; params.values = values.indices(); - params.out = &masked_indices; - UnpackValues unpack = {params}; + params.out = &filtered_indices; + UnpackValues unpack = {params}; RETURN_NOT_OK(VisitTypeInline(*t.index_type(), &unpack)); } - // create output dictionary from taken mask - *params_.out = std::make_shared(values.type(), masked_indices, + // create output dictionary from taken filter + *params_.out = std::make_shared(values.type(), filtered_indices, values.dictionary()); return Status::OK(); } @@ -192,38 +192,38 @@ struct UnpackValues { return Status::NotImplemented("gathering values of type ", t); } - const MaskParameters& params_; + const FilterParameters& params_; }; -struct UnpackMask { +struct UnpackFilter { Status Visit(const BooleanType&) { UnpackValues unpack = {params_}; return VisitTypeInline(*params_.values->type(), &unpack); } Status Visit(const DataType& other) { - return Status::TypeError("mask type not supported: ", other); + return Status::TypeError("filter type not supported: ", other); } - const MaskParameters& params_; + const FilterParameters& params_; }; -Status MaskKernel::Call(FunctionContext* ctx, const Datum& values, const Datum& mask, - Datum* out) { - if (!values.is_array() || !mask.is_array()) { - return Status::Invalid("MaskKernel expects array values and mask"); +Status FilterKernel::Call(FunctionContext* ctx, const Datum& values, const Datum& filter, + Datum* out) { + if (!values.is_array() || !filter.is_array()) { + return Status::Invalid("FilterKernel expects array values and filter"); } std::shared_ptr out_array; - MaskParameters params; + FilterParameters params; params.context = ctx; params.values = values.make_array(); - params.mask = mask.make_array(); - if (params.values->length() != params.mask->length()) { - return Status::Invalid("Mask is not the same size as values"); + params.filter = filter.make_array(); + if (params.values->length() != params.filter->length()) { + return Status::Invalid("Filter is not the same size as values"); } params.out = &out_array; - UnpackMask unpack = {params}; - RETURN_NOT_OK(VisitTypeInline(*mask.type(), &unpack)); + UnpackFilter unpack = {params}; + RETURN_NOT_OK(VisitTypeInline(*filter.type(), &unpack)); *out = Datum(out_array); return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/mask.h b/cpp/src/arrow/compute/kernels/filter.h similarity index 73% rename from cpp/src/arrow/compute/kernels/mask.h rename to cpp/src/arrow/compute/kernels/filter.h index caca67168c0..2abd38bfb79 100644 --- a/cpp/src/arrow/compute/kernels/mask.h +++ b/cpp/src/arrow/compute/kernels/filter.h @@ -31,7 +31,7 @@ namespace compute { class FunctionContext; -/// \brief Mask an array with a boolean selection filter +/// \brief Filter an array with a boolean selection filter /// /// The output array will be populated with values from the input at positions /// where the selection filter is not 0. Nulls in the filter will result in nulls @@ -43,27 +43,28 @@ class FunctionContext; /// /// \param[in] context the FunctionContext /// \param[in] values array from which to take -/// \param[in] mask indicates which values should be masked out +/// \param[in] filter indicates which values should be filtered out /// \param[out] out resulting array ARROW_EXPORT -Status Mask(FunctionContext* context, const Array& values, const Array& mask, - std::shared_ptr* out); +Status Filter(FunctionContext* context, const Array& values, const Array& filter, + std::shared_ptr* out); -/// \brief Mask an array with a boolean selection filter +/// \brief Filter an array with a boolean selection filter /// /// \param[in] context the FunctionContext /// \param[in] values datum from which to take -/// \param[in] mask indicates which values should be masked out +/// \param[in] filter indicates which values should be filtered out /// \param[out] out resulting datum ARROW_EXPORT -Status Mask(FunctionContext* context, const Datum& values, const Datum& mask, Datum* out); +Status Filter(FunctionContext* context, const Datum& values, const Datum& filter, + Datum* out); -/// \brief BinaryKernel implementing Mask operation -class ARROW_EXPORT MaskKernel : public BinaryKernel { +/// \brief BinaryKernel implementing Filter operation +class ARROW_EXPORT FilterKernel : public BinaryKernel { public: - explicit MaskKernel(const std::shared_ptr& type) : type_(type) {} + explicit FilterKernel(const std::shared_ptr& type) : type_(type) {} - Status Call(FunctionContext* ctx, const Datum& values, const Datum& mask, + Status Call(FunctionContext* ctx, const Datum& values, const Datum& filter, Datum* out) override; std::shared_ptr out_type() const override { return type_; } diff --git a/cpp/src/arrow/compute/kernels/mask-test.cc b/cpp/src/arrow/compute/kernels/mask-test.cc deleted file mode 100644 index 5672407694e..00000000000 --- a/cpp/src/arrow/compute/kernels/mask-test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// returnGegarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include - -#include "arrow/compute/context.h" -#include "arrow/compute/kernels/mask.h" -#include "arrow/compute/test-util.h" -#include "arrow/testing/gtest_common.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/testing/random.h" -#include "arrow/testing/util.h" - -namespace arrow { -namespace compute { - -using util::string_view; - -template -class TestMaskKernel : public ComputeFixture, public TestBase { - protected: - void AssertMaskArrays(const std::shared_ptr& values, - const std::shared_ptr& mask, - const std::shared_ptr& expected) { - std::shared_ptr actual; - ASSERT_OK(arrow::compute::Mask(&this->ctx_, *values, *mask, &actual)); - AssertArraysEqual(*expected, *actual); - } - void AssertMask(const std::shared_ptr& type, const std::string& values, - const std::string& mask, const std::string& expected) { - std::shared_ptr actual; - ASSERT_OK(this->Mask(type, values, mask, &actual)); - AssertArraysEqual(*ArrayFromJSON(type, expected), *actual); - } - Status Mask(const std::shared_ptr& type, const std::string& values, - const std::string& mask, std::shared_ptr* out) { - return arrow::compute::Mask(&this->ctx_, *ArrayFromJSON(type, values), - *ArrayFromJSON(boolean(), mask), out); - } -}; - -class TestMaskKernelWithNull : public TestMaskKernel { - protected: - void AssertMask(const std::string& values, const std::string& mask, - const std::string& expected) { - TestMaskKernel::AssertMask(utf8(), values, mask, expected); - } -}; - -TEST_F(TestMaskKernelWithNull, MaskNull) { - this->AssertMask("[null, null, null]", "[0, 1, 0]", "[null]"); - this->AssertMask("[null, null, null]", "[1, 1, 0]", "[null, null]"); -} - -class TestMaskKernelWithBoolean : public TestMaskKernel { - protected: - void AssertMask(const std::string& values, const std::string& mask, - const std::string& expected) { - TestMaskKernel::AssertMask(boolean(), values, mask, expected); - } -}; - -TEST_F(TestMaskKernelWithBoolean, MaskBoolean) { - this->AssertMask("[true, false, true]", "[0, 1, 0]", "[false]"); - this->AssertMask("[null, false, true]", "[0, 1, 0]", "[false]"); - this->AssertMask("[true, false, true]", "[null, 1, 0]", "[null, false]"); -} - -template -class TestMaskKernelWithNumeric : public TestMaskKernel { - protected: - void AssertMask(const std::string& values, const std::string& mask, - const std::string& expected) { - TestMaskKernel::AssertMask(type_singleton(), values, mask, expected); - } - std::shared_ptr type_singleton() { - return TypeTraits::type_singleton(); - } -}; - -TYPED_TEST_CASE(TestMaskKernelWithNumeric, NumericArrowTypes); -TYPED_TEST(TestMaskKernelWithNumeric, MaskNumeric) { - this->AssertMask("[7, 8, 9]", "[0, 1, 0]", "[8]"); - this->AssertMask("[null, 8, 9]", "[0, 1, 0]", "[8]"); - this->AssertMask("[7, 8, 9]", "[null, 1, 0]", "[null, 8]"); - this->AssertMask("[]", "[]", "[]"); -} - -class TestMaskKernelWithString : public TestMaskKernel { - protected: - void AssertMask(const std::string& values, const std::string& mask, - const std::string& expected) { - TestMaskKernel::AssertMask(utf8(), values, mask, expected); - } - void AssertMaskDictionary(const std::string& dictionary_values, - const std::string& dictionary_mask, const std::string& mask, - const std::string& expected_mask) { - auto dict = ArrayFromJSON(utf8(), dictionary_values); - auto type = dictionary(int8(), utf8()); - std::shared_ptr values, actual, expected; - ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_mask), - dict, &values)); - ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_mask), - dict, &expected)); - auto take_mask = ArrayFromJSON(boolean(), mask); - this->AssertMaskArrays(values, take_mask, expected); - } -}; - -TEST_F(TestMaskKernelWithString, MaskString) { - this->AssertMask(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["b"])"); - this->AssertMask(R"([null, "b", "c"])", "[0, 1, 0]", R"(["b"])"); - this->AssertMask(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])"); -} - -TEST_F(TestMaskKernelWithString, MaskDictionary) { - auto dict = R"(["a", "b", "c", "d", "e"])"; - this->AssertMaskDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[4]"); - this->AssertMaskDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[4]"); - this->AssertMaskDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4]"); -} - -} // namespace compute -} // namespace arrow From 6efc4f55beafba241e76238919e4468d7d5619ff Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 10 Jun 2019 14:40:42 -0400 Subject: [PATCH 11/29] add filter tests with large, random arrays --- cpp/src/arrow/compute/kernels/filter-test.cc | 50 +++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index bd664630325..c3e41e1f9b6 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -29,6 +29,7 @@ namespace arrow { namespace compute { +using internal::checked_pointer_cast; using util::string_view; template @@ -52,6 +53,29 @@ class TestFilterKernel : public ComputeFixture, public TestBase { return arrow::compute::Filter(&this->ctx_, *ArrayFromJSON(type, values), *ArrayFromJSON(boolean(), filter), out); } + void ValidateFilter(const std::shared_ptr& values, + const std::shared_ptr& filter_boxed) { + std::shared_ptr filtered; + ASSERT_OK(arrow::compute::Filter(&this->ctx_, *values, *filter_boxed, &filtered)); + + auto filter = checked_pointer_cast(filter_boxed); + int64_t values_i = 0, filtered_i = 0; + for (; values_i < values->length(); ++values_i, ++filtered_i) { + if (filter->IsNull(values_i)) { + ASSERT_LT(filtered_i, filtered->length()); + ASSERT_TRUE(filtered->IsNull(filtered_i)); + continue; + } + if (!filter->Value(values_i)) { + // this element was filtered out; don't examine filtered + --filtered_i; + continue; + } + ASSERT_LT(filtered_i, filtered->length()); + ASSERT_TRUE(values->RangeEquals(values_i, values_i + 1, filtered_i, filtered)); + } + ASSERT_EQ(filtered_i, filtered->length()); + } }; class TestFilterKernelWithNull : public TestFilterKernel { @@ -95,10 +119,34 @@ class TestFilterKernelWithNumeric : public TestFilterKernel { TYPED_TEST_CASE(TestFilterKernelWithNumeric, NumericArrowTypes); TYPED_TEST(TestFilterKernelWithNumeric, FilterNumeric) { + this->AssertFilter("[]", "[]", "[]"); + + this->AssertFilter("[9]", "[0]", "[]"); + this->AssertFilter("[9]", "[1]", "[9]"); + this->AssertFilter("[9]", "[null]", "[null]"); + this->AssertFilter("[null]", "[0]", "[]"); + this->AssertFilter("[null]", "[1]", "[null]"); + this->AssertFilter("[null]", "[null]", "[null]"); + this->AssertFilter("[7, 8, 9]", "[0, 1, 0]", "[8]"); + this->AssertFilter("[7, 8, 9]", "[1, 0, 1]", "[7, 9]"); this->AssertFilter("[null, 8, 9]", "[0, 1, 0]", "[8]"); this->AssertFilter("[7, 8, 9]", "[null, 1, 0]", "[null, 8]"); - this->AssertFilter("[]", "[]", "[]"); + this->AssertFilter("[7, 8, 9]", "[1, null, 1]", "[7, null, 9]"); +} + +TYPED_TEST(TestFilterKernelWithNumeric, FilterRandomNumeric) { + auto rand = random::RandomArrayGenerator(0x5416447); + for (size_t i = 3; i < 13; i++) { + const int64_t length = static_cast(1ULL << i); + for (auto null_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) { + for (auto filter_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) { + auto values = rand.Numeric(length, 0, 127, null_probability); + auto filter = rand.Boolean(length, filter_probability, null_probability); + this->ValidateFilter(values, filter); + } + } + } } class TestFilterKernelWithString : public TestFilterKernel { From 7c50027152b9b918c90f1c81ef0793a01d4ec086 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 11 Jun 2019 10:12:51 -0400 Subject: [PATCH 12/29] add a test integrating with arrow::compute::Compare --- cpp/src/arrow/compute/kernels/filter-test.cc | 43 ++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index c3e41e1f9b6..fc3319cb395 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -19,6 +19,7 @@ #include #include "arrow/compute/context.h" +#include "arrow/compute/kernels/compare.h" #include "arrow/compute/kernels/filter.h" #include "arrow/compute/test-util.h" #include "arrow/testing/gtest_common.h" @@ -149,6 +150,48 @@ TYPED_TEST(TestFilterKernelWithNumeric, FilterRandomNumeric) { } } +template +std::vector CompareAndFilter(const CType* data, int64_t length, CType val, + CompareOperator op) { + using cmp_t = bool(const CType&, const CType&); + static cmp_t* cmp[] = { + Comparator::Compare, Comparator::Compare, + Comparator::Compare, Comparator::Compare, + Comparator::Compare, Comparator::Compare, + }; + + std::vector filtered; + filtered.reserve(length); + std::copy_if(data, data + length, std::back_inserter(filtered), + [op, val](CType e) { return cmp[op](e, val); }); + return filtered; +} + +TYPED_TEST(TestFilterKernelWithNumeric, CompareAndFilterRandomNumeric) { + using ScalarType = typename TypeTraits::ScalarType; + using ArrayType = typename TypeTraits::ArrayType; + using CType = typename TypeTraits::CType; + + auto rand = random::RandomArrayGenerator(0x5416447); + for (size_t i = 3; i < 13; i++) { + const int64_t length = static_cast(1ULL << i); + for (auto null_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) { + auto array = checked_pointer_cast( + rand.Numeric(length, 0, 100, null_probability)); + CType c_fifty = 50; + auto fifty = std::make_shared(c_fifty); + for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) { + auto options = CompareOptions(op); + Datum selection, filtered; + ASSERT_OK(Compare(&this->ctx_, Datum(array), Datum(fifty), options, &selection)); + ASSERT_OK(Filter(&this->ctx_, Datum(array), selection, &filtered)); + auto expected = + CompareAndFilter(array->raw_values(), array->length(), c_fifty, op); + } + } + } +} + class TestFilterKernelWithString : public TestFilterKernel { protected: void AssertFilter(const std::string& values, const std::string& filter, From 8a9f3793453d5b8da67de8f28d1bd55b617118ce Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 11 Jun 2019 10:38:01 -0400 Subject: [PATCH 13/29] add a test integrating with arrow::compute::Compare (array-array) --- cpp/src/arrow/compute/kernels/filter-test.cc | 50 ++++++++++++++++++-- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index fc3319cb395..5c5de56b330 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -151,23 +151,40 @@ TYPED_TEST(TestFilterKernelWithNumeric, FilterRandomNumeric) { } template -std::vector CompareAndFilter(const CType* data, int64_t length, CType val, - CompareOperator op) { - using cmp_t = bool(const CType&, const CType&); +decltype(Comparator::Compare)* GetComparator(CompareOperator op) { + using cmp_t = decltype(Comparator::Compare); static cmp_t* cmp[] = { Comparator::Compare, Comparator::Compare, Comparator::Compare, Comparator::Compare, Comparator::Compare, Comparator::Compare, }; + return cmp[op]; +} + +template +std::vector CompareAndFilter(const CType* data, int64_t length, CType val, + CompareOperator op) { + std::vector filtered; + filtered.reserve(length); + auto cmp = GetComparator(op); + std::copy_if(data, data + length, std::back_inserter(filtered), + [cmp, val](CType e) { return cmp(e, val); }); + return filtered; +} +template +std::vector CompareAndFilter(const CType* data, int64_t length, const CType* other, + CompareOperator op) { std::vector filtered; filtered.reserve(length); + auto cmp = GetComparator(op); + int64_t i = 0; std::copy_if(data, data + length, std::back_inserter(filtered), - [op, val](CType e) { return cmp[op](e, val); }); + [cmp, other, &i](CType e) { return cmp(e, other[i++]); }); return filtered; } -TYPED_TEST(TestFilterKernelWithNumeric, CompareAndFilterRandomNumeric) { +TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) { using ScalarType = typename TypeTraits::ScalarType; using ArrayType = typename TypeTraits::ArrayType; using CType = typename TypeTraits::CType; @@ -192,6 +209,29 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareAndFilterRandomNumeric) { } } +TYPED_TEST(TestFilterKernelWithNumeric, CompareArrayAndFilterRandomNumeric) { + using ArrayType = typename TypeTraits::ArrayType; + + auto rand = random::RandomArrayGenerator(0x5416447); + for (size_t i = 3; i < 13; i++) { + const int64_t length = static_cast(1ULL << i); + for (auto null_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) { + auto lhs = checked_pointer_cast( + rand.Numeric(length, 0, 100, null_probability)); + auto rhs = checked_pointer_cast( + rand.Numeric(length, 0, 100, null_probability)); + for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) { + auto options = CompareOptions(op); + Datum selection, filtered; + ASSERT_OK(Compare(&this->ctx_, Datum(lhs), Datum(rhs), options, &selection)); + ASSERT_OK(Filter(&this->ctx_, Datum(lhs), selection, &filtered)); + auto expected = + CompareAndFilter(lhs->raw_values(), lhs->length(), rhs->raw_values(), op); + } + } + } +} + class TestFilterKernelWithString : public TestFilterKernel { protected: void AssertFilter(const std::string& values, const std::string& filter, From ccd32a532a6eab065b50fd03b1a1eaf169ec82ae Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 11 Jun 2019 11:00:14 -0400 Subject: [PATCH 14/29] add a basic filter benchmark --- cpp/src/arrow/compute/benchmark-util.h | 23 +++++++++ cpp/src/arrow/compute/kernels/CMakeLists.txt | 7 ++- .../arrow/compute/kernels/filter-benchmark.cc | 51 +++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 cpp/src/arrow/compute/kernels/filter-benchmark.cc diff --git a/cpp/src/arrow/compute/benchmark-util.h b/cpp/src/arrow/compute/benchmark-util.h index ee9cb9504a3..f8b30fbbe42 100644 --- a/cpp/src/arrow/compute/benchmark-util.h +++ b/cpp/src/arrow/compute/benchmark-util.h @@ -70,5 +70,28 @@ void RegressionSetArgs(benchmark::internal::Benchmark* bench) { BenchmarkSetArgsWithSizes(bench, {kL1Size}); } +// RAII struct to handle some of the boilerplate in regression benchmarks +struct RegressionArgs { + // size of memory tested (per iteration) in bytes + const int64_t size; + + // proportion of nulls in generated arrays + const double null_proportion; + + RegressionArgs(benchmark::State& state) + : size(state.range(0) / 4), + null_proportion(static_cast(state.range(1)) / 100.0), + state_(state) {} + + ~RegressionArgs() { + state_.counters["size"] = static_cast(size); + state_.counters["null_percent"] = static_cast(state_.range(1)); + state_.SetBytesProcessed(state_.iterations() * size); + } + + private: + benchmark::State& state_; +}; + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 82b94491307..1bbb5bc5f32 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -20,8 +20,6 @@ arrow_install_all_headers("arrow/compute/kernels") add_arrow_test(boolean-test PREFIX "arrow-compute") add_arrow_test(cast-test PREFIX "arrow-compute") add_arrow_test(hash-test PREFIX "arrow-compute") -add_arrow_test(filter-test PREFIX "arrow-compute") -add_arrow_test(take-test PREFIX "arrow-compute") add_arrow_test(util-internal-test PREFIX "arrow-compute") # Aggregates @@ -31,3 +29,8 @@ add_arrow_benchmark(aggregate-benchmark PREFIX "arrow-compute") # Comparison add_arrow_test(compare-test PREFIX "arrow-compute") add_arrow_benchmark(compare-benchmark PREFIX "arrow-compute") + +# Selection +add_arrow_test(take-test PREFIX "arrow-compute") +add_arrow_test(filter-test PREFIX "arrow-compute") +add_arrow_benchmark(filter-benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/filter-benchmark.cc b/cpp/src/arrow/compute/kernels/filter-benchmark.cc new file mode 100644 index 00000000000..8f61d7bc5fd --- /dev/null +++ b/cpp/src/arrow/compute/kernels/filter-benchmark.cc @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include "arrow/compute/kernels/filter.h" + +#include "arrow/compute/benchmark-util.h" +#include "arrow/compute/test-util.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" + +namespace arrow { +namespace compute { + +static void Filter(benchmark::State& state) { + RegressionArgs args(state); + + const int64_t array_size = args.size / sizeof(int64_t); + auto rand = random::RandomArrayGenerator(0x94378165); + auto array = std::static_pointer_cast>( + rand.Int64(array_size, -100, 100, args.null_proportion)); + auto filter = std::static_pointer_cast( + rand.Boolean(array_size, 0.75, args.null_proportion)); + + FunctionContext ctx; + for (auto _ : state) { + Datum out; + ABORT_NOT_OK(Filter(&ctx, Datum(array), Datum(filter), &out)); + benchmark::DoNotOptimize(out); + } +} + +BENCHMARK(Filter)->Apply(RegressionSetArgs); + +} // namespace compute +} // namespace arrow From a21638817b93888aa332207a577352c4b172ad1f Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 11 Jun 2019 11:04:24 -0400 Subject: [PATCH 15/29] add explicit qualification for MSVC --- cpp/src/arrow/compute/kernels/filter-test.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index 5c5de56b330..b550f6421a0 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -200,8 +200,10 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) { for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) { auto options = CompareOptions(op); Datum selection, filtered; - ASSERT_OK(Compare(&this->ctx_, Datum(array), Datum(fifty), options, &selection)); - ASSERT_OK(Filter(&this->ctx_, Datum(array), selection, &filtered)); + ASSERT_OK(arrow::compute::Compare(&this->ctx_, Datum(array), Datum(fifty), + options, &selection)); + ASSERT_OK( + arrow::compute::Filter(&this->ctx_, Datum(array), selection, &filtered)); auto expected = CompareAndFilter(array->raw_values(), array->length(), c_fifty, op); } @@ -223,8 +225,9 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareArrayAndFilterRandomNumeric) { for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) { auto options = CompareOptions(op); Datum selection, filtered; - ASSERT_OK(Compare(&this->ctx_, Datum(lhs), Datum(rhs), options, &selection)); - ASSERT_OK(Filter(&this->ctx_, Datum(lhs), selection, &filtered)); + ASSERT_OK(arrow::compute::Compare(&this->ctx_, Datum(lhs), Datum(rhs), options, + &selection)); + ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(lhs), selection, &filtered)); auto expected = CompareAndFilter(lhs->raw_values(), lhs->length(), rhs->raw_values(), op); } From e3b402281f9d944b740ea89d83b5380f69ab289b Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 13 Jun 2019 08:28:20 -0400 Subject: [PATCH 16/29] add filter impls for nested types --- cpp/src/arrow/array.cc | 169 +++++++ cpp/src/arrow/array.h | 18 +- cpp/src/arrow/compute/kernels/filter-test.cc | 155 +++++-- cpp/src/arrow/compute/kernels/filter.cc | 452 ++++++++++++++----- cpp/src/arrow/compute/kernels/filter.h | 13 - cpp/src/arrow/extension_type.h | 4 + 6 files changed, 643 insertions(+), 168 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 9d37b45914b..b41d6dda681 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -1369,6 +1369,175 @@ std::shared_ptr MakeArray(const std::shared_ptr& data) { namespace internal { +// get the maximum buffer length required, then allocate a single zeroed buffer +// to use anywhere a buffer is required +class NullArrayFactory { + public: + struct GetBufferLength { + GetBufferLength(const std::shared_ptr& type, int64_t length) + : type_(*type), length_(length), buffer_length_(BitUtil::BytesForBits(length)) {} + + operator int64_t() && { + DCHECK_OK(VisitTypeInline(type_, this)); + return buffer_length_; + } + + template ::bytes_required(0))> + Status Visit(const T&) { + return MaxOf(TypeTraits::bytes_required(length_)); + } + + Status Visit(const ListType& type) { + // list's values array may be empty, but there must be at least one offset of 0 + return MaxOf(sizeof(int32_t)); + } + + Status Visit(const FixedSizeListType& type) { + return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_)); + } + + Status Visit(const StructType& type) { + for (const auto& child : type.children()) { + DCHECK_OK(MaxOf(GetBufferLength(child->type(), length_))); + } + return Status::OK(); + } + + Status Visit(const UnionType& type) { + // type codes + DCHECK_OK(MaxOf(length_)); + if (type.mode() == UnionMode::DENSE) { + // offsets + DCHECK_OK(MaxOf(sizeof(int32_t) * length_)); + } + for (const auto& child : type.children()) { + DCHECK_OK(MaxOf(GetBufferLength(child->type(), length_))); + } + return Status::OK(); + } + + Status Visit(const DictionaryType& type) { + DCHECK_OK(MaxOf(GetBufferLength(type.value_type(), length_))); + return MaxOf(GetBufferLength(type.index_type(), length_)); + } + + Status Visit(const ExtensionType& type) { + // XXX is an extension array's length always == storage length + return MaxOf(GetBufferLength(type.storage_type(), length_)); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("construction of all-null ", type); + } + + private: + Status MaxOf(int64_t buffer_length) { + if (buffer_length > buffer_length_) { + buffer_length_ = buffer_length; + } + return Status::OK(); + } + + const DataType& type_; + int64_t length_, buffer_length_; + }; + + NullArrayFactory(const std::shared_ptr& type, int64_t length, + std::shared_ptr* out) + : type_(type), length_(length), out_(out) {} + + Status CreateBuffer() { + int64_t buffer_length = GetBufferLength(type_, length_); + RETURN_NOT_OK(AllocateBuffer(buffer_length, &buffer_)); + std::memset(buffer_->mutable_data(), 0, buffer_->size()); + return Status::OK(); + } + + Status Create() { + if (buffer_ == nullptr) { + RETURN_NOT_OK(CreateBuffer()); + } + std::vector> child_data(type_->num_children()); + *out_ = ArrayData::Make(type_, length_, {buffer_}, child_data, length_, 0); + return VisitTypeInline(*type_, this); + } + + Status Visit(const NullType&) { return Status::OK(); } + + Status Visit(const FixedWidthType&) { + (*out_)->buffers.resize(2, buffer_); + return Status::OK(); + } + + Status Visit(const BinaryType&) { + (*out_)->buffers.resize(3, buffer_); + return Status::OK(); + } + + Status Visit(const ListType& type) { + (*out_)->buffers.resize(2, buffer_); + return CreateChild(0, length_, &(*out_)->child_data[0]); + } + + Status Visit(const FixedSizeListType& type) { + return CreateChild(0, length_ * type.list_size(), &(*out_)->child_data[0]); + } + + Status Visit(const StructType& type) { + for (int i = 0; i < type_->num_children(); ++i) { + DCHECK_OK(CreateChild(i, length_, &(*out_)->child_data[i])); + } + return Status::OK(); + } + + Status Visit(const UnionType& type) { + if (type.mode() == UnionMode::DENSE) { + (*out_)->buffers.resize(3, buffer_); + } else { + (*out_)->buffers.resize(2, buffer_); + } + + for (int i = 0; i < type_->num_children(); ++i) { + DCHECK_OK(CreateChild(i, length_, &(*out_)->child_data[i])); + } + return Status::OK(); + } + + Status Visit(const DictionaryType& type) { + (*out_)->buffers.resize(2, buffer_); + std::shared_ptr dictionary_data; + return MakeArrayOfNull(type.value_type(), 0, &(*out_)->dictionary); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("construction of all-null ", type); + } + + Status CreateChild(int i, int64_t length, std::shared_ptr* out) { + NullArrayFactory child_factory(type_->child(i)->type(), length, + &(*out_)->child_data[i]); + child_factory.buffer_ = buffer_; + return child_factory.Create(); + } + + std::shared_ptr type_; + int64_t length_; + std::shared_ptr* out_; + std::shared_ptr buffer_; +}; + +} // namespace internal + +Status MakeArrayOfNull(const std::shared_ptr& type, int64_t length, + std::shared_ptr* out) { + std::shared_ptr out_data; + RETURN_NOT_OK(internal::NullArrayFactory(type, length, &out_data).Create()); + *out = MakeArray(out_data); + return Status::OK(); +} + +namespace internal { + std::vector RechunkArraysConsistently( const std::vector& groups) { if (groups.size() <= 1) { diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index a6554227302..78542c6b5be 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -220,6 +220,14 @@ struct ARROW_EXPORT ArrayData { ARROW_EXPORT std::shared_ptr MakeArray(const std::shared_ptr& data); +/// \brief Create a strongly-typed Array instance with all elements null +/// \param[in] type the array type +/// \param[in] length the array length +/// \param[out] out resulting Array instance +ARROW_EXPORT +Status MakeArrayOfNull(const std::shared_ptr& type, int64_t length, + std::shared_ptr* out); + // ---------------------------------------------------------------------- // User array accessor types @@ -521,12 +529,15 @@ class ARROW_EXPORT ListArray : public Array { /// Return pointer to raw value offsets accounting for any slice offset const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - // Neither of these functions will perform boundschecking + // The following functions will not perform boundschecking int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } int32_t value_length(int64_t i) const { i += data_->offset; return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } + std::shared_ptr value_slice(int64_t i) const { + return values_->Slice(value_offset(i), value_length(i)); + } protected: // This constructor defers SetData to a derived array class @@ -596,12 +607,15 @@ class ARROW_EXPORT FixedSizeListArray : public Array { std::shared_ptr value_type() const; - // Neither of these functions will perform boundschecking + // The following functions will not perform boundschecking int32_t value_offset(int64_t i) const { i += data_->offset; return static_cast(list_size_ * i); } int32_t value_length(int64_t i = 0) const { return list_size_; } + std::shared_ptr value_slice(int64_t i) const { + return values_->Slice(value_offset(i), value_length(i)); + } protected: void SetData(const std::shared_ptr& data); diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index b550f6421a0..0607a708856 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -19,6 +19,7 @@ #include #include "arrow/compute/context.h" +#include "arrow/compute/kernels/boolean.h" #include "arrow/compute/kernels/compare.h" #include "arrow/compute/kernels/filter.h" #include "arrow/compute/test-util.h" @@ -161,27 +162,28 @@ decltype(Comparator::Compare)* GetComparator(CompareOperator op) { return cmp[op]; } -template -std::vector CompareAndFilter(const CType* data, int64_t length, CType val, - CompareOperator op) { +template ::CType> +std::shared_ptr CompareAndFilter(const CType* data, int64_t length, Fn&& fn) { std::vector filtered; filtered.reserve(length); + std::copy_if(data, data + length, std::back_inserter(filtered), std::forward(fn)); + std::shared_ptr filtered_array; + ArrayFromVector(filtered, &filtered_array); + return filtered_array; +} + +template ::CType> +std::shared_ptr CompareAndFilter(const CType* data, int64_t length, CType val, + CompareOperator op) { auto cmp = GetComparator(op); - std::copy_if(data, data + length, std::back_inserter(filtered), - [cmp, val](CType e) { return cmp(e, val); }); - return filtered; + return CompareAndFilter(data, length, [&](CType e) { return cmp(e, val); }); } -template -std::vector CompareAndFilter(const CType* data, int64_t length, const CType* other, - CompareOperator op) { - std::vector filtered; - filtered.reserve(length); +template ::CType> +std::shared_ptr CompareAndFilter(const CType* data, int64_t length, + const CType* other, CompareOperator op) { auto cmp = GetComparator(op); - int64_t i = 0; - std::copy_if(data, data + length, std::back_inserter(filtered), - [cmp, other, &i](CType e) { return cmp(e, other[i++]); }); - return filtered; + return CompareAndFilter(data, length, [&](CType e) { return cmp(e, *other++); }); } TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) { @@ -192,21 +194,21 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) { auto rand = random::RandomArrayGenerator(0x5416447); for (size_t i = 3; i < 13; i++) { const int64_t length = static_cast(1ULL << i); - for (auto null_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) { - auto array = checked_pointer_cast( - rand.Numeric(length, 0, 100, null_probability)); - CType c_fifty = 50; - auto fifty = std::make_shared(c_fifty); - for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) { - auto options = CompareOptions(op); - Datum selection, filtered; - ASSERT_OK(arrow::compute::Compare(&this->ctx_, Datum(array), Datum(fifty), - options, &selection)); - ASSERT_OK( - arrow::compute::Filter(&this->ctx_, Datum(array), selection, &filtered)); - auto expected = - CompareAndFilter(array->raw_values(), array->length(), c_fifty, op); - } + // TODO(bkietz) rewrite with some nulls + auto array = + checked_pointer_cast(rand.Numeric(length, 0, 100, 0)); + CType c_fifty = 50; + auto fifty = std::make_shared(c_fifty); + for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) { + auto options = CompareOptions(op); + Datum selection, filtered; + ASSERT_OK(arrow::compute::Compare(&this->ctx_, Datum(array), Datum(fifty), options, + &selection)); + ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(array), selection, &filtered)); + auto filtered_array = filtered.make_array(); + auto expected = + CompareAndFilter(array->raw_values(), array->length(), c_fifty, op); + ASSERT_ARRAYS_EQUAL(*filtered_array, *expected); } } } @@ -217,24 +219,53 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareArrayAndFilterRandomNumeric) { auto rand = random::RandomArrayGenerator(0x5416447); for (size_t i = 3; i < 13; i++) { const int64_t length = static_cast(1ULL << i); - for (auto null_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) { - auto lhs = checked_pointer_cast( - rand.Numeric(length, 0, 100, null_probability)); - auto rhs = checked_pointer_cast( - rand.Numeric(length, 0, 100, null_probability)); - for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) { - auto options = CompareOptions(op); - Datum selection, filtered; - ASSERT_OK(arrow::compute::Compare(&this->ctx_, Datum(lhs), Datum(rhs), options, - &selection)); - ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(lhs), selection, &filtered)); - auto expected = - CompareAndFilter(lhs->raw_values(), lhs->length(), rhs->raw_values(), op); - } + auto lhs = + checked_pointer_cast(rand.Numeric(length, 0, 100, 0)); + auto rhs = + checked_pointer_cast(rand.Numeric(length, 0, 100, 0)); + for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) { + auto options = CompareOptions(op); + Datum selection, filtered; + ASSERT_OK(arrow::compute::Compare(&this->ctx_, Datum(lhs), Datum(rhs), options, + &selection)); + ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(lhs), selection, &filtered)); + auto filtered_array = filtered.make_array(); + auto expected = CompareAndFilter(lhs->raw_values(), lhs->length(), + rhs->raw_values(), op); + ASSERT_ARRAYS_EQUAL(*filtered_array, *expected); } } } +TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) { + using ScalarType = typename TypeTraits::ScalarType; + using ArrayType = typename TypeTraits::ArrayType; + using CType = typename TypeTraits::CType; + + auto rand = random::RandomArrayGenerator(0x5416447); + for (size_t i = 3; i < 13; i++) { + const int64_t length = static_cast(1ULL << i); + auto array = + checked_pointer_cast(rand.Numeric(length, 0, 100, 0)); + CType c_fifty = 50, c_hundred = 100; + auto fifty = std::make_shared(c_fifty); + auto hundred = std::make_shared(c_hundred); + Datum greater_than_fifty, less_than_hundred, selection, filtered; + ASSERT_OK(arrow::compute::Compare(&this->ctx_, Datum(array), Datum(fifty), + CompareOptions(GREATER), &greater_than_fifty)); + ASSERT_OK(arrow::compute::Compare(&this->ctx_, Datum(array), Datum(hundred), + CompareOptions(LESS), &less_than_hundred)); + ASSERT_OK(arrow::compute::And(&this->ctx_, greater_than_fifty, less_than_hundred, + &selection)); + ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(array), selection, &filtered)); + auto filtered_array = filtered.make_array(); + auto expected = CompareAndFilter( + array->raw_values(), array->length(), + [&](CType e) { return (e > c_fifty) && (e < c_hundred); }); + ASSERT_ARRAYS_EQUAL(*filtered_array, *expected); + } +} + class TestFilterKernelWithString : public TestFilterKernel { protected: void AssertFilter(const std::string& values, const std::string& filter, @@ -270,5 +301,41 @@ TEST_F(TestFilterKernelWithString, FilterDictionary) { this->AssertFilterDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4]"); } +class TestFilterKernelWithList : public TestFilterKernel {}; + +TEST_F(TestFilterKernelWithList, FilterListInt32) { + std::string list_json = "[[], [1,2], null, [3]]"; + this->AssertFilter(list(int32()), list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(list(int32()), list_json, "[0, 1, 1, null]", "[[1,2], null, null]"); + this->AssertFilter(list(int32()), list_json, "[1, 1, 1, 1]", list_json); + this->AssertFilter(list(int32()), list_json, "[0, 1, 0, 1]", "[[1,2], [3]]"); +} + +class TestFilterKernelWithFixedSizeList : public TestFilterKernel {}; + +TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { + std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]"; + this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[0, 1, 1, null]", + "[[1, null, 3], [4, 5, 6], null]"); + this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[1, 1, 1, 1]", list_json); + this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[0, 1, 0, 1]", + "[[1, null, 3], [7, 8, null]]"); +} + +class TestFilterKernelWithStruct : public TestFilterKernel {}; + +TEST_F(TestFilterKernelWithStruct, FilterStruct) { + auto struct_type = struct_({field("a", int32()), field("b", utf8())}); + auto struct_json = + R"([null, {"a": 1, "b": ""}, {"a": 2, "b": "hello"}, {"a": 4, "b": "eh"}])"; + this->AssertFilter(struct_type, struct_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(struct_type, struct_json, "[0, 1, 1, null]", + R"([{"a": 1, "b": ""}, {"a": 2, "b": "hello"}, null])"); + this->AssertFilter(struct_type, struct_json, "[1, 1, 1, 1]", struct_json); + this->AssertFilter(struct_type, struct_json, "[1, 0, 1, 0]", + R"([null, {"a": 2, "b": "hello"}])"); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index 235539bae8b..0b369a3a4bd 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -24,33 +24,49 @@ #include "arrow/compute/kernels/filter.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/concatenate.h" #include "arrow/util/logging.h" +#include "arrow/util/stl.h" #include "arrow/visitor_inline.h" namespace arrow { namespace compute { using internal::checked_cast; +using internal::checked_pointer_cast; -Status Filter(FunctionContext* context, const Array& values, const Array& filter, - std::shared_ptr* out) { - Datum out_datum; - RETURN_NOT_OK(Filter(context, Datum(values.data()), Datum(filter.data()), &out_datum)); - *out = out_datum.make_array(); - return Status::OK(); -} +/// \brief BinaryKernel implementing Filter operation +class ARROW_EXPORT FilterKernel : public BinaryKernel { + public: + explicit FilterKernel(const std::shared_ptr& type) : type_(type) {} -Status Filter(FunctionContext* context, const Datum& values, const Datum& filter, - Datum* out) { - FilterKernel kernel(values.type()); - RETURN_NOT_OK(kernel.Call(context, values, filter, out)); + Status Call(FunctionContext* ctx, const Datum& values, const Datum& filter, + Datum* out) override; + + std::shared_ptr out_type() const override { return type_; } + + virtual Status Filter(FunctionContext* ctx, const Array& values, + const BooleanArray& filter, std::shared_ptr* out) = 0; + + static Status Make(const std::shared_ptr& value_type, + std::unique_ptr* out); + + protected: + std::shared_ptr type_; +}; + +template +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::unique_ptr* out) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, type, &builder)); + out->reset(checked_cast(builder.release())); return Status::OK(); } struct FilterParameters { - FunctionContext* context; - std::shared_ptr values, filter; - std::shared_ptr* out; + std::shared_ptr value_type; + std::unique_ptr* out; }; template @@ -86,123 +102,322 @@ static int64_t OutputSize(const BooleanArray& filter) { return size; } -template -Status FilterImpl(FunctionContext*, const ValueArray& values, const BooleanArray& filter, - OutBuilder* builder) { - auto offset = filter.offset(); - auto length = filter.length(); - internal::BitmapReader filter_data(filter.data()->buffers[1]->data(), offset, length); - for (int64_t i = 0; i < filter.length(); filter_data.Next(), ++i) { - if (!WholeFilterValid && filter.IsNull(i)) { - builder->UnsafeAppendNull(); - continue; +template +class FilterImpl; + +template <> +class FilterImpl : public FilterKernel { + public: + using FilterKernel::FilterKernel; + + Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, + std::shared_ptr* out) override { + out->reset(new NullArray(OutputSize(checked_cast(filter)))); + return Status::OK(); + } +}; + +template +class FilterImpl : public FilterKernel { + public: + using ValueArray = typename TypeTraits::ArrayType; + using OutBuilder = typename TypeTraits::BuilderType; + + using FilterKernel::FilterKernel; + + Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, + std::shared_ptr* out) override { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type_, &builder)); + RETURN_NOT_OK(builder->Resize(OutputSize(filter))); + RETURN_NOT_OK(UnpackValuesNullCount(checked_cast(values), filter, + builder.get())); + return builder->Finish(out); + } + + private: + Status UnpackValuesNullCount(const ValueArray& values, const BooleanArray& filter, + OutBuilder* builder) { + if (values.null_count() == 0) { + return UnpackIndicesNullCount(values, filter, builder); } - if (filter_data.IsNotSet()) { - continue; + return UnpackIndicesNullCount(values, filter, builder); + } + + template + Status UnpackIndicesNullCount(const ValueArray& values, const BooleanArray& filter, + OutBuilder* builder) { + if (filter.null_count() == 0) { + return Filter(values, filter, builder); } - if (!AllValuesValid && values.IsNull(i)) { - builder->UnsafeAppendNull(); - continue; + return Filter(values, filter, builder); + } + + template + Status Filter(const ValueArray& values, const BooleanArray& filter, + OutBuilder* builder) { + for (int64_t i = 0; i < filter.length(); ++i) { + if (!AllIndicesValid && filter.IsNull(i)) { + builder->UnsafeAppendNull(); + continue; + } + if (!filter.Value(i)) { + continue; + } + if (!AllValuesValid && values.IsNull(i)) { + builder->UnsafeAppendNull(); + continue; + } + RETURN_NOT_OK(UnsafeAppend(builder, values.GetView(i))); } - RETURN_NOT_OK(UnsafeAppend(builder, values.GetView(i))); + return Status::OK(); } - return Status::OK(); -} +}; + +template <> +class FilterImpl : public FilterKernel { + public: + using FilterKernel::FilterKernel; + + Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, + std::shared_ptr* out) override { + const auto& struct_array = checked_cast(values); + + auto length = OutputSize(filter); + TypedBufferBuilder null_bitmap_builder(ctx->memory_pool()); + RETURN_NOT_OK(null_bitmap_builder.Resize(length)); + + ArrayVector fields(type_->num_children()); + for (int i = 0; i < type_->num_children(); ++i) { + RETURN_NOT_OK( + arrow::compute::Filter(ctx, *struct_array.field(i), filter, &fields[i])); + } + + for (int64_t i = 0; i < filter.length(); ++i) { + if (filter.IsNull(i)) { + null_bitmap_builder.UnsafeAppend(false); + continue; + } + if (!filter.Value(i)) { + continue; + } + if (struct_array.IsNull(i)) { + null_bitmap_builder.UnsafeAppend(false); + continue; + } + null_bitmap_builder.UnsafeAppend(true); + } + + auto null_count = null_bitmap_builder.false_count(); + std::shared_ptr null_bitmap; + RETURN_NOT_OK(null_bitmap_builder.Finish(&null_bitmap)); -template -Status UnpackFilterNullCount(FunctionContext* context, const ValueArray& values, - const FilterArray& filter, OutBuilder* builder) { - if (filter.null_count() == 0) { - return FilterImpl(context, values, filter, builder); + out->reset(new StructArray(type_, length, fields, null_bitmap, null_count)); + return Status::OK(); } - return FilterImpl(context, values, filter, builder); -} +}; + +template <> +class FilterImpl : public FilterKernel { + public: + using FilterKernel::FilterKernel; + + Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, + std::shared_ptr* out) override { + const auto& list_array = checked_cast(values); + + TypedBufferBuilder null_bitmap_builder(ctx->memory_pool()); + auto length = OutputSize(filter); + RETURN_NOT_OK(null_bitmap_builder.Resize(length)); + + std::shared_ptr null_slice; + RETURN_NOT_OK(MakeArrayOfNull(list_array.value_type(), + list_array.list_type()->list_size(), &null_slice)); + ArrayVector value_slices(length, null_slice); -template -Status UnpackValuesNullCount(FunctionContext* context, const ValueArray& values, - const FilterArray& filter, OutBuilder* builder) { - if (values.null_count() == 0) { - return UnpackFilterNullCount(context, values, filter, builder); + for (int64_t filtered_i = 0, i = 0; i < filter.length(); ++i) { + if (filter.IsNull(i)) { + null_bitmap_builder.UnsafeAppend(false); + ++filtered_i; + continue; + } + if (!filter.Value(i)) { + continue; + } + if (values.IsNull(i)) { + null_bitmap_builder.UnsafeAppend(false); + ++filtered_i; + continue; + } + null_bitmap_builder.UnsafeAppend(true); + value_slices[filtered_i] = list_array.value_slice(i); + ++filtered_i; + } + + std::shared_ptr out_values; + if (length != 0) { + RETURN_NOT_OK(Concatenate(value_slices, ctx->memory_pool(), &out_values)); + } else { + out_values = list_array.values()->Slice(0, 0); + } + + auto null_count = null_bitmap_builder.false_count(); + std::shared_ptr null_bitmap; + RETURN_NOT_OK(null_bitmap_builder.Finish(&null_bitmap)); + + out->reset( + new FixedSizeListArray(type_, length, out_values, null_bitmap, null_count)); + return Status::OK(); } - return UnpackFilterNullCount(context, values, filter, builder); -} +}; + +class ListFilterImpl : public FilterKernel { + public: + using FilterKernel::FilterKernel; + + Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, + std::shared_ptr* out) override { + const auto& list_array = checked_cast(values); + + TypedBufferBuilder null_bitmap_builder(ctx->memory_pool()); + auto length = OutputSize(filter); + RETURN_NOT_OK(null_bitmap_builder.Resize(length)); + + TypedBufferBuilder offset_builder(ctx->memory_pool()); + RETURN_NOT_OK(offset_builder.Resize(length + 1)); + int32_t offset = 0; + offset_builder.UnsafeAppend(offset); + + ArrayVector value_slices(length, list_array.values()->Slice(0, 0)); + for (int64_t filtered_i = 0, i = 0; i < filter.length(); ++i) { + if (filter.IsNull(i)) { + null_bitmap_builder.UnsafeAppend(false); + offset_builder.UnsafeAppend(offset); + ++filtered_i; + continue; + } + if (!filter.Value(i)) { + continue; + } + if (values.IsNull(i)) { + null_bitmap_builder.UnsafeAppend(false); + offset_builder.UnsafeAppend(offset); + ++filtered_i; + continue; + } + null_bitmap_builder.UnsafeAppend(true); + value_slices[filtered_i] = list_array.value_slice(i); + ++filtered_i; + offset += list_array.value_length(i); + offset_builder.UnsafeAppend(offset); + } + + std::shared_ptr out_values; + if (length != 0) { + RETURN_NOT_OK(Concatenate(value_slices, ctx->memory_pool(), &out_values)); + } else { + out_values = list_array.values()->Slice(0, 0); + } + + auto null_count = null_bitmap_builder.false_count(); + std::shared_ptr offsets, null_bitmap; + RETURN_NOT_OK(offset_builder.Finish(&offsets)); + RETURN_NOT_OK(null_bitmap_builder.Finish(&null_bitmap)); + + *out = MakeArray(ArrayData::Make(type_, length, {null_bitmap, offsets}, + {out_values->data()}, null_count)); + return Status::OK(); + } +}; + +class DictionaryFilterImpl : public FilterKernel { + public: + DictionaryFilterImpl(const std::shared_ptr& type, + std::unique_ptr impl) + : FilterKernel(type), impl_(std::move(impl)) {} + + Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, + std::shared_ptr* out) override { + auto dict_array = checked_cast(&values); + // To filter a dictionary, apply the current kernel to the dictionary's indices. + std::shared_ptr taken_indices; + RETURN_NOT_OK(impl_->Filter(ctx, *dict_array->indices(), filter, &taken_indices)); + return DictionaryArray::FromArrays(values.type(), taken_indices, + dict_array->dictionary(), out); + } + + private: + std::unique_ptr impl_; +}; + +class ExtensionFilterImpl : public FilterKernel { + public: + ExtensionFilterImpl(const std::shared_ptr& type, + std::unique_ptr impl) + : FilterKernel(type), impl_(std::move(impl)) {} + + Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, + std::shared_ptr* out) override { + auto ext_array = checked_cast(&values); + // To take from an extension array, apply the current kernel to storage. + std::shared_ptr taken_storage; + RETURN_NOT_OK(impl_->Filter(ctx, *ext_array->storage(), filter, &taken_storage)); + *out = ext_array->extension_type()->MakeArray(taken_storage->data()); + return Status::OK(); + } + + private: + std::unique_ptr impl_; +}; template using ArrayType = typename TypeTraits::ArrayType; -template struct UnpackValues { template Status Visit(const ValueType&) { - using OutBuilder = typename TypeTraits::BuilderType; - const auto& filter = checked_cast&>(*params_.filter); - const auto& values = checked_cast&>(*params_.values); - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(params_.context->memory_pool(), values.type(), &builder)); - RETURN_NOT_OK(builder->Reserve(OutputSize(filter))); - RETURN_NOT_OK(UnpackValuesNullCount(params_.context, values, filter, - checked_cast(builder.get()))); - return builder->Finish(params_.out); - } - - Status Visit(const NullType& t) { - const auto& filter = checked_cast&>(*params_.filter); - params_.out->reset(new NullArray(OutputSize(filter))); - return Status::OK(); + return Make>(); } + Status Visit(const NullType&) { return Make>(); } + Status Visit(const DictionaryType& t) { - std::shared_ptr filtered_indices; - const auto& values = internal::checked_cast(*params_.values); - { - // To take from a dictionary, apply the current kernel to the dictionary's - // indices. (Use UnpackValues since FilterType is already unpacked) - FilterParameters params = params_; - params.values = values.indices(); - params.out = &filtered_indices; - UnpackValues unpack = {params}; - RETURN_NOT_OK(VisitTypeInline(*t.index_type(), &unpack)); - } - // create output dictionary from taken filter - *params_.out = std::make_shared(values.type(), filtered_indices, - values.dictionary()); - return Status::OK(); + std::unique_ptr indices_filter_impl; + FilterParameters params = params_; + params.value_type = t.index_type(); + params.out = &indices_filter_impl; + UnpackValues unpack = {params}; + RETURN_NOT_OK(VisitTypeInline(*t.index_type(), &unpack)); + return Make(std::move(indices_filter_impl)); } Status Visit(const ExtensionType& t) { - // XXX can we just take from its storage? - return Status::NotImplemented("gathering values of type ", t); + std::unique_ptr storage_filter_impl; + FilterParameters params = params_; + params.value_type = t.storage_type(); + params.out = &storage_filter_impl; + UnpackValues unpack = {params}; + RETURN_NOT_OK(VisitTypeInline(*t.storage_type(), &unpack)); + return Make(std::move(storage_filter_impl)); } Status Visit(const UnionType& t) { return Status::NotImplemented("gathering values of type ", t); } - Status Visit(const ListType& t) { - return Status::NotImplemented("gathering values of type ", t); - } + Status Visit(const ListType& t) { return Make(); } Status Visit(const FixedSizeListType& t) { - return Status::NotImplemented("gathering values of type ", t); - } - - Status Visit(const StructType& t) { - return Status::NotImplemented("gathering values of type ", t); + return Make>(); } - const FilterParameters& params_; -}; - -struct UnpackFilter { - Status Visit(const BooleanType&) { - UnpackValues unpack = {params_}; - return VisitTypeInline(*params_.values->type(), &unpack); - } + Status Visit(const StructType& t) { return Make>(); } - Status Visit(const DataType& other) { - return Status::TypeError("filter type not supported: ", other); + template + Status Make(Extra&&... extra) { + *params_.out = + internal::make_unique(params_.value_type, std::forward(extra)...); + return Status::OK(); } const FilterParameters& params_; @@ -214,17 +429,36 @@ Status FilterKernel::Call(FunctionContext* ctx, const Datum& values, const Datum return Status::Invalid("FilterKernel expects array values and filter"); } std::shared_ptr out_array; + auto filter_array = checked_pointer_cast(filter.make_array()); + auto values_array = values.make_array(); + RETURN_NOT_OK(this->Filter(ctx, *values_array, *filter_array, &out_array)); + *out = out_array; + return Status::OK(); +} + +Status FilterKernel::Make(const std::shared_ptr& value_type, + std::unique_ptr* out) { FilterParameters params; - params.context = ctx; - params.values = values.make_array(); - params.filter = filter.make_array(); - if (params.values->length() != params.filter->length()) { - return Status::Invalid("Filter is not the same size as values"); - } - params.out = &out_array; - UnpackFilter unpack = {params}; - RETURN_NOT_OK(VisitTypeInline(*filter.type(), &unpack)); - *out = Datum(out_array); + params.value_type = value_type; + params.out = out; + UnpackValues unpack = {params}; + RETURN_NOT_OK(VisitTypeInline(*value_type, &unpack)); + return Status::OK(); +} + +Status Filter(FunctionContext* context, const Array& values, const Array& filter, + std::shared_ptr* out) { + Datum out_datum; + RETURN_NOT_OK(Filter(context, Datum(values.data()), Datum(filter.data()), &out_datum)); + *out = out_datum.make_array(); + return Status::OK(); +} + +Status Filter(FunctionContext* context, const Datum& values, const Datum& filter, + Datum* out) { + std::unique_ptr kernel; + RETURN_NOT_OK(FilterKernel::Make(values.type(), &kernel)); + RETURN_NOT_OK(kernel->Call(context, values, filter, out)); return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/filter.h b/cpp/src/arrow/compute/kernels/filter.h index 2abd38bfb79..6192a277794 100644 --- a/cpp/src/arrow/compute/kernels/filter.h +++ b/cpp/src/arrow/compute/kernels/filter.h @@ -59,18 +59,5 @@ ARROW_EXPORT Status Filter(FunctionContext* context, const Datum& values, const Datum& filter, Datum* out); -/// \brief BinaryKernel implementing Filter operation -class ARROW_EXPORT FilterKernel : public BinaryKernel { - public: - explicit FilterKernel(const std::shared_ptr& type) : type_(type) {} - - Status Call(FunctionContext* ctx, const Datum& values, const Datum& filter, - Datum* out) override; - - std::shared_ptr out_type() const override { return type_; } - - private: - std::shared_ptr type_; -}; } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/extension_type.h b/cpp/src/arrow/extension_type.h index 6a1ca0b7155..8bf4639bd12 100644 --- a/cpp/src/arrow/extension_type.h +++ b/cpp/src/arrow/extension_type.h @@ -93,6 +93,10 @@ class ARROW_EXPORT ExtensionArray : public Array { ExtensionArray(const std::shared_ptr& type, const std::shared_ptr& storage); + const ExtensionType* extension_type() const { + return internal::checked_cast(data_->type.get()); + } + /// \brief The physical storage for the extension array std::shared_ptr storage() const { return storage_; } From a8cb993da5b0fe9c62e4bbf06e41f81a3a7f247a Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 13 Jun 2019 08:30:01 -0400 Subject: [PATCH 17/29] fix lint error --- cpp/src/arrow/compute/benchmark-util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/benchmark-util.h b/cpp/src/arrow/compute/benchmark-util.h index f8b30fbbe42..51e57071c61 100644 --- a/cpp/src/arrow/compute/benchmark-util.h +++ b/cpp/src/arrow/compute/benchmark-util.h @@ -78,7 +78,7 @@ struct RegressionArgs { // proportion of nulls in generated arrays const double null_proportion; - RegressionArgs(benchmark::State& state) + explicit RegressionArgs(benchmark::State& state) : size(state.range(0) / 4), null_proportion(static_cast(state.range(1)) / 100.0), state_(state) {} From 495e5217b6c4d4cda0e354d61f846e2ec18cbdd6 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 13 Jun 2019 09:38:40 -0400 Subject: [PATCH 18/29] Add support for filtering MapArray --- cpp/src/arrow/compute/kernels/filter-test.cc | 40 +++++++++++++++++--- cpp/src/arrow/compute/kernels/filter.cc | 2 + 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index 0607a708856..6ccca0a381d 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -323,18 +323,46 @@ TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { "[[1, null, 3], [7, 8, null]]"); } +class TestFilterKernelWithMap : public TestFilterKernel {}; + +TEST_F(TestFilterKernelWithMap, FilterMapStringToInt32) { + std::string map_json = R"([ + [["joe", 0], ["mark", null]], + null, + [["cap", 8]], + [] + ])"; + this->AssertFilter(map(utf8(), int32()), map_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(map(utf8(), int32()), map_json, "[0, 1, 1, null]", R"([ + null, + [["cap", 8]], + null + ])"); + this->AssertFilter(map(utf8(), int32()), map_json, "[1, 1, 1, 1]", map_json); + this->AssertFilter(map(utf8(), int32()), map_json, "[0, 1, 0, 1]", "[null, []]"); +} + class TestFilterKernelWithStruct : public TestFilterKernel {}; TEST_F(TestFilterKernelWithStruct, FilterStruct) { auto struct_type = struct_({field("a", int32()), field("b", utf8())}); - auto struct_json = - R"([null, {"a": 1, "b": ""}, {"a": 2, "b": "hello"}, {"a": 4, "b": "eh"}])"; + auto struct_json = R"([ + null, + {"a": 1, "b": ""}, + {"a": 2, "b": "hello"}, + {"a": 4, "b": "eh"} + ])"; this->AssertFilter(struct_type, struct_json, "[0, 0, 0, 0]", "[]"); - this->AssertFilter(struct_type, struct_json, "[0, 1, 1, null]", - R"([{"a": 1, "b": ""}, {"a": 2, "b": "hello"}, null])"); + this->AssertFilter(struct_type, struct_json, "[0, 1, 1, null]", R"([ + {"a": 1, "b": ""}, + {"a": 2, "b": "hello"}, + null + ])"); this->AssertFilter(struct_type, struct_json, "[1, 1, 1, 1]", struct_json); - this->AssertFilter(struct_type, struct_json, "[1, 0, 1, 0]", - R"([null, {"a": 2, "b": "hello"}])"); + this->AssertFilter(struct_type, struct_json, "[1, 0, 1, 0]", R"([ + null, + {"a": 2, "b": "hello"} + ])"); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index 0b369a3a4bd..7d5930748ae 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -407,6 +407,8 @@ struct UnpackValues { Status Visit(const ListType& t) { return Make(); } + Status Visit(const MapType& t) { return Make(); } + Status Visit(const FixedSizeListType& t) { return Make>(); } From 3387f21b95276888830ecc294d190ded3132f177 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 13 Jun 2019 11:12:51 -0400 Subject: [PATCH 19/29] use new path for concatenate.h --- cpp/src/arrow/compute/kernels/filter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index 7d5930748ae..577b52dbcdf 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -19,12 +19,12 @@ #include #include +#include "arrow/array/concatenate.h" #include "arrow/builder.h" #include "arrow/compute/context.h" #include "arrow/compute/kernels/filter.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/concatenate.h" #include "arrow/util/logging.h" #include "arrow/util/stl.h" #include "arrow/visitor_inline.h" From f424f34c5cfa1c9f1693591e3c5f669d8368c130 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 14 Jun 2019 10:00:29 -0400 Subject: [PATCH 20/29] fix nits and typos --- cpp/src/arrow/compute/kernels/filter-test.cc | 2 +- cpp/src/arrow/compute/kernels/filter.cc | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index 6ccca0a381d..f4ad016f10f 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -1,7 +1,7 @@ // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information -// returnGegarding copyright ownership. The ASF licenses this file +// regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index 577b52dbcdf..bd5f4a9b44a 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -1,7 +1,7 @@ // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information -// returnGegarding copyright ownership. The ASF licenses this file +// regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at @@ -444,8 +444,7 @@ Status FilterKernel::Make(const std::shared_ptr& value_type, params.value_type = value_type; params.out = out; UnpackValues unpack = {params}; - RETURN_NOT_OK(VisitTypeInline(*value_type, &unpack)); - return Status::OK(); + return VisitTypeInline(*value_type, &unpack); } Status Filter(FunctionContext* context, const Array& values, const Array& filter, @@ -460,8 +459,7 @@ Status Filter(FunctionContext* context, const Datum& values, const Datum& filter Datum* out) { std::unique_ptr kernel; RETURN_NOT_OK(FilterKernel::Make(values.type(), &kernel)); - RETURN_NOT_OK(kernel->Call(context, values, filter, out)); - return Status::OK(); + return kernel->Call(context, values, filter, out); } } // namespace compute From f833e02fc8d9fb73e418c9ff48c38e86747c06a6 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 14 Jun 2019 11:12:29 -0400 Subject: [PATCH 21/29] add benchmark for fixed_size_list(int64(), 1) --- .../arrow/compute/kernels/filter-benchmark.cc | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter-benchmark.cc b/cpp/src/arrow/compute/kernels/filter-benchmark.cc index 8f61d7bc5fd..cd2517dba3e 100644 --- a/cpp/src/arrow/compute/kernels/filter-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/filter-benchmark.cc @@ -27,7 +27,7 @@ namespace arrow { namespace compute { -static void Filter(benchmark::State& state) { +static void FilterInt64(benchmark::State& state) { RegressionArgs args(state); const int64_t array_size = args.size / sizeof(int64_t); @@ -45,7 +45,29 @@ static void Filter(benchmark::State& state) { } } -BENCHMARK(Filter)->Apply(RegressionSetArgs); +static void FilterFixedSizeList1Int64(benchmark::State& state) { + RegressionArgs args(state); + + const int64_t array_size = args.size / sizeof(int64_t); + auto rand = random::RandomArrayGenerator(0x94378165); + auto int_array = std::static_pointer_cast>( + rand.Int64(array_size, -100, 100, args.null_proportion)); + auto array = std::make_shared( + fixed_size_list(int64(), 1), array_size, int_array, int_array->null_bitmap(), + int_array->null_count()); + auto filter = std::static_pointer_cast( + rand.Boolean(array_size, 0.75, args.null_proportion)); + + FunctionContext ctx; + for (auto _ : state) { + Datum out; + ABORT_NOT_OK(Filter(&ctx, Datum(array), Datum(filter), &out)); + benchmark::DoNotOptimize(out); + } +} + +BENCHMARK(FilterInt64)->Apply(RegressionSetArgs); +BENCHMARK(FilterFixedSizeList1Int64)->Apply(RegressionSetArgs); } // namespace compute } // namespace arrow From e4d9d85eb09e0eb8456373de2baa4c7ff969272c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 14 Jun 2019 12:15:46 -0400 Subject: [PATCH 22/29] refactor FilterKernel::Make to use a switch --- cpp/src/arrow/compute/kernels/filter.cc | 129 ++++++++++++------------ 1 file changed, 62 insertions(+), 67 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index bd5f4a9b44a..a9efda8b18e 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -64,11 +64,6 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, return Status::OK(); } -struct FilterParameters { - std::shared_ptr value_type; - std::unique_ptr* out; -}; - template static Status UnsafeAppend(Builder* builder, Scalar&& value) { builder->UnsafeAppend(std::forward(value)); @@ -91,7 +86,7 @@ static Status UnsafeAppend(StringBuilder* builder, util::string_view value) { static int64_t OutputSize(const BooleanArray& filter) { auto offset = filter.offset(); auto length = filter.length(); - internal::BitmapReader filter_data(filter.data()->buffers[1]->data(), offset, length); + internal::BitmapReader filter_data(filter.values()->data(), offset, length); int64_t size = 0; for (auto i = offset; i < offset + length; ++i) { if (filter.IsNull(i) || filter_data.IsSet()) { @@ -271,7 +266,8 @@ class FilterImpl : public FilterKernel { } }; -class ListFilterImpl : public FilterKernel { +template <> +class FilterImpl : public FilterKernel { public: using FilterKernel::FilterKernel; @@ -330,6 +326,11 @@ class ListFilterImpl : public FilterKernel { } }; +template <> +class FilterImpl : public FilterImpl { + using FilterImpl::FilterImpl; +}; + class DictionaryFilterImpl : public FilterKernel { public: DictionaryFilterImpl(const std::shared_ptr& type, @@ -370,60 +371,63 @@ class ExtensionFilterImpl : public FilterKernel { std::unique_ptr impl_; }; -template -using ArrayType = typename TypeTraits::ArrayType; - -struct UnpackValues { - template - Status Visit(const ValueType&) { - return Make>(); - } - - Status Visit(const NullType&) { return Make>(); } - - Status Visit(const DictionaryType& t) { - std::unique_ptr indices_filter_impl; - FilterParameters params = params_; - params.value_type = t.index_type(); - params.out = &indices_filter_impl; - UnpackValues unpack = {params}; - RETURN_NOT_OK(VisitTypeInline(*t.index_type(), &unpack)); - return Make(std::move(indices_filter_impl)); - } - - Status Visit(const ExtensionType& t) { - std::unique_ptr storage_filter_impl; - FilterParameters params = params_; - params.value_type = t.storage_type(); - params.out = &storage_filter_impl; - UnpackValues unpack = {params}; - RETURN_NOT_OK(VisitTypeInline(*t.storage_type(), &unpack)); - return Make(std::move(storage_filter_impl)); - } - - Status Visit(const UnionType& t) { - return Status::NotImplemented("gathering values of type ", t); - } - - Status Visit(const ListType& t) { return Make(); } - - Status Visit(const MapType& t) { return Make(); } - - Status Visit(const FixedSizeListType& t) { - return Make>(); +Status FilterKernel::Make(const std::shared_ptr& value_type, + std::unique_ptr* out) { + switch (value_type->id()) { +#define NO_CHILD_CASE(T) \ + case T##Type::type_id: \ + *out = internal::make_unique>(value_type); \ + return Status::OK() + +#define SINGLE_CHILD_CASE(T, IMPL, CHILD_TYPE) \ + case T##Type::type_id: { \ + auto t = checked_pointer_cast(value_type); \ + std::unique_ptr child_filter_impl; \ + RETURN_NOT_OK(FilterKernel::Make(t->CHILD_TYPE(), &child_filter_impl)); \ + *out = internal::make_unique(t, std::move(child_filter_impl)); \ + return Status::OK(); \ } - Status Visit(const StructType& t) { return Make>(); } - - template - Status Make(Extra&&... extra) { - *params_.out = - internal::make_unique(params_.value_type, std::forward(extra)...); - return Status::OK(); + NO_CHILD_CASE(Null); + NO_CHILD_CASE(Boolean); + NO_CHILD_CASE(Int8); + NO_CHILD_CASE(Int16); + NO_CHILD_CASE(Int32); + NO_CHILD_CASE(Int64); + NO_CHILD_CASE(UInt8); + NO_CHILD_CASE(UInt16); + NO_CHILD_CASE(UInt32); + NO_CHILD_CASE(UInt64); + NO_CHILD_CASE(Date32); + NO_CHILD_CASE(Date64); + NO_CHILD_CASE(Time32); + NO_CHILD_CASE(Time64); + NO_CHILD_CASE(Timestamp); + NO_CHILD_CASE(Duration); + NO_CHILD_CASE(HalfFloat); + NO_CHILD_CASE(Float); + NO_CHILD_CASE(Double); + NO_CHILD_CASE(String); + NO_CHILD_CASE(Binary); + NO_CHILD_CASE(FixedSizeBinary); + NO_CHILD_CASE(Decimal128); + + SINGLE_CHILD_CASE(Dictionary, DictionaryFilterImpl, index_type); + SINGLE_CHILD_CASE(Extension, ExtensionFilterImpl, storage_type); + + NO_CHILD_CASE(List); + NO_CHILD_CASE(FixedSizeList); + NO_CHILD_CASE(Map); + + NO_CHILD_CASE(Struct); + +#undef NO_CHILD_CASE +#undef SINGLE_CHILD_CASE + + default: + return Status::NotImplemented("gathering values of type ", *value_type); } - - const FilterParameters& params_; -}; +} Status FilterKernel::Call(FunctionContext* ctx, const Datum& values, const Datum& filter, Datum* out) { @@ -438,15 +442,6 @@ Status FilterKernel::Call(FunctionContext* ctx, const Datum& values, const Datum return Status::OK(); } -Status FilterKernel::Make(const std::shared_ptr& value_type, - std::unique_ptr* out) { - FilterParameters params; - params.value_type = value_type; - params.out = out; - UnpackValues unpack = {params}; - return VisitTypeInline(*value_type, &unpack); -} - Status Filter(FunctionContext* context, const Array& values, const Array& filter, std::shared_ptr* out) { Datum out_datum; From 24f2e852f1658a5112542fd5079a7b99d3e5f4de Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 14 Jun 2019 12:16:10 -0400 Subject: [PATCH 23/29] add larger benchmarks to test for O(N^2) perf --- cpp/src/arrow/compute/kernels/filter-benchmark.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter-benchmark.cc b/cpp/src/arrow/compute/kernels/filter-benchmark.cc index cd2517dba3e..ad4821822c6 100644 --- a/cpp/src/arrow/compute/kernels/filter-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/filter-benchmark.cc @@ -66,8 +66,11 @@ static void FilterFixedSizeList1Int64(benchmark::State& state) { } } -BENCHMARK(FilterInt64)->Apply(RegressionSetArgs); -BENCHMARK(FilterFixedSizeList1Int64)->Apply(RegressionSetArgs); +BENCHMARK(FilterInt64)->Apply(RegressionSetArgs)->Args({1 << 20, 1})->Args({1 << 23, 1}); +BENCHMARK(FilterFixedSizeList1Int64) + ->Apply(RegressionSetArgs) + ->Args({1 << 20, 1}) + ->Args({1 << 23, 1}); } // namespace compute } // namespace arrow From 060313c47c542d14c4c081667347ef665984c4cc Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 14 Jun 2019 12:28:18 -0400 Subject: [PATCH 24/29] refactor FilterImpl to own child kernels --- cpp/src/arrow/compute/kernels/filter.cc | 50 ++++++++++++++++--------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index a9efda8b18e..6a4e3465116 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -172,7 +172,9 @@ class FilterImpl : public FilterKernel { template <> class FilterImpl : public FilterKernel { public: - using FilterKernel::FilterKernel; + FilterImpl(const std::shared_ptr& type, + std::vector> child_kernels) + : FilterKernel(type), child_kernels_(std::move(child_kernels)) {} Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, std::shared_ptr* out) override { @@ -185,7 +187,7 @@ class FilterImpl : public FilterKernel { ArrayVector fields(type_->num_children()); for (int i = 0; i < type_->num_children(); ++i) { RETURN_NOT_OK( - arrow::compute::Filter(ctx, *struct_array.field(i), filter, &fields[i])); + child_kernels_[i]->Filter(ctx, *struct_array.field(i), filter, &fields[i])); } for (int64_t i = 0; i < filter.length(); ++i) { @@ -210,6 +212,9 @@ class FilterImpl : public FilterKernel { out->reset(new StructArray(type_, length, fields, null_bitmap, null_count)); return Status::OK(); } + + private: + std::vector> child_kernels_; }; template <> @@ -331,10 +336,10 @@ class FilterImpl : public FilterImpl { using FilterImpl::FilterImpl; }; -class DictionaryFilterImpl : public FilterKernel { +template <> +class FilterImpl : public FilterKernel { public: - DictionaryFilterImpl(const std::shared_ptr& type, - std::unique_ptr impl) + FilterImpl(const std::shared_ptr& type, std::unique_ptr impl) : FilterKernel(type), impl_(std::move(impl)) {} Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, @@ -351,10 +356,10 @@ class DictionaryFilterImpl : public FilterKernel { std::unique_ptr impl_; }; -class ExtensionFilterImpl : public FilterKernel { +template <> +class FilterImpl : public FilterKernel { public: - ExtensionFilterImpl(const std::shared_ptr& type, - std::unique_ptr impl) + FilterImpl(const std::shared_ptr& type, std::unique_ptr impl) : FilterKernel(type), impl_(std::move(impl)) {} Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, @@ -379,13 +384,13 @@ Status FilterKernel::Make(const std::shared_ptr& value_type, *out = internal::make_unique>(value_type); \ return Status::OK() -#define SINGLE_CHILD_CASE(T, IMPL, CHILD_TYPE) \ - case T##Type::type_id: { \ - auto t = checked_pointer_cast(value_type); \ - std::unique_ptr child_filter_impl; \ - RETURN_NOT_OK(FilterKernel::Make(t->CHILD_TYPE(), &child_filter_impl)); \ - *out = internal::make_unique(t, std::move(child_filter_impl)); \ - return Status::OK(); \ +#define SINGLE_CHILD_CASE(T, CHILD_TYPE) \ + case T##Type::type_id: { \ + auto t = checked_pointer_cast(value_type); \ + std::unique_ptr child_filter_impl; \ + RETURN_NOT_OK(FilterKernel::Make(t->CHILD_TYPE(), &child_filter_impl)); \ + *out = internal::make_unique>(t, std::move(child_filter_impl)); \ + return Status::OK(); \ } NO_CHILD_CASE(Null); @@ -412,14 +417,23 @@ Status FilterKernel::Make(const std::shared_ptr& value_type, NO_CHILD_CASE(FixedSizeBinary); NO_CHILD_CASE(Decimal128); - SINGLE_CHILD_CASE(Dictionary, DictionaryFilterImpl, index_type); - SINGLE_CHILD_CASE(Extension, ExtensionFilterImpl, storage_type); + SINGLE_CHILD_CASE(Dictionary, index_type); + SINGLE_CHILD_CASE(Extension, storage_type); NO_CHILD_CASE(List); NO_CHILD_CASE(FixedSizeList); NO_CHILD_CASE(Map); - NO_CHILD_CASE(Struct); + case Type::STRUCT: { + std::vector> child_kernels; + for (auto child : value_type->children()) { + child_kernels.emplace_back(); + RETURN_NOT_OK(FilterKernel::Make(child->type(), &child_kernels.back())); + } + *out = internal::make_unique>(value_type, + std::move(child_kernels)); + return Status::OK(); + } #undef NO_CHILD_CASE #undef SINGLE_CHILD_CASE From 7702055356c2f4f2e2ecac3234e540ae62ceb554 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 14 Jun 2019 14:38:06 -0400 Subject: [PATCH 25/29] use expanded bitmap for FixedSizeList and List --- cpp/src/arrow/array/builder_primitive.cc | 7 ++ cpp/src/arrow/array/builder_primitive.h | 2 + cpp/src/arrow/compute/kernels/filter-test.cc | 4 + cpp/src/arrow/compute/kernels/filter.cc | 103 ++++++++++--------- 4 files changed, 68 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc index d4def927600..3c899c068cb 100644 --- a/cpp/src/arrow/array/builder_primitive.cc +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -128,4 +128,11 @@ Status BooleanBuilder::AppendValues(const std::vector& values) { return Status::OK(); } +Status BooleanBuilder::AppendValues(int64_t length, bool value) { + RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value); + ArrayBuilder::UnsafeSetNotNull(length); + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 3d566846d19..8abbe029e13 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -409,6 +409,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendValues(int64_t length, bool value); + Status FinishInternal(std::shared_ptr* out) override; /// \cond FALSE diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index f4ad016f10f..7b349492b1d 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -307,6 +307,8 @@ TEST_F(TestFilterKernelWithList, FilterListInt32) { std::string list_json = "[[], [1,2], null, [3]]"; this->AssertFilter(list(int32()), list_json, "[0, 0, 0, 0]", "[]"); this->AssertFilter(list(int32()), list_json, "[0, 1, 1, null]", "[[1,2], null, null]"); + this->AssertFilter(list(int32()), list_json, "[0, 0, 1, null]", "[null, null]"); + this->AssertFilter(list(int32()), list_json, "[1, 0, 0, 1]", "[[], [3]]"); this->AssertFilter(list(int32()), list_json, "[1, 1, 1, 1]", list_json); this->AssertFilter(list(int32()), list_json, "[0, 1, 0, 1]", "[[1,2], [3]]"); } @@ -318,6 +320,8 @@ TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[0, 0, 0, 0]", "[]"); this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[0, 1, 1, null]", "[[1, null, 3], [4, 5, 6], null]"); + this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[0, 0, 1, null]", + "[[4, 5, 6], null]"); this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[1, 1, 1, 1]", list_json); this->AssertFilter(fixed_size_list(int32(), 3), list_json, "[0, 1, 0, 1]", "[[1, null, 3], [7, 8, null]]"); diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index 6a4e3465116..bc590e727b2 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -19,7 +19,6 @@ #include #include -#include "arrow/array/concatenate.h" #include "arrow/builder.h" #include "arrow/compute/context.h" #include "arrow/compute/kernels/filter.h" @@ -46,7 +45,8 @@ class ARROW_EXPORT FilterKernel : public BinaryKernel { std::shared_ptr out_type() const override { return type_; } virtual Status Filter(FunctionContext* ctx, const Array& values, - const BooleanArray& filter, std::shared_ptr* out) = 0; + const BooleanArray& filter, int64_t length, + std::shared_ptr* out) = 0; static Status Make(const std::shared_ptr& value_type, std::unique_ptr* out); @@ -86,13 +86,11 @@ static Status UnsafeAppend(StringBuilder* builder, util::string_view value) { static int64_t OutputSize(const BooleanArray& filter) { auto offset = filter.offset(); auto length = filter.length(); - internal::BitmapReader filter_data(filter.values()->data(), offset, length); int64_t size = 0; for (auto i = offset; i < offset + length; ++i) { - if (filter.IsNull(i) || filter_data.IsSet()) { + if (filter.IsNull(i) || filter.Value(i)) { ++size; } - filter_data.Next(); } return size; } @@ -106,8 +104,8 @@ class FilterImpl : public FilterKernel { using FilterKernel::FilterKernel; Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, - std::shared_ptr* out) override { - out->reset(new NullArray(OutputSize(checked_cast(filter)))); + int64_t length, std::shared_ptr* out) override { + out->reset(new NullArray(length)); return Status::OK(); } }; @@ -121,7 +119,7 @@ class FilterImpl : public FilterKernel { using FilterKernel::FilterKernel; Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, - std::shared_ptr* out) override { + int64_t length, std::shared_ptr* out) override { std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type_, &builder)); RETURN_NOT_OK(builder->Resize(OutputSize(filter))); @@ -177,17 +175,16 @@ class FilterImpl : public FilterKernel { : FilterKernel(type), child_kernels_(std::move(child_kernels)) {} Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, - std::shared_ptr* out) override { + int64_t length, std::shared_ptr* out) override { const auto& struct_array = checked_cast(values); - auto length = OutputSize(filter); TypedBufferBuilder null_bitmap_builder(ctx->memory_pool()); RETURN_NOT_OK(null_bitmap_builder.Resize(length)); ArrayVector fields(type_->num_children()); for (int i = 0; i < type_->num_children(); ++i) { - RETURN_NOT_OK( - child_kernels_[i]->Filter(ctx, *struct_array.field(i), filter, &fields[i])); + RETURN_NOT_OK(child_kernels_[i]->Filter(ctx, *struct_array.field(i), filter, length, + &fields[i])); } for (int64_t i = 0; i < filter.length(); ++i) { @@ -223,43 +220,48 @@ class FilterImpl : public FilterKernel { using FilterKernel::FilterKernel; Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, - std::shared_ptr* out) override { + int64_t length, std::shared_ptr* out) override { const auto& list_array = checked_cast(values); TypedBufferBuilder null_bitmap_builder(ctx->memory_pool()); - auto length = OutputSize(filter); RETURN_NOT_OK(null_bitmap_builder.Resize(length)); - std::shared_ptr null_slice; - RETURN_NOT_OK(MakeArrayOfNull(list_array.value_type(), - list_array.list_type()->list_size(), &null_slice)); - ArrayVector value_slices(length, null_slice); + BooleanBuilder value_filter_builder(ctx->memory_pool()); + auto list_size = list_array.list_type()->list_size(); + RETURN_NOT_OK(value_filter_builder.Resize(list_size * length)); - for (int64_t filtered_i = 0, i = 0; i < filter.length(); ++i) { + for (int64_t i = 0; i < filter.length(); ++i) { if (filter.IsNull(i)) { null_bitmap_builder.UnsafeAppend(false); - ++filtered_i; + for (int64_t j = 0; j < list_size; ++j) { + value_filter_builder.UnsafeAppendNull(); + } continue; } if (!filter.Value(i)) { + for (int64_t j = 0; j < list_size; ++j) { + value_filter_builder.UnsafeAppend(false); + } continue; } if (values.IsNull(i)) { null_bitmap_builder.UnsafeAppend(false); - ++filtered_i; + for (int64_t j = 0; j < list_size; ++j) { + value_filter_builder.UnsafeAppendNull(); + } continue; } + for (int64_t j = 0; j < list_size; ++j) { + value_filter_builder.UnsafeAppend(true); + } null_bitmap_builder.UnsafeAppend(true); - value_slices[filtered_i] = list_array.value_slice(i); - ++filtered_i; } + std::shared_ptr value_filter; + RETURN_NOT_OK(value_filter_builder.Finish(&value_filter)); std::shared_ptr out_values; - if (length != 0) { - RETURN_NOT_OK(Concatenate(value_slices, ctx->memory_pool(), &out_values)); - } else { - out_values = list_array.values()->Slice(0, 0); - } + RETURN_NOT_OK( + arrow::compute::Filter(ctx, *list_array.values(), *value_filter, &out_values)); auto null_count = null_bitmap_builder.false_count(); std::shared_ptr null_bitmap; @@ -277,48 +279,50 @@ class FilterImpl : public FilterKernel { using FilterKernel::FilterKernel; Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, - std::shared_ptr* out) override { + int64_t length, std::shared_ptr* out) override { const auto& list_array = checked_cast(values); TypedBufferBuilder null_bitmap_builder(ctx->memory_pool()); - auto length = OutputSize(filter); RETURN_NOT_OK(null_bitmap_builder.Resize(length)); + BooleanBuilder value_filter_builder(ctx->memory_pool()); + TypedBufferBuilder offset_builder(ctx->memory_pool()); RETURN_NOT_OK(offset_builder.Resize(length + 1)); int32_t offset = 0; offset_builder.UnsafeAppend(offset); - ArrayVector value_slices(length, list_array.values()->Slice(0, 0)); - for (int64_t filtered_i = 0, i = 0; i < filter.length(); ++i) { + for (int64_t i = 0; i < filter.length(); ++i) { if (filter.IsNull(i)) { null_bitmap_builder.UnsafeAppend(false); offset_builder.UnsafeAppend(offset); - ++filtered_i; + RETURN_NOT_OK( + value_filter_builder.AppendValues(list_array.value_length(i), false)); continue; } if (!filter.Value(i)) { + RETURN_NOT_OK( + value_filter_builder.AppendValues(list_array.value_length(i), false)); continue; } if (values.IsNull(i)) { null_bitmap_builder.UnsafeAppend(false); offset_builder.UnsafeAppend(offset); - ++filtered_i; + RETURN_NOT_OK( + value_filter_builder.AppendValues(list_array.value_length(i), false)); continue; } null_bitmap_builder.UnsafeAppend(true); - value_slices[filtered_i] = list_array.value_slice(i); - ++filtered_i; offset += list_array.value_length(i); offset_builder.UnsafeAppend(offset); + RETURN_NOT_OK(value_filter_builder.AppendValues(list_array.value_length(i), true)); } + std::shared_ptr value_filter; + RETURN_NOT_OK(value_filter_builder.Finish(&value_filter)); std::shared_ptr out_values; - if (length != 0) { - RETURN_NOT_OK(Concatenate(value_slices, ctx->memory_pool(), &out_values)); - } else { - out_values = list_array.values()->Slice(0, 0); - } + RETURN_NOT_OK( + arrow::compute::Filter(ctx, *list_array.values(), *value_filter, &out_values)); auto null_count = null_bitmap_builder.false_count(); std::shared_ptr offsets, null_bitmap; @@ -343,11 +347,12 @@ class FilterImpl : public FilterKernel { : FilterKernel(type), impl_(std::move(impl)) {} Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, - std::shared_ptr* out) override { + int64_t length, std::shared_ptr* out) override { auto dict_array = checked_cast(&values); // To filter a dictionary, apply the current kernel to the dictionary's indices. std::shared_ptr taken_indices; - RETURN_NOT_OK(impl_->Filter(ctx, *dict_array->indices(), filter, &taken_indices)); + RETURN_NOT_OK( + impl_->Filter(ctx, *dict_array->indices(), filter, length, &taken_indices)); return DictionaryArray::FromArrays(values.type(), taken_indices, dict_array->dictionary(), out); } @@ -363,11 +368,12 @@ class FilterImpl : public FilterKernel { : FilterKernel(type), impl_(std::move(impl)) {} Status Filter(FunctionContext* ctx, const Array& values, const BooleanArray& filter, - std::shared_ptr* out) override { + int64_t length, std::shared_ptr* out) override { auto ext_array = checked_cast(&values); // To take from an extension array, apply the current kernel to storage. std::shared_ptr taken_storage; - RETURN_NOT_OK(impl_->Filter(ctx, *ext_array->storage(), filter, &taken_storage)); + RETURN_NOT_OK( + impl_->Filter(ctx, *ext_array->storage(), filter, length, &taken_storage)); *out = ext_array->extension_type()->MakeArray(taken_storage->data()); return Status::OK(); } @@ -448,10 +454,11 @@ Status FilterKernel::Call(FunctionContext* ctx, const Datum& values, const Datum if (!values.is_array() || !filter.is_array()) { return Status::Invalid("FilterKernel expects array values and filter"); } - std::shared_ptr out_array; - auto filter_array = checked_pointer_cast(filter.make_array()); auto values_array = values.make_array(); - RETURN_NOT_OK(this->Filter(ctx, *values_array, *filter_array, &out_array)); + auto filter_array = checked_pointer_cast(filter.make_array()); + const auto length = OutputSize(*filter_array); + std::shared_ptr out_array; + RETURN_NOT_OK(this->Filter(ctx, *values_array, *filter_array, length, &out_array)); *out = out_array; return Status::OK(); } From 030ac57eab58ae690c33b434660e3aa3c1890b80 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 14 Jun 2019 16:52:33 -0400 Subject: [PATCH 26/29] filter benchmarks += MinTime(1.0) nanoseconds --- cpp/src/arrow/compute/kernels/filter-benchmark.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter-benchmark.cc b/cpp/src/arrow/compute/kernels/filter-benchmark.cc index ad4821822c6..f503741b7c0 100644 --- a/cpp/src/arrow/compute/kernels/filter-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/filter-benchmark.cc @@ -66,11 +66,19 @@ static void FilterFixedSizeList1Int64(benchmark::State& state) { } } -BENCHMARK(FilterInt64)->Apply(RegressionSetArgs)->Args({1 << 20, 1})->Args({1 << 23, 1}); +BENCHMARK(FilterInt64) + ->Apply(RegressionSetArgs) + ->Args({1 << 20, 1}) + ->Args({1 << 23, 1}) + ->MinTime(1.0) + ->Unit(benchmark::TimeUnit::kNanosecond); + BENCHMARK(FilterFixedSizeList1Int64) ->Apply(RegressionSetArgs) ->Args({1 << 20, 1}) - ->Args({1 << 23, 1}); + ->Args({1 << 23, 1}) + ->MinTime(1.0) + ->Unit(benchmark::TimeUnit::kNanosecond); } // namespace compute } // namespace arrow From e8465e5d2f29f9d84b3f0c9cba1d9b80ec3f4831 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Sat, 15 Jun 2019 12:56:44 -0400 Subject: [PATCH 27/29] iwyu: vector --- cpp/src/arrow/compute/kernels/filter.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index bc590e727b2..7b5f9890349 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "arrow/builder.h" #include "arrow/compute/context.h" From 3d92b6e12af6e70c543cfb8589696ee19280c631 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 19 Jun 2019 09:34:44 -0400 Subject: [PATCH 28/29] Make FilterKernel public --- cpp/src/arrow/compute/benchmark-util.h | 2 +- .../compute/kernels/compare-benchmark.cc | 10 +++--- .../arrow/compute/kernels/filter-benchmark.cc | 6 ++-- cpp/src/arrow/compute/kernels/filter.cc | 21 ------------ cpp/src/arrow/compute/kernels/filter.h | 34 +++++++++++++++++-- 5 files changed, 43 insertions(+), 30 deletions(-) diff --git a/cpp/src/arrow/compute/benchmark-util.h b/cpp/src/arrow/compute/benchmark-util.h index 51e57071c61..113fdd70312 100644 --- a/cpp/src/arrow/compute/benchmark-util.h +++ b/cpp/src/arrow/compute/benchmark-util.h @@ -79,7 +79,7 @@ struct RegressionArgs { const double null_proportion; explicit RegressionArgs(benchmark::State& state) - : size(state.range(0) / 4), + : size(state.range(0)), null_proportion(static_cast(state.range(1)) / 100.0), state_(state) {} diff --git a/cpp/src/arrow/compute/kernels/compare-benchmark.cc b/cpp/src/arrow/compute/kernels/compare-benchmark.cc index 00de199bfe9..6983fe5cc1b 100644 --- a/cpp/src/arrow/compute/kernels/compare-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/compare-benchmark.cc @@ -29,11 +29,13 @@ namespace arrow { namespace compute { +constexpr auto kSeed = 0x94378165; + static void CompareArrayScalarKernel(benchmark::State& state) { - const int64_t memory_size = state.range(0) / 4; + const int64_t memory_size = state.range(0); const int64_t array_size = memory_size / sizeof(int64_t); const double null_percent = static_cast(state.range(1)) / 100.0; - auto rand = random::RandomArrayGenerator(0x94378165); + auto rand = random::RandomArrayGenerator(kSeed); auto array = std::static_pointer_cast>( rand.Int64(array_size, -100, 100, null_percent)); @@ -52,10 +54,10 @@ static void CompareArrayScalarKernel(benchmark::State& state) { } static void CompareArrayArrayKernel(benchmark::State& state) { - const int64_t memory_size = state.range(0) / 4; + const int64_t memory_size = state.range(0); const int64_t array_size = memory_size / sizeof(int64_t); const double null_percent = static_cast(state.range(1)) / 100.0; - auto rand = random::RandomArrayGenerator(0x94378165); + auto rand = random::RandomArrayGenerator(kSeed); auto lhs = std::static_pointer_cast>( rand.Int64(array_size, -100, 100, null_percent)); auto rhs = std::static_pointer_cast>( diff --git a/cpp/src/arrow/compute/kernels/filter-benchmark.cc b/cpp/src/arrow/compute/kernels/filter-benchmark.cc index f503741b7c0..3eb460adc02 100644 --- a/cpp/src/arrow/compute/kernels/filter-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/filter-benchmark.cc @@ -27,11 +27,13 @@ namespace arrow { namespace compute { +constexpr auto kSeed = 0x0ff1ce; + static void FilterInt64(benchmark::State& state) { RegressionArgs args(state); const int64_t array_size = args.size / sizeof(int64_t); - auto rand = random::RandomArrayGenerator(0x94378165); + auto rand = random::RandomArrayGenerator(kSeed); auto array = std::static_pointer_cast>( rand.Int64(array_size, -100, 100, args.null_proportion)); auto filter = std::static_pointer_cast( @@ -49,7 +51,7 @@ static void FilterFixedSizeList1Int64(benchmark::State& state) { RegressionArgs args(state); const int64_t array_size = args.size / sizeof(int64_t); - auto rand = random::RandomArrayGenerator(0x94378165); + auto rand = random::RandomArrayGenerator(kSeed); auto int_array = std::static_pointer_cast>( rand.Int64(array_size, -100, 100, args.null_proportion)); auto array = std::make_shared( diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index 7b5f9890349..654ec610352 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -35,27 +35,6 @@ namespace compute { using internal::checked_cast; using internal::checked_pointer_cast; -/// \brief BinaryKernel implementing Filter operation -class ARROW_EXPORT FilterKernel : public BinaryKernel { - public: - explicit FilterKernel(const std::shared_ptr& type) : type_(type) {} - - Status Call(FunctionContext* ctx, const Datum& values, const Datum& filter, - Datum* out) override; - - std::shared_ptr out_type() const override { return type_; } - - virtual Status Filter(FunctionContext* ctx, const Array& values, - const BooleanArray& filter, int64_t length, - std::shared_ptr* out) = 0; - - static Status Make(const std::shared_ptr& value_type, - std::unique_ptr* out); - - protected: - std::shared_ptr type_; -}; - template Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out) { diff --git a/cpp/src/arrow/compute/kernels/filter.h b/cpp/src/arrow/compute/kernels/filter.h index 6192a277794..008b938214c 100644 --- a/cpp/src/arrow/compute/kernels/filter.h +++ b/cpp/src/arrow/compute/kernels/filter.h @@ -42,7 +42,7 @@ class FunctionContext; /// = ["b", "c", null, "f"] /// /// \param[in] context the FunctionContext -/// \param[in] values array from which to take +/// \param[in] values array to filter /// \param[in] filter indicates which values should be filtered out /// \param[out] out resulting array ARROW_EXPORT @@ -52,12 +52,42 @@ Status Filter(FunctionContext* context, const Array& values, const Array& filter /// \brief Filter an array with a boolean selection filter /// /// \param[in] context the FunctionContext -/// \param[in] values datum from which to take +/// \param[in] values datum to filter /// \param[in] filter indicates which values should be filtered out /// \param[out] out resulting datum ARROW_EXPORT Status Filter(FunctionContext* context, const Datum& values, const Datum& filter, Datum* out); +/// \brief BinaryKernel implementing Filter operation +class ARROW_EXPORT FilterKernel : public BinaryKernel { + public: + explicit FilterKernel(const std::shared_ptr& type) : type_(type) {} + + /// \brief BinaryKernel interface + /// + /// delegates to subclasses via Filter() + Status Call(FunctionContext* ctx, const Datum& values, const Datum& filter, + Datum* out) override; + + /// \brief output type of this kernel (identical to type of values filtered) + std::shared_ptr out_type() const override { return type_; } + + /// \brief factory for FilterKernels + /// + /// \param[in] value_type constructed FilterKernel will support filtering + /// values of this type + static Status Make(const std::shared_ptr& value_type, + std::unique_ptr* out); + + /// \brief single-array implementation + virtual Status Filter(FunctionContext* ctx, const Array& values, + const BooleanArray& filter, int64_t length, + std::shared_ptr* out) = 0; + + protected: + std::shared_ptr type_; +}; + } // namespace compute } // namespace arrow From 032d341bce46b09981f219984062e5efd3b08160 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 19 Jun 2019 14:10:10 -0400 Subject: [PATCH 29/29] fix doc error --- cpp/src/arrow/compute/kernels/filter.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/kernels/filter.h b/cpp/src/arrow/compute/kernels/filter.h index 008b938214c..46ad3d42b87 100644 --- a/cpp/src/arrow/compute/kernels/filter.h +++ b/cpp/src/arrow/compute/kernels/filter.h @@ -77,6 +77,7 @@ class ARROW_EXPORT FilterKernel : public BinaryKernel { /// /// \param[in] value_type constructed FilterKernel will support filtering /// values of this type + /// \param[out] out created kernel static Status Make(const std::shared_ptr& value_type, std::unique_ptr* out);