From 8295c3fa339f563ce1c12d0936d546404551adad Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 9 Jun 2020 10:52:29 -0400 Subject: [PATCH 01/13] ARROW-971: [C++][Compute] IsValid, IsNull kernels --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/builder_base.cc | 43 +++++ cpp/src/arrow/array/builder_base.h | 2 + cpp/src/arrow/compute/api_scalar.cc | 6 + cpp/src/arrow/compute/api_scalar.h | 24 +++ cpp/src/arrow/compute/exec.cc | 18 ++- cpp/src/arrow/compute/kernel.h | 2 +- cpp/src/arrow/compute/kernels/CMakeLists.txt | 1 + .../arrow/compute/kernels/codegen_internal.h | 12 +- .../compute/kernels/scalar_arithmetic_test.cc | 41 +++++ .../arrow/compute/kernels/scalar_boolean.cc | 7 +- .../arrow/compute/kernels/scalar_validity.cc | 97 +++++++++++ .../compute/kernels/scalar_validity_test.cc | 151 ++++++++++++++++++ cpp/src/arrow/compute/kernels/test_util.cc | 140 ++++++++++++++++ cpp/src/arrow/compute/kernels/test_util.h | 60 +++++++ cpp/src/arrow/compute/registry.cc | 1 + cpp/src/arrow/compute/registry_internal.h | 1 + cpp/src/arrow/datum.h | 3 + cpp/src/arrow/testing/random.cc | 56 +++++++ cpp/src/arrow/testing/random.h | 3 + cpp/src/arrow/util/iterator_test.cc | 3 + .../encryption_read_configurations_test.cc | 2 +- 22 files changed, 662 insertions(+), 12 deletions(-) create mode 100644 cpp/src/arrow/compute/kernels/scalar_validity.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_validity_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 27cdd02440c..715373fad8e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -355,6 +355,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_compare.cc compute/kernels/scalar_set_lookup.cc compute/kernels/scalar_string.cc + compute/kernels/scalar_validity.cc compute/kernels/vector_filter.cc compute/kernels/vector_hash.cc compute/kernels/vector_sort.cc diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 4c21859fae3..4b571d2b5ec 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -21,11 +21,15 @@ #include #include "arrow/array/array_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_primitive.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" namespace arrow { @@ -111,4 +115,43 @@ void ArrayBuilder::UnsafeSetNull(int64_t length) { null_bitmap_builder_.UnsafeAppend(length, false); } +struct ArrayBuilderAppendScalarImpl { + template ::BuilderType> + B& builder() { + return *internal::checked_cast(builder_); + } + + template + auto Visit(const S& scalar) -> decltype(builder().Append(scalar.value)) { + return builder().Append(scalar.value); + } + + template + enable_if_base_binary Visit(const S& scalar) { + return builder().Append(scalar.value->data(), scalar.value->size()); + } + + Status Visit(const Scalar& scalar) { + return Status::NotImplemented("Appending scalars to builders of type ", *scalar.type); + } + + Status Finish() && { return VisitScalarInline(scalar_, this); } + ArrayBuilder* builder_; + const Scalar& scalar_; +}; + +Status ArrayBuilder::Append(const Scalar& scalar) { + auto builder_type = type(); + if (!scalar.type->Equals(builder_type)) { + return Status::TypeError("Cannot append scalars of type ", *scalar.type, + " to a builder of type ", *builder_type); + } + if (!scalar.is_valid) { + return AppendNull(); + } + RETURN_NOT_OK(Reserve(1)); + return ArrayBuilderAppendScalarImpl{this, scalar}.Finish(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 105425539cd..c8b313419d8 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -98,6 +98,8 @@ class ARROW_EXPORT ArrayBuilder { virtual Status AppendNull() = 0; virtual Status AppendNulls(int64_t length) = 0; + Status Append(const Scalar& scalar); + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 9cdce7c1f16..b6c1279fe5e 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -113,5 +113,11 @@ Result Compare(const Datum& left, const Datum& right, CompareOptions opti return CallFunction(func_name, {left, right}, &options, ctx); } +// ---------------------------------------------------------------------- +// Validity functions + +SCALAR_EAGER_UNARY(IsValid, "is_valid") +SCALAR_EAGER_UNARY(IsNull, "is_null") + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index bc502f7bcb9..ffca1af7c49 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -221,6 +221,30 @@ ARROW_EXPORT Result Match(const Datum& values, const Datum& value_set, ExecContext* ctx = NULLPTR); +/// \brief IsValid returns true for each element of `values` that is not null, +/// false otherwise +/// +/// \param[in] values input to examine for validity +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief IsNull returns true for each element of `values` that is null, +/// false otherwise +/// +/// \param[in] values input to examine for nullity +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsNull(const Datum& values, ExecContext* ctx = NULLPTR); + // ---------------------------------------------------------------------- // Temporal functions diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index e6f648a325d..d9caec543fd 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -265,7 +266,7 @@ class NullPropagator { } } else { // Scalar - is_all_null = true; + is_all_null = !value->scalar()->is_valid; } } if (!is_all_null) { @@ -591,9 +592,18 @@ class ScalarExecutor : public FunctionExecutorImpl { Datum out; RETURN_NOT_OK(PrepareNextOutput(batch, &out)); - if (kernel_->null_handling == NullHandling::INTERSECTION && - output_descr_.shape == ValueDescr::ARRAY) { - RETURN_NOT_OK(PropagateNulls(&kernel_ctx_, batch, out.mutable_array())); + if (kernel_->null_handling == NullHandling::INTERSECTION) { + if (output_descr_.shape == ValueDescr::ARRAY) { + RETURN_NOT_OK(PropagateNulls(&kernel_ctx_, batch, out.mutable_array())); + } else { + // set scalar validity + out.scalar()->is_valid = + std::all_of(batch.values.begin(), batch.values.end(), + [](const Datum& input) { return input.scalar()->is_valid; }); + } + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL && + output_descr_.shape == ValueDescr::SCALAR) { + out.scalar()->is_valid = true; } kernel_->exec(&kernel_ctx_, batch, &out); diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 059a7c2001a..c14354da2cf 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -55,7 +55,7 @@ class ARROW_EXPORT KernelContext { explicit KernelContext(ExecContext* exec_ctx) : exec_ctx_(exec_ctx) {} /// \brief Allocate buffer from the context's memory pool. The contents are - /// not uninitialized. + /// not initialized. Result> Allocate(int64_t nbytes); /// \brief Allocate buffer for bitmap from the context's memory pool. Like diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index e3fa987fdaf..9ff0d0973fd 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -26,6 +26,7 @@ add_arrow_compute_test(scalar_test scalar_compare_test.cc scalar_set_lookup_test.cc scalar_string_test.cc + scalar_validity_test.cc test_util.cc) add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 1a9aef90e49..c547c807757 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -282,10 +282,11 @@ namespace codegen { // Operator must implement // // static void Call(KernelContext*, const ArrayData& in, ArrayData* out) +// static void Call(KernelContext*, const Scalar& in, Scalar* out) template void SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { if (batch[0].kind() == Datum::SCALAR) { - ctx->SetStatus(Status::NotImplemented("NYI")); + Operator::Call(ctx, *batch[0].scalar(), out->scalar().get()); } else if (batch.length > 0) { Operator::Call(ctx, *batch[0].array(), out->mutable_array()); } @@ -612,9 +613,12 @@ struct ScalarBinary { } static void ScalarScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - auto arg0 = UnboxScalar::Unbox(batch[0]); - auto arg1 = UnboxScalar::Unbox(batch[1]); - out->value = BoxScalar::Box(Op::template Call(ctx, arg0, arg1), out->type()); + if (out->scalar()->is_valid) { + auto arg0 = UnboxScalar::Unbox(batch[0]); + auto arg1 = UnboxScalar::Unbox(batch[1]); + out->value = + BoxScalar::Box(Op::template Call(ctx, arg0, arg1), out->type()); + } } static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index 2f2159ef642..d541b7bc7c9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -290,5 +290,46 @@ TYPED_TEST(TestBinaryArithmeticsFloating, Sub) { "[null, -0.9, -3.2, null, -1.9, -5.2]"); } +class AddProperty : public ScalarFunctionPropertyMixin { + public: + std::shared_ptr GetFunction() override { + return internal::checked_pointer_cast( + *GetFunctionRegistry()->GetFunction("add")); + } + + Result> Contract(const ScalarVector& args, + const FunctionOptions*) override { + const auto& out_type = args[0]->type; + + if (!args[0]->is_valid || !args[1]->is_valid) { + return MakeNullScalar(out_type); + } + + if (is_integer(out_type->id())) { + ARROW_ASSIGN_OR_RAISE(auto lhs, Cast(args[0])); + ARROW_ASSIGN_OR_RAISE(auto rhs, Cast(args[1])); + return UInt64Scalar(lhs->value + rhs->value).CastTo(out_type); + } + + if (is_floating(out_type->id())) { + ARROW_ASSIGN_OR_RAISE(auto lhs, Cast(args[0])); + ARROW_ASSIGN_OR_RAISE(auto rhs, Cast(args[1])); + return DoubleScalar(lhs->value + rhs->value).CastTo(out_type); + } + + return Status::NotImplemented("NYI"); + } +}; + +TEST_P(AddProperty, TestAddProperty) { Validate(); } + +INSTANTIATE_TEST_SUITE_P(AddPropertyTest, AddProperty, + ScalarFunctionPropertyTestParam::Values({ + {0, 0.0}, + {1, 0.0}, + {2, 0.0}, + {1024, 0.25}, + })); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc index 89f4de08052..bc1b121f23e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc @@ -81,8 +81,11 @@ void ComputeKleene(ComputeWord&& compute_word, KernelContext* ctx, const ArrayDa } struct Invert { - static void Call(KernelContext* ctx, bool value) { - ctx->SetStatus(Status::NotImplemented("NYI")); + static void Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + if (in.is_valid) { + checked_cast(out)->value = + !checked_cast(in).value; + } } static void Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc new file mode 100644 index 00000000000..16faed52d99 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/kernels/common.h" + +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" + +namespace arrow { +namespace compute { + +struct IsValid { + static void Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + checked_cast(out)->value = in.is_valid; + } + + static void Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { + if (arr.buffers[0] != nullptr && out->offset == arr.offset && + out->length == arr.length) { + out->buffers[1] = arr.buffers[0]; + return; + } + + if (arr.null_count == 0 || arr.buffers[0] == nullptr) { + BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, true); + return; + } + + internal::CopyBitmap(arr.buffers[0]->data(), arr.offset, arr.length, + out->buffers[1]->mutable_data(), out->offset); + } +}; + +struct IsNull { + static void Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + checked_cast(out)->value = !in.is_valid; + } + + static void Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { + if (arr.null_count == 0 || arr.buffers[0] == nullptr) { + BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, + false); + return; + } + + internal::InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, + out->buffers[1]->mutable_data(), out->offset); + } +}; + +namespace codegen { + +void MakeFunction(std::string name, std::vector in_types, OutputType out_type, + ArrayKernelExec exec, FunctionRegistry* registry, + NullHandling::type null_handling) { + Arity arity{static_cast(in_types.size())}; + auto func = std::make_shared(name, arity); + + ScalarKernel kernel(std::move(in_types), out_type, exec); + kernel.null_handling = null_handling; + kernel.can_write_into_slices = true; + + DCHECK_OK(func->AddKernel(std::move(kernel))); + DCHECK_OK(registry->AddFunction(std::move(func))); +} + +} // namespace codegen + +namespace internal { + +void RegisterScalarValidity(FunctionRegistry* registry) { + codegen::MakeFunction("is_valid", {ValueDescr::ANY}, boolean(), + codegen::SimpleUnary, registry, + NullHandling::OUTPUT_NOT_NULL); + + codegen::MakeFunction("is_null", {ValueDescr::ANY}, boolean(), + codegen::SimpleUnary, registry, + NullHandling::OUTPUT_NOT_NULL); +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc new file mode 100644 index 00000000000..70e6f6323ab --- /dev/null +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array.h" +#include "arrow/compute/api.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bitmap_reader.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { +namespace compute { + +class TestValidityKernels : public ::testing::Test { + protected: + // XXX Since IsValid and IsNull don't touch any buffers but the null bitmap + // testing multiple types seems redundant. + using ArrowType = BooleanType; + + using CType = typename ArrowType::c_type; + + static std::shared_ptr type_singleton() { + return TypeTraits::type_singleton(); + } + + void AssertUnary(Datum arg, Datum expected) { + ASSERT_OK_AND_ASSIGN(auto actual, func_(arg, nullptr)); + ASSERT_EQ(actual.kind(), expected.kind()); + if (actual.kind() == Datum::ARRAY) { + ASSERT_OK(actual.make_array()->ValidateFull()); + AssertArraysApproxEqual(*expected.make_array(), *actual.make_array()); + } else { + AssertScalarsEqual(*expected.scalar(), *actual.scalar()); + } + } + + void AssertUnary(const std::string& arg_json, const std::string& expected_json) { + AssertUnary(ArrayFromJSON(type_singleton(), arg_json), + ArrayFromJSON(type_singleton(), expected_json)); + } + + using UnaryFunction = std::function(const Datum&, ExecContext*)>; + UnaryFunction func_; +}; + +TEST_F(TestValidityKernels, ArrayIsValid) { + func_ = arrow::compute::IsValid; + + this->AssertUnary("[]", "[]"); + this->AssertUnary("[null]", "[false]"); + this->AssertUnary("[1]", "[true]"); + this->AssertUnary("[null, 1, 0, null]", "[false, true, true, false]"); +} + +TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) { + Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]"); + ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg)); + ASSERT_EQ(validity.array()->buffers[1], arg.array()->buffers[0]); +} + +TEST_F(TestValidityKernels, ScalarIsValid) { + func_ = arrow::compute::IsValid; + + AssertUnary(Datum(19.7), Datum(true)); + AssertUnary(MakeNullScalar(float64()), Datum(false)); +} + +TEST_F(TestValidityKernels, ArrayIsNull) { + func_ = arrow::compute::IsNull; + + this->AssertUnary("[]", "[]"); + this->AssertUnary("[null]", "[true]"); + this->AssertUnary("[1]", "[false]"); + this->AssertUnary("[null, 1, 0, null]", "[true, false, false, true]"); +} + +TEST_F(TestValidityKernels, DISABLED_ScalarIsNull) { + func_ = arrow::compute::IsNull; + + AssertUnary(Datum(19.7), Datum(false)); + AssertUnary(MakeNullScalar(float64()), Datum(true)); +} + +class IsValidProperty : public ScalarFunctionPropertyMixin { + public: + std::shared_ptr GetFunction() override { + return internal::checked_pointer_cast( + *GetFunctionRegistry()->GetFunction("is_valid")); + } + + Result> Contract(const ScalarVector& args, + const FunctionOptions*) override { + return std::make_shared(args[0]->is_valid); + } +}; + +TEST_P(IsValidProperty, TestIsValidProperty) { Validate(); } + +INSTANTIATE_TEST_SUITE_P(IsValidPropertyTest, IsValidProperty, + ScalarFunctionPropertyTestParam::Values({ + {0, 0.0}, + {1, 0.0}, + {2, 0.0}, + {1024, 0.25}, + })); + +class IsNullProperty : public ScalarFunctionPropertyMixin { + public: + std::shared_ptr GetFunction() override { + return internal::checked_pointer_cast( + *GetFunctionRegistry()->GetFunction("is_null")); + } + + Result> Contract(const ScalarVector& args, + const FunctionOptions*) override { + return std::make_shared(!args[0]->is_valid); + } +}; + +TEST_P(IsNullProperty, TestIsNullProperty) { Validate(); } + +INSTANTIATE_TEST_SUITE_P(IsNullPropertyTest, IsNullProperty, + ScalarFunctionPropertyTestParam::Values({ + {0, 0.0}, + {1, 0.0}, + {2, 0.0}, + {1024, 0.25}, + })); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/test_util.cc b/cpp/src/arrow/compute/kernels/test_util.cc index 49b8bcec7b2..7db2cdcbe47 100644 --- a/cpp/src/arrow/compute/kernels/test_util.cc +++ b/cpp/src/arrow/compute/kernels/test_util.cc @@ -47,5 +47,145 @@ void CheckScalarUnary(std::string func_name, std::shared_ptr in_ty, } } +void ScalarFunctionPropertyMixin::Validate() { + auto function = GetFunction(); + + for (const auto& kernel : function->kernels()) { + for (auto inputs : GenerateInputs(*kernel->signature)) { + // FIXME(bkietz) get the output type alongside the inputs + auto out_type = inputs[0].type(); + + auto actual = function->Execute(inputs, GetParam().options); + auto expected = ComputeExpected(inputs, out_type, GetParam().options); + + if (actual.ok()) { + // TODO(bkietz) allow approximate equality for floats + ASSERT_OK(expected.status()); + AssertDatumsEqual(*expected, *actual); + } else { + // don't require properties to get the error message right + ASSERT_EQ(actual.status().code(), expected.status().code()); + } + } + } +} + +Result ScalarFunctionPropertyMixin::ComputeExpected( + const std::vector& args, const std::shared_ptr& out_type, + const FunctionOptions* options) { + util::optional length; + for (const Datum& arg : args) { + if (arg.is_scalar()) continue; + + if (length && *length != arg.length()) { + return Status::Invalid("mismatched array lengths in ScalarFunction application"); + } + length = arg.length(); + } + + auto ApplyContractOnce = [&](int64_t i) { + ScalarVector scalar_args; + for (const Datum& arg : args) { + if (arg.is_scalar()) { + scalar_args.push_back(arg.scalar()); + } else { + // TODO(bkietz) provide GetScalar(ArrayData) + scalar_args.push_back(*arg.make_array()->GetScalar(i)); + } + } + return Contract(scalar_args, options); + }; + + if (!length) { + // scalar output + return ApplyContractOnce(0); + } + + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), out_type, &builder)); + RETURN_NOT_OK(builder->Resize(*length)); + + for (int64_t i = 0; i < *length; ++i) { + ARROW_ASSIGN_OR_RAISE(auto value, ApplyContractOnce(i)); + RETURN_NOT_OK(builder->Append(*value)); + } + + std::shared_ptr array; + RETURN_NOT_OK(builder->FinishInternal(&array)); + return array; +} + +template +std::vector> CartesianProduct(std::vector> axes) { + if (axes.empty()) { + return {}; + } + + size_t out_length = 1; + std::vector::const_iterator> its(axes.size()); + for (size_t i = 0; i < axes.size(); ++i) { + out_length *= axes[i].size(); + its[i] = axes[i].begin(); + } + + std::vector> out(out_length); + for (auto& point : out) { + for (auto it : its) { + point.emplace_back(*it); + } + + for (size_t i = 0; i < axes.size(); ++i) { + if (++its[i] != axes[i].end()) { + break; + } + its[i] = axes[i].begin(); + } + } + + return out; +} + +std::vector> ScalarFunctionPropertyMixin::GenerateInputs( + const KernelSignature& signature) { + std::vector> inputs; + + auto AppendInputsForType = [&](const std::shared_ptr& type, + std::vector* vec) { + auto array = rag_.Of(type, GetParam().length, GetParam().null_probability); + vec->push_back(array); + + auto array_slice = array->Slice(GetParam().length / 3, 2 * GetParam().length / 3); + vec->push_back(array_slice); + + auto scalar = *rag_.Of(type, 1, 0.0)->GetScalar(0); + vec->push_back(scalar); + + auto null_scalar = MakeNullScalar(type); + vec->push_back(null_scalar); + }; + + DCHECK(!signature.is_varargs()); + for (const InputType& in_type : signature.in_types()) { + DCHECK_EQ(in_type.shape(), ValueDescr::ANY); + + inputs.emplace_back(); + + if (in_type.kind() == InputType::EXACT_TYPE) { + AppendInputsForType(in_type.type(), &inputs.back()); + continue; + } + + if (in_type.kind() == InputType::ANY_TYPE) { + // TODO(bkietz) use more input types + AppendInputsForType(boolean(), &inputs.back()); + continue; + } + + // TODO(bkietz) iterate over possible types based on a matcher + } + + return CartesianProduct(std::move(inputs)); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/test_util.h b/cpp/src/arrow/compute/kernels/test_util.h index 88c3c3f4485..2129cc79509 100644 --- a/cpp/src/arrow/compute/kernels/test_util.h +++ b/cpp/src/arrow/compute/kernels/test_util.h @@ -33,7 +33,9 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/iterator.h" +#include "arrow/compute/function.h" #include "arrow/compute/kernel.h" // IWYU pragma: end_exports @@ -97,5 +99,63 @@ using TestingStringTypes = static constexpr random::SeedType kRandomSeed = 0x0ff1ce; +struct ScalarFunctionPropertyTestParam { + ScalarFunctionPropertyTestParam(int64_t length, double null_probability) + : length(length), null_probability(null_probability) {} + + ScalarFunctionPropertyTestParam(int64_t length, double null_probability, + const FunctionOptions* options) + : length(length), null_probability(null_probability), options(options) {} + + static auto Values(std::initializer_list params) + -> decltype(testing::ValuesIn(params)) { + return testing::ValuesIn(params); + } + + int64_t length; + double null_probability; + const FunctionOptions* options = nullptr; +}; + +class ScalarFunctionPropertyMixin + : public testing::TestWithParam { + protected: + /// Return an instance of the ScalarFunction which should be tested. + virtual std::shared_ptr GetFunction() = 0; + + /// Contract() should contain a minimal implementation of the function's + /// intended behavior. For example, a Property expressing a division function + /// could unbox the Scalars and perform division on them. + /// + /// The arguments will be generated based on kernel signatures so validation of anything + /// expressible in InputType is unnecessary (for example inputs will always have valid + /// arity and type). Other errors should be emitted with the same StatusCode that a + /// kernel would raise - in the example case of a division function a divide by zero + /// error should be emitted by Contract(). + virtual Result> Contract(const ScalarVector& args, + const FunctionOptions* options) = 0; + + /// Run randomized testing of all kernels in the function. + void Validate(); + + /// Helper for unboxing scalars efficiently in implementations of Contract() + template + Result> Cast(const std::shared_ptr& scalar) { + ARROW_ASSIGN_OR_RAISE( + auto out, scalar->CastTo(TypeTraits::type_singleton())); + return internal::checked_pointer_cast(std::move(out)); + } + + // apply Contract() to all inputs, building the expected output + Result ComputeExpected(const std::vector& args, + const std::shared_ptr& out_type, + const FunctionOptions* options); + + // Randomly generate valid function arguments from a KernelSignature. + std::vector> GenerateInputs(const KernelSignature& signature); + + random::RandomArrayGenerator rag_{kRandomSeed}; +}; + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc index 1ef61d2d75a..ebae60abab8 100644 --- a/cpp/src/arrow/compute/registry.cc +++ b/cpp/src/arrow/compute/registry.cc @@ -104,6 +104,7 @@ static std::unique_ptr CreateBuiltInRegistry() { RegisterScalarComparison(registry.get()); RegisterScalarSetLookup(registry.get()); RegisterScalarStringAscii(registry.get()); + RegisterScalarValidity(registry.get()); // Aggregate functions RegisterScalarAggregateBasic(registry.get()); diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h index 2c3a5e3d652..515b17b635d 100644 --- a/cpp/src/arrow/compute/registry_internal.h +++ b/cpp/src/arrow/compute/registry_internal.h @@ -31,6 +31,7 @@ void RegisterScalarCast(FunctionRegistry* registry); void RegisterScalarComparison(FunctionRegistry* registry); void RegisterScalarSetLookup(FunctionRegistry* registry); void RegisterScalarStringAscii(FunctionRegistry* registry); +void RegisterScalarValidity(FunctionRegistry* registry); // Vector functions void RegisterVectorFilter(FunctionRegistry* registry); diff --git a/cpp/src/arrow/datum.h b/cpp/src/arrow/datum.h index a25ee5b024c..624657940ce 100644 --- a/cpp/src/arrow/datum.h +++ b/cpp/src/arrow/datum.h @@ -261,6 +261,9 @@ struct ARROW_EXPORT Datum { bool Equals(const Datum& other) const; + bool operator==(const Datum& other) const { return Equals(other); } + bool operator!=(const Datum& other) const { return !Equals(other); } + std::string ToString() const; }; diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 140ab453ccd..844ed0e1d18 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -262,5 +262,61 @@ std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first return std::make_shared(array_data); } +struct RandomArrayGeneratorOfImpl { + Status Visit(const NullType&) { + DCHECK_NE(null_probability_, 0.0); + out_ = std::make_shared(size_); + return Status::OK(); + } + + Status Visit(const BooleanType&) { + double probability = 0.25; + out_ = rag_->Boolean(size_, probability, null_probability_); + return Status::OK(); + } + + template + enable_if_number Visit(const T&) { + auto max = std::numeric_limits::max(); + auto min = std::numeric_limits::lowest(); + + out_ = rag_->Numeric(size_, min, max, null_probability_); + return Status::OK(); + } + + template + enable_if_base_binary Visit(const T& t) { + int32_t min_length = 0; + auto max_length = static_cast(std::sqrt(size_)); + + if (t.layout().buffers[1].byte_width == sizeof(int32_t)) { + out_ = rag_->String(size_, min_length, max_length, null_probability_); + } else { + out_ = rag_->LargeString(size_, min_length, max_length, null_probability_); + } + return out_->View(type_).Value(&out_); + } + + Status Visit(const DataType& t) { + return Status::NotImplemented("generation of random arrays of type ", t); + } + + std::shared_ptr Finish() && { + DCHECK_OK(VisitTypeInline(*type_, this)); + return std::move(out_); + } + + RandomArrayGenerator* rag_; + const std::shared_ptr& type_; + int64_t size_; + double null_probability_; + std::shared_ptr out_; +}; + +std::shared_ptr RandomArrayGenerator::Of(std::shared_ptr type, + int64_t size, double null_probability) { + return RandomArrayGeneratorOfImpl{this, type, size, null_probability, nullptr}.Finish(); +} + } // namespace random } // namespace arrow diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 36b0eb05a19..e6affc685a6 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -250,6 +250,9 @@ class ARROW_EXPORT RandomArrayGenerator { int32_t min_length, int32_t max_length, double null_probability = 0); + std::shared_ptr Of(std::shared_ptr type, int64_t size, + double null_probability); + SeedType seed() { return seed_distribution_(seed_rng_); } private: diff --git a/cpp/src/arrow/util/iterator_test.cc b/cpp/src/arrow/util/iterator_test.cc index 9724fdfaf76..14a496b3e6a 100644 --- a/cpp/src/arrow/util/iterator_test.cc +++ b/cpp/src/arrow/util/iterator_test.cc @@ -27,6 +27,9 @@ #include #include +#include +#include + #include "arrow/testing/gtest_util.h" namespace arrow { diff --git a/cpp/src/parquet/encryption_read_configurations_test.cc b/cpp/src/parquet/encryption_read_configurations_test.cc index 7f8968f541d..a794a8cf2ad 100644 --- a/cpp/src/parquet/encryption_read_configurations_test.cc +++ b/cpp/src/parquet/encryption_read_configurations_test.cc @@ -350,7 +350,7 @@ class TestDecryptionConfiguration parquet::DoubleReader* double_reader = static_cast(column_reader.get()); - // Get the ColumnChunkMetaData for the Dobule column + // Get the ColumnChunkMetaData for the Double column std::unique_ptr double_md = rg_metadata->ColumnChunk(5); // Read all the rows in the column From 908504810ee9332f651cceb33a8b40f253383efe Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 09:38:25 -0400 Subject: [PATCH 02/13] revert Property testing --- cpp/src/arrow/array/builder_base.cc | 43 ------ cpp/src/arrow/array/builder_base.h | 2 - .../compute/kernels/scalar_arithmetic_test.cc | 41 ----- .../compute/kernels/scalar_validity_test.cc | 46 ------ cpp/src/arrow/compute/kernels/test_util.cc | 140 ------------------ cpp/src/arrow/compute/kernels/test_util.h | 60 -------- 6 files changed, 332 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 4b571d2b5ec..4c21859fae3 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -21,15 +21,11 @@ #include #include "arrow/array/array_base.h" -#include "arrow/array/builder_binary.h" -#include "arrow/array/builder_primitive.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" -#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/util/logging.h" -#include "arrow/visitor_inline.h" namespace arrow { @@ -115,43 +111,4 @@ void ArrayBuilder::UnsafeSetNull(int64_t length) { null_bitmap_builder_.UnsafeAppend(length, false); } -struct ArrayBuilderAppendScalarImpl { - template ::BuilderType> - B& builder() { - return *internal::checked_cast(builder_); - } - - template - auto Visit(const S& scalar) -> decltype(builder().Append(scalar.value)) { - return builder().Append(scalar.value); - } - - template - enable_if_base_binary Visit(const S& scalar) { - return builder().Append(scalar.value->data(), scalar.value->size()); - } - - Status Visit(const Scalar& scalar) { - return Status::NotImplemented("Appending scalars to builders of type ", *scalar.type); - } - - Status Finish() && { return VisitScalarInline(scalar_, this); } - ArrayBuilder* builder_; - const Scalar& scalar_; -}; - -Status ArrayBuilder::Append(const Scalar& scalar) { - auto builder_type = type(); - if (!scalar.type->Equals(builder_type)) { - return Status::TypeError("Cannot append scalars of type ", *scalar.type, - " to a builder of type ", *builder_type); - } - if (!scalar.is_valid) { - return AppendNull(); - } - RETURN_NOT_OK(Reserve(1)); - return ArrayBuilderAppendScalarImpl{this, scalar}.Finish(); -} - } // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index c8b313419d8..105425539cd 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -98,8 +98,6 @@ class ARROW_EXPORT ArrayBuilder { virtual Status AppendNull() = 0; virtual Status AppendNulls(int64_t length) = 0; - Status Append(const Scalar& scalar); - /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index d541b7bc7c9..2f2159ef642 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -290,46 +290,5 @@ TYPED_TEST(TestBinaryArithmeticsFloating, Sub) { "[null, -0.9, -3.2, null, -1.9, -5.2]"); } -class AddProperty : public ScalarFunctionPropertyMixin { - public: - std::shared_ptr GetFunction() override { - return internal::checked_pointer_cast( - *GetFunctionRegistry()->GetFunction("add")); - } - - Result> Contract(const ScalarVector& args, - const FunctionOptions*) override { - const auto& out_type = args[0]->type; - - if (!args[0]->is_valid || !args[1]->is_valid) { - return MakeNullScalar(out_type); - } - - if (is_integer(out_type->id())) { - ARROW_ASSIGN_OR_RAISE(auto lhs, Cast(args[0])); - ARROW_ASSIGN_OR_RAISE(auto rhs, Cast(args[1])); - return UInt64Scalar(lhs->value + rhs->value).CastTo(out_type); - } - - if (is_floating(out_type->id())) { - ARROW_ASSIGN_OR_RAISE(auto lhs, Cast(args[0])); - ARROW_ASSIGN_OR_RAISE(auto rhs, Cast(args[1])); - return DoubleScalar(lhs->value + rhs->value).CastTo(out_type); - } - - return Status::NotImplemented("NYI"); - } -}; - -TEST_P(AddProperty, TestAddProperty) { Validate(); } - -INSTANTIATE_TEST_SUITE_P(AddPropertyTest, AddProperty, - ScalarFunctionPropertyTestParam::Values({ - {0, 0.0}, - {1, 0.0}, - {2, 0.0}, - {1024, 0.25}, - })); - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 70e6f6323ab..c2ab8b39425 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -101,51 +101,5 @@ TEST_F(TestValidityKernels, DISABLED_ScalarIsNull) { AssertUnary(MakeNullScalar(float64()), Datum(true)); } -class IsValidProperty : public ScalarFunctionPropertyMixin { - public: - std::shared_ptr GetFunction() override { - return internal::checked_pointer_cast( - *GetFunctionRegistry()->GetFunction("is_valid")); - } - - Result> Contract(const ScalarVector& args, - const FunctionOptions*) override { - return std::make_shared(args[0]->is_valid); - } -}; - -TEST_P(IsValidProperty, TestIsValidProperty) { Validate(); } - -INSTANTIATE_TEST_SUITE_P(IsValidPropertyTest, IsValidProperty, - ScalarFunctionPropertyTestParam::Values({ - {0, 0.0}, - {1, 0.0}, - {2, 0.0}, - {1024, 0.25}, - })); - -class IsNullProperty : public ScalarFunctionPropertyMixin { - public: - std::shared_ptr GetFunction() override { - return internal::checked_pointer_cast( - *GetFunctionRegistry()->GetFunction("is_null")); - } - - Result> Contract(const ScalarVector& args, - const FunctionOptions*) override { - return std::make_shared(!args[0]->is_valid); - } -}; - -TEST_P(IsNullProperty, TestIsNullProperty) { Validate(); } - -INSTANTIATE_TEST_SUITE_P(IsNullPropertyTest, IsNullProperty, - ScalarFunctionPropertyTestParam::Values({ - {0, 0.0}, - {1, 0.0}, - {2, 0.0}, - {1024, 0.25}, - })); - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/test_util.cc b/cpp/src/arrow/compute/kernels/test_util.cc index 7db2cdcbe47..49b8bcec7b2 100644 --- a/cpp/src/arrow/compute/kernels/test_util.cc +++ b/cpp/src/arrow/compute/kernels/test_util.cc @@ -47,145 +47,5 @@ void CheckScalarUnary(std::string func_name, std::shared_ptr in_ty, } } -void ScalarFunctionPropertyMixin::Validate() { - auto function = GetFunction(); - - for (const auto& kernel : function->kernels()) { - for (auto inputs : GenerateInputs(*kernel->signature)) { - // FIXME(bkietz) get the output type alongside the inputs - auto out_type = inputs[0].type(); - - auto actual = function->Execute(inputs, GetParam().options); - auto expected = ComputeExpected(inputs, out_type, GetParam().options); - - if (actual.ok()) { - // TODO(bkietz) allow approximate equality for floats - ASSERT_OK(expected.status()); - AssertDatumsEqual(*expected, *actual); - } else { - // don't require properties to get the error message right - ASSERT_EQ(actual.status().code(), expected.status().code()); - } - } - } -} - -Result ScalarFunctionPropertyMixin::ComputeExpected( - const std::vector& args, const std::shared_ptr& out_type, - const FunctionOptions* options) { - util::optional length; - for (const Datum& arg : args) { - if (arg.is_scalar()) continue; - - if (length && *length != arg.length()) { - return Status::Invalid("mismatched array lengths in ScalarFunction application"); - } - length = arg.length(); - } - - auto ApplyContractOnce = [&](int64_t i) { - ScalarVector scalar_args; - for (const Datum& arg : args) { - if (arg.is_scalar()) { - scalar_args.push_back(arg.scalar()); - } else { - // TODO(bkietz) provide GetScalar(ArrayData) - scalar_args.push_back(*arg.make_array()->GetScalar(i)); - } - } - return Contract(scalar_args, options); - }; - - if (!length) { - // scalar output - return ApplyContractOnce(0); - } - - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(default_memory_pool(), out_type, &builder)); - RETURN_NOT_OK(builder->Resize(*length)); - - for (int64_t i = 0; i < *length; ++i) { - ARROW_ASSIGN_OR_RAISE(auto value, ApplyContractOnce(i)); - RETURN_NOT_OK(builder->Append(*value)); - } - - std::shared_ptr array; - RETURN_NOT_OK(builder->FinishInternal(&array)); - return array; -} - -template -std::vector> CartesianProduct(std::vector> axes) { - if (axes.empty()) { - return {}; - } - - size_t out_length = 1; - std::vector::const_iterator> its(axes.size()); - for (size_t i = 0; i < axes.size(); ++i) { - out_length *= axes[i].size(); - its[i] = axes[i].begin(); - } - - std::vector> out(out_length); - for (auto& point : out) { - for (auto it : its) { - point.emplace_back(*it); - } - - for (size_t i = 0; i < axes.size(); ++i) { - if (++its[i] != axes[i].end()) { - break; - } - its[i] = axes[i].begin(); - } - } - - return out; -} - -std::vector> ScalarFunctionPropertyMixin::GenerateInputs( - const KernelSignature& signature) { - std::vector> inputs; - - auto AppendInputsForType = [&](const std::shared_ptr& type, - std::vector* vec) { - auto array = rag_.Of(type, GetParam().length, GetParam().null_probability); - vec->push_back(array); - - auto array_slice = array->Slice(GetParam().length / 3, 2 * GetParam().length / 3); - vec->push_back(array_slice); - - auto scalar = *rag_.Of(type, 1, 0.0)->GetScalar(0); - vec->push_back(scalar); - - auto null_scalar = MakeNullScalar(type); - vec->push_back(null_scalar); - }; - - DCHECK(!signature.is_varargs()); - for (const InputType& in_type : signature.in_types()) { - DCHECK_EQ(in_type.shape(), ValueDescr::ANY); - - inputs.emplace_back(); - - if (in_type.kind() == InputType::EXACT_TYPE) { - AppendInputsForType(in_type.type(), &inputs.back()); - continue; - } - - if (in_type.kind() == InputType::ANY_TYPE) { - // TODO(bkietz) use more input types - AppendInputsForType(boolean(), &inputs.back()); - continue; - } - - // TODO(bkietz) iterate over possible types based on a matcher - } - - return CartesianProduct(std::move(inputs)); -} - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/test_util.h b/cpp/src/arrow/compute/kernels/test_util.h index 2129cc79509..88c3c3f4485 100644 --- a/cpp/src/arrow/compute/kernels/test_util.h +++ b/cpp/src/arrow/compute/kernels/test_util.h @@ -33,9 +33,7 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" -#include "arrow/util/iterator.h" -#include "arrow/compute/function.h" #include "arrow/compute/kernel.h" // IWYU pragma: end_exports @@ -99,63 +97,5 @@ using TestingStringTypes = static constexpr random::SeedType kRandomSeed = 0x0ff1ce; -struct ScalarFunctionPropertyTestParam { - ScalarFunctionPropertyTestParam(int64_t length, double null_probability) - : length(length), null_probability(null_probability) {} - - ScalarFunctionPropertyTestParam(int64_t length, double null_probability, - const FunctionOptions* options) - : length(length), null_probability(null_probability), options(options) {} - - static auto Values(std::initializer_list params) - -> decltype(testing::ValuesIn(params)) { - return testing::ValuesIn(params); - } - - int64_t length; - double null_probability; - const FunctionOptions* options = nullptr; -}; - -class ScalarFunctionPropertyMixin - : public testing::TestWithParam { - protected: - /// Return an instance of the ScalarFunction which should be tested. - virtual std::shared_ptr GetFunction() = 0; - - /// Contract() should contain a minimal implementation of the function's - /// intended behavior. For example, a Property expressing a division function - /// could unbox the Scalars and perform division on them. - /// - /// The arguments will be generated based on kernel signatures so validation of anything - /// expressible in InputType is unnecessary (for example inputs will always have valid - /// arity and type). Other errors should be emitted with the same StatusCode that a - /// kernel would raise - in the example case of a division function a divide by zero - /// error should be emitted by Contract(). - virtual Result> Contract(const ScalarVector& args, - const FunctionOptions* options) = 0; - - /// Run randomized testing of all kernels in the function. - void Validate(); - - /// Helper for unboxing scalars efficiently in implementations of Contract() - template - Result> Cast(const std::shared_ptr& scalar) { - ARROW_ASSIGN_OR_RAISE( - auto out, scalar->CastTo(TypeTraits::type_singleton())); - return internal::checked_pointer_cast(std::move(out)); - } - - // apply Contract() to all inputs, building the expected output - Result ComputeExpected(const std::vector& args, - const std::shared_ptr& out_type, - const FunctionOptions* options); - - // Randomly generate valid function arguments from a KernelSignature. - std::vector> GenerateInputs(const KernelSignature& signature); - - random::RandomArrayGenerator rag_{kRandomSeed}; -}; - } // namespace compute } // namespace arrow From 174a53f8f3b1c08ddb7776011cefbb0a4bca5ebb Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 09:53:00 -0400 Subject: [PATCH 03/13] use CheckScalarUnary --- .../arrow/compute/kernels/scalar_validity.cc | 2 + .../compute/kernels/scalar_validity_test.cc | 46 ++++++------------- cpp/src/arrow/compute/kernels/test_util.cc | 9 +++- cpp/src/arrow/compute/kernels/test_util.h | 4 ++ 4 files changed, 28 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 16faed52d99..0916ffdb094 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -22,6 +22,7 @@ namespace arrow { namespace compute { +namespace { struct IsValid { static void Call(KernelContext* ctx, const Scalar& in, Scalar* out) { @@ -93,5 +94,6 @@ void RegisterScalarValidity(FunctionRegistry* registry) { } } // namespace internal +} // namespace } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index c2ab8b39425..a0657133c0a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -43,33 +43,16 @@ class TestValidityKernels : public ::testing::Test { return TypeTraits::type_singleton(); } - void AssertUnary(Datum arg, Datum expected) { - ASSERT_OK_AND_ASSIGN(auto actual, func_(arg, nullptr)); - ASSERT_EQ(actual.kind(), expected.kind()); - if (actual.kind() == Datum::ARRAY) { - ASSERT_OK(actual.make_array()->ValidateFull()); - AssertArraysApproxEqual(*expected.make_array(), *actual.make_array()); - } else { - AssertScalarsEqual(*expected.scalar(), *actual.scalar()); - } - } - - void AssertUnary(const std::string& arg_json, const std::string& expected_json) { - AssertUnary(ArrayFromJSON(type_singleton(), arg_json), - ArrayFromJSON(type_singleton(), expected_json)); - } - using UnaryFunction = std::function(const Datum&, ExecContext*)>; UnaryFunction func_; }; TEST_F(TestValidityKernels, ArrayIsValid) { - func_ = arrow::compute::IsValid; - - this->AssertUnary("[]", "[]"); - this->AssertUnary("[null]", "[false]"); - this->AssertUnary("[1]", "[true]"); - this->AssertUnary("[null, 1, 0, null]", "[false, true, true, false]"); + CheckScalarUnary("is_valid", type_singleton(), "[]", type_singleton(), "[]"); + CheckScalarUnary("is_valid", type_singleton(), "[null]", type_singleton(), "[false]"); + CheckScalarUnary("is_valid", type_singleton(), "[1]", type_singleton(), "[true]"); + CheckScalarUnary("is_valid", type_singleton(), "[null, 1, 0, null]", type_singleton(), + "[false, true, true, false]"); } TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) { @@ -81,24 +64,25 @@ TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) { TEST_F(TestValidityKernels, ScalarIsValid) { func_ = arrow::compute::IsValid; - AssertUnary(Datum(19.7), Datum(true)); - AssertUnary(MakeNullScalar(float64()), Datum(false)); + CheckScalarUnary("is_valid", MakeScalar(19.7), MakeScalar(true)); + CheckScalarUnary("is_valid", MakeNullScalar(float64()), MakeScalar(false)); } TEST_F(TestValidityKernels, ArrayIsNull) { func_ = arrow::compute::IsNull; - this->AssertUnary("[]", "[]"); - this->AssertUnary("[null]", "[true]"); - this->AssertUnary("[1]", "[false]"); - this->AssertUnary("[null, 1, 0, null]", "[true, false, false, true]"); + CheckScalarUnary("is_null", type_singleton(), "[]", type_singleton(), "[]"); + CheckScalarUnary("is_null", type_singleton(), "[null]", type_singleton(), "[true]"); + CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); + CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), + "[true, false, false, true]"); } -TEST_F(TestValidityKernels, DISABLED_ScalarIsNull) { +TEST_F(TestValidityKernels, ScalarIsNull) { func_ = arrow::compute::IsNull; - AssertUnary(Datum(19.7), Datum(false)); - AssertUnary(MakeNullScalar(float64()), Datum(true)); + CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false)); + CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/test_util.cc b/cpp/src/arrow/compute/kernels/test_util.cc index 49b8bcec7b2..bad5b848d21 100644 --- a/cpp/src/arrow/compute/kernels/test_util.cc +++ b/cpp/src/arrow/compute/kernels/test_util.cc @@ -42,10 +42,15 @@ void CheckScalarUnary(std::string func_name, std::shared_ptr in_ty, for (int64_t i = 0; i < input->length(); ++i) { ASSERT_OK_AND_ASSIGN(auto val, input->GetScalar(i)); ASSERT_OK_AND_ASSIGN(auto ex_val, expected->GetScalar(i)); - ASSERT_OK_AND_ASSIGN(Datum out, CallFunction(func_name, {val}, options)); - AssertScalarsEqual(*ex_val, *out.scalar(), /*verbose=*/true); + CheckScalarUnary(func_name, val, ex_val, options); } } +void CheckScalarUnary(std::string func_name, std::shared_ptr input, + std::shared_ptr expected, const FunctionOptions* options) { + ASSERT_OK_AND_ASSIGN(Datum out, CallFunction(func_name, {input}, options)); + AssertScalarsEqual(*expected, *out.scalar(), /*verbose=*/true); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/test_util.h b/cpp/src/arrow/compute/kernels/test_util.h index 88c3c3f4485..c4e1f07075e 100644 --- a/cpp/src/arrow/compute/kernels/test_util.h +++ b/cpp/src/arrow/compute/kernels/test_util.h @@ -92,6 +92,10 @@ void CheckScalarUnary(std::string func_name, std::shared_ptr in_ty, std::string json_expected, const FunctionOptions* options = nullptr); +void CheckScalarUnary(std::string func_name, std::shared_ptr input, + std::shared_ptr expected, + const FunctionOptions* options = nullptr); + using TestingStringTypes = ::testing::Types; From 1d6f6c9486d0f9dc7f6c0a7f9035666087e83da9 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 10:00:32 -0400 Subject: [PATCH 04/13] document RAG::ArrayOf --- cpp/src/arrow/testing/random.cc | 5 +++-- cpp/src/arrow/testing/random.h | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 844ed0e1d18..d4db44d9f17 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -313,8 +313,9 @@ struct RandomArrayGeneratorOfImpl { std::shared_ptr out_; }; -std::shared_ptr RandomArrayGenerator::Of(std::shared_ptr type, - int64_t size, double null_probability) { +std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr type, + int64_t size, + double null_probability) { return RandomArrayGeneratorOfImpl{this, type, size, null_probability, nullptr}.Finish(); } diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index e6affc685a6..0b4e7e3b6fe 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -250,8 +250,20 @@ class ARROW_EXPORT RandomArrayGenerator { int32_t min_length, int32_t max_length, double null_probability = 0); - std::shared_ptr Of(std::shared_ptr type, int64_t size, - double null_probability); + /// \brief Randomly generate an Array of the specified type, size, and null_probability. + /// + /// Generation parameters other than size and null_probability are determined based on + /// the type of Array to be generated. + /// If boolean the probabilities of true,false values are 0.25,0.75 respectively. + /// If numeric min,max will be the least and greatest representable values. + /// If string min_length,max_length will be 0,sqrt(size) respectively. + /// + /// \param[in] type the type of Array to generate + /// \param[in] size the size of the Array to generate + /// \param[in] null_probability the probability of a slot being null + /// \return a generated Array + std::shared_ptr ArrayOf(std::shared_ptr type, int64_t size, + double null_probability); SeedType seed() { return seed_distribution_(seed_rng_); } From 71d082e3d45884e181fca8296e4f60dcad122f80 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 10:38:50 -0400 Subject: [PATCH 05/13] remove unused #includes --- cpp/src/arrow/compute/exec.cc | 1 - cpp/src/arrow/util/iterator_test.cc | 3 --- 2 files changed, 4 deletions(-) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index d9caec543fd..3c43cc1d4e5 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/arrow/util/iterator_test.cc b/cpp/src/arrow/util/iterator_test.cc index 14a496b3e6a..9724fdfaf76 100644 --- a/cpp/src/arrow/util/iterator_test.cc +++ b/cpp/src/arrow/util/iterator_test.cc @@ -27,9 +27,6 @@ #include #include -#include -#include - #include "arrow/testing/gtest_util.h" namespace arrow { From 78350009660e6bf44f6ff59d3567606d98ae68d6 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 10:41:01 -0400 Subject: [PATCH 06/13] explicit internal:: usage --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 0916ffdb094..c1724e786e5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -21,6 +21,10 @@ #include "arrow/util/bitmap_ops.h" namespace arrow { + +using internal::CopyBitmap; +using internal::InvertBitmap; + namespace compute { namespace { @@ -41,8 +45,8 @@ struct IsValid { return; } - internal::CopyBitmap(arr.buffers[0]->data(), arr.offset, arr.length, - out->buffers[1]->mutable_data(), out->offset); + CopyBitmap(arr.buffers[0]->data(), arr.offset, arr.length, + out->buffers[1]->mutable_data(), out->offset); } }; @@ -58,8 +62,8 @@ struct IsNull { return; } - internal::InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, - out->buffers[1]->mutable_data(), out->offset); + InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, + out->buffers[1]->mutable_data(), out->offset); } }; From 8034e7ac081b0ceefec8529e2175dbb9c493634f Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 10:59:39 -0400 Subject: [PATCH 07/13] fix anonymous namespace --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index c1724e786e5..f09d2d90ced 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -66,8 +66,12 @@ struct IsNull { out->buffers[1]->mutable_data(), out->offset); } }; +} // namespace namespace codegen { +namespace { + +using arrow::compute::codegen::SimpleUnary; void MakeFunction(std::string name, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, @@ -83,6 +87,7 @@ void MakeFunction(std::string name, std::vector in_types, OutputType DCHECK_OK(registry->AddFunction(std::move(func))); } +} // namespace } // namespace codegen namespace internal { @@ -98,6 +103,5 @@ void RegisterScalarValidity(FunctionRegistry* registry) { } } // namespace internal -} // namespace } // namespace compute } // namespace arrow From 932ecc104cd8534861312ae4796540b2e780ad22 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 13:51:13 -0400 Subject: [PATCH 08/13] switch to in-kernel allocation when zero copy is unavailable --- cpp/src/arrow/compute/exec.cc | 1 + cpp/src/arrow/compute/kernel.h | 28 +++++++++++-------- .../arrow/compute/kernels/scalar_validity.cc | 9 ++++-- .../compute/kernels/scalar_validity_test.cc | 11 -------- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 3c43cc1d4e5..9b398ab2d9a 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -307,6 +307,7 @@ class NullPropagator { // the bitmap is not preallocated, and that precondition is asserted // higher in the call stack. if (arr.offset == 0) { + DCHECK(false); output_->buffers[0] = arr_bitmap; } else if (arr.offset % 8 == 0) { output_->buffers[0] = diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index c14354da2cf..b51efe5a953 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -191,8 +191,8 @@ class ARROW_EXPORT InputType { : kind_(ANY_TYPE), shape_(shape) {} /// \brief Accept an exact value type. - InputType(std::shared_ptr type, - ValueDescr::Shape shape = ValueDescr::ANY) // NOLINT implicit construction + InputType(std::shared_ptr type, // NOLINT implicit construction + ValueDescr::Shape shape = ValueDescr::ANY) : kind_(EXACT_TYPE), shape_(shape), type_(std::move(type)) {} /// \brief Accept an exact value type and shape provided by a ValueDescr. @@ -200,7 +200,7 @@ class ARROW_EXPORT InputType { : InputType(descr.type, descr.shape) {} /// \brief Use the passed TypeMatcher to type check. - InputType(std::shared_ptr type_matcher, + InputType(std::shared_ptr type_matcher, // NOLINT implicit construction ValueDescr::Shape shape = ValueDescr::ANY) : kind_(USE_TYPE_MATCHER), shape_(shape), type_matcher_(std::move(type_matcher)) {} @@ -329,7 +329,8 @@ class ARROW_EXPORT OutputType { /// \brief Output the exact type and shape provided by a ValueDescr OutputType(ValueDescr descr); // NOLINT implicit construction - explicit OutputType(Resolver resolver) : kind_(COMPUTED), resolver_(resolver) {} + explicit OutputType(Resolver resolver) + : kind_(COMPUTED), resolver_(std::move(resolver)) {} OutputType(const OutputType& other) { this->kind_ = other.kind_; @@ -529,7 +530,7 @@ struct Kernel { Kernel() {} Kernel(std::shared_ptr sig, KernelInit init) - : signature(std::move(sig)), init(init) {} + : signature(std::move(sig)), init(std::move(init)) {} Kernel(std::vector in_types, OutputType out_type, KernelInit init) : Kernel(KernelSignature::Make(std::move(in_types), out_type), init) {} @@ -566,11 +567,11 @@ struct ArrayKernel : public Kernel { ArrayKernel(std::shared_ptr sig, ArrayKernelExec exec, KernelInit init = NULLPTR) - : Kernel(std::move(sig), init), exec(exec) {} + : Kernel(std::move(sig), init), exec(std::move(exec)) {} ArrayKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR) - : Kernel(std::move(in_types), std::move(out_type), init), exec(exec) {} + : Kernel(std::move(in_types), std::move(out_type), init), exec(std::move(exec)) {} /// \brief Perform a single invocation of this kernel. Depending on the /// implementation, it may only write into preallocated memory, while in some @@ -617,11 +618,14 @@ struct VectorKernel : public ArrayKernel { VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) - : ArrayKernel(std::move(in_types), out_type, exec, init), finalize(finalize) {} + : ArrayKernel(std::move(in_types), std::move(out_type), std::move(exec), + std::move(init)), + finalize(std::move(finalize)) {} VectorKernel(std::shared_ptr sig, ArrayKernelExec exec, KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) - : ArrayKernel(std::move(sig), exec, init), finalize(finalize) {} + : ArrayKernel(std::move(sig), std::move(exec), std::move(init)), + finalize(std::move(finalize)) {} /// \brief For VectorKernel, convert intermediate results into finalized /// results. Mutates input argument. Some kernels may accumulate state @@ -679,9 +683,9 @@ struct ScalarAggregateKernel : public Kernel { ScalarAggregateConsume consume, ScalarAggregateMerge merge, ScalarAggregateFinalize finalize) : Kernel(std::move(sig), init), - consume(consume), - merge(merge), - finalize(finalize) {} + consume(std::move(consume)), + merge(std::move(merge)), + finalize(std::move(finalize)) {} ScalarAggregateKernel(std::vector in_types, OutputType out_type, KernelInit init, ScalarAggregateConsume consume, diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index f09d2d90ced..62690c10980 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -40,6 +40,8 @@ struct IsValid { return; } + KERNEL_RETURN_IF_ERROR(ctx, ctx->AllocateBitmap(out->length).Value(&out->buffers[1])); + if (arr.null_count == 0 || arr.buffers[0] == nullptr) { BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, true); return; @@ -75,13 +77,14 @@ using arrow::compute::codegen::SimpleUnary; void MakeFunction(std::string name, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - NullHandling::type null_handling) { + NullHandling::type null_handling, MemAllocation::type mem_allocation) { Arity arity{static_cast(in_types.size())}; auto func = std::make_shared(name, arity); ScalarKernel kernel(std::move(in_types), out_type, exec); kernel.null_handling = null_handling; kernel.can_write_into_slices = true; + kernel.mem_allocation = mem_allocation; DCHECK_OK(func->AddKernel(std::move(kernel))); DCHECK_OK(registry->AddFunction(std::move(func))); @@ -95,11 +98,11 @@ namespace internal { void RegisterScalarValidity(FunctionRegistry* registry) { codegen::MakeFunction("is_valid", {ValueDescr::ANY}, boolean(), codegen::SimpleUnary, registry, - NullHandling::OUTPUT_NOT_NULL); + NullHandling::OUTPUT_NOT_NULL, MemAllocation::NO_PREALLOCATE); codegen::MakeFunction("is_null", {ValueDescr::ANY}, boolean(), codegen::SimpleUnary, registry, - NullHandling::OUTPUT_NOT_NULL); + NullHandling::OUTPUT_NOT_NULL, MemAllocation::PREALLOCATE); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index a0657133c0a..e4153dce3b2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -37,14 +37,9 @@ class TestValidityKernels : public ::testing::Test { // testing multiple types seems redundant. using ArrowType = BooleanType; - using CType = typename ArrowType::c_type; - static std::shared_ptr type_singleton() { return TypeTraits::type_singleton(); } - - using UnaryFunction = std::function(const Datum&, ExecContext*)>; - UnaryFunction func_; }; TEST_F(TestValidityKernels, ArrayIsValid) { @@ -62,15 +57,11 @@ TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) { } TEST_F(TestValidityKernels, ScalarIsValid) { - func_ = arrow::compute::IsValid; - CheckScalarUnary("is_valid", MakeScalar(19.7), MakeScalar(true)); CheckScalarUnary("is_valid", MakeNullScalar(float64()), MakeScalar(false)); } TEST_F(TestValidityKernels, ArrayIsNull) { - func_ = arrow::compute::IsNull; - CheckScalarUnary("is_null", type_singleton(), "[]", type_singleton(), "[]"); CheckScalarUnary("is_null", type_singleton(), "[null]", type_singleton(), "[true]"); CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); @@ -79,8 +70,6 @@ TEST_F(TestValidityKernels, ArrayIsNull) { } TEST_F(TestValidityKernels, ScalarIsNull) { - func_ = arrow::compute::IsNull; - CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false)); CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); } From cada19fe57dd18bbcfab529d1e0b668652af40c3 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 15:07:24 -0400 Subject: [PATCH 09/13] remove debug check --- cpp/src/arrow/compute/exec.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 9b398ab2d9a..3c43cc1d4e5 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -307,7 +307,6 @@ class NullPropagator { // the bitmap is not preallocated, and that precondition is asserted // higher in the call stack. if (arr.offset == 0) { - DCHECK(false); output_->buffers[0] = arr_bitmap; } else if (arr.offset % 8 == 0) { output_->buffers[0] = From 95e2545ddb3b2980c05826ce5f1e4da27164dc85 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 15 Jun 2020 15:28:51 -0400 Subject: [PATCH 10/13] rename Operators for validity kernels --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 62690c10980..338e3836c3d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -28,7 +28,7 @@ using internal::InvertBitmap; namespace compute { namespace { -struct IsValid { +struct IsValidOperator { static void Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; } @@ -52,7 +52,7 @@ struct IsValid { } }; -struct IsNull { +struct IsNullOperator { static void Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = !in.is_valid; } @@ -97,11 +97,11 @@ namespace internal { void RegisterScalarValidity(FunctionRegistry* registry) { codegen::MakeFunction("is_valid", {ValueDescr::ANY}, boolean(), - codegen::SimpleUnary, registry, + codegen::SimpleUnary, registry, NullHandling::OUTPUT_NOT_NULL, MemAllocation::NO_PREALLOCATE); codegen::MakeFunction("is_null", {ValueDescr::ANY}, boolean(), - codegen::SimpleUnary, registry, + codegen::SimpleUnary, registry, NullHandling::OUTPUT_NOT_NULL, MemAllocation::PREALLOCATE); } From 6f31cd25889aefc8c445092afcc0501d6acb0bca Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 16 Jun 2020 13:28:59 -0400 Subject: [PATCH 11/13] review comments --- .../arrow/compute/kernels/scalar_validity.cc | 19 ++++++----------- cpp/src/arrow/compute/kernels/test_util.cc | 21 ++++++++++++++----- cpp/src/arrow/testing/random.cc | 1 - 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 338e3836c3d..5f588c7cffa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -68,12 +68,6 @@ struct IsNullOperator { out->buffers[1]->mutable_data(), out->offset); } }; -} // namespace - -namespace codegen { -namespace { - -using arrow::compute::codegen::SimpleUnary; void MakeFunction(std::string name, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, @@ -91,18 +85,17 @@ void MakeFunction(std::string name, std::vector in_types, OutputType } } // namespace -} // namespace codegen namespace internal { void RegisterScalarValidity(FunctionRegistry* registry) { - codegen::MakeFunction("is_valid", {ValueDescr::ANY}, boolean(), - codegen::SimpleUnary, registry, - NullHandling::OUTPUT_NOT_NULL, MemAllocation::NO_PREALLOCATE); + MakeFunction("is_valid", {ValueDescr::ANY}, boolean(), + codegen::SimpleUnary, registry, + NullHandling::OUTPUT_NOT_NULL, MemAllocation::NO_PREALLOCATE); - codegen::MakeFunction("is_null", {ValueDescr::ANY}, boolean(), - codegen::SimpleUnary, registry, - NullHandling::OUTPUT_NOT_NULL, MemAllocation::PREALLOCATE); + MakeFunction("is_null", {ValueDescr::ANY}, boolean(), + codegen::SimpleUnary, registry, + NullHandling::OUTPUT_NOT_NULL, MemAllocation::PREALLOCATE); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/test_util.cc b/cpp/src/arrow/compute/kernels/test_util.cc index bad5b848d21..15c80234406 100644 --- a/cpp/src/arrow/compute/kernels/test_util.cc +++ b/cpp/src/arrow/compute/kernels/test_util.cc @@ -30,11 +30,9 @@ namespace arrow { namespace compute { -void CheckScalarUnary(std::string func_name, std::shared_ptr in_ty, - std::string json_input, std::shared_ptr out_ty, - std::string json_expected, const FunctionOptions* options) { - auto input = ArrayFromJSON(in_ty, json_input); - auto expected = ArrayFromJSON(out_ty, json_expected); +void CheckScalarUnary(std::string func_name, std::shared_ptr input, + std::shared_ptr expected, const FunctionOptions* options, + bool slices = true) { ASSERT_OK_AND_ASSIGN(Datum out, CallFunction(func_name, {input}, options)); AssertArraysEqual(*expected, *out.make_array(), /*verbose=*/true); @@ -44,6 +42,19 @@ void CheckScalarUnary(std::string func_name, std::shared_ptr in_ty, ASSERT_OK_AND_ASSIGN(auto ex_val, expected->GetScalar(i)); CheckScalarUnary(func_name, val, ex_val, options); } + + if (slices) { + auto length = input->length() / 3; + CheckScalarUnary(func_name, input->Slice(length, length), + expected->Slice(length, length), options, /*slices=*/false); + } +} + +void CheckScalarUnary(std::string func_name, std::shared_ptr in_ty, + std::string json_input, std::shared_ptr out_ty, + std::string json_expected, const FunctionOptions* options) { + CheckScalarUnary(func_name, ArrayFromJSON(in_ty, json_input), + ArrayFromJSON(out_ty, json_expected), options, /*slices=*/true); } void CheckScalarUnary(std::string func_name, std::shared_ptr input, diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index d4db44d9f17..6146cb8d002 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -264,7 +264,6 @@ std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first struct RandomArrayGeneratorOfImpl { Status Visit(const NullType&) { - DCHECK_NE(null_probability_, 0.0); out_ = std::make_shared(size_); return Status::OK(); } From 5bd105718849b12c86b3be0eadd0021e56f31a26 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 16 Jun 2020 15:38:47 -0400 Subject: [PATCH 12/13] use zero copy with buffer slices --- .../arrow/compute/kernels/scalar_validity.cc | 22 ++++++++------- cpp/src/arrow/compute/kernels/test_util.cc | 27 ++++++++++++++----- cpp/src/arrow/testing/gtest_util.cc | 21 ++++++++++++--- cpp/src/arrow/testing/gtest_util.h | 3 ++- 4 files changed, 54 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 5f588c7cffa..48abe6f660f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -34,9 +34,13 @@ struct IsValidOperator { } static void Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { - if (arr.buffers[0] != nullptr && out->offset == arr.offset && - out->length == arr.length) { - out->buffers[1] = arr.buffers[0]; + DCHECK_EQ(out->offset, 0); + DCHECK_LE(out->length, arr.length); + if (arr.buffers[0] != nullptr) { + out->buffers[1] = arr.offset == 0 + ? arr.buffers[0] + : SliceBuffer(arr.buffers[0], arr.offset / 8, arr.length / 8); + out->offset = arr.offset % 8; return; } @@ -71,13 +75,13 @@ struct IsNullOperator { void MakeFunction(std::string name, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - NullHandling::type null_handling, MemAllocation::type mem_allocation) { + MemAllocation::type mem_allocation, bool can_write_into_slices) { Arity arity{static_cast(in_types.size())}; auto func = std::make_shared(name, arity); ScalarKernel kernel(std::move(in_types), out_type, exec); - kernel.null_handling = null_handling; - kernel.can_write_into_slices = true; + kernel.null_handling = NullHandling::OUTPUT_NOT_NULL; + kernel.can_write_into_slices = can_write_into_slices; kernel.mem_allocation = mem_allocation; DCHECK_OK(func->AddKernel(std::move(kernel))); @@ -91,11 +95,11 @@ namespace internal { void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", {ValueDescr::ANY}, boolean(), codegen::SimpleUnary, registry, - NullHandling::OUTPUT_NOT_NULL, MemAllocation::NO_PREALLOCATE); + MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); MakeFunction("is_null", {ValueDescr::ANY}, boolean(), - codegen::SimpleUnary, registry, - NullHandling::OUTPUT_NOT_NULL, MemAllocation::PREALLOCATE); + codegen::SimpleUnary, registry, MemAllocation::PREALLOCATE, + /*can_write_into_slices=*/true); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/test_util.cc b/cpp/src/arrow/compute/kernels/test_util.cc index 15c80234406..4ac12299fdf 100644 --- a/cpp/src/arrow/compute/kernels/test_util.cc +++ b/cpp/src/arrow/compute/kernels/test_util.cc @@ -25,14 +25,14 @@ #include "arrow/compute/exec.h" #include "arrow/datum.h" #include "arrow/result.h" +#include "arrow/table.h" #include "arrow/testing/gtest_util.h" namespace arrow { namespace compute { void CheckScalarUnary(std::string func_name, std::shared_ptr input, - std::shared_ptr expected, const FunctionOptions* options, - bool slices = true) { + std::shared_ptr expected, const FunctionOptions* options) { ASSERT_OK_AND_ASSIGN(Datum out, CallFunction(func_name, {input}, options)); AssertArraysEqual(*expected, *out.make_array(), /*verbose=*/true); @@ -43,10 +43,25 @@ void CheckScalarUnary(std::string func_name, std::shared_ptr input, CheckScalarUnary(func_name, val, ex_val, options); } - if (slices) { - auto length = input->length() / 3; + if (auto length = input->length() / 3) { + CheckScalarUnary(func_name, input->Slice(0, length), expected->Slice(0, length), + options); + CheckScalarUnary(func_name, input->Slice(length, length), - expected->Slice(length, length), options, /*slices=*/false); + expected->Slice(length, length), options); + + CheckScalarUnary(func_name, input->Slice(2 * length), expected->Slice(2 * length), + options); + } + + if (auto length = input->length() / 3) { + ArrayVector input_chunks{input->Slice(0, length), input->Slice(length)}, + expected_chunks{expected->Slice(0, 2 * length), expected->Slice(2 * length)}; + + ASSERT_OK_AND_ASSIGN( + Datum out, + CallFunction(func_name, {std::make_shared(input_chunks)}, options)); + AssertDatumsEqual(std::make_shared(expected_chunks), out); } } @@ -54,7 +69,7 @@ void CheckScalarUnary(std::string func_name, std::shared_ptr in_ty, std::string json_input, std::shared_ptr out_ty, std::string json_expected, const FunctionOptions* options) { CheckScalarUnary(func_name, ArrayFromJSON(in_ty, json_input), - ArrayFromJSON(out_ty, json_expected), options, /*slices=*/true); + ArrayFromJSON(out_ty, json_expected), options); } void CheckScalarUnary(std::string func_name, std::shared_ptr input, diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 280a6dd56a2..f601e578a26 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -254,9 +254,24 @@ ASSERT_EQUAL_IMPL(Field, Field, "fields") ASSERT_EQUAL_IMPL(Schema, Schema, "schemas") #undef ASSERT_EQUAL_IMPL -void AssertDatumsEqual(const Datum& expected, const Datum& actual) { - // TODO: Implement better print - ASSERT_TRUE(actual.Equals(expected)); +void AssertDatumsEqual(const Datum& expected, const Datum& actual, bool verbose) { + ASSERT_EQ(expected.kind(), actual.kind()) + << "expected:" << expected.ToString() << " got:" << actual.ToString(); + + switch (expected.kind()) { + case Datum::SCALAR: + AssertScalarsEqual(*expected.scalar(), *actual.scalar(), verbose); + case Datum::ARRAY: { + auto expected_array = expected.make_array(); + auto actual_array = actual.make_array(); + AssertArraysEqual(*expected_array, *actual_array, verbose); + } + case Datum::CHUNKED_ARRAY: + AssertChunkedEquivalent(*expected.chunked_array(), *actual.chunked_array()); + default: + // TODO: Implement better print + ASSERT_TRUE(actual.Equals(expected)); + } } std::shared_ptr ArrayFromJSON(const std::shared_ptr& type, diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 32c338ab538..89e870da9d2 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -215,7 +215,8 @@ ARROW_EXPORT void AssertSchemaNotEqual(const std::shared_ptr& lhs, ARROW_EXPORT void AssertTablesEqual(const Table& expected, const Table& actual, bool same_chunk_layout = true, bool flatten = false); -ARROW_EXPORT void AssertDatumsEqual(const Datum& expected, const Datum& actual); +ARROW_EXPORT void AssertDatumsEqual(const Datum& expected, const Datum& actual, + bool verbose = false); template void AssertNumericDataEqual(const C_TYPE* raw_data, From be7a03f8fffc67219af2a51f3403ac455d765709 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 16 Jun 2020 16:15:01 -0500 Subject: [PATCH 13/13] Disable AsciiUpper/AsciiLower unit tests until ARROW-9122 done, add missing break statements --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 4 ++-- cpp/src/arrow/testing/gtest_util.cc | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 209ad9408f5..d2a75ecb92a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -56,12 +56,12 @@ TYPED_TEST(TestStringKernels, AsciiLength) { "[3, null, 0, 1]"); } -TYPED_TEST(TestStringKernels, AsciiUpper) { +TYPED_TEST(TestStringKernels, DISABLED_AsciiUpper) { this->CheckUnary("ascii_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->string_type(), "[\"AAAZZæÆ&\", null, \"\", \"B\"]"); } -TYPED_TEST(TestStringKernels, AsciiLower) { +TYPED_TEST(TestStringKernels, DISABLED_AsciiLower) { this->CheckUnary("ascii_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->string_type(), "[\"aaazzæÆ&\", null, \"\", \"b\"]"); } diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index f601e578a26..93d157965b2 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -261,16 +261,19 @@ void AssertDatumsEqual(const Datum& expected, const Datum& actual, bool verbose) switch (expected.kind()) { case Datum::SCALAR: AssertScalarsEqual(*expected.scalar(), *actual.scalar(), verbose); + break; case Datum::ARRAY: { auto expected_array = expected.make_array(); auto actual_array = actual.make_array(); AssertArraysEqual(*expected_array, *actual_array, verbose); - } + } break; case Datum::CHUNKED_ARRAY: AssertChunkedEquivalent(*expected.chunked_array(), *actual.chunked_array()); + break; default: // TODO: Implement better print ASSERT_TRUE(actual.Equals(expected)); + break; } }