From 1610207c1821c99ccf5f9f0c23776753137ba87d Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Mon, 21 Sep 2020 08:05:32 +0000 Subject: [PATCH 1/6] ARROW-10070: [C++][Compute] Implement stdev aggregate kernel --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/compute/api_aggregate.cc | 4 + cpp/src/arrow/compute/api_aggregate.h | 11 ++ .../arrow/compute/kernels/aggregate_basic.cc | 1 + .../kernels/aggregate_basic_internal.h | 1 + .../arrow/compute/kernels/aggregate_stdev.cc | 165 ++++++++++++++++++ .../arrow/compute/kernels/aggregate_test.cc | 101 +++++++++++ docs/source/cpp/compute.rst | 2 + 8 files changed, 286 insertions(+) create mode 100644 cpp/src/arrow/compute/kernels/aggregate_stdev.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f40fa3798b4..758ad77a13c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -365,6 +365,7 @@ if(ARROW_COMPUTE) compute/registry.cc compute/kernels/aggregate_basic.cc compute/kernels/aggregate_mode.cc + compute/kernels/aggregate_stdev.cc compute/kernels/codegen_internal.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 2802b02105d..4794cf1f070 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -45,5 +45,9 @@ Result Mode(const Datum& value, ExecContext* ctx) { return CallFunction("mode", {value}, ctx); } +Result Stdev(const Datum& value, ExecContext* ctx) { + return CallFunction("stdev", {value}, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 9528ce3477b..947900b919d 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -145,5 +145,16 @@ Result MinMax(const Datum& value, ARROW_EXPORT Result Mode(const Datum& value, ExecContext* ctx = NULLPTR); +/// \brief Calculate the population standard deviation of a numeric array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed stdev as a DoubleScalar +/// +/// \since 2.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Stdev(const Datum& value, ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 94914d00f10..7451e5c987c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -241,6 +241,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); DCHECK_OK(registry->AddFunction(aggregate::AddModeAggKernels())); + DCHECK_OK(registry->AddFunction(aggregate::AddStdevAggKernels())); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index cd8390e1adf..91a5ec0e205 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -59,6 +59,7 @@ void AddMeanAvx512AggKernels(ScalarAggregateFunction* func); void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func); std::shared_ptr AddModeAggKernels(); +std::shared_ptr AddStdevAggKernels(); // ---------------------------------------------------------------------- // Sum implementation diff --git a/cpp/src/arrow/compute/kernels/aggregate_stdev.cc b/cpp/src/arrow/compute/kernels/aggregate_stdev.cc new file mode 100644 index 00000000000..148a019be02 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate_stdev.cc @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/kernels/aggregate_basic_internal.h" + +namespace arrow { +namespace compute { +namespace aggregate { + +namespace { + +// Based on https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +template +struct StdevState { + using ArrayType = typename TypeTraits::ArrayType; + using c_type = typename ArrowType::c_type; + using ThisType = StdevState; + + // Calculate stdev of one chunk with two pass algorithm + // Always use `double` to calculate stdev for any array type + void Consume(const ArrayType& array) { + int64_t count = array.length() - array.null_count(); + if (count == 0) { + return; + } + + double sum = 0; + VisitArrayDataInline( + *array.data(), [&sum](c_type value) { sum += static_cast(value); }, + []() {}); + + double mean = sum / count, m2 = 0; + VisitArrayDataInline( + *array.data(), + [mean, &m2](c_type value) { + double v = static_cast(value); + m2 += (v - mean) * (v - mean); + }, + []() {}); + + this->count = count; + this->sum = sum; + this->m2 = m2; + } + + // Combine stdev from two chunks + void MergeFrom(const ThisType& state) { + if (state.count == 0) { + return; + } + if (this->count == 0) { + this->count = state.count; + this->sum = state.sum; + this->m2 = state.m2; + return; + } + double delta = this->sum / this->count - state.sum / state.count; + this->m2 += state.m2 + + delta * delta * this->count * state.count / (this->count + state.count); + this->count += state.count; + this->sum += state.sum; + } + + int64_t count = 0; + double sum = 0; + double m2 = 0; // sum((X-mean)^2) +}; + +template +struct StdevImpl : public ScalarAggregator { + using ThisType = StdevImpl; + using ArrayType = typename TypeTraits::ArrayType; + + explicit StdevImpl(const std::shared_ptr& out_type) : out_type(out_type) {} + + void Consume(KernelContext*, const ExecBatch& batch) override { + ArrayType array(batch[0].array()); + this->state.Consume(array); + } + + void MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + this->state.MergeFrom(other.state); + } + + void Finalize(KernelContext*, Datum* out) override { + if (this->state.count == 0) { + out->value = std::make_shared(); + } else { + double stdev = sqrt(this->state.m2 / this->state.count); + out->value = std::make_shared(stdev); + } + } + + std::shared_ptr out_type; + StdevState state; +}; + +struct StdevInitState { + std::unique_ptr state; + KernelContext* ctx; + const DataType& in_type; + const std::shared_ptr& out_type; + + StdevInitState(KernelContext* ctx, const DataType& in_type, + const std::shared_ptr& out_type) + : ctx(ctx), in_type(in_type), out_type(out_type) {} + + Status Visit(const DataType&) { return Status::NotImplemented("No stdev implemented"); } + + Status Visit(const HalfFloatType&) { + return Status::NotImplemented("No stdev implemented"); + } + + template + enable_if_t::value, Status> Visit(const Type&) { + state.reset(new StdevImpl(out_type)); + return Status::OK(); + } + + std::unique_ptr Create() { + ctx->SetStatus(VisitTypeInline(in_type, this)); + return std::move(state); + } +}; + +std::unique_ptr StdevInit(KernelContext* ctx, const KernelInitArgs& args) { + StdevInitState visitor(ctx, *args.inputs[0].type, + args.kernel->signature->out_type().type()); + return visitor.Create(); +} + +void AddStdevKernels(KernelInit init, const std::vector>& types, + ScalarAggregateFunction* func) { + for (const auto& ty : types) { + auto sig = KernelSignature::Make({InputType::Array(ty)}, float64()); + AddAggKernel(std::move(sig), init, func); + } +} + +} // namespace + +std::shared_ptr AddStdevAggKernels() { + auto func = std::make_shared("stdev", Arity::Unary()); + AddStdevKernels(StdevInit, internal::NumericTypes(), func.get()); + return func; +} + +} // namespace aggregate +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index ee02d197ea2..2bf9acb2078 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -942,5 +942,106 @@ TEST_F(TestInt32ModeKernel, LargeValueRange) { CheckModeWithRange(-10000000, 10000000); } +// +// Stdev +// + +template +class TestPrimitiveStdevKernel : public ::testing::Test { + public: + using Traits = TypeTraits; + using ScalarType = typename TypeTraits::ScalarType; + + void AssertStdevIs(const Datum& array, double expected, double diff = 0) { + ASSERT_OK_AND_ASSIGN(Datum out, Stdev(array)); + auto value = checked_cast(out.scalar().get()); + ASSERT_TRUE(value->is_valid); + if (diff == 0) { + ASSERT_DOUBLE_EQ(value->value, expected); // |diff| < 4ULP + } else { + ASSERT_NEAR(value->value, expected, diff); + } + } + + void AssertStdevIs(const std::string& json, double expected) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertStdevIs(array, expected); + } + + void AssertStdevIs(const std::vector& json, double expected) { + auto chunked = ChunkedArrayFromJSON(type_singleton(), json); + AssertStdevIs(chunked, expected); + } + + void AssertStdevIsInvalid(const Datum& array) { + ASSERT_OK_AND_ASSIGN(Datum out, Stdev(array)); + auto value = checked_cast(out.scalar().get()); + ASSERT_FALSE(value->is_valid); + } + + void AssertStdevIsInvalid(const std::string& json) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertStdevIsInvalid(array); + } + + std::shared_ptr type_singleton() { return Traits::type_singleton(); } +}; + +template +class TestNumericStdevKernel : public TestPrimitiveStdevKernel {}; + +TYPED_TEST_SUITE(TestNumericStdevKernel, NumericArrowTypes); +TYPED_TEST(TestNumericStdevKernel, Basics) { + // Reference value from numpy.std + this->AssertStdevIs("[100]", 0); + this->AssertStdevIs("[1, 2, 3]", 0.816496580927726); + this->AssertStdevIs("[null, 1, 2, null, 3]", 0.816496580927726); + + this->AssertStdevIs({"[]", "[1]", "[2]", "[null]", "[3]"}, 0.816496580927726); + this->AssertStdevIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"}, 2.581988897471611); + + this->AssertStdevIsInvalid("[null, null, null]"); + this->AssertStdevIsInvalid("[]"); +} + +class TestStdevKernelStability : public TestPrimitiveStdevKernel {}; + +// Test numerical stability +TEST_F(TestStdevKernelStability, Basics) { + this->AssertStdevIs("[100000004, 100000007, 100000013, 100000016]", 4.743416490252569); + this->AssertStdevIs("[1000000004, 1000000007, 1000000013, 1000000016]", + 4.743416490252569); +} + +// Calculate reference stdev with welford online algorithm +double WelfordStdev(const Array& array) { + const auto& array_numeric = reinterpret_cast(array); + const auto values = array_numeric.raw_values(); + internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + double count = 0, mean = 0, m2 = 0; + for (int64_t i = 0; i < array.length(); ++i) { + if (reader.IsSet()) { + ++count; + double delta = values[i] - mean; + mean += delta / count; + double delta2 = values[i] - mean; + m2 += delta * delta2; + } + reader.Next(); + } + return sqrt(m2 / count); +} + +class TestStdevKernelRandom : public TestPrimitiveStdevKernel {}; + +TEST_F(TestStdevKernelRandom, Basics) { + auto rand = random::RandomArrayGenerator(0x5487656); + auto array = rand.Numeric(40000, -10000.0, 1000000.0, 0.1); + auto chunked = *ChunkedArray::Make( + {array->Slice(0, 1000), array->Slice(1000, 9000), array->Slice(10000, 30000)}); + double expected = WelfordStdev(*array); + this->AssertStdevIs(chunked, expected, 0.0000001); +} + } // namespace compute } // namespace arrow diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 3a0abe39a38..5d52bd946f8 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -140,6 +140,8 @@ Aggregations +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | mode | Unary | Numeric | Scalar Struct (2) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| stdev | Unary | Numeric | Scalar Float64 | | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | sum | Unary | Numeric | Scalar Numeric (3) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ From 12f72a532f6db9b3cc60b2f74c79521700e8a980 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Sun, 27 Sep 2020 02:14:52 +0000 Subject: [PATCH 2/6] Add options to control ddof --- cpp/src/arrow/compute/api_aggregate.cc | 4 +- cpp/src/arrow/compute/api_aggregate.h | 19 +++++- .../arrow/compute/kernels/aggregate_stdev.cc | 23 ++++--- .../arrow/compute/kernels/aggregate_test.cc | 61 ++++++++++++------- docs/source/cpp/compute.rst | 2 +- 5 files changed, 74 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 4794cf1f070..d7c6a5216b2 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -45,8 +45,8 @@ Result Mode(const Datum& value, ExecContext* ctx) { return CallFunction("mode", {value}, ctx); } -Result Stdev(const Datum& value, ExecContext* ctx) { - return CallFunction("stdev", {value}, ctx); +Result Stdev(const Datum& value, const StdevOptions& options, ExecContext* ctx) { + return CallFunction("stdev", {value}, &options, ctx); } } // namespace compute diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 947900b919d..59e5989d040 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -76,6 +76,18 @@ struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { enum Mode null_handling = SKIP; }; +/// \brief Control Delta Degrees of Freedom (ddof) of Stdev kernel +/// +/// The divisor used in calculations is N - ddof, where N is the number of elements. +/// By default, ddof is zero, and population standard deviation is returned. +struct ARROW_EXPORT StdevOptions : public FunctionOptions { + explicit StdevOptions(int ddof = 0) : ddof(ddof) {} + + static StdevOptions Defaults() { return StdevOptions{}; } + + int ddof = 0; +}; + /// @} /// \brief Count non-null (or null) values in an array. @@ -145,16 +157,19 @@ Result MinMax(const Datum& value, ARROW_EXPORT Result Mode(const Datum& value, ExecContext* ctx = NULLPTR); -/// \brief Calculate the population standard deviation of a numeric array +/// \brief Calculate the standard deviation of a numeric array /// /// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see StdevOptions for more information /// \param[in] ctx the function execution context, optional /// \return datum of the computed stdev as a DoubleScalar /// /// \since 2.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Stdev(const Datum& value, ExecContext* ctx = NULLPTR); +Result Stdev(const Datum& value, + const StdevOptions& options = StdevOptions::Defaults(), + ExecContext* ctx = NULLPTR); } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_stdev.cc b/cpp/src/arrow/compute/kernels/aggregate_stdev.cc index 148a019be02..23af61386c1 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_stdev.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_stdev.cc @@ -85,7 +85,9 @@ struct StdevImpl : public ScalarAggregator { using ThisType = StdevImpl; using ArrayType = typename TypeTraits::ArrayType; - explicit StdevImpl(const std::shared_ptr& out_type) : out_type(out_type) {} + explicit StdevImpl(const std::shared_ptr& out_type, + const StdevOptions& options) + : out_type(out_type), options(options) {} void Consume(KernelContext*, const ExecBatch& batch) override { ArrayType array(batch[0].array()); @@ -98,16 +100,17 @@ struct StdevImpl : public ScalarAggregator { } void Finalize(KernelContext*, Datum* out) override { - if (this->state.count == 0) { + if (this->state.count <= options.ddof) { out->value = std::make_shared(); } else { - double stdev = sqrt(this->state.m2 / this->state.count); + double stdev = sqrt(this->state.m2 / (this->state.count - options.ddof)); out->value = std::make_shared(stdev); } } std::shared_ptr out_type; StdevState state; + StdevOptions options; }; struct StdevInitState { @@ -115,10 +118,11 @@ struct StdevInitState { KernelContext* ctx; const DataType& in_type; const std::shared_ptr& out_type; + const StdevOptions& options; StdevInitState(KernelContext* ctx, const DataType& in_type, - const std::shared_ptr& out_type) - : ctx(ctx), in_type(in_type), out_type(out_type) {} + const std::shared_ptr& out_type, const StdevOptions& options) + : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {} Status Visit(const DataType&) { return Status::NotImplemented("No stdev implemented"); } @@ -128,7 +132,7 @@ struct StdevInitState { template enable_if_t::value, Status> Visit(const Type&) { - state.reset(new StdevImpl(out_type)); + state.reset(new StdevImpl(out_type, options)); return Status::OK(); } @@ -140,7 +144,8 @@ struct StdevInitState { std::unique_ptr StdevInit(KernelContext* ctx, const KernelInitArgs& args) { StdevInitState visitor(ctx, *args.inputs[0].type, - args.kernel->signature->out_type().type()); + args.kernel->signature->out_type().type(), + static_cast(*args.options)); return visitor.Create(); } @@ -155,7 +160,9 @@ void AddStdevKernels(KernelInit init, const std::vector AddStdevAggKernels() { - auto func = std::make_shared("stdev", Arity::Unary()); + static auto default_stdev_options = StdevOptions::Defaults(); + auto func = std::make_shared("stdev", Arity::Unary(), + &default_stdev_options); AddStdevKernels(StdevInit, internal::NumericTypes(), func.get()); return func; } diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 2bf9acb2078..34b1db535ab 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -952,36 +952,39 @@ class TestPrimitiveStdevKernel : public ::testing::Test { using Traits = TypeTraits; using ScalarType = typename TypeTraits::ScalarType; - void AssertStdevIs(const Datum& array, double expected, double diff = 0) { - ASSERT_OK_AND_ASSIGN(Datum out, Stdev(array)); + void AssertStdevIs(const Datum& array, const StdevOptions& options, double expected, + double diff = 0) { + ASSERT_OK_AND_ASSIGN(Datum out, Stdev(array, options)); auto value = checked_cast(out.scalar().get()); ASSERT_TRUE(value->is_valid); if (diff == 0) { - ASSERT_DOUBLE_EQ(value->value, expected); // |diff| < 4ULP + ASSERT_DOUBLE_EQ(value->value, expected); // < 4ULP } else { ASSERT_NEAR(value->value, expected, diff); } } - void AssertStdevIs(const std::string& json, double expected) { + void AssertStdevIs(const std::string& json, const StdevOptions& options, + double expected) { auto array = ArrayFromJSON(type_singleton(), json); - AssertStdevIs(array, expected); + AssertStdevIs(array, options, expected); } - void AssertStdevIs(const std::vector& json, double expected) { + void AssertStdevIs(const std::vector& json, const StdevOptions& options, + double expected) { auto chunked = ChunkedArrayFromJSON(type_singleton(), json); - AssertStdevIs(chunked, expected); + AssertStdevIs(chunked, options, expected); } - void AssertStdevIsInvalid(const Datum& array) { - ASSERT_OK_AND_ASSIGN(Datum out, Stdev(array)); + void AssertStdevIsInvalid(const Datum& array, const StdevOptions& options) { + ASSERT_OK_AND_ASSIGN(Datum out, Stdev(array, options)); auto value = checked_cast(out.scalar().get()); ASSERT_FALSE(value->is_valid); } - void AssertStdevIsInvalid(const std::string& json) { + void AssertStdevIsInvalid(const std::string& json, const StdevOptions& options) { auto array = ArrayFromJSON(type_singleton(), json); - AssertStdevIsInvalid(array); + AssertStdevIsInvalid(array, options); } std::shared_ptr type_singleton() { return Traits::type_singleton(); } @@ -990,26 +993,38 @@ class TestPrimitiveStdevKernel : public ::testing::Test { template class TestNumericStdevKernel : public TestPrimitiveStdevKernel {}; +// Reference value from numpy.std TYPED_TEST_SUITE(TestNumericStdevKernel, NumericArrowTypes); TYPED_TEST(TestNumericStdevKernel, Basics) { - // Reference value from numpy.std - this->AssertStdevIs("[100]", 0); - this->AssertStdevIs("[1, 2, 3]", 0.816496580927726); - this->AssertStdevIs("[null, 1, 2, null, 3]", 0.816496580927726); + StdevOptions options; // ddof = 0, population stdev - this->AssertStdevIs({"[]", "[1]", "[2]", "[null]", "[3]"}, 0.816496580927726); - this->AssertStdevIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"}, 2.581988897471611); + this->AssertStdevIs("[100]", options, 0); + this->AssertStdevIs("[1, 2, 3]", options, 0.816496580927726); + this->AssertStdevIs("[null, 1, 2, null, 3]", options, 0.816496580927726); - this->AssertStdevIsInvalid("[null, null, null]"); - this->AssertStdevIsInvalid("[]"); + this->AssertStdevIs({"[]", "[1]", "[2]", "[null]", "[3]"}, options, 0.816496580927726); + this->AssertStdevIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 2.29128784747792); + + this->AssertStdevIsInvalid("[null, null, null]", options); + this->AssertStdevIsInvalid("[]", options); + + options.ddof = 1; // sample stdev + + this->AssertStdevIs("[1, 2]", options, 0.7071067811865476); + this->AssertStdevIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 2.449489742783178); + + this->AssertStdevIsInvalid("[100]", options); + this->AssertStdevIsInvalid("[100, null, null]", options); } class TestStdevKernelStability : public TestPrimitiveStdevKernel {}; // Test numerical stability TEST_F(TestStdevKernelStability, Basics) { - this->AssertStdevIs("[100000004, 100000007, 100000013, 100000016]", 4.743416490252569); - this->AssertStdevIs("[1000000004, 1000000007, 1000000013, 1000000016]", + StdevOptions options; + this->AssertStdevIs("[100000004, 100000007, 100000013, 100000016]", options, + 4.743416490252569); + this->AssertStdevIs("[1000000004, 1000000007, 1000000013, 1000000016]", options, 4.743416490252569); } @@ -1040,7 +1055,9 @@ TEST_F(TestStdevKernelRandom, Basics) { auto chunked = *ChunkedArray::Make( {array->Slice(0, 1000), array->Slice(1000, 9000), array->Slice(10000, 30000)}); double expected = WelfordStdev(*array); - this->AssertStdevIs(chunked, expected, 0.0000001); + + StdevOptions options; + this->AssertStdevIs(chunked, options, expected, 0.0000001); } } // namespace compute diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 5d52bd946f8..7b9179705d6 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -140,7 +140,7 @@ Aggregations +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | mode | Unary | Numeric | Scalar Struct (2) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ -| stdev | Unary | Numeric | Scalar Float64 | | +| stdev | Unary | Numeric | Scalar Float64 | :struct:`StdevOptions` | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | sum | Unary | Numeric | Scalar Numeric (3) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ From 43318f8c33f32ece4efe9b7c2af2b2095d409860 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Tue, 29 Sep 2020 04:41:43 +0000 Subject: [PATCH 3/6] Add variance kernel --- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/compute/api_aggregate.cc | 8 +- cpp/src/arrow/compute/api_aggregate.h | 32 +++-- .../arrow/compute/kernels/aggregate_basic.cc | 3 +- .../kernels/aggregate_basic_internal.h | 3 +- .../arrow/compute/kernels/aggregate_test.cc | 112 +++++++++--------- ...ggregate_stdev.cc => aggregate_var_std.cc} | 98 +++++++++------ docs/source/cpp/compute.rst | 4 +- 8 files changed, 158 insertions(+), 104 deletions(-) rename cpp/src/arrow/compute/kernels/{aggregate_stdev.cc => aggregate_var_std.cc} (56%) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 758ad77a13c..f3aae96a80d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -365,7 +365,7 @@ if(ARROW_COMPUTE) compute/registry.cc compute/kernels/aggregate_basic.cc compute/kernels/aggregate_mode.cc - compute/kernels/aggregate_stdev.cc + compute/kernels/aggregate_var_std.cc compute/kernels/codegen_internal.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index d7c6a5216b2..084835af23c 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -45,8 +45,12 @@ Result Mode(const Datum& value, ExecContext* ctx) { return CallFunction("mode", {value}, ctx); } -Result Stdev(const Datum& value, const StdevOptions& options, ExecContext* ctx) { - return CallFunction("stdev", {value}, &options, ctx); +Result Std(const Datum& value, const VarStdOptions& options, ExecContext* ctx) { + return CallFunction("std", {value}, &options, ctx); +} + +Result Var(const Datum& value, const VarStdOptions& options, ExecContext* ctx) { + return CallFunction("var", {value}, &options, ctx); } } // namespace compute diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 59e5989d040..f76c882c589 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -76,14 +76,14 @@ struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { enum Mode null_handling = SKIP; }; -/// \brief Control Delta Degrees of Freedom (ddof) of Stdev kernel +/// \brief Control Delta Degrees of Freedom (ddof) of Var and Std kernel /// /// The divisor used in calculations is N - ddof, where N is the number of elements. /// By default, ddof is zero, and population standard deviation is returned. -struct ARROW_EXPORT StdevOptions : public FunctionOptions { - explicit StdevOptions(int ddof = 0) : ddof(ddof) {} +struct ARROW_EXPORT VarStdOptions : public FunctionOptions { + explicit VarStdOptions(int ddof = 0) : ddof(ddof) {} - static StdevOptions Defaults() { return StdevOptions{}; } + static VarStdOptions Defaults() { return VarStdOptions{}; } int ddof = 0; }; @@ -160,16 +160,30 @@ Result Mode(const Datum& value, ExecContext* ctx = NULLPTR); /// \brief Calculate the standard deviation of a numeric array /// /// \param[in] value input datum, expecting Array or ChunkedArray -/// \param[in] options see StdevOptions for more information +/// \param[in] options see VarStdOptions for more information /// \param[in] ctx the function execution context, optional -/// \return datum of the computed stdev as a DoubleScalar +/// \return datum of the computed standard deviation as a DoubleScalar /// /// \since 2.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Stdev(const Datum& value, - const StdevOptions& options = StdevOptions::Defaults(), - ExecContext* ctx = NULLPTR); +Result Std(const Datum& value, + const VarStdOptions& options = VarStdOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the variance of a numeric array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see VarStdOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed variance as a DoubleScalar +/// +/// \since 2.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Var(const Datum& value, + const VarStdOptions& options = VarStdOptions::Defaults(), + ExecContext* ctx = NULLPTR); } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 7451e5c987c..f7148f1dd28 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -241,7 +241,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); DCHECK_OK(registry->AddFunction(aggregate::AddModeAggKernels())); - DCHECK_OK(registry->AddFunction(aggregate::AddStdevAggKernels())); + DCHECK_OK(registry->AddFunction(aggregate::AddStdAggKernels())); + DCHECK_OK(registry->AddFunction(aggregate::AddVarAggKernels())); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 91a5ec0e205..81abf6ec2a5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -59,7 +59,8 @@ void AddMeanAvx512AggKernels(ScalarAggregateFunction* func); void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func); std::shared_ptr AddModeAggKernels(); -std::shared_ptr AddStdevAggKernels(); +std::shared_ptr AddStdAggKernels(); +std::shared_ptr AddVarAggKernels(); // ---------------------------------------------------------------------- // Sum implementation diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 34b1db535ab..5af715643f5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -943,93 +943,97 @@ TEST_F(TestInt32ModeKernel, LargeValueRange) { } // -// Stdev +// Var/Std // template -class TestPrimitiveStdevKernel : public ::testing::Test { +class TestPrimitiveVarStdKernel : public ::testing::Test { public: using Traits = TypeTraits; using ScalarType = typename TypeTraits::ScalarType; - void AssertStdevIs(const Datum& array, const StdevOptions& options, double expected, - double diff = 0) { - ASSERT_OK_AND_ASSIGN(Datum out, Stdev(array, options)); - auto value = checked_cast(out.scalar().get()); - ASSERT_TRUE(value->is_valid); + void AssertVarStdIs(const Datum& array, const VarStdOptions& options, + double expected_var, double diff = 0) { + ASSERT_OK_AND_ASSIGN(Datum out_var, Var(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_std, Std(array, options)); + auto var = checked_cast(out_var.scalar().get()); + auto std = checked_cast(out_std.scalar().get()); + ASSERT_TRUE(var->is_valid && std->is_valid); + ASSERT_DOUBLE_EQ(std->value * std->value, var->value); if (diff == 0) { - ASSERT_DOUBLE_EQ(value->value, expected); // < 4ULP + ASSERT_DOUBLE_EQ(var->value, expected_var); // < 4ULP } else { - ASSERT_NEAR(value->value, expected, diff); + ASSERT_NEAR(var->value, expected_var, diff); } } - void AssertStdevIs(const std::string& json, const StdevOptions& options, - double expected) { + void AssertVarStdIs(const std::string& json, const VarStdOptions& options, + double expected_var) { auto array = ArrayFromJSON(type_singleton(), json); - AssertStdevIs(array, options, expected); + AssertVarStdIs(array, options, expected_var); } - void AssertStdevIs(const std::vector& json, const StdevOptions& options, - double expected) { + void AssertVarStdIs(const std::vector& json, const VarStdOptions& options, + double expected_var) { auto chunked = ChunkedArrayFromJSON(type_singleton(), json); - AssertStdevIs(chunked, options, expected); + AssertVarStdIs(chunked, options, expected_var); } - void AssertStdevIsInvalid(const Datum& array, const StdevOptions& options) { - ASSERT_OK_AND_ASSIGN(Datum out, Stdev(array, options)); - auto value = checked_cast(out.scalar().get()); - ASSERT_FALSE(value->is_valid); + void AssertVarStdIsInvalid(const Datum& array, const VarStdOptions& options) { + ASSERT_OK_AND_ASSIGN(Datum out_var, Var(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_std, Std(array, options)); + auto var = checked_cast(out_var.scalar().get()); + auto std = checked_cast(out_std.scalar().get()); + ASSERT_FALSE(var->is_valid || std->is_valid); } - void AssertStdevIsInvalid(const std::string& json, const StdevOptions& options) { + void AssertVarStdIsInvalid(const std::string& json, const VarStdOptions& options) { auto array = ArrayFromJSON(type_singleton(), json); - AssertStdevIsInvalid(array, options); + AssertVarStdIsInvalid(array, options); } std::shared_ptr type_singleton() { return Traits::type_singleton(); } }; template -class TestNumericStdevKernel : public TestPrimitiveStdevKernel {}; +class TestNumericVarStdKernel : public TestPrimitiveVarStdKernel {}; -// Reference value from numpy.std -TYPED_TEST_SUITE(TestNumericStdevKernel, NumericArrowTypes); -TYPED_TEST(TestNumericStdevKernel, Basics) { - StdevOptions options; // ddof = 0, population stdev +// Reference value from numpy.var +TYPED_TEST_SUITE(TestNumericVarStdKernel, NumericArrowTypes); +TYPED_TEST(TestNumericVarStdKernel, Basics) { + VarStdOptions options; // ddof = 0, population var/std - this->AssertStdevIs("[100]", options, 0); - this->AssertStdevIs("[1, 2, 3]", options, 0.816496580927726); - this->AssertStdevIs("[null, 1, 2, null, 3]", options, 0.816496580927726); + this->AssertVarStdIs("[100]", options, 0); + this->AssertVarStdIs("[1, 2, 3]", options, 0.6666666666666666); + this->AssertVarStdIs("[null, 1, 2, null, 3]", options, 0.6666666666666666); - this->AssertStdevIs({"[]", "[1]", "[2]", "[null]", "[3]"}, options, 0.816496580927726); - this->AssertStdevIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 2.29128784747792); + this->AssertVarStdIs({"[]", "[1]", "[2]", "[null]", "[3]"}, options, + 0.6666666666666666); + this->AssertVarStdIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 5.25); - this->AssertStdevIsInvalid("[null, null, null]", options); - this->AssertStdevIsInvalid("[]", options); + this->AssertVarStdIsInvalid("[null, null, null]", options); + this->AssertVarStdIsInvalid("[]", options); - options.ddof = 1; // sample stdev + options.ddof = 1; // sample var/std - this->AssertStdevIs("[1, 2]", options, 0.7071067811865476); - this->AssertStdevIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 2.449489742783178); + this->AssertVarStdIs("[1, 2]", options, 0.5); + this->AssertVarStdIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 6.0); - this->AssertStdevIsInvalid("[100]", options); - this->AssertStdevIsInvalid("[100, null, null]", options); + this->AssertVarStdIsInvalid("[100]", options); + this->AssertVarStdIsInvalid("[100, null, null]", options); } -class TestStdevKernelStability : public TestPrimitiveStdevKernel {}; +class TestVarStdKernelStability : public TestPrimitiveVarStdKernel {}; // Test numerical stability -TEST_F(TestStdevKernelStability, Basics) { - StdevOptions options; - this->AssertStdevIs("[100000004, 100000007, 100000013, 100000016]", options, - 4.743416490252569); - this->AssertStdevIs("[1000000004, 1000000007, 1000000013, 1000000016]", options, - 4.743416490252569); +TEST_F(TestVarStdKernelStability, Basics) { + VarStdOptions options{1}; // ddof = 1 + this->AssertVarStdIs("[100000004, 100000007, 100000013, 100000016]", options, 30.0); + this->AssertVarStdIs("[1000000004, 1000000007, 1000000013, 1000000016]", options, 30.0); } -// Calculate reference stdev with welford online algorithm -double WelfordStdev(const Array& array) { +// Calculate reference variance with welford online algorithm +double WelfordVar(const Array& array) { const auto& array_numeric = reinterpret_cast(array); const auto values = array_numeric.raw_values(); internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); @@ -1044,20 +1048,20 @@ double WelfordStdev(const Array& array) { } reader.Next(); } - return sqrt(m2 / count); + return m2 / count; } -class TestStdevKernelRandom : public TestPrimitiveStdevKernel {}; +class TestVarStdKernelRandom : public TestPrimitiveVarStdKernel {}; -TEST_F(TestStdevKernelRandom, Basics) { +TEST_F(TestVarStdKernelRandom, Basics) { auto rand = random::RandomArrayGenerator(0x5487656); - auto array = rand.Numeric(40000, -10000.0, 1000000.0, 0.1); + auto array = rand.Numeric(40000, -10000.0, 100000.0, 0.1); auto chunked = *ChunkedArray::Make( {array->Slice(0, 1000), array->Slice(1000, 9000), array->Slice(10000, 30000)}); - double expected = WelfordStdev(*array); + double expected_var = WelfordVar(*array); - StdevOptions options; - this->AssertStdevIs(chunked, options, expected, 0.0000001); + VarStdOptions options; + this->AssertVarStdIs(chunked, options, expected_var, 0.0001); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/aggregate_stdev.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc similarity index 56% rename from cpp/src/arrow/compute/kernels/aggregate_stdev.cc rename to cpp/src/arrow/compute/kernels/aggregate_var_std.cc index 23af61386c1..f29b7233d7c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_stdev.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -25,13 +25,13 @@ namespace { // Based on https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance template -struct StdevState { +struct VarStdState { using ArrayType = typename TypeTraits::ArrayType; using c_type = typename ArrowType::c_type; - using ThisType = StdevState; + using ThisType = VarStdState; - // Calculate stdev of one chunk with two pass algorithm - // Always use `double` to calculate stdev for any array type + // Calculate variance of one chunk with `two pass algorithm` + // Always use `double` to calculate variance for any array type void Consume(const ArrayType& array) { int64_t count = array.length() - array.null_count(); if (count == 0) { @@ -57,7 +57,7 @@ struct StdevState { this->m2 = m2; } - // Combine stdev from two chunks + // Combine variance from two chunks void MergeFrom(const ThisType& state) { if (state.count == 0) { return; @@ -80,14 +80,16 @@ struct StdevState { double m2 = 0; // sum((X-mean)^2) }; +enum class VarOrStd : bool { Var, Std }; + template -struct StdevImpl : public ScalarAggregator { - using ThisType = StdevImpl; +struct VarStdImpl : public ScalarAggregator { + using ThisType = VarStdImpl; using ArrayType = typename TypeTraits::ArrayType; - explicit StdevImpl(const std::shared_ptr& out_type, - const StdevOptions& options) - : out_type(out_type), options(options) {} + explicit VarStdImpl(const std::shared_ptr& out_type, + const VarStdOptions& options, VarOrStd return_type) + : out_type(out_type), options(options), return_type(return_type) {} void Consume(KernelContext*, const ExecBatch& batch) override { ArrayType array(batch[0].array()); @@ -103,36 +105,46 @@ struct StdevImpl : public ScalarAggregator { if (this->state.count <= options.ddof) { out->value = std::make_shared(); } else { - double stdev = sqrt(this->state.m2 / (this->state.count - options.ddof)); - out->value = std::make_shared(stdev); + double var = this->state.m2 / (this->state.count - options.ddof); + out->value = + std::make_shared(return_type == VarOrStd::Var ? var : sqrt(var)); } } std::shared_ptr out_type; - StdevState state; - StdevOptions options; + VarStdState state; + VarStdOptions options; + VarOrStd return_type; }; -struct StdevInitState { +struct VarStdInitState { std::unique_ptr state; KernelContext* ctx; const DataType& in_type; const std::shared_ptr& out_type; - const StdevOptions& options; - - StdevInitState(KernelContext* ctx, const DataType& in_type, - const std::shared_ptr& out_type, const StdevOptions& options) - : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {} - - Status Visit(const DataType&) { return Status::NotImplemented("No stdev implemented"); } + const VarStdOptions& options; + VarOrStd return_type; + + VarStdInitState(KernelContext* ctx, const DataType& in_type, + const std::shared_ptr& out_type, const VarStdOptions& options, + VarOrStd return_type) + : ctx(ctx), + in_type(in_type), + out_type(out_type), + options(options), + return_type(return_type) {} + + Status Visit(const DataType&) { + return Status::NotImplemented("No var/std implemented"); + } Status Visit(const HalfFloatType&) { - return Status::NotImplemented("No stdev implemented"); + return Status::NotImplemented("No var/std implemented"); } template enable_if_t::value, Status> Visit(const Type&) { - state.reset(new StdevImpl(out_type, options)); + state.reset(new VarStdImpl(out_type, options, return_type)); return Status::OK(); } @@ -142,15 +154,23 @@ struct StdevInitState { } }; -std::unique_ptr StdevInit(KernelContext* ctx, const KernelInitArgs& args) { - StdevInitState visitor(ctx, *args.inputs[0].type, - args.kernel->signature->out_type().type(), - static_cast(*args.options)); +std::unique_ptr StdInit(KernelContext* ctx, const KernelInitArgs& args) { + VarStdInitState visitor( + ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + static_cast(*args.options), VarOrStd::Std); + return visitor.Create(); +} + +std::unique_ptr VarInit(KernelContext* ctx, const KernelInitArgs& args) { + VarStdInitState visitor( + ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + static_cast(*args.options), VarOrStd::Var); return visitor.Create(); } -void AddStdevKernels(KernelInit init, const std::vector>& types, - ScalarAggregateFunction* func) { +void AddVarStdKernels(KernelInit init, + const std::vector>& types, + ScalarAggregateFunction* func) { for (const auto& ty : types) { auto sig = KernelSignature::Make({InputType::Array(ty)}, float64()); AddAggKernel(std::move(sig), init, func); @@ -159,11 +179,19 @@ void AddStdevKernels(KernelInit init, const std::vector AddStdevAggKernels() { - static auto default_stdev_options = StdevOptions::Defaults(); - auto func = std::make_shared("stdev", Arity::Unary(), - &default_stdev_options); - AddStdevKernels(StdevInit, internal::NumericTypes(), func.get()); +std::shared_ptr AddStdAggKernels() { + static auto default_std_options = VarStdOptions::Defaults(); + auto func = std::make_shared("std", Arity::Unary(), + &default_std_options); + AddVarStdKernels(StdInit, internal::NumericTypes(), func.get()); + return func; +} + +std::shared_ptr AddVarAggKernels() { + static auto default_var_options = VarStdOptions::Defaults(); + auto func = std::make_shared("var", Arity::Unary(), + &default_var_options); + AddVarStdKernels(VarInit, internal::NumericTypes(), func.get()); return func; } diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 7b9179705d6..b91e2e62dce 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -140,10 +140,12 @@ Aggregations +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | mode | Unary | Numeric | Scalar Struct (2) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ -| stdev | Unary | Numeric | Scalar Float64 | :struct:`StdevOptions` | +| std | Unary | Numeric | Scalar Float64 | :struct:`VarStdOptions` | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | sum | Unary | Numeric | Scalar Numeric (3) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| var | Unary | Numeric | Scalar Float64 | :struct:`VarStdOptions` | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ Notes: From dbf7278aa5ade5346d80cd72df5db7126f136f58 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Wed, 30 Sep 2020 03:27:27 +0000 Subject: [PATCH 4/6] Refine naming and test case --- cpp/src/arrow/compute/api_aggregate.cc | 9 +- cpp/src/arrow/compute/api_aggregate.h | 16 +-- .../arrow/compute/kernels/aggregate_basic.cc | 4 +- .../kernels/aggregate_basic_internal.h | 4 +- .../arrow/compute/kernels/aggregate_test.cc | 115 ++++++++++++------ .../compute/kernels/aggregate_var_std.cc | 28 +++-- docs/source/cpp/compute.rst | 4 +- 7 files changed, 115 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 084835af23c..93155c2c29b 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -45,12 +45,13 @@ Result Mode(const Datum& value, ExecContext* ctx) { return CallFunction("mode", {value}, ctx); } -Result Std(const Datum& value, const VarStdOptions& options, ExecContext* ctx) { - return CallFunction("std", {value}, &options, ctx); +Result Stddev(const Datum& value, const VarStdOptions& options, ExecContext* ctx) { + return CallFunction("stddev", {value}, &options, ctx); } -Result Var(const Datum& value, const VarStdOptions& options, ExecContext* ctx) { - return CallFunction("var", {value}, &options, ctx); +Result Variance(const Datum& value, const VarStdOptions& options, + ExecContext* ctx) { + return CallFunction("variance", {value}, &options, ctx); } } // namespace compute diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index f76c882c589..11a8baa1515 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -76,10 +76,10 @@ struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { enum Mode null_handling = SKIP; }; -/// \brief Control Delta Degrees of Freedom (ddof) of Var and Std kernel +/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel /// /// The divisor used in calculations is N - ddof, where N is the number of elements. -/// By default, ddof is zero, and population standard deviation is returned. +/// By default, ddof is zero, and population variance or stddev is returned. struct ARROW_EXPORT VarStdOptions : public FunctionOptions { explicit VarStdOptions(int ddof = 0) : ddof(ddof) {} @@ -167,9 +167,9 @@ Result Mode(const Datum& value, ExecContext* ctx = NULLPTR); /// \since 2.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Std(const Datum& value, - const VarStdOptions& options = VarStdOptions::Defaults(), - ExecContext* ctx = NULLPTR); +Result Stddev(const Datum& value, + const VarStdOptions& options = VarStdOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Calculate the variance of a numeric array /// @@ -181,9 +181,9 @@ Result Std(const Datum& value, /// \since 2.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Var(const Datum& value, - const VarStdOptions& options = VarStdOptions::Defaults(), - ExecContext* ctx = NULLPTR); +Result Variance(const Datum& value, + const VarStdOptions& options = VarStdOptions::Defaults(), + ExecContext* ctx = NULLPTR); } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index f7148f1dd28..562e17485fd 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -241,8 +241,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); DCHECK_OK(registry->AddFunction(aggregate::AddModeAggKernels())); - DCHECK_OK(registry->AddFunction(aggregate::AddStdAggKernels())); - DCHECK_OK(registry->AddFunction(aggregate::AddVarAggKernels())); + DCHECK_OK(registry->AddFunction(aggregate::AddStddevAggKernels())); + DCHECK_OK(registry->AddFunction(aggregate::AddVarianceAggKernels())); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 81abf6ec2a5..2b0631ee2f2 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -59,8 +59,8 @@ void AddMeanAvx512AggKernels(ScalarAggregateFunction* func); void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func); std::shared_ptr AddModeAggKernels(); -std::shared_ptr AddStdAggKernels(); -std::shared_ptr AddVarAggKernels(); +std::shared_ptr AddStddevAggKernels(); +std::shared_ptr AddVarianceAggKernels(); // ---------------------------------------------------------------------- // Sum implementation diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 5af715643f5..2bd4f055d04 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -943,7 +943,7 @@ TEST_F(TestInt32ModeKernel, LargeValueRange) { } // -// Var/Std +// Variance/Stddev // template @@ -952,25 +952,21 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { using Traits = TypeTraits; using ScalarType = typename TypeTraits::ScalarType; - void AssertVarStdIs(const Datum& array, const VarStdOptions& options, + void AssertVarStdIs(const Array& array, const VarStdOptions& options, double expected_var, double diff = 0) { - ASSERT_OK_AND_ASSIGN(Datum out_var, Var(array, options)); - ASSERT_OK_AND_ASSIGN(Datum out_std, Std(array, options)); - auto var = checked_cast(out_var.scalar().get()); - auto std = checked_cast(out_std.scalar().get()); - ASSERT_TRUE(var->is_valid && std->is_valid); - ASSERT_DOUBLE_EQ(std->value * std->value, var->value); - if (diff == 0) { - ASSERT_DOUBLE_EQ(var->value, expected_var); // < 4ULP - } else { - ASSERT_NEAR(var->value, expected_var, diff); - } + AssertVarStdIsInternal(array, options, expected_var, diff); + } + + void AssertVarStdIs(const std::shared_ptr& array, + const VarStdOptions& options, double expected_var, + double diff = 0) { + AssertVarStdIsInternal(array, options, expected_var, diff); } - void AssertVarStdIs(const std::string& json, const VarStdOptions& options, + void AssertVarStdIs(const char* json, const VarStdOptions& options, double expected_var) { auto array = ArrayFromJSON(type_singleton(), json); - AssertVarStdIs(array, options, expected_var); + AssertVarStdIs(*array, options, expected_var); } void AssertVarStdIs(const std::vector& json, const VarStdOptions& options, @@ -979,20 +975,51 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { AssertVarStdIs(chunked, options, expected_var); } - void AssertVarStdIsInvalid(const Datum& array, const VarStdOptions& options) { - ASSERT_OK_AND_ASSIGN(Datum out_var, Var(array, options)); - ASSERT_OK_AND_ASSIGN(Datum out_std, Std(array, options)); - auto var = checked_cast(out_var.scalar().get()); - auto std = checked_cast(out_std.scalar().get()); - ASSERT_FALSE(var->is_valid || std->is_valid); + void AssertVarStdIsInvalid(const Array& array, const VarStdOptions& options) { + AssertVarStdIsInvalidInternal(array, options); + } + + void AssertVarStdIsInvalid(const std::shared_ptr& array, + const VarStdOptions& options) { + AssertVarStdIsInvalidInternal(array, options); } - void AssertVarStdIsInvalid(const std::string& json, const VarStdOptions& options) { + void AssertVarStdIsInvalid(const char* json, const VarStdOptions& options) { auto array = ArrayFromJSON(type_singleton(), json); + AssertVarStdIsInvalid(*array, options); + } + + void AssertVarStdIsInvalid(const std::vector& json, + const VarStdOptions& options) { + auto array = ChunkedArrayFromJSON(type_singleton(), json); AssertVarStdIsInvalid(array, options); } std::shared_ptr type_singleton() { return Traits::type_singleton(); } + + private: + void AssertVarStdIsInternal(const Datum& array, const VarStdOptions& options, + double expected_var, double diff = 0) { + ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); + auto var = checked_cast(out_var.scalar().get()); + auto std = checked_cast(out_std.scalar().get()); + ASSERT_TRUE(var->is_valid && std->is_valid); + ASSERT_DOUBLE_EQ(std->value * std->value, var->value); + if (diff == 0) { + ASSERT_DOUBLE_EQ(var->value, expected_var); // < 4ULP + } else { + ASSERT_NEAR(var->value, expected_var, diff); + } + } + + void AssertVarStdIsInvalidInternal(const Datum& array, const VarStdOptions& options) { + ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); + auto var = checked_cast(out_var.scalar().get()); + auto std = checked_cast(out_std.scalar().get()); + ASSERT_FALSE(var->is_valid || std->is_valid); + } }; template @@ -1001,26 +1028,30 @@ class TestNumericVarStdKernel : public TestPrimitiveVarStdKernel {}; // Reference value from numpy.var TYPED_TEST_SUITE(TestNumericVarStdKernel, NumericArrowTypes); TYPED_TEST(TestNumericVarStdKernel, Basics) { - VarStdOptions options; // ddof = 0, population var/std + VarStdOptions options; // ddof = 0, population variance/stddev this->AssertVarStdIs("[100]", options, 0); this->AssertVarStdIs("[1, 2, 3]", options, 0.6666666666666666); this->AssertVarStdIs("[null, 1, 2, null, 3]", options, 0.6666666666666666); - this->AssertVarStdIs({"[]", "[1]", "[2]", "[null]", "[3]"}, options, 0.6666666666666666); this->AssertVarStdIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 5.25); + this->AssertVarStdIs({"[1, 2, 3, 4, 5, 6, 7]", "[8]"}, options, 5.25); this->AssertVarStdIsInvalid("[null, null, null]", options); this->AssertVarStdIsInvalid("[]", options); + this->AssertVarStdIsInvalid("[]", options); - options.ddof = 1; // sample var/std + options.ddof = 1; // sample variance/stddev this->AssertVarStdIs("[1, 2]", options, 0.5); + this->AssertVarStdIs({"[1]", "[2]"}, options, 0.5); this->AssertVarStdIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 6.0); + this->AssertVarStdIs({"[1, 2, 3, 4, 5, 6, 7]", "[8]"}, options, 6.0); this->AssertVarStdIsInvalid("[100]", options); this->AssertVarStdIsInvalid("[100, null, null]", options); + this->AssertVarStdIsInvalid({"[100]", "[null]", "[]"}, options); } class TestVarStdKernelStability : public TestPrimitiveVarStdKernel {}; @@ -1032,8 +1063,9 @@ TEST_F(TestVarStdKernelStability, Basics) { this->AssertVarStdIs("[1000000004, 1000000007, 1000000013, 1000000016]", options, 30.0); } -// Calculate reference variance with welford online algorithm -double WelfordVar(const Array& array) { +// Calculate reference variance with Welford's online algorithm +// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm +std::pair WelfordVar(const Array& array) { const auto& array_numeric = reinterpret_cast(array); const auto values = array_numeric.raw_values(); internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); @@ -1048,20 +1080,35 @@ double WelfordVar(const Array& array) { } reader.Next(); } - return m2 / count; + return std::make_pair(m2 / count, m2 / (count - 1)); } class TestVarStdKernelRandom : public TestPrimitiveVarStdKernel {}; TEST_F(TestVarStdKernelRandom, Basics) { + // Cut array into small chunks + constexpr int array_size = 5000; + constexpr int chunk_size_max = 50; + constexpr int chunk_count = array_size / chunk_size_max; + auto rand = random::RandomArrayGenerator(0x5487656); - auto array = rand.Numeric(40000, -10000.0, 100000.0, 0.1); - auto chunked = *ChunkedArray::Make( - {array->Slice(0, 1000), array->Slice(1000, 9000), array->Slice(10000, 30000)}); - double expected_var = WelfordVar(*array); + auto array = rand.Numeric(array_size, -10000.0, 100000.0, 0.1); + auto chunk_size_array = rand.Numeric(chunk_count, 0, chunk_size_max); + const int* chunk_size = chunk_size_array->data()->GetValues(1); + int total_size = 0; + + ArrayVector array_vector; + for (int i = 0; i < chunk_count; ++i) { + array_vector.emplace_back(array->Slice(total_size, chunk_size[i])); + total_size += chunk_size[i]; + } + auto chunked = *ChunkedArray::Make(array_vector); + + double var_population, var_sample; + std::tie(var_population, var_sample) = WelfordVar(*(array->Slice(0, total_size))); - VarStdOptions options; - this->AssertVarStdIs(chunked, options, expected_var, 0.0001); + this->AssertVarStdIs(chunked, VarStdOptions{0}, var_population, 0.0001); + this->AssertVarStdIs(chunked, VarStdOptions{1}, var_sample, 0.0001); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc index f29b7233d7c..6a96cd33f38 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -23,14 +23,14 @@ namespace aggregate { namespace { -// Based on https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance template struct VarStdState { using ArrayType = typename TypeTraits::ArrayType; using c_type = typename ArrowType::c_type; using ThisType = VarStdState; - // Calculate variance of one chunk with `two pass algorithm` + // Calculate `m2` (sum((X-mean)^2)) of one chunk with `two pass algorithm` + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm // Always use `double` to calculate variance for any array type void Consume(const ArrayType& array) { int64_t count = array.length() - array.null_count(); @@ -57,7 +57,8 @@ struct VarStdState { this->m2 = m2; } - // Combine variance from two chunks + // Combine `m2` from two chunks + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm void MergeFrom(const ThisType& state) { if (state.count == 0) { return; @@ -135,11 +136,11 @@ struct VarStdInitState { return_type(return_type) {} Status Visit(const DataType&) { - return Status::NotImplemented("No var/std implemented"); + return Status::NotImplemented("No variance/stddev implemented"); } Status Visit(const HalfFloatType&) { - return Status::NotImplemented("No var/std implemented"); + return Status::NotImplemented("No variance/stddev implemented"); } template @@ -154,14 +155,15 @@ struct VarStdInitState { } }; -std::unique_ptr StdInit(KernelContext* ctx, const KernelInitArgs& args) { +std::unique_ptr StddevInit(KernelContext* ctx, const KernelInitArgs& args) { VarStdInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), static_cast(*args.options), VarOrStd::Std); return visitor.Create(); } -std::unique_ptr VarInit(KernelContext* ctx, const KernelInitArgs& args) { +std::unique_ptr VarianceInit(KernelContext* ctx, + const KernelInitArgs& args) { VarStdInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), static_cast(*args.options), VarOrStd::Var); @@ -179,19 +181,19 @@ void AddVarStdKernels(KernelInit init, } // namespace -std::shared_ptr AddStdAggKernels() { +std::shared_ptr AddStddevAggKernels() { static auto default_std_options = VarStdOptions::Defaults(); - auto func = std::make_shared("std", Arity::Unary(), + auto func = std::make_shared("stddev", Arity::Unary(), &default_std_options); - AddVarStdKernels(StdInit, internal::NumericTypes(), func.get()); + AddVarStdKernels(StddevInit, internal::NumericTypes(), func.get()); return func; } -std::shared_ptr AddVarAggKernels() { +std::shared_ptr AddVarianceAggKernels() { static auto default_var_options = VarStdOptions::Defaults(); - auto func = std::make_shared("var", Arity::Unary(), + auto func = std::make_shared("variance", Arity::Unary(), &default_var_options); - AddVarStdKernels(VarInit, internal::NumericTypes(), func.get()); + AddVarStdKernels(VarianceInit, internal::NumericTypes(), func.get()); return func; } diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index b91e2e62dce..aa25db69483 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -140,11 +140,11 @@ Aggregations +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | mode | Unary | Numeric | Scalar Struct (2) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ -| std | Unary | Numeric | Scalar Float64 | :struct:`VarStdOptions` | +| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarStdOptions` | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | sum | Unary | Numeric | Scalar Numeric (3) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ -| var | Unary | Numeric | Scalar Float64 | :struct:`VarStdOptions` | +| variance | Unary | Numeric | Scalar Float64 | :struct:`VarStdOptions` | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ Notes: From 1957a2a554f45a915338b183331e9f9dbbe2b671 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Wed, 30 Sep 2020 06:30:54 +0000 Subject: [PATCH 5/6] Fix build error on windows --- .../arrow/compute/kernels/aggregate_test.cc | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 2bd4f055d04..db25278df4a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -963,7 +963,7 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { AssertVarStdIsInternal(array, options, expected_var, diff); } - void AssertVarStdIs(const char* json, const VarStdOptions& options, + void AssertVarStdIs(const std::string& json, const VarStdOptions& options, double expected_var) { auto array = ArrayFromJSON(type_singleton(), json); AssertVarStdIs(*array, options, expected_var); @@ -984,7 +984,7 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { AssertVarStdIsInvalidInternal(array, options); } - void AssertVarStdIsInvalid(const char* json, const VarStdOptions& options) { + void AssertVarStdIsInvalid(const std::string& json, const VarStdOptions& options) { auto array = ArrayFromJSON(type_singleton(), json); AssertVarStdIsInvalid(*array, options); } @@ -1033,10 +1033,14 @@ TYPED_TEST(TestNumericVarStdKernel, Basics) { this->AssertVarStdIs("[100]", options, 0); this->AssertVarStdIs("[1, 2, 3]", options, 0.6666666666666666); this->AssertVarStdIs("[null, 1, 2, null, 3]", options, 0.6666666666666666); - this->AssertVarStdIs({"[]", "[1]", "[2]", "[null]", "[3]"}, options, - 0.6666666666666666); - this->AssertVarStdIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 5.25); - this->AssertVarStdIs({"[1, 2, 3, 4, 5, 6, 7]", "[8]"}, options, 5.25); + + std::vector chunks; + chunks = {"[]", "[1]", "[2]", "[null]", "[3]"}; + this->AssertVarStdIs(chunks, options, 0.6666666666666666); + chunks = {"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}; + this->AssertVarStdIs(chunks, options, 5.25); + chunks = {"[1, 2, 3, 4, 5, 6, 7]", "[8]"}; + this->AssertVarStdIs(chunks, options, 5.25); this->AssertVarStdIsInvalid("[null, null, null]", options); this->AssertVarStdIsInvalid("[]", options); @@ -1045,13 +1049,18 @@ TYPED_TEST(TestNumericVarStdKernel, Basics) { options.ddof = 1; // sample variance/stddev this->AssertVarStdIs("[1, 2]", options, 0.5); - this->AssertVarStdIs({"[1]", "[2]"}, options, 0.5); - this->AssertVarStdIs({"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}, options, 6.0); - this->AssertVarStdIs({"[1, 2, 3, 4, 5, 6, 7]", "[8]"}, options, 6.0); + + chunks = {"[1]", "[2]"}; + this->AssertVarStdIs(chunks, options, 0.5); + chunks = {"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}; + this->AssertVarStdIs(chunks, options, 6.0); + chunks = {"[1, 2, 3, 4, 5, 6, 7]", "[8]"}; + this->AssertVarStdIs(chunks, options, 6.0); this->AssertVarStdIsInvalid("[100]", options); this->AssertVarStdIsInvalid("[100, null, null]", options); - this->AssertVarStdIsInvalid({"[100]", "[null]", "[]"}, options); + chunks = {"[100]", "[null]", "[]"}; + this->AssertVarStdIsInvalid(chunks, options); } class TestVarStdKernelStability : public TestPrimitiveVarStdKernel {}; From 1c77f16fe050c1f66d321962b3d6840109d5a572 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 30 Sep 2020 13:32:03 +0200 Subject: [PATCH 6/6] VarianceOptions --- cpp/src/arrow/compute/api_aggregate.cc | 5 ++-- cpp/src/arrow/compute/api_aggregate.h | 14 ++++----- .../arrow/compute/kernels/aggregate_test.cc | 30 +++++++++---------- .../compute/kernels/aggregate_var_std.cc | 18 +++++------ docs/source/cpp/compute.rst | 4 +-- 5 files changed, 36 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 93155c2c29b..53ee5b9a2b2 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -45,11 +45,12 @@ Result Mode(const Datum& value, ExecContext* ctx) { return CallFunction("mode", {value}, ctx); } -Result Stddev(const Datum& value, const VarStdOptions& options, ExecContext* ctx) { +Result Stddev(const Datum& value, const VarianceOptions& options, + ExecContext* ctx) { return CallFunction("stddev", {value}, &options, ctx); } -Result Variance(const Datum& value, const VarStdOptions& options, +Result Variance(const Datum& value, const VarianceOptions& options, ExecContext* ctx) { return CallFunction("variance", {value}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 11a8baa1515..41c6e1d613a 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -80,10 +80,10 @@ struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { /// /// The divisor used in calculations is N - ddof, where N is the number of elements. /// By default, ddof is zero, and population variance or stddev is returned. -struct ARROW_EXPORT VarStdOptions : public FunctionOptions { - explicit VarStdOptions(int ddof = 0) : ddof(ddof) {} +struct ARROW_EXPORT VarianceOptions : public FunctionOptions { + explicit VarianceOptions(int ddof = 0) : ddof(ddof) {} - static VarStdOptions Defaults() { return VarStdOptions{}; } + static VarianceOptions Defaults() { return VarianceOptions{}; } int ddof = 0; }; @@ -160,7 +160,7 @@ Result Mode(const Datum& value, ExecContext* ctx = NULLPTR); /// \brief Calculate the standard deviation of a numeric array /// /// \param[in] value input datum, expecting Array or ChunkedArray -/// \param[in] options see VarStdOptions for more information +/// \param[in] options see VarianceOptions for more information /// \param[in] ctx the function execution context, optional /// \return datum of the computed standard deviation as a DoubleScalar /// @@ -168,13 +168,13 @@ Result Mode(const Datum& value, ExecContext* ctx = NULLPTR); /// \note API not yet finalized ARROW_EXPORT Result Stddev(const Datum& value, - const VarStdOptions& options = VarStdOptions::Defaults(), + const VarianceOptions& options = VarianceOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Calculate the variance of a numeric array /// /// \param[in] value input datum, expecting Array or ChunkedArray -/// \param[in] options see VarStdOptions for more information +/// \param[in] options see VarianceOptions for more information /// \param[in] ctx the function execution context, optional /// \return datum of the computed variance as a DoubleScalar /// @@ -182,7 +182,7 @@ Result Stddev(const Datum& value, /// \note API not yet finalized ARROW_EXPORT Result Variance(const Datum& value, - const VarStdOptions& options = VarStdOptions::Defaults(), + const VarianceOptions& options = VarianceOptions::Defaults(), ExecContext* ctx = NULLPTR); } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index db25278df4a..39b3f8827fb 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -952,45 +952,45 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { using Traits = TypeTraits; using ScalarType = typename TypeTraits::ScalarType; - void AssertVarStdIs(const Array& array, const VarStdOptions& options, + void AssertVarStdIs(const Array& array, const VarianceOptions& options, double expected_var, double diff = 0) { AssertVarStdIsInternal(array, options, expected_var, diff); } void AssertVarStdIs(const std::shared_ptr& array, - const VarStdOptions& options, double expected_var, + const VarianceOptions& options, double expected_var, double diff = 0) { AssertVarStdIsInternal(array, options, expected_var, diff); } - void AssertVarStdIs(const std::string& json, const VarStdOptions& options, + void AssertVarStdIs(const std::string& json, const VarianceOptions& options, double expected_var) { auto array = ArrayFromJSON(type_singleton(), json); AssertVarStdIs(*array, options, expected_var); } - void AssertVarStdIs(const std::vector& json, const VarStdOptions& options, - double expected_var) { + void AssertVarStdIs(const std::vector& json, + const VarianceOptions& options, double expected_var) { auto chunked = ChunkedArrayFromJSON(type_singleton(), json); AssertVarStdIs(chunked, options, expected_var); } - void AssertVarStdIsInvalid(const Array& array, const VarStdOptions& options) { + void AssertVarStdIsInvalid(const Array& array, const VarianceOptions& options) { AssertVarStdIsInvalidInternal(array, options); } void AssertVarStdIsInvalid(const std::shared_ptr& array, - const VarStdOptions& options) { + const VarianceOptions& options) { AssertVarStdIsInvalidInternal(array, options); } - void AssertVarStdIsInvalid(const std::string& json, const VarStdOptions& options) { + void AssertVarStdIsInvalid(const std::string& json, const VarianceOptions& options) { auto array = ArrayFromJSON(type_singleton(), json); AssertVarStdIsInvalid(*array, options); } void AssertVarStdIsInvalid(const std::vector& json, - const VarStdOptions& options) { + const VarianceOptions& options) { auto array = ChunkedArrayFromJSON(type_singleton(), json); AssertVarStdIsInvalid(array, options); } @@ -998,7 +998,7 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { std::shared_ptr type_singleton() { return Traits::type_singleton(); } private: - void AssertVarStdIsInternal(const Datum& array, const VarStdOptions& options, + void AssertVarStdIsInternal(const Datum& array, const VarianceOptions& options, double expected_var, double diff = 0) { ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); @@ -1013,7 +1013,7 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { } } - void AssertVarStdIsInvalidInternal(const Datum& array, const VarStdOptions& options) { + void AssertVarStdIsInvalidInternal(const Datum& array, const VarianceOptions& options) { ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); auto var = checked_cast(out_var.scalar().get()); @@ -1028,7 +1028,7 @@ class TestNumericVarStdKernel : public TestPrimitiveVarStdKernel {}; // Reference value from numpy.var TYPED_TEST_SUITE(TestNumericVarStdKernel, NumericArrowTypes); TYPED_TEST(TestNumericVarStdKernel, Basics) { - VarStdOptions options; // ddof = 0, population variance/stddev + VarianceOptions options; // ddof = 0, population variance/stddev this->AssertVarStdIs("[100]", options, 0); this->AssertVarStdIs("[1, 2, 3]", options, 0.6666666666666666); @@ -1067,7 +1067,7 @@ class TestVarStdKernelStability : public TestPrimitiveVarStdKernel { // Test numerical stability TEST_F(TestVarStdKernelStability, Basics) { - VarStdOptions options{1}; // ddof = 1 + VarianceOptions options{1}; // ddof = 1 this->AssertVarStdIs("[100000004, 100000007, 100000013, 100000016]", options, 30.0); this->AssertVarStdIs("[1000000004, 1000000007, 1000000013, 1000000016]", options, 30.0); } @@ -1116,8 +1116,8 @@ TEST_F(TestVarStdKernelRandom, Basics) { double var_population, var_sample; std::tie(var_population, var_sample) = WelfordVar(*(array->Slice(0, total_size))); - this->AssertVarStdIs(chunked, VarStdOptions{0}, var_population, 0.0001); - this->AssertVarStdIs(chunked, VarStdOptions{1}, var_sample, 0.0001); + this->AssertVarStdIs(chunked, VarianceOptions{0}, var_population, 0.0001); + this->AssertVarStdIs(chunked, VarianceOptions{1}, var_sample, 0.0001); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc index 6a96cd33f38..e2b98bb38fc 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -89,7 +89,7 @@ struct VarStdImpl : public ScalarAggregator { using ArrayType = typename TypeTraits::ArrayType; explicit VarStdImpl(const std::shared_ptr& out_type, - const VarStdOptions& options, VarOrStd return_type) + const VarianceOptions& options, VarOrStd return_type) : out_type(out_type), options(options), return_type(return_type) {} void Consume(KernelContext*, const ExecBatch& batch) override { @@ -114,7 +114,7 @@ struct VarStdImpl : public ScalarAggregator { std::shared_ptr out_type; VarStdState state; - VarStdOptions options; + VarianceOptions options; VarOrStd return_type; }; @@ -123,12 +123,12 @@ struct VarStdInitState { KernelContext* ctx; const DataType& in_type; const std::shared_ptr& out_type; - const VarStdOptions& options; + const VarianceOptions& options; VarOrStd return_type; VarStdInitState(KernelContext* ctx, const DataType& in_type, - const std::shared_ptr& out_type, const VarStdOptions& options, - VarOrStd return_type) + const std::shared_ptr& out_type, + const VarianceOptions& options, VarOrStd return_type) : ctx(ctx), in_type(in_type), out_type(out_type), @@ -158,7 +158,7 @@ struct VarStdInitState { std::unique_ptr StddevInit(KernelContext* ctx, const KernelInitArgs& args) { VarStdInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options), VarOrStd::Std); + static_cast(*args.options), VarOrStd::Std); return visitor.Create(); } @@ -166,7 +166,7 @@ std::unique_ptr VarianceInit(KernelContext* ctx, const KernelInitArgs& args) { VarStdInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options), VarOrStd::Var); + static_cast(*args.options), VarOrStd::Var); return visitor.Create(); } @@ -182,7 +182,7 @@ void AddVarStdKernels(KernelInit init, } // namespace std::shared_ptr AddStddevAggKernels() { - static auto default_std_options = VarStdOptions::Defaults(); + static auto default_std_options = VarianceOptions::Defaults(); auto func = std::make_shared("stddev", Arity::Unary(), &default_std_options); AddVarStdKernels(StddevInit, internal::NumericTypes(), func.get()); @@ -190,7 +190,7 @@ std::shared_ptr AddStddevAggKernels() { } std::shared_ptr AddVarianceAggKernels() { - static auto default_var_options = VarStdOptions::Defaults(); + static auto default_var_options = VarianceOptions::Defaults(); auto func = std::make_shared("variance", Arity::Unary(), &default_var_options); AddVarStdKernels(VarianceInit, internal::NumericTypes(), func.get()); diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index aa25db69483..6ef10abf67d 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -140,11 +140,11 @@ Aggregations +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | mode | Unary | Numeric | Scalar Struct (2) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ -| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarStdOptions` | +| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | sum | Unary | Numeric | Scalar Numeric (3) | | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ -| variance | Unary | Numeric | Scalar Float64 | :struct:`VarStdOptions` | +| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ Notes: