diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 275e406be79..d284a430b81 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -130,8 +130,9 @@ G_BEGIN_DECLS * #GArrowCastOptions is a class to customize the `cast` function and * garrow_array_cast(). * - * #GArrowCountOptions is a class to customize the `count` function and - * garrow_array_count(). + * #GArrowScalarAggregateOptions is a class to customize the scalar + * aggregate functions such as `count` function and convenient + * functions of them such as garrow_array_count(). * * #GArrowFilterOptions is a class to customize the `filter` function and * garrow_array_filter() family. @@ -636,60 +637,65 @@ garrow_cast_options_new(void) } -typedef struct GArrowCountOptionsPrivate_ { - arrow::compute::CountOptions options; -} GArrowCountOptionsPrivate; +typedef struct GArrowScalarAggregateOptionsPrivate_ { + arrow::compute::ScalarAggregateOptions options; +} GArrowScalarAggregateOptionsPrivate; enum { - PROP_MODE = 1, + PROP_SKIP_NULLS = 1, + PROP_MIN_COUNT, }; static arrow::compute::FunctionOptions * -garrow_count_options_get_raw_function_options(GArrowFunctionOptions *options) +garrow_scalar_aggregate_options_get_raw_function_options( + GArrowFunctionOptions *options) { - return garrow_count_options_get_raw(GARROW_COUNT_OPTIONS(options)); + return garrow_scalar_aggregate_options_get_raw( + GARROW_SCALAR_AGGREGATE_OPTIONS(options)); } static void -garrow_count_options_function_options_interface_init( +garrow_scalar_aggregate_options_function_options_interface_init( GArrowFunctionOptionsInterface *iface) { - iface->get_raw = garrow_count_options_get_raw_function_options; + iface->get_raw = garrow_scalar_aggregate_options_get_raw_function_options; } -G_DEFINE_TYPE_WITH_CODE(GArrowCountOptions, - garrow_count_options, +G_DEFINE_TYPE_WITH_CODE(GArrowScalarAggregateOptions, + garrow_scalar_aggregate_options, G_TYPE_OBJECT, - G_ADD_PRIVATE(GArrowCountOptions) + G_ADD_PRIVATE(GArrowScalarAggregateOptions) G_IMPLEMENT_INTERFACE( GARROW_TYPE_FUNCTION_OPTIONS, - garrow_count_options_function_options_interface_init)) + garrow_scalar_aggregate_options_function_options_interface_init)) -#define GARROW_COUNT_OPTIONS_GET_PRIVATE(object) \ - static_cast( \ - garrow_count_options_get_instance_private( \ - GARROW_COUNT_OPTIONS(object))) +#define GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_scalar_aggregate_options_get_instance_private( \ + GARROW_SCALAR_AGGREGATE_OPTIONS(object))) static void -garrow_count_options_finalize(GObject *object) +garrow_scalar_aggregate_options_finalize(GObject *object) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); - priv->options.~CountOptions(); - G_OBJECT_CLASS(garrow_count_options_parent_class)->finalize(object); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object); + priv->options.~ScalarAggregateOptions(); + G_OBJECT_CLASS(garrow_scalar_aggregate_options_parent_class)->finalize(object); } static void -garrow_count_options_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +garrow_scalar_aggregate_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_MODE: - priv->options.count_mode = - static_cast(g_value_get_enum(value)); + case PROP_SKIP_NULLS: + priv->options.skip_nulls = g_value_get_boolean(value); + break; + case PROP_MIN_COUNT: + priv->options.min_count = g_value_get_uint(value); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -698,16 +704,19 @@ garrow_count_options_set_property(GObject *object, } static void -garrow_count_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +garrow_scalar_aggregate_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_MODE: - g_value_set_enum(value, priv->options.count_mode); + case PROP_SKIP_NULLS: + g_value_set_boolean(value, priv->options.skip_nulls); + break; + case PROP_MIN_COUNT: + g_value_set_uint(value, priv->options.min_count); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -716,51 +725,69 @@ garrow_count_options_get_property(GObject *object, } static void -garrow_count_options_init(GArrowCountOptions *object) +garrow_scalar_aggregate_options_init(GArrowScalarAggregateOptions *object) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); - new(&priv->options) arrow::compute::CountOptions( - arrow::compute::CountOptions::COUNT_NON_NULL); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object); + new(&priv->options) arrow::compute::ScalarAggregateOptions(); } static void -garrow_count_options_class_init(GArrowCountOptionsClass *klass) +garrow_scalar_aggregate_options_class_init( + GArrowScalarAggregateOptionsClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - gobject_class->finalize = garrow_count_options_finalize; - gobject_class->set_property = garrow_count_options_set_property; - gobject_class->get_property = garrow_count_options_get_property; + gobject_class->finalize = garrow_scalar_aggregate_options_finalize; + gobject_class->set_property = garrow_scalar_aggregate_options_set_property; + gobject_class->get_property = garrow_scalar_aggregate_options_get_property; + + auto options = arrow::compute::ScalarAggregateOptions::Defaults(); GParamSpec *spec; /** - * GArrowCountOptions:mode: + * GArrowScalarAggregateOptions:skip-nulls: * - * How to count values. + * Whether NULLs are skipped or not. * - * Since: 0.13.0 + * Since: 5.0.0 */ - spec = g_param_spec_enum("mode", - "Mode", - "How to count values", - GARROW_TYPE_COUNT_MODE, - GARROW_COUNT_ALL, + spec = g_param_spec_boolean("skip-nulls", + "Skip NULLs", + "Whether NULLs are skipped or not", + options.skip_nulls, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SKIP_NULLS, spec); + + /** + * GArrowScalarAggregateOptions:min-count: + * + * The minimum required number of values. + * + * Since: 5.0.0 + */ + spec = g_param_spec_uint("min-count", + "Min count", + "The minimum required number of values", + 0, + G_MAXUINT, + options.min_count, static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_MODE, spec); + g_object_class_install_property(gobject_class, PROP_MIN_COUNT, spec); } /** - * garrow_count_options_new: + * garrow_scalar_aggregate_options_new: * - * Returns: A newly created #GArrowCountOptions. + * Returns: A newly created #GArrowScalarAggregateOptions. * - * Since: 0.13.0 + * Since: 5.0.0 */ -GArrowCountOptions * -garrow_count_options_new(void) +GArrowScalarAggregateOptions * +garrow_scalar_aggregate_options_new(void) { - auto count_options = g_object_new(GARROW_TYPE_COUNT_OPTIONS, NULL); - return GARROW_COUNT_OPTIONS(count_options); + auto scalar_aggregate_options = + g_object_new(GARROW_TYPE_SCALAR_AGGREGATE_OPTIONS, NULL); + return GARROW_SCALAR_AGGREGATE_OPTIONS(scalar_aggregate_options); } @@ -863,14 +890,14 @@ garrow_filter_options_class_init(GArrowFilterOptionsClass *klass) GParamSpec *spec; /** - * GArrowFilterOptions:null_selection_behavior: + * GArrowFilterOptions:null-selection-behavior: * * How to handle filtered values. * * Since: 0.17.0 */ - spec = g_param_spec_enum("null_selection_behavior", - "Null selection behavior", + spec = g_param_spec_enum("null-selection-behavior", + "NULL selection behavior", "How to handle filtered values", GARROW_TYPE_FILTER_NULL_SELECTION_BEHAVIOR, static_cast( @@ -1682,7 +1709,7 @@ garrow_array_dictionary_encode(GArrowArray *array, /** * garrow_array_count: * @array: A #GArrowArray. - * @options: (nullable): A #GArrowCountOptions. + * @options: (nullable): A #GArrowScalarAggregateOptions. * @error: (nullable): Return location for a #GError or %NULL. * * Returns: The number of target values on success. If an error is occurred, @@ -1692,14 +1719,14 @@ garrow_array_dictionary_encode(GArrowArray *array, */ gint64 garrow_array_count(GArrowArray *array, - GArrowCountOptions *options, + GArrowScalarAggregateOptions *options, GError **error) { auto arrow_array = garrow_array_get_raw(array); auto arrow_array_raw = arrow_array.get(); arrow::Result arrow_counted_datum; if (options) { - auto arrow_options = garrow_count_options_get_raw(options); + auto arrow_options = garrow_scalar_aggregate_options_get_raw(options); arrow_counted_datum = arrow::compute::Count(*arrow_array_raw, *arrow_options); } else { @@ -3059,20 +3086,23 @@ garrow_cast_options_get_raw(GArrowCastOptions *cast_options) return &(priv->options); } -GArrowCountOptions * -garrow_count_options_new_raw(arrow::compute::CountOptions *arrow_count_options) +GArrowScalarAggregateOptions * +garrow_scalar_aggregate_options_new_raw( + arrow::compute::ScalarAggregateOptions *arrow_scalar_aggregate_options) { - auto count_options = - g_object_new(GARROW_TYPE_COUNT_OPTIONS, - "mode", arrow_count_options->count_mode, + auto scalar_aggregate_options = + g_object_new(GARROW_TYPE_SCALAR_AGGREGATE_OPTIONS, + "skip-nulls", arrow_scalar_aggregate_options->skip_nulls, + "min-count", arrow_scalar_aggregate_options->min_count, NULL); - return GARROW_COUNT_OPTIONS(count_options); + return GARROW_SCALAR_AGGREGATE_OPTIONS(scalar_aggregate_options); } -arrow::compute::CountOptions * -garrow_count_options_get_raw(GArrowCountOptions *count_options) +arrow::compute::ScalarAggregateOptions * +garrow_scalar_aggregate_options_get_raw( + GArrowScalarAggregateOptions *scalar_aggregate_options) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(count_options); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(scalar_aggregate_options); return &(priv->options); } diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 63ba6e0eae5..a9e57945ba5 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -83,32 +83,20 @@ struct _GArrowCastOptionsClass GArrowCastOptions *garrow_cast_options_new(void); -/** - * GArrowCountMode: - * @GARROW_COUNT_ALL: Count all non-null values. - * @GARROW_COUNT_NULL: Count all null values. - * - * They are corresponding to `arrow::compute::CountOptions::Mode` values. - */ -typedef enum { - GARROW_COUNT_ALL, - GARROW_COUNT_NULL, -} GArrowCountMode; - -#define GARROW_TYPE_COUNT_OPTIONS (garrow_count_options_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowCountOptions, - garrow_count_options, +#define GARROW_TYPE_SCALAR_AGGREGATE_OPTIONS (garrow_scalar_aggregate_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowScalarAggregateOptions, + garrow_scalar_aggregate_options, GARROW, - COUNT_OPTIONS, + SCALAR_AGGREGATE_OPTIONS, GObject) -struct _GArrowCountOptionsClass +struct _GArrowScalarAggregateOptionsClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_0_13 -GArrowCountOptions * -garrow_count_options_new(void); +GARROW_AVAILABLE_IN_5_0 +GArrowScalarAggregateOptions * +garrow_scalar_aggregate_options_new(void); /** @@ -290,7 +278,7 @@ GArrowDictionaryArray *garrow_array_dictionary_encode(GArrowArray *array, GError **error); GARROW_AVAILABLE_IN_0_13 gint64 garrow_array_count(GArrowArray *array, - GArrowCountOptions *options, + GArrowScalarAggregateOptions *options, GError **error); GARROW_AVAILABLE_IN_0_13 GArrowStructArray *garrow_array_count_values(GArrowArray *array, diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 1bc6fefdd40..289bcbe31af 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -46,10 +46,12 @@ garrow_function_get_raw(GArrowFunction *function); GArrowCastOptions *garrow_cast_options_new_raw(arrow::compute::CastOptions *arrow_cast_options); arrow::compute::CastOptions *garrow_cast_options_get_raw(GArrowCastOptions *cast_options); -GArrowCountOptions * -garrow_count_options_new_raw(arrow::compute::CountOptions *arrow_count_options); -arrow::compute::CountOptions * -garrow_count_options_get_raw(GArrowCountOptions *count_options); +GArrowScalarAggregateOptions * +garrow_scalar_aggregate_options_new_raw( + arrow::compute::ScalarAggregateOptions *arrow_scalar_aggregate_options); +arrow::compute::ScalarAggregateOptions * +garrow_scalar_aggregate_options_get_raw( + GArrowScalarAggregateOptions *scalar_aggregate_options); arrow::compute::FilterOptions * garrow_filter_options_get_raw(GArrowFilterOptions *filter_options); diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index 5a74566fd4a..3dc2f702c5e 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,15 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_5_0: + * + * You can use this macro value for compile time API version check. + * + * Since: 5.0.0 + */ +#define GARROW_VERSION_5_0 G_ENCODE_VERSION(5, 0) + /** * GARROW_VERSION_4_0: * @@ -256,6 +265,20 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_5_0 +# define GARROW_DEPRECATED_IN_5_0 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_5_0_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_5_0 +# define GARROW_DEPRECATED_IN_5_0_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_5_0 +# define GARROW_AVAILABLE_IN_5_0 GARROW_UNAVAILABLE(5, 0) +#else +# define GARROW_AVAILABLE_IN_5_0 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_4_0 # define GARROW_DEPRECATED_IN_4_0 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_4_0_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index 9198b6a13a6..80af9506ecd 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -179,6 +179,10 @@ Index of deprecated API + + Index of new symbols in 5.0.0 + + Index of new symbols in 4.0.0 diff --git a/c_glib/test/test-count.rb b/c_glib/test/test-count.rb index 36390f880aa..39b6f06c4e6 100644 --- a/c_glib/test/test-count.rb +++ b/c_glib/test/test-count.rb @@ -19,27 +19,14 @@ class TestCount < Test::Unit::TestCase include Helper::Buildable include Helper::Omittable - sub_test_case("CountOptions") do - def test_default_mode - assert_equal(Arrow::CountMode::ALL, - Arrow::CountOptions.new.mode) - end - end - - sub_test_case("mode") do + sub_test_case("skip_nulls") do def test_default assert_equal(2, build_int32_array([1, nil, 3]).count) end - def test_all - options = Arrow::CountOptions.new - options.mode = :all - assert_equal(2, build_int32_array([1, nil, 3]).count(options)) - end - - def test_null - options = Arrow::CountOptions.new - options.mode = :null + def test_false + options = Arrow::ScalarAggregateOptions.new + options.skip_nulls = false assert_equal(1, build_int32_array([1, nil, 3]).count(options)) end end diff --git a/c_glib/test/test-scalar-aggregate-options.rb b/c_glib/test/test-scalar-aggregate-options.rb new file mode 100644 index 00000000000..a794b53243a --- /dev/null +++ b/c_glib/test/test-scalar-aggregate-options.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestScalarAggregateOptions < Test::Unit::TestCase + def setup + @options = Arrow::ScalarAggregateOptions.new + end + + sub_test_case("skip_nulls") do + def test_default + assert do + @options.skip_nulls? + end + end + + def test_accessor + @options.skip_nulls = false + assert do + not @options.skip_nulls? + end + end + end + + sub_test_case("min_count") do + def test_default + assert_equal(1, @options.min_count) + end + + def test_accessor + @options.min_count = 0 + assert_equal(0, @options.min_count) + end + end +end diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 5afa1048960..dca54a0faba 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -25,19 +25,23 @@ namespace compute { // ---------------------------------------------------------------------- // Scalar aggregates -Result Count(const Datum& value, CountOptions options, ExecContext* ctx) { +Result Count(const Datum& value, ScalarAggregateOptions options, + ExecContext* ctx) { return CallFunction("count", {value}, &options, ctx); } -Result Mean(const Datum& value, ExecContext* ctx) { - return CallFunction("mean", {value}, ctx); +Result Mean(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { + return CallFunction("mean", {value}, &options, ctx); } -Result Sum(const Datum& value, ExecContext* ctx) { - return CallFunction("sum", {value}, ctx); +Result Sum(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { + return CallFunction("sum", {value}, &options, ctx); } -Result MinMax(const Datum& value, const MinMaxOptions& options, ExecContext* ctx) { +Result MinMax(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { return CallFunction("min_max", {value}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index ca118ec5678..a7ceb2ac2fd 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -40,40 +40,17 @@ class ExecContext; /// \addtogroup compute-concrete-options /// @{ -/// \brief Control Count kernel behavior -/// -/// By default, all non-null values are counted. -struct ARROW_EXPORT CountOptions : public FunctionOptions { - enum Mode { - /// Count all non-null values. - COUNT_NON_NULL = 0, - /// Count all null values. - COUNT_NULL, - }; - - explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {} - - static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); } - - enum Mode count_mode; -}; - -/// \brief Control MinMax kernel behavior +/// \brief Control general scalar aggregate kernel behavior /// /// By default, null values are ignored -struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { - enum Mode { - /// Skip null values - SKIP = 0, - /// Any nulls will result in null output - EMIT_NULL - }; +struct ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { + explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1) + : skip_nulls(skip_nulls), min_count(min_count) {} - explicit MinMaxOptions(enum Mode null_handling = SKIP) : null_handling(null_handling) {} + static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; } - static MinMaxOptions Defaults() { return MinMaxOptions{}; } - - enum Mode null_handling; + bool skip_nulls; + uint32_t min_count; }; /// \brief Control Mode kernel behavior @@ -153,7 +130,7 @@ struct ARROW_EXPORT TDigestOptions : public FunctionOptions { /// \brief Count non-null (or null) values in an array. /// -/// \param[in] options counting options, see CountOptions for more information +/// \param[in] options counting options, see ScalarAggregateOptions for more information /// \param[in] datum to count /// \param[in] ctx the function execution context, optional /// \return out resulting datum @@ -161,30 +138,39 @@ struct ARROW_EXPORT TDigestOptions : public FunctionOptions { /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Count(const Datum& datum, CountOptions options = CountOptions::Defaults(), +Result Count(const Datum& datum, + ScalarAggregateOptions options = ScalarAggregateOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Compute the mean of a numeric array. /// /// \param[in] value datum to compute the mean, expecting Array +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return datum of the computed mean as a DoubleScalar /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Mean(const Datum& value, ExecContext* ctx = NULLPTR); +Result Mean( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Sum values of a numeric array. /// /// \param[in] value datum to sum, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return datum of the computed sum as a Scalar /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Sum(const Datum& value, ExecContext* ctx = NULLPTR); +Result Sum( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Calculate the min / max of a numeric array /// @@ -192,16 +178,17 @@ Result Sum(const Datum& value, ExecContext* ctx = NULLPTR); /// struct, where T is the input type /// /// \param[in] value input datum, expecting Array or ChunkedArray -/// \param[in] options see MinMaxOptions for more information +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return resulting datum as a struct scalar /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result MinMax(const Datum& value, - const MinMaxOptions& options = MinMaxOptions::Defaults(), - ExecContext* ctx = NULLPTR); +Result MinMax( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Test whether any element in a boolean array evaluates to true. /// diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index e4eec50c66d..446c1b9fc62 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -56,7 +56,7 @@ namespace aggregate { // Count implementation struct CountImpl : public ScalarAggregator { - explicit CountImpl(CountOptions options) : options(std::move(options)) {} + explicit CountImpl(ScalarAggregateOptions options) : options(std::move(options)) {} Status Consume(KernelContext*, const ExecBatch& batch) override { const ArrayData& input = *batch[0].array(); @@ -75,20 +75,15 @@ struct CountImpl : public ScalarAggregator { Status Finalize(KernelContext* ctx, Datum* out) override { const auto& state = checked_cast(*ctx->state()); - switch (state.options.count_mode) { - case CountOptions::COUNT_NON_NULL: - *out = Datum(state.non_nulls); - break; - case CountOptions::COUNT_NULL: - *out = Datum(state.nulls); - break; - default: - return Status::Invalid("Unknown CountOptions encountered"); + if (state.options.skip_nulls) { + *out = Datum(state.non_nulls); + } else { + *out = Datum(state.nulls); } return Status::OK(); } - CountOptions options; + ScalarAggregateOptions options; int64_t non_nulls = 0; int64_t nulls = 0; }; @@ -96,27 +91,39 @@ struct CountImpl : public ScalarAggregator { Result> CountInit(KernelContext*, const KernelInitArgs& args) { return ::arrow::internal::make_unique( - static_cast(*args.options)); + static_cast(*args.options)); } // ---------------------------------------------------------------------- // Sum implementation template -struct SumImplDefault : public SumImpl {}; +struct SumImplDefault : public SumImpl { + explicit SumImplDefault(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; template -struct MeanImplDefault : public MeanImpl {}; +struct MeanImplDefault : public MeanImpl { + explicit MeanImplDefault(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; Result> SumInit(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } Result> MeanInit(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } @@ -127,7 +134,7 @@ Result> MinMaxInit(KernelContext* ctx, const KernelInitArgs& args) { MinMaxInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options)); + static_cast(*args.options)); return visitor.Create(); } @@ -247,23 +254,33 @@ namespace { const FunctionDoc count_doc{"Count the number of null / non-null values", ("By default, non-null values are counted.\n" - "This can be changed through CountOptions."), + "This can be changed through ScalarAggregateOptions."), {"array"}, - "CountOptions"}; - -const FunctionDoc sum_doc{ - "Sum values of a numeric array", ("Null values are ignored."), {"array"}}; - -const FunctionDoc mean_doc{"Compute the mean of a numeric array", - ("Null values are ignored. The result is always computed\n" - "as a double, regardless of the input types"), - {"array"}}; + "ScalarAggregateOptions"}; + +const FunctionDoc sum_doc{"Sum values of a numeric array", + ("Null values are ignored. Minimum count of non-NA\n" + "values can be set and NAN is returned if too " + "few are present.\n" + "This can be changed through ScalarAggregateOptions."), + {"array"}, + "ScalarAggregateOptions"}; + +const FunctionDoc mean_doc{ + "Compute the mean of a numeric array", + ("Null values are ignored by default. Minimum count of non-NA\n" + "values can be set and NAN is returned if too few are \n" + "present. The result is always computed as a double, \n" + "regardless of the input types.\n" + "This can be changed through ScalarAggregateOptions."), + {"array"}, + "ScalarAggregateOptions"}; const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numeric array", ("Null values are ignored by default.\n" - "This can be changed through MinMaxOptions."), + "This can be changed through ScalarAggregateOptions."), {"array"}, - "MinMaxOptions"}; + "ScalarAggregateOptions"}; const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true", ("Null values are ignored."), @@ -276,9 +293,10 @@ const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate } // namespace void RegisterScalarAggregateBasic(FunctionRegistry* registry) { - static auto default_count_options = CountOptions::Defaults(); + static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults(); + auto func = std::make_shared( - "count", Arity::Unary(), &count_doc, &default_count_options); + "count", Arity::Unary(), &count_doc, &default_scalar_aggregate_options); // Takes any array input, outputs int64 scalar InputType any_array(ValueDescr::ARRAY); @@ -286,7 +304,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { aggregate::CountInit, func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("sum", Arity::Unary(), &sum_doc); + func = std::make_shared("sum", Arity::Unary(), &sum_doc, + &default_scalar_aggregate_options); aggregate::AddBasicAggKernels(aggregate::SumInit, {boolean()}, int64(), func.get()); aggregate::AddBasicAggKernels(aggregate::SumInit, SignedIntTypes(), int64(), func.get()); @@ -310,7 +329,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { #endif DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("mean", Arity::Unary(), &mean_doc); + func = std::make_shared("mean", Arity::Unary(), &mean_doc, + &default_scalar_aggregate_options); aggregate::AddBasicAggKernels(aggregate::MeanInit, {boolean()}, float64(), func.get()); aggregate::AddBasicAggKernels(aggregate::MeanInit, NumericTypes(), float64(), func.get()); @@ -327,9 +347,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { #endif DCHECK_OK(registry->AddFunction(std::move(func))); - static auto default_minmax_options = MinMaxOptions::Defaults(); - func = std::make_shared("min_max", Arity::Unary(), - &min_max_doc, &default_minmax_options); + func = std::make_shared( + "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {boolean()}, func.get()); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, NumericTypes(), func.get()); // Add the SIMD variants for min max diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc index a70363aab9b..8d3e5a0409d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc @@ -25,20 +25,32 @@ namespace aggregate { // Sum implementation template -struct SumImplAvx2 : public SumImpl {}; +struct SumImplAvx2 : public SumImpl { + explicit SumImplAvx2(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; template -struct MeanImplAvx2 : public MeanImpl {}; +struct MeanImplAvx2 : public MeanImpl { + explicit MeanImplAvx2(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; Result> SumInitAvx2(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } Result> MeanInitAvx2(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } @@ -49,7 +61,7 @@ Result> MinMaxInitAvx2(KernelContext* ctx, const KernelInitArgs& args) { MinMaxInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options)); + static_cast(*args.options)); return visitor.Create(); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc index 1ecbd7041e6..4f8ad74a086 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc @@ -25,20 +25,32 @@ namespace aggregate { // Sum implementation template -struct SumImplAvx512 : public SumImpl {}; +struct SumImplAvx512 : public SumImpl { + explicit SumImplAvx512(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; template -struct MeanImplAvx512 : public MeanImpl {}; +struct MeanImplAvx512 : public MeanImpl { + explicit MeanImplAvx512(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; Result> SumInitAvx512(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } Result> MeanInitAvx512(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } @@ -49,7 +61,7 @@ Result> MinMaxInitAvx512(KernelContext* ctx, const KernelInitArgs& args) { MinMaxInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options)); + static_cast(*args.options)); return visitor.Create(); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index f8db180b1e3..86e321ba522 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -78,7 +78,7 @@ struct SumImpl : public ScalarAggregator { } Status Finalize(KernelContext*, Datum* out) override { - if (this->count == 0) { + if (this->count < options.min_count) { out->value = std::make_shared(); } else { out->value = MakeScalar(this->sum); @@ -88,12 +88,13 @@ struct SumImpl : public ScalarAggregator { size_t count = 0; typename SumType::c_type sum = 0; + ScalarAggregateOptions options; }; template struct MeanImpl : public SumImpl { Status Finalize(KernelContext*, Datum* out) override { - if (this->count == 0) { + if (this->count < options.min_count) { out->value = std::make_shared(); } else { const double mean = static_cast(this->sum) / this->count; @@ -101,6 +102,7 @@ struct MeanImpl : public SumImpl { } return Status::OK(); } + ScalarAggregateOptions options; }; template