From f0962d0b6be3f4a7d8322f0a255e61b2d94b0e51 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 9 Jul 2020 21:02:38 +0200 Subject: [PATCH 1/2] ARROW-8989: [C++][Doc] Document available compute functions Also fix glaring bugs in arithmetic kernels (signed overflow detection was broken). --- c_glib/arrow-glib/compute.cpp | 5 +- cpp/src/arrow/array/validate.cc | 7 +- cpp/src/arrow/compute/api.h | 4 + cpp/src/arrow/compute/api_aggregate.h | 61 +-- cpp/src/arrow/compute/api_scalar.h | 97 +++-- cpp/src/arrow/compute/api_vector.h | 37 +- cpp/src/arrow/compute/cast.cc | 2 +- cpp/src/arrow/compute/cast.h | 5 + cpp/src/arrow/compute/exec.h | 14 +- cpp/src/arrow/compute/function.h | 6 + .../arrow/compute/kernels/aggregate_basic.cc | 2 +- .../arrow/compute/kernels/aggregate_test.cc | 2 +- .../compute/kernels/scalar_arithmetic.cc | 28 +- .../compute/kernels/scalar_arithmetic_test.cc | 47 +- cpp/src/arrow/compute/registry.h | 2 +- cpp/src/arrow/util/int_util.h | 33 +- cpp/src/parquet/column_reader.cc | 7 +- docs/source/conf.py | 7 +- docs/source/cpp/api.rst | 1 + docs/source/cpp/api/compute.rst | 56 +++ docs/source/cpp/compute.rst | 409 ++++++++++++++++++ docs/source/cpp/getting_started.rst | 1 + docs/source/python/api/arrays.rst | 71 +-- docs/source/python/dataset.rst | 4 +- 24 files changed, 745 insertions(+), 163 deletions(-) create mode 100644 docs/source/cpp/api/compute.rst create mode 100644 docs/source/cpp/compute.rst diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index d8d0bdc41c6..3e318999d31 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -676,7 +676,7 @@ garrow_count_options_set_property(GObject *object, switch (prop_id) { case PROP_MODE: priv->options.count_mode = - static_cast(g_value_get_enum(value)); + static_cast(g_value_get_enum(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -706,7 +706,8 @@ static void garrow_count_options_init(GArrowCountOptions 
*object) { auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); - new(&priv->options) arrow::compute::CountOptions(arrow::compute::CountOptions::COUNT_ALL); + new(&priv->options) arrow::compute::CountOptions( + arrow::compute::CountOptions::COUNT_NON_NULL); } static void diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 3dd0ffd8901..8fb8b59b9bc 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -98,7 +98,7 @@ struct ValidateArrayVisitor { if (value_size < 0) { return Status::Invalid("FixedSizeListArray has negative value size ", value_size); } - if (HasMultiplyOverflow(len, value_size) || + if (HasPositiveMultiplyOverflow(len, value_size) || array.values()->length() != len * value_size) { return Status::Invalid("Values Length (", array.values()->length(), ") is not equal to the length (", len, @@ -329,7 +329,7 @@ Status ValidateArray(const Array& array) { type.ToString(), ", got ", data.buffers.size()); } // This check is required to avoid addition overflow below - if (HasAdditionOverflow(array.length(), array.offset())) { + if (HasPositiveAdditionOverflow(array.length(), array.offset())) { return Status::Invalid("Array of type ", type.ToString(), " has impossibly large length and offset"); } @@ -346,7 +346,8 @@ Status ValidateArray(const Array& array) { min_buffer_size = BitUtil::BytesForBits(array.length() + array.offset()); break; case DataTypeLayout::FIXED_WIDTH: - if (HasMultiplyOverflow(array.length() + array.offset(), spec.byte_width)) { + if (HasPositiveMultiplyOverflow(array.length() + array.offset(), + spec.byte_width)) { return Status::Invalid("Array of type ", type.ToString(), " has impossibly large length and offset"); } diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h index 3fc6e22b4be..a890cd362f8 100644 --- a/cpp/src/arrow/compute/api.h +++ b/cpp/src/arrow/compute/api.h @@ -20,6 +20,10 @@ #pragma once +/// \defgroup compute-concrete-options Concrete option 
classes for compute functions +/// @{ +/// @} + #include "arrow/compute/api_aggregate.h" // IWYU pragma: export #include "arrow/compute/api_scalar.h" // IWYU pragma: export #include "arrow/compute/api_vector.h" // IWYU pragma: export diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 82a4ebf76b6..72b31082c10 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -37,25 +37,47 @@ class ExecContext; // ---------------------------------------------------------------------- // Aggregate functions -/// \class CountOptions +/// \addtogroup compute-concrete-options +/// @{ + +/// \brief Control Count kernel behavior /// -/// The user control the Count kernel behavior with this class. By default, the -/// it will count all non-null values. +/// By default, all non-null values are counted. struct ARROW_EXPORT CountOptions : public FunctionOptions { - enum mode { - // Count all non-null values. - COUNT_ALL = 0, - // Count all null values. + enum Mode { + /// Count all non-null values. + COUNT_NON_NULL = 0, + /// Count all null values. 
COUNT_NULL, }; - explicit CountOptions(enum mode count_mode) : count_mode(count_mode) {} + explicit CountOptions(enum Mode count_mode) : count_mode(count_mode) {} + + static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); } + + enum Mode count_mode = COUNT_NON_NULL; +}; + +/// \brief Control MinMax kernel behavior +/// +/// By default, null values are ignored +struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { + enum Mode { + /// Skip null values + SKIP = 0, + /// Any nulls will result in null output + OUTPUT_NULL + }; + + explicit MinMaxOptions(enum Mode null_handling = SKIP) : null_handling(null_handling) {} - static CountOptions Defaults() { return CountOptions(COUNT_ALL); } + static MinMaxOptions Defaults() { return MinMaxOptions{}; } - enum mode count_mode = COUNT_ALL; + enum Mode null_handling = SKIP; }; +/// @} + /// \brief Count non-null (or null) values in an array. /// /// \param[in] options counting options, see CountOptions for more information @@ -91,25 +113,6 @@ Result Mean(const Datum& value, ExecContext* ctx = NULLPTR); ARROW_EXPORT Result Sum(const Datum& value, ExecContext* ctx = NULLPTR); -/// \class MinMaxOptions -/// -/// The user can control the MinMax kernel behavior with this class. By default, -/// it will skip null if there is a null value present. 
-struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { - enum mode { - /// skip null values - SKIP = 0, - /// any nulls will result in null output - OUTPUT_NULL - }; - - explicit MinMaxOptions(enum mode null_handling = SKIP) : null_handling(null_handling) {} - - static MinMaxOptions Defaults() { return MinMaxOptions{}; } - - enum mode null_handling = SKIP; -}; - /// \brief Calculate the min / max of a numeric array /// /// This function returns both the min and max as a struct scalar, with type diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 858e1ff6a19..1d8ef091815 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -33,13 +33,64 @@ namespace arrow { namespace compute { -// ---------------------------------------------------------------------- +/// \addtogroup compute-concrete-options +/// +/// @{ struct ArithmeticOptions : public FunctionOptions { ArithmeticOptions() : check_overflow(false) {} bool check_overflow; }; +struct ARROW_EXPORT BinaryContainsExactOptions : public FunctionOptions { + explicit BinaryContainsExactOptions(std::string pattern) + : pattern(std::move(pattern)) {} + + /// The exact pattern to look for inside input values. + std::string pattern; +}; + +/// Options for IsIn and Match functions +struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { + explicit SetLookupOptions(Datum value_set, bool skip_nulls) + : value_set(std::move(value_set)), skip_nulls(skip_nulls) {} + + /// The set of values to look up input values into. + Datum value_set; + /// Whether nulls in `value_set` count for lookup. + /// + /// If true, any null in `value_set` is ignored and nulls in the input + /// produce null (Match) or false (IsIn) values in the output. + /// If false, any null in `value_set` is successfully matched in + /// the input. 
+ bool skip_nulls; +}; + +struct ARROW_EXPORT StrptimeOptions : public FunctionOptions { + explicit StrptimeOptions(std::string format, TimeUnit::type unit) + : format(format), unit(unit) {} + + std::string format; + TimeUnit::type unit; +}; + +enum CompareOperator : int8_t { + EQUAL, + NOT_EQUAL, + GREATER, + GREATER_EQUAL, + LESS, + LESS_EQUAL, +}; + +struct CompareOptions : public FunctionOptions { + explicit CompareOptions(CompareOperator op) : op(op) {} + + enum CompareOperator op; +}; + +/// @} + /// \brief Add two values together. Array values must be the same length. If /// either addend is null the result will be null. /// @@ -79,21 +130,6 @@ Result Multiply(const Datum& left, const Datum& right, ArithmeticOptions options = ArithmeticOptions(), ExecContext* ctx = NULLPTR); -enum CompareOperator { - EQUAL, - NOT_EQUAL, - GREATER, - GREATER_EQUAL, - LESS, - LESS_EQUAL, -}; - -struct CompareOptions : public FunctionOptions { - explicit CompareOptions(CompareOperator op) : op(op) {} - - enum CompareOperator op; -}; - /// \brief Compare a numeric array with a scalar. 
/// /// \param[in] left datum to compare, must be an Array @@ -185,15 +221,6 @@ Result KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx = ARROW_EXPORT Result Xor(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); -/// For set lookup operations like IsIn, Match -struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { - explicit SetLookupOptions(Datum value_set, bool skip_nulls) - : value_set(std::move(value_set)), skip_nulls(skip_nulls) {} - - Datum value_set; - bool skip_nulls; -}; - /// \brief IsIn returns true for each element of `values` that is contained in /// `value_set` /// @@ -274,25 +301,5 @@ ARROW_EXPORT Result FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx = NULLPTR); -// ---------------------------------------------------------------------- -// String functions - -struct ARROW_EXPORT BinaryContainsExactOptions : public FunctionOptions { - explicit BinaryContainsExactOptions(std::string pattern) : pattern(pattern) {} - - std::string pattern; -}; - -// ---------------------------------------------------------------------- -// Temporal functions - -struct ARROW_EXPORT StrptimeOptions : public FunctionOptions { - explicit StrptimeOptions(std::string format, TimeUnit::type unit) - : format(format), unit(unit) {} - - std::string format; - TimeUnit::type unit; -}; - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 28812c32f8c..c3e9dc9d2fc 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -29,6 +29,9 @@ namespace compute { class ExecContext; +/// \addtogroup compute-concrete-options +/// @{ + struct FilterOptions : public FunctionOptions { /// Configure the action taken when a slot of the selection mask is null enum NullSelectionBehavior { @@ -46,6 +49,25 @@ struct FilterOptions : public FunctionOptions { NullSelectionBehavior null_selection_behavior = DROP; }; +struct 
ARROW_EXPORT TakeOptions : public FunctionOptions { + explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {} + + bool boundscheck = true; + static TakeOptions BoundsCheck() { return TakeOptions(true); } + static TakeOptions NoBoundsCheck() { return TakeOptions(false); } + static TakeOptions Defaults() { return BoundsCheck(); } +}; + +/// \brief Partitioning options for NthToIndices +struct PartitionOptions : public FunctionOptions { + explicit PartitionOptions(int64_t pivot) : pivot(pivot) {} + + /// The index into the equivalent sorted array of the partition pivot element. + int64_t pivot; +}; + +/// @} + /// \brief Filter with a boolean selection filter /// /// The output will be populated with values from the input at positions @@ -85,15 +107,6 @@ Result> GetTakeIndices( } // namespace internal -struct ARROW_EXPORT TakeOptions : public FunctionOptions { - explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {} - - bool boundscheck = true; - static TakeOptions BoundsCheck() { return TakeOptions(true); } - static TakeOptions NoBoundsCheck() { return TakeOptions(false); } - static TakeOptions Defaults() { return BoundsCheck(); } -}; - /// \brief Take from an array of values at indices in another array /// /// The output array will be of the same type as the input values @@ -121,11 +134,6 @@ Result> Take(const Array& values, const Array& indices, const TakeOptions& options = TakeOptions::Defaults(), ExecContext* ctx = NULLPTR); -struct PartitionOptions : public FunctionOptions { - explicit PartitionOptions(int64_t pivot) : pivot(pivot) {} - int64_t pivot; -}; - /// \brief Returns indices that partition an array around n-th /// sorted element. 
/// @@ -178,6 +186,7 @@ ARROW_EXPORT extern const char kValuesFieldName[]; ARROW_EXPORT extern const char kCountsFieldName[]; ARROW_EXPORT extern const int32_t kValuesFieldIndex; ARROW_EXPORT extern const int32_t kCountsFieldIndex; + /// \brief Return counts of unique elements from an array-like object. /// /// Note that the counts do not include counts for nulls in the array. These can be diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index 9c8ea6675a8..211e5a2054a 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -136,7 +136,7 @@ Result CastFunction::DispatchExact( // Validate arity if (passed_num_args != 1) { - return Status::Invalid("Cast sunctions accept 1 argument but passed ", + return Status::Invalid("Cast functions accept 1 argument but passed ", passed_num_args); } std::vector candidate_kernels; diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index 907eef3cff9..82dd357e9dd 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -38,6 +38,9 @@ namespace compute { class ExecContext; +/// \addtogroup compute-concrete-options +/// @{ + struct ARROW_EXPORT CastOptions : public FunctionOptions { CastOptions() : allow_int_overflow(false), @@ -73,6 +76,8 @@ struct ARROW_EXPORT CastOptions : public FunctionOptions { bool allow_invalid_utf8; }; +/// @} + // Cast functions are _not_ registered in the FunctionRegistry, though they use // the same execution machinery class CastFunction : public ScalarFunction { diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index aae37c7a739..142e149bccc 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -211,18 +211,26 @@ struct ExecBatch { } }; -/// \brief One-shot invoker for all types of functions. 
Does kernel dispatch, -/// argument checking, iteration of ChunkedArray inputs, and wrapping of -/// outputs +/// \defgroup compute-call-function One-shot calls to compute functions +/// +/// @{ + +/// \brief One-shot invoker for all types of functions. +/// +/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs, +/// and wrapping of outputs. ARROW_EXPORT Result CallFunction(const std::string& func_name, const std::vector& args, const FunctionOptions* options, ExecContext* ctx = NULLPTR); /// \brief Variant of CallFunction which uses a function's default options. +/// /// NB: Some functions require FunctionOptions be provided. ARROW_EXPORT Result CallFunction(const std::string& func_name, const std::vector& args, ExecContext* ctx = NULLPTR); +/// @} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 67af4df471a..93a200ee212 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -35,6 +35,10 @@ namespace arrow { namespace compute { +/// \defgroup compute-functions Abstract compute function API +/// +/// @{ + /// \brief Base class for specifying options configuring a function's behavior, /// such as error handling. 
struct ARROW_EXPORT FunctionOptions {}; @@ -277,5 +281,7 @@ class ARROW_EXPORT MetaFunction : public Function { : Function(std::move(name), Function::META, arity, default_options) {} }; +/// @} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 53e89ce243a..8765914ac73 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -61,7 +61,7 @@ struct CountImpl : public ScalarAggregator { void Finalize(KernelContext* ctx, Datum* out) override { const auto& state = checked_cast(*ctx->state()); switch (state.options.count_mode) { - case CountOptions::COUNT_ALL: + case CountOptions::COUNT_NON_NULL: *out = Datum(state.non_nulls); break; case CountOptions::COUNT_NULL: diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 3b2d4e099ce..db548f27b36 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -277,7 +277,7 @@ static CountPair NaiveCount(const Array& array) { } void ValidateCount(const Array& input, CountPair expected) { - CountOptions all = CountOptions(CountOptions::COUNT_ALL); + CountOptions all = CountOptions(CountOptions::COUNT_NON_NULL); CountOptions nulls = CountOptions(CountOptions::COUNT_NULL); ASSERT_OK_AND_ASSIGN(Datum result, Count(input, all)); diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index 82a8f1521ce..1f0cd3785a7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -17,6 +17,7 @@ #include "arrow/compute/kernels/common.h" #include "arrow/util/int_util.h" +#include "arrow/util/macros.h" #ifndef __has_builtin #define __has_builtin(x) 0 @@ -66,7 +67,7 @@ struct Add { template static constexpr enable_if_signed_integer 
Call(KernelContext*, T left, T right) { - return to_unsigned(left) + to_unsigned(right); + return arrow::internal::SafeSignedAdd(left, right); } }; @@ -75,7 +76,7 @@ struct AddChecked { template static enable_if_integer Call(KernelContext* ctx, T left, T right) { T result; - if (__builtin_add_overflow(left, right, &result)) { + if (ARROW_PREDICT_FALSE(__builtin_add_overflow(left, right, &result))) { ctx->SetStatus(Status::Invalid("overflow")); } return result; @@ -83,7 +84,7 @@ struct AddChecked { #else template static enable_if_unsigned_integer Call(KernelContext* ctx, T left, T right) { - if (arrow::internal::HasAdditionOverflow(left, right)) { + if (ARROW_PREDICT_FALSE(arrow::internal::HasPositiveAdditionOverflow(left, right))) { ctx->SetStatus(Status::Invalid("overflow")); } return left + right; @@ -91,12 +92,10 @@ struct AddChecked { template static enable_if_signed_integer Call(KernelContext* ctx, T left, T right) { - auto unsigned_left = to_unsigned(left); - auto unsigned_right = to_unsigned(right); - if (arrow::internal::HasAdditionOverflow(unsigned_left, unsigned_right)) { + if (ARROW_PREDICT_FALSE(arrow::internal::HasSignedAdditionOverflow(left, right))) { ctx->SetStatus(Status::Invalid("overflow")); } - return unsigned_left + unsigned_right; + return left + right; } #endif @@ -119,7 +118,7 @@ struct Subtract { template static constexpr enable_if_signed_integer Call(KernelContext*, T left, T right) { - return to_unsigned(left) - to_unsigned(right); + return arrow::internal::SafeSignedSubtract(left, right); } }; @@ -128,7 +127,7 @@ struct SubtractChecked { template static enable_if_integer Call(KernelContext* ctx, T left, T right) { T result; - if (__builtin_sub_overflow(left, right, &result)) { + if (ARROW_PREDICT_FALSE(__builtin_sub_overflow(left, right, &result))) { ctx->SetStatus(Status::Invalid("overflow")); } return result; @@ -136,7 +135,8 @@ struct SubtractChecked { #else template static enable_if_unsigned_integer Call(KernelContext* ctx, T left, T 
right) { - if (arrow::internal::HasSubtractionOverflow(left, right)) { + if (ARROW_PREDICT_FALSE( + arrow::internal::HasPositiveSubtractionOverflow(left, right))) { ctx->SetStatus(Status::Invalid("overflow")); } return left - right; @@ -144,10 +144,10 @@ struct SubtractChecked { template static enable_if_signed_integer Call(KernelContext* ctx, T left, T right) { - if (arrow::internal::HasSubtractionOverflow(left, right)) { + if (ARROW_PREDICT_FALSE(arrow::internal::HasSignedSubtractionOverflow(left, right))) { ctx->SetStatus(Status::Invalid("overflow")); } - return to_unsigned(left) - to_unsigned(right); + return left - right; } #endif @@ -201,12 +201,12 @@ struct MultiplyChecked { static enable_if_integer Call(KernelContext* ctx, T left, T right) { T result; #if __has_builtin(__builtin_mul_overflow) - if (__builtin_mul_overflow(left, right, &result)) { + if (ARROW_PREDICT_FALSE(__builtin_mul_overflow(left, right, &result))) { ctx->SetStatus(Status::Invalid("overflow")); } #else result = Multiply::Call(ctx, left, right); - if (left != 0 && result / left != right) { + if (left != 0 && ARROW_PREDICT_FALSE(result / left != right)) { ctx->SetStatus(Status::Invalid("overflow")); } #endif diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index e0f4890d7eb..ceb46235353 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -235,9 +235,6 @@ TYPED_TEST(TestBinaryArithmeticSigned, OverflowWraps) { auto min = std::numeric_limits::lowest(); auto max = std::numeric_limits::max(); - this->AssertBinop(Add, MakeArray(min, max, max), MakeArray(CType(-1), 1, max), - MakeArray(max, min, CType(-2))); - this->AssertBinop(Subtract, MakeArray(min, max, min), MakeArray(1, max, max), MakeArray(max, 0, 1)); this->AssertBinop(Multiply, MakeArray(min, max, max), MakeArray(max, 2, max), @@ -261,7 +258,41 @@ 
TYPED_TEST(TestBinaryArithmeticIntegral, OverflowRaises) { "overflow"); } -TYPED_TEST(TestBinaryArithmeticSigned, OverflowRaises) { +TYPED_TEST(TestBinaryArithmeticSigned, AddOverflowRaises) { + using CType = typename TestFixture::CType; + + auto min = std::numeric_limits::lowest(); + auto max = std::numeric_limits::max(); + + this->SetOverflowCheck(true); + + this->AssertBinop(Add, MakeArray(max), MakeArray(-1), MakeArray(max - 1)); + this->AssertBinop(Add, MakeArray(min), MakeArray(1), MakeArray(min + 1)); + this->AssertBinop(Add, MakeArray(-1), MakeArray(2), MakeArray(1)); + this->AssertBinop(Add, MakeArray(1), MakeArray(-2), MakeArray(-1)); + + this->AssertBinopRaises(Add, MakeArray(max), MakeArray(1), "overflow"); + this->AssertBinopRaises(Add, MakeArray(min), MakeArray(-1), "overflow"); +} + +TYPED_TEST(TestBinaryArithmeticSigned, SubOverflowRaises) { + using CType = typename TestFixture::CType; + + auto min = std::numeric_limits::lowest(); + auto max = std::numeric_limits::max(); + + this->SetOverflowCheck(true); + + this->AssertBinop(Subtract, MakeArray(max), MakeArray(1), MakeArray(max - 1)); + this->AssertBinop(Subtract, MakeArray(min), MakeArray(-1), MakeArray(min + 1)); + this->AssertBinop(Subtract, MakeArray(-1), MakeArray(-2), MakeArray(1)); + this->AssertBinop(Subtract, MakeArray(1), MakeArray(2), MakeArray(-1)); + + this->AssertBinopRaises(Subtract, MakeArray(max), MakeArray(-1), "overflow"); + this->AssertBinopRaises(Subtract, MakeArray(min), MakeArray(1), "overflow"); +} + +TYPED_TEST(TestBinaryArithmeticSigned, MulOverflowRaises) { using CType = typename TestFixture::CType; auto min = std::numeric_limits::lowest(); @@ -270,8 +301,16 @@ TYPED_TEST(TestBinaryArithmeticSigned, OverflowRaises) { this->SetOverflowCheck(true); this->AssertBinop(Multiply, MakeArray(max), MakeArray(-1), MakeArray(min + 1)); + this->AssertBinop(Multiply, MakeArray(max / 2), MakeArray(-2), MakeArray(min + 2)); + this->AssertBinopRaises(Multiply, MakeArray(max), 
MakeArray(2), "overflow"); + this->AssertBinopRaises(Multiply, MakeArray(max / 2), MakeArray(3), "overflow"); + this->AssertBinopRaises(Multiply, MakeArray(max / 2), MakeArray(-3), "overflow"); + + this->AssertBinopRaises(Multiply, MakeArray(min), MakeArray(2), "overflow"); + this->AssertBinopRaises(Multiply, MakeArray(min / 2), MakeArray(3), "overflow"); this->AssertBinopRaises(Multiply, MakeArray(min), MakeArray(-1), "overflow"); + this->AssertBinopRaises(Multiply, MakeArray(min / 2), MakeArray(-2), "overflow"); } TYPED_TEST(TestBinaryArithmeticUnsigned, OverflowWraps) { diff --git a/cpp/src/arrow/compute/registry.h b/cpp/src/arrow/compute/registry.h index bb3ded47b9e..2d4c40b8dd2 100644 --- a/cpp/src/arrow/compute/registry.h +++ b/cpp/src/arrow/compute/registry.h @@ -72,7 +72,7 @@ class ARROW_EXPORT FunctionRegistry { std::unique_ptr impl_; }; -// \brief Return the process-global function registry +/// \brief Return the process-global function registry ARROW_EXPORT FunctionRegistry* GetFunctionRegistry(); } // namespace compute diff --git a/cpp/src/arrow/util/int_util.h b/cpp/src/arrow/util/int_util.h index c4ed0eb7d5b..59e4f9c4946 100644 --- a/cpp/src/arrow/util/int_util.h +++ b/cpp/src/arrow/util/int_util.h @@ -83,6 +83,14 @@ SignedInt SafeSignedAdd(SignedInt u, SignedInt v) { static_cast(v)); } +/// Signed subtraction with well-defined behaviour on overflow (as unsigned) +template +SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) { + using UnsignedInt = typename std::make_unsigned::type; + return static_cast(static_cast(u) - + static_cast(v)); +} + /// Signed left shift with well-defined behaviour on negative numbers or overflow template SignedInt SafeLeftShift(SignedInt u, Shift shift) { @@ -90,25 +98,42 @@ SignedInt SafeLeftShift(SignedInt u, Shift shift) { return static_cast(static_cast(u) << shift); } +// TODO Add portable wrappers for __builtin_add_overflow and friends +// see http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2428.pdf + /// Detect 
multiplication overflow between *positive* integers template -bool HasMultiplyOverflow(Integer value, Integer multiplicand) { +bool HasPositiveMultiplyOverflow(Integer value, Integer multiplicand) { return (multiplicand != 0 && value > std::numeric_limits::max() / multiplicand); } /// Detect addition overflow between *positive* integers template -bool HasAdditionOverflow(Integer value, Integer addend) { +bool HasPositiveAdditionOverflow(Integer value, Integer addend) { return (value > std::numeric_limits::max() - addend); } -/// Detect addition overflow between integers +/// Detect addition overflow between signed integers template -bool HasSubtractionOverflow(Integer value, Integer minuend) { +bool HasSignedAdditionOverflow(Integer value, Integer addend) { + return (addend > 0) ? (value > std::numeric_limits::max() - addend) + : (value < std::numeric_limits::min() - addend); +} + +/// Detect subtraction overflow between *positive* integers +template +bool HasPositiveSubtractionOverflow(Integer value, Integer minuend) { return (value < minuend); } +/// Detect subtraction overflow between signed integers +template +bool HasSignedSubtractionOverflow(Integer value, Integer subtrahend) { + return (subtrahend > 0) ? 
(value < std::numeric_limits::min() + subtrahend) + : (value > std::numeric_limits::max() + subtrahend); +} + /// Upcast an integer to the largest possible width (currently 64 bits) template diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 27a3a92341c..0bfc303dba0 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1028,7 +1028,7 @@ class TypedRecordReader : public ColumnReaderImplBase, // Compute the values capacity in bytes for the given number of elements int64_t bytes_for_values(int64_t nitems) const { int64_t type_size = GetTypeByteSize(this->descr_->physical_type()); - if (::arrow::internal::HasMultiplyOverflow(nitems, type_size)) { + if (::arrow::internal::HasPositiveMultiplyOverflow(nitems, type_size)) { throw ParquetException("Total size of items too large"); } return nitems * type_size; @@ -1184,7 +1184,7 @@ class TypedRecordReader : public ColumnReaderImplBase, if (extra_size < 0) { throw ParquetException("Negative size (corrupt file?)"); } - if (::arrow::internal::HasAdditionOverflow(size, extra_size)) { + if (::arrow::internal::HasPositiveAdditionOverflow(size, extra_size)) { throw ParquetException("Allocation size too large (corrupt file?)"); } const int64_t target_size = size + extra_size; @@ -1203,7 +1203,8 @@ class TypedRecordReader : public ColumnReaderImplBase, UpdateCapacity(levels_capacity_, levels_written_, extra_levels); if (new_levels_capacity > levels_capacity_) { constexpr auto kItemSize = static_cast(sizeof(int16_t)); - if (::arrow::internal::HasMultiplyOverflow(new_levels_capacity, kItemSize)) { + if (::arrow::internal::HasPositiveMultiplyOverflow(new_levels_capacity, + kItemSize)) { throw ParquetException("Allocation size too large (corrupt file?)"); } PARQUET_THROW_NOT_OK(def_levels_->Resize(new_levels_capacity * kItemSize, false)); diff --git a/docs/source/conf.py b/docs/source/conf.py index 2013a7235fd..4508faa8a62 100644 --- a/docs/source/conf.py +++ 
b/docs/source/conf.py @@ -74,6 +74,10 @@ 'inherited-members': None } +# Breathe configuration +breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} +breathe_default_project = "arrow_cpp" + # Overriden conditionally below autodoc_mock_imports = [] @@ -86,9 +90,6 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} -breathe_default_project = "arrow_cpp" - # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst index 9b7d356980b..68323f15d4d 100644 --- a/docs/source/cpp/api.rst +++ b/docs/source/cpp/api.rst @@ -28,6 +28,7 @@ API Reference api/array api/builder api/table + api/compute api/tensor api/utilities api/io diff --git a/docs/source/cpp/api/compute.rst b/docs/source/cpp/api/compute.rst new file mode 100644 index 00000000000..3b0a89f83f8 --- /dev/null +++ b/docs/source/cpp/api/compute.rst @@ -0,0 +1,56 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Compute Functions +================= + +Datum class +----------- + +.. 
doxygenclass:: arrow::Datum + :members: + +Abstract Function classes +------------------------- + +.. doxygengroup:: compute-functions + :content-only: + :members: + +Function registry +----------------- + +.. doxygenclass:: arrow::compute::FunctionRegistry + :members: + +.. doxygenfunction:: arrow::compute::GetFunctionRegistry + +Convenience functions +--------------------- + +.. doxygengroup:: compute-call-function + :content-only: + +Concrete options classes +------------------------ + +.. doxygengroup:: compute-concrete-options + :content-only: + :members: + :undoc-members: + +.. TODO: List concrete function invocation shortcuts? diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst new file mode 100644 index 00000000000..6cc2baaa673 --- /dev/null +++ b/docs/source/cpp/compute.rst @@ -0,0 +1,409 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp +.. cpp:namespace:: arrow::compute + +================= +Compute Functions +================= + +.. 
TODO: describe API and how to invoke compute functions + +Available functions +=================== + +Aggregations +------------ + ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++==========================+============+====================+=======================+============================================+ +| count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| mean | Unary | Numeric | Scalar Float64 | | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| minmax | Unary | Numeric | Scalar Struct (1) | :struct:`MinMaxOptions` | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| sum | Unary | Numeric | Scalar Numeric (2) | | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ + +Notes: + +* \(1) Output is a ``{"min": input type, "max": input type}`` Struct + +* \(2) Output is Int64, UInt64 or Float64, depending on the input type + + +Element-wise ("scalar") functions +--------------------------------- + +Arithmetic functions +~~~~~~~~~~~~~~~~~~~~ + +Those functions expect two inputs of the same type and apply a given binary +operation to each pair of elements gathered from the inputs. Each function +is also available in an overflow-checking variant, suffixed ``_checked``. + +If any of the input elements in a pair is null, the corresponding output +element is null. 
+ ++--------------------------+------------+--------------------+---------------------+ +| Function name | Arity | Input types | Output type | ++==========================+============+====================+=====================+ +| add | Binary | Numeric | Numeric | ++--------------------------+------------+--------------------+---------------------+ +| add_checked | Binary | Numeric | Numeric | ++--------------------------+------------+--------------------+---------------------+ +| multiply | Binary | Numeric | Numeric | ++--------------------------+------------+--------------------+---------------------+ +| multiply_checked | Binary | Numeric | Numeric | ++--------------------------+------------+--------------------+---------------------+ +| subtract | Binary | Numeric | Numeric | ++--------------------------+------------+--------------------+---------------------+ +| subtract_checked | Binary | Numeric | Numeric | ++--------------------------+------------+--------------------+---------------------+ + +Comparisons +~~~~~~~~~~~ + +Those functions expect two inputs of the same type and apply a given +comparison operator. If any of the input elements in a pair is null, +the corresponding output element is null. 
+ ++--------------------------+------------+---------------------------------------------+---------------------+ +| Function names | Arity | Input types | Output type | ++==========================+============+=============================================+=====================+ +| equal, not_equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean | ++--------------------------+------------+---------------------------------------------+---------------------+ +| greater, greater_equal, | Binary | Numeric, Temporal, Binary- and String-like | Boolean | +| less, less_equal | | | | ++--------------------------+------------+---------------------------------------------+---------------------+ + +Logical functions +~~~~~~~~~~~~~~~~~~ + +The normal behaviour for these functions is to emit a null if any of the +inputs is null. + +Some of them are also available in a "`Kleene logic`_" variant (suffixed +``_kleene``) where null is taken to mean "undefined". For those variants +therefore: + +* "true AND null", "null AND true" give "null" (the result is undefined) +* "true OR null", "null OR true" give "true" +* "false AND null", "null AND false" give "false" +* "false OR null", "null OR false" give "null" (the result is undefined) + ++--------------------------+------------+--------------------+---------------------+ +| Function name | Arity | Input types | Output type | ++==========================+============+====================+=====================+ +| and | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| and_kleene | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| invert | Unary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| or | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| or_kleene | 
Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| xor | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ + +.. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics + +String functions +~~~~~~~~~~~~~~~~ + ++--------------------------+------------+--------------------+---------------------+---------+ +| Function name | Arity | Input types | Output type | Notes | ++==========================+============+====================+=====================+=========+ +| ascii_length | Unary | String-like | Int32 or Int64 | \(1) | ++--------------------------+------------+--------------------+---------------------+---------+ +| ascii_lower | Unary | String-like | String-like | \(2) | ++--------------------------+------------+--------------------+---------------------+---------+ +| ascii_upper | Unary | String-like | String-like | \(2) | ++--------------------------+------------+--------------------+---------------------+---------+ +| utf8_lower | Unary | String-like | String-like | \(3) | ++--------------------------+------------+--------------------+---------------------+---------+ +| utf8_upper | Unary | String-like | String-like | \(3) | ++--------------------------+------------+--------------------+---------------------+---------+ + +* \(1) Output is the physical length in bytes of each input element. + +* \(2) Each ASCII character in the input is converted to lowercase or + uppercase. Non-ASCII characters are left untouched. + +* \(3) Each UTF8-encoded character in the input is converted to lowercase or + uppercase. 
+ +Containment tests +~~~~~~~~~~~~~~~~~ + ++--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++==========================+============+==================================+=======================+============================================+ +| binary_contains_exact | Unary | String-like | Boolean (1) | :struct:`BinaryContainsExactOptions` | ++--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ +| isin | Unary | Binary- and String-like | Boolean (2) | :struct:`SetLookupOptions` | ++--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ +| isin | Unary | Null | Boolean (2) | :struct:`SetLookupOptions` | ++--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ +| isin | Unary | Boolean,Numeric, Temporal | Boolean (2) | :struct:`SetLookupOptions` | ++--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ +| match | Unary | Binary- and String-like | Int32 (3) | :struct:`SetLookupOptions` | ++--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ +| match | Unary | Null | Int32 (3) | :struct:`SetLookupOptions` | ++--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ +| match | Unary | Boolean,Numeric, Temporal | Int32 (3) | :struct:`SetLookupOptions` | 
++--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ + +* \(1) Output is true iff :member:`BinaryContainsExactOptions::pattern` + is a substring of the corresponding input element. + +* \(2) Output is true iff the corresponding input element is equal to one + of the elements in :member:`SetLookupOptions::value_set`. + +* \(3) Output is the index of the corresponding input element in + :member:`SetLookupOptions::value_set`, if found there. Otherwise, + output is null. + +Structural transforms +~~~~~~~~~~~~~~~~~~~~~ + ++--------------------------+------------+--------------------+---------------------+---------+ +| Function name | Arity | Input types | Output type | Notes | ++==========================+============+====================+=====================+=========+ +| is_null | Unary | Any | Boolean | \(1) | ++--------------------------+------------+--------------------+---------------------+---------+ +| is_valid | Unary | Any | Boolean | \(2) | ++--------------------------+------------+--------------------+---------------------+---------+ +| list_value_lengths | Unary | List-like | Int32 or Int64 | \(3) | ++--------------------------+------------+--------------------+---------------------+---------+ + +* \(1) Output is true iff the corresponding input element is null. + +* \(2) Output is true iff the corresponding input element is non-null. + +* \(3) Each output element is the length of the corresponding input element + (null if input is null). + +Conversions +~~~~~~~~~~~ + +A general conversion function named ``cast`` is provided which accepts a large +number of input and output types. The type to cast to can be passed in a +:struct:`CastOptions` instance.
+ ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++==========================+============+====================+=======================+============================================+ +| cast | Unary | Many | Variable | :struct:`CastOptions` | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ + +The conversions available with ``cast`` are listed below. In all cases, a +null input value is converted into a null output value. + +**Truth value extraction** + ++-----------------------------+------------------------------------+--------------+ +| Input type | Output type | Notes | ++=============================+====================================+==============+ +| Binary- and String-like | Boolean | \(1) | ++-----------------------------+------------------------------------+--------------+ +| Numeric | Boolean | \(2) | ++-----------------------------+------------------------------------+--------------+ + +* \(1) Output is true iff the corresponding input value has non-zero length. + +* \(2) Output is true iff the corresponding input value is non-zero. 
+ +**Same-kind conversion** + ++-----------------------------+------------------------------------+--------------+ +| Input type | Output type | Notes | ++=============================+====================================+==============+ +| Int32 | 32-bit Temporal | \(1) | ++-----------------------------+------------------------------------+--------------+ +| Int64 | 64-bit Temporal | \(1) | ++-----------------------------+------------------------------------+--------------+ +| (Large)Binary | (Large)String | \(2) | ++-----------------------------+------------------------------------+--------------+ +| (Large)String | (Large)Binary | \(3) | ++-----------------------------+------------------------------------+--------------+ +| Numeric | Numeric | \(4) \(5) | ++-----------------------------+------------------------------------+--------------+ +| 32-bit Temporal | Int32 | \(1) | ++-----------------------------+------------------------------------+--------------+ +| 64-bit Temporal | Int64 | \(1) | ++-----------------------------+------------------------------------+--------------+ +| Temporal | Temporal | \(4) \(5) | ++-----------------------------+------------------------------------+--------------+ + +* \(1) No-operation cast: the raw values are kept identical, only + the type is changed. + +* \(2) Validates the contents if :member:`CastOptions::allow_invalid_utf8` + is false. + +* \(3) No-operation cast: only the type is changed. + +* \(4) Overflow and truncation checks are enabled depending on + the given :struct:`CastOptions`. + +* \(5) Not all such casts have been implemented. 
+ +**String representations** + ++-----------------------------+------------------------------------+---------+ +| Input type | Output type | Notes | ++=============================+====================================+=========+ +| Boolean | String-like | | ++-----------------------------+------------------------------------+---------+ +| Numeric | String-like | | ++-----------------------------+------------------------------------+---------+ + +**Generic conversions** + ++-----------------------------+------------------------------------+---------+ +| Input type | Output type | Notes | ++=============================+====================================+=========+ +| Dictionary | Dictionary value type | | ++-----------------------------+------------------------------------+---------+ +| Extension | Extension storage type | | ++-----------------------------+------------------------------------+---------+ +| List-like | List-like | \(1) | ++-----------------------------+------------------------------------+---------+ +| Null | Any | | ++-----------------------------+------------------------------------+---------+ + +* \(1) The list offsets are unchanged, the list values are cast from the + input value type to the output value type (if a conversion is + available). + + +.. 
TODO: add C++ cast example + +Array-wise ("vector") functions +------------------------------- + +Associative transforms +~~~~~~~~~~~~~~~~~~~~~~ + ++--------------------------+------------+----------------------------+----------------------------+ +| Function name | Arity | Input types | Output type | ++==========================+============+============================+============================+ +| dictionary_encode | Unary | Binary- and String-like | Dictionary (1) | ++--------------------------+------------+----------------------------+----------------------------+ +| dictionary_encode | Unary | Boolean, Numeric, Temporal | Dictionary (1) | ++--------------------------+------------+----------------------------+----------------------------+ +| dictionary_encode | Unary | Null | Dictionary (1) | ++--------------------------+------------+----------------------------+----------------------------+ +| unique | Unary | Binary- and String-like | Input type (2) | ++--------------------------+------------+----------------------------+----------------------------+ +| unique | Unary | Boolean, Numeric, Temporal | Input type (2) | ++--------------------------+------------+----------------------------+----------------------------+ +| unique | Unary | Null | Input type (2) | ++--------------------------+------------+----------------------------+----------------------------+ +| value_counts | Unary | Binary- and String-like | Struct (3) | ++--------------------------+------------+----------------------------+----------------------------+ +| value_counts | Unary | Boolean, Numeric, Temporal | Struct (3) | ++--------------------------+------------+----------------------------+----------------------------+ +| value_counts | Unary | Null | Struct (3) | ++--------------------------+------------+----------------------------+----------------------------+ + +* \(1) Output is ``Dictionary(Int32, input type)``. 
+ +* \(2) Duplicates are removed from the output while the original order is + maintained. + +* \(3) Output is a ``{"values": input type, "counts": Int64}`` Struct. + Each output element corresponds to a unique value in the input, along + with the number of times this value has appeared. + +Selections +~~~~~~~~~~ + +These functions select a subset of the first input defined by the second input. + ++-----------------+------------+---------------+--------------+------------------+-------------------------+-------------+ +| Function name | Arity | Input type 1 | Input type 2 | Output type | Options class | Notes | ++=================+============+===============+==============+==================+=========================+=============+ +| filter | Binary | Any (1) | Boolean | Input type 1 | :struct:`FilterOptions` | \(2) | ++-----------------+------------+---------------+--------------+------------------+-------------------------+-------------+ +| take | Binary | Any (1) | Integer | Input type 1 | :struct:`TakeOptions` | \(3) | ++-----------------+------------+---------------+--------------+------------------+-------------------------+-------------+ + +* \(1) Unions are unsupported. + +* \(2) Each element in input 1 is appended to the output iff the corresponding + element in input 2 is true. + +* \(3) For each element *i* in input 2, the *i*'th element in input 1 is + appended to the output. + +Sorts and partitions +~~~~~~~~~~~~~~~~~~~~ + +In these functions, nulls are considered greater than any other value +(they will be sorted or partitioned at the end of the array). 
+ ++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+============+=========================+===================+================================+=============+ +| partition_indices | Unary | Binary- and String-like | UInt64 | :struct:`PartitionOptions` | \(1) \(3) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ +| partition_indices | Unary | Numeric | UInt64 | :struct:`PartitionOptions` | \(1) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ +| sort_indices | Unary | Binary- and String-like | UInt64 | | \(2) \(3) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ +| sort_indices | Unary | Numeric | UInt64 | | \(2) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ + +* \(1) The output is an array of indices into the input array, that define + a partition around the *N*'th input array element in sorted order. *N* is + given in :member:`PartitionOptions::pivot`. + +* \(2) The output is an array of indices into the input array, that define + a non-stable sort of the input array. + +* \(3) Input values are ordered lexicographically as bytestrings (even + for String arrays). 
+ + +Structural transforms +~~~~~~~~~~~~~~~~~~~~~ + ++--------------------------+------------+--------------------+---------------------+---------+ +| Function name | Arity | Input types | Output type | Notes | ++==========================+============+====================+=====================+=========+ +| list_flatten | Unary | List-like | List value type | \(1) | ++--------------------------+------------+--------------------+---------------------+---------+ +| list_parent_indices | Unary | List-like | Int32 or Int64 | \(2) | ++--------------------------+------------+--------------------+---------------------+---------+ + +* \(1) The top level of nesting is removed: all values in the list child array, + including nulls, are appended to the output. However, nulls in the parent + list array are discarded. + +* \(2) For each value in the list child array, the index at which it is found + in the list array is appended to the output. Nulls in the parent list array + are discarded. diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 5ec0dec43c4..0927e7f6756 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -30,6 +30,7 @@ User Guide arrays datatypes tables + compute io parquet csv diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index fa4902d852d..81a00d8de3d 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -75,43 +75,48 @@ may expose data type-specific methods or properties. .. _api.scalar: -Array Scalars -------------- +Scalars +------- -Indexing an array wraps the represented value in a scalar object whose -concrete type depends on the array data type. You shouldn't instantiate -any of those classes directly. +This function constructs a new Arrow scalar: + +.. autosummary:: + :toctree: ../generated/ + + scalar + +A scalar's python class depends on its data type. 
Concrete scalar +classes may expose data type-specific methods or properties. .. autosummary:: :toctree: ../generated/ NA Scalar - ArrayValue - BooleanValue - Int8Value - Int16Value - Int32Value - Int64Value - UInt8Value - UInt16Value - UInt32Value - UInt64Value - FloatValue - DoubleValue - BinaryValue - StringValue - FixedSizeBinaryValue - LargeBinaryValue - LargeStringValue - Time32Value - Time64Value - Date32Value - Date64Value - TimestampValue - DecimalValue - DictionaryValue - ListValue - LargeListValue - StructValue - UnionValue + BooleanScalar + Int8Scalar + Int16Scalar + Int32Scalar + Int64Scalar + UInt8Scalar + UInt16Scalar + UInt32Scalar + UInt64Scalar + FloatScalar + DoubleScalar + BinaryScalar + StringScalar + FixedSizeBinaryScalar + LargeBinaryScalar + LargeStringScalar + Time32Scalar + Time64Scalar + Date32Scalar + Date64Scalar + TimestampScalar + Decimal128Scalar + DictionaryScalar + ListScalar + LargeListScalar + StructScalar + UnionScalar diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index 3d9983415ce..1d2240ed8a3 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -329,8 +329,8 @@ Reading from Minio ------------------ In addition to cloud storage, pyarrow also supports reading from a -`MinIO https://github.com/minio/minio`_ object storage instance emulating S3 -APIs. Paired with `toxiproxy https://github.com/shopify/toxiproxy`_, this is +`MinIO <https://github.com/minio/minio>`_ object storage instance emulating S3 +APIs. Paired with `toxiproxy <https://github.com/shopify/toxiproxy>`_, this is useful for testing or benchmarking. ..
code-block:: python From 721daa6d7b931d0429d890880f74239ee7f7195c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 13 Jul 2020 15:30:21 +0200 Subject: [PATCH 2/2] Address review comments, add fill_null --- cpp/src/arrow/scalar.h | 40 +++++- docs/source/cpp/api.rst | 1 + docs/source/cpp/compute.rst | 251 ++++++++++++++++++++++++++---------- 3 files changed, 222 insertions(+), 70 deletions(-) diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index eadeff5b124..8f7454a75cc 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -42,8 +42,12 @@ namespace arrow { class Array; -/// \brief Base class for scalar values, representing a single value occupying -/// an array "slot" +/// \brief Base class for scalar values +/// +/// A Scalar represents a single value with a specific DataType. +/// Scalars are useful for passing single value inputs to compute functions, +/// or for representing individual array elements (with a non-trivial +/// wrapping cost, though). struct ARROW_EXPORT Scalar : public util::EqualityComparable { virtual ~Scalar() = default; @@ -82,6 +86,10 @@ struct ARROW_EXPORT Scalar : public util::EqualityComparable { : type(std::move(type)), is_valid(is_valid) {} }; +/// \defgroup concrete-scalar-classes Concrete Scalar subclasses +/// +/// @{ + /// \brief A scalar value for NullType. 
Never valid struct ARROW_EXPORT NullScalar : public Scalar { public: @@ -90,6 +98,8 @@ struct ARROW_EXPORT NullScalar : public Scalar { NullScalar() : Scalar{null(), false} {} }; +/// @} + namespace internal { struct ARROW_EXPORT PrimitiveScalarBase : public Scalar { @@ -119,6 +129,10 @@ struct ARROW_EXPORT PrimitiveScalar : public PrimitiveScalarBase { } // namespace internal +/// \addtogroup concrete-scalar-classes Concrete Scalar subclasses +/// +/// @{ + struct ARROW_EXPORT BooleanScalar : public internal::PrimitiveScalar { using Base = internal::PrimitiveScalar; using Base::Base; @@ -421,9 +435,18 @@ struct ARROW_EXPORT ExtensionScalar : public Scalar { using TypeClass = ExtensionType; }; +/// @} + +/// \defgroup scalar-factories Scalar factory functions +/// +/// @{ + +/// \brief Scalar factory for null scalars ARROW_EXPORT std::shared_ptr MakeNullScalar(std::shared_ptr type); +/// @} + namespace internal { inline Status CheckBufferLength(...) { return Status::OK(); } @@ -463,13 +486,22 @@ struct MakeScalarImpl { std::shared_ptr out_; }; +/// \addtogroup scalar-factories +/// +/// @{ + +/// \brief Scalar factory for non-null scalars template Result> MakeScalar(std::shared_ptr type, Value&& value) { return MakeScalarImpl{type, std::forward(value), NULLPTR}.Finish(); } -/// \brief type inferring scalar factory +/// \brief Type-inferring scalar factory for non-null scalars +/// +/// Construct a Scalar instance with a DataType determined by the input C++ type +/// (for example Int8Scalar for an int8_t input). +/// Only non-parametric primitive types and String are supported.
template ::type>, typename ScalarType = typename Traits::ScalarType, typename Enable = decltype(ScalarType(std::declval(), @@ -482,4 +514,6 @@ inline std::shared_ptr MakeScalar(std::string value) { return std::make_shared(std::move(value)); } +/// @} + } // namespace arrow diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst index 68323f15d4d..59d221012d3 100644 --- a/docs/source/cpp/api.rst +++ b/docs/source/cpp/api.rst @@ -26,6 +26,7 @@ API Reference api/memory api/datatype api/array + api/scalar api/builder api/table api/compute diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 6cc2baaa673..149dbb39e7f 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -23,11 +23,109 @@ Compute Functions ================= +The generic Compute API +======================= + .. TODO: describe API and how to invoke compute functions +Functions and function registry +------------------------------- + +Functions represent logical compute operations over inputs of possibly +varying types. Internally, a function is implemented by one or several +"kernels", depending on the concrete input types (for example, a function +adding values from two inputs can have different kernels depending on +whether the inputs are integral or floating-point). + +Functions are stored in a global :class:`FunctionRegistry` where +they can be looked up by name. + +Input shapes +------------ + +Computation inputs are represented as a general :class:`Datum` class, +which is a tagged union of several shapes of data such as :class:`Scalar`, +:class:`Array` and :class:`ChunkedArray`. Many compute functions support +both array (chunked or not) and scalar inputs, however some will mandate +either. For example, the ``fill_null`` function requires its second input +to be a scalar, while ``sort_indices`` requires its first and only input to +be an array. 
+ +Invoking functions +------------------ + +Compute functions can be invoked by name using +:func:`arrow::compute::CallFunction`:: + + std::shared_ptr<arrow::Array> numbers_array = ...; + std::shared_ptr<arrow::Scalar> increment = ...; + arrow::Datum incremented_datum; + + ARROW_ASSIGN_OR_RAISE(incremented_datum, + arrow::compute::CallFunction("add", {numbers_array, increment})); + std::shared_ptr<arrow::Array> incremented_array = std::move(incremented_datum).make_array(); + +(note this example uses implicit conversion from ``std::shared_ptr<Array>`` +to ``Datum``) + +Many compute functions are also available directly as concrete APIs, here +:func:`arrow::compute::Add`:: + + std::shared_ptr<arrow::Array> numbers_array = ...; + std::shared_ptr<arrow::Scalar> increment = ...; + arrow::Datum incremented_datum; + + ARROW_ASSIGN_OR_RAISE(incremented_datum, + arrow::compute::Add(numbers_array, increment)); + std::shared_ptr<arrow::Array> incremented_array = std::move(incremented_datum).make_array(); + +Some functions accept or require an options structure that determines the +exact semantics of the function:: + + MinMaxOptions options; + options.null_handling = MinMaxOptions::OUTPUT_NULL; + + std::shared_ptr<arrow::Array> array = ...; + arrow::Datum minmax_datum; + + ARROW_ASSIGN_OR_RAISE(minmax_datum, + arrow::compute::CallFunction("minmax", {array}, &options)); + + // Unpack struct scalar result (a two-field {"min", "max"} scalar) + const auto& minmax_scalar = \ + static_cast<const arrow::StructScalar&>(*minmax_datum.scalar()); + const auto min_value = minmax_scalar.value[0]; + const auto max_value = minmax_scalar.value[1]; + +.. seealso:: + :doc:`Compute API reference <api/compute>` + + Available functions =================== +Type categories +--------------- + +To avoid exhaustively listing supported types, the tables below use a number +of general type categories: + +* "Numeric": Integer types (Int8, etc.) and Floating-point types (Float32, + Float64, sometimes Float16). Some functions also accept Decimal128 input.
+ +* "Temporal": Date types (Date32, Date64), Time types (Time32, Time64), + Timestamp, Duration, Interval. + +* "Binary-like": Binary, LargeBinary, sometimes also FixedSizeBinary. + +* "String-like": String, LargeString. + +* "List-like": List, LargeList, sometimes also FixedSizeList. + +If you are unsure whether a function supports a concrete input type, we +recommend you try it out. Unsupported input types return a ``TypeError`` +:class:`Status`. + Aggregations ------------ @@ -53,15 +151,33 @@ Notes: Element-wise ("scalar") functions --------------------------------- +All element-wise functions accept both arrays and scalars as input. The +semantics for unary functions are as follows: + +* scalar inputs produce a scalar output +* array inputs produce an array output + +Binary functions have the following semantics (which is sometimes called +"broadcasting" in other systems such as NumPy): + +* ``(scalar, scalar)`` inputs produce a scalar output +* ``(array, array)`` inputs produce an array output (and both inputs must + be of the same length) +* ``(scalar, array)`` and ``(array, scalar)`` produce an array output. + The scalar input is handled as if it were an array of the same length N + as the other input, with the same value repeated N times. + Arithmetic functions ~~~~~~~~~~~~~~~~~~~~ -Those functions expect two inputs of the same type and apply a given binary -operation to each pair of elements gathered from the inputs. Each function -is also available in an overflow-checking variant, suffixed ``_checked``. +These functions expect two inputs of the same type and apply a given binary +operation to each pair of elements gathered from the inputs. If any of the +input elements in a pair is null, the corresponding output element is null. -If any of the input elements in a pair is null, the corresponding output -element is null. +The default variant of these functions does not detect overflow (the result +then typically wraps around).
Each function is also available in an +overflow-checking variant, suffixed ``_checked``, which returns +an ``Invalid`` :class:`Status` when overflow is detected. +--------------------------+------------+--------------------+---------------------+ | Function name | Arity | Input types | Output type | @@ -99,11 +215,15 @@ Logical functions ~~~~~~~~~~~~~~~~~~ The normal behaviour for these functions is to emit a null if any of the -inputs is null. +inputs is null (similar to the semantics of ``NaN`` in floating-point +computations). -Some of them are also available in a "`Kleene logic`_" variant (suffixed -``_kleene``) where null is taken to mean "undefined". For those variants -therefore: +Some of them are also available in a `Kleene logic`_ variant (suffixed +``_kleene``) where null is taken to mean "undefined". This is the +interpretation of null used in SQL systems as well as R and Julia, +for example. + +For the Kleene logic variants, therefore: * "true AND null", "null AND true" give "null" (the result is undefined) * "true OR null", "null OR true" give "true" @@ -145,7 +265,8 @@ String functions | utf8_upper | Unary | String-like | String-like | \(3) | +--------------------------+------------+--------------------+---------------------+---------+ -* \(1) Output is the physical length in bytes of each input element. +* \(1) Output is the physical length in bytes of each input element. Output + type is Int32 for String, Int64 for LargeString. * \(2) Each ASCII character in the input is converted to lowercase or uppercase. Non-ASCII characters are left untouched. 
@@ -156,23 +277,17 @@ String functions Containment tests ~~~~~~~~~~~~~~~~~ -+--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ -| Function name | Arity | Input types | Output type | Options class | -+==========================+============+==================================+=======================+============================================+ -| binary_contains_exact | Unary | String-like | Boolean (1) | :struct:`BinaryContainsExactOptions` | -+--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ -| isin | Unary | Binary- and String-like | Boolean (2) | :struct:`SetLookupOptions` | -+--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ -| isin | Unary | Null | Boolean (2) | :struct:`SetLookupOptions` | -+--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ -| isin | Unary | Boolean,Numeric, Temporal | Boolean (2) | :struct:`SetLookupOptions` | -+--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ -| match | Unary | Binary- and String-like | Int32 (3) | :struct:`SetLookupOptions` | -+--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ -| match | Unary | Null | Int32 (3) | :struct:`SetLookupOptions` | -+--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ -| match | Unary | Boolean,Numeric, Temporal | Int32 (3) | :struct:`SetLookupOptions` | 
-+--------------------------+------------+----------------------------------+-----------------------+--------------------------------------------+ ++--------------------------+------------+------------------------------------+---------------+----------------------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++==========================+============+====================================+===============+========================================+ +| binary_contains_exact | Unary | String-like | Boolean (1) | :struct:`BinaryContainsExactOptions` | ++--------------------------+------------+------------------------------------+---------------+----------------------------------------+ +| isin | Unary | Boolean, Null, Numeric, Temporal, | Boolean (2) | :struct:`SetLookupOptions` | +| | | Binary- and String-like | | | ++--------------------------+------------+------------------------------------+---------------+----------------------------------------+ +| match | Unary | Boolean, Null, Numeric, Temporal, | Int32 (3) | :struct:`SetLookupOptions` | +| | | Binary- and String-like | | | ++--------------------------+------------+------------------------------------+---------------+----------------------------------------+ * \(1) Output is true iff :member:`BinaryContainsExactOptions::pattern` is a substring of the corresponding input element. 
@@ -187,29 +302,38 @@ Containment tests Structural transforms ~~~~~~~~~~~~~~~~~~~~~ -+--------------------------+------------+--------------------+---------------------+---------+ -| Function name | Arity | Input types | Output type | Notes | -+==========================+============+====================+=====================+=========+ -| is_null | Unary | Any | Boolean | \(1) | -+--------------------------+------------+--------------------+---------------------+---------+ -| is_valid | Unary | Any | Boolean | \(2) | -+--------------------------+------------+--------------------+---------------------+---------+ -| list_value_lengths | Unary | List-like | Int32 or Int64 | \(3) | -+--------------------------+------------+--------------------+---------------------+---------+ +.. XXX (this category is a bit of a hodgepodge) + ++--------------------------+------------+---------------------------------------+---------------------+---------+ +| Function name | Arity | Input types | Output type | Notes | ++==========================+============+=======================================+=====================+=========+ +| fill_null | Binary | Boolean, Null, Numeric, Temporal | Input type | \(1) | ++--------------------------+------------+---------------------------------------+---------------------+---------+ +| is_null | Unary | Any | Boolean | \(2) | ++--------------------------+------------+---------------------------------------+---------------------+---------+ +| is_valid | Unary | Any | Boolean | \(3) | ++--------------------------+------------+---------------------------------------+---------------------+---------+ +| list_value_lengths | Unary | List-like | Int32 or Int64 | \(4) | ++--------------------------+------------+---------------------------------------+---------------------+---------+ + +* \(1) First input must be an array, second input a scalar of the same type.
+ Output is an array of the same type as the inputs, and with the same values + as the first input, except for nulls replaced with the second input value. -* \(1) Output is true iff the corresponding input element is non-null. +* \(2) Output is true iff the corresponding input element is null. -* \(2) Output is true iff the corresponding input element is null. +* \(3) Output is true iff the corresponding input element is non-null. -* \(3) Each output element is the length of the corresponding input element - (null if input is null). +* \(4) Each output element is the length of the corresponding input element + (null if input is null). Output type is Int32 for List, Int64 for LargeList. Conversions ~~~~~~~~~~~ A general conversion function named ``cast`` is provided which accepts a large number of input and output types. The type to cast to can be passed in a -:struct:`CastOptions` instance. +:struct:`CastOptions` instance. As an alternative, the same service is +provided by a concrete function :func:`~arrow::compute::Cast`. +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | Function name | Arity | Input types | Output type | Options class | @@ -286,49 +410,42 @@ null input value is converted into a null output value.
+-----------------------------+------------------------------------+---------+ | Input type | Output type | Notes | +=============================+====================================+=========+ -| Dictionary | Dictionary value type | | +| Dictionary | Dictionary value type | \(1) | +-----------------------------+------------------------------------+---------+ | Extension | Extension storage type | | +-----------------------------+------------------------------------+---------+ -| List-like | List-like | \(1) | +| List-like | List-like | \(2) | +-----------------------------+------------------------------------+---------+ | Null | Any | | +-----------------------------+------------------------------------+---------+ -* \(1) The list offsets are unchanged, the list values are cast from the +* \(1) The dictionary indices are unchanged, the dictionary values are + cast from the input value type to the output value type (if a conversion + is available). + +* \(2) The list offsets are unchanged, the list values are cast from the input value type to the output value type (if a conversion is available). -.. 
TODO: add C++ cast example - Array-wise ("vector") functions ------------------------------- Associative transforms ~~~~~~~~~~~~~~~~~~~~~~ -+--------------------------+------------+----------------------------+----------------------------+ -| Function name | Arity | Input types | Output type | -+==========================+============+============================+============================+ -| dictionary_encode | Unary | Binary- and String-like | Dictionary (1) | -+--------------------------+------------+----------------------------+----------------------------+ -| dictionary_encode | Unary | Boolean, Numeric, Temporal | Dictionary (1) | -+--------------------------+------------+----------------------------+----------------------------+ -| dictionary_encode | Unary | Null | Dictionary (1) | -+--------------------------+------------+----------------------------+----------------------------+ -| unique | Unary | Binary- and String-like | Input type (2) | -+--------------------------+------------+----------------------------+----------------------------+ -| unique | Unary | Boolean, Numeric, Temporal | Input type (2) | -+--------------------------+------------+----------------------------+----------------------------+ -| unique | Unary | Null | Input type (2) | -+--------------------------+------------+----------------------------+----------------------------+ -| value_counts | Unary | Binary- and String-like | Struct (3) | -+--------------------------+------------+----------------------------+----------------------------+ -| value_counts | Unary | Boolean, Numeric, Temporal | Struct (3) | -+--------------------------+------------+----------------------------+----------------------------+ -| value_counts | Unary | Null | Struct (3) | -+--------------------------+------------+----------------------------+----------------------------+ ++--------------------------+------------+------------------------------------+----------------------------+ +| Function name | Arity | 
Input types | Output type | +==========================+============+====================================+============================+ +| dictionary_encode | Unary | Boolean, Null, Numeric, | Dictionary (1) | +| | | Temporal, Binary- and String-like | | ++--------------------------+------------+------------------------------------+----------------------------+ +| unique | Unary | Boolean, Null, Numeric, | Input type (2) | +| | | Temporal, Binary- and String-like | | ++--------------------------+------------+------------------------------------+----------------------------+ +| value_counts | Unary | Boolean, Null, Numeric, | Struct (3) | +| | | Temporal, Binary- and String-like | | ++--------------------------+------------+------------------------------------+----------------------------+ * \(1) Output is ``Dictionary(Int32, input type)``.