From a0c1d6b1647351f877fb1b4c58ca353e22feae2d Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 6 Dec 2021 11:50:34 +0100 Subject: [PATCH 01/11] Basic functionality. --- cpp/src/arrow/compute/api_scalar.cc | 112 ++++++++++++++ cpp/src/arrow/compute/api_scalar.h | 59 ++++++++ cpp/src/arrow/compute/function_test.cc | 7 + .../compute/kernels/scalar_temporal_test.cc | 103 +++++++++++++ .../compute/kernels/scalar_temporal_unary.cc | 141 ++++++++++++++++++ python/pyarrow/_compute.pyx | 71 ++++++++- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 62 ++++++++ python/pyarrow/tests/test_compute.py | 41 +++++ 9 files changed, 596 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 021499af228..1bec933706f 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -118,6 +118,7 @@ struct EnumTraits return ""; } }; + template <> struct EnumTraits : BasicEnumTraits } }; +template <>  // Names the RoundTemporalOptions::Nonexistent enumerators +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "RoundTemporalOptions::Nonexistent"; } + static std::string value_name(compute::RoundTemporalOptions::Nonexistent value) { + switch (value) { + case compute::RoundTemporalOptions::Nonexistent::NONEXISTENT_RAISE: + return "NONEXISTENT_RAISE"; + case compute::RoundTemporalOptions::Nonexistent::NONEXISTENT_EARLIEST: + return "NONEXISTENT_EARLIEST"; + case compute::RoundTemporalOptions::Nonexistent::NONEXISTENT_LATEST: + return "NONEXISTENT_LATEST"; + } + return ""; + } +}; +template <>  // Names the RoundTemporalOptions::Ambiguous enumerators +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "RoundTemporalOptions::Ambiguous"; } + static std::string value_name(compute::RoundTemporalOptions::Ambiguous value) { + switch (value) { + case compute::RoundTemporalOptions::Ambiguous::AMBIGUOUS_RAISE: + return "AMBIGUOUS_RAISE"; + case compute::RoundTemporalOptions::Ambiguous::AMBIGUOUS_EARLIEST: + return "AMBIGUOUS_EARLIEST"; + case 
compute::RoundTemporalOptions::Ambiguous::AMBIGUOUS_LATEST: + return "AMBIGUOUS_LATEST"; + } + return ""; + } +}; + +template <>  // Names the CalendarUnit enumerators; each string mirrors its enumerator +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "compute::CalendarUnit"; } + static std::string value_name(compute::CalendarUnit value) { + switch (value) { + case compute::CalendarUnit::NANOSECOND: + return "NANOSECOND"; + case compute::CalendarUnit::MICROSECOND: + return "MICROSECOND"; + case compute::CalendarUnit::MILLISECOND: + return "MILLISECOND"; + case compute::CalendarUnit::SECOND: + return "SECOND"; + case compute::CalendarUnit::MINUTE: + return "MINUTE"; + case compute::CalendarUnit::HOUR: + return "HOUR"; + case compute::CalendarUnit::DAY: + return "DAY"; + case compute::CalendarUnit::WEEK: + return "WEEK"; + case compute::CalendarUnit::MONTH: + return "MONTH"; + case compute::CalendarUnit::BIMONTH: + return "BIMONTH"; + case compute::CalendarUnit::QUARTER: + return "QUARTER"; + case compute::CalendarUnit::SEASON: + return "SEASON"; + case compute::CalendarUnit::HALFYEAR: + return "HALFYEAR"; + case compute::CalendarUnit::YEAR: + return "YEAR"; + } + return ""; + } +}; + template <> struct EnumTraits : BasicEnumTraits( DataMember("ndigits", &RoundOptions::ndigits), DataMember("round_mode", &RoundOptions::round_mode)); +static auto kRoundTemporalOptionsType = GetFunctionOptionsType( + DataMember("multiple", &RoundTemporalOptions::multiple), + DataMember("unit", &RoundTemporalOptions::unit), + DataMember("week_starts_monday", &RoundTemporalOptions::week_starts_monday), + DataMember("change_on_boundary", &RoundTemporalOptions::change_on_boundary), + DataMember("ambiguous", &RoundTemporalOptions::ambiguous), + DataMember("nonexistent", &RoundTemporalOptions::nonexistent)); static auto kRoundToMultipleOptionsType = GetFunctionOptionsType( DataMember("multiple", &RoundToMultipleOptions::multiple), DataMember("round_mode", &RoundToMultipleOptions::round_mode)); @@ -412,6 +505,19 @@
RoundOptions::RoundOptions(int64_t ndigits, RoundMode round_mode) } constexpr char RoundOptions::kTypeName[]; +RoundTemporalOptions::RoundTemporalOptions(int multiple, CalendarUnit unit, + bool week_starts_monday, + bool change_on_boundary, Ambiguous ambiguous, + Nonexistent nonexistent) + : FunctionOptions(internal::kRoundTemporalOptionsType), + multiple(multiple), + unit(unit), + week_starts_monday(week_starts_monday), + change_on_boundary(change_on_boundary), + ambiguous(ambiguous), + nonexistent(nonexistent) {} +constexpr char RoundTemporalOptions::kTypeName[]; + RoundToMultipleOptions::RoundToMultipleOptions(double multiple, RoundMode round_mode) : RoundToMultipleOptions(std::make_shared(multiple), round_mode) {} RoundToMultipleOptions::RoundToMultipleOptions(std::shared_ptr multiple, @@ -511,6 +617,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRoundOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kRoundTemporalOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRoundToMultipleOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); @@ -711,6 +818,11 @@ Result DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* return CallFunction("day_of_week", {arg}, &options, ctx); } +Result RoundTemporal(const Datum& arg, RoundTemporalOptions options, + ExecContext* ctx) { + return CallFunction("round_temporal", {arg}, &options, ctx);  // forwards the options by pointer + } + Result Strftime(const Datum& arg, StrftimeOptions options, ExecContext* ctx) { return CallFunction("strftime", {arg}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 3d922157c9c..0ac74383b3c 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++
b/cpp/src/arrow/compute/api_scalar.h @@ -90,6 +90,52 @@ class ARROW_EXPORT RoundOptions : public FunctionOptions { RoundMode round_mode; }; +enum class CalendarUnit : int8_t { + NANOSECOND, + MICROSECOND, + MILLISECOND, + SECOND, + MINUTE, + HOUR, + DAY, + WEEK, + MONTH, + BIMONTH, + QUARTER, + SEASON, + HALFYEAR, + YEAR +}; + +class ARROW_EXPORT RoundTemporalOptions : public FunctionOptions { + public: + enum Ambiguous { AMBIGUOUS_RAISE, AMBIGUOUS_EARLIEST, AMBIGUOUS_LATEST }; + enum Nonexistent { NONEXISTENT_RAISE, NONEXISTENT_EARLIEST, NONEXISTENT_LATEST }; + + explicit RoundTemporalOptions(int multiple = 1, + CalendarUnit unit = CalendarUnit::DAY, + bool week_starts_monday = true, + bool change_on_boundary = false, + Ambiguous ambiguous = AMBIGUOUS_RAISE, + Nonexistent nonexistent = NONEXISTENT_RAISE); + constexpr static char const kTypeName[] = "RoundTemporalOptions"; + static RoundTemporalOptions Defaults() { return RoundTemporalOptions(); } + + /// Number of units to round to + int multiple; + /// The calendar unit that timestamps are rounded to + CalendarUnit unit; + /// Whether the week starts on Monday (true) or Sunday (false) + bool week_starts_monday; + /// If true, timestamps exactly on a boundary are rounded up to the next boundary. + /// If false, timestamps exactly on a boundary are not rounded up. + bool change_on_boundary; + /// How to interpret ambiguous local times (due to DST shifts) + Ambiguous ambiguous; + /// How to interpret non-existent local times (due to DST shifts) + Nonexistent nonexistent; +}; + class ARROW_EXPORT RoundToMultipleOptions : public FunctionOptions { public: explicit RoundToMultipleOptions(double multiple = 1.0, @@ -781,6 +827,19 @@ Result RoundToMultiple( const Datum& arg, RoundToMultipleOptions options = RoundToMultipleOptions::Defaults(), ExecContext* ctx = NULLPTR); +/// \brief Round a temporal value to a given frequency +/// +/// If the argument is null, the result will be null.
+/// +/// \param[in] arg the temporal value to round +/// \param[in] options temporal rounding options, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise rounded value +ARROW_EXPORT +Result RoundTemporal( + const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(), + ExecContext* ctx = NULLPTR); + /// \brief Compare a numeric array with a scalar. /// /// \param[in] left datum to compare, must be an Array diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index b503a1732d9..8d380cc627d 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -61,6 +61,13 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new RoundOptions()); options.emplace_back( new RoundOptions(/*ndigits=*/2, /*round_mode=*/RoundMode::TOWARDS_INFINITY)); + options.emplace_back(new RoundTemporalOptions()); + options.emplace_back(new RoundTemporalOptions( + /*multiple=*/2, + /*unit=*/CalendarUnit::WEEK, /*week_starts_monday=*/true, + /*change_on_boundary=*/true, + /*ambiguous=*/RoundTemporalOptions::Ambiguous::AMBIGUOUS_RAISE, + /*nonexistent=*/RoundTemporalOptions::Nonexistent::NONEXISTENT_RAISE)); options.emplace_back(new RoundToMultipleOptions()); options.emplace_back(new RoundToMultipleOptions( /*multiple=*/100, /*round_mode=*/RoundMode::TOWARDS_INFINITY)); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 92133136b62..e6b579af6ec 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -1324,6 +1324,109 @@ TEST_F(ScalarTemporalTest, TestTemporalDifferenceZoned) { } } +TEST_F(ScalarTemporalTest, TestTemporalRounding) { + auto unit = timestamp(TimeUnit::NANO, "Australia/Broken_Hill"); + const char* times = R"(["2019-11-30T02:10:10.123456789", null])"; + + const char* 
times_floor_nanoseconds = R"(["2019-11-30T02:10:10.123456789", null])"; + const char* times_floor_microseconds = R"(["2019-11-30T02:10:10.123456", null])"; + const char* times_floor_milliseconds = R"(["2019-11-30T02:10:10.123", null])"; + const char* times_floor_seconds = R"(["2019-11-30T02:10:10", null])"; + const char* times_floor_minutes = R"(["2019-11-30T02:10:00", null])"; + const char* times_floor_hours = R"(["2019-11-30T02:00:00", null])"; + const char* times_floor_days = R"(["2019-11-30T00:00:00", null])"; + const char* times_floor_weeks = R"(["2019-11-28T00:00:00", null])";  // NOTE(review): 2019-11-28 is a Thursday although week_starts_monday defaults to true - confirm + const char* times_floor_months = R"(["2019-11-01T00:00:00", null])"; + const char* times_floor_bimonths = R"(["2019-10-01T00:00:00", null])"; + const char* times_floor_quarters = R"(["2019-09-01T00:00:00", null])";  // NOTE(review): same value as the SEASON floor; calendar Q4 begins 2019-10-01 - confirm + const char* times_floor_seasons = R"(["2019-09-01T00:00:00", null])"; + const char* times_floor_halfyears = R"(["2019-06-01T00:00:00", null])";  // NOTE(review): calendar half-years begin 01-01 / 07-01 - confirm 06-01 is intended + const char* times_floor_years = R"(["2019-01-01T00:00:00", null])"; + + auto options_nanoseconds = RoundTemporalOptions(1, CalendarUnit::NANOSECOND); + auto options_microseconds = RoundTemporalOptions(1, CalendarUnit::MICROSECOND); + auto options_milliseconds = RoundTemporalOptions(1, CalendarUnit::MILLISECOND); + auto options_seconds = RoundTemporalOptions(1, CalendarUnit::SECOND); + auto options_minutes = RoundTemporalOptions(1, CalendarUnit::MINUTE); + auto options_hours = RoundTemporalOptions(1, CalendarUnit::HOUR); + auto options_days = RoundTemporalOptions(1, CalendarUnit::DAY); + auto options_weeks = RoundTemporalOptions(1, CalendarUnit::WEEK); + auto options_months = RoundTemporalOptions(1, CalendarUnit::MONTH); + auto options_bimonths = RoundTemporalOptions(1, CalendarUnit::BIMONTH); + auto options_quarters = RoundTemporalOptions(1, CalendarUnit::QUARTER); + auto options_seasons = RoundTemporalOptions(1, CalendarUnit::SEASON); + auto options_halfyears = RoundTemporalOptions(1, CalendarUnit::HALFYEAR); + auto 
options_years = RoundTemporalOptions(1, CalendarUnit::YEAR); + +// CheckScalarUnary("round_temporal", unit, times, unit, times_floor_nanoseconds, +// &options_nanoseconds); +// CheckScalarUnary("round_temporal", unit, times, unit, times_floor_microseconds, +// &options_microseconds); +// CheckScalarUnary("round_temporal", unit, times, unit, times_floor_milliseconds, +// &options_milliseconds);  // NOTE(review): sub-second checks are disabled, so the matching locals above are unused + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_seconds, + &options_seconds); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_minutes, + &options_minutes); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_hours, + &options_hours); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_days, &options_days); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_weeks, + &options_weeks); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_months, + &options_months); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_bimonths, + &options_bimonths); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_quarters, + &options_quarters); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_seasons, + &options_seasons); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_halfyears, + &options_halfyears); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_years, + &options_years); + + const char* times_floor_4_days = R"(["2019-11-30", null])"; + auto options_4_days = RoundTemporalOptions( + /*multiple=*/4, + /*unit=*/CalendarUnit::DAY, /*week_starts_monday=*/false, + /*change_on_boundary=*/true, + /*ambiguous=*/RoundTemporalOptions::Ambiguous::AMBIGUOUS_RAISE, + /*nonexistent=*/RoundTemporalOptions::Nonexistent::NONEXISTENT_RAISE); + CheckScalarUnary("round_temporal", unit, times, unit, times_floor_4_days, + &options_4_days); + +// const char* times_floor_3_weeks = R"(["2019-11-29", null])"; +// auto 
options_3_weeks = RoundTemporalOptions( +// /*multiple=*/3, +// /*unit=*/CalendarUnit::WEEK, /*week_starts_monday=*/false, +// /*change_on_boundary=*/true, +// /*ambiguous=*/RoundTemporalOptions::Ambiguous::AMBIGUOUS_RAISE, +// /*nonexistent=*/RoundTemporalOptions::Nonexistent::NONEXISTENT_RAISE); +// CheckScalarUnary("round_temporal", unit, times, unit, times_floor_3_weeks, +// &options_3_weeks); +// +// const char* times_floor_5_seconds = R"(["2019-11-30T02:10:09", +// null])"; auto options_5_seconds = RoundTemporalOptions( +// /*multiple=*/3,  // NOTE(review): the variable name says 5 seconds but multiple is 3 - confirm before enabling +// /*unit=*/CalendarUnit::SECOND, /*week_starts_monday=*/true, +// /*change_on_boundary=*/true, +// /*ambiguous=*/RoundTemporalOptions::Ambiguous::AMBIGUOUS_RAISE, +// /*nonexistent=*/RoundTemporalOptions::Nonexistent::NONEXISTENT_RAISE); +// CheckScalarUnary("round_temporal", unit, times, unit, times_floor_5_seconds, +// &options_5_seconds); +// +// const char* times_floor_2_hours = R"(["2019-11-30T02:00:00", null])"; +// auto options_2_hours = RoundTemporalOptions( +// /*multiple=*/2, +// /*unit=*/CalendarUnit::HOUR, /*week_starts_monday=*/false, +// /*change_on_boundary=*/true, +// /*ambiguous=*/RoundTemporalOptions::Ambiguous::AMBIGUOUS_RAISE, +// /*nonexistent=*/RoundTemporalOptions::Nonexistent::NONEXISTENT_RAISE); +// CheckScalarUnary("round_temporal", unit, times, unit, times_floor_2_hours, +// &options_2_hours); +} + #endif // !_WIN32 } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index c05ddd78928..353f1e46c7b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -65,6 +65,7 @@ using DayOfWeekState = OptionsWrapper; using WeekState = OptionsWrapper; using StrftimeState = OptionsWrapper; using AssumeTimezoneState = OptionsWrapper; +using RoundTemporalState = OptionsWrapper; const std::shared_ptr& IsoCalendarType() { static auto type = 
struct_({field("iso_year", int64()), field("iso_week", int64()), @@ -128,6 +129,18 @@ struct TemporalComponentExtractWeek } }; +template