diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index cead4ec4f61..4363fe5593c 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -786,6 +786,7 @@ SCALAR_EAGER_UNARY(Quarter, "quarter") SCALAR_EAGER_UNARY(Second, "second") SCALAR_EAGER_UNARY(Subsecond, "subsecond") SCALAR_EAGER_UNARY(USWeek, "us_week") +SCALAR_EAGER_UNARY(USYear, "us_year") SCALAR_EAGER_UNARY(Year, "year") Result AssumeTimezone(const Datum& arg, AssumeTimezoneOptions options, diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index c13b45b2798..3b3b0115c8e 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -1199,6 +1199,20 @@ ARROW_EXPORT Result DayOfYear(const Datum& values, ExecContext* ctx = NUL ARROW_EXPORT Result ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); +/// \brief USYear returns US epidemiological year number for each element of `values`. +/// First week of US epidemiological year has the majority (4 or more) of its +/// days in January. Last week of US epidemiological year has the year's last +/// Wednesday in it. US epidemiological week starts on Sunday. +/// +/// \param[in] values input to extract US epidemiological year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result USYear(const Datum& values, ExecContext* ctx = NULLPTR); + /// \brief ISOWeek returns ISO week of year number for each element of `values`. /// First ISO week has the majority (4 or more) of its days in January. /// ISO week starts on Monday. Year can have 52 or 53 weeks. 
diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 0ab74453610..63260c15984 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -166,6 +166,9 @@ class ScalarTemporalTest : public ::testing::Test { std::string day_of_week = "[3, 1, 6, 2, 2, 1, 0, 3, 4, 6, 0, 6, 5, 6, 0, 6, null]"; std::string day_of_year = "[1, 60, 1, 138, 1, 365, 364, 365, 1, 3, 4, 1, 365, 363, 364, 1, null]"; + std::string us_year = + "[1969, 2000, 1899, 2033, 2020, 2020, 2020, 2009, 2009, 2010, 2010, 2006, 2005, " + "2008, 2008, 2012, null]"; std::string iso_year = "[1970, 2000, 1898, 2033, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, " "2005, 2008, 2009, 2011, null]"; @@ -411,6 +414,7 @@ TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionAllTemporalTypes) { CheckScalarUnary("year_month_day", ArrayFromJSON(unit, sample), year_month_day); CheckScalarUnary("day_of_week", unit, sample, int64(), day_of_week); CheckScalarUnary("day_of_year", unit, sample, int64(), day_of_year); + CheckScalarUnary("us_year", unit, sample, int64(), us_year); CheckScalarUnary("iso_year", unit, sample, int64(), iso_year); CheckScalarUnary("iso_week", unit, sample, int64(), iso_week); CheckScalarUnary("us_week", unit, sample, int64(), us_week); @@ -477,6 +481,7 @@ TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionWithDifferentUnits) { CheckScalarUnary("day_of_year", unit, times_seconds_precision, int64(), day_of_year); ASSERT_RAISES(Invalid, IsDaylightSavings(ArrayFromJSON(unit, times_seconds_precision))); + CheckScalarUnary("us_year", unit, times_seconds_precision, int64(), us_year); CheckScalarUnary("iso_year", unit, times_seconds_precision, int64(), iso_year); CheckScalarUnary("iso_week", unit, times_seconds_precision, int64(), iso_week); CheckScalarUnary("us_week", unit, times_seconds_precision, int64(), us_week); @@ -505,6 +510,7 @@ 
TEST_F(ScalarTemporalTest, TestOutsideNanosecondRange) { {"year": 2262, "month": 4, "day": 13}])"); auto day_of_week = "[0, 6]"; auto day_of_year = "[263, 103]"; + auto us_year = "[1677, 2262]"; auto iso_year = "[1677, 2262]"; auto iso_week = "[38, 15]"; auto us_week = "[38, 16]"; @@ -528,6 +534,7 @@ TEST_F(ScalarTemporalTest, TestOutsideNanosecondRange) { CheckScalarUnary("year_month_day", ArrayFromJSON(unit, times), year_month_day); CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("us_year", unit, times, int64(), us_year); CheckScalarUnary("iso_year", unit, times, int64(), iso_year); CheckScalarUnary("iso_week", unit, times, int64(), iso_week); CheckScalarUnary("us_week", unit, times, int64(), us_week); @@ -575,6 +582,9 @@ TEST_F(ScalarTemporalTest, TestZoned1) { std::string is_dst = "[false, false, false, false, false, false, false, false, false, false, false, " "false, false, false, false, false, null]"; + auto us_year = + "[1969, 2000, 1898, 2033, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, 2005, " + "2008, 2008, 2011, null]"; auto iso_year = "[1970, 2000, 1898, 2033, 2020, 2020, 2019, 2009, 2009, 2009, 2009, 2005, 2005, " "2008, 2008, 2011, null]"; @@ -610,6 +620,7 @@ TEST_F(ScalarTemporalTest, TestZoned1) { CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); CheckScalarUnary("is_dst", unit, times, boolean(), is_dst); + CheckScalarUnary("us_year", unit, times, int64(), us_year); CheckScalarUnary("iso_year", unit, times, int64(), iso_year); CheckScalarUnary("iso_week", unit, times, int64(), iso_week); CheckScalarUnary("us_week", unit, times, int64(), us_week); @@ -653,6 +664,9 @@ TEST_F(ScalarTemporalTest, TestZoned2) { std::string is_dst = "[false, true, false, false, true, true, true, true, true, true, true, true, " "true, true, true, true, null]"; + auto us_year = + 
"[1969, 2000, 1899, 2033, 2020, 2020, 2020, 2009, 2009, 2010, 2010, 2006, 2005, " + "2008, 2008, 2012, null]"; auto iso_year = "[1970, 2000, 1898, 2033, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, 2005, " "2008, 2009, 2011, null]"; @@ -689,6 +703,7 @@ TEST_F(ScalarTemporalTest, TestZoned2) { CheckScalarUnary("day_of_week", unit, times_seconds_precision, int64(), day_of_week); CheckScalarUnary("day_of_year", unit, times_seconds_precision, int64(), day_of_year); CheckScalarUnary("is_dst", unit, times_seconds_precision, boolean(), is_dst); + CheckScalarUnary("us_year", unit, times_seconds_precision, int64(), us_year); CheckScalarUnary("iso_year", unit, times_seconds_precision, int64(), iso_year); CheckScalarUnary("iso_week", unit, times_seconds_precision, int64(), iso_week); CheckScalarUnary("us_week", unit, times_seconds_precision, int64(), us_week); @@ -721,6 +736,7 @@ TEST_F(ScalarTemporalTest, TestNonexistentTimezone) { ASSERT_RAISES(Invalid, DayOfWeek(timestamp_array)); ASSERT_RAISES(Invalid, DayOfYear(timestamp_array)); ASSERT_RAISES(Invalid, IsDaylightSavings(timestamp_array)); + ASSERT_RAISES(Invalid, USYear(timestamp_array)); ASSERT_RAISES(Invalid, ISOYear(timestamp_array)); ASSERT_RAISES(Invalid, Week(timestamp_array)); ASSERT_RAISES(Invalid, ISOCalendar(timestamp_array)); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index 8bf82cc037a..995470cfb1f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -429,6 +429,32 @@ struct ISOYear { Localizer localizer_; }; +// ---------------------------------------------------------------------- +// Extract US epidemiological year values from temporal types +// +// First week of US epidemiological year has the majority (4 or more) of its +// days in January. Last week of US epidemiological year has the year's last +// Wednesday in it. 
US epidemiological week starts on Sunday. + +template +struct USYear { + explicit USYear(const FunctionOptions* options, Localizer&& localizer) + : localizer_(std::move(localizer)) {} + + template + T Call(KernelContext*, Arg0 arg, Status*) const { + const auto t = floor(localizer_.template ConvertTimePoint(arg)); + auto y = year_month_day{t + days{3}}.year(); + auto start = localizer_.ConvertDays((y - years{1}) / dec / wed[last]) + (mon - thu); + if (t < start) { + --y; + } + return static_cast(static_cast(y)); + } + + Localizer localizer_; +}; + // ---------------------------------------------------------------------- // Extract week from temporal types // @@ -1351,6 +1377,16 @@ const FunctionDoc iso_year_doc{ "cannot be found in the timezone database."), {"values"}}; +const FunctionDoc us_year_doc{ + "Extract US epidemiological year number", + ("First week of US epidemiological year has the majority (4 or more) of\n" + "it's days in January. Last week of US epidemiological year has the\n" + "year's last Wednesday in it. 
US epidemiological week starts on Sunday.\n" + "Null values emit null.\n" + "An error is returned if the values have a defined timezone but it\n" + "cannot be found in the timezone database."), + {"values"}}; + const FunctionDoc iso_week_doc{ "Extract ISO week of year number", ("First ISO week has the majority (4 or more) of its days in January.\n" @@ -1557,6 +1593,12 @@ void RegisterScalarTemporalUnary(FunctionRegistry* registry) { &iso_year_doc); DCHECK_OK(registry->AddFunction(std::move(iso_year))); + auto us_year = + UnaryTemporalFactory::Make("us_year", int64(), + &us_year_doc); + DCHECK_OK(registry->AddFunction(std::move(us_year))); + static const auto default_iso_week_options = WeekOptions::ISODefaults(); auto iso_week = UnaryTemporalFactory::Make< diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 535d31b26c2..191f2658185 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1408,6 +1408,8 @@ For timestamps inputs with non-empty timezone, localized timestamp components wi +--------------------+------------+-------------------+---------------+----------------------------+-------+ | us_week | Unary | Temporal | Int64 | | \(4) | +--------------------+------------+-------------------+---------------+----------------------------+-------+ +| us_year | Unary | Temporal | Int64 | | \(4) | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ | week | Unary | Timestamp | Int64 | :struct:`WeekOptions` | \(5) | +--------------------+------------+-------------------+---------------+----------------------------+-------+ | year | Unary | Temporal | Int64 | | | diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index b6ba414c5bb..fd362bf6e68 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -404,6 +404,7 @@ Temporal Component Extraction second subsecond us_week + us_year week year 
year_month_day