From 4f199b9423641485c8e9eab0c4d509217a71e865 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 9 Jun 2021 20:09:35 +0200 Subject: [PATCH 01/10] First commit. --- .../arrow/compute/kernels/scalar_temporal.cc | 176 ++++++++++++-- .../compute/kernels/scalar_temporal_test.cc | 221 ++++++++++++------ docs/source/cpp/compute.rst | 3 +- 3 files changed, 315 insertions(+), 85 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc index f0257772d4a..e5525500892 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -35,7 +35,10 @@ namespace { using arrow_vendored::date::days; using arrow_vendored::date::floor; using arrow_vendored::date::hh_mm_ss; +using arrow_vendored::date::local_days; +using arrow_vendored::date::locate_zone; using arrow_vendored::date::sys_time; +using arrow_vendored::date::time_zone; using arrow_vendored::date::trunc; using arrow_vendored::date::weekday; using arrow_vendored::date::weeks; @@ -64,13 +67,12 @@ const std::string& GetInputTimezone(const ArrayData& array) { } template -Status TemporalComponentExtractCheckTimezone(const T& input) { +const std::string TemporalComponentExtractCheckTimezone(const T& input) { const auto& timezone = GetInputTimezone(input); if (!timezone.empty()) { - return Status::NotImplemented( - "Cannot extract components from timestamp with specific timezone: ", timezone); + return ""; } - return Status::OK(); + return timezone; } template @@ -78,8 +80,13 @@ struct TemporalComponentExtract { using OutValue = typename internal::GetOutputType::T; static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0])); - return ScalarUnaryNotNull::Exec(ctx, batch, out); + const std::string timezone = TemporalComponentExtractCheckTimezone(batch.values[0]); + if (timezone.empty()) { + return ScalarUnaryNotNull::Exec(ctx, batch, out); + } else { +// const time_zone* tz = locate_zone(timezone); + return ScalarUnaryNotNull::Exec(ctx, batch, out); + } } }; @@ -112,6 +119,12 @@ struct Year { return static_cast(static_cast( year_month_day(floor(sys_time(Duration{arg}))).year())); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + return static_cast(static_cast( + year_month_day(floor(tz->to_local(sys_time(Duration{arg})))) + .year())); + } }; // ---------------------------------------------------------------------- @@ -124,6 +137,12 @@ struct Month { return static_cast(static_cast( year_month_day(floor(sys_time(Duration{arg}))).month())); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + return static_cast(static_cast( + year_month_day(floor(tz->to_local(sys_time(Duration{arg})))) + .month())); + } }; // ---------------------------------------------------------------------- @@ -136,6 +155,12 @@ struct Day { return static_cast(static_cast( year_month_day(floor(sys_time(Duration{arg}))).day())); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + return static_cast(static_cast( + year_month_day(floor(tz->to_local(sys_time(Duration{arg})))) + .day())); + } }; // ---------------------------------------------------------------------- @@ -163,6 +188,14 @@ struct DayOfWeek { .iso_encoding(); return lookup_table[wd - 1]; } + template + T Call(KernelContext*, Arg0 arg, const time_zone* tz, Status*) const { + const auto wd = arrow_vendored::date::year_month_weekday( + floor(tz->to_local(sys_time(Duration{arg})))) + .weekday() + .iso_encoding(); + return lookup_table[wd - 1]; + } std::array lookup_table; }; @@ -177,6 +210,12 @@ struct DayOfYear { return static_cast( (t - sys_time(year_month_day(t).year() / jan / 0)).count()); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + const auto t = floor(tz->to_local(sys_time(Duration{arg}))); + auto start = local_days(year_month_day(t).year() / jan / 0); + return static_cast((t - start).count()); + } }; // ---------------------------------------------------------------------- @@ -197,6 +236,16 @@ struct ISOYear { } return static_cast(static_cast(y)); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + const auto t = floor(tz->to_local(sys_time(Duration{arg}))); + auto y = year_month_day{t + days{3}}.year(); + auto start = local_days((y - years{1}) / dec / thu[last]) + (mon - thu); + if (t < start) { + --y; + } + return static_cast(static_cast(y)); + } }; // ---------------------------------------------------------------------- @@ -219,6 +268,17 @@ struct ISOWeek { } return static_cast(trunc(t - start).count() + 1); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + const auto t = floor(tz->to_local(sys_time(Duration{arg}))); + auto y = year_month_day{t + days{3}}.year(); + auto start = local_days((y - years{1}) / dec / thu[last]) + (mon - thu); + if (t < start) { + --y; + start = local_days((y - years{1}) / dec / thu[last]) + (mon - thu); + } + return static_cast(trunc(local_days(t) - start).count() + 1); + } }; // ---------------------------------------------------------------------- @@ -231,6 +291,12 @@ struct Quarter { const auto ymd = year_month_day(floor(sys_time(Duration{arg}))); return static_cast((static_cast(ymd.month()) - 1) / 3 + 1); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + const auto ymd = + year_month_day(floor(tz->to_local(sys_time(Duration{arg})))); + return static_cast((static_cast(ymd.month()) - 1) / 3 + 1); + } }; // ---------------------------------------------------------------------- @@ -243,6 +309,11 @@ struct Hour { Duration t = Duration{arg}; return static_cast((t - floor(t)) / std::chrono::hours(1)); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + const auto t = tz->to_local(sys_time(Duration{arg})); + return static_cast((t - floor(t)) / std::chrono::hours(1)); + } }; // ---------------------------------------------------------------------- @@ -255,6 +326,11 @@ struct Minute { Duration t = Duration{arg}; return static_cast((t - floor(t)) / std::chrono::minutes(1)); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + const auto t = tz->to_local(sys_time(Duration{arg})); + return static_cast((t - floor(t)) / std::chrono::minutes(1)); + } }; // ---------------------------------------------------------------------- @@ -267,6 +343,11 @@ struct Second { Duration t = Duration{arg}; return static_cast((t - floor(t)) / std::chrono::seconds(1)); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + const auto t = tz->to_local(sys_time(Duration{arg})); + return static_cast((t - floor(t)) / std::chrono::seconds(1)); + } }; // ---------------------------------------------------------------------- @@ -280,6 +361,12 @@ struct Subsecond { return static_cast( (std::chrono::duration(t - floor(t)).count())); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + Duration t = Duration{arg}; + return static_cast( + (std::chrono::duration(t - floor(t)).count())); + } }; // ---------------------------------------------------------------------- @@ -293,6 +380,12 @@ struct Millisecond { return static_cast( ((t - floor(t)) / std::chrono::milliseconds(1)) % 1000); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + Duration t = Duration{arg}; + return static_cast( + ((t - floor(t)) / std::chrono::milliseconds(1)) % 1000); + } }; // ---------------------------------------------------------------------- @@ -306,6 +399,12 @@ struct Microsecond { return static_cast( ((t - floor(t)) / std::chrono::microseconds(1)) % 1000); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + Duration t = Duration{arg}; + return static_cast( + ((t - floor(t)) / std::chrono::microseconds(1)) % 1000); + } }; // ---------------------------------------------------------------------- @@ -319,6 +418,12 @@ struct Nanosecond { return static_cast( ((t - floor(t)) / std::chrono::nanoseconds(1)) % 1000); } + template + static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + Duration t = Duration{arg}; + return static_cast( + ((t - floor(t)) / std::chrono::nanoseconds(1)) % 1000); + } }; template @@ -336,20 +441,42 @@ inline std::vector get_iso_calendar(int64_t arg) { static_cast(weekday(ymd).iso_encoding())}; } +template +inline std::vector get_iso_calendar(int64_t arg, const time_zone* tz) { + const auto t = floor(tz->to_local(sys_time(Duration{arg}))); + const auto ymd = year_month_day(t); + auto y = year_month_day{t + days{3}}.year(); + auto start = local_days((y - years{1}) / dec / thu[last]) + (mon - thu); + if (t < start) { + --y; + start = local_days((y - years{1}) / dec / thu[last]) + (mon - thu); + } + return {static_cast(static_cast(y)), + static_cast(trunc(t - start).count() + 1), + static_cast(weekday(ymd).iso_encoding())}; +} + // ---------------------------------------------------------------------- // Extract ISO calendar values from timestamp template struct ISOCalendar { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in)); +// RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in)); + const std::string timezone = TemporalComponentExtractCheckTimezone(in); + // const time_zone* tz = locate_zone(timezone); + if (in.is_valid) { const std::shared_ptr iso_calendar_type = struct_({field("iso_year", int64()), field("iso_week", int64()), field("iso_day_of_week", int64())}); const auto& in_val = internal::UnboxScalar::Unbox(in); - const auto iso_calendar = get_iso_calendar(in_val); - + std::vector iso_calendar; + if (timezone.empty()) { + iso_calendar = get_iso_calendar(in_val); + } else { + iso_calendar = get_iso_calendar(in_val, locate_zone(timezone)); + } std::vector> values = { std::make_shared(iso_calendar[0]), std::make_shared(iso_calendar[1]), @@ -363,8 +490,10 @@ struct ISOCalendar { static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { using BuilderType = typename TypeTraits::BuilderType; +// RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in)); + const std::string timezone = TemporalComponentExtractCheckTimezone(in); + // const time_zone* tz = locate_zone(timezone); - RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in)); const std::shared_ptr iso_calendar_type = struct_({field("iso_year", int64()), field("iso_week", int64()), field("iso_day_of_week", int64())}); @@ -382,14 +511,25 @@ struct ISOCalendar { RETURN_NOT_OK(field_builders[i]->Reserve(1)); } auto visit_null = [&]() { return struct_builder->AppendNull(); }; - auto visit_value = [&](int64_t arg) { - const auto iso_calendar = get_iso_calendar(arg); - field_builders[0]->UnsafeAppend(iso_calendar[0]); - field_builders[1]->UnsafeAppend(iso_calendar[1]); - field_builders[2]->UnsafeAppend(iso_calendar[2]); - return struct_builder->Append(); - }; - RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); + if (timezone.empty()) { + auto visit_value = [&](int64_t arg) { + const auto iso_calendar = get_iso_calendar(arg); + field_builders[0]->UnsafeAppend(iso_calendar[0]); + field_builders[1]->UnsafeAppend(iso_calendar[1]); + field_builders[2]->UnsafeAppend(iso_calendar[2]); + return struct_builder->Append(); + }; + RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); + } else { + auto visit_value = [&](int64_t arg) { + const auto iso_calendar = get_iso_calendar(arg, locate_zone(timezone)); + field_builders[0]->UnsafeAppend(iso_calendar[0]); + field_builders[1]->UnsafeAppend(iso_calendar[1]); + field_builders[2]->UnsafeAppend(iso_calendar[2]); + return struct_builder->Append(); + }; + RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); + } std::shared_ptr out_array; RETURN_NOT_OK(struct_builder->Finish(&out_array)); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index f2e9c12a050..2319f75f2aa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -31,76 +31,73 @@ class ScalarTemporalTest : public ::testing::Test { const char* times = R"(["1970-01-01T00:00:59.123456789","2000-02-29T23:23:23.999999999", "1899-01-01T00:59:20.001001001","2033-05-18T03:33:20.000000000", - null, "2020-01-01T01:05:05.001", "2019-12-31T02:10:10.002", + "2020-01-01T01:05:05.001", "2019-12-31T02:10:10.002", "2019-12-30T03:15:15.003", "2009-12-31T04:20:20.004132", "2010-01-01T05:25:25.005321", "2010-01-03T06:30:30.006163", "2010-01-04T07:35:35", "2006-01-01T08:40:40", "2005-12-31T09:45:45", - "2008-12-28", "2008-12-29", "2012-01-01 01:02:03"])"; + "2008-12-28", "2008-12-29", "2012-01-01 01:02:03", null])"; const char* times_seconds_precision = R"(["1970-01-01T00:00:59","2000-02-29T23:23:23", "1899-01-01T00:59:20","2033-05-18T03:33:20", - null, "2020-01-01T01:05:05", "2019-12-31T02:10:10", + "2020-01-01T01:05:05", "2019-12-31T02:10:10", "2019-12-30T03:15:15", "2009-12-31T04:20:20", "2010-01-01T05:25:25", "2010-01-03T06:30:30", - "2010-01-04T07:35:35", "2006-01-01T08:40:40", "2005-12-31T09:45:45", - "2008-12-28", "2008-12-29", "2012-01-01 01:02:03"])"; + "2010-01-04T07:35:35", "2006-01-01T08:40:40", + "2005-12-31T09:45:45", "2008-12-28", "2008-12-29", + "2012-01-01 01:02:03", null])"; std::shared_ptr iso_calendar_type = struct_({field("iso_year", int64()), field("iso_week", int64()), field("iso_day_of_week", int64())}); - std::shared_ptr iso_calendar = - ArrayFromJSON(iso_calendar_type, - R"([{"iso_year": 1970, "iso_week": 1, "iso_day_of_week": 4}, - {"iso_year": 2000, "iso_week": 9, "iso_day_of_week": 2}, - {"iso_year": 1898, "iso_week": 52, "iso_day_of_week": 7}, - {"iso_year": 2033, "iso_week": 20, "iso_day_of_week": 3}, - null, - {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 3}, - {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 2}, - {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 1}, - {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 4}, - {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 5}, - {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 7}, - {"iso_year": 2010, "iso_week": 1, "iso_day_of_week": 1}, - {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 7}, - {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 6}, - {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 7}, - {"iso_year": 2009, "iso_week": 1, "iso_day_of_week": 1}, - {"iso_year": 2011, "iso_week": 52, "iso_day_of_week": 7}])"); + std::shared_ptr iso_calendar = ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "iso_day_of_week": 4}, + {"iso_year": 2000, "iso_week": 9, "iso_day_of_week": 2}, + {"iso_year": 1898, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "iso_day_of_week": 3}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 3}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 2}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 4}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 5}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 7}, + {"iso_year": 2010, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2009, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2011, "iso_week": 52, "iso_day_of_week": 7}, null])"); std::string year = - "[1970, 2000, 1899, 2033, null, 2020, 2019, 2019, 2009, 2010, 2010, 2010, 2006, " - "2005, 2008, 2008, 2012]"; - std::string month = "[1, 2, 1, 5, null, 1, 12, 12, 12, 1, 1, 1, 1, 12, 12, 12, 1]"; - std::string day = "[1, 29, 1, 18, null, 1, 31, 30, 31, 1, 3, 4, 1, 31, 28, 29, 1]"; - std::string day_of_week = "[3, 1, 6, 2, null, 2, 1, 0, 3, 4, 6, 0, 6, 5, 6, 0, 6]"; + "[1970, 2000, 1899, 2033, 2020, 2019, 2019, 2009, 2010, 2010, 2010, 2006, " + "2005, 2008, 2008, 2012, null]"; + std::string month = "[1, 2, 1, 5, 1, 12, 12, 12, 1, 1, 1, 1, 12, 12, 12, 1, null]"; + std::string day = "[1, 29, 1, 18, 1, 31, 30, 31, 1, 3, 4, 1, 31, 28, 29, 1, null]"; + std::string day_of_week = "[3, 1, 6, 2, 2, 1, 0, 3, 4, 6, 0, 6, 5, 6, 0, 6, null]"; std::string day_of_year = - "[1, 60, 1, 138, null, 1, 365, 364, 365, 1, 3, 4, 1, 365, 363, 364, 1]"; + "[1, 60, 1, 138, 1, 365, 364, 365, 1, 3, 4, 1, 365, 363, 364, 1, null]"; std::string iso_year = - "[1970, 2000, 1898, 2033, null, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, " - "2005, 2008, 2009, 2011]"; + "[1970, 2000, 1898, 2033, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, " + "2005, 2008, 2009, 2011, null]"; std::string iso_week = - "[1, 9, 52, 20, null, 1, 1, 1, 53, 53, 53, 1, 52, 52, 52, 1, 52]"; + "[1, 9, 52, 20, 1, 1, 1, 53, 53, 53, 1, 52, 52, 52, 1, 52, null]"; - std::string quarter = "[1, 1, 1, 2, null, 1, 4, 4, 4, 1, 1, 1, 1, 4, 4, 4, 1]"; - std::string hour = "[0, 23, 0, 3, null, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 1]"; + std::string quarter = "[1, 1, 1, 2, 1, 4, 4, 4, 1, 1, 1, 1, 4, 4, 4, 1, null]"; + std::string hour = "[0, 23, 0, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 1, null]"; std::string minute = - "[0, 23, 59, 33, null, 5, 10, 15, 20, 25, 30, 35, 40, 45, 0, 0, 2]"; + "[0, 23, 59, 33, 5, 10, 15, 20, 25, 30, 35, 40, 45, 0, 0, 2, null]"; std::string second = - "[59, 23, 20, 20, null, 5, 10, 15, 20, 25, 30, 35, 40, 45, 0, 0, 3]"; - std::string millisecond = "[123, 999, 1, 0, null, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0]"; + "[59, 23, 20, 20, 5, 10, 15, 20, 25, 30, 35, 40, 45, 0, 0, 3, null]"; + std::string millisecond = "[123, 999, 1, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, null]"; std::string microsecond = - "[456, 999, 1, 0, null, 0, 0, 0, 132, 321, 163, 0, 0, 0, 0, 0, 0]"; - std::string nanosecond = "[789, 999, 1, 0, null, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"; + "[456, 999, 1, 0, 0, 0, 0, 132, 321, 163, 0, 0, 0, 0, 0, 0, null]"; + std::string nanosecond = "[789, 999, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, null]"; std::string subsecond = - "[0.123456789, 0.999999999, 0.001001001, 0, null, 0.001, 0.002, 0.003, 0.004132, " - "0.005321, 0.006163, 0, 0, 0, 0, 0, 0]"; - std::string zeros = "[0, 0, 0, 0, null, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"; + "[0.123456789, 0.999999999, 0.001001001, 0, 0.001, 0.002, 0.003, 0.004132, " + "0.005321, 0.006163, 0, 0, 0, 0, 0, 0, null]"; + std::string zeros = "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, null]"; }; namespace compute { TEST_F(ScalarTemporalTest, TestTemporalComponentExtraction) { - auto unit = timestamp(TimeUnit::NANO); - CheckScalarUnary("year", unit, times, int64(), year); CheckScalarUnary("month", unit, times, int64(), month); CheckScalarUnary("day", unit, times, int64(), day); @@ -142,29 +139,121 @@ TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionWithDifferentUnits) { } } -TEST_F(ScalarTemporalTest, TestZonedTemporalComponentExtraction) { - std::string timezone = "Asia/Kolkata"; +TEST(ScalarTemporalTest, TestZoned1) { + auto unit = timestamp(TimeUnit::NANO, "Pacific/Marquesas"); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + auto year = + "[1969, 2000, 1898, 2033, 2019, 2019, 2019, 2009, 2009, 2010, 2010, 2005, 2005, " + "2008, 2008, 2011, null]"; + auto month = "[12, 2, 12, 5, 12, 12, 12, 12, 12, 1, 1, 12, 12, 12, 12, 12, null]"; + auto day = "[31, 29, 31, 17, 31, 30, 29, 30, 31, 2, 3, 31, 31, 27, 28, 31, null]"; + auto day_of_week = "[2, 1, 5, 1, 1, 0, 6, 2, 3, 5, 6, 5, 5, 5, 6, 5, null]"; + auto day_of_year = + "[365, 60, 365, 137, 365, 364, 363, 364, 365, 2, 3, 365, 365, 362, 363, 365, null]"; + auto iso_year = + "[1970, 2000, 1898, 2033, 2020, 2020, 2019, 2009, 2009, 2009, 2009, 2005, 2005, " + "2008, 2008, 2011, null]"; + auto iso_week = "[1, 9, 52, 20, 1, 1, 52, 53, 53, 53, 53, 52, 52, 52, 52, 52, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "iso_day_of_week": 3}, + {"iso_year": 2000, "iso_week": 9, "iso_day_of_week": 2}, + {"iso_year": 1898, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2033, "iso_week": 20, "iso_day_of_week": 2}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 2}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2019, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 3}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 4}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 6}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 7}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2011, "iso_week": 52, "iso_day_of_week": 6}, null])"); + auto quarter = "[4, 1, 4, 2, 4, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 4, null]"; + auto hour = "[14, 13, 15, 18, 15, 16, 17, 18, 19, 21, 22, 23, 0, 14, 14, 15, null]"; + auto minute = "[30, 53, 41, 3, 35, 40, 45, 50, 55, 0, 5, 10, 15, 30, 30, 32, null]"; + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, float64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), millisecond); + CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, float64(), subsecond); +} + +TEST(ScalarTemporalTest, TestZoned2) { for (auto u : internal::AllTimeUnits()) { - auto unit = timestamp(u, timezone); - auto timestamps = ArrayFromJSON(unit, times_seconds_precision); - - ASSERT_RAISES(NotImplemented, Year(timestamps)); - ASSERT_RAISES(NotImplemented, Month(timestamps)); - ASSERT_RAISES(NotImplemented, Day(timestamps)); - ASSERT_RAISES(NotImplemented, DayOfWeek(timestamps)); - ASSERT_RAISES(NotImplemented, DayOfYear(timestamps)); - ASSERT_RAISES(NotImplemented, ISOYear(timestamps)); - ASSERT_RAISES(NotImplemented, ISOWeek(timestamps)); - ASSERT_RAISES(NotImplemented, ISOCalendar(timestamps)); - ASSERT_RAISES(NotImplemented, Quarter(timestamps)); - ASSERT_RAISES(NotImplemented, Hour(timestamps)); - ASSERT_RAISES(NotImplemented, Minute(timestamps)); - ASSERT_RAISES(NotImplemented, Second(timestamps)); - ASSERT_RAISES(NotImplemented, Millisecond(timestamps)); - ASSERT_RAISES(NotImplemented, Microsecond(timestamps)); - ASSERT_RAISES(NotImplemented, Nanosecond(timestamps)); - ASSERT_RAISES(NotImplemented, Subsecond(timestamps)); + auto unit = timestamp(u, "Australia/Broken_Hill"); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + auto year = + "[1970, 2000, 1899, 2033, 2020, 2019, 2019, 2009, 2010, 2010, 2010, 2006, 2005, " + "2008, 2008, 2012, null]"; + auto month = "[1, 3, 1, 5, 1, 12, 12, 12, 1, 1, 1, 1, 12, 12, 12, 1, null]"; + auto day = "[1, 1, 1, 18, 1, 31, 30, 31, 1, 3, 4, 1, 31, 28, 29, 1, null]"; + auto day_of_week = "[3, 2, 6, 2, 2, 1, 0, 3, 4, 6, 0, 6, 5, 6, 0, 6, null]"; + auto day_of_year = + "[1, 61, 1, 138, 1, 365, 364, 365, 1, 3, 4, 1, 365, 363, 364, 1, null]"; + auto iso_year = + "[1970, 2000, 1898, 2033, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, 2005, " + "2008, 2009, 2011, null]"; + auto iso_week = "[1, 9, 52, 20, 1, 1, 1, 53, 53, 53, 1, 52, 52, 52, 1, 52, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "iso_day_of_week": 4}, + {"iso_year": 2000, "iso_week": 9, "iso_day_of_week": 3}, + {"iso_year": 1898, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "iso_day_of_week": 3}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 3}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 2}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 4}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 5}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 7}, + {"iso_year": 2010, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2009, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2011, "iso_week": 52, "iso_day_of_week": 7}, null])"); + auto quarter = "[1, 1, 1, 2, 1, 4, 4, 4, 1, 1, 1, 1, 4, 4, 4, 1, null]"; + // TODO: pandas expects [9, 9, 10, ... + auto hour = "[9, 9, 9, 13, 11, 12, 13, 14, 15, 17, 18, 19, 20, 10, 10, 11, null]"; + // TODO: pandas expects [30, 53, 25, ... + auto minute = "[30, 53, 59, 3, 35, 40, 45, 50, 55, 0, 5, 10, 15, 30, 30, 32, null]"; + + CheckScalarUnary("year", unit, times_seconds_precision, int64(), year); + CheckScalarUnary("month", unit, times_seconds_precision, int64(), month); + CheckScalarUnary("day", unit, times_seconds_precision, int64(), day); + CheckScalarUnary("day_of_week", unit, times_seconds_precision, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times_seconds_precision, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times_seconds_precision, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times_seconds_precision, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times_seconds_precision), iso_calendar); + CheckScalarUnary("quarter", unit, times_seconds_precision, int64(), quarter); + CheckScalarUnary("hour", unit, times_seconds_precision, int64(), hour); + CheckScalarUnary("minute", unit, times_seconds_precision, int64(), minute); + CheckScalarUnary("second", unit, times_seconds_precision, float64(), second); + CheckScalarUnary("millisecond", unit, times_seconds_precision, int64(), millisecond); + CheckScalarUnary("microsecond", unit, times_seconds_precision, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times_seconds_precision, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times_seconds_precision, float64(), subsecond); } } diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index d429b6cfecd..2dbfdbbfb6d 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1083,7 +1083,8 @@ Temporal component extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ These functions extract datetime components (year, month, day, etc) from timestamp type. -Note: this is currently not supported for timestamps with timezone information. +Note: supports timestamps with timezone information and will return localized timestamp +components. +--------------------+------------+-------------------+---------------+----------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | From d69c35ee26b52e55d850fe4ceb907c1fc7969cb8 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 10 Jun 2021 16:54:48 +0200 Subject: [PATCH 02/10] Adding python tests. --- .../arrow/compute/kernels/scalar_temporal_test.cc | 5 ++++- python/pyarrow/tests/test_compute.py | 13 ++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 2319f75f2aa..7b6fbd65c66 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -189,7 +189,7 @@ TEST(ScalarTemporalTest, TestZoned1) { CheckScalarUnary("quarter", unit, times, int64(), quarter); CheckScalarUnary("hour", unit, times, int64(), hour); CheckScalarUnary("minute", unit, times, int64(), minute); - CheckScalarUnary("second", unit, times, float64(), second); + CheckScalarUnary("second", unit, times, int64(), second); CheckScalarUnary("millisecond", unit, times, int64(), millisecond); CheckScalarUnary("microsecond", unit, times, int64(), microsecond); CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); @@ -305,5 +305,8 @@ TEST_F(ScalarTemporalTest, DayOfWeek) { DayOfWeek(timestamps, DayOfWeekOptions(/*one_based_numbering=*/false, /*week_start=*/8))); } +// TODO: Also, it would maybe be good to add a test for a timestamp that doesn't fit into +// the nanosecond range? I meant a date that falls outside the date range of 1677-09-21 - +// 2262-04-11 (the range that ns resolution can cover). } // namespace compute } // namespace arrow diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 60a2f60f942..d3fea05a2a3 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1433,7 +1433,18 @@ def test_extract_datetime_components(): "2008-12-29", "2012-01-01 01:02:03"] - _check_datetime_components(timestamps) + timezones = ["US/Central", "Pacific/Marquesas", "Asia/Kolkata", + "Etc/GMT-4", "Etc/GMT+4", "Pacific/Marquesas", + "Australia/Broken_Hill"] + + # Test timezone naive timestamp array + ts = pd.to_datetime(timestamps) + _check_datetime_components(ts) + + # Test timezone aware timestamp array + for timezone in timezones: + ts = pd.to_datetime(timestamps).tz_localize("UTC").tz_convert(timezone) + _check_datetime_components(ts) def test_count(): From be3d70d2fc529e992086e029d254872873f83c6e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 14 Jun 2021 19:11:27 +0200 Subject: [PATCH 03/10] Update python/pyarrow/tests/test_compute.py Co-authored-by: Joris Van den Bossche --- python/pyarrow/tests/test_compute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index d3fea05a2a3..fe08dc52b91 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1443,7 +1443,7 @@ def test_extract_datetime_components(): # Test timezone aware timestamp array for timezone in timezones: - ts = pd.to_datetime(timestamps).tz_localize("UTC").tz_convert(timezone) + ts = pd.to_datetime(timestamps).tz_localize(timezone) _check_datetime_components(ts) From 4b66070aea179acc8f081a9557a7d468cb33c903 Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 14 Jun 2021 19:18:30 +0200 Subject: [PATCH 04/10] Review feedback. --- .../compute/kernels/scalar_temporal_test.cc | 50 +++++++++++++++++-- python/pyarrow/tests/test_compute.py | 23 +++++---- 2 files changed, 60 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 7b6fbd65c66..908533f8a29 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -305,8 +305,52 @@ TEST_F(ScalarTemporalTest, DayOfWeek) { DayOfWeek(timestamps, DayOfWeekOptions(/*one_based_numbering=*/false, /*week_start=*/8))); } -// TODO: Also, it would maybe be good to add a test for a timestamp that doesn't fit into -// the nanosecond range? I meant a date that falls outside the date range of 1677-09-21 - -// 2262-04-11 (the range that ns resolution can cover). + +TEST(ScalarTemporalTest, TestOverflow) { + const char* times = + R"(["1677-09-20T00:00:59.123456", "2262-04-13T23:23:23.999999"])"; + + auto unit = timestamp(TimeUnit::MICRO); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + + auto year = "[1677, 2262]"; + auto month = "[9, 4]"; + auto day = "[20, 13]"; + auto day_of_week = "[0, 6]"; + auto day_of_year = "[263, 103]"; + auto iso_year = "[1677, 2262]"; + auto iso_week = "[38, 15]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1677, "iso_week": 38, "iso_day_of_week": 1}, + {"iso_year": 2262, "iso_week": 15, "iso_day_of_week": 7}])"); + auto quarter = "[3, 2]"; + auto hour = "[0, 23]"; + auto minute = "[0, 23]"; + auto second = "[59, 23]"; + auto millisecond = "[123, 999]"; + auto microsecond = "[456, 999]"; + auto nanosecond = "[0, 0]"; + auto subsecond = "[0.123456, 0.999999]"; + + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, int64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), millisecond); + CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, float64(), subsecond); +} } // namespace compute } // namespace arrow diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index fe08dc52b91..cc2a89f5dc5 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1363,13 +1363,18 @@ def test_strptime(): def _check_datetime_components(timestamps, timezone=None): + import pandas as pd from pyarrow.vendored.version import Version - ts = pd.to_datetime(timestamps).to_series() + if timezone: + ts = pd.to_datetime(timestamps).tz_localize(timezone).to_series() + else: + ts = pd.to_datetime(timestamps).to_series() + tsa = pa.array(ts) subseconds = ((ts.dt.microsecond * 10**3 + - ts.dt.nanosecond) * 10**-9).round(9) + ts.dt.nanosecond) * 10**-9).round(9) iso_calendar_fields = [ pa.field('iso_year', pa.int64()), pa.field('iso_week', pa.int64()), @@ -1417,6 +1422,8 @@ def _check_datetime_components(timestamps, timezone=None): @pytest.mark.pandas def test_extract_datetime_components(): + # TODO: see https://github.com/pandas-dev/pandas/issues/41834 + # "1899-01-01T00:59:20.001001001" timestamps = ["1970-01-01T00:00:59.123456789", "2000-02-29T23:23:23.999999999", "2033-05-18T03:33:20.000000000", @@ -1432,19 +1439,15 @@ def test_extract_datetime_components(): "2008-12-28", "2008-12-29", "2012-01-01 01:02:03"] - - timezones = ["US/Central", "Pacific/Marquesas", "Asia/Kolkata", - "Etc/GMT-4", "Etc/GMT+4", "Pacific/Marquesas", - "Australia/Broken_Hill"] + timezones = ["UTC", "US/Central", "Pacific/Marquesas", "Asia/Kolkata", + "Etc/GMT-4", "Etc/GMT+4", "Australia/Broken_Hill"] # Test timezone naive timestamp array - ts = pd.to_datetime(timestamps) - _check_datetime_components(ts) + _check_datetime_components(timestamps) # Test timezone aware timestamp array for timezone in timezones: - ts = pd.to_datetime(timestamps).tz_localize(timezone) - _check_datetime_components(ts) + _check_datetime_components(timestamps, timezone) def test_count(): From eeb0d5233510baf02873d172c9abb01a93991f8e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 15 Jun 2021 16:03:24 +0200 Subject: [PATCH 05/10] Update cpp/src/arrow/compute/kernels/scalar_temporal_test.cc Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/compute/kernels/scalar_temporal_test.cc | 5 ++--- python/pyarrow/tests/test_compute.py | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 908533f8a29..6a9f3bfdc7e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -306,9 +306,8 @@ TEST_F(ScalarTemporalTest, DayOfWeek) { /*week_start=*/8))); } -TEST(ScalarTemporalTest, TestOverflow) { - const char* times = - R"(["1677-09-20T00:00:59.123456", "2262-04-13T23:23:23.999999"])"; +TEST(ScalarTemporalTest, TestOutsideNanosecondRange) { + const char* times = R"(["1677-09-20T00:00:59.123456", "2262-04-13T23:23:23.999999"])"; auto unit = timestamp(TimeUnit::MICRO); auto iso_calendar_type = diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index cc2a89f5dc5..4d70ab73e0b 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1422,8 +1422,6 @@ def _check_datetime_components(timestamps, timezone=None): @pytest.mark.pandas def test_extract_datetime_components(): - # TODO: see https://github.com/pandas-dev/pandas/issues/41834 - # "1899-01-01T00:59:20.001001001" timestamps = ["1970-01-01T00:00:59.123456789", "2000-02-29T23:23:23.999999999", "2033-05-18T03:33:20.000000000", From 44909d74723e2bd13d5e1ed5ca8558194b0ff5e2 Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 21 Jun 2021 13:56:29 +0200 Subject: [PATCH 06/10] Disabling TZ tests on windows. --- .../compute/kernels/scalar_temporal_test.cc | 94 ++++++++++--------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 6a9f3bfdc7e..0a553227b4f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -139,6 +139,53 @@ TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionWithDifferentUnits) { } } +TEST(ScalarTemporalTest, TestOutsideNanosecondRange) { + const char* times = R"(["1677-09-20T00:00:59.123456", "2262-04-13T23:23:23.999999"])"; + + auto unit = timestamp(TimeUnit::MICRO); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + + auto year = "[1677, 2262]"; + auto month = "[9, 4]"; + auto day = "[20, 13]"; + auto day_of_week = "[0, 6]"; + auto day_of_year = "[263, 103]"; + auto iso_year = "[1677, 2262]"; + auto iso_week = "[38, 15]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1677, "iso_week": 38, "iso_day_of_week": 1}, + {"iso_year": 2262, "iso_week": 15, "iso_day_of_week": 7}])"); + auto quarter = "[3, 2]"; + auto hour = "[0, 23]"; + auto minute = "[0, 23]"; + auto second = "[59, 23]"; + auto millisecond = "[123, 999]"; + auto microsecond = "[456, 999]"; + auto nanosecond = "[0, 0]"; + auto subsecond = "[0.123456, 0.999999]"; + + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, int64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), millisecond); + CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, float64(), subsecond); +} + +#ifndef _WIN32 TEST(ScalarTemporalTest, TestZoned1) { auto unit = timestamp(TimeUnit::NANO, "Pacific/Marquesas"); auto iso_calendar_type = @@ -305,51 +352,6 @@ TEST_F(ScalarTemporalTest, DayOfWeek) { DayOfWeek(timestamps, DayOfWeekOptions(/*one_based_numbering=*/false, /*week_start=*/8))); } - -TEST(ScalarTemporalTest, TestOutsideNanosecondRange) { - const char* times = R"(["1677-09-20T00:00:59.123456", "2262-04-13T23:23:23.999999"])"; - - auto unit = timestamp(TimeUnit::MICRO); - auto iso_calendar_type = - struct_({field("iso_year", int64()), field("iso_week", int64()), - field("iso_day_of_week", int64())}); - - auto year = "[1677, 2262]"; - auto month = "[9, 4]"; - auto day = "[20, 13]"; - auto day_of_week = "[0, 6]"; - auto day_of_year = "[263, 103]"; - auto iso_year = "[1677, 2262]"; - auto iso_week = "[38, 15]"; - auto iso_calendar = - ArrayFromJSON(iso_calendar_type, - R"([{"iso_year": 1677, "iso_week": 38, "iso_day_of_week": 1}, - {"iso_year": 2262, "iso_week": 15, "iso_day_of_week": 7}])"); - auto quarter = "[3, 2]"; - auto hour = "[0, 23]"; - auto minute = "[0, 23]"; - auto second = "[59, 23]"; - auto millisecond = "[123, 999]"; - auto microsecond = "[456, 999]"; - auto nanosecond = "[0, 0]"; - auto subsecond = "[0.123456, 0.999999]"; - - CheckScalarUnary("year", unit, times, int64(), year); - CheckScalarUnary("month", unit, times, int64(), month); - CheckScalarUnary("day", unit, times, int64(), day); - CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); - CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); - CheckScalarUnary("iso_year", unit, times, int64(), iso_year); - CheckScalarUnary("iso_week", unit, times, int64(), iso_week); - CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); - CheckScalarUnary("quarter", unit, times, int64(), quarter); - CheckScalarUnary("hour", unit, times, int64(), hour); - CheckScalarUnary("minute", unit, times, int64(), minute); - CheckScalarUnary("second", unit, times, int64(), second); - CheckScalarUnary("millisecond", unit, times, int64(), millisecond); - CheckScalarUnary("microsecond", unit, times, int64(), microsecond); - CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); - CheckScalarUnary("subsecond", unit, times, float64(), subsecond); -} +#endif } // namespace compute } // namespace arrow From a39c391fa0a277a6980c2cd9e81d1817b86b37db Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 23 Jun 2021 23:58:53 +0200 Subject: [PATCH 07/10] Post rebase fixes. --- .../arrow/compute/kernels/scalar_temporal.cc | 319 ++++++++++-------- .../compute/kernels/scalar_temporal_test.cc | 36 +- python/pyarrow/tests/test_compute.py | 29 +- r/configure.win | 2 +- r/tests/testthat/test-dplyr-lubridate.R | 22 +- 5 files changed, 219 insertions(+), 189 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc index e5525500892..928b589786a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -66,26 +66,28 @@ const std::string& GetInputTimezone(const ArrayData& array) { return checked_cast(*array.type).timezone(); } -template -const std::string TemporalComponentExtractCheckTimezone(const T& input) { - const auto& timezone = GetInputTimezone(input); - if (!timezone.empty()) { - return ""; +template +struct TemporalComponentExtract { + using OutValue = typename internal::GetOutputType::T; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return ScalarUnaryNotNull::Exec(ctx, batch, out); } - return timezone; -} +}; template -struct TemporalComponentExtract { +struct TemporalComponentExtractZoned { using OutValue = typename internal::GetOutputType::T; static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const std::string timezone = TemporalComponentExtractCheckTimezone(batch.values[0]); + const auto& timezone = GetInputTimezone(batch.values[0]); if (timezone.empty()) { - return ScalarUnaryNotNull::Exec(ctx, batch, out); + applicator::ScalarUnaryNotNullStateful kernel{Op()}; + return kernel.Exec(ctx, batch, out); } else { -// const time_zone* tz = locate_zone(timezone); - return ScalarUnaryNotNull::Exec(ctx, batch, out); + applicator::ScalarUnaryNotNullStateful kernel{ + Op(locate_zone(timezone))}; + return kernel.Exec(ctx, batch, out); } } }; @@ -95,6 +97,7 @@ struct DayOfWeekExec { using OutValue = typename internal::GetOutputType::T; static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& timezone = GetInputTimezone(batch.values[0]); const DayOfWeekOptions& options = DayOfWeekState::Get(ctx); if (options.week_start < 1 || 7 < options.week_start) { return Status::Invalid( @@ -102,10 +105,15 @@ struct DayOfWeekExec { options.week_start); } - RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0])); - applicator::ScalarUnaryNotNullStateful kernel{ - Op(options)}; - return kernel.Exec(ctx, batch, out); + if (timezone.empty()) { + applicator::ScalarUnaryNotNullStateful kernel{ + Op(options)}; + return kernel.Exec(ctx, batch, out); + } else { + applicator::ScalarUnaryNotNullStateful kernel{ + Op(options, locate_zone(timezone))}; + return kernel.Exec(ctx, batch, out); + } } }; @@ -114,17 +122,19 @@ struct DayOfWeekExec { template struct Year { + explicit Year(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - return static_cast(static_cast( - year_month_day(floor(sys_time(Duration{arg}))).year())); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + return static_cast(static_cast( + year_month_day(floor(sys_time(Duration{arg}))).year())); + } return static_cast(static_cast( - year_month_day(floor(tz->to_local(sys_time(Duration{arg})))) + year_month_day(floor(tz_->to_local(sys_time(Duration{arg})))) .year())); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -132,17 +142,19 @@ struct Year { template struct Month { + explicit Month(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - return static_cast(static_cast( - year_month_day(floor(sys_time(Duration{arg}))).month())); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + return static_cast(static_cast( + year_month_day(floor(sys_time(Duration{arg}))).month())); + } return static_cast(static_cast( - year_month_day(floor(tz->to_local(sys_time(Duration{arg})))) + year_month_day(floor(tz_->to_local(sys_time(Duration{arg})))) .month())); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -150,17 +162,19 @@ struct Month { template struct Day { + explicit Day(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - return static_cast(static_cast( - year_month_day(floor(sys_time(Duration{arg}))).day())); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + return static_cast(static_cast( + year_month_day(floor(sys_time(Duration{arg}))).day())); + } return static_cast(static_cast( - year_month_day(floor(tz->to_local(sys_time(Duration{arg})))) + year_month_day(floor(tz_->to_local(sys_time(Duration{arg})))) .day())); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -172,7 +186,8 @@ struct Day { template struct DayOfWeek { - explicit DayOfWeek(const DayOfWeekOptions& options) { + explicit DayOfWeek(const DayOfWeekOptions& options, const time_zone* tz = nullptr) + : tz_(tz) { for (int i = 0; i < 7; i++) { lookup_table[i] = i + 8 - options.week_start; lookup_table[i] = (lookup_table[i] > 6) ? lookup_table[i] - 7 : lookup_table[i]; @@ -182,21 +197,21 @@ struct DayOfWeek { template T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + const auto wd = arrow_vendored::date::year_month_weekday( + floor(sys_time(Duration{arg}))) + .weekday() + .iso_encoding(); + return lookup_table[wd - 1]; + } const auto wd = arrow_vendored::date::year_month_weekday( - floor(sys_time(Duration{arg}))) - .weekday() - .iso_encoding(); - return lookup_table[wd - 1]; - } - template - T Call(KernelContext*, Arg0 arg, const time_zone* tz, Status*) const { - const auto wd = arrow_vendored::date::year_month_weekday( - floor(tz->to_local(sys_time(Duration{arg})))) + floor(tz_->to_local(sys_time(Duration{arg})))) .weekday() .iso_encoding(); return lookup_table[wd - 1]; } std::array lookup_table; + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -204,18 +219,20 @@ struct DayOfWeek { template struct DayOfYear { + explicit DayOfYear(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - const auto t = floor(sys_time(Duration{arg})); - return static_cast( - (t - sys_time(year_month_day(t).year() / jan / 0)).count()); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - const auto t = floor(tz->to_local(sys_time(Duration{arg}))); + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + const auto t = floor(sys_time(Duration{arg})); + return static_cast( + (t - sys_time(year_month_day(t).year() / jan / 0)).count()); + } + const auto t = floor(tz_->to_local(sys_time(Duration{arg}))); auto start = local_days(year_month_day(t).year() / jan / 0); return static_cast((t - start).count()); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -226,19 +243,20 @@ struct DayOfYear { template struct ISOYear { + explicit ISOYear(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - const auto t = floor(sys_time(Duration{arg})); - auto y = year_month_day{t + days{3}}.year(); - auto start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); - if (t < start) { - --y; + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + const auto t = floor(sys_time(Duration{arg})); + auto y = year_month_day{t + days{3}}.year(); + auto start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + if (t < start) { + --y; + } + return static_cast(static_cast(y)); } - return static_cast(static_cast(y)); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - const auto t = floor(tz->to_local(sys_time(Duration{arg}))); + const auto t = floor(tz_->to_local(sys_time(Duration{arg}))); auto y = year_month_day{t + days{3}}.year(); auto start = local_days((y - years{1}) / dec / thu[last]) + (mon - thu); if (t < start) { @@ -246,6 +264,7 @@ struct ISOYear { } return static_cast(static_cast(y)); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -257,20 +276,21 @@ struct ISOYear { // https://github.com/HowardHinnant/date/blob/6e921e1b1d21e84a5c82416ba7ecd98e33a436d0/include/date/iso_week.h#L1503 template struct ISOWeek { + explicit ISOWeek(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - const auto t = floor(sys_time(Duration{arg})); - auto y = year_month_day{t + days{3}}.year(); - auto start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); - if (t < start) { - --y; - start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + const auto t = floor(sys_time(Duration{arg})); + auto y = year_month_day{t + days{3}}.year(); + auto start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + if (t < start) { + --y; + start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + } + return static_cast(trunc(t - start).count() + 1); } - return static_cast(trunc(t - start).count() + 1); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - const auto t = floor(tz->to_local(sys_time(Duration{arg}))); + const auto t = floor(tz_->to_local(sys_time(Duration{arg}))); auto y = year_month_day{t + days{3}}.year(); auto start = local_days((y - years{1}) / dec / thu[last]) + (mon - thu); if (t < start) { @@ -279,6 +299,7 @@ struct ISOWeek { } return static_cast(trunc(local_days(t) - start).count() + 1); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -286,17 +307,19 @@ struct ISOWeek { template struct Quarter { + explicit Quarter(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - const auto ymd = year_month_day(floor(sys_time(Duration{arg}))); - return static_cast((static_cast(ymd.month()) - 1) / 3 + 1); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + const auto ymd = year_month_day(floor(sys_time(Duration{arg}))); + return static_cast((static_cast(ymd.month()) - 1) / 3 + 1); + } const auto ymd = - year_month_day(floor(tz->to_local(sys_time(Duration{arg})))); + year_month_day(floor(tz_->to_local(sys_time(Duration{arg})))); return static_cast((static_cast(ymd.month()) - 1) / 3 + 1); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -304,16 +327,18 @@ struct Quarter { template struct Hour { + explicit Hour(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - Duration t = Duration{arg}; - return static_cast((t - floor(t)) / std::chrono::hours(1)); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - const auto t = tz->to_local(sys_time(Duration{arg})); + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + Duration t = Duration{arg}; + return static_cast((t - floor(t)) / std::chrono::hours(1)); + } + const auto t = tz_->to_local(sys_time(Duration{arg})); return static_cast((t - floor(t)) / std::chrono::hours(1)); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -321,16 +346,18 @@ struct Hour { template struct Minute { + explicit Minute(const time_zone* tz = nullptr) : tz_(tz) {} + template - static T Call(KernelContext*, Arg0 arg, Status*) { - Duration t = Duration{arg}; - return static_cast((t - floor(t)) / std::chrono::minutes(1)); - } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - const auto t = tz->to_local(sys_time(Duration{arg})); + T Call(KernelContext*, Arg0 arg, Status*) const { + if (tz_ == nullptr) { + Duration t = Duration{arg}; + return static_cast((t - floor(t)) / std::chrono::minutes(1)); + } + const auto t = tz_->to_local(sys_time(Duration{arg})); return static_cast((t - floor(t)) / std::chrono::minutes(1)); } + const time_zone* tz_; }; // ---------------------------------------------------------------------- @@ -343,11 +370,6 @@ struct Second { Duration t = Duration{arg}; return static_cast((t - floor(t)) / std::chrono::seconds(1)); } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - const auto t = tz->to_local(sys_time(Duration{arg})); - return static_cast((t - floor(t)) / std::chrono::seconds(1)); - } }; // ---------------------------------------------------------------------- @@ -361,12 +383,6 @@ struct Subsecond { return static_cast( (std::chrono::duration(t - floor(t)).count())); } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - Duration t = Duration{arg}; - return static_cast( - (std::chrono::duration(t - floor(t)).count())); - } }; // ---------------------------------------------------------------------- @@ -380,12 +396,6 @@ struct Millisecond { return static_cast( ((t - floor(t)) / std::chrono::milliseconds(1)) % 1000); } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - Duration t = Duration{arg}; - return static_cast( - ((t - floor(t)) / std::chrono::milliseconds(1)) % 1000); - } }; // ---------------------------------------------------------------------- @@ -399,12 +409,6 @@ struct Microsecond { return static_cast( ((t - floor(t)) / std::chrono::microseconds(1)) % 1000); } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - Duration t = Duration{arg}; - return static_cast( - ((t - floor(t)) / std::chrono::microseconds(1)) % 1000); - } }; // ---------------------------------------------------------------------- @@ -418,12 +422,6 @@ struct Nanosecond { return static_cast( ((t - floor(t)) / std::chrono::nanoseconds(1)) % 1000); } - template - static T Call(KernelContext*, int64_t arg, const time_zone* tz, Status*) { - Duration t = Duration{arg}; - return static_cast( - ((t - floor(t)) / std::chrono::nanoseconds(1)) % 1000); - } }; template @@ -462,9 +460,7 @@ inline std::vector get_iso_calendar(int64_t arg, const time_zone* tz) { template struct ISOCalendar { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { -// RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in)); - const std::string timezone = TemporalComponentExtractCheckTimezone(in); - // const time_zone* tz = locate_zone(timezone); + std::string timezone = GetInputTimezone(in); if (in.is_valid) { const std::shared_ptr iso_calendar_type = @@ -490,9 +486,7 @@ struct ISOCalendar { static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { using BuilderType = typename TypeTraits::BuilderType; -// RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in)); - const std::string timezone = TemporalComponentExtractCheckTimezone(in); - // const time_zone* tz = locate_zone(timezone); + std::string timezone = GetInputTimezone(in); const std::shared_ptr iso_calendar_type = struct_({field("iso_year", int64()), field("iso_week", int64()), @@ -539,6 +533,44 @@ struct ISOCalendar { } }; +template