diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f6d5a540c98..8e411898a34 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -397,6 +397,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_nested.cc compute/kernels/scalar_set_lookup.cc compute/kernels/scalar_string.cc + compute/kernels/scalar_temporal.cc compute/kernels/scalar_validity.cc compute/kernels/scalar_fill_null.cc compute/kernels/scalar_if_else.cc diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 6f77d6f9785..dba71456c29 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -172,5 +172,25 @@ Result IfElse(const Datum& cond, const Datum& if_true, const Datum& if_fa return CallFunction("if_else", {cond, if_true, if_false}, ctx); } +// ---------------------------------------------------------------------- +// Temporal functions + +SCALAR_EAGER_UNARY(Year, "year") +SCALAR_EAGER_UNARY(Month, "month") +SCALAR_EAGER_UNARY(Day, "day") +SCALAR_EAGER_UNARY(DayOfWeek, "day_of_week") +SCALAR_EAGER_UNARY(DayOfYear, "day_of_year") +SCALAR_EAGER_UNARY(ISOYear, "iso_year") +SCALAR_EAGER_UNARY(ISOWeek, "iso_week") +SCALAR_EAGER_UNARY(ISOCalendar, "iso_calendar") +SCALAR_EAGER_UNARY(Quarter, "quarter") +SCALAR_EAGER_UNARY(Hour, "hour") +SCALAR_EAGER_UNARY(Minute, "minute") +SCALAR_EAGER_UNARY(Second, "second") +SCALAR_EAGER_UNARY(Millisecond, "millisecond") +SCALAR_EAGER_UNARY(Microsecond, "microsecond") +SCALAR_EAGER_UNARY(Nanosecond, "nanosecond") +SCALAR_EAGER_UNARY(Subsecond, "subsecond") + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 21d5c5324d4..190696f6ed5 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -521,5 +521,188 @@ ARROW_EXPORT Result IfElse(const Datum& cond, const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. +/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. Week starts on Monday denoted by 0 and ends on Sunday denoted by 6. +/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result DayOfWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. +/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of its days in January. +/// +/// \param[in] values input to extract ISO year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOWeek returns ISO week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of its days in January. +/// Week of the year starts with 1 and can run up to 53. +/// +/// \param[in] values input to extract ISO week of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for +/// each element of `values`. +/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7. +/// +/// \param[in] values input to ISO calendar struct from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Quarter returns the quarter of year number for each element of `values` +/// First quarter maps to 1 and fourth quarter maps to 4. +/// +/// \param[in] values input to extract quarter of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Quarter(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Hour returns hour value for each element of `values` +/// +/// \param[in] values input to extract hour from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Hour(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Minute returns minutes value for each element of `values` +/// +/// \param[in] values input to extract minutes from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Minute(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Second returns seconds value for each element of `values` +/// +/// \param[in] values input to extract seconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Second(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Millisecond returns number of milliseconds since the last full second +/// for each element of `values` +/// +/// \param[in] values input to extract milliseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Millisecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Microsecond returns number of microseconds since the last full millisecond +/// for each element of `values` +/// +/// \param[in] values input to extract microseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Microsecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Nanosecond returns number of nanoseconds since the last full millisecond +/// for each element of `values` +/// +/// \param[in] values input to extract nanoseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Subsecond returns the fraction of second elapsed since last full second +/// as a float for each element of `values` +/// +/// \param[in] values input to extract subsecond from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Subsecond(const Datum& values, ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index fc11d144105..326578588a7 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -27,6 +27,7 @@ add_arrow_compute_test(scalar_test scalar_nested_test.cc scalar_set_lookup_test.cc scalar_string_test.cc + scalar_temporal_test.cc scalar_validity_test.cc scalar_fill_null_test.cc scalar_if_else_test.cc diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc new file mode 100644 index 00000000000..cc22ccf044a --- /dev/null +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -0,0 +1,631 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/builder.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/util/time.h" +#include "arrow/vendored/datetime.h" + +namespace arrow { + +namespace compute { + +namespace internal { + +namespace { + +using arrow_vendored::date::days; +using arrow_vendored::date::floor; +using arrow_vendored::date::hh_mm_ss; +using arrow_vendored::date::sys_time; +using arrow_vendored::date::trunc; +using arrow_vendored::date::weekday; +using arrow_vendored::date::weeks; +using arrow_vendored::date::year_month_day; +using arrow_vendored::date::years; +using arrow_vendored::date::literals::dec; +using arrow_vendored::date::literals::jan; +using arrow_vendored::date::literals::last; +using arrow_vendored::date::literals::mon; +using arrow_vendored::date::literals::thu; +using internal::applicator::ScalarUnaryNotNull; +using internal::applicator::SimpleUnary; + +// Based on ScalarUnaryNotNullStateful. Adds timezone awareness. +template +struct ScalarUnaryStatefulTemporal { + using ThisType = ScalarUnaryStatefulTemporal; + using OutValue = typename internal::GetOutputType::T; + + Op op; + explicit ScalarUnaryStatefulTemporal(Op op) : op(std::move(op)) {} + + template + struct ArrayExec { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, + Datum* out) { + const std::string timezone = + checked_pointer_cast(arg0.type)->timezone(); + Status st = Status::OK(); + ArrayData* out_arr = out->mutable_array(); + auto out_data = out_arr->GetMutableValues(1); + + if (timezone.empty()) { + internal::VisitArrayValuesInline( + arg0, + [&](int64_t v) { + *out_data++ = functor.op.template Call(ctx, v, &st); + }, + [&]() { + // null + ++out_data; + }); + } else { + st = Status::Invalid("Timezone aware timestamps not supported. Timezone found: ", + timezone); + } + return st; + } + }; + + Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) { + const std::string timezone = + checked_pointer_cast(arg0.type)->timezone(); + Status st = Status::OK(); + if (timezone.empty()) { + if (arg0.is_valid) { + int64_t arg0_val = internal::UnboxScalar::Unbox(arg0); + internal::BoxScalar::Box( + this->op.template Call(ctx, arg0_val, &st), out->scalar().get()); + } + } else { + st = Status::Invalid("Timezone aware timestamps not supported. Timezone found: ", + timezone); + } + return st; + } + + Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].kind() == Datum::ARRAY) { + return ArrayExec::Exec(*this, ctx, *batch[0].array(), out); + } else { + return Scalar(ctx, *batch[0].scalar(), out); + } + } +}; + +template +struct ScalarUnaryTemporal { + using OutValue = typename internal::GetOutputType::T; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // Seed kernel with dummy state + ScalarUnaryStatefulTemporal kernel({}); + return kernel.Exec(ctx, batch, out); + } +}; + +// ---------------------------------------------------------------------- +// Extract year from timestamp + +template +struct Year { + template + static T Call(KernelContext*, int64_t arg, Status*) { + return static_cast(static_cast( + year_month_day(floor(sys_time(Duration{arg}))).year())); + } +}; + +// ---------------------------------------------------------------------- +// Extract month from timestamp + +template +struct Month { + template + static T Call(KernelContext*, int64_t arg, Status*) { + return static_cast(static_cast( + year_month_day(floor(sys_time(Duration{arg}))).month())); + } +}; + +// ---------------------------------------------------------------------- +// Extract day from timestamp + +template +struct Day { + template + static T Call(KernelContext*, int64_t arg, Status*) { + return static_cast(static_cast( + year_month_day(floor(sys_time(Duration{arg}))).day())); + } +}; + +// ---------------------------------------------------------------------- +// Extract day of week from timestamp + +template +struct DayOfWeek { + template + static T Call(KernelContext*, int64_t arg, Status*) { + return static_cast( + weekday(year_month_day(floor(sys_time(Duration{arg})))) + .iso_encoding() - + 1); + } +}; + +// ---------------------------------------------------------------------- +// Extract day of year from timestamp + +template +struct DayOfYear { + template + static T Call(KernelContext*, int64_t arg, Status*) { + const auto t = floor(sys_time(Duration{arg})); + return static_cast( + (t - sys_time(year_month_day(t).year() / jan / 0)).count()); + } +}; + +// ---------------------------------------------------------------------- +// Extract ISO Year values from timestamp +// +// First week of an ISO year has the majority (4 or more) of it's days in January. +// Last week of an ISO year has the year's last Thursday in it. + +template +struct ISOYear { + template + static T Call(KernelContext*, int64_t arg, Status*) { + const auto t = floor(sys_time(Duration{arg})); + auto y = year_month_day{t + days{3}}.year(); + auto start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + if (t < start) { + --y; + } + return static_cast(static_cast(y)); + } +}; + +// ---------------------------------------------------------------------- +// Extract ISO week from timestamp +// +// First week of an ISO year has the majority (4 or more) of it's days in January. +// Last week of an ISO year has the year's last Thursday in it. +// Based on +// https://github.com/HowardHinnant/date/blob/6e921e1b1d21e84a5c82416ba7ecd98e33a436d0/include/date/iso_week.h#L1503 +template +struct ISOWeek { + template + static T Call(KernelContext*, int64_t arg, Status*) { + const auto t = floor(sys_time(Duration{arg})); + auto y = year_month_day{t + days{3}}.year(); + auto start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + if (t < start) { + --y; + start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + } + return static_cast(trunc(t - start).count() + 1); + } +}; + +// ---------------------------------------------------------------------- +// Extract quarter from timestamp + +template +struct Quarter { + template + static T Call(KernelContext*, int64_t arg, Status*) { + const auto ymd = year_month_day(floor(sys_time(Duration{arg}))); + return static_cast((static_cast(ymd.month()) - 1) / 3 + 1); + } +}; + +// ---------------------------------------------------------------------- +// Extract hour from timestamp + +template +struct Hour { + template + static T Call(KernelContext*, int64_t arg, Status*) { + Duration t = Duration{arg}; + return static_cast((t - floor(t)) / std::chrono::hours(1)); + } +}; + +// ---------------------------------------------------------------------- +// Extract minute from timestamp + +template +struct Minute { + template + static T Call(KernelContext*, int64_t arg, Status*) { + Duration t = Duration{arg}; + return static_cast((t - floor(t)) / std::chrono::minutes(1)); + } +}; + +// ---------------------------------------------------------------------- +// Extract second from timestamp + +template +struct Second { + template + static T Call(KernelContext*, int64_t arg, Status*) { + Duration t = Duration{arg}; + return static_cast((t - floor(t)) / std::chrono::seconds(1)); + } +}; + +// ---------------------------------------------------------------------- +// Extract subsecond from timestamp + +template +struct Subsecond { + template + static T Call(KernelContext*, int64_t arg, Status*) { + Duration t = Duration{arg}; + return static_cast( + (std::chrono::duration(t - floor(t)).count())); + } +}; + +// ---------------------------------------------------------------------- +// Extract milliseconds from timestamp + +template +struct Millisecond { + template + static T Call(KernelContext*, int64_t arg, Status*) { + Duration t = Duration{arg}; + return static_cast( + ((t - floor(t)) / std::chrono::milliseconds(1)) % 1000); + } +}; + +// ---------------------------------------------------------------------- +// Extract microseconds from timestamp + +template +struct Microsecond { + template + static T Call(KernelContext*, int64_t arg, Status*) { + Duration t = Duration{arg}; + return static_cast( + ((t - floor(t)) / std::chrono::microseconds(1)) % 1000); + } +}; + +// ---------------------------------------------------------------------- +// Extract nanoseconds from timestamp + +template +struct Nanosecond { + template + static T Call(KernelContext*, int64_t arg, Status*) { + Duration t = Duration{arg}; + return static_cast( + ((t - floor(t)) / std::chrono::nanoseconds(1)) % 1000); + } +}; + +template +inline std::vector get_iso_calendar(int64_t arg) { + const auto t = floor(sys_time(Duration{arg})); + const auto ymd = year_month_day(t); + auto y = year_month_day{t + days{3}}.year(); + auto start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + if (t < start) { + --y; + start = sys_time((y - years{1}) / dec / thu[last]) + (mon - thu); + } + return {static_cast(static_cast(y)), + static_cast(trunc(t - start).count() + 1), + static_cast(weekday(ymd).iso_encoding())}; +} + +// ---------------------------------------------------------------------- +// Extract ISO calendar values from timestamp + +template +struct ISOCalendar { + static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + const std::string timezone = + checked_pointer_cast(in.type)->timezone(); + if (!timezone.empty()) { + return Status::Invalid("Timezone aware timestamps not supported. Timezone found: ", + timezone); + } + + if (in.is_valid) { + const std::shared_ptr iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + const auto& in_val = internal::UnboxScalar::Unbox(in); + const auto iso_calendar = get_iso_calendar(in_val); + + std::vector> values = { + std::make_shared(iso_calendar[0]), + std::make_shared(iso_calendar[1]), + std::make_shared(iso_calendar[2])}; + *checked_cast(out) = StructScalar(values, iso_calendar_type); + } else { + out->is_valid = false; + } + return Status::OK(); + } + + static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { + using BuilderType = typename TypeTraits::BuilderType; + const std::string timezone = + checked_pointer_cast(in.type)->timezone(); + if (!timezone.empty()) { + return Status::Invalid("Timezone aware timestamps not supported. Timezone found: ", + timezone); + } + const std::shared_ptr iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + + std::unique_ptr array_builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), iso_calendar_type, &array_builder)); + StructBuilder* struct_builder = checked_cast(array_builder.get()); + RETURN_NOT_OK(struct_builder->Reserve(in.length)); + + std::vector field_builders; + field_builders.reserve(3); + for (int i = 0; i < 3; i++) { + field_builders.push_back( + checked_cast(struct_builder->field_builder(i))); + RETURN_NOT_OK(field_builders[i]->Reserve(1)); + } + auto visit_null = [&]() { return struct_builder->AppendNull(); }; + auto visit_value = [&](int64_t arg) { + const auto iso_calendar = get_iso_calendar(arg); + field_builders[0]->UnsafeAppend(iso_calendar[0]); + field_builders[1]->UnsafeAppend(iso_calendar[1]); + field_builders[2]->UnsafeAppend(iso_calendar[2]); + return struct_builder->Append(); + }; + RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); + + std::shared_ptr out_array; + RETURN_NOT_OK(struct_builder->Finish(&out_array)); + *out = *std::move(out_array->data()); + + return Status::OK(); + } +}; + +template