From 0e8f2faa6e08b33c2da2f056d36fe18fb0724562 Mon Sep 17 00:00:00 2001 From: Rok Date: Sun, 27 Jun 2021 22:40:09 +0200 Subject: [PATCH 01/10] Add strftime kernel. --- cpp/src/arrow/compute/api_scalar.cc | 17 ++++ cpp/src/arrow/compute/api_scalar.h | 27 +++++ cpp/src/arrow/compute/function_test.cc | 3 + .../arrow/compute/kernels/scalar_temporal.cc | 99 +++++++++++++++++++ .../compute/kernels/scalar_temporal_test.cc | 23 +++++ docs/source/cpp/compute.rst | 4 + python/pyarrow/_compute.pyx | 13 +++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 4 + python/pyarrow/tests/test_compute.py | 31 +++++- r/src/compute.cpp | 12 +++ 11 files changed, 233 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f9c95a7745b..7a1a28f9e9c 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -145,6 +145,9 @@ static auto kSetLookupOptionsType = GetFunctionOptionsType( static auto kStrptimeOptionsType = GetFunctionOptionsType( DataMember("format", &StrptimeOptions::format), DataMember("unit", &StrptimeOptions::unit)); +static auto kStrftimeOptionsType = GetFunctionOptionsType( + DataMember("format", &StrftimeOptions::format), + DataMember("timezone", &StrftimeOptions::timezone)); static auto kPadOptionsType = GetFunctionOptionsType( DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding)); static auto kTrimOptionsType = GetFunctionOptionsType( @@ -238,6 +241,15 @@ StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit) StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {} constexpr char StrptimeOptions::kTypeName[]; +StrftimeOptions::StrftimeOptions(std::string format, std::string timezone) + : FunctionOptions(internal::kStrftimeOptionsType), + format(std::move(format)), + timezone(std::move(timezone)) { + tz = arrow_vendored::date::locate_zone(this->timezone); +} +StrftimeOptions::StrftimeOptions() : StrftimeOptions("%Y-%m-%dT%H:%M:%S", "UTC") {} +constexpr char StrftimeOptions::kTypeName[]; + PadOptions::PadOptions(int64_t width, std::string padding) : FunctionOptions(internal::kPadOptionsType), width(width), @@ -294,6 +306,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kStrftimeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); @@ -495,5 +508,9 @@ Result DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* return CallFunction("day_of_week", {arg}, &options, ctx); } +Result Strftime(const Datum& arg, StrftimeOptions options, ExecContext* ctx) { + return CallFunction("strftime", {arg}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 115cad13d6f..5a053bff05a 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -29,6 +29,7 @@ #include "arrow/result.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" +#include "arrow/vendored/datetime.h" namespace arrow { namespace compute { @@ -178,6 +179,20 @@ class ARROW_EXPORT StrptimeOptions : public FunctionOptions { TimeUnit::type unit; }; +class ARROW_EXPORT StrftimeOptions : public FunctionOptions { + public: + explicit StrftimeOptions(std::string format, std::string timezone); + StrftimeOptions(); + constexpr static char const kTypeName[] = "StrftimeOptions"; + + /// The desired format string. + std::string format; + /// Timezone to output the time in. + std::string timezone; + /// Timezone to output the time in. + const arrow_vendored::date::time_zone* tz; +}; + class ARROW_EXPORT PadOptions : public FunctionOptions { public: explicit PadOptions(int64_t width, std::string padding = " "); @@ -999,5 +1014,17 @@ Result Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR); /// \note API not yet finalized ARROW_EXPORT Result Subsecond(const Datum& values, ExecContext* ctx = NULLPTR); +/// \brief Strftime +/// +/// \param[in] values input to print time string from +/// \param[in] options for setting time format and timezone +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Strftime(const Datum& values, StrftimeOptions options, + ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 16d3affe720..7d344886886 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -79,6 +79,9 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new SetLookupOptions(ArrayFromJSON(boolean(), "[true, false]"))); options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI)); options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO)); +#ifndef _WIN32 + options.emplace_back(new StrftimeOptions("%Y-%m-%dT%H:%M:%S", "UTC")); +#endif options.emplace_back(new PadOptions(5, " ")); options.emplace_back(new PadOptions(10, "A")); options.emplace_back(new TrimOptions(" ")); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc index b6cabddd118..c3ae720bbda 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -55,6 +55,7 @@ using internal::applicator::ScalarUnaryNotNull; using internal::applicator::SimpleUnary; using DayOfWeekState = OptionsWrapper; +using StrftimeState = OptionsWrapper; const std::shared_ptr& IsoCalendarType() { static auto type = struct_({field("iso_year", int64()), field("iso_week", int64()), @@ -444,6 +445,53 @@ struct Nanosecond { } }; +// ---------------------------------------------------------------------- +// Convert timestamps to a string representation with an arbitrary format + +template +inline std::string get_timestamp(int64_t arg, const StrftimeOptions* options) { + auto zt = arrow_vendored::date::zoned_time{options->tz, + sys_time(Duration{arg})}; + return arrow_vendored::date::format(options->format, zt.get_local_time()); +} + +template +struct Strftime { + static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + const StrftimeOptions options = StrftimeState::Get(ctx); + + if (in.is_valid) { + const auto& in_val = internal::UnboxScalar::Unbox(in); + *checked_cast(out) = + StringScalar(get_timestamp(in_val, &options)); + } else { + out->is_valid = false; + } + return Status::OK(); + } + + static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { + const StrftimeOptions options = StrftimeState::Get(ctx); + + std::unique_ptr array_builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), utf8(), &array_builder)); + StringBuilder* string_builder = checked_cast(array_builder.get()); + RETURN_NOT_OK(string_builder->Reserve(in.length * 30)); + + auto visit_null = [&]() { return string_builder->AppendNull(); }; + auto visit_value = [&](int64_t arg) { + return string_builder->Append(get_timestamp(arg, &options)); + }; + RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); + + std::shared_ptr out_array; + RETURN_NOT_OK(string_builder->Finish(&out_array)); + *out = *std::move(out_array->data()); + + return Status::OK(); + } +}; + // ---------------------------------------------------------------------- // Extract ISO calendar values from timestamp @@ -580,6 +628,41 @@ std::shared_ptr MakeTemporal( return func; } +std::shared_ptr MakeStrftime(std::string name, const FunctionDoc* doc, + const StrftimeOptions& default_options, + KernelInit init) { + const auto& out_type = utf8(); + auto func = + std::make_shared(name, Arity::Unary(), doc, &default_options); + + for (auto unit : internal::AllTimeUnits()) { + InputType in_type{match::TimestampTypeUnit(unit)}; + switch (unit) { + case TimeUnit::SECOND: { + auto exec = SimpleUnary>; + DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init)); + break; + } + case TimeUnit::MILLI: { + auto exec = SimpleUnary>; + DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init)); + break; + } + case TimeUnit::MICRO: { + auto exec = SimpleUnary>; + DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init)); + break; + } + case TimeUnit::NANO: { + auto exec = SimpleUnary>; + DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init)); + break; + } + } + } + return func; +} + template