diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f9c95a7745b..b0aeec7c92e 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -145,6 +145,8 @@ static auto kSetLookupOptionsType = GetFunctionOptionsType( static auto kStrptimeOptionsType = GetFunctionOptionsType( DataMember("format", &StrptimeOptions::format), DataMember("unit", &StrptimeOptions::unit)); +static auto kStrftimeOptionsType = GetFunctionOptionsType( + DataMember("format", &StrftimeOptions::format)); static auto kPadOptionsType = GetFunctionOptionsType( DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding)); static auto kTrimOptionsType = GetFunctionOptionsType( @@ -238,6 +240,14 @@ StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit) StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {} constexpr char StrptimeOptions::kTypeName[]; +StrftimeOptions::StrftimeOptions(std::string format, std::string locale) + : FunctionOptions(internal::kStrftimeOptionsType), + format(std::move(format)), + locale(std::move(locale)) {} +StrftimeOptions::StrftimeOptions() : StrftimeOptions(kDefaultFormat) {} +constexpr char StrftimeOptions::kTypeName[]; +constexpr const char* StrftimeOptions::kDefaultFormat; + PadOptions::PadOptions(int64_t width, std::string padding) : FunctionOptions(internal::kPadOptionsType), width(width), @@ -294,6 +304,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kStrftimeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); @@ -495,5 
+506,9 @@ Result DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* return CallFunction("day_of_week", {arg}, &options, ctx); } +Result Strftime(const Datum& arg, StrftimeOptions options, ExecContext* ctx) { + return CallFunction("strftime", {arg}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 115cad13d6f..f1672a05223 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -178,6 +178,21 @@ class ARROW_EXPORT StrptimeOptions : public FunctionOptions { TimeUnit::type unit; }; +class ARROW_EXPORT StrftimeOptions : public FunctionOptions { + public: + explicit StrftimeOptions(std::string format, std::string locale = "C"); + StrftimeOptions(); + + constexpr static char const kTypeName[] = "StrftimeOptions"; + + constexpr static const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%SZ"; + + /// The desired format string. + std::string format; + /// The desired output locale string. + std::string locale; +}; + class ARROW_EXPORT PadOptions : public FunctionOptions { public: explicit PadOptions(int64_t width, std::string padding = " "); @@ -999,5 +1014,20 @@ Result Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR); /// \note API not yet finalized ARROW_EXPORT Result Subsecond(const Datum& values, ExecContext* ctx = NULLPTR); +/// \brief Format timestamps according to a format string +/// +/// Return formatted time strings according to the format string +/// `StrftimeOptions::format` and to the locale specifier `StrftimeOptions::locale`. 
+/// +/// \param[in] values input timestamps +/// \param[in] options for setting format string and locale +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 6.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Strftime(const Datum& values, StrftimeOptions options, + ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 16d3affe720..d7ebdf3de1d 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -79,6 +79,7 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new SetLookupOptions(ArrayFromJSON(boolean(), "[true, false]"))); options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI)); options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO)); + options.emplace_back(new StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C")); options.emplace_back(new PadOptions(5, " ")); options.emplace_back(new PadOptions(10, "A")); options.emplace_back(new TrimOptions(" ")); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc index b6cabddd118..32e92ea0d08 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+#include + #include "arrow/builder.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" @@ -55,6 +57,7 @@ using internal::applicator::ScalarUnaryNotNull; using internal::applicator::SimpleUnary; using DayOfWeekState = OptionsWrapper; +using StrftimeState = OptionsWrapper; const std::shared_ptr& IsoCalendarType() { static auto type = struct_({field("iso_year", int64()), field("iso_week", int64()), @@ -62,6 +65,10 @@ const std::shared_ptr& IsoCalendarType() { return type; } +const std::string& GetInputTimezone(const DataType& type) { + return checked_cast(type).timezone(); +} + const std::string& GetInputTimezone(const Datum& datum) { return checked_cast(*datum.type()).timezone(); } @@ -74,6 +81,22 @@ const std::string& GetInputTimezone(const ArrayData& array) { return checked_cast(*array.type).timezone(); } +Result LocateZone(const std::string& timezone) { + try { + return locate_zone(timezone); + } catch (const std::runtime_error& ex) { + return Status::Invalid("Cannot locate timezone '", timezone, "': ", ex.what()); + } +} + +Result GetLocale(const std::string& locale) { + try { + return std::locale(locale.c_str()); + } catch (const std::runtime_error& ex) { + return Status::Invalid("Cannot find locale '", locale, "': ", ex.what()); + } +} + struct NonZonedLocalizer { // No-op conversions: UTC -> UTC template @@ -117,12 +140,7 @@ struct TemporalComponentExtractBase { op}; return kernel.Exec(ctx, batch, out); } else { - const time_zone* tz; - try { - tz = locate_zone(timezone); - } catch (const std::runtime_error& ex) { - return Status::Invalid(ex.what()); - } + ARROW_ASSIGN_OR_RAISE(auto tz, LocateZone(timezone)); using ExecTemplate = Op; auto op = ExecTemplate(options, ZonedLocalizer{tz}); applicator::ScalarUnaryNotNullStateful kernel{ @@ -444,6 +462,114 @@ struct Nanosecond { } }; +// ---------------------------------------------------------------------- +// Convert timestamps to a string representation with an arbitrary format + 
+#ifndef _WIN32 +template +struct Strftime { + const StrftimeOptions& options; + const time_zone* tz; + const std::locale locale; + + static Result Make(KernelContext* ctx, const DataType& type) { + const StrftimeOptions& options = StrftimeState::Get(ctx); + + const auto& timezone = GetInputTimezone(type); + if (timezone.empty()) { + return Status::Invalid( + "Timestamps without a time zone cannot be reliably formatted."); + } + ARROW_ASSIGN_OR_RAISE(const time_zone* tz, LocateZone(timezone)); + + ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale(options.locale)); + + return Strftime{options, tz, std::move(locale)}; + } + + static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type)); + TimestampFormatter formatter{self.options.format, self.tz, self.locale}; + + if (in.is_valid) { + const int64_t in_val = internal::UnboxScalar::Unbox(in); + ARROW_ASSIGN_OR_RAISE(auto formatted, formatter(in_val)); + checked_cast(out)->value = Buffer::FromString(std::move(formatted)); + } else { + out->is_valid = false; + } + return Status::OK(); + } + + static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { + ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type)); + TimestampFormatter formatter{self.options.format, self.tz, self.locale}; + + StringBuilder string_builder; + // Presize string data using a heuristic + { + ARROW_ASSIGN_OR_RAISE(auto formatted, formatter(42)); + const auto string_size = static_cast(ceil(formatted.size() * 1.1)); + RETURN_NOT_OK(string_builder.Reserve(in.length)); + RETURN_NOT_OK( + string_builder.ReserveData((in.length - in.GetNullCount()) * string_size)); + } + + auto visit_null = [&]() { return string_builder.AppendNull(); }; + auto visit_value = [&](int64_t arg) { + ARROW_ASSIGN_OR_RAISE(auto formatted, formatter(arg)); + return string_builder.Append(std::move(formatted)); + }; + RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); + + 
std::shared_ptr out_array; + RETURN_NOT_OK(string_builder.Finish(&out_array)); + *out = *std::move(out_array->data()); + + return Status::OK(); + } + + struct TimestampFormatter { + const char* format; + const time_zone* tz; + std::ostringstream bufstream; + + explicit TimestampFormatter(const std::string& format, const time_zone* tz, + const std::locale& locale) + : format(format.c_str()), tz(tz) { + bufstream.imbue(locale); + // Propagate errors as C++ exceptions (to get an actual error message) + bufstream.exceptions(std::ios::failbit | std::ios::badbit); + } + + Result operator()(int64_t arg) { + bufstream.str(""); + const auto zt = arrow_vendored::date::zoned_time{ + tz, sys_time(Duration{arg})}; + try { + arrow_vendored::date::to_stream(bufstream, format, zt); + } catch (const std::runtime_error& ex) { + bufstream.clear(); + return Status::Invalid("Failed formatting timestamp: ", ex.what()); + } + // XXX could return a view with std::ostringstream::view() (C++20) + return std::move(bufstream).str(); + } + }; +}; + +#else +template +struct Strftime { + static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + return Status::NotImplemented("Strftime not yet implemented on windows."); + } + static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { + return Status::NotImplemented("Strftime not yet implemented on windows."); + } +}; +#endif + // ---------------------------------------------------------------------- // Extract ISO calendar values from timestamp @@ -473,12 +599,8 @@ struct ISOCalendar { if (timezone.empty()) { iso_calendar = GetIsoCalendar(in_val, NonZonedLocalizer{}); } else { - try { - const time_zone* tz = locate_zone(timezone); - iso_calendar = GetIsoCalendar(in_val, ZonedLocalizer{tz}); - } catch (const std::runtime_error& ex) { - return Status::Invalid(ex.what()); - } + ARROW_ASSIGN_OR_RAISE(auto tz, LocateZone(timezone)); + iso_calendar = GetIsoCalendar(in_val, ZonedLocalizer{tz}); } ScalarVector values = 
{std::make_shared(iso_calendar[0]), std::make_shared(iso_calendar[1]), @@ -518,19 +640,15 @@ struct ISOCalendar { }; RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); } else { - try { - const time_zone* tz = locate_zone(timezone); - auto visit_value = [&](int64_t arg) { - const auto iso_calendar = GetIsoCalendar(arg, ZonedLocalizer{tz}); - field_builders[0]->UnsafeAppend(iso_calendar[0]); - field_builders[1]->UnsafeAppend(iso_calendar[1]); - field_builders[2]->UnsafeAppend(iso_calendar[2]); - return struct_builder->Append(); - }; - RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); - } catch (const std::runtime_error& ex) { - return Status::Invalid(ex.what()); - } + ARROW_ASSIGN_OR_RAISE(auto tz, LocateZone(timezone)); + auto visit_value = [&](int64_t arg) { + const auto iso_calendar = GetIsoCalendar(arg, ZonedLocalizer{tz}); + field_builders[0]->UnsafeAppend(iso_calendar[0]); + field_builders[1]->UnsafeAppend(iso_calendar[1]); + field_builders[2]->UnsafeAppend(iso_calendar[2]); + return struct_builder->Append(); + }; + RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); } std::shared_ptr out_array; @@ -713,6 +831,15 @@ const FunctionDoc subsecond_doc{ "Returns an error if timestamp has a defined timezone. 
Null values return null."), {"values"}}; +const FunctionDoc strftime_doc{ + "Format timestamps according to a format string", + ("For each input timestamp, emit a formatted string.\n" + "The time format string and locale can be set using StrftimeOptions.\n" + "An error is returned if the timestamps don't have a defined timezone,\n" + "or if the timezone cannot be found in the timezone database."), + {"timestamps"}, + "StrftimeOptions"}; + } // namespace void RegisterScalarTemporal(FunctionRegistry* registry) { @@ -782,6 +909,11 @@ void RegisterScalarTemporal(FunctionRegistry* registry) { auto subsecond = MakeTemporal( "subsecond", float64(), &subsecond_doc); DCHECK_OK(registry->AddFunction(std::move(subsecond))); + + static auto default_strftime_options = StrftimeOptions(); + auto strftime = MakeSimpleUnaryTemporal( + "strftime", utf8(), &strftime_doc, &default_strftime_options, StrftimeState::Init); + DCHECK_OK(registry->AddFunction(std::move(strftime))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index fa00d985287..d8199089328 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -382,5 +382,112 @@ TEST_F(ScalarTemporalTest, DayOfWeek) { DayOfWeek(timestamps, DayOfWeekOptions(/*one_based_numbering=*/false, /*week_start=*/8))); } + +#ifndef _WIN32 +TEST_F(ScalarTemporalTest, Strftime) { + auto options_default = StrftimeOptions(); + auto options = StrftimeOptions("%Y-%m-%dT%H:%M:%S%z"); + + const char* seconds = R"(["1970-01-01T00:00:59", "2021-08-18T15:11:50", null])"; + const char* milliseconds = R"(["1970-01-01T00:00:59.123", null])"; + const char* microseconds = R"(["1970-01-01T00:00:59.123456", null])"; + const char* nanoseconds = R"(["1970-01-01T00:00:59.123456789", null])"; + + const char* default_seconds = R"( + ["1970-01-01T00:00:59Z", "2021-08-18T15:11:50Z", null])"; + const 
char* string_seconds = R"( + ["1970-01-01T00:00:59+0000", "2021-08-18T15:11:50+0000", null])"; + const char* string_milliseconds = R"(["1970-01-01T00:00:59.123+0000", null])"; + const char* string_microseconds = R"(["1970-01-01T05:30:59.123456+0530", null])"; + const char* string_nanoseconds = R"(["1969-12-31T14:00:59.123456789-1000", null])"; + + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND, "UTC"), seconds, utf8(), + default_seconds, &options_default); + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND, "UTC"), seconds, utf8(), + string_seconds, &options); + CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "GMT"), milliseconds, utf8(), + string_milliseconds, &options); + CheckScalarUnary("strftime", timestamp(TimeUnit::MICRO, "Asia/Kolkata"), microseconds, + utf8(), string_microseconds, &options); + CheckScalarUnary("strftime", timestamp(TimeUnit::NANO, "US/Hawaii"), nanoseconds, + utf8(), string_nanoseconds, &options); +} + +TEST_F(ScalarTemporalTest, StrftimeNoTimezone) { + const char* seconds = R"(["1970-01-01T00:00:59", null])"; + auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND), seconds); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Timestamps without a time zone cannot be reliably formatted"), + Strftime(arr, StrftimeOptions())); +} + +TEST_F(ScalarTemporalTest, StrftimeInvalidTimezone) { + const char* seconds = R"(["1970-01-01T00:00:59", null])"; + auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "non-existent"), seconds); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Cannot locate timezone 'non-existent'"), + Strftime(arr, StrftimeOptions())); +} + +TEST_F(ScalarTemporalTest, StrftimeCLocale) { + auto options_default = StrftimeOptions(); + auto options = StrftimeOptions("%Y-%m-%dT%H:%M:%S%z", "C"); + auto options_locale_specific = StrftimeOptions("%a", "C"); + + const char* seconds = R"(["1970-01-01T00:00:59", null])"; + const char* milliseconds = R"(["1970-01-01T00:00:59.123", 
null])"; + const char* microseconds = R"(["1970-01-01T00:00:59.123456", null])"; + const char* nanoseconds = R"(["1970-01-01T00:00:59.123456789", null])"; + + const char* default_seconds = R"(["1970-01-01T00:00:59Z", null])"; + const char* string_seconds = R"(["1970-01-01T00:00:59+0000", null])"; + const char* string_milliseconds = R"(["1970-01-01T00:00:59.123+0000", null])"; + const char* string_microseconds = R"(["1970-01-01T05:30:59.123456+0530", null])"; + const char* string_nanoseconds = R"(["1969-12-31T14:00:59.123456789-1000", null])"; + + const char* string_locale_specific = R"(["Wed", null])"; + + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND, "UTC"), seconds, utf8(), + default_seconds, &options_default); + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND, "UTC"), seconds, utf8(), + string_seconds, &options); + CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "GMT"), milliseconds, utf8(), + string_milliseconds, &options); + CheckScalarUnary("strftime", timestamp(TimeUnit::MICRO, "Asia/Kolkata"), microseconds, + utf8(), string_microseconds, &options); + CheckScalarUnary("strftime", timestamp(TimeUnit::NANO, "US/Hawaii"), nanoseconds, + utf8(), string_nanoseconds, &options); + + CheckScalarUnary("strftime", timestamp(TimeUnit::NANO, "US/Hawaii"), nanoseconds, + utf8(), string_locale_specific, &options_locale_specific); +} + +TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { + if (!LocaleExists("fr_FR.UTF-8")) { + GTEST_SKIP() << "locale 'fr_FR.UTF-8' doesn't exist on this system"; + } + + auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "fr_FR.UTF-8"); + const char* milliseconds = R"( + ["1970-01-01T00:00:59.123", "2021-08-18T15:11:50.456", null])"; + const char* expected = R"( + ["01 janvier 1970 00:00:59,123", "18 août 2021 15:11:50,456", null])"; + CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "UTC"), milliseconds, utf8(), + expected, &options); +} + +TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) { + auto options = 
StrftimeOptions("%d %B %Y %H:%M:%S", "non-existent"); + const char* seconds = R"(["1970-01-01T00:00:59", null])"; + auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), seconds); + + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("Cannot find locale 'non-existent'"), + Strftime(arr, options)); +} + +#endif + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 30cc59800f4..6147201a32e 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -566,6 +566,15 @@ std::shared_ptr TweakValidityBit(const std::shared_ptr& array, return MakeArray(data); } +bool LocaleExists(const char* locale) { + try { + std::locale loc(locale); + return true; + } catch (std::runtime_error&) { + return false; + } +} + class LocaleGuard::Impl { public: explicit Impl(const char* new_locale) : global_locale_(std::locale()) { diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 951fafdf968..3f9408ecdcb 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -494,6 +494,9 @@ std::vector IteratorToVector(Iterator iterator) { return out; } +ARROW_TESTING_EXPORT +bool LocaleExists(const char* locale); + // A RAII-style object that switches to a new locale, and switches back // to the old locale when going out of scope. Doesn't do anything if the // new locale doesn't exist on the local machine. diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 39bbbec3e16..aca7adfdf22 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -986,13 +986,15 @@ number of input and output types. The type to cast to can be passed in a :struct:`CastOptions` instance. As an alternative, the same service is provided by a concrete function :func:`~arrow::compute::Cast`. 
-+--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ -| Function name | Arity | Input types | Output type | Options class | -+==========================+============+====================+=======================+============================================+ -| cast | Unary | Many | Variable | :struct:`CastOptions` | -+--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ -| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` | -+--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ ++--------------------------+------------+--------------------+------------------+------------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++==========================+============+====================+==================+==============================+ +| cast | Unary | Many | Variable | :struct:`CastOptions` | ++--------------------------+------------+--------------------+------------------+------------------------------+ +| strftime | Unary | Timestamp | String | :struct:`StrftimeOptions` | ++--------------------------+------------+--------------------+------------------+------------------------------+ +| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` | ++--------------------------+------------+--------------------+------------------+------------------------------+ The conversions available with ``cast`` are listed below. In all cases, a null input value is converted into a null output value. 
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 2d82928f377..35a4ff696db 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -981,6 +981,18 @@ class StrptimeOptions(_StrptimeOptions): self._set_options(format, unit) +cdef class _StrftimeOptions(FunctionOptions): + def _set_options(self, format, locale): + self.wrapped.reset( + new CStrftimeOptions(tobytes(format), tobytes(locale)) + ) + + +class StrftimeOptions(_StrftimeOptions): + def __init__(self, format="%Y-%m-%dT%H:%M:%SZ", locale="C"): + self._set_options(format, locale) + + cdef class _DayOfWeekOptions(FunctionOptions): def _set_options(self, one_based_numbering, week_start): self.wrapped.reset( diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 10880c2974c..eea2328a49a 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -53,6 +53,7 @@ SplitOptions, SplitPatternOptions, StrptimeOptions, + StrftimeOptions, DayOfWeekOptions, TakeOptions, TDigestOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9260cd28f85..025eeef18af 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1951,6 +1951,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: "arrow::compute::StrptimeOptions"(CFunctionOptions): CStrptimeOptions(c_string format, TimeUnit unit) + cdef cppclass CStrftimeOptions \ + "arrow::compute::StrftimeOptions"(CFunctionOptions): + CStrftimeOptions(c_string format, c_string locale) + cdef cppclass CDayOfWeekOptions \ "arrow::compute::DayOfWeekOptions"(CFunctionOptions): CDayOfWeekOptions(c_bool one_based_numbering, uint32_t week_start) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 26d1f0e647c..c20d097f7c8 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -129,6 +129,7 @@ def test_option_class_equality(): 
pc.SplitPatternOptions(pattern="pattern"), pc.StrptimeOptions("%Y", "s"), pc.TrimOptions(" "), + pc.StrftimeOptions(), ] classes = {type(option) for option in options} for cls in exported_option_classes: @@ -1363,6 +1364,72 @@ def test_strptime(): assert got == expected +# TODO: We should test on windows once ARROW-13168 is resolved. +@pytest.mark.pandas +@pytest.mark.skipif(sys.platform == 'win32', + reason="Timezone database is not available on Windows yet") +def test_strftime(): + from pyarrow.vendored.version import Version + + def _fix_timestamp(s): + if Version(pd.__version__) <= Version("0.23.0"): + return s.to_series().replace("NaT", pd.NaT) + else: + return s + + times = ["2018-03-10 09:00", "2038-01-31 12:23", None] + timezones = ["CET", "UTC", "Europe/Ljubljana"] + + formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H", + "%I", "%p", "%M", "%z", "%Z", "%j", "%U", "%W", "%c", "%x", + "%X", "%%", "%G", "%V", "%u", "%V"] + + for timezone in timezones: + ts = pd.to_datetime(times).tz_localize(timezone) + for unit in ["s", "ms", "us", "ns"]: + tsa = pa.array(ts, type=pa.timestamp(unit, timezone)) + for fmt in formats: + options = pc.StrftimeOptions(fmt) + result = pc.strftime(tsa, options=options) + expected = pa.array(_fix_timestamp(ts.strftime(fmt))) + assert result.equals(expected) + + # Default format + tsa = pa.array(ts, type=pa.timestamp("s", timezone)) + result = pc.strftime(tsa, options=pc.StrftimeOptions()) + expected = pa.array(_fix_timestamp(ts.strftime("%Y-%m-%dT%H:%M:%SZ"))) + assert result.equals(expected) + + # Pandas %S is equivalent to %S in arrow for unit="s" + tsa = pa.array(ts, type=pa.timestamp("s", timezone)) + options = pc.StrftimeOptions("%S") + result = pc.strftime(tsa, options=options) + expected = pa.array(_fix_timestamp(ts.strftime("%S"))) + assert result.equals(expected) + + # Pandas %S.%f is equivalent to %S in arrow for unit="us" + tsa = pa.array(ts, type=pa.timestamp("us", timezone)) + options = 
pc.StrftimeOptions("%S") + result = pc.strftime(tsa, options=options) + expected = pa.array(_fix_timestamp(ts.strftime("%S.%f"))) + assert result.equals(expected) + + # Test setting locale + tsa = pa.array(ts, type=pa.timestamp("s", timezone)) + options = pc.StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C") + result = pc.strftime(tsa, options=options) + expected = pa.array(_fix_timestamp(ts.strftime("%Y-%m-%dT%H:%M:%SZ"))) + assert result.equals(expected) + + for unit in ["s", "ms", "us", "ns"]: + tsa = pa.array(ts, type=pa.timestamp(unit)) + for fmt in formats: + with pytest.raises(pa.ArrowInvalid, + match="Timestamps without a time zone " + "cannot be reliably formatted"): + pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) + + def _check_datetime_components(timestamps, timezone=None): from pyarrow.vendored.version import Version diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index d216ec82c1e..64bb42a0ecf 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -26,6 +26,7 @@ library(dplyr) # TODO: consider reevaluating this workaround after ARROW-12980 withr::local_timezone("UTC") +# TODO: We should test on windows once ARROW-13168 is resolved. if (tolower(Sys.info()[["sysname"]]) == "windows") { test_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "") } else {