From 55ca5a91be1538614314a9910eccc2bbe0d90932 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 5 Oct 2021 13:40:16 -0400 Subject: [PATCH 1/4] ARROW-14231: [C++] Support casting timestamp with timezone to string --- .../compute/kernels/scalar_cast_string.cc | 81 ++++++++++++++++++- .../arrow/compute/kernels/scalar_cast_test.cc | 56 +++++++++++++ .../compute/kernels/scalar_temporal_unary.cc | 39 ++------- .../arrow/compute/kernels/temporal_internal.h | 39 ++++++++- cpp/src/arrow/csv/writer_test.cc | 34 +++++--- 5 files changed, 204 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index eb2f9043955..2e0e11d32dc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -21,6 +21,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/compute/kernels/common.h" #include "arrow/compute/kernels/scalar_cast_internal.h" +#include "arrow/compute/kernels/temporal_internal.h" #include "arrow/result.h" #include "arrow/util/formatting.h" #include "arrow/util/int_util.h" @@ -105,6 +106,84 @@ struct TemporalToStringCastFunctor { } }; +template +struct TemporalToStringCastFunctor { + using value_type = typename TypeTraits::CType; + using BuilderType = typename TypeTraits::BuilderType; + using FormatterType = StringFormatter; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + DCHECK(out->is_array()); + const ArrayData& input = *batch[0].array(); + ArrayData* output = out->mutable_array(); + return Convert(ctx, input, output); + } + + static Status Convert(KernelContext* ctx, const ArrayData& input, ArrayData* output) { + const auto& timezone = GetInputTimezone(*input.type); + BuilderType builder(input.type, ctx->memory_pool()); + + if (timezone.empty() || timezone == "UTC") { + FormatterType formatter(input.type); + RETURN_NOT_OK(VisitArrayDataInline( + input, + [&](value_type v) { + return formatter(v, [&](util::string_view v) { return builder.Append(v); }); + }, + [&]() { return builder.AppendNull(); })); + } else { +#ifdef _WIN32 + // TODO(ARROW-13168): + return Status::NotImplemented( + "Casting a timestamp with time zone to string is not yet supported on " + "Windows."); +#else + switch (checked_cast(*input.type).unit()) { + case TimeUnit::SECOND: + RETURN_NOT_OK(ConvertZoned(input, timezone, &builder)); + break; + case TimeUnit::MILLI: + RETURN_NOT_OK( + ConvertZoned(input, timezone, &builder)); + break; + case TimeUnit::MICRO: + RETURN_NOT_OK( + ConvertZoned(input, timezone, &builder)); + break; + case TimeUnit::NANO: + RETURN_NOT_OK( + ConvertZoned(input, timezone, &builder)); + break; + default: + DCHECK(false); + return Status::NotImplemented("Unimplemented time unit"); + } +#endif + } + std::shared_ptr output_array; + RETURN_NOT_OK(builder.Finish(&output_array)); + *output = std::move(*output_array->data()); + return Status::OK(); + } + + template + static Status ConvertZoned(const ArrayData& input, const std::string& timezone, + BuilderType* builder) { + static std::string kFormatString = "%Y-%m-%d %H:%M:%S%z"; + ARROW_ASSIGN_OR_RAISE(const time_zone* tz, + LocateZone(timezone.empty() ? "UTC" : timezone)); + ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale("C")); + TimestampFormatter formatter{kFormatString, tz, locale}; + return VisitArrayDataInline( + input, + [&](value_type v) { + ARROW_ASSIGN_OR_RAISE(auto formatted, formatter(v)); + return builder->Append(std::move(formatted)); + }, + [&]() { return builder->AppendNull(); }); + } +}; + // ---------------------------------------------------------------------- // Binary-like to binary-like // @@ -304,7 +383,7 @@ void AddTemporalToStringCasts(CastFunction* func) { auto out_ty = TypeTraits::type_singleton(); for (const std::shared_ptr& in_ty : TemporalTypes()) { DCHECK_OK(func->AddKernel( - in_ty->id(), {in_ty}, out_ty, + in_ty->id(), {InputType(in_ty->id())}, out_ty, TrivialScalarUnaryAsArraysExec( GenerateTemporal(*in_ty)), NullHandling::COMPUTED_NO_PREALLOCATE)); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 5d516677669..afb64ddb043 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1553,8 +1553,64 @@ TEST(Cast, TimestampToString) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"), ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); + + CheckCast( + ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"), + ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); + + CheckCast(ArrayFromJSON(timestamp(TimeUnit::MILLI, "UTC"), + "[-30610224000000, -5364662400000]"), + ArrayFromJSON(string_type, + R"(["1000-01-01 00:00:00.000", "1800-01-01 00:00:00.000"])")); + } +} + +#ifndef _WIN32 +TEST(Cast, TimestampWithZoneToString) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast( + ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), + "[-34226955, 1456767743]"), + ArrayFromJSON(string_type, + R"(["1968-11-30 13:30:45-0700", "2016-02-29 10:42:23-0700"])")); + CheckCast(ArrayFromJSON(timestamp(TimeUnit::MILLI, "America/Phoenix"), + "[-34226955877, 1456767743456]"), + ArrayFromJSON( + string_type, + R"(["1968-11-30 13:30:44.123-0700", "2016-02-29 10:42:23.456-0700"])")); + CheckCast( + ArrayFromJSON(timestamp(TimeUnit::MICRO, "America/Phoenix"), + "[-34226955877000, 1456767743456789]"), + ArrayFromJSON( + string_type, + R"(["1968-11-30 13:30:44.123000-0700", "2016-02-29 10:42:23.456789-0700"])")); + CheckCast( + ArrayFromJSON(timestamp(TimeUnit::NANO, "America/Phoenix"), + "[-34226955876543211, 1456767743456789246]"), + ArrayFromJSON( + string_type, + R"(["1968-11-30 13:30:44.123456789-0700", "2016-02-29 10:42:23.456789246-0700"])")); } } +#else +// TODO(ARROW-13168): we lack tzdb on Windows +TEST(Cast, TimestampWithZoneToString) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCastFails(ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), + "[-34226955, 1456767743]"), + CastOptions::Safe(string_type)); + CheckCastFails(ArrayFromJSON(timestamp(TimeUnit::MILLI, "America/Phoenix"), + "[-34226955877, 1456767743456]"), + CastOptions::Safe(string_type)); + CheckCastFails(ArrayFromJSON(timestamp(TimeUnit::MICRO, "America/Phoenix"), + "[-34226955877000, 1456767743456789]"), + CastOptions::Safe(string_type)); + CheckCastFails(ArrayFromJSON(timestamp(TimeUnit::NANO, "America/Phoenix"), + "[-34226955876543211, 1456767743456789246]"), + CastOptions::Safe(string_type)); + } +} +#endif TEST(Cast, DateToDate) { auto day_32 = ArrayFromJSON(date32(), "[0, null, 100, 1, 10]"); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index d29ebca0ca8..d1c5855d2df 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -45,7 +45,6 @@ using arrow_vendored::date::local_time; using arrow_vendored::date::locate_zone; using arrow_vendored::date::sys_days; using arrow_vendored::date::sys_time; -using arrow_vendored::date::time_zone; using arrow_vendored::date::trunc; using arrow_vendored::date::weekday; using arrow_vendored::date::weeks; @@ -479,7 +478,7 @@ struct Strftime { if ((options.format.find("%c") != std::string::npos) && (options.locale != "C")) { return Status::Invalid("%c flag is not supported in non-C locales."); } - auto timezone = GetInputTimezone(type); + const auto& timezone = GetInputTimezone(type); if (timezone.empty()) { if ((options.format.find("%z") != std::string::npos) || @@ -488,10 +487,10 @@ struct Strftime { "Timezone not present, cannot convert to string with timezone: ", options.format); } - timezone = "UTC"; } - ARROW_ASSIGN_OR_RAISE(const time_zone* tz, LocateZone(timezone)); + ARROW_ASSIGN_OR_RAISE(const time_zone* tz, + LocateZone(timezone.empty() ? "UTC" : timezone)); ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale(options.locale)); @@ -500,7 +499,7 @@ struct Strftime { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type)); - TimestampFormatter formatter{self.options.format, self.tz, self.locale}; + TimestampFormatter formatter{self.options.format, self.tz, self.locale}; if (in.is_valid) { const int64_t in_val = internal::UnboxScalar::Unbox(in); @@ -514,7 +513,7 @@ struct Strftime { static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type)); - TimestampFormatter formatter{self.options.format, self.tz, self.locale}; + TimestampFormatter formatter{self.options.format, self.tz, self.locale}; StringBuilder string_builder; // Presize string data using a heuristic @@ -539,35 +538,9 @@ struct Strftime { return Status::OK(); } - - struct TimestampFormatter { - const char* format; - const time_zone* tz; - std::ostringstream bufstream; - - explicit TimestampFormatter(const std::string& format, const time_zone* tz, - const std::locale& locale) - : format(format.c_str()), tz(tz) { - bufstream.imbue(locale); - // Propagate errors as C++ exceptions (to get an actual error message) - bufstream.exceptions(std::ios::failbit | std::ios::badbit); - } - - Result operator()(int64_t arg) { - bufstream.str(""); - const auto zt = zoned_time{tz, sys_time(Duration{arg})}; - try { - arrow_vendored::date::to_stream(bufstream, format, zt); - } catch (const std::runtime_error& ex) { - bufstream.clear(); - return Status::Invalid("Failed formatting timestamp: ", ex.what()); - } - // XXX could return a view with std::ostringstream::view() (C++20) - return std::move(bufstream).str(); - } - }; }; #else +// TODO(ARROW-13168) template struct Strftime { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { diff --git a/cpp/src/arrow/compute/kernels/temporal_internal.h b/cpp/src/arrow/compute/kernels/temporal_internal.h index 45fa67a9b9b..3d2d9c5b9bd 100644 --- a/cpp/src/arrow/compute/kernels/temporal_internal.h +++ b/cpp/src/arrow/compute/kernels/temporal_internal.h @@ -37,6 +37,7 @@ using arrow_vendored::date::sys_days; using arrow_vendored::date::sys_time; using arrow_vendored::date::time_zone; using arrow_vendored::date::year_month_day; +using arrow_vendored::date::zoned_time; inline int64_t GetQuarter(const year_month_day& ymd) { return static_cast((static_cast(ymd.month()) - 1) / 3); @@ -72,7 +73,7 @@ static inline const std::string& GetInputTimezone(const ArrayData& array) { return checked_cast(*array.type).timezone(); } -inline Status ValidateDayOfWeekOptions(const DayOfWeekOptions& options) { +static inline Status ValidateDayOfWeekOptions(const DayOfWeekOptions& options) { if (options.week_start < 1 || 7 < options.week_start) { return Status::Invalid( "week_start must follow ISO convention (Monday=1, Sunday=7). Got week_start=", @@ -81,6 +82,14 @@ inline Status ValidateDayOfWeekOptions(const DayOfWeekOptions& options) { return Status::OK(); } +static inline Result GetLocale(const std::string& locale) { + try { + return std::locale(locale.c_str()); + } catch (const std::runtime_error& ex) { + return Status::Invalid("Cannot find locale '", locale, "': ", ex.what()); + } +} + struct NonZonedLocalizer { using days_t = sys_days; @@ -107,6 +116,34 @@ struct ZonedLocalizer { local_days ConvertDays(sys_days d) const { return local_days(year_month_day(d)); } }; +template +struct TimestampFormatter { + const char* format; + const time_zone* tz; + std::ostringstream bufstream; + + explicit TimestampFormatter(const std::string& format, const time_zone* tz, + const std::locale& locale) + : format(format.c_str()), tz(tz) { + bufstream.imbue(locale); + // Propagate errors as C++ exceptions (to get an actual error message) + bufstream.exceptions(std::ios::failbit | std::ios::badbit); + } + + Result operator()(int64_t arg) { + bufstream.str(""); + const auto zt = zoned_time{tz, sys_time(Duration{arg})}; + try { + arrow_vendored::date::to_stream(bufstream, format, zt); + } catch (const std::runtime_error& ex) { + bufstream.clear(); + return Status::Invalid("Failed formatting timestamp: ", ex.what()); + } + // XXX could return a view with std::ostringstream::view() (C++20) + return std::move(bufstream).str(); + } +}; + // // Which types to generate a kernel for // diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc index 57b42c7f5a7..4fff8eac92f 100644 --- a/cpp/src/arrow/csv/writer_test.cc +++ b/cpp/src/arrow/csv/writer_test.cc @@ -59,6 +59,7 @@ std::vector GenerateTestCases() { {field("c ", int32())}, {field("d", date32())}, {field("e", date64())}, + {field("f", timestamp(TimeUnit::SECOND))}, }); auto populated_batch = R"([{"a": 1, "c ": -1}, { "a": 1, "b\"": "abc\"efg", "c ": 2324}, @@ -67,16 +68,18 @@ std::vector GenerateTestCases() { { "a": 546, "b\"": "", "c ": 517 }, { "a": 124, "b\"": "a\"\"b\"" }, { "d": 0 }, - { "e": 86400000 }])"; - std::string expected_without_header = std::string("1,,-1,,") + "\n" + // line 1 - R"(1,"abc""efg",2324,,)" + "\n" + // line 2 - R"(,"abcd",5467,,)" + "\n" + // line 3 - R"(,,,,)" + "\n" + // line 4 - R"(546,"",517,,)" + "\n" + // line 5 - R"(124,"a""""b""",,,)" + "\n" + // line 6 - R"(,,,1970-01-01,)" + "\n" + // line 7 - R"(,,,,1970-01-02)" + "\n"; // line 8 - std::string expected_header = std::string(R"("a","b""","c ","d","e")") + "\n"; + { "e": 86400000 }, + { "f": 1078016523 }])"; + std::string expected_without_header = std::string("1,,-1,,,") + "\n" + // line 1 + R"(1,"abc""efg",2324,,,)" + "\n" + // line 2 + R"(,"abcd",5467,,,)" + "\n" + // line 3 + R"(,,,,,)" + "\n" + // line 4 + R"(546,"",517,,,)" + "\n" + // line 5 + R"(124,"a""""b""",,,,)" + "\n" + // line 6 + R"(,,,1970-01-01,,)" + "\n" + // line 7 + R"(,,,,1970-01-02,)" + "\n" + // line 8 + R"(,,,,,2004-02-29 01:02:03)" + "\n"; // line 9 + std::string expected_header = std::string(R"("a","b""","c ","d","e","f")") + "\n"; return std::vector{ {abc_schema, "[]", DefaultTestOptions(/*header=*/false), ""}, @@ -155,5 +158,16 @@ INSTANTIATE_TEST_SUITE_P(SingleColumnWriteCSVTest, TestWriteCSV, R"("int64")" "\n9999\n\n-15\n"})); +#ifndef _WIN32 +// TODO(ARROW-13168): +INSTANTIATE_TEST_SUITE_P( + TimestampWithTimezoneWriteCSVTest, TestWriteCSV, + ::testing::Values(WriterTestParams{ + schema({field("tz", timestamp(TimeUnit::SECOND, "America/Phoenix"))}), + R"([{ "tz": 1456767743 }])", WriteOptions(), + R"("tz")" + "\n2016-02-29 10:42:23-0700\n"})); +#endif + } // namespace csv } // namespace arrow From d5fe9f22f80bab6b52b3366db782394950c1dd7f Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 5 Oct 2021 14:21:38 -0400 Subject: [PATCH 2/4] ARROW-14231: [C++] Fix Windows tests --- .../arrow/compute/kernels/scalar_cast_test.cc | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index afb64ddb043..fc8a1fada3b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1596,18 +1596,10 @@ TEST(Cast, TimestampWithZoneToString) { // TODO(ARROW-13168): we lack tzdb on Windows TEST(Cast, TimestampWithZoneToString) { for (auto string_type : {utf8(), large_utf8()}) { - CheckCastFails(ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), - "[-34226955, 1456767743]"), - CastOptions::Safe(string_type)); - CheckCastFails(ArrayFromJSON(timestamp(TimeUnit::MILLI, "America/Phoenix"), - "[-34226955877, 1456767743456]"), - CastOptions::Safe(string_type)); - CheckCastFails(ArrayFromJSON(timestamp(TimeUnit::MICRO, "America/Phoenix"), - "[-34226955877000, 1456767743456789]"), - CastOptions::Safe(string_type)); - CheckCastFails(ArrayFromJSON(timestamp(TimeUnit::NANO, "America/Phoenix"), - "[-34226955876543211, 1456767743456789246]"), - CastOptions::Safe(string_type)); + ASSERT_RAISES(NotImplemented, + Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), + "[-34226955, 1456767743]"), + CastOptions::Safe(string_type))); } } #endif From 20ad4d322770cd9af697bad1ea309235b48f854a Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 6 Oct 2021 09:20:36 -0400 Subject: [PATCH 3/4] ARROW-14231: [C++] Ensure UTC is not serialized to naive timestamp --- .../compute/kernels/scalar_cast_string.cc | 29 +++++++++++++--- .../arrow/compute/kernels/scalar_cast_test.cc | 33 +++++++++++++++---- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 2e0e11d32dc..4eff4761347 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -121,16 +121,34 @@ struct TemporalToStringCastFunctor { static Status Convert(KernelContext* ctx, const ArrayData& input, ArrayData* output) { const auto& timezone = GetInputTimezone(*input.type); + const auto& ty = checked_cast(*input.type); BuilderType builder(input.type, ctx->memory_pool()); - if (timezone.empty() || timezone == "UTC") { + // Preallocate + int64_t string_length = 19; // YYYY-MM-DD HH:MM:SS + if (ty.unit() == TimeUnit::MILLI) { + string_length += 4; // .SSS + } else if (ty.unit() == TimeUnit::MICRO) { + string_length += 7; // .SSSSSS + } else if (ty.unit() == TimeUnit::NANO) { + string_length += 10; // .SSSSSSSSS + } + if (!timezone.empty()) string_length += 5; // +0000 + RETURN_NOT_OK(builder.Reserve(input.length)); + RETURN_NOT_OK( + builder.ReserveData((input.length - input.GetNullCount()) * string_length)); + + if (timezone.empty()) { FormatterType formatter(input.type); RETURN_NOT_OK(VisitArrayDataInline( input, [&](value_type v) { return formatter(v, [&](util::string_view v) { return builder.Append(v); }); }, - [&]() { return builder.AppendNull(); })); + [&]() { + builder.UnsafeAppendNull(); + return Status::OK(); + })); } else { #ifdef _WIN32 // TODO(ARROW-13168): @@ -138,7 +156,7 @@ struct TemporalToStringCastFunctor { "Casting a timestamp with time zone to string is not yet supported on " "Windows."); #else - switch (checked_cast(*input.type).unit()) { + switch (ty.unit()) { case TimeUnit::SECOND: RETURN_NOT_OK(ConvertZoned(input, timezone, &builder)); break; @@ -180,7 +198,10 @@ struct TemporalToStringCastFunctor { ARROW_ASSIGN_OR_RAISE(auto formatted, formatter(v)); return builder->Append(std::move(formatted)); }, - [&]() { return builder->AppendNull(); }); + [&]() { + builder->UnsafeAppendNull(); + return Status::OK(); + }); } }; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index fc8a1fada3b..954f0166b21 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1555,35 +1555,52 @@ TEST(Cast, TimestampToString) { ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); CheckCast( - ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"), - ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); + ArrayFromJSON(timestamp(TimeUnit::MILLI), "[-30610224000000, -5364662400000]"), + ArrayFromJSON(string_type, + R"(["1000-01-01 00:00:00.000", "1800-01-01 00:00:00.000"])")); - CheckCast(ArrayFromJSON(timestamp(TimeUnit::MILLI, "UTC"), - "[-30610224000000, -5364662400000]"), - ArrayFromJSON(string_type, - R"(["1000-01-01 00:00:00.000", "1800-01-01 00:00:00.000"])")); + CheckCast( + ArrayFromJSON(timestamp(TimeUnit::MICRO), + "[-30610224000000000, -5364662400000000]"), + ArrayFromJSON(string_type, + R"(["1000-01-01 00:00:00.000000", "1800-01-01 00:00:00.000000"])")); + + CheckCast( + ArrayFromJSON(timestamp(TimeUnit::NANO), + "[-596933876543210988, 349837323456789012]"), + ArrayFromJSON( + string_type, + R"(["1951-02-01 01:02:03.456789012", "1981-02-01 01:02:03.456789012"])")); } } #ifndef _WIN32 TEST(Cast, TimestampWithZoneToString) { for (auto string_type : {utf8(), large_utf8()}) { + CheckCast( + ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"), + ArrayFromJSON(string_type, + R"(["1000-01-01 00:00:00+0000", "1800-01-01 00:00:00+0000"])")); + CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), "[-34226955, 1456767743]"), ArrayFromJSON(string_type, R"(["1968-11-30 13:30:45-0700", "2016-02-29 10:42:23-0700"])")); + CheckCast(ArrayFromJSON(timestamp(TimeUnit::MILLI, "America/Phoenix"), "[-34226955877, 1456767743456]"), ArrayFromJSON( string_type, R"(["1968-11-30 13:30:44.123-0700", "2016-02-29 10:42:23.456-0700"])")); + CheckCast( ArrayFromJSON(timestamp(TimeUnit::MICRO, "America/Phoenix"), "[-34226955877000, 1456767743456789]"), ArrayFromJSON( string_type, R"(["1968-11-30 13:30:44.123000-0700", "2016-02-29 10:42:23.456789-0700"])")); + CheckCast( ArrayFromJSON(timestamp(TimeUnit::NANO, "America/Phoenix"), "[-34226955876543211, 1456767743456789246]"), @@ -1596,6 +1613,10 @@ TEST(Cast, TimestampWithZoneToString) { // TODO(ARROW-13168): we lack tzdb on Windows TEST(Cast, TimestampWithZoneToString) { for (auto string_type : {utf8(), large_utf8()}) { + ASSERT_RAISES(NotImplemented, Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), + "[-34226955, 1456767743]"), + CastOptions::Safe(string_type))); + ASSERT_RAISES(NotImplemented, Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), "[-34226955, 1456767743]"), From 6ba13cf74d6778fde981d1f96eb0c6424e59bdc5 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 8 Nov 2021 11:37:38 -0500 Subject: [PATCH 4/4] ARROW-14231: [C++] Format UTC with trailing Z --- cpp/src/arrow/compute/kernels/scalar_cast_string.cc | 10 ++++++---- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 2 +- cpp/src/arrow/csv/writer_test.cc | 11 +++++++---- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 4eff4761347..4130c6a9487 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -187,11 +187,13 @@ struct TemporalToStringCastFunctor { template static Status ConvertZoned(const ArrayData& input, const std::string& timezone, BuilderType* builder) { - static std::string kFormatString = "%Y-%m-%d %H:%M:%S%z"; - ARROW_ASSIGN_OR_RAISE(const time_zone* tz, - LocateZone(timezone.empty() ? "UTC" : timezone)); + static const std::string kFormatString = "%Y-%m-%d %H:%M:%S%z"; + static const std::string kUtcFormatString = "%Y-%m-%d %H:%M:%SZ"; + DCHECK(!timezone.empty()); + ARROW_ASSIGN_OR_RAISE(const time_zone* tz, LocateZone(timezone)); ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale("C")); - TimestampFormatter formatter{kFormatString, tz, locale}; + TimestampFormatter formatter{ + timezone == "UTC" ? kUtcFormatString : kFormatString, tz, locale}; return VisitArrayDataInline( input, [&](value_type v) { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 954f0166b21..92de7892f95 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1580,7 +1580,7 @@ TEST(Cast, TimestampWithZoneToString) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"), ArrayFromJSON(string_type, - R"(["1000-01-01 00:00:00+0000", "1800-01-01 00:00:00+0000"])")); + R"(["1000-01-01 00:00:00Z", "1800-01-01 00:00:00Z"])")); CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc index 4fff8eac92f..5d575887fdf 100644 --- a/cpp/src/arrow/csv/writer_test.cc +++ b/cpp/src/arrow/csv/writer_test.cc @@ -163,10 +163,13 @@ INSTANTIATE_TEST_SUITE_P(SingleColumnWriteCSVTest, TestWriteCSV, INSTANTIATE_TEST_SUITE_P( TimestampWithTimezoneWriteCSVTest, TestWriteCSV, ::testing::Values(WriterTestParams{ - schema({field("tz", timestamp(TimeUnit::SECOND, "America/Phoenix"))}), - R"([{ "tz": 1456767743 }])", WriteOptions(), - R"("tz")" - "\n2016-02-29 10:42:23-0700\n"})); + schema({ + field("tz", timestamp(TimeUnit::SECOND, "America/Phoenix")), + field("utc", timestamp(TimeUnit::SECOND, "UTC")), + }), + R"([{ "tz": 1456767743, "utc": 1456767743 }])", WriteOptions(), + R"("tz","utc")" + "\n2016-02-29 10:42:23-0700,2016-02-29 17:42:23Z\n"})); #endif } // namespace csv