diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc index 5f16f1e9db4..74274e963a1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc @@ -29,7 +29,7 @@ namespace arrow { -using internal::ParseValue; +using internal::ParseTimestampISO8601; namespace compute { namespace internal { @@ -422,17 +422,35 @@ struct CastFunctor { // String to Timestamp struct ParseTimestamp { + explicit ParseTimestamp(const TimestampType& type) + : type(type), expect_timezone(!type.timezone().empty()) {} template OutValue Call(KernelContext*, Arg0Value val, Status* st) const { OutValue result = 0; - if (ARROW_PREDICT_FALSE(!ParseValue(type, val.data(), val.size(), &result))) { + bool zone_offset_present = false; + if (ARROW_PREDICT_FALSE(!ParseTimestampISO8601(val.data(), val.size(), type.unit(), + &result, &zone_offset_present))) { *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ", type.ToString()); + return result;  // keep the parse error; the zone check is only for parsed values } + if (zone_offset_present != expect_timezone) { + if (expect_timezone) { + *st = Status::Invalid( + "Failed to parse string: '", val, "' as a scalar of type ", type.ToString(), ": expected a zone offset. If these timestamps " + "are in local time, cast to timestamp without timezone, then " + "call assume_timezone."); + } else { + *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ", + type.ToString(), ": expected no zone offset"); + } + } return result; } const TimestampType& type; + bool expect_timezone; }; template diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 92de7892f95..b5cafead6b2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1945,7 +1945,37 @@ TEST(Cast, StringToTimestamp) { } } - // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc + auto zoned = ArrayFromJSON(string_type, + R"(["2020-02-29T00:00:00Z", "2020-03-02T10:11:12+0102"])"); + auto mixed = ArrayFromJSON(string_type, + R"(["2020-03-02T10:11:12+0102", "2020-02-29T00:00:00"])"); + + // Timestamp with zone offset should not parse as naive + CheckCastFails(zoned, CastOptions::Safe(timestamp(TimeUnit::SECOND))); + + // Mixed zoned/unzoned should not parse as naive + CheckCastFails(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("expected no zone offset"), + Cast(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND)))); + + // ...or as timestamp with timezone + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("expected a zone offset"), + Cast(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND, "UTC")))); + + // Unzoned should not parse as timestamp with timezone + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("expected a zone offset"), + Cast(strings, CastOptions::Safe(timestamp(TimeUnit::SECOND, "UTC")))); + + // Timestamp with zone offset can parse as any time zone (since they're unambiguous) + CheckCast(zoned, ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), + "[1582934400, 1583140152]")); +
CheckCast(zoned, ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), + "[1582934400, 1583140152]")); + + // NOTE: timestamp parsing is tested comprehensively in value_parsing_test.cc } } diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 7eeb80b013b..b567f4f351b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -3625,11 +3625,24 @@ struct StrptimeExec { Result ResolveStrptimeOutput(KernelContext* ctx, const std::vector&) { - if (ctx->state()) { - return ::arrow::timestamp(StrptimeState::Get(ctx).unit); + if (!ctx->state()) { + return Status::Invalid("strptime does not provide default StrptimeOptions"); + } + const StrptimeOptions& options = StrptimeState::Get(ctx); + // Check for use of %z; "cur + 1 <" keeps an empty format from wrapping size() - 1 + size_t cur = 0; + std::string zone = ""; + while (cur + 1 < options.format.size()) { + if (options.format[cur] == '%') { + if (options.format[cur + 1] == 'z') { + zone = "UTC"; + break; + } + cur++; + } + cur++; } - - return Status::Invalid("strptime does not provide default StrptimeOptions"); + return ::arrow::timestamp(options.unit, zone); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 4551e8c61e5..18ca794f669 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -32,6 +32,7 @@ #include "arrow/compute/kernels/test_util.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" +#include "arrow/util/value_parsing.h" namespace arrow { namespace compute { @@ -1757,6 +1758,24 @@ TYPED_TEST(TestStringKernels, Strptime) { std::string output1 = R"(["2020-05-01", null, "1900-12-11"])"; StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO); this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1,
&options); + + input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])"; + options.format = "%m/%d/%Y %%z"; + this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options); +} + +TYPED_TEST(TestStringKernels, StrptimeZoneOffset) { + if (!arrow::internal::kStrptimeSupportsZone) { + GTEST_SKIP() << "strptime does not support %z on this platform"; + } + // N.B. BSD strptime only supports (+/-)HHMM and not the wider range + // of values GNU strptime supports. + std::string input1 = R"(["5/1/2020 +0100", null, "12/11/1900 -0130"])"; + std::string output1 = + R"(["2020-04-30T23:00:00.000000", null, "1900-12-11T01:30:00.000000"])"; + StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO); + this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO, "UTC"), output1, + &options); } TYPED_TEST(TestStringKernels, StrptimeDoesNotProvideDefaultOptions) { diff --git a/cpp/src/arrow/csv/column_builder_test.cc b/cpp/src/arrow/csv/column_builder_test.cc index 7577c883e8c..53e69ada62f 100644 --- a/cpp/src/arrow/csv/column_builder_test.cc +++ b/cpp/src/arrow/csv/column_builder_test.cc @@ -33,6 +33,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/task_group.h" #include "arrow/util/thread_pool.h" +#include "arrow/util/value_parsing.h" namespace arrow { namespace csv { @@ -427,6 +428,13 @@ TEST_F(InferringColumnBuilderTest, SingleChunkTimestamp) { {{false, true, true}}, {{0, 0, 1542129070}}, &expected); CheckInferred(tg, {{"", "1970-01-01", "2018-11-13 17:11:10"}}, options, expected); + + options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%Y/%m/%d")); + tg = TaskGroup::MakeSerial(); + ChunkedArrayFromVector(timestamp(TimeUnit::SECOND), + {{false, true, true}}, {{0, 0, 1542067200}}, + &expected); + CheckInferred(tg, {{"", "1970/01/01", "2018/11/13"}}, options, expected); } TEST_F(InferringColumnBuilderTest, MultipleChunkTimestamp) { @@ -438,6 +446,13 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkTimestamp) { {{false}, {true}, 
{true}}, {{0}, {0}, {1542129070}}, &expected); CheckInferred(tg, {{""}, {"1970-01-01"}, {"2018-11-13 17:11:10"}}, options, expected); + + options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%Y/%m/%d")); + tg = TaskGroup::MakeSerial(); + ChunkedArrayFromVector(timestamp(TimeUnit::SECOND), + {{false}, {true}, {true}}, + {{0}, {0}, {1542067200}}, &expected); + CheckInferred(tg, {{""}, {"1970/01/01"}, {"2018/11/13"}}, options, expected); } TEST_F(InferringColumnBuilderTest, SingleChunkTimestampNS) { @@ -471,6 +486,76 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampNS) { options, expected); } +TEST_F(InferringColumnBuilderTest, SingleChunkTimestampWithZone) { + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + + std::shared_ptr expected; + ChunkedArrayFromVector(timestamp(TimeUnit::SECOND, "UTC"), + {{false, true, true}}, {{0, 0, 1542129010}}, + &expected); + CheckInferred(tg, {{"", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10+0001"}}, options, + expected); + + tg = TaskGroup::MakeSerial(); + expected = ChunkedArrayFromJSON( + utf8(), {R"(["", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10"])"}); + CheckInferred(tg, {{"", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10"}}, options, + expected); +} + +TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampWithZone) { + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + + std::shared_ptr expected; + ChunkedArrayFromVector(timestamp(TimeUnit::SECOND, "UTC"), + {{false}, {true}, {true}}, + {{0}, {0}, {1542129010}}, &expected); + CheckInferred(tg, {{""}, {"1970-01-01T00:00:00Z"}, {"2018-11-13 17:11:10+0001"}}, + options, expected); + + tg = TaskGroup::MakeSerial(); + expected = ChunkedArrayFromJSON( + utf8(), {R"([""])", R"(["1970-01-01T00:00:00Z"])", R"(["2018-11-13 17:11:10"])"}); + CheckInferred(tg, {{""}, {"1970-01-01T00:00:00Z"}, {"2018-11-13 17:11:10"}}, options, + expected); +} + +TEST_F(InferringColumnBuilderTest, 
SingleChunkTimestampWithZoneNS) { + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + + std::shared_ptr expected; + ChunkedArrayFromVector( + timestamp(TimeUnit::NANO, "UTC"), {{false, true, true, true, true}}, + {{0, 3660000000000, 1542129070123000000, 1542129070123456000, 1542129070123456789}}, + &expected); + CheckInferred(tg, + {{"", "1970-01-01T00:00:00-0101", "2018-11-13 17:11:10.123Z", + "2018-11-13 17:11:10.123456Z", "2018-11-13 17:11:10.123456789Z"}}, + options, expected); +} + +TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampWithZoneNS) { + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + + std::shared_ptr expected; + ChunkedArrayFromVector( + timestamp(TimeUnit::NANO, "UTC"), {{false}, {true}, {true, true, true}}, + {{0}, + {3660000000000}, + {1542129070123000000, 1542129070123456000, 1542129070123456789}}, + &expected); + CheckInferred(tg, + {{""}, + {"1970-01-01T00:00:00-0101"}, + {"2018-11-13 17:11:10.123Z", "2018-11-13 17:11:10.123456Z", + "2018-11-13 17:11:10.123456789Z"}}, + options, expected); +} + TEST_F(InferringColumnBuilderTest, SingleChunkIntegerAndTime) { // Fallback to utf-8 auto options = ConvertOptions::Defaults(); diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 66d05458097..4d00cbec4eb 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -350,18 +350,37 @@ struct InlineISO8601ValueDecoder : public ValueDecoder { explicit InlineISO8601ValueDecoder(const std::shared_ptr& type, const ConvertOptions& options) : ValueDecoder(type, options), - unit_(checked_cast(*type_).unit()) {} + unit_(checked_cast(*type_).unit()), + expect_timezone_(!checked_cast(*type_).timezone().empty()) { + } Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { - if (ARROW_PREDICT_FALSE(!internal::ParseTimestampISO8601( - reinterpret_cast(data), size, unit_, out))) { + bool zone_offset_present = false; 
+ if (ARROW_PREDICT_FALSE( + !internal::ParseTimestampISO8601(reinterpret_cast(data), size, + unit_, out, &zone_offset_present))) { return GenericConversionError(type_, data, size); } + if (zone_offset_present != expect_timezone_) { + if (expect_timezone_) { + return Status::Invalid("CSV conversion error to ", type_->ToString(), + ": expected a zone offset in '", + std::string(reinterpret_cast(data), size), + "'. If these timestamps are in local time, parse them as " + "timestamps without timezone, then call assume_timezone."); + } else { + return Status::Invalid("CSV conversion error to ", type_->ToString(), + ": expected no zone offset in '", + std::string(reinterpret_cast(data), size), + "'"); + } + } return Status::OK(); } protected: TimeUnit::type unit_; + bool expect_timezone_; }; struct SingleParserTimestampValueDecoder : public ValueDecoder { @@ -371,18 +390,36 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder { const ConvertOptions& options) : ValueDecoder(type, options), unit_(checked_cast(*type_).unit()), + expect_timezone_(!checked_cast(*type_).timezone().empty()), parser_(*options_.timestamp_parsers[0]) {} Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { - if (ARROW_PREDICT_FALSE( - !parser_(reinterpret_cast(data), size, unit_, out))) { + bool zone_offset_present = false; + if (ARROW_PREDICT_FALSE(!parser_(reinterpret_cast(data), size, unit_, + out, &zone_offset_present))) { return GenericConversionError(type_, data, size); } + if (zone_offset_present != expect_timezone_) { + if (expect_timezone_) { + return Status::Invalid("CSV conversion error to ", type_->ToString(), + ": expected a zone offset in '", + std::string(reinterpret_cast(data), size), + "'. If these timestamps are in local time, parse them as " + "timestamps without timezone, then call assume_timezone. 
" + "If using strptime, ensure '%z' is in the format string."); + } else { + return Status::Invalid("CSV conversion error to ", type_->ToString(), + ": expected no zone offset in '", + std::string(reinterpret_cast(data), size), + "'"); + } + } return Status::OK(); } protected: TimeUnit::type unit_; + bool expect_timezone_; const TimestampParser& parser_; }; @@ -393,11 +430,15 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder { const ConvertOptions& options) : ValueDecoder(type, options), unit_(checked_cast(*type_).unit()), + expect_timezone_(!checked_cast(*type_).timezone().empty()), parsers_(GetParsers(options_)) {} Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { + bool zone_offset_present = false; for (const auto& parser : parsers_) { - if (parser->operator()(reinterpret_cast(data), size, unit_, out)) { + if (parser->operator()(reinterpret_cast(data), size, unit_, out, + &zone_offset_present) && + zone_offset_present == expect_timezone_) { return Status::OK(); } } @@ -416,6 +457,7 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder { } TimeUnit::type unit_; + bool expect_timezone_; std::vector parsers_; }; diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index 9a68b956803..9a83ef020de 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -548,12 +548,43 @@ TEST(TimestampConversion, Basics) { {"2018-11-13 17:11:10\n1900-02-28 12:34:56\n"}, {{1542129070, -2203932304LL}}); + // Zone offsets are not accepted + AssertConversionError(type, + {"1970-01-01T00Z\n2000-02-29T00-0200\n" + "3989-07-14T00+03:14\n1900-02-28 00-04:59\n"}, + {0}); + type = timestamp(TimeUnit::NANO); AssertConversion( type, {"1970-01-01\n2000-02-29\n1900-02-28\n"}, {{0, 951782400000000000LL, -2203977600000000000LL}}); } +TEST(TimestampConversion, WithZoneOffset) { + auto type = timestamp(TimeUnit::SECOND, "UTC"); + + AssertConversion( + type, + 
{"1970-01-01T00Z\n2000-02-29T00-0200\n" + "3989-07-14T00+03:14\n1900-02-28 00-04:59\n"}, + {{0, 951782400 + 7200, 63730281600LL - 11640, -2203977600LL + 17940}}); + + type = timestamp(TimeUnit::NANO, "UTC"); + AssertConversion( + type, + {"1970-01-01T00Z\n" + "2000-02-29T00:00:00.123456789+0117\n" + "1900-02-28 00:00:00.123456789-01:00\n"}, + {{0, 951782400000000000LL + 123456789LL - 4620000000000LL, + -2203977600000000000LL + 123456789LL + 3600000000000LL}}); + + // Local times are not accepted + AssertConversionError(type, + {"1970-01-01T00\n2000-02-29T00\n" + "3989-07-14T00\n1900-02-28 00\n"}, + {0}); +} + TEST(TimestampConversion, Nulls) { auto type = timestamp(TimeUnit::MILLI); AssertConversion( @@ -592,6 +623,48 @@ TEST(TimestampConversion, UserDefinedParsers) { {{86400000}, {172800000}}, options); } +#ifndef _WIN32 +TEST(TimestampConversion, UserDefinedParsersWithZone) { + auto options = ConvertOptions::Defaults(); + auto type = timestamp(TimeUnit::SECOND, "America/Phoenix"); + + // Test a single parser + options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %z")}; + AssertConversion(type, {"01/02/1970 +0000,01/03/1970 +0000\n"}, + {{86400}, {172800}}, options); + + // Test multiple parsers + options.timestamp_parsers.push_back(TimestampParser::MakeISO8601()); + AssertConversion( + type, {"01/02/1970 +0000,1970-01-03T00:00:00+0000\n"}, {{86400}, {172800}}, + options); + + // Test errors + options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y")}; + AssertConversionError(type, {"01/02/1970,01/03/1970\n"}, {0, 1}, options); + options.timestamp_parsers.push_back(TimestampParser::MakeISO8601()); + AssertConversionError(type, {"01/02/1970,1970-01-03T00:00:00+0000\n"}, {0}, options); +} +#else +// Windows uses the vendored musl strptime which doesn't support %z. 
+TEST(TimestampConversion, UserDefinedParsersWithZone) { + auto options = ConvertOptions::Defaults(); + auto type = timestamp(TimeUnit::SECOND, "America/Phoenix"); + + options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %z")}; + AssertConversionError(type, {"01/02/1970 +0000,01/03/1970 +0000\n"}, {0, 1}, options); + + options.timestamp_parsers.push_back(TimestampParser::MakeISO8601()); + AssertConversionError(type, {"01/02/1970 +0000,1970-01-03T00:00:00+0000\n"}, {0}, + options); + + options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y")}; + AssertConversionError(type, {"01/02/1970,01/03/1970\n"}, {0, 1}, options); + options.timestamp_parsers.push_back(TimestampParser::MakeISO8601()); + AssertConversionError(type, {"01/02/1970,1970-01-03T00:00:00+0000\n"}, {0}, options); +} +#endif + Decimal128 Dec128(util::string_view value) { Decimal128 dec; int32_t scale = 0; diff --git a/cpp/src/arrow/csv/inference_internal.h b/cpp/src/arrow/csv/inference_internal.h index 1fd6d41b5cc..ede9bea319e 100644 --- a/cpp/src/arrow/csv/inference_internal.h +++ b/cpp/src/arrow/csv/inference_internal.h @@ -35,6 +35,8 @@ enum class InferKind { Time, Timestamp, TimestampNS, + TimestampWithZone, + TimestampWithZoneNS, TextDict, BinaryDict, Text, @@ -67,6 +69,10 @@ class InferStatus { case InferKind::Timestamp: return SetKind(InferKind::TimestampNS); case InferKind::TimestampNS: + return SetKind(InferKind::TimestampWithZone); + case InferKind::TimestampWithZone: + return SetKind(InferKind::TimestampWithZoneNS); + case InferKind::TimestampWithZoneNS: return SetKind(InferKind::Real); case InferKind::Real: if (options_.auto_dict_encode) { @@ -123,6 +129,10 @@ class InferStatus { return make_converter(timestamp(TimeUnit::SECOND)); case InferKind::TimestampNS: return make_converter(timestamp(TimeUnit::NANO)); + case InferKind::TimestampWithZone: + return make_converter(timestamp(TimeUnit::SECOND, "UTC")); + case InferKind::TimestampWithZoneNS: + return 
make_converter(timestamp(TimeUnit::NANO, "UTC")); case InferKind::Real: return make_converter(float64()); case InferKind::Text: diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc index adc333ecfcc..ccd7674aa7a 100644 --- a/cpp/src/arrow/util/value_parsing.cc +++ b/cpp/src/arrow/util/value_parsing.cc @@ -42,10 +42,27 @@ namespace { class StrptimeTimestampParser : public TimestampParser { public: - explicit StrptimeTimestampParser(std::string format) : format_(std::move(format)) {} + explicit StrptimeTimestampParser(std::string format) + : format_(std::move(format)), have_zone_offset_(false) { + // Check for use of %z + size_t cur = 0; + while (cur < format_.size()) { + if (format_[cur] == '%') { + if (cur + 1 < format_.size() && format_[cur + 1] == 'z') { + have_zone_offset_ = true; + break; + } + cur++; + } + cur++; + } + } - bool operator()(const char* s, size_t length, TimeUnit::type out_unit, - int64_t* out) const override { + bool operator()(const char* s, size_t length, TimeUnit::type out_unit, int64_t* out, + bool* out_zone_offset_present = NULLPTR) const override { + if (out_zone_offset_present) { + *out_zone_offset_present = have_zone_offset_; + } return ParseTimestampStrptime(s, length, format_.c_str(), /*ignore_time_in_day=*/false, /*allow_trailing_chars=*/false, out_unit, out); @@ -57,15 +74,16 @@ class StrptimeTimestampParser : public TimestampParser { private: std::string format_; + bool have_zone_offset_; }; class ISO8601Parser : public TimestampParser { public: ISO8601Parser() {} - bool operator()(const char* s, size_t length, TimeUnit::type out_unit, - int64_t* out) const override { - return ParseTimestampISO8601(s, length, out_unit, out); + bool operator()(const char* s, size_t length, TimeUnit::type out_unit, int64_t* out, + bool* out_zone_offset_present = NULLPTR) const override { + return ParseTimestampISO8601(s, length, out_unit, out, out_zone_offset_present); } const char* kind() const override { return 
"iso8601"; } diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index d99634e122e..927bcffcca3 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -45,7 +45,8 @@ class ARROW_EXPORT TimestampParser { virtual ~TimestampParser() = default; virtual bool operator()(const char* s, size_t length, TimeUnit::type out_unit, - int64_t* out) const = 0; + int64_t* out, + bool* out_zone_offset_present = NULLPTR) const = 0; virtual const char* kind() const = 0; @@ -495,6 +496,27 @@ static inline bool ParseHH_MM(const char* s, Duration* out) { return true; } +template +static inline bool ParseHHMM(const char* s, Duration* out) { + uint8_t hours = 0; + uint8_t minutes = 0; + if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) { + return false; + } + if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 2, 2, &minutes))) { + return false; + } + if (ARROW_PREDICT_FALSE(hours >= 24)) { + return false; + } + if (ARROW_PREDICT_FALSE(minutes >= 60)) { + return false; + } + *out = std::chrono::duration_cast(std::chrono::hours(hours) + + std::chrono::minutes(minutes)); + return true; +} + template static inline bool ParseHH_MM_SS(const char* s, Duration* out) { uint8_t hours = 0; @@ -609,10 +631,15 @@ static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type } // namespace detail static inline bool ParseTimestampISO8601(const char* s, size_t length, - TimeUnit::type unit, - TimestampType::c_type* out) { + TimeUnit::type unit, TimestampType::c_type* out, + bool* out_zone_offset_present = NULLPTR) { using seconds_type = std::chrono::duration; + // We allow the following zone offset formats: + // - (none) + // - Z + // - [+-]HH(:?MM)? + // // We allow the following formats for all units: // - "YYYY-MM-DD" // - "YYYY-MM-DD[ T]hhZ?" 
@@ -647,8 +674,38 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length, return false; } + if (out_zone_offset_present) { + *out_zone_offset_present = false; + } + + seconds_type zone_offset(0); if (s[length - 1] == 'Z') { --length; + if (out_zone_offset_present) *out_zone_offset_present = true; + } else if (s[length - 3] == '+' || s[length - 3] == '-') { + // [+-]HH (s[length] is the sign; an offset east of UTC is subtracted) + length -= 3; + if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + length + 1, &zone_offset))) { + return false; + } + if (s[length] == '+') zone_offset *= -1; + if (out_zone_offset_present) *out_zone_offset_present = true; + } else if (s[length - 5] == '+' || s[length - 5] == '-') { + // [+-]HHMM + length -= 5; + if (ARROW_PREDICT_FALSE(!detail::ParseHHMM(s + length + 1, &zone_offset))) { + return false; + } + if (s[length] == '+') zone_offset *= -1; + if (out_zone_offset_present) *out_zone_offset_present = true; + } else if ((s[length - 6] == '+' || s[length - 6] == '-') && (s[length - 3] == ':')) { + // [+-]HH:MM + length -= 6; + if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + length + 1, &zone_offset))) { + return false; + } + if (s[length] == '+') zone_offset *= -1; + if (out_zone_offset_present) *out_zone_offset_present = true; } seconds_type seconds_since_midnight; @@ -682,6 +739,7 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length, } seconds_since_epoch += seconds_since_midnight; + seconds_since_epoch += zone_offset; if (length <= 19) { *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count()); @@ -702,6 +760,12 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length, return true; } +#ifdef _WIN32 +static constexpr bool kStrptimeSupportsZone = false; +#else +static constexpr bool kStrptimeSupportsZone = true; +#endif + /// \brief Returns time since the UNIX epoch in the requested unit static inline bool ParseTimestampStrptime(const char* buf, size_t length, const char* format, bool ignore_time_in_day, @@ -730,6 +794,9 @@ static inline bool ParseTimestampStrptime(const char* buf, size_t length, if
(!ignore_time_in_day) { secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) + std::chrono::seconds(result.tm_sec)); +#ifndef _WIN32 + secs -= std::chrono::seconds(result.tm_gmtoff); +#endif } *out = util::CastSecondsToUnit(unit, secs.time_since_epoch().count()); return true; diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index ebbb733398d..708d5ecd60f 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -503,6 +503,15 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) { AssertConversion(type, "1970-01-01 00:00:00", 0); AssertConversion(type, "2018-11-13 17", 1542128400); + AssertConversion(type, "2018-11-13 17+00", 1542128400); + AssertConversion(type, "2018-11-13 17+0000", 1542128400); + AssertConversion(type, "2018-11-13 17+00:00", 1542128400); + AssertConversion(type, "2018-11-13 17+01", 1542124800); + AssertConversion(type, "2018-11-13 17+0117", 1542123780); + AssertConversion(type, "2018-11-13 17+01:17", 1542123780); + AssertConversion(type, "2018-11-13 17-01", 1542132000); + AssertConversion(type, "2018-11-13 17-0117", 1542133020); + AssertConversion(type, "2018-11-13 17-01:17", 1542133020); AssertConversion(type, "2018-11-13T17", 1542128400); AssertConversion(type, "2018-11-13 17Z", 1542128400); AssertConversion(type, "2018-11-13T17Z", 1542128400); @@ -510,10 +519,28 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) { AssertConversion(type, "2018-11-13T17:11", 1542129060); AssertConversion(type, "2018-11-13 17:11Z", 1542129060); AssertConversion(type, "2018-11-13T17:11Z", 1542129060); + AssertConversion(type, "2018-11-13 17:11+00", 1542129060); + AssertConversion(type, "2018-11-13 17:11+0000", 1542129060); + AssertConversion(type, "2018-11-13 17:11+00:00", 1542129060); + AssertConversion(type, "2018-11-13 17:11+01", 1542125460); + AssertConversion(type, "2018-11-13 17:11+0117", 1542124440); + AssertConversion(type, 
"2018-11-13 17:11+01:17", 1542124440); + AssertConversion(type, "2018-11-13 17:11-01", 1542132660); + AssertConversion(type, "2018-11-13 17:11-0117", 1542133680); + AssertConversion(type, "2018-11-13 17:11-01:17", 1542133680); AssertConversion(type, "2018-11-13 17:11:10", 1542129070); AssertConversion(type, "2018-11-13T17:11:10", 1542129070); AssertConversion(type, "2018-11-13 17:11:10Z", 1542129070); AssertConversion(type, "2018-11-13T17:11:10Z", 1542129070); + AssertConversion(type, "2018-11-13T17:11:10+00", 1542129070); + AssertConversion(type, "2018-11-13T17:11:10+0000", 1542129070); + AssertConversion(type, "2018-11-13T17:11:10+00:00", 1542129070); + AssertConversion(type, "2018-11-13T17:11:10+01", 1542125470); + AssertConversion(type, "2018-11-13T17:11:10+0117", 1542124450); + AssertConversion(type, "2018-11-13T17:11:10+01:17", 1542124450); + AssertConversion(type, "2018-11-13T17:11:10-01", 1542132670); + AssertConversion(type, "2018-11-13T17:11:10-0117", 1542133690); + AssertConversion(type, "2018-11-13T17:11:10-01:17", 1542133690); AssertConversion(type, "1900-02-28 12:34:56", -2203932304LL); // No subseconds allowed @@ -530,6 +557,22 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) { AssertConversionFails(type, "1970-01-01 00:00:60"); AssertConversionFails(type, "1970-01-01 00:00,00"); AssertConversionFails(type, "1970-01-01 00,00:00"); + // Invalid zone offsets + AssertConversionFails(type, "1970-01-01 00:00+0"); + AssertConversionFails(type, "1970-01-01 00:00+000"); + AssertConversionFails(type, "1970-01-01 00:00+00000"); + AssertConversionFails(type, "1970-01-01 00:00+2400"); + AssertConversionFails(type, "1970-01-01 00:00+0060"); + AssertConversionFails(type, "1970-01-01 00-0"); + AssertConversionFails(type, "1970-01-01 00-000"); + AssertConversionFails(type, "1970-01-01 00+00000"); + AssertConversionFails(type, "1970-01-01 00+2400"); + AssertConversionFails(type, "1970-01-01 00+0060"); + AssertConversionFails(type, "1970-01-01 00:00:00+0"); + 
AssertConversionFails(type, "1970-01-01 00:00:00-000"); + AssertConversionFails(type, "1970-01-01 00:00:00-00000"); + AssertConversionFails(type, "1970-01-01 00:00:00+2400"); + AssertConversionFails(type, "1970-01-01 00:00:00+00:99"); } { TimestampType type{TimeUnit::MILLI}; @@ -544,6 +587,13 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) { AssertConversion(type, "1900-02-28 12:34:56.12", -2203932304000LL + 120LL); AssertConversion(type, "1900-02-28 12:34:56.123", -2203932304000LL + 123LL); + AssertConversion(type, "2018-11-13 17:11:10.123+01", 1542129070123LL - 3600000LL); + AssertConversion(type, "2018-11-13 17:11:10.123+0117", 1542129070123LL - 4620000LL); + AssertConversion(type, "2018-11-13 17:11:10.123+01:17", 1542129070123LL - 4620000LL); + AssertConversion(type, "2018-11-13 17:11:10.123-01", 1542129070123LL + 3600000LL); + AssertConversion(type, "2018-11-13 17:11:10.123-0117", 1542129070123LL + 4620000LL); + AssertConversion(type, "2018-11-13 17:11:10.123-01:17", 1542129070123LL + 4620000LL); + // Invalid subseconds AssertConversionFails(type, "1900-02-28 12:34:56.1234"); AssertConversionFails(type, "1900-02-28 12:34:56.12345"); @@ -569,6 +619,19 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) { AssertConversion(type, "1900-02-28 12:34:56.12345", -2203932304000000LL + 123450LL); AssertConversion(type, "1900-02-28 12:34:56.123456", -2203932304000000LL + 123456LL); + AssertConversion(type, "1900-02-28 12:34:56.123456+01", + -2203932304000000LL + 123456LL - 3600000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456+0117", + -2203932304000000LL + 123456LL - 4620000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456+01:17", + -2203932304000000LL + 123456LL - 4620000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456-01", + -2203932304000000LL + 123456LL + 3600000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456-0117", + -2203932304000000LL + 123456LL + 4620000000LL); + AssertConversion(type, "1900-02-28 
12:34:56.123456-01:17", + -2203932304000000LL + 123456LL + 4620000000LL); + // Invalid subseconds AssertConversionFails(type, "1900-02-28 12:34:56.1234567"); AssertConversionFails(type, "1900-02-28 12:34:56.12345678"); @@ -602,7 +665,21 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) { AssertConversion(type, "1900-02-28 12:34:56.123456789", -2203932304000000000LL + 123456789LL); + AssertConversion(type, "1900-02-28 12:34:56.123456789+01", + -2203932304000000000LL + 123456789LL - 3600000000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456789+0117", + -2203932304000000000LL + 123456789LL - 4620000000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456789+01:17", + -2203932304000000000LL + 123456789LL - 4620000000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456789-01", + -2203932304000000000LL + 123456789LL + 3600000000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456789-0117", + -2203932304000000000LL + 123456789LL + 4620000000000LL); + AssertConversion(type, "1900-02-28 12:34:56.123456789-01:17", + -2203932304000000000LL + 123456789LL + 4620000000000LL); + // Invalid subseconds + AssertConversionFails(type, "1900-02-28 12:34:56.1234567890"); } } @@ -618,10 +695,7 @@ TEST(TimestampParser, StrptimeParser) { std::vector cases = {{"5/31/2000 12:34:56", "2000-05-31 12:34:56"}, {"5/31/2000 00:00:00", "2000-05-31 00:00:00"}}; - std::vector units = {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, - TimeUnit::NANO}; - - for (auto unit : units) { + for (auto unit : TimeUnit::values()) { for (const auto& case_ : cases) { int64_t converted, expected; ASSERT_TRUE((*parser)(case_.value.c_str(), case_.value.size(), unit, &converted)); @@ -639,5 +713,33 @@ TEST(TimestampParser, StrptimeParser) { } } +TEST(TimestampParser, StrptimeZoneOffset) { + if (!kStrptimeSupportsZone) { + GTEST_SKIP() << "strptime does not support %z on this platform"; + } + std::string format = "%Y-%d-%m %H:%M:%S%z"; + auto parser = 
TimestampParser::MakeStrptime(format); + + // N.B. GNU %z supports ISO8601 format while BSD %z supports only + // +HHMM or -HHMM and POSIX doesn't appear to define %z at all + for (auto unit : TimeUnit::values()) { + for (const std::string& value : + {"2018-01-01 00:00:00+0000", "2018-01-01 00:00:00+0100", + "2018-01-01 00:00:00+0130", "2018-01-01 00:00:00-0117"}) { + SCOPED_TRACE(value); + int64_t converted = 0; + int64_t expected = 0; + ASSERT_TRUE((*parser)(value.c_str(), value.size(), unit, &converted)); + ASSERT_TRUE(ParseTimestampISO8601(value.c_str(), value.size(), unit, &expected)); + ASSERT_EQ(expected, converted); + } + for (const std::string& value : {"2018-01-01 00:00:00", "2018-01-01 00:00:00EST"}) { + SCOPED_TRACE(value); + int64_t converted = 0; + ASSERT_FALSE((*parser)(value.c_str(), value.size(), unit, &converted)); + } + } +} + } // namespace internal } // namespace arrow diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst index 42b5af67d80..d6bb66ce49e 100644 --- a/docs/source/cpp/csv.rst +++ b/docs/source/cpp/csv.rst @@ -190,6 +190,71 @@ dictionary-encoded string-like array. It switches to a plain string-like array when the threshold in :member:`ConvertOptions::auto_dict_max_cardinality` is reached. +Timestamp inference/parsing +--------------------------- + +If type inference is enabled, the CSV reader first tries to interpret +string-like columns as timestamps. If all rows have some zone offset +(e.g. ``Z`` or ``+0100``), even if the offsets are inconsistent, then the +inferred type will be UTC timestamp. If no rows have a zone offset, then the +inferred type will be timestamp without timezone. A mix of rows with/without +offsets will result in a string column. + +If the type is explicitly specified as a timestamp with/without timezone, then +the reader will error on values without/with zone offsets in that column. 
Note +that this means it isn't currently possible to have the reader parse a column +of timestamps without zone offsets as local times in a particular timezone; +instead, parse the column as timestamp without timezone, then convert the +values afterwards using the ``assume_timezone`` compute function. + ++-------------------+------------------------------+-------------------+ +| Specified Type | Input CSV | Result Type | ++===================+==============================+===================+ +| (inferred) | ``2021-01-01T00:00:00`` | timestamp[s] | +| +------------------------------+-------------------+ +| | ``2021-01-01T00:00:00Z`` | timestamp[s, UTC] | +| +------------------------------+ | +| | ``2021-01-01T00:00:00+0100`` | | +| +------------------------------+-------------------+ +| | :: | string | +| | | | +| | 2021-01-01T00:00:00 | | +| | 2021-01-01T00:00:00Z | | ++-------------------+------------------------------+-------------------+ +| timestamp[s] | ``2021-01-01T00:00:00`` | timestamp[s] | +| +------------------------------+-------------------+ +| | ``2021-01-01T00:00:00Z`` | (error) | +| +------------------------------+ | +| | ``2021-01-01T00:00:00+0100`` | | +| +------------------------------+ | +| | :: | | +| | | | +| | 2021-01-01T00:00:00 | | +| | 2021-01-01T00:00:00Z | | ++-------------------+------------------------------+-------------------+ +| timestamp[s, UTC] | ``2021-01-01T00:00:00`` | (error) | +| +------------------------------+-------------------+ +| | ``2021-01-01T00:00:00Z`` | timestamp[s, UTC] | +| +------------------------------+ | +| | ``2021-01-01T00:00:00+0100`` | | +| +------------------------------+-------------------+ +| | :: | (error) | +| | | | +| | 2021-01-01T00:00:00 | | +| | 2021-01-01T00:00:00Z | | ++-------------------+------------------------------+-------------------+ +| timestamp[s, | ``2021-01-01T00:00:00`` | (error) | +| America/New_York] +------------------------------+-------------------+ +| | 
``2021-01-01T00:00:00Z`` | timestamp[s, | +| +------------------------------+ America/New_York] | +| | ``2021-01-01T00:00:00+0100`` | | +| +------------------------------+-------------------+ +| | :: | (error) | +| | | | +| | 2021-01-01T00:00:00 | | +| | 2021-01-01T00:00:00Z | | ++-------------------+------------------------------+-------------------+ + Nulls ----- diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index f0416eb3f72..061fbbd4cdb 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -219,10 +219,10 @@ test_that("read_csv_arrow() can read timestamps", { on.exit(unlink(tf)) write.csv(tbl, tf, row.names = FALSE) - df <- read_csv_arrow(tf, col_types = schema(time = timestamp(timezone = "UTC"))) - expect_equal(tbl, df) - + df <- read_csv_arrow(tf, col_types = schema(time = timestamp())) # time zones are being read in as time zone-naive, hence ignore_attr = "tzone" + expect_equal(tbl, df, ignore_attr = "tzone") + df <- read_csv_arrow(tf, col_types = "T", col_names = "time", skip = 1) expect_equal(tbl, df, ignore_attr = "tzone") }) @@ -235,10 +235,12 @@ test_that("read_csv_arrow(timestamp_parsers=)", { df <- read_csv_arrow( tf, - col_types = schema(time = timestamp(timezone = "UTC")), + col_types = schema(time = timestamp()), timestamp_parsers = "%d/%m/%Y" ) - expect_equal(df$time, as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC")) + # time zones are being read in as time zone-naive, hence ignore_attr = "tzone" + expected <- as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC") + expect_equal(df$time, expected, ignore_attr = "tzone") }) test_that("Skipping columns with null()", {