From 9e5b019ec4314ddf3766b49c31aa55a25e59ffd0 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 7 Oct 2021 14:05:15 -0400
Subject: [PATCH 1/2] ARROW-12820: [C++] Support zone offset in ISO8601 parser
---
.../compute/kernels/scalar_cast_temporal.cc | 21 +++-
.../arrow/compute/kernels/scalar_cast_test.cc | 27 ++++-
.../arrow/compute/kernels/scalar_string.cc | 21 +++-
.../compute/kernels/scalar_string_test.cc | 19 +++
cpp/src/arrow/csv/column_builder_test.cc | 85 ++++++++++++++
cpp/src/arrow/csv/converter.cc | 54 ++++++++-
cpp/src/arrow/csv/converter_test.cc | 73 ++++++++++++
cpp/src/arrow/csv/inference_internal.h | 10 ++
cpp/src/arrow/util/value_parsing.cc | 30 ++++-
cpp/src/arrow/util/value_parsing.h | 73 +++++++++++-
cpp/src/arrow/util/value_parsing_test.cc | 110 +++++++++++++++++-
docs/source/cpp/csv.rst | 65 +++++++++++
r/tests/testthat/test-csv.R | 12 +-
13 files changed, 569 insertions(+), 31 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
index 5f16f1e9db4..74274e963a1 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
@@ -29,7 +29,7 @@
namespace arrow {
-using internal::ParseValue;
+using internal::ParseTimestampISO8601;
namespace compute {
namespace internal {
@@ -422,17 +422,40 @@ struct CastFunctor {
// String to Timestamp
struct ParseTimestamp {
+ // Parses strings with ParseTimestampISO8601 into values of `type`. A zone
+ // offset must be present in the input exactly when `type` has a non-empty
+ // timezone; a mismatch is reported through `st` as Status::Invalid.
+ explicit ParseTimestamp(const TimestampType& type)
+ : type(type), expect_timezone(!type.timezone().empty()) {}
template
OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
OutValue result = 0;
- if (ARROW_PREDICT_FALSE(!ParseValue(type, val.data(), val.size(), &result))) {
+ bool zone_offset_present = false;
+ if (ARROW_PREDICT_FALSE(!ParseTimestampISO8601(val.data(), val.size(), type.unit(),
+ &result, &zone_offset_present))) {
*st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
type.ToString());
+ // Keep the parse failure; the zone check below must not overwrite it.
+ return result;
}
+ if (zone_offset_present != expect_timezone) {
+ if (expect_timezone) {
+ *st = Status::Invalid(
+ "Failed to parse string: '", val, "' as a scalar of type ", type.ToString(),
+ ": expected a zone offset. If these timestamps "
+ "are in local time, cast to timestamp without timezone, then "
+ "call assume_timezone.");
+ } else {
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ type.ToString(), ": expected no zone offset");
+ }
+ }
return result;
}
const TimestampType& type;
+ // True when `type` has a non-empty timezone.
+ bool expect_timezone;
};
template
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 92de7892f95..608ca17bbe7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -1945,7 +1945,32 @@ TEST(Cast, StringToTimestamp) {
}
}
- // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc
+ auto zoned = ArrayFromJSON(string_type,
+ R"(["2020-02-29T00:00:00Z", "2020-03-02T10:11:12+0102"])");
+ auto mixed = ArrayFromJSON(string_type,
+ R"(["2020-03-02T10:11:12+0102", "2020-02-29T00:00:00"])");
+
+ // Timestamp with zone offset should not parse as naive
+ CheckCastFails(zoned, CastOptions::Safe(timestamp(TimeUnit::SECOND)));
+
+ // Mixed zoned/unzoned should not parse as naive
+ CheckCastFails(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND)));
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, ::testing::HasSubstr("expected no zone offset"),
+ Cast(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND))));
+
+ // ...or as timestamp with timezone
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, ::testing::HasSubstr("expected a zone offset"),
+ Cast(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND, "UTC"))));
+
+ // Timestamp with zone offset can parse as any time zone (since they're unambiguous)
+ CheckCast(zoned, ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"),
+ "[1582934400, 1583140152]"));
+ CheckCast(zoned, ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"),
+ "[1582934400, 1583140152]"));
+
+ // NOTE: timestamp parsing is tested comprehensively in value_parsing_test.cc
}
}
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 7eeb80b013b..c2e56e9d3f4 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -3625,11 +3625,26 @@ struct StrptimeExec {
Result ResolveStrptimeOutput(KernelContext* ctx,
const std::vector&) {
- if (ctx->state()) {
- return ::arrow::timestamp(StrptimeState::Get(ctx).unit);
+ if (!ctx->state()) {
+ return Status::Invalid("strptime does not provide default StrptimeOptions");
+ }
+ const StrptimeOptions& options = StrptimeState::Get(ctx);
+ // The output type is timestamp(unit, "UTC") when the format contains a "%z"
+ // directive and timestamp(unit) without timezone otherwise. Each '%' consumes
+ // the character after it, so an escaped "%%z" is not treated as "%z".
+ const auto& format = options.format;
+ std::string zone = "";
+ // Compare `cur + 1 < size` instead of `cur < size - 1`: the latter wraps
+ // around for an empty format string and would index past its end.
+ for (size_t cur = 0; cur + 1 < format.size(); ++cur) {
+ if (format[cur] != '%') continue;
+ if (format[cur + 1] == 'z') {
+ zone = "UTC";
+ break;
+ }
+ ++cur;
}
-
- return Status::Invalid("strptime does not provide default StrptimeOptions");
+ return ::arrow::timestamp(options.unit, zone);
}
// ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 4551e8c61e5..18ca794f669 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -32,6 +32,7 @@
#include "arrow/compute/kernels/test_util.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type.h"
+#include "arrow/util/value_parsing.h"
namespace arrow {
namespace compute {
@@ -1757,6 +1758,24 @@ TYPED_TEST(TestStringKernels, Strptime) {
std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
+
+ input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
+ options.format = "%m/%d/%Y %%z";
+ this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
+}
+
+TYPED_TEST(TestStringKernels, StrptimeZoneOffset) {
+ if (!arrow::internal::kStrptimeSupportsZone) {
+ GTEST_SKIP() << "strptime does not support %z on this platform";
+ }
+ // N.B. BSD strptime only supports (+/-)HHMM and not the wider range
+ // of values GNU strptime supports.
+ std::string input1 = R"(["5/1/2020 +0100", null, "12/11/1900 -0130"])";
+ std::string output1 =
+ R"(["2020-04-30T23:00:00.000000", null, "1900-12-11T01:30:00.000000"])";
+ StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO);
+ this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO, "UTC"), output1,
+ &options);
}
TYPED_TEST(TestStringKernels, StrptimeDoesNotProvideDefaultOptions) {
diff --git a/cpp/src/arrow/csv/column_builder_test.cc b/cpp/src/arrow/csv/column_builder_test.cc
index 7577c883e8c..53e69ada62f 100644
--- a/cpp/src/arrow/csv/column_builder_test.cc
+++ b/cpp/src/arrow/csv/column_builder_test.cc
@@ -33,6 +33,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/task_group.h"
#include "arrow/util/thread_pool.h"
+#include "arrow/util/value_parsing.h"
namespace arrow {
namespace csv {
@@ -427,6 +428,13 @@ TEST_F(InferringColumnBuilderTest, SingleChunkTimestamp) {
{{false, true, true}}, {{0, 0, 1542129070}},
&expected);
CheckInferred(tg, {{"", "1970-01-01", "2018-11-13 17:11:10"}}, options, expected);
+
+ options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%Y/%m/%d"));
+ tg = TaskGroup::MakeSerial();
+ ChunkedArrayFromVector(timestamp(TimeUnit::SECOND),
+ {{false, true, true}}, {{0, 0, 1542067200}},
+ &expected);
+ CheckInferred(tg, {{"", "1970/01/01", "2018/11/13"}}, options, expected);
}
TEST_F(InferringColumnBuilderTest, MultipleChunkTimestamp) {
@@ -438,6 +446,13 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkTimestamp) {
{{false}, {true}, {true}},
{{0}, {0}, {1542129070}}, &expected);
CheckInferred(tg, {{""}, {"1970-01-01"}, {"2018-11-13 17:11:10"}}, options, expected);
+
+ options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%Y/%m/%d"));
+ tg = TaskGroup::MakeSerial();
+ ChunkedArrayFromVector(timestamp(TimeUnit::SECOND),
+ {{false}, {true}, {true}},
+ {{0}, {0}, {1542067200}}, &expected);
+ CheckInferred(tg, {{""}, {"1970/01/01"}, {"2018/11/13"}}, options, expected);
}
TEST_F(InferringColumnBuilderTest, SingleChunkTimestampNS) {
@@ -471,6 +486,76 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampNS) {
options, expected);
}
+TEST_F(InferringColumnBuilderTest, SingleChunkTimestampWithZone) {
+ auto options = ConvertOptions::Defaults();
+ auto tg = TaskGroup::MakeSerial();
+
+ std::shared_ptr expected;
+ ChunkedArrayFromVector(timestamp(TimeUnit::SECOND, "UTC"),
+ {{false, true, true}}, {{0, 0, 1542129010}},
+ &expected);
+ CheckInferred(tg, {{"", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10+0001"}}, options,
+ expected);
+
+ tg = TaskGroup::MakeSerial();
+ expected = ChunkedArrayFromJSON(
+ utf8(), {R"(["", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10"])"});
+ CheckInferred(tg, {{"", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10"}}, options,
+ expected);
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampWithZone) {
+ auto options = ConvertOptions::Defaults();
+ auto tg = TaskGroup::MakeSerial();
+
+ std::shared_ptr expected;
+ ChunkedArrayFromVector(timestamp(TimeUnit::SECOND, "UTC"),
+ {{false}, {true}, {true}},
+ {{0}, {0}, {1542129010}}, &expected);
+ CheckInferred(tg, {{""}, {"1970-01-01T00:00:00Z"}, {"2018-11-13 17:11:10+0001"}},
+ options, expected);
+
+ tg = TaskGroup::MakeSerial();
+ expected = ChunkedArrayFromJSON(
+ utf8(), {R"([""])", R"(["1970-01-01T00:00:00Z"])", R"(["2018-11-13 17:11:10"])"});
+ CheckInferred(tg, {{""}, {"1970-01-01T00:00:00Z"}, {"2018-11-13 17:11:10"}}, options,
+ expected);
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkTimestampWithZoneNS) {
+ auto options = ConvertOptions::Defaults();
+ auto tg = TaskGroup::MakeSerial();
+
+ std::shared_ptr expected;
+ ChunkedArrayFromVector(
+ timestamp(TimeUnit::NANO, "UTC"), {{false, true, true, true, true}},
+ {{0, 3660000000000, 1542129070123000000, 1542129070123456000, 1542129070123456789}},
+ &expected);
+ CheckInferred(tg,
+ {{"", "1970-01-01T00:00:00-0101", "2018-11-13 17:11:10.123Z",
+ "2018-11-13 17:11:10.123456Z", "2018-11-13 17:11:10.123456789Z"}},
+ options, expected);
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampWithZoneNS) {
+ auto options = ConvertOptions::Defaults();
+ auto tg = TaskGroup::MakeSerial();
+
+ std::shared_ptr expected;
+ ChunkedArrayFromVector(
+ timestamp(TimeUnit::NANO, "UTC"), {{false}, {true}, {true, true, true}},
+ {{0},
+ {3660000000000},
+ {1542129070123000000, 1542129070123456000, 1542129070123456789}},
+ &expected);
+ CheckInferred(tg,
+ {{""},
+ {"1970-01-01T00:00:00-0101"},
+ {"2018-11-13 17:11:10.123Z", "2018-11-13 17:11:10.123456Z",
+ "2018-11-13 17:11:10.123456789Z"}},
+ options, expected);
+}
+
TEST_F(InferringColumnBuilderTest, SingleChunkIntegerAndTime) {
// Fallback to utf-8
auto options = ConvertOptions::Defaults();
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 66d05458097..4d00cbec4eb 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -350,18 +350,37 @@ struct InlineISO8601ValueDecoder : public ValueDecoder {
explicit InlineISO8601ValueDecoder(const std::shared_ptr& type,
const ConvertOptions& options)
: ValueDecoder(type, options),
- unit_(checked_cast(*type_).unit()) {}
+ unit_(checked_cast(*type_).unit()),
+ expect_timezone_(!checked_cast(*type_).timezone().empty()) {
+ }
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
- if (ARROW_PREDICT_FALSE(!internal::ParseTimestampISO8601(
- reinterpret_cast(data), size, unit_, out))) {
+ bool zone_offset_present = false;
+ if (ARROW_PREDICT_FALSE(
+ !internal::ParseTimestampISO8601(reinterpret_cast(data), size,
+ unit_, out, &zone_offset_present))) {
return GenericConversionError(type_, data, size);
}
+ if (zone_offset_present != expect_timezone_) {
+ if (expect_timezone_) {
+ return Status::Invalid("CSV conversion error to ", type_->ToString(),
+ ": expected a zone offset in '",
+ std::string(reinterpret_cast(data), size),
+ "'. If these timestamps are in local time, parse them as "
+ "timestamps without timezone, then call assume_timezone.");
+ } else {
+ return Status::Invalid("CSV conversion error to ", type_->ToString(),
+ ": expected no zone offset in '",
+ std::string(reinterpret_cast(data), size),
+ "'");
+ }
+ }
return Status::OK();
}
protected:
TimeUnit::type unit_;
+ bool expect_timezone_;
};
struct SingleParserTimestampValueDecoder : public ValueDecoder {
@@ -371,18 +390,36 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder {
const ConvertOptions& options)
: ValueDecoder(type, options),
unit_(checked_cast(*type_).unit()),
+ expect_timezone_(!checked_cast(*type_).timezone().empty()),
parser_(*options_.timestamp_parsers[0]) {}
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
- if (ARROW_PREDICT_FALSE(
- !parser_(reinterpret_cast(data), size, unit_, out))) {
+ bool zone_offset_present = false;
+ if (ARROW_PREDICT_FALSE(!parser_(reinterpret_cast(data), size, unit_,
+ out, &zone_offset_present))) {
return GenericConversionError(type_, data, size);
}
+ if (zone_offset_present != expect_timezone_) {
+ if (expect_timezone_) {
+ return Status::Invalid("CSV conversion error to ", type_->ToString(),
+ ": expected a zone offset in '",
+ std::string(reinterpret_cast(data), size),
+ "'. If these timestamps are in local time, parse them as "
+ "timestamps without timezone, then call assume_timezone. "
+ "If using strptime, ensure '%z' is in the format string.");
+ } else {
+ return Status::Invalid("CSV conversion error to ", type_->ToString(),
+ ": expected no zone offset in '",
+ std::string(reinterpret_cast(data), size),
+ "'");
+ }
+ }
return Status::OK();
}
protected:
TimeUnit::type unit_;
+ bool expect_timezone_;
const TimestampParser& parser_;
};
@@ -393,11 +430,15 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
const ConvertOptions& options)
: ValueDecoder(type, options),
unit_(checked_cast(*type_).unit()),
+ expect_timezone_(!checked_cast(*type_).timezone().empty()),
parsers_(GetParsers(options_)) {}
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ bool zone_offset_present = false;
for (const auto& parser : parsers_) {
- if (parser->operator()(reinterpret_cast(data), size, unit_, out)) {
+ if (parser->operator()(reinterpret_cast(data), size, unit_, out,
+ &zone_offset_present) &&
+ zone_offset_present == expect_timezone_) {
return Status::OK();
}
}
@@ -416,6 +457,7 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
}
TimeUnit::type unit_;
+ bool expect_timezone_;
std::vector parsers_;
};
diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc
index 9a68b956803..9a83ef020de 100644
--- a/cpp/src/arrow/csv/converter_test.cc
+++ b/cpp/src/arrow/csv/converter_test.cc
@@ -548,12 +548,43 @@ TEST(TimestampConversion, Basics) {
{"2018-11-13 17:11:10\n1900-02-28 12:34:56\n"},
{{1542129070, -2203932304LL}});
+ // Zone offsets are not accepted
+ AssertConversionError(type,
+ {"1970-01-01T00Z\n2000-02-29T00-0200\n"
+ "3989-07-14T00+03:14\n1900-02-28 00-04:59\n"},
+ {0});
+
type = timestamp(TimeUnit::NANO);
AssertConversion(
type, {"1970-01-01\n2000-02-29\n1900-02-28\n"},
{{0, 951782400000000000LL, -2203977600000000000LL}});
}
+TEST(TimestampConversion, WithZoneOffset) {
+ auto type = timestamp(TimeUnit::SECOND, "UTC");
+
+ AssertConversion(
+ type,
+ {"1970-01-01T00Z\n2000-02-29T00-0200\n"
+ "3989-07-14T00+03:14\n1900-02-28 00-04:59\n"},
+ {{0, 951782400 + 7200, 63730281600LL - 11640, -2203977600LL + 17940}});
+
+ type = timestamp(TimeUnit::NANO, "UTC");
+ AssertConversion(
+ type,
+ {"1970-01-01T00Z\n"
+ "2000-02-29T00:00:00.123456789+0117\n"
+ "1900-02-28 00:00:00.123456789-01:00\n"},
+ {{0, 951782400000000000LL + 123456789LL - 4620000000000LL,
+ -2203977600000000000LL + 123456789LL + 3600000000000LL}});
+
+ // Local times are not accepted
+ AssertConversionError(type,
+ {"1970-01-01T00\n2000-02-29T00\n"
+ "3989-07-14T00\n1900-02-28 00\n"},
+ {0});
+}
+
TEST(TimestampConversion, Nulls) {
auto type = timestamp(TimeUnit::MILLI);
AssertConversion(
@@ -592,6 +623,48 @@ TEST(TimestampConversion, UserDefinedParsers) {
{{86400000}, {172800000}}, options);
}
+#ifndef _WIN32
+TEST(TimestampConversion, UserDefinedParsersWithZone) {
+ auto options = ConvertOptions::Defaults();
+ auto type = timestamp(TimeUnit::SECOND, "America/Phoenix");
+
+ // Test a single parser
+ options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %z")};
+ AssertConversion(type, {"01/02/1970 +0000,01/03/1970 +0000\n"},
+ {{86400}, {172800}}, options);
+
+ // Test multiple parsers
+ options.timestamp_parsers.push_back(TimestampParser::MakeISO8601());
+ AssertConversion(
+ type, {"01/02/1970 +0000,1970-01-03T00:00:00+0000\n"}, {{86400}, {172800}},
+ options);
+
+ // Test errors
+ options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y")};
+ AssertConversionError(type, {"01/02/1970,01/03/1970\n"}, {0, 1}, options);
+ options.timestamp_parsers.push_back(TimestampParser::MakeISO8601());
+ AssertConversionError(type, {"01/02/1970,1970-01-03T00:00:00+0000\n"}, {0}, options);
+}
+#else
+// Windows uses the vendored musl strptime which doesn't support %z.
+TEST(TimestampConversion, UserDefinedParsersWithZone) {
+ auto options = ConvertOptions::Defaults();
+ auto type = timestamp(TimeUnit::SECOND, "America/Phoenix");
+
+ options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %z")};
+ AssertConversionError(type, {"01/02/1970 +0000,01/03/1970 +0000\n"}, {0, 1}, options);
+
+ options.timestamp_parsers.push_back(TimestampParser::MakeISO8601());
+ AssertConversionError(type, {"01/02/1970 +0000,1970-01-03T00:00:00+0000\n"}, {0},
+ options);
+
+ options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y")};
+ AssertConversionError(type, {"01/02/1970,01/03/1970\n"}, {0, 1}, options);
+ options.timestamp_parsers.push_back(TimestampParser::MakeISO8601());
+ AssertConversionError(type, {"01/02/1970,1970-01-03T00:00:00+0000\n"}, {0}, options);
+}
+#endif
+
Decimal128 Dec128(util::string_view value) {
Decimal128 dec;
int32_t scale = 0;
diff --git a/cpp/src/arrow/csv/inference_internal.h b/cpp/src/arrow/csv/inference_internal.h
index 1fd6d41b5cc..ede9bea319e 100644
--- a/cpp/src/arrow/csv/inference_internal.h
+++ b/cpp/src/arrow/csv/inference_internal.h
@@ -35,6 +35,8 @@ enum class InferKind {
Time,
Timestamp,
TimestampNS,
+ TimestampWithZone,
+ TimestampWithZoneNS,
TextDict,
BinaryDict,
Text,
@@ -67,6 +69,10 @@ class InferStatus {
case InferKind::Timestamp:
return SetKind(InferKind::TimestampNS);
case InferKind::TimestampNS:
+ return SetKind(InferKind::TimestampWithZone);
+ case InferKind::TimestampWithZone:
+ return SetKind(InferKind::TimestampWithZoneNS);
+ case InferKind::TimestampWithZoneNS:
return SetKind(InferKind::Real);
case InferKind::Real:
if (options_.auto_dict_encode) {
@@ -123,6 +129,10 @@ class InferStatus {
return make_converter(timestamp(TimeUnit::SECOND));
case InferKind::TimestampNS:
return make_converter(timestamp(TimeUnit::NANO));
+ case InferKind::TimestampWithZone:
+ return make_converter(timestamp(TimeUnit::SECOND, "UTC"));
+ case InferKind::TimestampWithZoneNS:
+ return make_converter(timestamp(TimeUnit::NANO, "UTC"));
case InferKind::Real:
return make_converter(float64());
case InferKind::Text:
diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc
index adc333ecfcc..ccd7674aa7a 100644
--- a/cpp/src/arrow/util/value_parsing.cc
+++ b/cpp/src/arrow/util/value_parsing.cc
@@ -42,10 +42,27 @@ namespace {
class StrptimeTimestampParser : public TimestampParser {
public:
- explicit StrptimeTimestampParser(std::string format) : format_(std::move(format)) {}
+ explicit StrptimeTimestampParser(std::string format)
+ : format_(std::move(format)), have_zone_offset_(false) {
+ // Scan for a "%z" directive. Each '%' consumes the character after it,
+ // so an escaped "%%z" is not treated as a zone directive.
+ const size_t size = format_.size();
+ for (size_t pos = 0; pos < size; ++pos) {
+ if (format_[pos] != '%') continue;
+ const size_t next = pos + 1;
+ if (next < size && format_[next] == 'z') {
+ have_zone_offset_ = true;
+ break;
+ }
+ pos = next;
+ }
+ }
- bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
- int64_t* out) const override {
+ bool operator()(const char* s, size_t length, TimeUnit::type out_unit, int64_t* out,
+ bool* out_zone_offset_present = NULLPTR) const override {
+ if (out_zone_offset_present) {
+ *out_zone_offset_present = have_zone_offset_;
+ }
return ParseTimestampStrptime(s, length, format_.c_str(),
/*ignore_time_in_day=*/false,
/*allow_trailing_chars=*/false, out_unit, out);
@@ -57,15 +74,16 @@ class StrptimeTimestampParser : public TimestampParser {
private:
std::string format_;
+ bool have_zone_offset_;
};
class ISO8601Parser : public TimestampParser {
public:
ISO8601Parser() {}
- bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
- int64_t* out) const override {
- return ParseTimestampISO8601(s, length, out_unit, out);
+ bool operator()(const char* s, size_t length, TimeUnit::type out_unit, int64_t* out,
+ bool* out_zone_offset_present = NULLPTR) const override {
+ return ParseTimestampISO8601(s, length, out_unit, out, out_zone_offset_present);
}
const char* kind() const override { return "iso8601"; }
diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h
index d99634e122e..927bcffcca3 100644
--- a/cpp/src/arrow/util/value_parsing.h
+++ b/cpp/src/arrow/util/value_parsing.h
@@ -45,7 +45,8 @@ class ARROW_EXPORT TimestampParser {
virtual ~TimestampParser() = default;
virtual bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
- int64_t* out) const = 0;
+ int64_t* out,
+ bool* out_zone_offset_present = NULLPTR) const = 0;
virtual const char* kind() const = 0;
@@ -495,6 +496,27 @@ static inline bool ParseHH_MM(const char* s, Duration* out) {
return true;
}
+template
+static inline bool ParseHHMM(const char* s, Duration* out) {
+ // Parses the four characters "HHMM" (no separator) at `s` into `*out`.
+ // Returns false if either field is rejected by ParseUnsigned, or if
+ // HH >= 24 or MM >= 60; `*out` is only written on success.
+ uint8_t hh = 0;
+ uint8_t mm = 0;
+ const bool fields_ok =
+ ParseUnsigned(s + 0, 2, &hh) && ParseUnsigned(s + 2, 2, &mm);
+ if (ARROW_PREDICT_FALSE(!fields_ok)) {
+ return false;
+ }
+ const bool in_range = hh < 24 && mm < 60;
+ if (ARROW_PREDICT_FALSE(!in_range)) {
+ return false;
+ }
+ const auto total = std::chrono::hours(hh) + std::chrono::minutes(mm);
+ *out = std::chrono::duration_cast(total);
+ return true;
+}
+
template
static inline bool ParseHH_MM_SS(const char* s, Duration* out) {
uint8_t hours = 0;
@@ -609,10 +631,15 @@ static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type
} // namespace detail
static inline bool ParseTimestampISO8601(const char* s, size_t length,
- TimeUnit::type unit,
- TimestampType::c_type* out) {
+ TimeUnit::type unit, TimestampType::c_type* out,
+ bool* out_zone_offset_present = NULLPTR) {
using seconds_type = std::chrono::duration;
+ // We allow the following zone offset formats:
+ // - (none)
+ // - Z
+ // - [+-]HH(:?MM)?
+ //
// We allow the following formats for all units:
// - "YYYY-MM-DD"
// - "YYYY-MM-DD[ T]hhZ?"
@@ -647,8 +674,44 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
return false;
}
+ if (out_zone_offset_present) {
+ *out_zone_offset_present = false;
+ }
+
+ // Strip a trailing zone designator ("Z", [+-]HH, [+-]HHMM or [+-]HH:MM) from
+ // the input and keep its magnitude in zone_offset.
+ seconds_type zone_offset(0);
if (s[length - 1] == 'Z') {
--length;
+ if (out_zone_offset_present) *out_zone_offset_present = true;
+ } else if (s[length - 3] == '+' || s[length - 3] == '-') {
+ // [+-]HH
+ length -= 3;
+ if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + length + 1, &zone_offset))) {
+ return false;
+ }
+ if (out_zone_offset_present) *out_zone_offset_present = true;
+ } else if (s[length - 5] == '+' || s[length - 5] == '-') {
+ // [+-]HHMM
+ length -= 5;
+ if (ARROW_PREDICT_FALSE(!detail::ParseHHMM(s + length + 1, &zone_offset))) {
+ return false;
+ }
+ if (out_zone_offset_present) *out_zone_offset_present = true;
+ } else if ((s[length - 6] == '+' || s[length - 6] == '-') && (s[length - 3] == ':')) {
+ // [+-]HH:MM
+ length -= 6;
+ if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + length + 1, &zone_offset))) {
+ return false;
+ }
+ if (out_zone_offset_present) *out_zone_offset_present = true;
+ }
+ // A '+' offset is negated before it is added to the result further down.
+ // s[length] is the sign character only when a numeric offset was stripped;
+ // with no designator it is one past the end of the input. A nonzero
+ // zone_offset implies the former, and a zero offset needs no negation.
+ if (zone_offset.count() != 0 && s[length] == '+') {
+ zone_offset *= -1;
}
seconds_type seconds_since_midnight;
@@ -682,6 +739,7 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
}
seconds_since_epoch += seconds_since_midnight;
+ seconds_since_epoch += zone_offset;
if (length <= 19) {
*out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
@@ -702,6 +760,12 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
return true;
}
+#ifdef _WIN32
+static constexpr bool kStrptimeSupportsZone = false;
+#else
+static constexpr bool kStrptimeSupportsZone = true;
+#endif
+
/// \brief Returns time since the UNIX epoch in the requested unit
static inline bool ParseTimestampStrptime(const char* buf, size_t length,
const char* format, bool ignore_time_in_day,
@@ -730,6 +794,9 @@ static inline bool ParseTimestampStrptime(const char* buf, size_t length,
if (!ignore_time_in_day) {
secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) +
std::chrono::seconds(result.tm_sec));
+#ifndef _WIN32
+ secs -= std::chrono::seconds(result.tm_gmtoff);
+#endif
}
*out = util::CastSecondsToUnit(unit, secs.time_since_epoch().count());
return true;
diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc
index ebbb733398d..708d5ecd60f 100644
--- a/cpp/src/arrow/util/value_parsing_test.cc
+++ b/cpp/src/arrow/util/value_parsing_test.cc
@@ -503,6 +503,15 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) {
AssertConversion(type, "1970-01-01 00:00:00", 0);
AssertConversion(type, "2018-11-13 17", 1542128400);
+ AssertConversion(type, "2018-11-13 17+00", 1542128400);
+ AssertConversion(type, "2018-11-13 17+0000", 1542128400);
+ AssertConversion(type, "2018-11-13 17+00:00", 1542128400);
+ AssertConversion(type, "2018-11-13 17+01", 1542124800);
+ AssertConversion(type, "2018-11-13 17+0117", 1542123780);
+ AssertConversion(type, "2018-11-13 17+01:17", 1542123780);
+ AssertConversion(type, "2018-11-13 17-01", 1542132000);
+ AssertConversion(type, "2018-11-13 17-0117", 1542133020);
+ AssertConversion(type, "2018-11-13 17-01:17", 1542133020);
AssertConversion(type, "2018-11-13T17", 1542128400);
AssertConversion(type, "2018-11-13 17Z", 1542128400);
AssertConversion(type, "2018-11-13T17Z", 1542128400);
@@ -510,10 +519,28 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) {
AssertConversion(type, "2018-11-13T17:11", 1542129060);
AssertConversion(type, "2018-11-13 17:11Z", 1542129060);
AssertConversion(type, "2018-11-13T17:11Z", 1542129060);
+ AssertConversion(type, "2018-11-13 17:11+00", 1542129060);
+ AssertConversion(type, "2018-11-13 17:11+0000", 1542129060);
+ AssertConversion(type, "2018-11-13 17:11+00:00", 1542129060);
+ AssertConversion(type, "2018-11-13 17:11+01", 1542125460);
+ AssertConversion(type, "2018-11-13 17:11+0117", 1542124440);
+ AssertConversion(type, "2018-11-13 17:11+01:17", 1542124440);
+ AssertConversion(type, "2018-11-13 17:11-01", 1542132660);
+ AssertConversion(type, "2018-11-13 17:11-0117", 1542133680);
+ AssertConversion(type, "2018-11-13 17:11-01:17", 1542133680);
AssertConversion(type, "2018-11-13 17:11:10", 1542129070);
AssertConversion(type, "2018-11-13T17:11:10", 1542129070);
AssertConversion(type, "2018-11-13 17:11:10Z", 1542129070);
AssertConversion(type, "2018-11-13T17:11:10Z", 1542129070);
+ AssertConversion(type, "2018-11-13T17:11:10+00", 1542129070);
+ AssertConversion(type, "2018-11-13T17:11:10+0000", 1542129070);
+ AssertConversion(type, "2018-11-13T17:11:10+00:00", 1542129070);
+ AssertConversion(type, "2018-11-13T17:11:10+01", 1542125470);
+ AssertConversion(type, "2018-11-13T17:11:10+0117", 1542124450);
+ AssertConversion(type, "2018-11-13T17:11:10+01:17", 1542124450);
+ AssertConversion(type, "2018-11-13T17:11:10-01", 1542132670);
+ AssertConversion(type, "2018-11-13T17:11:10-0117", 1542133690);
+ AssertConversion(type, "2018-11-13T17:11:10-01:17", 1542133690);
AssertConversion(type, "1900-02-28 12:34:56", -2203932304LL);
// No subseconds allowed
@@ -530,6 +557,22 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) {
AssertConversionFails(type, "1970-01-01 00:00:60");
AssertConversionFails(type, "1970-01-01 00:00,00");
AssertConversionFails(type, "1970-01-01 00,00:00");
+ // Invalid zone offsets
+ AssertConversionFails(type, "1970-01-01 00:00+0");
+ AssertConversionFails(type, "1970-01-01 00:00+000");
+ AssertConversionFails(type, "1970-01-01 00:00+00000");
+ AssertConversionFails(type, "1970-01-01 00:00+2400");
+ AssertConversionFails(type, "1970-01-01 00:00+0060");
+ AssertConversionFails(type, "1970-01-01 00-0");
+ AssertConversionFails(type, "1970-01-01 00-000");
+ AssertConversionFails(type, "1970-01-01 00+00000");
+ AssertConversionFails(type, "1970-01-01 00+2400");
+ AssertConversionFails(type, "1970-01-01 00+0060");
+ AssertConversionFails(type, "1970-01-01 00:00:00+0");
+ AssertConversionFails(type, "1970-01-01 00:00:00-000");
+ AssertConversionFails(type, "1970-01-01 00:00:00-00000");
+ AssertConversionFails(type, "1970-01-01 00:00:00+2400");
+ AssertConversionFails(type, "1970-01-01 00:00:00+00:99");
}
{
TimestampType type{TimeUnit::MILLI};
@@ -544,6 +587,13 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) {
AssertConversion(type, "1900-02-28 12:34:56.12", -2203932304000LL + 120LL);
AssertConversion(type, "1900-02-28 12:34:56.123", -2203932304000LL + 123LL);
+ AssertConversion(type, "2018-11-13 17:11:10.123+01", 1542129070123LL - 3600000LL);
+ AssertConversion(type, "2018-11-13 17:11:10.123+0117", 1542129070123LL - 4620000LL);
+ AssertConversion(type, "2018-11-13 17:11:10.123+01:17", 1542129070123LL - 4620000LL);
+ AssertConversion(type, "2018-11-13 17:11:10.123-01", 1542129070123LL + 3600000LL);
+ AssertConversion(type, "2018-11-13 17:11:10.123-0117", 1542129070123LL + 4620000LL);
+ AssertConversion(type, "2018-11-13 17:11:10.123-01:17", 1542129070123LL + 4620000LL);
+
// Invalid subseconds
AssertConversionFails(type, "1900-02-28 12:34:56.1234");
AssertConversionFails(type, "1900-02-28 12:34:56.12345");
@@ -569,6 +619,19 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) {
AssertConversion(type, "1900-02-28 12:34:56.12345", -2203932304000000LL + 123450LL);
AssertConversion(type, "1900-02-28 12:34:56.123456", -2203932304000000LL + 123456LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456+01",
+ -2203932304000000LL + 123456LL - 3600000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456+0117",
+ -2203932304000000LL + 123456LL - 4620000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456+01:17",
+ -2203932304000000LL + 123456LL - 4620000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456-01",
+ -2203932304000000LL + 123456LL + 3600000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456-0117",
+ -2203932304000000LL + 123456LL + 4620000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456-01:17",
+ -2203932304000000LL + 123456LL + 4620000000LL);
+
// Invalid subseconds
AssertConversionFails(type, "1900-02-28 12:34:56.1234567");
AssertConversionFails(type, "1900-02-28 12:34:56.12345678");
@@ -602,7 +665,21 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) {
AssertConversion(type, "1900-02-28 12:34:56.123456789",
-2203932304000000000LL + 123456789LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456789+01",
+ -2203932304000000000LL + 123456789LL - 3600000000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456789+0117",
+ -2203932304000000000LL + 123456789LL - 4620000000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456789+01:17",
+ -2203932304000000000LL + 123456789LL - 4620000000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456789-01",
+ -2203932304000000000LL + 123456789LL + 3600000000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456789-0117",
+ -2203932304000000000LL + 123456789LL + 4620000000000LL);
+ AssertConversion(type, "1900-02-28 12:34:56.123456789-01:17",
+ -2203932304000000000LL + 123456789LL + 4620000000000LL);
+
// Invalid subseconds
+ AssertConversionFails(type, "1900-02-28 12:34:56.1234567890");
}
}
@@ -618,10 +695,7 @@ TEST(TimestampParser, StrptimeParser) {
std::vector cases = {{"5/31/2000 12:34:56", "2000-05-31 12:34:56"},
{"5/31/2000 00:00:00", "2000-05-31 00:00:00"}};
- std::vector units = {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO,
- TimeUnit::NANO};
-
- for (auto unit : units) {
+ for (auto unit : TimeUnit::values()) {
for (const auto& case_ : cases) {
int64_t converted, expected;
ASSERT_TRUE((*parser)(case_.value.c_str(), case_.value.size(), unit, &converted));
@@ -639,5 +713,33 @@ TEST(TimestampParser, StrptimeParser) {
}
}
+// Checks that a strptime-based parser whose format ends in %z accepts values
+// carrying a numeric zone offset (and agrees with the ISO8601 parser on the
+// result), and rejects values that have no offset or a zone abbreviation.
+TEST(TimestampParser, StrptimeZoneOffset) {
+  if (!kStrptimeSupportsZone) {
+    GTEST_SKIP() << "strptime does not support %z on this platform";
+  }
+  // Year-month-day order, so the format describes the same layout as the
+  // ISO8601-style inputs that are cross-checked against ParseTimestampISO8601.
+  std::string format = "%Y-%m-%d %H:%M:%S%z";
+  auto parser = TimestampParser::MakeStrptime(format);
+
+  // N.B. GNU %z supports ISO8601 format while BSD %z supports only
+  // +HHMM or -HHMM and POSIX doesn't appear to define %z at all
+  for (auto unit : TimeUnit::values()) {
+    // Values with a +HHMM/-HHMM offset must parse, and must yield the same
+    // value as the ISO8601 parser for every time unit.
+    for (const std::string& value :
+         {"2018-01-01 00:00:00+0000", "2018-01-01 00:00:00+0100",
+          "2018-01-01 00:00:00+0130", "2018-01-01 00:00:00-0117"}) {
+      SCOPED_TRACE(value);
+      int64_t converted = 0;
+      int64_t expected = 0;
+      ASSERT_TRUE((*parser)(value.c_str(), value.size(), unit, &converted));
+      ASSERT_TRUE(ParseTimestampISO8601(value.c_str(), value.size(), unit, &expected));
+      ASSERT_EQ(expected, converted);
+    }
+    // A missing offset or a zone abbreviation must be rejected by this parser.
+    for (const std::string& value : {"2018-01-01 00:00:00", "2018-01-01 00:00:00EST"}) {
+      SCOPED_TRACE(value);
+      int64_t converted = 0;
+      ASSERT_FALSE((*parser)(value.c_str(), value.size(), unit, &converted));
+    }
+  }
+}
+
} // namespace internal
} // namespace arrow
diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst
index 42b5af67d80..d6bb66ce49e 100644
--- a/docs/source/cpp/csv.rst
+++ b/docs/source/cpp/csv.rst
@@ -190,6 +190,71 @@ dictionary-encoded string-like array. It switches to a plain string-like
array when the threshold in :member:`ConvertOptions::auto_dict_max_cardinality`
is reached.
+Timestamp inference/parsing
+---------------------------
+
+If type inference is enabled, the CSV reader first tries to interpret
+string-like columns as timestamps. If every row has a zone offset
+(e.g. ``Z`` or ``+0100``), even if the offsets differ between rows, then the
+inferred type will be a UTC timestamp. If no row has a zone offset, then the
+inferred type will be a timestamp without timezone. A mix of rows with and
+without offsets will result in a string column.
+
+If the type is explicitly specified as a timestamp with a timezone, the reader
+will error on values lacking a zone offset; if it is specified as a timestamp
+without a timezone, the reader will error on values that have one. Note that
+this means it isn't currently possible to have the reader parse a column of
+timestamps without zone offsets as local times in a particular timezone; instead,
+parse the column as timestamp without timezone, then convert the values
+afterwards using the ``assume_timezone`` compute function.
+
++-------------------+------------------------------+-------------------+
+| Specified Type | Input CSV | Result Type |
++===================+==============================+===================+
+| (inferred) | ``2021-01-01T00:00:00`` | timestamp[s] |
+| +------------------------------+-------------------+
+| | ``2021-01-01T00:00:00Z`` | timestamp[s, UTC] |
+| +------------------------------+ |
+| | ``2021-01-01T00:00:00+0100`` | |
+| +------------------------------+-------------------+
+| | :: | string |
+| | | |
+| | 2021-01-01T00:00:00 | |
+| | 2021-01-01T00:00:00Z | |
++-------------------+------------------------------+-------------------+
+| timestamp[s] | ``2021-01-01T00:00:00`` | timestamp[s] |
+| +------------------------------+-------------------+
+| | ``2021-01-01T00:00:00Z`` | (error) |
+| +------------------------------+ |
+| | ``2021-01-01T00:00:00+0100`` | |
+| +------------------------------+ |
+| | :: | |
+| | | |
+| | 2021-01-01T00:00:00 | |
+| | 2021-01-01T00:00:00Z | |
++-------------------+------------------------------+-------------------+
+| timestamp[s, UTC] | ``2021-01-01T00:00:00`` | (error) |
+| +------------------------------+-------------------+
+| | ``2021-01-01T00:00:00Z`` | timestamp[s, UTC] |
+| +------------------------------+ |
+| | ``2021-01-01T00:00:00+0100`` | |
+| +------------------------------+-------------------+
+| | :: | (error) |
+| | | |
+| | 2021-01-01T00:00:00 | |
+| | 2021-01-01T00:00:00Z | |
++-------------------+------------------------------+-------------------+
+| timestamp[s, | ``2021-01-01T00:00:00`` | (error) |
+| America/New_York] +------------------------------+-------------------+
+| | ``2021-01-01T00:00:00Z`` | timestamp[s, |
+| +------------------------------+ America/New_York] |
+| | ``2021-01-01T00:00:00+0100`` | |
+| +------------------------------+-------------------+
+| | :: | (error) |
+| | | |
+| | 2021-01-01T00:00:00 | |
+| | 2021-01-01T00:00:00Z | |
++-------------------+------------------------------+-------------------+
+
Nulls
-----
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index f0416eb3f72..061fbbd4cdb 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -219,10 +219,10 @@ test_that("read_csv_arrow() can read timestamps", {
on.exit(unlink(tf))
write.csv(tbl, tf, row.names = FALSE)
- df <- read_csv_arrow(tf, col_types = schema(time = timestamp(timezone = "UTC")))
- expect_equal(tbl, df)
-
+ df <- read_csv_arrow(tf, col_types = schema(time = timestamp()))
# time zones are being read in as time zone-naive, hence ignore_attr = "tzone"
+ expect_equal(tbl, df, ignore_attr = "tzone")
+
df <- read_csv_arrow(tf, col_types = "T", col_names = "time", skip = 1)
expect_equal(tbl, df, ignore_attr = "tzone")
})
@@ -235,10 +235,12 @@ test_that("read_csv_arrow(timestamp_parsers=)", {
df <- read_csv_arrow(
tf,
- col_types = schema(time = timestamp(timezone = "UTC")),
+ col_types = schema(time = timestamp()),
timestamp_parsers = "%d/%m/%Y"
)
- expect_equal(df$time, as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC"))
+ # time zones are being read in as time zone-naive, hence ignore_attr = "tzone"
+ expected <- as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC")
+ expect_equal(df$time, expected, ignore_attr = "tzone")
})
test_that("Skipping columns with null()", {
From e0ec8b03f01c5b32c8796adfd6ed7e0553db5d84 Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 8 Nov 2021 11:28:14 -0500
Subject: [PATCH 2/2] ARROW-12820: [C++] Fix some oversights
---
cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 5 +++++
cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 608ca17bbe7..b5cafead6b2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -1964,6 +1964,11 @@ TEST(Cast, StringToTimestamp) {
Invalid, ::testing::HasSubstr("expected a zone offset"),
Cast(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND, "UTC"))));
+ // Unzoned should not parse as timestamp with timezone
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, ::testing::HasSubstr("expected a zone offset"),
+ Cast(strings, CastOptions::Safe(timestamp(TimeUnit::SECOND, "UTC"))));
+
// Timestamp with zone offset can parse as any time zone (since they're unambiguous)
CheckCast(zoned, ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"),
"[1582934400, 1583140152]"));
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index c2e56e9d3f4..b567f4f351b 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -3634,7 +3634,7 @@ Result ResolveStrptimeOutput(KernelContext* ctx,
std::string zone = "";
while (cur < options.format.size() - 1) {
if (options.format[cur] == '%') {
- if (cur + 1 < options.format.size() && options.format[cur + 1] == 'z') {
+ if (options.format[cur + 1] == 'z') {
zone = "UTC";
break;
}