Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

namespace arrow {

using internal::ParseValue;
using internal::ParseTimestampISO8601;

namespace compute {
namespace internal {
Expand Down Expand Up @@ -422,17 +422,34 @@ struct CastFunctor<TimestampType, Date64Type> {
// String to Timestamp

struct ParseTimestamp {
explicit ParseTimestamp(const TimestampType& type)
: type(type), expect_timezone(!type.timezone().empty()) {}
template <typename OutValue, typename Arg0Value>
OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
OutValue result = 0;
if (ARROW_PREDICT_FALSE(!ParseValue(type, val.data(), val.size(), &result))) {
bool zone_offset_present = false;
if (ARROW_PREDICT_FALSE(!ParseTimestampISO8601(val.data(), val.size(), type.unit(),
&result, &zone_offset_present))) {
*st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
type.ToString());
}
if (zone_offset_present != expect_timezone) {
if (expect_timezone) {
*st = Status::Invalid(
"Failed to parse string: '", val, "' as a scalar of type ", type.ToString(),
"expected a zone offset. If these timestamps "
"are in local time, cast to timestamp without timezone, then "
"call assume_timezone.");
} else {
*st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
type.ToString(), "expected no zone offset");
}
}
return result;
}

const TimestampType& type;
bool expect_timezone;
};

template <typename I>
Expand Down
32 changes: 31 additions & 1 deletion cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1945,7 +1945,37 @@ TEST(Cast, StringToTimestamp) {
}
}

// NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc
auto zoned = ArrayFromJSON(string_type,
R"(["2020-02-29T00:00:00Z", "2020-03-02T10:11:12+0102"])");
auto mixed = ArrayFromJSON(string_type,
R"(["2020-03-02T10:11:12+0102", "2020-02-29T00:00:00"])");

// Timestamp with zone offset should not parse as naive
CheckCastFails(zoned, CastOptions::Safe(timestamp(TimeUnit::SECOND)));

// Mixed zoned/unzoned should not parse as naive
CheckCastFails(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND)));
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr("expected no zone offset"),
Cast(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND))));

// ...or as timestamp with timezone
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr("expected a zone offset"),
Cast(mixed, CastOptions::Safe(timestamp(TimeUnit::SECOND, "UTC"))));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test strings here for the case of fully unzoned? (although in code that probably triggers the same check as the mixed case?)


// Unzoned should not parse as timestamp with timezone
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr("expected a zone offset"),
Cast(strings, CastOptions::Safe(timestamp(TimeUnit::SECOND, "UTC"))));

// Timestamp with zone offset can parse as any time zone (since they're unambiguous)
CheckCast(zoned, ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"),
"[1582934400, 1583140152]"));
CheckCast(zoned, ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"),
"[1582934400, 1583140152]"));

// NOTE: timestamp parsing is tested comprehensively in value_parsing_test.cc
}
}

Expand Down
21 changes: 17 additions & 4 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3625,11 +3625,24 @@ struct StrptimeExec {

Result<ValueDescr> ResolveStrptimeOutput(KernelContext* ctx,
const std::vector<ValueDescr>&) {
if (ctx->state()) {
return ::arrow::timestamp(StrptimeState::Get(ctx).unit);
if (!ctx->state()) {
return Status::Invalid("strptime does not provide default StrptimeOptions");
}
const StrptimeOptions& options = StrptimeState::Get(ctx);
// Check for use of %z or %Z
size_t cur = 0;
std::string zone = "";
while (cur < options.format.size() - 1) {
if (options.format[cur] == '%') {
if (options.format[cur + 1] == 'z') {
zone = "UTC";
break;
}
cur++;
}
cur++;
}

return Status::Invalid("strptime does not provide default StrptimeOptions");
return ::arrow::timestamp(options.unit, zone);
}

// ----------------------------------------------------------------------
Expand Down
19 changes: 19 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "arrow/compute/kernels/test_util.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type.h"
#include "arrow/util/value_parsing.h"

namespace arrow {
namespace compute {
Expand Down Expand Up @@ -1757,6 +1758,24 @@ TYPED_TEST(TestStringKernels, Strptime) {
std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);

input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
options.format = "%m/%d/%Y %%z";
this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
}

TYPED_TEST(TestStringKernels, StrptimeZoneOffset) {
if (!arrow::internal::kStrptimeSupportsZone) {
GTEST_SKIP() << "strptime does not support %z on this platform";
}
// N.B. BSD strptime only supports (+/-)HHMM and not the wider range
// of values GNU strptime supports.
std::string input1 = R"(["5/1/2020 +0100", null, "12/11/1900 -0130"])";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also add the case of parsing a string with trailing Z?

That now seems to work with this PR:

In [26]: pc.strptime(["2012-01-01 09:00:00Z"], format="%Y-%m-%d %H:%M:%S%z", unit="s")
Out[26]: 
<pyarrow.lib.TimestampArray object at 0x7fb709a65760>
[
  2012-01-01 09:00:00
]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, so this actually already worked before as well, but the resulting type is different (timezone naive vs aware). So might still be worth checking that change in type explicitly, unless that is already covered elsewhere in the tests.

Copy link
Member Author

@lidavidm lidavidm Nov 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A couple things here:

  • BSD strptime doesn't support "Z" as noted in the comment. It only supports the syntax here, making it hard to test.
  • The result type is based on the presence of a "%z" so that's covered here already.

std::string output1 =
R"(["2020-04-30T23:00:00.000000", null, "1900-12-11T01:30:00.000000"])";
StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO);
this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO, "UTC"), output1,
&options);
}

TYPED_TEST(TestStringKernels, StrptimeDoesNotProvideDefaultOptions) {
Expand Down
85 changes: 85 additions & 0 deletions cpp/src/arrow/csv/column_builder_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/task_group.h"
#include "arrow/util/thread_pool.h"
#include "arrow/util/value_parsing.h"

namespace arrow {
namespace csv {
Expand Down Expand Up @@ -427,6 +428,13 @@ TEST_F(InferringColumnBuilderTest, SingleChunkTimestamp) {
{{false, true, true}}, {{0, 0, 1542129070}},
&expected);
CheckInferred(tg, {{"", "1970-01-01", "2018-11-13 17:11:10"}}, options, expected);

options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%Y/%m/%d"));
tg = TaskGroup::MakeSerial();
ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND),
{{false, true, true}}, {{0, 0, 1542067200}},
&expected);
CheckInferred(tg, {{"", "1970/01/01", "2018/11/13"}}, options, expected);
}

TEST_F(InferringColumnBuilderTest, MultipleChunkTimestamp) {
Expand All @@ -438,6 +446,13 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkTimestamp) {
{{false}, {true}, {true}},
{{0}, {0}, {1542129070}}, &expected);
CheckInferred(tg, {{""}, {"1970-01-01"}, {"2018-11-13 17:11:10"}}, options, expected);

options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%Y/%m/%d"));
tg = TaskGroup::MakeSerial();
ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND),
{{false}, {true}, {true}},
{{0}, {0}, {1542067200}}, &expected);
CheckInferred(tg, {{""}, {"1970/01/01"}, {"2018/11/13"}}, options, expected);
}

TEST_F(InferringColumnBuilderTest, SingleChunkTimestampNS) {
Expand Down Expand Up @@ -471,6 +486,76 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampNS) {
options, expected);
}

TEST_F(InferringColumnBuilderTest, SingleChunkTimestampWithZone) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();

std::shared_ptr<ChunkedArray> expected;
ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND, "UTC"),
{{false, true, true}}, {{0, 0, 1542129010}},
&expected);
CheckInferred(tg, {{"", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10+0001"}}, options,
expected);

tg = TaskGroup::MakeSerial();
expected = ChunkedArrayFromJSON(
utf8(), {R"(["", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10"])"});
CheckInferred(tg, {{"", "1970-01-01T00:00:00Z", "2018-11-13 17:11:10"}}, options,
expected);
}

TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampWithZone) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();

std::shared_ptr<ChunkedArray> expected;
ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND, "UTC"),
{{false}, {true}, {true}},
{{0}, {0}, {1542129010}}, &expected);
CheckInferred(tg, {{""}, {"1970-01-01T00:00:00Z"}, {"2018-11-13 17:11:10+0001"}},
options, expected);

tg = TaskGroup::MakeSerial();
expected = ChunkedArrayFromJSON(
utf8(), {R"([""])", R"(["1970-01-01T00:00:00Z"])", R"(["2018-11-13 17:11:10"])"});
CheckInferred(tg, {{""}, {"1970-01-01T00:00:00Z"}, {"2018-11-13 17:11:10"}}, options,
expected);
}

TEST_F(InferringColumnBuilderTest, SingleChunkTimestampWithZoneNS) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();

std::shared_ptr<ChunkedArray> expected;
ChunkedArrayFromVector<TimestampType>(
timestamp(TimeUnit::NANO, "UTC"), {{false, true, true, true, true}},
{{0, 3660000000000, 1542129070123000000, 1542129070123456000, 1542129070123456789}},
&expected);
CheckInferred(tg,
{{"", "1970-01-01T00:00:00-0101", "2018-11-13 17:11:10.123Z",
"2018-11-13 17:11:10.123456Z", "2018-11-13 17:11:10.123456789Z"}},
options, expected);
}

TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampWithZoneNS) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();

std::shared_ptr<ChunkedArray> expected;
ChunkedArrayFromVector<TimestampType>(
timestamp(TimeUnit::NANO, "UTC"), {{false}, {true}, {true, true, true}},
{{0},
{3660000000000},
{1542129070123000000, 1542129070123456000, 1542129070123456789}},
&expected);
CheckInferred(tg,
{{""},
{"1970-01-01T00:00:00-0101"},
{"2018-11-13 17:11:10.123Z", "2018-11-13 17:11:10.123456Z",
"2018-11-13 17:11:10.123456789Z"}},
options, expected);
}

TEST_F(InferringColumnBuilderTest, SingleChunkIntegerAndTime) {
// Fallback to utf-8
auto options = ConvertOptions::Defaults();
Expand Down
54 changes: 48 additions & 6 deletions cpp/src/arrow/csv/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -350,18 +350,37 @@ struct InlineISO8601ValueDecoder : public ValueDecoder {
explicit InlineISO8601ValueDecoder(const std::shared_ptr<DataType>& type,
const ConvertOptions& options)
: ValueDecoder(type, options),
unit_(checked_cast<const TimestampType&>(*type_).unit()) {}
unit_(checked_cast<const TimestampType&>(*type_).unit()),
expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()) {
}

Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
if (ARROW_PREDICT_FALSE(!internal::ParseTimestampISO8601(
reinterpret_cast<const char*>(data), size, unit_, out))) {
bool zone_offset_present = false;
if (ARROW_PREDICT_FALSE(
!internal::ParseTimestampISO8601(reinterpret_cast<const char*>(data), size,
unit_, out, &zone_offset_present))) {
return GenericConversionError(type_, data, size);
}
if (zone_offset_present != expect_timezone_) {
if (expect_timezone_) {
return Status::Invalid("CSV conversion error to ", type_->ToString(),
": expected a zone offset in '",
std::string(reinterpret_cast<const char*>(data), size),
"'. If these timestamps are in local time, parse them as "
"timestamps without timezone, then call assume_timezone.");
} else {
return Status::Invalid("CSV conversion error to ", type_->ToString(),
": expected no zone offset in '",
std::string(reinterpret_cast<const char*>(data), size),
"'");
}
}
return Status::OK();
}

protected:
TimeUnit::type unit_;
bool expect_timezone_;
};

struct SingleParserTimestampValueDecoder : public ValueDecoder {
Expand All @@ -371,18 +390,36 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder {
const ConvertOptions& options)
: ValueDecoder(type, options),
unit_(checked_cast<const TimestampType&>(*type_).unit()),
expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()),
parser_(*options_.timestamp_parsers[0]) {}

Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
if (ARROW_PREDICT_FALSE(
!parser_(reinterpret_cast<const char*>(data), size, unit_, out))) {
bool zone_offset_present = false;
if (ARROW_PREDICT_FALSE(!parser_(reinterpret_cast<const char*>(data), size, unit_,
out, &zone_offset_present))) {
return GenericConversionError(type_, data, size);
}
if (zone_offset_present != expect_timezone_) {
if (expect_timezone_) {
return Status::Invalid("CSV conversion error to ", type_->ToString(),
": expected a zone offset in '",
std::string(reinterpret_cast<const char*>(data), size),
"'. If these timestamps are in local time, parse them as "
"timestamps without timezone, then call assume_timezone. "
"If using strptime, ensure '%z' is in the format string.");
} else {
return Status::Invalid("CSV conversion error to ", type_->ToString(),
": expected no zone offset in '",
std::string(reinterpret_cast<const char*>(data), size),
"'");
}
}
return Status::OK();
}

protected:
TimeUnit::type unit_;
bool expect_timezone_;
const TimestampParser& parser_;
};

Expand All @@ -393,11 +430,15 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
const ConvertOptions& options)
: ValueDecoder(type, options),
unit_(checked_cast<const TimestampType&>(*type_).unit()),
expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()),
parsers_(GetParsers(options_)) {}

Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
bool zone_offset_present = false;
for (const auto& parser : parsers_) {
if (parser->operator()(reinterpret_cast<const char*>(data), size, unit_, out)) {
if (parser->operator()(reinterpret_cast<const char*>(data), size, unit_, out,
&zone_offset_present) &&
zone_offset_present == expect_timezone_) {
return Status::OK();
}
}
Expand All @@ -416,6 +457,7 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
}

TimeUnit::type unit_;
bool expect_timezone_;
std::vector<const TimestampParser*> parsers_;
};

Expand Down
Loading