Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class ARROW_EXPORT StrftimeOptions : public FunctionOptions {

constexpr static char const kTypeName[] = "StrftimeOptions";

constexpr static const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%SZ";
constexpr static const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%S";

/// The desired format string.
std::string format;
Expand Down
94 changes: 69 additions & 25 deletions cpp/src/arrow/compute/kernels/scalar_temporal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -476,11 +476,17 @@ struct Strftime {
static Result<Strftime> Make(KernelContext* ctx, const DataType& type) {
const StrftimeOptions& options = StrftimeState::Get(ctx);

const auto& timezone = GetInputTimezone(type);
auto timezone = GetInputTimezone(type);
if (timezone.empty()) {
return Status::Invalid(
"Timestamps without a time zone cannot be reliably formatted.");
if ((options.format.find("%z") != std::string::npos) ||
(options.format.find("%Z") != std::string::npos)) {
return Status::Invalid(
"Timezone not present, cannot convert to string with timezone: ",
options.format);
}
timezone = "UTC";
}

ARROW_ASSIGN_OR_RAISE(const time_zone* tz, LocateZone(timezone));

ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale(options.locale));
Expand Down Expand Up @@ -737,107 +743,145 @@ std::shared_ptr<ScalarFunction> MakeSimpleUnaryTemporal(

const FunctionDoc year_doc{
"Extract year from timestamp",
"Returns an error if timestamp has a defined timezone. Null values return null.",
("Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc month_doc{
"Extract month number",
("Month is encoded as January=1, December=12.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc day_doc{
"Extract day number",
"Returns an error if timestamp has a defined timezone. Null values return null.",
("Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc day_of_week_doc{
"Extract day of the week number",
("By default, the week starts on Monday represented by 0 and ends on Sunday "
("By default, the week starts on Monday represented by 0 and ends on Sunday\n"
"represented by 6.\n"
"DayOfWeekOptions.week_start can be used to set another starting day using ISO "
"convention (Monday=1, Sunday=7). Day numbering can start with 0 or 1 using "
"DayOfWeekOptions.one_based_numbering parameter.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"`DayOfWeekOptions.week_start` can be used to set another starting day using\n"
"the ISO numbering convention (1=start week on Monday, 7=start week on Sunday).\n"
"Day numbers can start at 0 or 1 based on `DayOfWeekOptions.one_based_numbering`.\n"
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"},
"DayOfWeekOptions"};

const FunctionDoc day_of_year_doc{
"Extract number of day of year",
("January 1st maps to day number 1, February 1st to 32, etc.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc iso_year_doc{
"Extract ISO year number",
("First week of an ISO year has the majority (4 or more) of its days in January.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc iso_week_doc{
"Extract ISO week of year number",
("First ISO week has the majority (4 or more) of its days in January.\n"
"Week of the year starts with 1 and can run up to 53.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc iso_calendar_doc{
"Extract (ISO year, ISO week, ISO day of week) struct",
("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc quarter_doc{
"Extract quarter of year number",
("First quarter maps to 1 and fourth quarter maps to 4.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc hour_doc{
"Extract hour value",
"Returns an error if timestamp has a defined timezone. Null values return null.",
("Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc minute_doc{
"Extract minute values",
"Returns an error if timestamp has a defined timezone. Null values return null.",
("Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc second_doc{
"Extract second values",
"Returns an error if timestamp has a defined timezone. Null values return null.",
("Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc millisecond_doc{
"Extract millisecond values",
("Millisecond returns number of milliseconds since the last full second.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc microsecond_doc{
"Extract microsecond values",
("Microsecond returns number of microseconds since the last full millisecond.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc nanosecond_doc{
"Extract nanosecond values",
("Nanosecond returns number of nanoseconds since the last full microsecond.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc subsecond_doc{
"Extract subsecond values",
("Subsecond returns the fraction of a second since the last full second.\n"
"Returns an error if timestamp has a defined timezone. Null values return null."),
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

const FunctionDoc strftime_doc{
"Format timestamps according to a format string",
("For each input timestamp, emit a formatted string.\n"
"The time format string and locale can be set using StrftimeOptions.\n"
"An error is returned if the timestamps don't have a defined timezone,\n"
"or if the timezone cannot be found in the timezone database."),
"The output precision of the \"%S\" (seconds) format code depends on\n"
"the input timestamp precision: it is an integer for timestamps with\n"
"second precision, a real number with the required number of fractional\n"
"digits for higher precisions.\n"
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database, or if the specified locale\n"
"does not exist on this system."),
{"timestamps"},
"StrftimeOptions"};

Expand Down
16 changes: 12 additions & 4 deletions cpp/src/arrow/compute/kernels/scalar_temporal_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ TEST_F(ScalarTemporalTest, Strftime) {
const char* nanoseconds = R"(["1970-01-01T00:00:59.123456789", null])";

const char* default_seconds = R"(
["1970-01-01T00:00:59Z", "2021-08-18T15:11:50Z", null])";
["1970-01-01T00:00:59", "2021-08-18T15:11:50", null])";
const char* string_seconds = R"(
["1970-01-01T00:00:59+0000", "2021-08-18T15:11:50+0000", null])";
const char* string_milliseconds = R"(["1970-01-01T00:00:59.123+0000", null])";
Expand All @@ -414,12 +414,20 @@ TEST_F(ScalarTemporalTest, Strftime) {
}

TEST_F(ScalarTemporalTest, StrftimeNoTimezone) {
auto options_default = StrftimeOptions();
const char* seconds = R"(["1970-01-01T00:00:59", null])";
auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND), seconds);

CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND), seconds, utf8(), seconds,
&options_default);
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid,
testing::HasSubstr("Timestamps without a time zone cannot be reliably formatted"),
Strftime(arr, StrftimeOptions()));
testing::HasSubstr("Invalid: Timezone not present, cannot convert to string"),
Strftime(arr, StrftimeOptions("%Y-%m-%dT%H:%M:%S%z")));
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid,
testing::HasSubstr("Invalid: Timezone not present, cannot convert to string"),
Strftime(arr, StrftimeOptions("%Y-%m-%dT%H:%M:%S%Z")));
}

TEST_F(ScalarTemporalTest, StrftimeInvalidTimezone) {
Expand All @@ -440,7 +448,7 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) {
const char* microseconds = R"(["1970-01-01T00:00:59.123456", null])";
const char* nanoseconds = R"(["1970-01-01T00:00:59.123456789", null])";

const char* default_seconds = R"(["1970-01-01T00:00:59Z", null])";
const char* default_seconds = R"(["1970-01-01T00:00:59", null])";
const char* string_seconds = R"(["1970-01-01T00:00:59+0000", null])";
const char* string_milliseconds = R"(["1970-01-01T00:00:59.123+0000", null])";
const char* string_microseconds = R"(["1970-01-01T05:30:59.123456+0530", null])";
Expand Down
28 changes: 19 additions & 9 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1105,19 +1105,29 @@ number of input and output types. The type to cast to can be passed in a
:struct:`CastOptions` instance. As an alternative, the same service is
provided by a concrete function :func:`~arrow::compute::Cast`.

+--------------------------+------------+--------------------+------------------+------------------------------+
| Function name | Arity | Input types | Output type | Options class |
+==========================+============+====================+==================+==============================+
| cast | Unary | Many | Variable | :struct:`CastOptions` |
+--------------------------+------------+--------------------+------------------+------------------------------+
| strftime | Unary | Timestamp | String | :struct:`StrftimeOptions` |
+--------------------------+------------+--------------------+------------------+------------------------------+
| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` |
+--------------------------+------------+--------------------+------------------+------------------------------+
+-----------------+------------+--------------------+------------------+------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+=================+============+====================+==================+==============================+=======+
| cast | Unary | Many | Variable | :struct:`CastOptions` | |
+-----------------+------------+--------------------+------------------+------------------------------+-------+
| strftime | Unary | Timestamp | String | :struct:`StrftimeOptions` | \(1) |
+-----------------+------------+--------------------+------------------+------------------------------+-------+
| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` | |
+-----------------+------------+--------------------+------------------+------------------------------+-------+

The conversions available with ``cast`` are listed below. In all cases, a
null input value is converted into a null output value.

* \(1) Output precision of the ``%S`` (seconds) flag depends on the input timestamp
precision. Timestamps with second precision are represented as integers, while
timestamps with millisecond, microsecond and nanosecond precision are represented
as fixed-point decimal numbers with 3, 6 and 9 decimal places respectively. To obtain integer
seconds, cast to timestamp with second resolution.
The character for the decimal point is localized according to the locale.
See `detailed formatting documentation`_ for descriptions of other flags.

.. _detailed formatting documentation: https://howardhinnant.github.io/date/date.html#to_stream_formatting

**Truth value extraction**

+-----------------------------+------------------------------------+--------------+
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/_compute.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -989,7 +989,7 @@ cdef class _StrftimeOptions(FunctionOptions):


class StrftimeOptions(_StrftimeOptions):
def __init__(self, format="%Y-%m-%dT%H:%M:%SZ", locale="C"):
def __init__(self, format="%Y-%m-%dT%H:%M:%S", locale="C"):
self._set_options(format, locale)


Expand Down
37 changes: 27 additions & 10 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -1496,10 +1496,18 @@ def _fix_timestamp(s):
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
assert result.equals(expected)

fmt = "%Y-%m-%dT%H:%M:%S"

# Default format
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions())
expected = pa.array(_fix_timestamp(ts.strftime("%Y-%m-%dT%H:%M:%SZ")))
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
assert result.equals(expected)

# Default format plus timezone
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z")))
assert result.equals(expected)

# Pandas %S is equivalent to %S in arrow for unit="s"
Expand All @@ -1518,18 +1526,27 @@ def _fix_timestamp(s):

# Test setting locale
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C")
options = pc.StrftimeOptions(fmt, "C")
result = pc.strftime(tsa, options=options)
expected = pa.array(_fix_timestamp(ts.strftime("%Y-%m-%dT%H:%M:%SZ")))
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
assert result.equals(expected)

for unit in ["s", "ms", "us", "ns"]:
tsa = pa.array(ts, type=pa.timestamp(unit))
for fmt in formats:
with pytest.raises(pa.ArrowInvalid,
match="Timestamps without a time zone "
"cannot be reliably formatted"):
pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
# Test timestamps without timezone
fmt = "%Y-%m-%dT%H:%M:%S"
ts = pd.to_datetime(times)
tsa = pa.array(ts, type=pa.timestamp("s"))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))

assert result.equals(expected)
with pytest.raises(
pa.ArrowInvalid,
match="Timezone not present, cannot convert to string"):
pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
with pytest.raises(
pa.ArrowInvalid,
match="Timezone not present, cannot convert to string"):
pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%z"))


def _check_datetime_components(timestamps, timezone=None):
Expand Down