diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h
index 2cbc0fde2b2..e959884b233 100644
--- a/cpp/src/arrow/compute/api_scalar.h
+++ b/cpp/src/arrow/compute/api_scalar.h
@@ -185,7 +185,7 @@ class ARROW_EXPORT StrftimeOptions : public FunctionOptions {
 
   constexpr static char const kTypeName[] = "StrftimeOptions";
 
-  constexpr static const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%SZ";
+  constexpr static const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%S";
 
   /// The desired format string.
   std::string format;
diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc
index 44c7f75a038..d70411f8338 100644
--- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc
@@ -476,11 +476,28 @@ struct Strftime {
   static Result Make(KernelContext* ctx, const DataType& type) {
     const StrftimeOptions& options = StrftimeState::Get(ctx);
 
-    const auto& timezone = GetInputTimezone(type);
+    auto timezone = GetInputTimezone(type);
     if (timezone.empty()) {
-      return Status::Invalid(
-          "Timestamps without a time zone cannot be reliably formatted.");
+      // Timestamps without a timezone cannot honour a UTC-offset or zone-name flag.
+      // Walk the conversion specifiers one at a time so that the "%%" escape is
+      // not mistaken for a flag and the modified forms "%Ez" / "%Oz" are caught.
+      const std::string& format = options.format;
+      for (size_t pos = format.find('%'); pos != std::string::npos;
+           pos = format.find('%', pos + 2)) {
+        size_t spec = pos + 1;
+        if (spec < format.size() && (format[spec] == 'E' || format[spec] == 'O')) {
+          ++spec;
+        }
+        if (spec < format.size() && (format[spec] == 'z' || format[spec] == 'Z')) {
+          return Status::Invalid(
+              "Timezone not present, cannot convert to string with timezone: ",
+              format);
+        }
+      }
+      // No timezone-dependent flag is present: format the values as if in UTC.
+      timezone = "UTC";
     }
+
     ARROW_ASSIGN_OR_RAISE(const time_zone* tz, LocateZone(timezone));
 
     ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale(options.locale));
@@ -737,107 +754,146 @@ std::shared_ptr MakeSimpleUnaryTemporal(
 
 const FunctionDoc year_doc{
     "Extract year from timestamp",
-    "Returns an error if timestamp has a defined timezone. Null values return null.",
+    ("Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc month_doc{
     "Extract month number",
     ("Month is encoded as January=1, December=12.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc day_doc{
     "Extract day number",
-    "Returns an error if timestamp has a defined timezone. Null values return null.",
+    ("Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc day_of_week_doc{
     "Extract day of the week number",
-    ("By default, the week starts on Monday represented by 0 and ends on Sunday "
+    ("By default, the week starts on Monday represented by 0 and ends on Sunday\n"
      "represented by 6.\n"
-     "DayOfWeekOptions.week_start can be used to set another starting day using ISO "
-     "convention (Monday=1, Sunday=7). Day numbering can start with 0 or 1 using "
-     "DayOfWeekOptions.one_based_numbering parameter.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+     "`DayOfWeekOptions.week_start` can be used to set another starting day using\n"
+     "the ISO numbering convention (1=start week on Monday, 7=start week on Sunday).\n"
+     "Day numbers can start at 0 or 1 based on `DayOfWeekOptions.one_based_numbering`.\n"
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"},
     "DayOfWeekOptions"};
 
 const FunctionDoc day_of_year_doc{
     "Extract number of day of year",
     ("January 1st maps to day number 1, February 1st to 32, etc.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc iso_year_doc{
     "Extract ISO year number",
-    ("First week of an ISO year has the majority (4 or more) of its days in January."
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+    ("First week of an ISO year has the majority (4 or more) of its days in January.\n"
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc iso_week_doc{
     "Extract ISO week of year number",
     ("First ISO week has the majority (4 or more) of its days in January.\n"
      "Week of the year starts with 1 and can run up to 53.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc iso_calendar_doc{
     "Extract (ISO year, ISO week, ISO day of week) struct",
     ("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc quarter_doc{
     "Extract quarter of year number",
-    ("First quarter maps to 1 and forth quarter maps to 4.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+    ("First quarter maps to 1 and fourth quarter maps to 4.\n"
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc hour_doc{
     "Extract hour value",
-    "Returns an error if timestamp has a defined timezone. Null values return null.",
+    ("Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc minute_doc{
     "Extract minute values",
-    "Returns an error if timestamp has a defined timezone. Null values return null.",
+    ("Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc second_doc{
     "Extract second values",
-    "Returns an error if timestamp has a defined timezone. Null values return null.",
+    ("Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc millisecond_doc{
     "Extract millisecond values",
     ("Millisecond returns number of milliseconds since the last full second.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc microsecond_doc{
     "Extract microsecond values",
-    ("Millisecond returns number of microseconds since the last full millisecond.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+    ("Microsecond returns number of microseconds since the last full millisecond.\n"
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc nanosecond_doc{
     "Extract nanosecond values",
     ("Nanosecond returns number of nanoseconds since the last full microsecond.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc subsecond_doc{
     "Extract subsecond values",
     ("Subsecond returns the fraction of a second since the last full second.\n"
-     "Returns an error if timestamp has a defined timezone. Null values return null."),
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database."),
     {"values"}};
 
 const FunctionDoc strftime_doc{
     "Format timestamps according to a format string",
     ("For each input timestamp, emit a formatted string.\n"
      "The time format string and locale can be set using StrftimeOptions.\n"
-     "An error is returned if the timestamps don't have a defined timezone,\n"
-     "or if the timezone cannot be found in the timezone database."),
+     "The output precision of the \"%S\" (seconds) format code depends on\n"
+     "the input timestamp precision: it is an integer for timestamps with\n"
+     "second precision, a real number with the required number of fractional\n"
+     "digits for higher precisions.\n"
+     "Null values emit null.\n"
+     "An error is returned if the timestamps have a defined timezone but it\n"
+     "cannot be found in the timezone database, if the timestamps have no\n"
+     "timezone but the format string prints one (\"%z\" or \"%Z\"), or if the\n"
+     "specified locale does not exist on this system."),
     {"timestamps"},
     "StrftimeOptions"};
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc
index d8199089328..32e46ae5818 100644
--- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc
@@ -394,7 +394,7 @@ TEST_F(ScalarTemporalTest, Strftime) {
   const char* nanoseconds = R"(["1970-01-01T00:00:59.123456789", null])";
 
   const char* default_seconds = R"(
-      ["1970-01-01T00:00:59Z", "2021-08-18T15:11:50Z", null])";
+      ["1970-01-01T00:00:59", "2021-08-18T15:11:50", null])";
   const char* string_seconds = R"(
       ["1970-01-01T00:00:59+0000", "2021-08-18T15:11:50+0000", null])";
   const char* string_milliseconds = R"(["1970-01-01T00:00:59.123+0000", null])";
@@ -414,12 +414,25 @@ TEST_F(ScalarTemporalTest, Strftime) {
 }
 
 TEST_F(ScalarTemporalTest, StrftimeNoTimezone) {
+  auto options_default = StrftimeOptions();
   const char* seconds = R"(["1970-01-01T00:00:59", null])";
   auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND), seconds);
+
+  CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND), seconds, utf8(), seconds,
+                   &options_default);
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid,
-      testing::HasSubstr("Timestamps without a time zone cannot be reliably formatted"),
-      Strftime(arr, StrftimeOptions()));
+      testing::HasSubstr("Invalid: Timezone not present, cannot convert to string"),
+      Strftime(arr, StrftimeOptions("%Y-%m-%dT%H:%M:%S%z")));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Timezone not present, cannot convert to string"),
+      Strftime(arr, StrftimeOptions("%Y-%m-%dT%H:%M:%S%Z")));
+  // The modified offset flag ("%Ez") must be rejected as well.
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Timezone not present, cannot convert to string"),
+      Strftime(arr, StrftimeOptions("%Y-%m-%dT%H:%M:%S%Ez")));
 }
 
 TEST_F(ScalarTemporalTest, StrftimeInvalidTimezone) {
@@ -440,7 +453,7 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) {
   const char* microseconds = R"(["1970-01-01T00:00:59.123456", null])";
   const char* nanoseconds = R"(["1970-01-01T00:00:59.123456789", null])";
 
-  const char* default_seconds = R"(["1970-01-01T00:00:59Z", null])";
+  const char* default_seconds = R"(["1970-01-01T00:00:59", null])";
   const char* string_seconds = R"(["1970-01-01T00:00:59+0000", null])";
   const char* string_milliseconds = R"(["1970-01-01T00:00:59.123+0000", null])";
   const char* string_microseconds = R"(["1970-01-01T05:30:59.123456+0530", null])";
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 7263d77acf2..f4ef440f100 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1105,19 +1105,29 @@ number of input and output types. The type to cast to can be passed in a
 :struct:`CastOptions` instance. As an alternative, the same service is
 provided by a concrete function :func:`~arrow::compute::Cast`.
 
-+--------------------------+------------+--------------------+------------------+------------------------------+
-| Function name            | Arity      | Input types        | Output type      | Options class                |
-+==========================+============+====================+==================+==============================+
-| cast                     | Unary      | Many               | Variable         | :struct:`CastOptions`        |
-+--------------------------+------------+--------------------+------------------+------------------------------+
-| strftime                 | Unary      | Timestamp          | String           | :struct:`StrftimeOptions`    |
-+--------------------------+------------+--------------------+------------------+------------------------------+
-| strptime                 | Unary      | String-like        | Timestamp        | :struct:`StrptimeOptions`    |
-+--------------------------+------------+--------------------+------------------+------------------------------+
++-----------------+------------+--------------------+------------------+------------------------------+-------+
+| Function name   | Arity      | Input types        | Output type      | Options class                | Notes |
++=================+============+====================+==================+==============================+=======+
+| cast            | Unary      | Many               | Variable         | :struct:`CastOptions`        |       |
++-----------------+------------+--------------------+------------------+------------------------------+-------+
+| strftime        | Unary      | Timestamp          | String           | :struct:`StrftimeOptions`    | \(1)  |
++-----------------+------------+--------------------+------------------+------------------------------+-------+
+| strptime        | Unary      | String-like        | Timestamp        | :struct:`StrptimeOptions`    |       |
++-----------------+------------+--------------------+------------------+------------------------------+-------+
 
 The conversions available with ``cast`` are listed below. In all cases, a
 null input value is converted into a null output value.
 
+* \(1) The output precision of the ``%S`` (seconds) flag depends on the input
+  timestamp precision. Seconds are formatted as integers for timestamps with
+  second precision, and as fixed-point decimals with 3, 6 and 9 fractional
+  digits for millisecond, microsecond and nanosecond precision respectively.
+  To obtain integer seconds, cast to a timestamp with second resolution.
+  The character for the decimal point is localized according to the locale.
+  See `detailed formatting documentation`_ for descriptions of other flags.
+
+.. _detailed formatting documentation: https://howardhinnant.github.io/date/date.html#to_stream_formatting
+
 **Truth value extraction**
 
 +-----------------------------+------------------------------------+--------------+
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index 29c579f85a9..aaf4c9f2916 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -989,7 +989,7 @@ cdef class _StrftimeOptions(FunctionOptions):
 
 
 class StrftimeOptions(_StrftimeOptions):
-    def __init__(self, format="%Y-%m-%dT%H:%M:%SZ", locale="C"):
+    def __init__(self, format="%Y-%m-%dT%H:%M:%S", locale="C"):
         self._set_options(format, locale)
 
 
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index bbef46f2477..334a2a7bda3 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1496,10 +1496,18 @@ def _fix_timestamp(s):
                 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
                 assert result.equals(expected)
 
+        fmt = "%Y-%m-%dT%H:%M:%S"
+
         # Default format
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         result = pc.strftime(tsa, options=pc.StrftimeOptions())
-        expected = pa.array(_fix_timestamp(ts.strftime("%Y-%m-%dT%H:%M:%SZ")))
+        expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
+        assert result.equals(expected)
+
+        # Default format plus timezone
+        tsa = pa.array(ts, type=pa.timestamp("s", timezone))
+        result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
+        expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z")))
         assert result.equals(expected)
 
         # Pandas %S is equivalent to %S in arrow for unit="s"
@@ -1518,18 +1526,24 @@ def _fix_timestamp(s):
 
         # Test setting locale
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
-        options = pc.StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C")
+        options = pc.StrftimeOptions(fmt, "C")
         result = pc.strftime(tsa, options=options)
-        expected = pa.array(_fix_timestamp(ts.strftime("%Y-%m-%dT%H:%M:%SZ")))
+        expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
         assert result.equals(expected)
 
-    for unit in ["s", "ms", "us", "ns"]:
-        tsa = pa.array(ts, type=pa.timestamp(unit))
-        for fmt in formats:
-            with pytest.raises(pa.ArrowInvalid,
-                               match="Timestamps without a time zone "
-                                     "cannot be reliably formatted"):
-                pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
+    # Test timestamps without timezone
+    fmt = "%Y-%m-%dT%H:%M:%S"
+    ts = pd.to_datetime(times)
+    tsa = pa.array(ts, type=pa.timestamp("s"))
+    result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
+    expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
+    assert result.equals(expected)
+
+    for tz_flag in ["%Z", "%z"]:
+        with pytest.raises(
+                pa.ArrowInvalid,
+                match="Timezone not present, cannot convert to string"):
+            pc.strftime(tsa, options=pc.StrftimeOptions(fmt + tz_flag))
 
 
 def _check_datetime_components(timestamps, timezone=None):