diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 6d92a6531de..a9e2565a3ea 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -353,7 +353,8 @@ static auto kStrftimeOptionsType = GetFunctionOptionsType( DataMember("format", &StrftimeOptions::format)); static auto kStrptimeOptionsType = GetFunctionOptionsType( DataMember("format", &StrptimeOptions::format), - DataMember("unit", &StrptimeOptions::unit)); + DataMember("unit", &StrptimeOptions::unit), + DataMember("error_is_null", &StrptimeOptions::error_is_null)); static auto kStructFieldOptionsType = GetFunctionOptionsType( DataMember("indices", &StructFieldOptions::indices)); static auto kTrimOptionsType = GetFunctionOptionsType( @@ -544,11 +545,13 @@ StrftimeOptions::StrftimeOptions() : StrftimeOptions(kDefaultFormat) {} constexpr char StrftimeOptions::kTypeName[]; constexpr const char* StrftimeOptions::kDefaultFormat; -StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit) +StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit, + bool error_is_null) : FunctionOptions(internal::kStrptimeOptionsType), format(std::move(format)), - unit(unit) {} -StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {} + unit(unit), + error_is_null(error_is_null) {} +StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::MICRO, false) {} constexpr char StrptimeOptions::kTypeName[]; StructFieldOptions::StructFieldOptions(std::vector indices) @@ -822,6 +825,10 @@ Result Strftime(const Datum& arg, StrftimeOptions options, ExecContext* c return CallFunction("strftime", {arg}, &options, ctx); } +Result Strptime(const Datum& arg, StrptimeOptions options, ExecContext* ctx) { + return CallFunction("strptime", {arg}, &options, ctx); +} + Result Week(const Datum& arg, WeekOptions options, ExecContext* ctx) { return CallFunction("week", {arg}, &options, ctx); } diff --git 
a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 41e8d5a49c2..1ba03fd7a64 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -267,12 +267,17 @@ class ARROW_EXPORT StructFieldOptions : public FunctionOptions { class ARROW_EXPORT StrptimeOptions : public FunctionOptions { public: - explicit StrptimeOptions(std::string format, TimeUnit::type unit); + explicit StrptimeOptions(std::string format, TimeUnit::type unit, + bool error_is_null = false); StrptimeOptions(); static constexpr char const kTypeName[] = "StrptimeOptions"; + /// The desired format string. std::string format; + /// The desired time resolution TimeUnit::type unit; + /// Return null on parsing errors if true or raise if false + bool error_is_null; }; class ARROW_EXPORT StrftimeOptions : public FunctionOptions { @@ -1398,6 +1403,22 @@ ARROW_EXPORT Result Subsecond(const Datum& values, ExecContext* ctx = NUL ARROW_EXPORT Result Strftime(const Datum& values, StrftimeOptions options, ExecContext* ctx = NULLPTR); +/// \brief Parse timestamps according to a format string +/// +/// Return parsed timestamps according to the format string +/// `StrptimeOptions::format` at time resolution `StrptimeOptions::unit`. Parse errors are +/// raised depending on the `StrptimeOptions::error_is_null` setting. 
+/// +/// \param[in] values input strings +/// \param[in] options for setting format string, unit and error_is_null +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Strptime(const Datum& values, StrptimeOptions options, + ExecContext* ctx = NULLPTR); + /// \brief Converts timestamps from local timestamp without a timezone to a timestamp with /// timezone, interpreting the local timestamp as being in the specified timezone for each /// element of `values` diff --git a/cpp/src/arrow/compute/exec/expression_test.cc b/cpp/src/arrow/compute/exec/expression_test.cc index fa05ddf6422..30ddef69010 100644 --- a/cpp/src/arrow/compute/exec/expression_test.cc +++ b/cpp/src/arrow/compute/exec/expression_test.cc @@ -737,7 +737,7 @@ TEST(Expression, ExecuteCall) { ])")); ExpectExecute(call("strptime", {field_ref("a")}, - compute::StrptimeOptions("%m/%d/%Y", TimeUnit::MICRO)), + compute::StrptimeOptions("%m/%d/%Y", TimeUnit::MICRO, true)), ArrayFromJSON(struct_({field("a", utf8())}), R"([ {"a": "5/1/2020"}, {"a": null}, diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 3f5f7c08d66..13de2a29ab8 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -88,7 +88,7 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new ExtractRegexOptions("pattern2")); options.emplace_back(new SetLookupOptions(ArrayFromJSON(int64(), "[1, 2, 3, 4]"))); options.emplace_back(new SetLookupOptions(ArrayFromJSON(boolean(), "[true, false]"))); - options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI)); + options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI, true)); options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO)); options.emplace_back(new StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C")); #ifndef _WIN32 diff --git 
a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc index 9b855505314..a332364bd82 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc @@ -2770,79 +2770,6 @@ void AddAsciiStringSplitRegex(FunctionRegistry* registry) { } #endif // ARROW_WITH_RE2 -// ---------------------------------------------------------------------- -// strptime string parsing - -using StrptimeState = OptionsWrapper; - -struct ParseStrptime { - explicit ParseStrptime(const StrptimeOptions& options) - : parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {} - - template - int64_t Call(KernelContext*, util::string_view val, Status* st) const { - int64_t result = 0; - if (!(*parser)(val.data(), val.size(), unit, &result)) { - *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ", - TimestampType(unit).ToString()); - } - return result; - } - - std::shared_ptr parser; - TimeUnit::type unit; -}; - -template -struct StrptimeExec { - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - applicator::ScalarUnaryNotNullStateful - kernel{ParseStrptime(StrptimeState::Get(ctx))}; - return kernel.Exec(ctx, batch, out); - } -}; - -Result ResolveStrptimeOutput(KernelContext* ctx, - const std::vector&) { - if (!ctx->state()) { - return Status::Invalid("strptime does not provide default StrptimeOptions"); - } - const StrptimeOptions& options = StrptimeState::Get(ctx); - // Check for use of %z or %Z - size_t cur = 0; - std::string zone = ""; - while (cur < options.format.size() - 1) { - if (options.format[cur] == '%') { - if (options.format[cur + 1] == 'z') { - zone = "UTC"; - break; - } - cur++; - } - cur++; - } - return ::arrow::timestamp(options.unit, zone); -} - -const FunctionDoc strptime_doc( - "Parse timestamps", - ("For each string in `strings`, parse it as a timestamp.\n" - "The timestamp unit and the 
expected string pattern must be given\n" - "in StrptimeOptions. Null inputs emit null. If a non-null string\n" - "fails parsing, an error is returned."), - {"strings"}, "StrptimeOptions", /*options_required=*/true); - -void AddAsciiStringStrptime(FunctionRegistry* registry) { - auto func = std::make_shared("strptime", Arity::Unary(), &strptime_doc); - - OutputType out_ty(ResolveStrptimeOutput); - for (const auto& ty : StringTypes()) { - auto exec = GenerateVarBinaryToVarBinary(ty); - DCHECK_OK(func->AddKernel({ty}, out_ty, std::move(exec), StrptimeState::Init)); - } - DCHECK_OK(registry->AddFunction(std::move(func))); -} - // ---------------------------------------------------------------------- // Binary join @@ -3518,7 +3445,6 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { #ifdef ARROW_WITH_RE2 AddAsciiStringSplitRegex(registry); #endif - AddAsciiStringStrptime(registry); AddAsciiStringJoin(registry); AddAsciiStringRepeat(registry); } diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index ce7ffee526b..5b69b990cc3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1840,14 +1840,34 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) { #endif TYPED_TEST(TestStringKernels, Strptime) { - std::string input1 = R"(["5/1/2020", null, "12/11/1900"])"; - std::string output1 = R"(["2020-05-01", null, "1900-12-11"])"; - StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO); - this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options); + std::string input1 = R"(["5/1/2020", null, null, "12/13/1900", null])"; + std::string input2 = R"(["5/1/2020", "12/13/1900"])"; + std::string input3 = R"(["5/1/2020", "AA/BB/CCCC"])"; + std::string input4 = R"(["5/1/2020", "AA/BB/CCCC", "AA/BB/CCCC", "AA/BB/CCCC", null])"; + std::string input5 = R"(["5/1/2020 %z", null, null, "12/13/1900 %z", null])"; + 
std::string output1 = R"(["2020-05-01", null, null, "1900-12-13", null])"; + std::string output4 = R"(["2020-01-05", null, null, null, null])"; + std::string output2 = R"(["2020-05-01", "1900-12-13"])"; + std::string output3 = R"(["2020-05-01", null])"; + + StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, /*error_is_null=*/true); + auto unit = timestamp(TimeUnit::MICRO); + this->CheckUnary("strptime", input1, unit, output1, &options); + this->CheckUnary("strptime", input2, unit, output2, &options); + this->CheckUnary("strptime", input3, unit, output3, &options); + + options.format = "%d/%m/%Y"; + this->CheckUnary("strptime", input4, unit, output4, &options); - input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])"; options.format = "%m/%d/%Y %%z"; - this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options); + this->CheckUnary("strptime", input5, unit, output1, &options); + + options.error_is_null = false; + this->CheckUnary("strptime", input5, unit, output1, &options); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Failed to parse string: '5/1/2020'"), + Strptime(ArrayFromJSON(this->type(), input1), options)); } TYPED_TEST(TestStringKernels, StrptimeZoneOffset) { @@ -1859,7 +1879,7 @@ TYPED_TEST(TestStringKernels, StrptimeZoneOffset) { std::string input1 = R"(["5/1/2020 +0100", null, "12/11/1900 -0130"])"; std::string output1 = R"(["2020-04-30T23:00:00.000000", null, "1900-12-11T01:30:00.000000"])"; - StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO); + StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO, /*error_is_null=*/true); this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO, "UTC"), output1, &options); } diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index ed08c367664..c268dc5464a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ 
b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -25,6 +25,7 @@ #include "arrow/compute/kernels/temporal_internal.h" #include "arrow/util/checked_cast.h" #include "arrow/util/time.h" +#include "arrow/util/value_parsing.h" #include "arrow/vendored/datetime.h" namespace arrow { @@ -72,6 +73,7 @@ using std::chrono::minutes; using DayOfWeekState = OptionsWrapper; using WeekState = OptionsWrapper; using StrftimeState = OptionsWrapper; +using StrptimeState = OptionsWrapper; using AssumeTimezoneState = OptionsWrapper; using RoundTemporalState = OptionsWrapper; @@ -1143,6 +1145,130 @@ struct Strftime { }; #endif +// ---------------------------------------------------------------------- +// Convert string representations of timestamps in arbitrary format to timestamps + +const std::string GetZone(const std::string& format) { + // Returns "UTC" if `format` contains an unescaped %z directive, else "". + // Every '%' starts a two-character directive ("%%" is a literal percent), so + // the character after it is always consumed together with the '%'. + size_t cur = 0; + std::string zone = ""; + // `cur + 1 < size()` cannot underflow on an empty format, unlike `size() - 1`. + while (cur + 1 < format.size()) { + if (format[cur] == '%') { + if (format[cur + 1] == 'z') { + zone = "UTC"; + break; + } + // Skip the directive character so that "%%z" is not mistaken for %z. + cur++; + } + cur++; + } + return zone; +} + +template +struct Strptime { + const std::shared_ptr parser; + const TimeUnit::type unit; + const std::string zone; + const bool error_is_null; + + static Result Make(KernelContext* ctx, const DataType& type) { + const StrptimeOptions& options = StrptimeState::Get(ctx); + + return Strptime{TimestampParser::MakeStrptime(options.format), + options.unit, GetZone(options.format), + options.error_is_null}; + } + + static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type)); + + if (in.is_valid) { + auto s = internal::UnboxScalar::Unbox(in); + int64_t result; + if ((*self.parser)(s.data(), s.size(), self.unit, &result)) { + *checked_cast(out) = + TimestampScalar(result, timestamp(self.unit, self.zone)); + } else { + if (self.error_is_null) { + out->is_valid = 
false; + } else { + return Status::Invalid("Failed to parse string: '", s, "' as a scalar of type ", + TimestampType(self.unit).ToString()); + } + } + } else { + out->is_valid = false; + } + return Status::OK(); + } + + static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { + ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type)); + int64_t* out_data = out->GetMutableValues(1); + + if (self.error_is_null) { + if (out->buffers[0] == nullptr) { + ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(in.length)); + bit_util::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length); + } + + int64_t null_count = 0; + arrow::internal::BitmapWriter out_writer(out->GetMutableValues(0, 0), + out->offset, out->length); + auto visit_null = [&]() { + out_data++; + out_writer.Next(); + null_count++; + }; + auto visit_value = [&](util::string_view s) { + int64_t result; + if ((*self.parser)(s.data(), s.size(), self.unit, &result)) { + *out_data++ = result; + } else { + out_writer.Clear(); + null_count++; + } + out_writer.Next(); + }; + VisitArrayDataInline(in, visit_value, visit_null); + out_writer.Finish(); + out->null_count = null_count; + } else { + auto visit_null = [&]() { + out_data++; + return Status::OK(); + }; + auto visit_value = [&](util::string_view s) { + int64_t result; + if ((*self.parser)(s.data(), s.size(), self.unit, &result)) { + *out_data++ = result; + return Status::OK(); + } else { + return Status::Invalid("Failed to parse string: '", s, "' as a scalar of type ", + TimestampType(self.unit).ToString()); + } + }; + RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); + } + return Status::OK(); + } +}; + +Result ResolveStrptimeOutput(KernelContext* ctx, + const std::vector&) { + if (!ctx->state()) { + return Status::Invalid("strptime does not provide default StrptimeOptions"); + } + const StrptimeOptions& options = StrptimeState::Get(ctx); + auto type = timestamp(options.unit, GetZone(options.format)); + 
return ValueDescr(std::move(type)); +} + // ---------------------------------------------------------------------- // Convert timestamps from local timestamp without a timezone to timestamps with a // timezone, interpreting the local timestamp as being in the specified timezone @@ -1590,7 +1716,13 @@ const FunctionDoc strftime_doc{ "does not exist on this system."), {"timestamps"}, "StrftimeOptions"}; - +const FunctionDoc strptime_doc( + "Parse timestamps", + ("For each string in `strings`, parse it as a timestamp.\n" + "The timestamp unit and the expected string pattern must be given\n" + "in StrptimeOptions. Null inputs emit null. If a non-null string\n" + "fails parsing, an error is returned by default."), + {"strings"}, "StrptimeOptions", /*options_required=*/true); const FunctionDoc assume_timezone_doc{ "Convert naive timestamp to timezone-aware timestamp", ("Input timestamps are assumed to be relative to the timezone given in the\n" @@ -1780,6 +1912,11 @@ void RegisterScalarTemporalUnary(FunctionRegistry* registry) { StrftimeState::Init); DCHECK_OK(registry->AddFunction(std::move(strftime))); + auto strptime = SimpleUnaryTemporalFactory::Make( + "strptime", OutputType::Resolver(ResolveStrptimeOutput), &strptime_doc, nullptr, + StrptimeState::Init); + DCHECK_OK(registry->AddFunction(std::move(strptime))); + auto assume_timezone = UnaryTemporalFactory::Make< WithTimestamps>("assume_timezone", diff --git a/cpp/src/arrow/compute/kernels/temporal_internal.h b/cpp/src/arrow/compute/kernels/temporal_internal.h index ed4ca48b7be..5a1bd970cb5 100644 --- a/cpp/src/arrow/compute/kernels/temporal_internal.h +++ b/cpp/src/arrow/compute/kernels/temporal_internal.h @@ -171,6 +171,7 @@ struct 
TimestampFormatter { struct WithDates {}; struct WithTimes {}; struct WithTimestamps {}; +struct WithStringTypes {}; // This helper allows generating temporal kernels for selected type categories // without any spurious code generation for other categories (e.g. avoid @@ -207,6 +208,13 @@ void AddTemporalKernels(Factory* fac, WithTimestamps, WithOthers... others) { AddTemporalKernels(fac, std::forward(others)...); } +template +void AddTemporalKernels(Factory* fac, WithStringTypes, WithOthers... others) { + fac->template AddKernel(utf8()); + fac->template AddKernel(large_utf8()); + AddTemporalKernels(fac, std::forward(others)...); +} + // // Executor class for temporal component extractors, i.e. scalar kernels // with the signature Timestamp -> diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index c6b33f26baf..81b0b801a28 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1438,10 +1438,11 @@ cdef class _StrptimeOptions(FunctionOptions): "ns": TimeUnit_NANO, } - def _set_options(self, format, unit): + def _set_options(self, format, unit, error_is_null): try: self.wrapped.reset( - new CStrptimeOptions(tobytes(format), self._unit_map[unit]) + new CStrptimeOptions(tobytes(format), self._unit_map[unit], + error_is_null) ) except KeyError: _raise_invalid_function_option(unit, "time unit") @@ -1458,10 +1459,12 @@ class StrptimeOptions(_StrptimeOptions): unit : str Timestamp unit of the output. Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. 
""" - def __init__(self, format, unit): - self._set_options(format, unit) + def __init__(self, format, unit, error_is_null=False): + self._set_options(format, unit, error_is_null) cdef class _StrftimeOptions(FunctionOptions): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 0eb9948a2e0..e3717818b70 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2085,9 +2085,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CStrptimeOptions \ "arrow::compute::StrptimeOptions"(CFunctionOptions): - CStrptimeOptions(c_string format, TimeUnit unit) + CStrptimeOptions(c_string format, TimeUnit unit, c_bool error_is_null) c_string format TimeUnit unit + c_bool error_is_null cdef cppclass CStrftimeOptions \ "arrow::compute::StrftimeOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index dccbd6d1532..46d302c214c 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -162,7 +162,7 @@ def test_option_class_equality(): pc.SplitOptions(), pc.SplitPatternOptions("pattern"), pc.StrftimeOptions(), - pc.StrptimeOptions("%Y", "s"), + pc.StrptimeOptions("%Y", "s", True), pc.StructFieldOptions(indices=[]), pc.TakeOptions(), pc.TDigestOptions(), @@ -1722,6 +1722,22 @@ def test_strptime(): # Positional format assert pc.strptime(arr, '%m/%d/%Y', unit='s') == got + expected = pa.array([datetime(2020, 1, 5), None, None], + type=pa.timestamp('s')) + got = pc.strptime(arr, format='%d/%m/%Y', unit='s', error_is_null=True) + assert got == expected + + with pytest.raises(pa.ArrowInvalid, + match="Failed to parse string: '5/1/2020'"): + pc.strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=False) + + with pytest.raises(pa.ArrowInvalid, + match="Failed to parse string: '5/1/2020'"): + pc.strptime(arr, format='%Y-%m-%d', unit='s') + + got = pc.strptime(arr, 
format='%Y-%m-%d', unit='s', error_is_null=True) + assert got == pa.array([None, None, None], type=pa.timestamp('s')) + # TODO: We should test on windows once ARROW-13168 is resolved. @pytest.mark.pandas diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 0f0ef2f7dd1..91a55458b79 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -375,9 +375,13 @@ std::shared_ptr make_compute_options( if (func_name == "strptime") { using Options = arrow::compute::StrptimeOptions; + bool error_is_null = false; + if (!Rf_isNull(options["error_is_null"])) { + error_is_null = cpp11::as_cpp(options["error_is_null"]); + } return std::make_shared( cpp11::as_cpp(options["format"]), - cpp11::as_cpp(options["unit"])); + cpp11::as_cpp(options["unit"]), error_is_null); } if (func_name == "strftime") {