From b53a2e882e97a32bdd823f95970714063484aef5 Mon Sep 17 00:00:00 2001 From: Rok Date: Fri, 18 Feb 2022 16:38:36 +0100 Subject: [PATCH 1/8] Adding raise_errors to StrptimeOptions --- cpp/src/arrow/compute/api_scalar.cc | 11 +- cpp/src/arrow/compute/api_scalar.h | 7 +- cpp/src/arrow/compute/exec/expression_test.cc | 2 +- cpp/src/arrow/compute/function_test.cc | 2 +- .../compute/kernels/scalar_string_ascii.cc | 74 ----------- .../compute/kernels/scalar_string_test.cc | 4 +- .../compute/kernels/scalar_temporal_unary.cc | 115 +++++++++++++++++- .../arrow/compute/kernels/temporal_internal.h | 8 ++ python/pyarrow/_compute.pyx | 10 +- python/pyarrow/includes/libarrow.pxd | 3 +- python/pyarrow/tests/test_compute.py | 2 +- r/src/compute.cpp | 3 +- 12 files changed, 150 insertions(+), 91 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 6d92a6531de..18f2657ece3 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -353,7 +353,8 @@ static auto kStrftimeOptionsType = GetFunctionOptionsType( DataMember("format", &StrftimeOptions::format)); static auto kStrptimeOptionsType = GetFunctionOptionsType( DataMember("format", &StrptimeOptions::format), - DataMember("unit", &StrptimeOptions::unit)); + DataMember("unit", &StrptimeOptions::unit), + DataMember("raise_errors", &StrptimeOptions::raise_errors)); static auto kStructFieldOptionsType = GetFunctionOptionsType( DataMember("indices", &StructFieldOptions::indices)); static auto kTrimOptionsType = GetFunctionOptionsType( @@ -544,11 +545,13 @@ StrftimeOptions::StrftimeOptions() : StrftimeOptions(kDefaultFormat) {} constexpr char StrftimeOptions::kTypeName[]; constexpr const char* StrftimeOptions::kDefaultFormat; -StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit) +StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit, + bool raise_errors) : FunctionOptions(internal::kStrptimeOptionsType), format(std::move(format)), - unit(unit) {} -StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {} + unit(unit), + raise_errors(raise_errors) {} +StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::MICRO, true) {} constexpr char StrptimeOptions::kTypeName[]; StructFieldOptions::StructFieldOptions(std::vector indices) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 41e8d5a49c2..8d5ce4436c7 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -267,12 +267,17 @@ class ARROW_EXPORT StructFieldOptions : public FunctionOptions { class ARROW_EXPORT StrptimeOptions : public FunctionOptions { public: - explicit StrptimeOptions(std::string format, TimeUnit::type unit); + explicit StrptimeOptions(std::string format, TimeUnit::type unit, + bool raise_errors = true); StrptimeOptions(); static constexpr char const kTypeName[] = "StrptimeOptions"; + /// The desired format string. std::string format; + /// The desired time resolution TimeUnit::type unit; + /// Raise on parsing errors + bool raise_errors; }; class ARROW_EXPORT StrftimeOptions : public FunctionOptions { diff --git a/cpp/src/arrow/compute/exec/expression_test.cc b/cpp/src/arrow/compute/exec/expression_test.cc index fa05ddf6422..30ddef69010 100644 --- a/cpp/src/arrow/compute/exec/expression_test.cc +++ b/cpp/src/arrow/compute/exec/expression_test.cc @@ -737,7 +737,7 @@ TEST(Expression, ExecuteCall) { ])")); ExpectExecute(call("strptime", {field_ref("a")}, - compute::StrptimeOptions("%m/%d/%Y", TimeUnit::MICRO)), + compute::StrptimeOptions("%m/%d/%Y", TimeUnit::MICRO, true)), ArrayFromJSON(struct_({field("a", utf8())}), R"([ {"a": "5/1/2020"}, {"a": null}, diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 3f5f7c08d66..13de2a29ab8 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -88,7 +88,7 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new ExtractRegexOptions("pattern2")); options.emplace_back(new SetLookupOptions(ArrayFromJSON(int64(), "[1, 2, 3, 4]"))); options.emplace_back(new SetLookupOptions(ArrayFromJSON(boolean(), "[true, false]"))); - options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI)); + options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI, true)); options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO)); options.emplace_back(new StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C")); #ifndef _WIN32 diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc index 9b855505314..a332364bd82 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc @@ -2770,79 +2770,6 @@ void AddAsciiStringSplitRegex(FunctionRegistry* registry) { } #endif // ARROW_WITH_RE2 -// ---------------------------------------------------------------------- -// strptime string parsing - -using StrptimeState = OptionsWrapper; - -struct ParseStrptime { - explicit ParseStrptime(const StrptimeOptions& options) - : parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {} - - template - int64_t Call(KernelContext*, util::string_view val, Status* st) const { - int64_t result = 0; - if (!(*parser)(val.data(), val.size(), unit, &result)) { - *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ", - TimestampType(unit).ToString()); - } - return result; - } - - std::shared_ptr parser; - TimeUnit::type unit; -}; - -template -struct StrptimeExec { - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - applicator::ScalarUnaryNotNullStateful - kernel{ParseStrptime(StrptimeState::Get(ctx))}; - return kernel.Exec(ctx, batch, out); - } -}; - -Result ResolveStrptimeOutput(KernelContext* ctx, - const std::vector&) { - if (!ctx->state()) { - return Status::Invalid("strptime does not provide default StrptimeOptions"); - } - const StrptimeOptions& options = StrptimeState::Get(ctx); - // Check for use of %z or %Z - size_t cur = 0; - std::string zone = ""; - while (cur < options.format.size() - 1) { - if (options.format[cur] == '%') { - if (options.format[cur + 1] == 'z') { - zone = "UTC"; - break; - } - cur++; - } - cur++; - } - return ::arrow::timestamp(options.unit, zone); -} - -const FunctionDoc strptime_doc( - "Parse timestamps", - ("For each string in `strings`, parse it as a timestamp.\n" - "The timestamp unit and the expected string pattern must be given\n" - "in StrptimeOptions. Null inputs emit null. If a non-null string\n" - "fails parsing, an error is returned."), - {"strings"}, "StrptimeOptions", /*options_required=*/true); - -void AddAsciiStringStrptime(FunctionRegistry* registry) { - auto func = std::make_shared("strptime", Arity::Unary(), &strptime_doc); - - OutputType out_ty(ResolveStrptimeOutput); - for (const auto& ty : StringTypes()) { - auto exec = GenerateVarBinaryToVarBinary(ty); - DCHECK_OK(func->AddKernel({ty}, out_ty, std::move(exec), StrptimeState::Init)); - } - DCHECK_OK(registry->AddFunction(std::move(func))); -} - // ---------------------------------------------------------------------- // Binary join @@ -3518,7 +3445,6 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { #ifdef ARROW_WITH_RE2 AddAsciiStringSplitRegex(registry); #endif - AddAsciiStringStrptime(registry); AddAsciiStringJoin(registry); AddAsciiStringRepeat(registry); } diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index ce7ffee526b..286cdc72b54 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1842,7 +1842,7 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) { TYPED_TEST(TestStringKernels, Strptime) { std::string input1 = R"(["5/1/2020", null, "12/11/1900"])"; std::string output1 = R"(["2020-05-01", null, "1900-12-11"])"; - StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO); + StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, true); this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options); input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])"; @@ -1859,7 +1859,7 @@ TYPED_TEST(TestStringKernels, StrptimeZoneOffset) { std::string input1 = R"(["5/1/2020 +0100", null, "12/11/1900 -0130"])"; std::string output1 = R"(["2020-04-30T23:00:00.000000", null, "1900-12-11T01:30:00.000000"])"; - StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO); + StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO, true); this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO, "UTC"), output1, &options); } diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index ed08c367664..8732d6d46b3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -25,6 +25,7 @@ #include "arrow/compute/kernels/temporal_internal.h" #include "arrow/util/checked_cast.h" #include "arrow/util/time.h" +#include "arrow/util/value_parsing.h" #include "arrow/vendored/datetime.h" namespace arrow { @@ -1143,6 +1144,107 @@ struct Strftime { }; #endif +// ---------------------------------------------------------------------- +// Convert string representations of timestamps in arbitrary format to timestamps + +using StrptimeState = OptionsWrapper; + +template +struct Strptime { + std::shared_ptr parser; + const StrptimeOptions& options; + + static Result Make(KernelContext* ctx, const DataType& type) { + const StrptimeOptions& options = StrptimeState::Get(ctx); + return Strptime{TimestampParser::MakeStrptime(options.format), options}; + } + + static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type)); + + if (in.is_valid) { + auto s = internal::UnboxScalar::Unbox(in); + int64_t result; + if (!(*self.parser)(s.data(), s.size(), self.options.unit, &result)) { + if (self.options.raise_errors) { + return Status::Invalid("Failed to parse string: '", s.data(), + "' as a scalar of type ", + TimestampType(self.options.unit).ToString()); + } + } + + *checked_cast(out) = + TimestampScalar(result, timestamp(self.options.unit)); + } else { + out->is_valid = false; + } + return Status::OK(); + } + + static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { + ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type)); + + std::unique_ptr array_builder; + RETURN_NOT_OK( + MakeBuilder(ctx->memory_pool(), timestamp(self.options.unit), &array_builder)); + TimestampBuilder* builder = checked_cast(array_builder.get()); + RETURN_NOT_OK(builder->Reserve(in.length)); + + if (self.options.raise_errors) { + auto visit_null = [&]() { return builder->AppendNull(); }; + auto visit_value = [&](util::string_view s) { + int64_t result; + if (!(*self.parser)(s.data(), s.size(), self.options.unit, &result)) { + return Status::Invalid("Failed to parse string: '", s.data(), + "' as a scalar of type ", + TimestampType(self.options.unit).ToString()); + } + return builder->Append(result); + }; + RETURN_NOT_OK(VisitArrayDataInline(in, visit_value, visit_null)); + } else { + auto visit_null = [&]() { builder->UnsafeAppendNull(); }; + auto visit_value = [&](util::string_view s) { + int64_t result; + if (!(*self.parser)(s.data(), s.size(), self.options.unit, &result)) { + builder->UnsafeAppendNull(); + } else { + builder->UnsafeAppend(result); + } + }; + VisitArrayDataInline(in, visit_value, visit_null); + } + + std::shared_ptr out_array; + RETURN_NOT_OK(builder->Finish(&out_array)); + *out = *std::move(out_array->data()); + + return Status::OK(); + } +}; + +Result ResolveStrptimeOutput(KernelContext* ctx, + const std::vector&) { + if (!ctx->state()) { + return Status::Invalid("strptime does not provide default StrptimeOptions"); + } + const StrptimeOptions& options = StrptimeState::Get(ctx); + // Check for use of %z or %Z + size_t cur = 0; + std::string zone = ""; + while (cur < options.format.size() - 1) { + if (options.format[cur] == '%') { + if (options.format[cur + 1] == 'z') { + zone = "UTC"; + break; + } + cur++; + } + cur++; + } + return ::arrow::timestamp(options.unit, zone); +} + // ---------------------------------------------------------------------- // Convert timestamps from local timestamp without a timezone to timestamps with a // timezone, interpreting the local timestamp as being in the specified timezone @@ -1590,7 +1692,13 @@ const FunctionDoc strftime_doc{ "does not exist on this system."), {"timestamps"}, "StrftimeOptions"}; - +const FunctionDoc strptime_doc( + "Parse timestamps", + ("For each string in `strings`, parse it as a timestamp.\n" + "The timestamp unit and the expected string pattern must be given\n" + "in StrptimeOptions. Null inputs emit null. If a non-null string\n" + "fails parsing, an error is returned by default."), + {"strings"}, "StrptimeOptions", /*options_required=*/true); const FunctionDoc assume_timezone_doc{ "Convert naive timestamp to timezone-aware timestamp", ("Input timestamps are assumed to be relative to the timezone given in the\n" @@ -1780,6 +1888,11 @@ void RegisterScalarTemporalUnary(FunctionRegistry* registry) { StrftimeState::Init); DCHECK_OK(registry->AddFunction(std::move(strftime))); + auto strptime = SimpleUnaryTemporalFactory::Make( + "strptime", OutputType::Resolver(ResolveStrptimeOutput), &strptime_doc, nullptr, + StrptimeState::Init); + DCHECK_OK(registry->AddFunction(std::move(strptime))); + auto assume_timezone = UnaryTemporalFactory::Make< WithTimestamps>("assume_timezone", diff --git a/cpp/src/arrow/compute/kernels/temporal_internal.h b/cpp/src/arrow/compute/kernels/temporal_internal.h index ed4ca48b7be..5a1bd970cb5 100644 --- a/cpp/src/arrow/compute/kernels/temporal_internal.h +++ b/cpp/src/arrow/compute/kernels/temporal_internal.h @@ -171,6 +171,7 @@ struct TimestampFormatter { struct WithDates {}; struct WithTimes {}; struct WithTimestamps {}; +struct WithStringTypes {}; // This helper allows generating temporal kernels for selected type categories // without any spurious code generation for other categories (e.g. avoid @@ -207,6 +208,13 @@ void AddTemporalKernels(Factory* fac, WithTimestamps, WithOthers... others) { AddTemporalKernels(fac, std::forward(others)...); } +template +void AddTemporalKernels(Factory* fac, WithStringTypes, WithOthers... others) { + fac->template AddKernel(utf8()); + fac->template AddKernel(large_utf8()); + AddTemporalKernels(fac, std::forward(others)...); +} + // // Executor class for temporal component extractors, i.e. scalar kernels // with the signature Timestamp -> diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index c6b33f26baf..ecbb585aa36 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1438,10 +1438,10 @@ cdef class _StrptimeOptions(FunctionOptions): "ns": TimeUnit_NANO, } - def _set_options(self, format, unit): + def _set_options(self, format, unit, raise_errors): try: self.wrapped.reset( - new CStrptimeOptions(tobytes(format), self._unit_map[unit]) + new CStrptimeOptions(tobytes(format), self._unit_map[unit], raise_errors) ) except KeyError: _raise_invalid_function_option(unit, "time unit") @@ -1458,10 +1458,12 @@ class StrptimeOptions(_StrptimeOptions): unit : str Timestamp unit of the output. Accepted values are "s", "ms", "us", "ns". + raise_errors: boolean + Raise on parsing errors. """ - def __init__(self, format, unit): - self._set_options(format, unit) + def __init__(self, format, unit, raise_errors): + self._set_options(format, unit, raise_errors) cdef class _StrftimeOptions(FunctionOptions): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 0eb9948a2e0..e3717818b70 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2085,9 +2085,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CStrptimeOptions \ "arrow::compute::StrptimeOptions"(CFunctionOptions): - CStrptimeOptions(c_string format, TimeUnit unit) + CStrptimeOptions(c_string format, TimeUnit unit, c_bool raise_error) c_string format TimeUnit unit + c_bool raise_error cdef cppclass CStrftimeOptions \ "arrow::compute::StrftimeOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index dccbd6d1532..95bf17c6911 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -162,7 +162,7 @@ def test_option_class_equality(): pc.SplitOptions(), pc.SplitPatternOptions("pattern"), pc.StrftimeOptions(), - pc.StrptimeOptions("%Y", "s"), + pc.StrptimeOptions("%Y", "s", True), pc.StructFieldOptions(indices=[]), pc.TakeOptions(), pc.TDigestOptions(), diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 0f0ef2f7dd1..746c873a347 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -377,7 +377,8 @@ std::shared_ptr make_compute_options( using Options = arrow::compute::StrptimeOptions; return std::make_shared( cpp11::as_cpp(options["format"]), - cpp11::as_cpp(options["unit"])); + cpp11::as_cpp(options["unit"]), + cpp11::as_cpp(options["raise_errors"])); } if (func_name == "strftime") { From cb73a005ab4bf7bd05123dcb048f0b3f54cbd8d8 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 16 Mar 2022 23:34:36 +0100 Subject: [PATCH 2/8] Some changes and kernel.can_write_into_slices=false --- cpp/src/arrow/compute/api_scalar.cc | 4 + cpp/src/arrow/compute/api_scalar.h | 16 ++++ .../compute/kernels/scalar_string_test.cc | 15 +++- .../compute/kernels/scalar_temporal_unary.cc | 83 +++++++++++-------- 4 files changed, 82 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 18f2657ece3..568c146c081 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -825,6 +825,10 @@ Result Strftime(const Datum& arg, StrftimeOptions options, ExecContext* c return CallFunction("strftime", {arg}, &options, ctx); } +Result Strptime(const Datum& arg, StrptimeOptions options, ExecContext* ctx) { + return CallFunction("strptime", {arg}, &options, ctx); +} + Result Week(const Datum& arg, WeekOptions options, ExecContext* ctx) { return CallFunction("week", {arg}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 8d5ce4436c7..2bd2e9120c3 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -1403,6 +1403,22 @@ ARROW_EXPORT Result Subsecond(const Datum& values, ExecContext* ctx = NUL ARROW_EXPORT Result Strftime(const Datum& values, StrftimeOptions options, ExecContext* ctx = NULLPTR); +/// \brief Parse timestamps according to a format string +/// +/// Return parsed timestamps according to the format string +/// `StrptimeOptions::format` at time resolution `Strftime::unit`. Parse errors are +/// raised depending on the `Strftime::raise_errors` setting. +/// +/// \param[in] values input strings +/// \param[in] options for setting format string, unit and raise_errors +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Strptime(const Datum& values, StrptimeOptions options, + ExecContext* ctx = NULLPTR); + /// \brief Converts timestamps from local timestamp without a timezone to a timestamp with /// timezone, interpreting the local timestamp as being in the specified timezone for each /// element of `values` diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 286cdc72b54..b68f567f93a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -27,6 +27,7 @@ #include #endif +#include "arrow/compute/api.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/test_util.h" @@ -1842,12 +1843,24 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) { TYPED_TEST(TestStringKernels, Strptime) { std::string input1 = R"(["5/1/2020", null, "12/11/1900"])"; std::string output1 = R"(["2020-05-01", null, "1900-12-11"])"; - StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, true); + auto input_array = ArrayFromJSON(utf8(), input1); + StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, false); this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options); input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])"; options.format = "%m/%d/%Y %%z"; this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options); + + ASSERT_OK_AND_ASSIGN(auto result, CallFunction("strptime", {input_array}, &options)); + + options.format = "%Y-%m-%d"; + options.raise_errors = true; + ASSERT_RAISES(Invalid, CallFunction("strptime", {input_array}, &options)); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Failed to parse string: '5/1/202012/11/1900'"), + Strptime(input_array, options)); } TYPED_TEST(TestStringKernels, StrptimeZoneOffset) { diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index 8732d6d46b3..1780d29c991 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -73,6 +73,7 @@ using std::chrono::minutes; using DayOfWeekState = OptionsWrapper; using WeekState = OptionsWrapper; using StrftimeState = OptionsWrapper; +using StrptimeState = OptionsWrapper; using AssumeTimezoneState = OptionsWrapper; using RoundTemporalState = OptionsWrapper; @@ -1147,16 +1148,36 @@ struct Strftime { // ---------------------------------------------------------------------- // Convert string representations of timestamps in arbitrary format to timestamps -using StrptimeState = OptionsWrapper; +static std::string GetZone(std::string format) { + // Check for use of %z or %Z + size_t cur = 0; + std::string zone = ""; + while (cur < format.size() - 1) { + if (format[cur] == '%') { + if (format[cur + 1] == 'z') { + zone = "UTC"; + break; + } + cur++; + } + cur++; + } + return zone; +} template struct Strptime { - std::shared_ptr parser; - const StrptimeOptions& options; + const std::shared_ptr parser; + const TimeUnit::type unit; + const std::string zone; + const bool raise_errors; static Result Make(KernelContext* ctx, const DataType& type) { const StrptimeOptions& options = StrptimeState::Get(ctx); - return Strptime{TimestampParser::MakeStrptime(options.format), options}; + + return Strptime{TimestampParser::MakeStrptime(options.format), + std::move(options.unit), GetZone(options.format), + options.raise_errors}; } static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { @@ -1165,16 +1186,18 @@ struct Strptime { if (in.is_valid) { auto s = internal::UnboxScalar::Unbox(in); int64_t result; - if (!(*self.parser)(s.data(), s.size(), self.options.unit, &result)) { - if (self.options.raise_errors) { + if ((*self.parser)(s.data(), s.length(), self.unit, &result)) { + *checked_cast(out) = + TimestampScalar(result, timestamp(self.unit, self.zone)); + } else { + if (self.raise_errors) { return Status::Invalid("Failed to parse string: '", s.data(), "' as a scalar of type ", - TimestampType(self.options.unit).ToString()); + TimestampType(self.unit).ToString()); + } else { + out->is_valid = false; } } - - *checked_cast(out) = - TimestampScalar(result, timestamp(self.options.unit)); } else { out->is_valid = false; } @@ -1186,18 +1209,18 @@ struct Strptime { std::unique_ptr array_builder; RETURN_NOT_OK( - MakeBuilder(ctx->memory_pool(), timestamp(self.options.unit), &array_builder)); - TimestampBuilder* builder = checked_cast(array_builder.get()); + MakeBuilder(ctx->memory_pool(), timestamp(self.unit, self.zone), &array_builder)); + auto builder = checked_pointer_cast(std::move(array_builder)); RETURN_NOT_OK(builder->Reserve(in.length)); - if (self.options.raise_errors) { + if (self.raise_errors) { auto visit_null = [&]() { return builder->AppendNull(); }; auto visit_value = [&](util::string_view s) { - int64_t result; - if (!(*self.parser)(s.data(), s.size(), self.options.unit, &result)) { + int64_t result = 0; + if (!(*self.parser)(s.data(), s.size(), self.unit, &result)) { return Status::Invalid("Failed to parse string: '", s.data(), "' as a scalar of type ", - TimestampType(self.options.unit).ToString()); + TimestampType(self.unit).ToString()); } return builder->Append(result); }; @@ -1205,8 +1228,8 @@ struct Strptime { } else { auto visit_null = [&]() { builder->UnsafeAppendNull(); }; auto visit_value = [&](util::string_view s) { - int64_t result; - if (!(*self.parser)(s.data(), s.size(), self.options.unit, &result)) { + int64_t result = 0; + if (!(*self.parser)(s.data(), s.size(), self.unit, &result)) { builder->UnsafeAppendNull(); } else { builder->UnsafeAppend(result); @@ -1229,20 +1252,8 @@ Result ResolveStrptimeOutput(KernelContext* ctx, return Status::Invalid("strptime does not provide default StrptimeOptions"); } const StrptimeOptions& options = StrptimeState::Get(ctx); - // Check for use of %z or %Z - size_t cur = 0; - std::string zone = ""; - while (cur < options.format.size() - 1) { - if (options.format[cur] == '%') { - if (options.format[cur + 1] == 'z') { - zone = "UTC"; - break; - } - cur++; - } - cur++; - } - return ::arrow::timestamp(options.unit, zone); + auto type = timestamp(options.unit, GetZone(options.format)); + return ValueDescr(std::move(type)); } // ---------------------------------------------------------------------- @@ -1479,7 +1490,7 @@ struct UnaryTemporalFactory { } }; -template