Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_temporal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,11 @@ struct Strftime {
const StrftimeOptions& options = StrftimeState::Get(ctx);

auto timezone = GetInputTimezone(type);
// This check is due to surprising %c behavior.
// See https://github.com/HowardHinnant/date/issues/704
if ((options.format.find("%c") != std::string::npos) && (options.locale != "C")) {
return Status::Invalid("%c flag is not supported in non-C locales.");
}
if (timezone.empty()) {
if ((options.format.find("%z") != std::string::npos) ||
(options.format.find("%Z") != std::string::npos)) {
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -1560,7 +1560,7 @@ def _fix_timestamp(s):

formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H",
"%I", "%p", "%M", "%z", "%Z", "%j", "%U", "%W", "%c", "%x",
"%X", "%%", "%G", "%V", "%u", "%V"]
"%X", "%%", "%G", "%V", "%u"]

for timezone in timezones:
ts = pd.to_datetime(times).tz_localize(timezone)
Expand Down
42 changes: 42 additions & 0 deletions r/R/dplyr-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,48 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit
Expression$create("strptime", x, options = list(format = format, unit = unit))
}

# Arrow binding for strftime(): builds an Expression that formats the
# timestamp expression `x` as strings.
#
# x:      expression to format; its time unit is read via `x$type()$unit()`.
# format: strftime-style template forwarded to the "strftime" kernel.
#         NOTE(review): the default "" is forwarded unchanged; base R
#         substitutes a default template in that case -- confirm whether that
#         should be mirrored here.
# tz:     time zone to render in; "" means the session time zone.
# usetz:  if TRUE, " %Z" is appended to `format`.
#
# Returns an Expression calling "strftime" on `x` cast to `tz`, using the
# session's LC_TIME locale.
nse_funcs$strftime <- function(x, format = "", tz = "", usetz = FALSE) {
  if (usetz) {
    format <- paste(format, "%Z")
  }
  if (tz == "") {
    tz <- Sys.timezone()
    # Sys.timezone() may return NA when the zone cannot be determined. Fail
    # here with a clear message instead of forwarding NA into the type.
    if (is.na(tz)) {
      abort("Could not determine the session time zone; supply `tz` explicitly.")
    }
    # Sys.timezone() may also return "", which on some OSes means UTC. Passing
    # "" on would request a timestamp type without a time zone.
    if (tz == "") {
      tz <- "UTC"
    }
  }
  # Arrow's strftime prints in timezone of the timestamp. To match R's strftime
  # behavior we first cast the timestamp to desired timezone. This is a
  # metadata only change.
  ts <- Expression$create(
    "cast",
    x,
    options = list(to_type = timestamp(x$type()$unit(), tz))
  )
  Expression$create(
    "strftime",
    ts,
    options = list(format = format, locale = Sys.getlocale("LC_TIME"))
  )
}

# Arrow binding for format_ISO8601(): builds an Expression that formats `x`
# with an ISO 8601 strftime template chosen by `precision`.
#
# x:         expression to format.
# usetz:     if TRUE, "%z" is appended to the template.
# precision: one of "y", "ym", "ymd", "ymdh", "ymdhm", "ymdhms";
#            NULL selects "ymdhms". Any other value aborts.
# ...:       accepted but not used here.
#
# Returns an Expression calling "strftime" with the "C" locale.
nse_funcs$format_ISO8601 <- function(x, usetz = FALSE, precision = NULL, ...) {
  # strftime template for each supported precision keyword.
  templates <- c(
    y = "%Y",
    ym = "%Y-%m",
    ymd = "%Y-%m-%d",
    ymdh = "%Y-%m-%dT%H",
    ymdhm = "%Y-%m-%dT%H:%M",
    ymdhms = "%Y-%m-%dT%H:%M:%S"
  )
  allowed <- names(templates)

  chosen <- if (is.null(precision)) "ymdhms" else precision
  if (!chosen %in% allowed) {
    msg <- paste(
      "`precision` must be one of the following values:",
      paste(allowed, collapse = ", "),
      "\nValue supplied was: ",
      chosen
    )
    abort(msg)
  }

  iso_format <- templates[[chosen]]
  if (usetz) {
    iso_format <- paste0(iso_format, "%z")
  }
  Expression$create(
    "strftime",
    x,
    options = list(format = iso_format, locale = "C")
  )
}

# Arrow binding for second(): builds an "add" Expression that sums the
# "second" and "subsecond" kernel results for `x`.
nse_funcs$second <- function(x) {
  whole_part <- Expression$create("second", x)
  fractional_part <- Expression$create("subsecond", x)
  Expression$create("add", whole_part, fractional_part)
}
Expand Down
7 changes: 7 additions & 0 deletions r/src/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,13 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
cpp11::as_cpp<arrow::TimeUnit::type>(options["unit"]));
}

if (func_name == "strftime") {
using Options = arrow::compute::StrftimeOptions;
return std::make_shared<Options>(
Options(cpp11::as_cpp<std::string>(options["format"]),
cpp11::as_cpp<std::string>(options["locale"])));
}

if (func_name == "assume_timezone") {
using Options = arrow::compute::AssumeTimezoneOptions;
enum Options::Ambiguous ambiguous;
Expand Down
99 changes: 99 additions & 0 deletions r/tests/testthat/test-dplyr-string-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ skip_if_not_available("dataset")
skip_if_not_available("utf8proc")

library(dplyr)
library(lubridate)
library(stringr)
library(stringi)

Expand Down Expand Up @@ -719,6 +720,104 @@ test_that("errors in strptime", {
)
})

# Checks that the Arrow strftime binding agrees with R's strftime for a
# range of format flags, explicit time zones, `usetz`, and a changed session
# time zone, and that "%c" is rejected outside the C locale.
test_that("strftime", {
  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168

  times <- tibble(
    x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA)
  )
  formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u"

  # Default tz: both sides fall back to the session time zone.
  expect_dplyr_equal(
    input %>%
      mutate(x = strftime(x, format = formats)) %>%
      collect(),
    times
  )

  # Explicit tz.
  expect_dplyr_equal(
    input %>%
      mutate(x = strftime(x, format = formats, tz = "Pacific/Marquesas")) %>%
      collect(),
    times
  )

  # Explicit tz with the zone abbreviation appended.
  expect_dplyr_equal(
    input %>%
      mutate(x = strftime(x, format = formats, tz = "EST", usetz = TRUE)) %>%
      collect(),
    times
  )

  # Explicit tz while the session time zone is different.
  withr::with_timezone(
    "Pacific/Marquesas",
    expect_dplyr_equal(
      input %>%
        mutate(x = strftime(x, format = formats, tz = "EST")) %>%
        collect(),
      times
    )
  )

  # This check is due to differences in the way %c currently works in Arrow and R's strftime.
  # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved.
  # The kernel only rejects %c when the locale is not "C", and the binding
  # forwards the session's LC_TIME locale, so the error can only be expected
  # when the session is not running in the C locale.
  if (Sys.getlocale("LC_TIME") != "C") {
    expect_error(
      times %>%
        Table$create() %>%
        mutate(x = strftime(x, format = "%c")) %>%
        collect(),
      "%c flag is not supported in non-C locales."
    )
  }

  # Output precision of %S depends on the input timestamp precision.
  # Timestamps with second precision are represented as integers while
  # milliseconds, microsecond and nanoseconds are represented as fixed floating
  # point numbers with 3, 6 and 9 decimal places respectively.
  expect_dplyr_equal(
    input %>%
      mutate(x = strftime(x, format = "%S")) %>%
      transmute(as.double(substr(x, 1, 2))) %>%
      collect(),
    times,
    tolerance = 1e-6
  )
})

# Checks that the Arrow format_ISO8601 binding agrees with lubridate's
# format_ISO8601 at day and second precision, with and without the UTC offset.
test_that("format_ISO8601", {
  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168

  stamp <- lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6")
  ts_tbl <- tibble(x = c(stamp, NA))

  # Day precision, without and then with the UTC offset.
  expect_dplyr_equal(
    input %>%
      mutate(x = format_ISO8601(x, usetz = FALSE, precision = "ymd")) %>%
      collect(),
    ts_tbl
  )
  expect_dplyr_equal(
    input %>%
      mutate(x = format_ISO8601(x, usetz = TRUE, precision = "ymd")) %>%
      collect(),
    ts_tbl
  )

  # Second precision: Arrow's %S output precision depends on the timestamp
  # unit (see the %S comment in the strftime tests), so a "." followed by
  # zeros is stripped before comparing.
  expect_dplyr_equal(
    input %>%
      mutate(x = format_ISO8601(x, usetz = FALSE, precision = "ymdhms")) %>%
      mutate(x = gsub("\\.0*", "", x)) %>%
      collect(),
    ts_tbl
  )
  expect_dplyr_equal(
    input %>%
      mutate(x = format_ISO8601(x, usetz = TRUE, precision = "ymdhms")) %>%
      mutate(x = gsub("\\.0*", "", x)) %>%
      collect(),
    ts_tbl
  )
})

test_that("arrow_find_substring and arrow_find_substring_regex", {
df <- tibble(x = c("Foo and Bar", "baz and qux and quux"))

Expand Down