diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc
index 59ffee73c97..396eec842ae 100644
--- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc
@@ -529,6 +529,11 @@ struct Strftime {
     const StrftimeOptions& options = StrftimeState::Get(ctx);
 
     auto timezone = GetInputTimezone(type);
+    // This check is due to surprising %c behavior.
+    // See https://github.com/HowardHinnant/date/issues/704
+    if ((options.format.find("%c") != std::string::npos) && (options.locale != "C")) {
+      return Status::Invalid("%c flag is not supported in non-C locales.");
+    }
     if (timezone.empty()) {
       if ((options.format.find("%z") != std::string::npos) ||
           (options.format.find("%Z") != std::string::npos)) {
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 46ba78027b5..b5e2fb2a6c2 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1560,7 +1560,7 @@ def _fix_timestamp(s):
 
     formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H",
                "%I", "%p", "%M", "%z", "%Z", "%j", "%U", "%W", "%c", "%x",
-               "%X", "%%", "%G", "%V", "%u", "%V"]
+               "%X", "%%", "%G", "%V", "%u"]
 
     for timezone in timezones:
         ts = pd.to_datetime(times).tz_localize(timezone)
diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index 808956efe15..a51bf38de23 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -672,6 +672,54 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit
   Expression$create("strptime", x, options = list(format = format, unit = unit))
 }
 
+# Binding for strftime(). When `usetz` is TRUE, " %Z" is appended to `format`;
+# an empty `tz` is replaced by Sys.timezone(). Formatting uses the session's
+# LC_TIME locale.
+nse_funcs$strftime <- function(x, format = "", tz = "", usetz = FALSE) {
+  if (usetz) {
+    format <- paste(format, "%Z")
+  }
+  if (tz == "") {
+    tz <- Sys.timezone()
+  }
+  # Arrow's strftime prints in timezone of the timestamp. To match R's strftime behavior we first
+  # cast the timestamp to desired timezone. This is a metadata only change.
+  ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz)))
+  Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME")))
+}
+
+# Binding for format_ISO8601(). `precision` selects which components are
+# printed (NULL means "ymdhms"); when `usetz` is TRUE the %z flag is appended.
+# Formatting always uses the "C" locale.
+nse_funcs$format_ISO8601 <- function(x, usetz = FALSE, precision = NULL, ...) {
+  ISO8601_precision_map <-
+    list(y = "%Y",
+         ym = "%Y-%m",
+         ymd = "%Y-%m-%d",
+         ymdh = "%Y-%m-%dT%H",
+         ymdhm = "%Y-%m-%dT%H:%M",
+         ymdhms = "%Y-%m-%dT%H:%M:%S")
+
+  if (is.null(precision)) {
+    precision <- "ymdhms"
+  }
+  if (!precision %in% names(ISO8601_precision_map)) {
+    abort(
+      paste0(
+        "`precision` must be one of the following values: ",
+        paste(names(ISO8601_precision_map), collapse = ", "),
+        "\nValue supplied was: ",
+        precision
+      )
+    )
+  }
+  format <- ISO8601_precision_map[[precision]]
+  if (usetz) {
+    format <- paste0(format, "%z")
+  }
+  Expression$create("strftime", x, options = list(format = format, locale = "C"))
+}
+
 nse_funcs$second <- function(x) {
   Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x))
 }
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index c6ba0a28046..02600566672 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -332,6 +332,14 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
         cpp11::as_cpp<arrow::TimeUnit::type>(options["unit"]));
   }
 
+  if (func_name == "strftime") {
+    using Options = arrow::compute::StrftimeOptions;
+    // Both the format and the locale name arrive from R as strings.
+    return std::make_shared<Options>(
+        Options(cpp11::as_cpp<std::string>(options["format"]),
+                cpp11::as_cpp<std::string>(options["locale"])));
+  }
+
   if (func_name == "assume_timezone") {
     using Options = arrow::compute::AssumeTimezoneOptions;
     enum Options::Ambiguous ambiguous;
diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index b6b8f5a714a..5153a4fc088 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -19,6 +19,7 @@ skip_if_not_available("dataset")
 skip_if_not_available("utf8proc")
 
 library(dplyr)
+library(lubridate)
 library(stringr)
 library(stringi)
 
@@ -719,6 +720,114 @@ test_that("errors in strptime", {
   )
 })
 
+test_that("strftime", {
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  # A single timestamp plus an NA.
+  times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA))
+  # Every flag exercised here; %c and %S are checked separately below.
+  formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u"
+
+  # No tz argument.
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = strftime(x, format = formats)) %>%
+      collect(),
+    times
+  )
+
+  # Explicit tz argument.
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = strftime(x, format = formats, tz = "Pacific/Marquesas")) %>%
+      collect(),
+    times
+  )
+
+  # Explicit tz argument with usetz = TRUE.
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = strftime(x, format = formats, tz = "EST", usetz = TRUE)) %>%
+      collect(),
+    times
+  )
+
+  # Explicit tz argument while the session timezone is overridden.
+  withr::with_timezone("Pacific/Marquesas",
+    expect_dplyr_equal(
+      input %>%
+        mutate(x = strftime(x, format = formats, tz = "EST")) %>%
+        collect(),
+      times
+    )
+  )
+
+  # This check is due to differences in the way %c currently works in Arrow and R's strftime.
+  # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved.
+  # The C++ kernel only raises this error when the locale is not "C", so the
+  # expectation is skipped when LC_TIME is "C".
+  if (Sys.getlocale("LC_TIME") != "C") {
+    expect_error(
+      times %>%
+        Table$create() %>%
+        mutate(x = strftime(x, format = "%c")) %>%
+        collect(),
+      "%c flag is not supported in non-C locales.",
+      fixed = TRUE
+    )
+  }
+
+  # Output precision of %S depends on the input timestamp precision.
+  # Timestamps with second precision are represented as integers while
+  # milliseconds, microsecond and nanoseconds are represented as fixed floating
+  # point numbers with 3, 6 and 9 decimal places respectively.
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = strftime(x, format = "%S")) %>%
+      transmute(as.double(substr(x, 1, 2))) %>%
+      collect(),
+    times,
+    tolerance = 1e-6
+  )
+})
+
+test_that("format_ISO8601", {
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+  times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA))
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = format_ISO8601(x, precision = "ymd", usetz = FALSE)) %>%
+      collect(),
+    times
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
+      collect(),
+    times
+  )
+
+  # See comment regarding %S flag in strftime tests
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = FALSE)) %>%
+      mutate(x = gsub("\\.0*", "", x)) %>%
+      collect(),
+    times
+  )
+
+  # See comment regarding %S flag in strftime tests
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
+      mutate(x = gsub("\\.0*", "", x)) %>%
+      collect(),
+    times
+  )
+})
+
 test_that("arrow_find_substring and arrow_find_substring_regex", {
   df <- tibble(x = c("Foo and Bar", "baz and qux and quux"))
 