diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index e3ff5cecebd..ec161d8361c 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -338,3 +338,23 @@ get_stringr_pattern_options <- function(pattern) {
 contains_regex <- function(string) {
   grepl("[.\\|()[{^$*+?]", string)
 }
+
+nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = "ms") {
+  # Arrow binding for strptime(): builds a "strptime" compute Expression over `x`
+  # carrying the `format` and `unit` options.
+  #
+  # Arrow uses a time unit for parsing, base strptime() does not, and Arrow has
+  # no default for either option, so this binding supplies
+  # format = "%Y-%m-%d %H:%M:%S" and unit = "ms" (ARROW-12809).
+
+  # ParseTimestampStrptime currently ignores the timezone information (ARROW-12820).
+  # Stop if tz is provided.
+  if (is.character(tz)) {
+    arrow_not_supported("Time zone argument")
+  }
+
+  # Resolve `unit` through make_valid_time_unit(), accepting time64 and time32 units.
+  unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units))
+
+  Expression$create("strptime", x, options = list(format = format, unit = unit))
+}
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 90c7b4129c7..5d594964294 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -234,6 +234,14 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
                                      max_replacements);
   }
 
+  if (func_name == "strptime") {
+    // Forward the "format" and "unit" entries of the R options list to StrptimeOptions.
+    using Options = arrow::compute::StrptimeOptions;
+    return std::make_shared<Options>(
+        cpp11::as_cpp<std::string>(options["format"]),
+        cpp11::as_cpp<arrow::TimeUnit::type>(options["unit"]));
+  }
+
   if (func_name == "split_pattern" || func_name == "split_pattern_regex") {
     using Options = arrow::compute::SplitPatternOptions;
     int64_t max_splits = -1;
diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index bb4794ef4c5..ea27aa14777 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -493,3 +493,83 @@ test_that("edge cases in string detection and replacement", {
     tibble(x = c("ABC"))
   )
 })
+
+test_that("strptime", {
+
+  t_string <- tibble(x = c("2018-10-07 19:04:05", NA))
+  t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA))
+
+  expect_equal(
+    t_string %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x)
+      ) %>%
+      collect(),
+    t_stamp,
+    check.tzone = FALSE
+  )
+
+  expect_equal(
+    t_string %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x, format = "%Y-%m-%d %H:%M:%S")
+      ) %>%
+      collect(),
+    t_stamp,
+    check.tzone = FALSE
+  )
+
+  expect_equal(
+    t_string %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns")
+      ) %>%
+      collect(),
+    t_stamp,
+    check.tzone = FALSE
+  )
+
+  expect_equal(
+    t_string %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s")
+      ) %>%
+      collect(),
+    t_stamp,
+    check.tzone = FALSE
+  )
+
+  # base::strptime() parses in the session's local time zone unless `tz` is given,
+  # and check.tzone = FALSE ignores only the attribute, not the instant. Pin the
+  # reference value to UTC, like the ymd_hms() reference above, and keep it POSIXct.
+  tstring <- tibble(x = c("08-05-2008", NA))
+  tstamp <- tibble(
+    x = as.POSIXct(strptime(c("08-05-2008", NA), format = "%m-%d-%Y", tz = "UTC"))
+  )
+
+  expect_equal(
+    tstring %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x, format = "%m-%d-%Y")
+      ) %>%
+      collect(),
+    tstamp,
+    check.tzone = FALSE
+  )
+
+})
+
+test_that("errors in strptime", {
+  # Error when tz is passed
+
+  x <- Expression$field_ref("x")
+  expect_error(
+    nse_funcs$strptime(x, tz = "PDT"),
+    'Time zone argument not supported by Arrow'
+  )
+})