diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 02ec35bda26..64847f41fd3 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -618,4 +618,40 @@ register_bindings_datetime_parsers <- function() { for (ymd_order in ymd_parser_vec) { register_binding(ymd_order, ymd_parser_map_factory(ymd_order)) } + + register_binding("fast_strptime", function(x, + format, + tz = "UTC", + lt = FALSE, + cutoff_2000 = 68L) { + # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play + # well with mutate, for example) + if (lt) { + arrow_not_supported("`lt = TRUE` argument") + } + + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16596 is done + if (cutoff_2000 != 68L) { + arrow_not_supported("`cutoff_2000` != 68L argument") + } + + parse_attempt_expressions <- list() + + parse_attempt_expressions <- map( + format, + ~ build_expr( + "strptime", + x, + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) + ) + ) + + coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) + + build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) + }) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b1223630153..86b1862ab4c 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1812,3 +1812,133 @@ test_that("ym, my & yq parsers", { test_df ) }) + +test_that("lubridate's fast_strptime", { + + compare_dplyr_binding( + .input %>% + mutate( + y = + fast_strptime( + x, + format = "%Y-%m-%d %H:%M:%S", + lt = FALSE + ) + ) %>% + collect(), + tibble( + x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA) + )#, + # arrow does not preserve the `tzone` attribute + # test ignore_attr = TRUE + ) + + # R object + compare_dplyr_binding( + .input %>% + mutate( + y = + fast_strptime( + "68-10-07 19:04:05", + format = "%y-%m-%d %H:%M:%S", + lt = FALSE + ) + ) %>% + collect(), + tibble( + x = c("2018-10-07 19:04:05", NA) + )#, + # test ignore_attr = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate( + date_multi_formats = + fast_strptime( + x, + format = c("%Y-%m-%d %H:%M:%S", "%m-%d-%Y %H:%M:%S"), + lt = FALSE + ) + ) %>% + collect(), + tibble( + x = c("2018-10-07 19:04:05", "10-07-1968 19:04:05") + ) + ) + + compare_dplyr_binding( + .input %>% + mutate( + dttm_with_tz = fast_strptime( + dttm_as_string, + format = "%Y-%m-%d %H:%M:%S", + tz = "Pacific/Marquesas", + lt = FALSE + ) + ) %>% + collect(), + tibble( + dttm_as_string = + c("2018-10-07 19:04:05", "1969-10-07 19:04:05", NA) + ) + ) + + # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is + # implicitly set to 68L both in lubridate and in Arrow + compare_dplyr_binding( + .input %>% + mutate( + date_short_year = + fast_strptime( + x, + format = "%y-%m-%d %H:%M:%S", + lt = FALSE + ) + ) %>% + collect(), + tibble( + x = + c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) + )#, + # arrow does not preserve the `tzone` attribute + # test ignore_attr = TRUE + ) + + # the arrow binding errors for a value different from 68L for `cutoff_2000` + compare_dplyr_binding( + .input %>% + mutate( + date_short_year = + fast_strptime( + x, + format = "%y-%m-%d %H:%M:%S", + lt = FALSE, + cutoff_2000 = 69L + ) + ) %>% + collect(), + tibble( + x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) + ), + warning = TRUE + ) + + # compare_dplyr_binding would not work here since lt = TRUE returns a list + # and it also errors in regular dplyr pipelines + expect_warning( + tibble( + x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) + ) %>% + arrow_table() %>% + mutate( + date_short_year = + fast_strptime( + x, + format = "%y-%m-%d %H:%M:%S", + lt = TRUE + ) + ) %>% + collect() + ) +})