diff --git a/r/NEWS.md b/r/NEWS.md index 6b2e0532168..0a7d30d2a37 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,6 +24,7 @@ * component extraction functions: `tz()` (timezone), `semester()` (semester), `dst()` (daylight savings time indicator), `date()` (extract date), `epiyear()` (epiyear), improvements to `month()`, which now works with integer inputs. * `make_date()` & `make_datetime()` + `ISOdatetime()` & `ISOdate()` to create date-times from numeric representations. * date-time functionality: + * `difftime` and `as.difftime()` * `as.Date()` to convert to date # arrow 7.0.0 diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 0f031a3c87f..62da029c08a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -188,6 +188,9 @@ register_bindings_datetime <- function() { register_binding("date", function(x) { build_expr("cast", x, options = list(to_type = date32())) }) +} + +register_bindings_duration <- function() { register_binding("make_datetime", function(year = 1970L, month = 1L, day = 1L, @@ -236,6 +239,67 @@ register_bindings_datetime <- function() { tz = "UTC") { call_binding("make_datetime", year, month, day, hour, min, sec, tz) }) + register_binding("difftime", function(time1, + time2, + tz, + units = "secs") { + if (units != "secs") { + abort("`difftime()` with units other than `secs` not supported in Arrow") + } + + if (!missing(tz)) { + warn("`tz` argument is not supported in Arrow, so it will be ignored") + } + + # cast to timestamp if time1 and time2 are not dates or timestamp expressions + # (the subtraction of which would output a `duration`) + if (!call_binding("is.instant", time1)) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) + } + + if (!call_binding("is.instant", time2)) { + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) + } + + # we need to go build the subtract expression instead of `time1 - time2` to + # prevent complaints when we try to subtract an R object from an Expression + subtract_output <- build_expr("-", time1, time2) + build_expr("cast", subtract_output, options = cast_options(to_type = duration("s"))) + }) + register_binding("as.difftime", function(x, + format = "%X", + units = "secs") { + # windows doesn't seem to like "%X" + if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { + format <- "%H:%M:%S" + } + + if (units != "secs") { + abort("`as.difftime()` with units other than 'secs' not supported in Arrow") + } + + if (call_binding("is.character", x)) { + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + # complex casting only due to cast type restrictions: time64 -> int64 -> duration(us) + # and then we cast to duration ("s") at the end + x <- x$cast(time64("us"))$cast(int64())$cast(duration("us")) + } + + # numeric -> duration not supported in Arrow yet so we use int64() as an + # intermediate step + # TODO revisit if https://issues.apache.org/jira/browse/ARROW-15862 results + # in numeric -> duration support + + if (call_binding("is.numeric", x)) { + # coerce x to be int64(). it should work for integer-like doubles and fail + # for pure doubles + # if we abort for all doubles, we risk erroring in cases in which + # coercion to int64() would work + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + } + + build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) + }) } binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 4d7cb3bc63d..01e522e537b 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -106,6 +106,7 @@ create_binding_cache <- function() { register_bindings_aggregate() register_bindings_conditional() register_bindings_datetime() + register_bindings_duration() register_bindings_math() register_bindings_string() register_bindings_type() diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 62d682a600b..6328a4c8276 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1076,3 +1076,140 @@ test_that("ISO_datetime & ISOdate", { ignore_attr = TRUE ) }) + +test_that("difftime works correctly", { + test_df <- tibble( + time1 = as.POSIXct( + c("2021-02-20", "2021-07-31 0:0:0", "2021-10-30", "2021-01-31 0:0:0") + ), + time2 = as.POSIXct( + c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36") + ), + secs = c(121L, 234L, 345L, 456L) + ) + + compare_dplyr_binding( + .input %>% + mutate( + secs2 = difftime(time1, time2, units = "secs") + ) %>% + collect(), + test_df, + ignore_attr = TRUE + ) + + # units other than "secs" not supported in arrow + compare_dplyr_binding( + .input %>% + mutate( + mins = difftime(time1, time2, units = "mins") + ) %>% + collect(), + test_df, + warning = TRUE, + ignore_attr = TRUE + ) + + skip_on_os("windows") + test_df_with_tz <- tibble( + time1 = as.POSIXct( + c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"), + tz = "Pacific/Marquesas" + ), + time2 = as.POSIXct( + c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36"), + tz = "Asia/Kathmandu" + ), + secs = c(121L, 234L, 345L, 456L) + ) + + compare_dplyr_binding( + .input %>% + mutate(secs2 = difftime(time2, time1, units = "secs")) %>% + collect(), + test_df_with_tz + ) + + compare_dplyr_binding( + .input %>% + mutate( + secs2 = difftime( + as.POSIXct("2022-03-07", tz = "Europe/Bucharest"), + time1, + units = "secs" + ) + ) %>% + collect(), + test_df_with_tz + ) + + # `tz` is effectively ignored both in R (used only if inputs are POSIXlt) and Arrow + compare_dplyr_binding( + .input %>% + mutate(secs2 = difftime(time2, time1, units = "secs", tz = "Pacific/Marquesas")) %>% + collect(), + test_df_with_tz, + warning = "`tz` argument is not supported in Arrow, so it will be ignored" + ) +}) + +test_that("as.difftime()", { + test_df <- tibble( + hms_string = c("0:7:45", "12:34:56"), + hm_string = c("7:45", "12:34"), + int = c(30L, 75L), + integerish_dbl = c(31, 76), + dbl = c(31.2, 76.4) + ) + + compare_dplyr_binding( + .input %>% + mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>% + collect(), + test_df + ) + + # TODO add test with `format` mismatch returning NA once + # https://issues.apache.org/jira/browse/ARROW-15659 is solved + # for example: as.difftime("07:", format = "%H:%M") should return NA + compare_dplyr_binding( + .input %>% + mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(int_difftime = as.difftime(int, units = "secs")) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(integerish_dbl_difftime = as.difftime(integerish_dbl, units = "secs")) %>% + collect(), + test_df + ) + + # "mins" or other values for units cannot be handled in Arrow + compare_dplyr_binding( + .input %>% + mutate(int_difftime = as.difftime(int, units = "mins")) %>% + collect(), + test_df, + warning = TRUE + ) + + # only integer (or integer-like) -> duration conversion supported in Arrow. + # double -> duration not supported. we're not testing the content of the + # error message as it is being generated in the C++ code and it might change, + # but we want to make sure that this error is raised in our binding implementation + expect_error( + test_df %>% + arrow_table() %>% + mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% + collect() + ) +})