From af22f2dab3db46fcf3e7ca0a7e5076d663d9060b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 24 Feb 2022 13:28:52 +0000 Subject: [PATCH 01/51] first pass at `as.difftime()` --- r/R/dplyr-funcs-type.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 1bb633d5322..503ed047dbe 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -121,6 +121,9 @@ register_bindings_type_cast <- function() { } build_expr("cast", x, options = cast_options(to_type = date32())) }) + register_binding("as.difftime", function(x) { + build_expr("cast", x, options = cast_options(to_type = duration())) + }) register_binding("is", function(object, class2) { if (is.string(class2)) { From 4d997a9e7e7da4a436eecb90c56363d168a620bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 25 Feb 2022 09:29:28 +0000 Subject: [PATCH 02/51] some additions to as.difftime --- r/R/dplyr-funcs-type.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 503ed047dbe..96fe6f86ab9 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -121,8 +121,14 @@ register_bindings_type_cast <- function() { } build_expr("cast", x, options = cast_options(to_type = date32())) }) - register_binding("as.difftime", function(x) { - build_expr("cast", x, options = cast_options(to_type = duration())) + register_binding("as.difftime", function(x, + format = "%X", + units = c("secs"), + tz = NULL) { + if (call_binding("is.character", x)) { + x <- call_binding("strptime", x, format = format) + } + build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) }) register_binding("is", function(object, class2) { From a1727a51b1d9fcffa3d2b883bcdcfbc735d83f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Mar 2022 15:55:41 +0000 Subject: [PATCH 03/51] early stage unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 38 ++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 62d682a600b..d20d2a2b49b 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1076,3 +1076,41 @@ test_that("ISO_datetime & ISOdate", { ignore_attr = TRUE ) }) + +test_that("difftime works correctly", { + test_df <- tibble( + time1 = as.POSIXct( + c("2021-02-20", "2021-07-31", "2021-10-31", "2021-01-31"), + tz = "Europe/London"), + time2 = as.POSIXct( + c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-31 00:05:45", "2021-01-31 00:07:36"), + tz = "Europe/London"), + secs = c(121L, 234L, 345L, 456L) + ) + + test_df %>% + arrow_table() %>% + mutate( + secs2 = difftime(time1, time2, units = "secs") + ) %>% + collect() + + test_df %>% + arrow_table() %>% + mutate( + time3 = time1 + secs, + secs3 = difftime(time1, time3, units = "secs") + ) %>% + collect() + + + compare_dplyr_binding( + .input %>% + mutate( + nd = dates + secs, + secs2 = difftime(nd, dates, units = "secs") + ) %>% + collect(), + test_df + ) +}) From e6b93bc2702b5abb1d486789e2a69d4f5ff6ace8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Mar 2022 16:30:37 +0000 Subject: [PATCH 04/51] first pass at `difftime()` --- r/R/dplyr-funcs-datetime.R | 20 +++++++++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 35 ++++++++++++++++++-- r/tests/testthat/test-dplyr-funcs-type.R | 4 +++ 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 0f031a3c87f..c43a33e7eb2 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -187,6 +187,26 @@ register_bindings_datetime <- function() { }) register_binding("date", function(x) { build_expr("cast", x, options = list(to_type = date32())) + }) + register_binding("difftime", function(time1, + time2, + tz, + units = c("auto", "secs", "mins", + "hours", "days", "weeks")) { + # browser() + units <- match.arg(units) + if (units == "secs") { + # NB order of the args is different in the C++ kernel vs base::difftime() + build_expr("seconds_between", time2, time1) + } else if (units == "mins") { + build_expr("minutes_between", time2, time1) + } else if (units == "hours") { + build_expr("hours_between", time2, time1) + } else if (units == "days") { + build_expr("days_between", time2, time1) + } else if (units == "weeks") { + build_expr("weeks_between", time2, time1) + } }) register_binding("make_datetime", function(year = 1970L, month = 1L, diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index d20d2a2b49b..fb7e37a3865 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1085,9 +1085,18 @@ test_that("difftime works correctly", { time2 = as.POSIXct( c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-31 00:05:45", "2021-01-31 00:07:36"), tz = "Europe/London"), - secs = c(121L, 234L, 345L, 456L) + time3 = as.POSIXct( + c("2021-02-20 02:01:00", "2021-07-31 03:54:00", "2021-10-31 05:45:00", "2021-01-31 07:36:00"), + tz = "Europe/London"), + secs = c(121L, 234L, 345L, 456L), + mins = c(121L, 234L, 345L, 456L) ) + test_df %>% + arrow_table() %>% + mutate(mins2 = difftime(time1, time3, units = "mins")) %>% + collect() + test_df %>% arrow_table() %>% mutate( @@ -1095,6 +1104,7 @@ test_that("difftime works correctly", { ) %>% collect() + # fails but not because of difftime, but because of time + secs test_df %>% arrow_table() %>% mutate( @@ -1103,12 +1113,31 @@ test_that("difftime works correctly", { ) %>% collect() + # fails due to different attributes + compare_dplyr_binding( + .input %>% + mutate( + secs2 = difftime(time1, time2, units = "secs") + ) %>% + collect(), + test_df + ) + + # passes + compare_dplyr_binding( + .input %>% + mutate( + secs2 = difftime(time1, time2, units = "secs") + ) %>% + collect(), + test_df, + ignore_attr = TRUE + ) compare_dplyr_binding( .input %>% mutate( - nd = dates + secs, - secs2 = difftime(nd, dates, units = "secs") + mins2 = difftime(time1, time3, units = "mins") ) %>% collect(), test_df diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 6c9d9ac07a4..3ff538776a8 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -976,3 +976,7 @@ test_that("format() for unsupported types returns the input as string", { collect() ) }) + +test_that("conversion with as.difftime() works", { + +}) From 88444f5cfdfbab5314d3089cdf8423379ea010fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Mar 2022 16:36:42 +0000 Subject: [PATCH 05/51] pop --- r/tests/testthat/test-dplyr-funcs-datetime.R | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index fb7e37a3865..55747ce4b68 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1137,7 +1137,24 @@ test_that("difftime works correctly", { compare_dplyr_binding( .input %>% mutate( - mins2 = difftime(time1, time3, units = "mins") + mins2 = difftime(time1, time3, units = "mins")) %>% + collect(), + test_df + ) + + test_df %>% + # arrow_table() %>% + mutate( + nd = dates + secs, + secs2 = difftime(nd, dates, units = "secs") + ) %>% + collect() + + compare_dplyr_binding( + .input %>% + mutate( + nd = dates + secs, + secs2 = difftime(nd, dates, units = "secs") ) %>% collect(), test_df From ead654b0cdd40f9ebc656156b03daceaef417ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Mar 2022 17:58:18 +0000 Subject: [PATCH 06/51] make tests pass --- r/tests/testthat/test-dplyr-funcs-datetime.R | 87 +++++++++----------- 1 file changed, 40 insertions(+), 47 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 55747ce4b68..116ec0b5876 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1080,48 +1080,48 @@ test_that("ISO_datetime & ISOdate", { test_that("difftime works correctly", { test_df <- tibble( time1 = as.POSIXct( - c("2021-02-20", "2021-07-31", "2021-10-31", "2021-01-31"), + c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"), tz = "Europe/London"), time2 = as.POSIXct( - c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-31 00:05:45", "2021-01-31 00:07:36"), + c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36"), tz = "Europe/London"), time3 = as.POSIXct( - c("2021-02-20 02:01:00", "2021-07-31 03:54:00", "2021-10-31 05:45:00", "2021-01-31 07:36:00"), + c("2021-02-20 02:01:00", "2021-07-31 03:54:00", "2021-10-30 05:45:00", "2021-01-31 07:36:00"), tz = "Europe/London"), secs = c(121L, 234L, 345L, 456L), mins = c(121L, 234L, 345L, 456L) ) - test_df %>% - arrow_table() %>% - mutate(mins2 = difftime(time1, time3, units = "mins")) %>% - collect() - - test_df %>% - arrow_table() %>% - mutate( - secs2 = difftime(time1, time2, units = "secs") - ) %>% - collect() + # test_df %>% + # arrow_table() %>% + # mutate(mins2 = difftime(time1, time3, units = "mins")) %>% + # collect() + # + # test_df %>% + # arrow_table() %>% + # mutate( + # secs2 = difftime(time1, time2, units = "secs") + # ) %>% + # collect() # fails but not because of difftime, but because of time + secs - test_df %>% - arrow_table() %>% - mutate( - time3 = time1 + secs, - secs3 = difftime(time1, time3, units = "secs") - ) %>% - collect() + # test_df %>% + # arrow_table() %>% + # mutate( + # time3 = time1 + secs, + # secs3 = difftime(time1, time3, units = "secs") + # ) %>% + # collect() # fails due to different attributes - compare_dplyr_binding( - .input %>% - mutate( - secs2 = difftime(time1, time2, units = "secs") - ) %>% - collect(), - test_df - ) + # compare_dplyr_binding( + # .input %>% + # mutate( + # secs2 = difftime(time1, time2, units = "secs") + # ) %>% + # collect(), + # test_df + # ) # passes compare_dplyr_binding( @@ -1139,24 +1139,17 @@ test_that("difftime works correctly", { mutate( mins2 = difftime(time1, time3, units = "mins")) %>% collect(), - test_df + test_df, + ignore_attr = TRUE ) - test_df %>% - # arrow_table() %>% - mutate( - nd = dates + secs, - secs2 = difftime(nd, dates, units = "secs") - ) %>% - collect() - - compare_dplyr_binding( - .input %>% - mutate( - nd = dates + secs, - secs2 = difftime(nd, dates, units = "secs") - ) %>% - collect(), - test_df - ) + # compare_dplyr_binding( + # .input %>% + # mutate( + # nd = dates + secs, + # secs2 = difftime(nd, dates, units = "secs") + # ) %>% + # collect(), + # test_df + # ) }) From b0743aec81863e36b24e75cc2ad4d65c05d61dd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Mar 2022 20:41:55 +0000 Subject: [PATCH 07/51] test without timezones to see if Windows still complains --- r/tests/testthat/test-dplyr-funcs-datetime.R | 26 ++++++++++++++------ 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 116ec0b5876..7f6489c8ccb 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1080,18 +1080,30 @@ test_that("ISO_datetime & ISOdate", { test_that("difftime works correctly", { test_df <- tibble( time1 = as.POSIXct( - c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"), - tz = "Europe/London"), + c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31")#, + # tz = "Europe/London" + ), time2 = as.POSIXct( - c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36"), - tz = "Europe/London"), + c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36")#, + # tz = "Europe/London" + ), time3 = as.POSIXct( - c("2021-02-20 02:01:00", "2021-07-31 03:54:00", "2021-10-30 05:45:00", "2021-01-31 07:36:00"), - tz = "Europe/London"), + c("2021-02-20 02:01:00", "2021-07-31 03:54:00", "2021-10-30 05:45:00", "2021-01-31 07:36:00")#, + # tz = "Europe/London" + ), + time4 = as.POSIXct( + c("2021-02-25 01:00:00", "2021-08-09 18:00:00", "2021-11-13 09:00:00", "2021-02-19 00:00:00") + ), secs = c(121L, 234L, 345L, 456L), - mins = c(121L, 234L, 345L, 456L) + mins = c(121L, 234L, 345L, 456L), + hours = c(121L, 234L, 345L, 456L), + days = c(121L, 234L, 345L, 456L), + weeks = c(121L, 234L, 345L, 456L) ) + test_df %>% + mutate(hours2 = difftime(time4, time1, units = "hours")) + # test_df %>% # arrow_table() %>% # mutate(mins2 = difftime(time1, time3, units = "mins")) %>% From 3953f43d43bdb66a2b96b89b47d28749e9e05deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 7 Mar 2022 15:53:02 +0000 Subject: [PATCH 08/51] switched to `subtract` kernel for the implementation of `difftime` --- r/R/dplyr-funcs-datetime.R | 28 +++--- r/tests/testthat/test-dplyr-funcs-datetime.R | 92 ++++++-------------- 2 files changed, 43 insertions(+), 77 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index c43a33e7eb2..51bc42153a6 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -187,26 +187,26 @@ register_bindings_datetime <- function() { }) register_binding("date", function(x) { build_expr("cast", x, options = list(to_type = date32())) - }) + }) register_binding("difftime", function(time1, time2, tz, units = c("auto", "secs", "mins", "hours", "days", "weeks")) { - # browser() - units <- match.arg(units) - if (units == "secs") { - # NB order of the args is different in the C++ kernel vs base::difftime() - build_expr("seconds_between", time2, time1) - } else if (units == "mins") { - build_expr("minutes_between", time2, time1) - } else if (units == "hours") { - build_expr("hours_between", time2, time1) - } else if (units == "days") { - build_expr("days_between", time2, time1) - } else if (units == "weeks") { - build_expr("weeks_between", time2, time1) + + if (units != "secs") { + abort("`difftime()` with units other than seconds not supported in Arrow") + } + + if (missing(tz)) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(unit = "s"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(unit = "s"))) + } else { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) } + + build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) }) register_binding("make_datetime", function(year = 1970L, month = 1L, diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 7f6489c8ccb..1c2796098a7 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1080,62 +1080,14 @@ test_that("ISO_datetime & ISOdate", { test_that("difftime works correctly", { test_df <- tibble( time1 = as.POSIXct( - c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31")#, - # tz = "Europe/London" + c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31") ), time2 = as.POSIXct( - c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36")#, - # tz = "Europe/London" + c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36") ), - time3 = as.POSIXct( - c("2021-02-20 02:01:00", "2021-07-31 03:54:00", "2021-10-30 05:45:00", "2021-01-31 07:36:00")#, - # tz = "Europe/London" - ), - time4 = as.POSIXct( - c("2021-02-25 01:00:00", "2021-08-09 18:00:00", "2021-11-13 09:00:00", "2021-02-19 00:00:00") - ), - secs = c(121L, 234L, 345L, 456L), - mins = c(121L, 234L, 345L, 456L), - hours = c(121L, 234L, 345L, 456L), - days = c(121L, 234L, 345L, 456L), - weeks = c(121L, 234L, 345L, 456L) - ) - - test_df %>% - mutate(hours2 = difftime(time4, time1, units = "hours")) - - # test_df %>% - # arrow_table() %>% - # mutate(mins2 = difftime(time1, time3, units = "mins")) %>% - # collect() - # - # test_df %>% - # arrow_table() %>% - # mutate( - # secs2 = difftime(time1, time2, units = "secs") - # ) %>% - # collect() - - # fails but not because of difftime, but because of time + secs - # test_df %>% - # arrow_table() %>% - # mutate( - # time3 = time1 + secs, - # secs3 = difftime(time1, time3, units = "secs") - # ) %>% - # collect() - - # fails due to different attributes - # compare_dplyr_binding( - # .input %>% - # mutate( - # secs2 = difftime(time1, time2, units = "secs") - # ) %>% - # collect(), - # test_df - # ) - - # passes + secs = c(121L, 234L, 345L, 456L) + ) + compare_dplyr_binding( .input %>% mutate( @@ -1146,22 +1098,36 @@ test_that("difftime works correctly", { ignore_attr = TRUE ) + # units other than "secs" not supported in arrow compare_dplyr_binding( .input %>% mutate( - mins2 = difftime(time1, time3, units = "mins")) %>% + mins = difftime(time1, time2, units = "mins"), + hours = difftime(time1, time2, units = "hours"), + days = difftime(time1, time2, units = "days"), + weeks = difftime(time1, time2, units = "weeks")) %>% collect(), test_df, + warning = TRUE, ignore_attr = TRUE ) - # compare_dplyr_binding( - # .input %>% - # mutate( - # nd = dates + secs, - # secs2 = difftime(nd, dates, units = "secs") - # ) %>% - # collect(), - # test_df - # ) + test_df_with_tz <- tibble( + time1 = as.POSIXct( + c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"), + tz = "Europe/London" + ), + time2 = as.POSIXct( + c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36"), + tz = "America/Chicago" + ), + secs = c(121L, 234L, 345L, 456L) + ) + + compare_dplyr_binding( + .input %>% + mutate(secs2 = difftime(time2, time1, units = "secs", tz = "Pacific/Marquesas")) %>% + collect(), + test_df_with_tz + ) }) From f48eb155cfc9996c3e916055835cde9ab1891d19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 7 Mar 2022 15:54:34 +0000 Subject: [PATCH 09/51] skip timezone tests on windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 1c2796098a7..4b2e4ffdb04 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1112,6 +1112,7 @@ test_that("difftime works correctly", { ignore_attr = TRUE ) + skip_on_os("windows") test_df_with_tz <- tibble( time1 = as.POSIXct( c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"), From 7a45459c07e9dfc6aeeae53c6ad3d3778c76cbf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 7 Mar 2022 17:09:17 +0000 Subject: [PATCH 10/51] add unit tests passing a regular R object --- r/tests/testthat/test-dplyr-funcs-datetime.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 4b2e4ffdb04..d7d760eeb51 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1098,6 +1098,16 @@ test_that("difftime works correctly", { ignore_attr = TRUE ) + compare_dplyr_binding( + .input %>% + mutate( + secs2 = difftime(as.POSIXct("2022-03-07"), time1, units = "secs") + ) %>% + collect(), + test_df, + ignore_attr = TRUE + ) + # units other than "secs" not supported in arrow compare_dplyr_binding( .input %>% From 4f602e9b0f8c72304f4936738192188d059b1a35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 7 Mar 2022 17:09:49 +0000 Subject: [PATCH 11/51] match unit arg --- r/R/dplyr-funcs-datetime.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 51bc42153a6..eef53894602 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -194,6 +194,7 @@ register_bindings_datetime <- function() { units = c("auto", "secs", "mins", "hours", "days", "weeks")) { + units <- match.arg(units) if (units != "secs") { abort("`difftime()` with units other than seconds not supported in Arrow") } From 20ac204b948a178e4c82bc968c015e6603c1cd4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 7 Mar 2022 17:57:30 +0000 Subject: [PATCH 12/51] add unit test for difftime + tz + R object --- r/tests/testthat/test-dplyr-funcs-datetime.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index d7d760eeb51..5ac55c789dc 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1141,4 +1141,18 @@ test_that("difftime works correctly", { collect(), test_df_with_tz ) + + compare_dplyr_binding( + .input %>% + mutate( + secs2 = + difftime( + as.POSIXct("2022-03-07", tz = "Europe/Bucharest"), + time1, + units = "secs", + tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df_with_tz + ) }) From 9de212eafef087c0d8e92f0476ae74ebfa400e90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 8 Mar 2022 11:16:40 +0000 Subject: [PATCH 13/51] add a bit more substance to `as.difftime()` --- r/R/dplyr-funcs-type.R | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 96fe6f86ab9..fce488783af 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -123,11 +123,25 @@ register_bindings_type_cast <- function() { }) register_binding("as.difftime", function(x, format = "%X", - units = c("secs"), - tz = NULL) { + units = "auto", + tz = "UTC") { + if (units != "secs") { + abort("`as.difftime()` with units other than seconds not supported in Arrow") + } + if (call_binding("is.character", x)) { - x <- call_binding("strptime", x, format = format) + x <- build_expr("strptime", x, options = list(format = format, tz = tz, units = "s")) + y <- build_expr("strptime", "0:0:0", options = list(format = "%X", tz = tz, units = "s")) + diff_x_y <- call_binding("difftime", x - y, units = "secs", tz = tz) + return(diff_x_y) } + + # numeric -> duration not supported in Arrow yet + # https://issues.apache.org/jira/browse/ARROW-15862 + if (call_binding("is.numeric")) { + abort("`as.difftime()` with integer inputs not supported in Arrow ") + } + build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) }) From bc99d535486c496a7322e91823d5649e33ec5dff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 8 Mar 2022 13:03:58 +0000 Subject: [PATCH 14/51] `as.difftime()` can handle integers --- r/R/dplyr-funcs-datetime.R | 16 ++-- r/R/dplyr-funcs-type.R | 15 +++- r/tests/testthat/test-dplyr-funcs-type.R | 103 ++++------------------- 3 files changed, 38 insertions(+), 96 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index eef53894602..14ad6214f2b 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -193,18 +193,20 @@ register_bindings_datetime <- function() { tz, units = c("auto", "secs", "mins", "hours", "days", "weeks")) { - units <- match.arg(units) if (units != "secs") { abort("`difftime()` with units other than seconds not supported in Arrow") } - if (missing(tz)) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(unit = "s"))) - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(unit = "s"))) - } else { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) + # for time32() we do not need to worry about timezone + if (call_binding("is.instant", time1) & call_binding("is.instant", time1)) { + if (missing(tz)) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(unit = "s"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(unit = "s"))) + } else { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) + } } build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index fce488783af..9850d369b2f 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -130,16 +130,23 @@ register_bindings_type_cast <- function() { } if (call_binding("is.character", x)) { - x <- build_expr("strptime", x, options = list(format = format, tz = tz, units = "s")) - y <- build_expr("strptime", "0:0:0", options = list(format = "%X", tz = tz, units = "s")) - diff_x_y <- call_binding("difftime", x - y, units = "secs", tz = tz) + x <- build_expr("strptime", x, options = list(format = format, tz = tz, unit = 0L)) + y <- build_expr("strptime", "0:0:0", options = list(format = "%X", tz = tz, unit = 0L)) + diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) return(diff_x_y) } # numeric -> duration not supported in Arrow yet # https://issues.apache.org/jira/browse/ARROW-15862 - if (call_binding("is.numeric")) { + if (call_binding("is.numeric", x)) { + if (call_binding("is.integer", x)) { + x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) + y <- build_expr("cast", 0L, options = cast_options(to_type = time32(unit = "s"))) + diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) + return(diff_x_y) + } else { abort("`as.difftime()` with integer inputs not supported in Arrow ") + } } build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 3ff538776a8..e55d5ec4f37 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -875,108 +875,41 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) }) -test_that("format date/time", { - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - - times <- tibble( - datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA), - date = c(as.Date("2021-01-01"), NA) - ) - formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" - formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u" - compare_dplyr_binding( - .input %>% - mutate(x = format(datetime, format = formats)) %>% - collect(), - times - ) - - compare_dplyr_binding( - .input %>% - mutate(x = format(date, format = formats_date)) %>% - collect(), - times +test_that("as.difftime() works properly", { + test_df <- tibble( + hms_string = c("0:7:45", "12:34:56"), + hm_string = c("7:45", "12:34"), + int = c(30L, 75L), + dbl = c(30, 75) ) compare_dplyr_binding( - .input %>% - mutate(x = format(datetime, format = formats, tz = "Europe/Bucharest")) %>% + .input %>% + mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>% collect(), - times + test_df ) compare_dplyr_binding( .input %>% - mutate(x = format(datetime, format = formats, tz = "EST", usetz = TRUE)) %>% + mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% collect(), - times + test_df ) compare_dplyr_binding( .input %>% - mutate(x = format(1), - y = format(13.7, nsmall = 3)) %>% + mutate(int_difftime = as.difftime(int, units = "secs")) %>% collect(), - times + test_df ) compare_dplyr_binding( .input %>% - mutate(start_date = format(as.POSIXct("2022-01-01 01:01:00"))) %>% - collect(), - times - ) - - withr::with_timezone( - "Pacific/Marquesas", - { - compare_dplyr_binding( - .input %>% - mutate( - x = format(datetime, format = formats, tz = "EST"), - x_date = format(date, format = formats_date, tz = "EST") - ) %>% - collect(), - times - ) - - compare_dplyr_binding( - .input %>% - mutate( - x = format(datetime, format = formats), - x_date = format(date, format = formats_date) - ) %>% - collect(), - times - ) - } - ) -}) - -test_that("format() for unsupported types returns the input as string", { - expect_equal( - example_data %>% - record_batch() %>% - mutate(x = format(int)) %>% - collect(), - example_data %>% - record_batch() %>% - mutate(x = as.character(int)) %>% - collect() - ) - expect_equal( - example_data %>% - arrow_table() %>% - mutate(y = format(dbl)) %>% - collect(), - example_data %>% - arrow_table() %>% - mutate(y = as.character(dbl)) %>% - collect() - ) -}) - -test_that("conversion with as.difftime() works", { - + mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% + collect(), + test_df, + warning = TRUE +) }) From cbcd28f41a319592c30c008bbebc2c30d0e15c13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 8 Mar 2022 13:21:24 +0000 Subject: [PATCH 15/51] updated unit tests & error message --- r/R/dplyr-funcs-type.R | 2 +- r/tests/testthat/test-dplyr-funcs-type.R | 22 ++++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 9850d369b2f..8a07c3c44ff 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -145,7 +145,7 @@ register_bindings_type_cast <- function() { diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) return(diff_x_y) } else { - abort("`as.difftime()` with integer inputs not supported in Arrow ") + abort("`as.difftime()` with double/float inputs not supported in Arrow ") } } diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index e55d5ec4f37..949e1c37386 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -881,7 +881,7 @@ test_that("as.difftime() works properly", { hms_string = c("0:7:45", "12:34:56"), hm_string = c("7:45", "12:34"), int = c(30L, 75L), - dbl = c(30, 75) + dbl = c(31, 76) ) compare_dplyr_binding( @@ -905,11 +905,21 @@ test_that("as.difftime() works properly", { test_df ) + # coercing doubles to difftime/duration is not supported in Arrow compare_dplyr_binding( .input %>% - mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% - collect(), - test_df, - warning = TRUE -) + mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% + collect(), + test_df, + warning = TRUE + ) + + # "mins" or other values for units cannot be handled in Arrow + compare_dplyr_binding( + .input %>% + mutate(int_difftime = as.difftime(int, units = "mins")) %>% + collect(), + test_df, + warning = TRUE + ) }) From 22c88392f57d4244b5ba2fdd380dcdf0d3df4456 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 8 Mar 2022 14:19:39 +0000 Subject: [PATCH 16/51] added comment to a unit test --- r/tests/testthat/test-dplyr-funcs-type.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 949e1c37386..8a05875ba45 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -891,6 +891,9 @@ test_that("as.difftime() works properly", { test_df ) + # TODO add test with `format` mismatch returning NA once + # https://issues.apache.org/jira/browse/ARROW-15659 is solved + # as.difftime("07:", format = "%H:%M") compare_dplyr_binding( .input %>% mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% From 6a78ea93f9a7b147d4f3c8c791483836c23b320b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 8 Mar 2022 14:30:57 +0000 Subject: [PATCH 17/51] another try --- r/R/dplyr-funcs-type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 8a07c3c44ff..3bbd3cbbde2 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -131,7 +131,7 @@ register_bindings_type_cast <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, tz = tz, unit = 0L)) - y <- build_expr("strptime", "0:0:0", options = list(format = "%X", tz = tz, unit = 0L)) + y <- build_expr("strptime", "0:0", options = list(format = "%H:%M", tz = tz, unit = 0L)) diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) return(diff_x_y) } From 2b00aa28231677e493c75a5cc095ab480416a7da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 9 Mar 2022 09:05:58 +0000 Subject: [PATCH 18/51] replace `"%X"` with `"%H:%M:%S"` on windows + update comments --- r/R/dplyr-funcs-type.R | 9 +++++++-- r/tests/testthat/test-dplyr-funcs-type.R | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 3bbd3cbbde2..96b6b3638b7 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -125,6 +125,10 @@ register_bindings_type_cast <- function() { format = "%X", units = "auto", tz = "UTC") { + if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { + format <- "%H:%M:%S" + } + if (units != "secs") { abort("`as.difftime()` with units other than seconds not supported in Arrow") } @@ -136,8 +140,9 @@ register_bindings_type_cast <- function() { return(diff_x_y) } - # numeric -> duration not supported in Arrow yet - # https://issues.apache.org/jira/browse/ARROW-15862 + # numeric -> duration not supported in Arrow yet so we use time23() as + # intermediate step + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15862 done if (call_binding("is.numeric", x)) { if (call_binding("is.integer", x)) { x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 8a05875ba45..8964c39b94e 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -893,7 +893,7 @@ test_that("as.difftime() works properly", { # TODO add test with `format` mismatch returning NA once # https://issues.apache.org/jira/browse/ARROW-15659 is solved - # as.difftime("07:", format = "%H:%M") + # for example: as.difftime("07:", format = "%H:%M") should return NA compare_dplyr_binding( .input %>% mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% From b1caedcd76f55d329ea37be8b5d35af7052d1717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 9 Mar 2022 10:23:14 +0000 Subject: [PATCH 19/51] `"%H:%M:%S"` for consistency --- r/R/dplyr-funcs-type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 96b6b3638b7..9fdce96e2b2 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -135,7 +135,7 @@ register_bindings_type_cast <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, tz = tz, unit = 0L)) - y <- build_expr("strptime", "0:0", options = list(format = "%H:%M", tz = tz, unit = 0L)) + y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", tz = tz, unit = 0L)) diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) return(diff_x_y) } From 70d4a0d7dbde9e500b29b7e0d46a06f06eb6169a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 9 Mar 2022 10:23:14 +0000 Subject: [PATCH 20/51] comment + `"%H:%M:%S"` for consistency --- r/R/dplyr-funcs-type.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 9fdce96e2b2..a273e3f0efc 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -125,6 +125,7 @@ register_bindings_type_cast <- function() { format = "%X", units = "auto", tz = "UTC") { + # windows doesn't seem to like "%X" if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { format <- "%H:%M:%S" } From f13184b471563fdd6e9ae8dfc1b5d2c55e124f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 10 Mar 2022 11:41:35 +0000 Subject: [PATCH 21/51] separate `compare_dplyr_binding()` blocks for the unsuported units --- r/tests/testthat/test-dplyr-funcs-datetime.R | 39 ++++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 5ac55c789dc..ea1a7dd9ba3 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1112,10 +1112,41 @@ test_that("difftime works correctly", { compare_dplyr_binding( .input %>% mutate( - mins = difftime(time1, time2, units = "mins"), - hours = difftime(time1, time2, units = "hours"), - days = difftime(time1, time2, units = "days"), - weeks = difftime(time1, time2, units = "weeks")) %>% + mins = difftime(time1, time2, units = "mins") + ) %>% + collect(), + test_df, + warning = TRUE, + ignore_attr = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate( + hours = difftime(time1, time2, units = "hours") + ) %>% + collect(), + test_df, + warning = TRUE, + ignore_attr = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate( + days = difftime(time1, time2, units = "days") + ) %>% + collect(), + test_df, + warning = TRUE, + ignore_attr = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate( + weeks = difftime(time1, time2, units = "weeks") + ) %>% collect(), test_df, warning = TRUE, From a797f9bd240428f4335596f166cce9a8a90e79dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 10 Mar 2022 11:42:01 +0000 Subject: [PATCH 22/51] typo + simplified logic for integer vs double --- r/R/dplyr-funcs-type.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index a273e3f0efc..dd8f2c02e80 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -141,18 +141,18 @@ register_bindings_type_cast <- function() { return(diff_x_y) } - # numeric -> duration not supported in Arrow yet so we use time23() as + # numeric -> duration not supported in Arrow yet so we use time32() as # intermediate step # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15862 done - if (call_binding("is.numeric", x)) { - if (call_binding("is.integer", x)) { - x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) - y <- build_expr("cast", 0L, options = cast_options(to_type = time32(unit = "s"))) - diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) - return(diff_x_y) - } else { + if (call_binding("is.integer", x)) { + x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) + y <- build_expr("cast", 0L, options = cast_options(to_type = time32(unit = "s"))) + diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) + return(diff_x_y) + } + + if (call_binding("is.double")) { abort("`as.difftime()` with double/float inputs not supported in Arrow ") - } } build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) From 047f5d03aa6c60e6cba5f8eb0a0e56282833c25e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 10 Mar 2022 12:02:03 +0000 Subject: [PATCH 23/51] style --- r/tests/testthat/test-dplyr-funcs-datetime.R | 20 ++++++++++---------- r/tests/testthat/test-dplyr-funcs-type.R | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index ea1a7dd9ba3..8c5e526a94d 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1167,21 +1167,21 @@ test_that("difftime works correctly", { ) compare_dplyr_binding( - .input %>% - mutate(secs2 = difftime(time2, time1, units = "secs", tz = "Pacific/Marquesas")) %>% - collect(), - test_df_with_tz + .input %>% + mutate(secs2 = difftime(time2, time1, units = "secs", tz = "Pacific/Marquesas")) %>% + collect(), + test_df_with_tz ) compare_dplyr_binding( .input %>% mutate( - secs2 = - difftime( - as.POSIXct("2022-03-07", tz = "Europe/Bucharest"), - time1, - units = "secs", - tz = "Pacific/Marquesas") + secs2 = difftime( + as.POSIXct("2022-03-07", tz = "Europe/Bucharest"), + time1, + units = "secs", + tz = "Pacific/Marquesas" + ) ) %>% collect(), test_df_with_tz diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 8964c39b94e..62c2311bcc1 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -886,8 +886,8 @@ test_that("as.difftime() works properly", { compare_dplyr_binding( .input %>% - mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>% - collect(), + mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>% + collect(), test_df ) From 11d0dec9cbefe028d846f979e1b1a1f5731aab96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 11 Mar 2022 13:30:10 +0000 Subject: [PATCH 24/51] typo + simplified implementation --- r/R/dplyr-funcs-datetime.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 14ad6214f2b..f1d8723026a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -199,17 +199,16 @@ register_bindings_datetime <- function() { } # for time32() we do not need to worry about timezone - if (call_binding("is.instant", time1) & call_binding("is.instant", time1)) { + if (call_binding("is.instant", time1) & call_binding("is.instant", time2)) { if (missing(tz)) { time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(unit = "s"))) time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(unit = "s"))) - } else { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) } + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) } - build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) + time1 - time2 }) register_binding("make_datetime", function(year = 1970L, month = 1L, From b1b3891399fc0cd1852ec7d5752a7f9c49c6522d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 11 Mar 2022 14:42:11 +0000 Subject: [PATCH 25/51] switched back to casting as the simplified version was causing some CI failures --- r/R/dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index f1d8723026a..76b215edd4f 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -208,7 +208,7 @@ register_bindings_datetime <- function() { time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) } - time1 - time2 + build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) }) register_binding("make_datetime", function(year = 1970L, month = 1L, From 694d4be4b9a06f6dca838777eaa27bf5840bd158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 11 Mar 2022 15:03:28 +0000 Subject: [PATCH 26/51] one more step back --- r/R/dplyr-funcs-datetime.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 76b215edd4f..d1ea93c7fcc 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -203,9 +203,10 @@ register_bindings_datetime <- function() { if (missing(tz)) { time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(unit = "s"))) time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(unit = "s"))) + } else { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) } - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) } build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) From d8eeafea89bf9df2d4c3ab47b7798463f0fbf5b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 11 Mar 2022 18:54:33 +0000 Subject: [PATCH 27/51] go via `int64()` instead of a difference between 2 `time32()` --- r/R/dplyr-funcs-type.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index dd8f2c02e80..ec3852367f6 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -145,10 +145,14 @@ register_bindings_type_cast <- function() { # intermediate step # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15862 done if (call_binding("is.integer", x)) { - x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) - y <- build_expr("cast", 0L, options = cast_options(to_type = time32(unit = "s"))) - diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) - return(diff_x_y) + # x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) + # y <- build_expr("cast", 0L, options = cast_options(to_type = time32(unit = "s"))) + # diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) + # return(diff_x_y) + # or we could go via int64() + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + x <- build_expr("cast", x, options = cast_options(to_type = duration("s"))) + return(x) } if (call_binding("is.double")) { From aa3a366f02e4f01e85dc7f6059463cc6f6b4570e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 11 Mar 2022 19:06:01 +0000 Subject: [PATCH 28/51] move `as.difftime()` definition and tests from `...-type.R` to `...-datetime.R` --- r/R/dplyr-funcs-datetime.R | 41 +++++++++++++++ r/R/dplyr-funcs-type.R | 40 --------------- r/tests/testthat/test-dplyr-funcs-datetime.R | 51 +++++++++++++++++++ r/tests/testthat/test-dplyr-funcs-type.R | 52 -------------------- 4 files changed, 92 insertions(+), 92 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index d1ea93c7fcc..ee0bf1daf67 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -279,4 +279,45 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { } build_expr("strftime", x, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + + register_binding("as.difftime", function(x, + format = "%X", + units = "auto", + tz = "UTC") { + # windows doesn't seem to like "%X" + if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { + format <- "%H:%M:%S" + } + + if (units != "secs") { + abort("`as.difftime()` with units other than seconds not supported in Arrow") + } + + if (call_binding("is.character", x)) { + x <- build_expr("strptime", x, options = list(format = format, tz = tz, unit = 0L)) + y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", tz = tz, unit = 0L)) + diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) + return(diff_x_y) + } + + # numeric -> duration not supported in Arrow yet so we use time32() as + # intermediate step + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15862 done + if (call_binding("is.integer", x)) { + # x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) + # y <- build_expr("cast", 0L, options = cast_options(to_type = time32(unit = "s"))) + # diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) + # return(diff_x_y) + # or we could go via int64() + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + x <- build_expr("cast", x, options = cast_options(to_type = duration("s"))) + return(x) + } + + if (call_binding("is.double")) { + abort("`as.difftime()` with double/float inputs not supported in Arrow ") + } + + build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) + }) } diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index ec3852367f6..1bb633d5322 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -121,46 +121,6 @@ register_bindings_type_cast <- function() { } build_expr("cast", x, options = cast_options(to_type = date32())) }) - register_binding("as.difftime", function(x, - format = "%X", - units = "auto", - tz = "UTC") { - # windows doesn't seem to like "%X" - if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { - format <- "%H:%M:%S" - } - - if (units != "secs") { - abort("`as.difftime()` with units other than seconds not supported in Arrow") - } - - if (call_binding("is.character", x)) { - x <- build_expr("strptime", x, options = list(format = format, tz = tz, unit = 0L)) - y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", tz = tz, unit = 0L)) - diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) - return(diff_x_y) - } - - # numeric -> duration not supported in Arrow yet so we use time32() as - # intermediate step - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15862 done - if (call_binding("is.integer", x)) { - # x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) - # y <- build_expr("cast", 0L, options = cast_options(to_type = time32(unit = "s"))) - # diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) - # return(diff_x_y) - # or we could go via int64() - x <- build_expr("cast", x, options = cast_options(to_type = int64())) - x <- build_expr("cast", x, options = cast_options(to_type = duration("s"))) - return(x) - } - - if (call_binding("is.double")) { - abort("`as.difftime()` with double/float inputs not supported in Arrow ") - } - - build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) - }) register_binding("is", function(object, class2) { if (is.string(class2)) { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 8c5e526a94d..e1e9814a061 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1187,3 +1187,54 @@ test_that("difftime works correctly", { test_df_with_tz ) }) + +test_that("as.difftime() works properly", { + test_df <- tibble( + hms_string = c("0:7:45", "12:34:56"), + hm_string = c("7:45", "12:34"), + int = c(30L, 75L), + dbl = c(31, 76) + ) + + compare_dplyr_binding( + .input %>% + mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>% + collect(), + test_df + ) + + # TODO add test with `format` mismatch returning NA once + # https://issues.apache.org/jira/browse/ARROW-15659 is solved + # for example: as.difftime("07:", format = "%H:%M") should return NA + compare_dplyr_binding( + .input %>% + mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(int_difftime = as.difftime(int, units = "secs")) %>% + collect(), + test_df + ) + + # coercing doubles to difftime/duration is not supported in Arrow + compare_dplyr_binding( + .input %>% + mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% + collect(), + test_df, + warning = TRUE + ) + + # "mins" or other values for units cannot be handled in Arrow + compare_dplyr_binding( + .input %>% + mutate(int_difftime = as.difftime(int, units = "mins")) %>% + collect(), + test_df, + warning = TRUE + ) +}) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 62c2311bcc1..9570ece9c44 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -874,55 +874,3 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a test_df ) }) - - -test_that("as.difftime() works properly", { - test_df <- tibble( - hms_string = c("0:7:45", "12:34:56"), - hm_string = c("7:45", "12:34"), - int = c(30L, 75L), - dbl = c(31, 76) - ) - - compare_dplyr_binding( - .input %>% - mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>% - collect(), - test_df - ) - - # TODO add test with `format` mismatch returning NA once - # https://issues.apache.org/jira/browse/ARROW-15659 is solved - # for example: as.difftime("07:", format = "%H:%M") should return NA - compare_dplyr_binding( - .input %>% - mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% - collect(), - test_df - ) - - compare_dplyr_binding( - .input %>% - mutate(int_difftime = as.difftime(int, units = "secs")) %>% - collect(), - test_df - ) - - # coercing doubles to difftime/duration is not supported in Arrow - compare_dplyr_binding( - .input %>% - mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% - collect(), - test_df, - warning = TRUE - ) - - # "mins" or other values for units cannot be handled in Arrow - compare_dplyr_binding( - .input %>% - mutate(int_difftime = as.difftime(int, units = "mins")) %>% - collect(), - test_df, - warning = TRUE - ) -}) From 31756b0cef107ac69c6d71cbd801af714df01be2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 14 Mar 2022 10:52:20 +0000 Subject: [PATCH 29/51] separate duration bindings and update `as.difftime()` + tests --- r/R/dplyr-funcs-datetime.R | 24 ++++++++++---------- r/R/dplyr-funcs.R | 1 + r/tests/testthat/test-dplyr-funcs-datetime.R | 19 ++++++++++++---- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index ee0bf1daf67..123f89e2466 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -188,6 +188,9 @@ register_bindings_datetime <- function() { register_binding("date", function(x) { build_expr("cast", x, options = list(to_type = date32())) }) +} + +register_bindings_duration <- function() { register_binding("difftime", function(time1, time2, tz, @@ -300,24 +303,21 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { return(diff_x_y) } - # numeric -> duration not supported in Arrow yet so we use time32() as + # numeric -> duration not supported in Arrow yet so we use int64() as an # intermediate step - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15862 done - if (call_binding("is.integer", x)) { - # x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) - # y <- build_expr("cast", 0L, options = cast_options(to_type = time32(unit = "s"))) - # diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) - # return(diff_x_y) - # or we could go via int64() + # TODO revisit if https://issues.apache.org/jira/browse/ARROW-15862 results + # in numeric -> duration support + + if (call_binding("is.numeric", x)) { + # coerce x to be int64(). it should work for integer-like doubles and fail + # for pure doubles + # if we abort for all doubles, we risk erroring in cases in which + # coercion to int64() would work x <- build_expr("cast", x, options = cast_options(to_type = int64())) x <- build_expr("cast", x, options = cast_options(to_type = duration("s"))) return(x) } - if (call_binding("is.double")) { - abort("`as.difftime()` with double/float inputs not supported in Arrow ") - } - build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) }) } diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 4d7cb3bc63d..01e522e537b 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -106,6 +106,7 @@ create_binding_cache <- function() { register_bindings_aggregate() register_bindings_conditional() register_bindings_datetime() + register_bindings_duration() register_bindings_math() register_bindings_string() register_bindings_type() diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e1e9814a061..01fd0f7114b 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1193,7 +1193,8 @@ test_that("as.difftime() works properly", { hms_string = c("0:7:45", "12:34:56"), hm_string = c("7:45", "12:34"), int = c(30L, 75L), - dbl = c(31, 76) + integerish_dbl = c(31, 76), + dbl = c(31.2, 76.4) ) compare_dplyr_binding( @@ -1220,13 +1221,11 @@ test_that("as.difftime() works properly", { test_df ) - # coercing doubles to difftime/duration is not supported in Arrow compare_dplyr_binding( .input %>% - mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% + mutate(integerish_dbl_difftime = as.difftime(integerish_dbl, units = "secs")) %>% collect(), - test_df, - warning = TRUE + test_df ) # "mins" or other values for units cannot be handled in Arrow @@ -1237,4 +1236,14 @@ test_that("as.difftime() works properly", { test_df, warning = TRUE ) + + # only integer (or integer-like) -> duration supported in Arrow. + # double -> duration not supported + expect_error( + test_df %>% + arrow_table() %>% + mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% + collect(), + regexp = "Float value 31.2 was truncated converting to int64" + ) }) From 0c56cd69342616366ffbd0f7b465784a6fb9de2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 14 Mar 2022 10:58:28 +0000 Subject: [PATCH 30/51] missing closing bracket --- r/R/dplyr-funcs-datetime.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 123f89e2466..ae6c7de7589 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -321,3 +321,23 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) }) } + +binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { + if (usetz) { + format <- paste(format, "%Z") + } + + if (call_binding("is.POSIXct", x)) { + # the casting part might not be required once + # https://issues.apache.org/jira/browse/ARROW-14442 is solved + # TODO revisit the steps below once the PR for that issue is merged + if (tz == "" && x$type()$timezone() != "") { + tz <- x$type()$timezone() + } else if (tz == "") { + tz <- Sys.timezone() + } + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(x$type()$unit(), tz))) + } + + build_expr("strftime", x, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) +} From 779fe91753c91d1aa5adcebe8615afe959d0bb2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 14 Mar 2022 15:39:13 +0000 Subject: [PATCH 31/51] updated NEWS --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 6b2e0532168..0a7d30d2a37 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,6 +24,7 @@ * component extraction functions: `tz()` (timezone), `semester()` (semester), `dst()` (daylight savings time indicator), `date()` (extract date), `epiyear()` (epiyear), improvements to `month()`, which now works with integer inputs. * `make_date()` & `make_datetime()` + `ISOdatetime()` & `ISOdate()` to create date-times from numeric representations. * date-time functionality: + * `difftime` and `as.difftime()` * `as.Date()` to convert to date # arrow 7.0.0 From ed5c7ed45b878c8273b7152ff6fd851beee85819 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Mar 2022 11:54:44 +0000 Subject: [PATCH 32/51] removed the testing of the actual error message since it's being surfaced from C++ --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 01fd0f7114b..0888fd17a17 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1243,7 +1243,6 @@ test_that("as.difftime() works properly", { test_df %>% arrow_table() %>% mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% - collect(), - regexp = "Float value 31.2 was truncated converting to int64" + collect() ) }) From 60edf1f31ca4eddf740b12fbe80e7b70112e5eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Mar 2022 10:39:31 +0000 Subject: [PATCH 33/51] simplified `difftime` binding + updated unit tests --- r/R/dplyr-funcs-datetime.R | 16 ++++++---------- r/tests/testthat/test-dplyr-funcs-datetime.R | 14 +++++++++++--- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index ae6c7de7589..00bea4d0afd 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -201,17 +201,13 @@ register_bindings_duration <- function() { abort("`difftime()` with units other than seconds not supported in Arrow") } - # for time32() we do not need to worry about timezone - if (call_binding("is.instant", time1) & call_binding("is.instant", time2)) { - if (missing(tz)) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(unit = "s"))) - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(unit = "s"))) - } else { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = tz, unit = "s"))) - } + if (!missing(tz)) { + warn("`tz` is an optional argument to `difftime()` in R and will not be used in Arrow") } + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp())) + build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) }) register_binding("make_datetime", function(year = 1970L, @@ -299,7 +295,7 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, tz = tz, unit = 0L)) y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", tz = tz, unit = 0L)) - diff_x_y <- call_binding("difftime", x, y, units = "secs", tz = tz) + diff_x_y <- call_binding("difftime", x, y, units = "secs") return(diff_x_y) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 0888fd17a17..85a48046d72 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1168,7 +1168,7 @@ test_that("difftime works correctly", { compare_dplyr_binding( .input %>% - mutate(secs2 = difftime(time2, time1, units = "secs", tz = "Pacific/Marquesas")) %>% + mutate(secs2 = difftime(time2, time1, units = "secs")) %>% collect(), test_df_with_tz ) @@ -1179,13 +1179,21 @@ test_that("difftime works correctly", { secs2 = difftime( as.POSIXct("2022-03-07", tz = "Europe/Bucharest"), time1, - units = "secs", - tz = "Pacific/Marquesas" + units = "secs" ) ) %>% collect(), test_df_with_tz ) + + # `tz` is effectively ignored both in R (used only if inputs are POSIXlt) and Arrow + compare_dplyr_binding( + .input %>% + mutate(secs2 = difftime(time2, time1, units = "secs", tz = "Pacific/Marquesas")) %>% + collect(), + test_df_with_tz, + warning = "`tz` is an optional argument" + ) }) test_that("as.difftime() works properly", { From 3eddb4b49666ebf494acec56e7e3ccf182363905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Mar 2022 11:33:59 +0000 Subject: [PATCH 34/51] add some times to the test data frame --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 85a48046d72..af959100ec2 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1080,7 +1080,7 @@ test_that("ISO_datetime & ISOdate", { test_that("difftime works correctly", { test_df <- tibble( time1 = as.POSIXct( - c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31") + c("2021-02-20", "2021-07-31 0:0:0", "2021-10-30", "2021-01-31 0:0:0") ), time2 = as.POSIXct( c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36") From 6a1ca12d29cbbbc8cd96e2f439edfe02d061ae32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Mar 2022 11:35:16 +0000 Subject: [PATCH 35/51] remove the `tz` arg --- r/R/dplyr-funcs-datetime.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 00bea4d0afd..55e2a439511 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -281,8 +281,7 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { register_binding("as.difftime", function(x, format = "%X", - units = "auto", - tz = "UTC") { + units = "auto") { # windows doesn't seem to like "%X" if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { format <- "%H:%M:%S" @@ -293,8 +292,8 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { } if (call_binding("is.character", x)) { - x <- build_expr("strptime", x, options = list(format = format, tz = tz, unit = 0L)) - y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", tz = tz, unit = 0L)) + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", unit = 0L)) diff_x_y <- call_binding("difftime", x, y, units = "secs") return(diff_x_y) } From 2755e1b95c5d232ea2ef3bc0587ef56a2ad565c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Mar 2022 15:16:59 +0000 Subject: [PATCH 36/51] cleaned up some of the tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 35 +------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index af959100ec2..e3c0dc07dc9 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1120,39 +1120,6 @@ test_that("difftime works correctly", { ignore_attr = TRUE ) - compare_dplyr_binding( - .input %>% - mutate( - hours = difftime(time1, time2, units = "hours") - ) %>% - collect(), - test_df, - warning = TRUE, - ignore_attr = TRUE - ) - - compare_dplyr_binding( - .input %>% - mutate( - days = difftime(time1, time2, units = "days") - ) %>% - collect(), - test_df, - warning = TRUE, - ignore_attr = TRUE - ) - - compare_dplyr_binding( - .input %>% - mutate( - weeks = difftime(time1, time2, units = "weeks") - ) %>% - collect(), - test_df, - warning = TRUE, - ignore_attr = TRUE - ) - skip_on_os("windows") test_df_with_tz <- tibble( time1 = as.POSIXct( @@ -1196,7 +1163,7 @@ test_that("difftime works correctly", { ) }) -test_that("as.difftime() works properly", { +test_that("as.difftime()", { test_df <- tibble( hms_string = c("0:7:45", "12:34:56"), hm_string = c("7:45", "12:34"), From b2f7268d492f9423d39e109176029a7354d06ea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Mar 2022 15:18:08 +0000 Subject: [PATCH 37/51] `unit` defaults to `"secs"` + improved messages --- r/R/dplyr-funcs-datetime.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 55e2a439511..307b9f6a7ef 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -194,15 +194,13 @@ register_bindings_duration <- function() { register_binding("difftime", function(time1, time2, tz, - units = c("auto", "secs", "mins", - "hours", "days", "weeks")) { - units <- match.arg(units) + units = "secs") { if (units != "secs") { - abort("`difftime()` with units other than seconds not supported in Arrow") + abort("`difftime()` with units other than `secs` not supported in Arrow") } if (!missing(tz)) { - warn("`tz` is an optional argument to `difftime()` in R and will not be used in Arrow") + warn("`tz` argument is not supported in Arrow, so it will be ignored") } time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) @@ -281,14 +279,14 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { register_binding("as.difftime", function(x, format = "%X", - units = "auto") { + units = "secs") { # windows doesn't seem to like "%X" if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { format <- "%H:%M:%S" } if (units != "secs") { - abort("`as.difftime()` with units other than seconds not supported in Arrow") + abort("`as.difftime()` with units other than 'secs' not supported in Arrow") } if (call_binding("is.character", x)) { From f8837f817d9eb07a89067e6ac2249d432b29b0bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 09:49:43 +0000 Subject: [PATCH 38/51] cast only when needed --- r/R/dplyr-funcs-datetime.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 307b9f6a7ef..6385b1a148d 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -203,8 +203,17 @@ register_bindings_duration <- function() { warn("`tz` argument is not supported in Arrow, so it will be ignored") } - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp())) + # cast to timestamp if time1 and time2 are not dates or timpestamp expressions + # (the subtraction of which would output a `duration`) + if (!(inherits(time1, "Expression") && + time1$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) + } + + if (!(inherits(time2, "Expression") && + time2$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) + } build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) }) From 245ec890c1b3822d4fabf3cb1259cadde1cd10dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 10:07:32 +0000 Subject: [PATCH 39/51] updated unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 21 ++++++-------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e3c0dc07dc9..fd16e6b61de 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1098,16 +1098,6 @@ test_that("difftime works correctly", { ignore_attr = TRUE ) - compare_dplyr_binding( - .input %>% - mutate( - secs2 = difftime(as.POSIXct("2022-03-07"), time1, units = "secs") - ) %>% - collect(), - test_df, - ignore_attr = TRUE - ) - # units other than "secs" not supported in arrow compare_dplyr_binding( .input %>% @@ -1124,11 +1114,11 @@ test_that("difftime works correctly", { test_df_with_tz <- tibble( time1 = as.POSIXct( c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"), - tz = "Europe/London" + tz = "Pacific/Marquesas" ), time2 = as.POSIXct( c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36"), - tz = "America/Chicago" + tz = "Asia/Kathmandu" ), secs = c(121L, 234L, 345L, 456L) ) @@ -1159,7 +1149,7 @@ test_that("difftime works correctly", { mutate(secs2 = difftime(time2, time1, units = "secs", tz = "Pacific/Marquesas")) %>% collect(), test_df_with_tz, - warning = "`tz` is an optional argument" + warning = "`tz` argument is not supported in Arrow, so it will be ignored" ) }) @@ -1212,8 +1202,9 @@ test_that("as.difftime()", { warning = TRUE ) - # only integer (or integer-like) -> duration supported in Arrow. - # double -> duration not supported + # only integer (or integer-like) -> duration conversion supported in Arrow. + # double -> duration not supported. we're not testing the content of the + # error message as it is being generated in the C++ code and it might change expect_error( test_df %>% arrow_table() %>% From 448268dc58c60bff0b6a7b39df13dea23731356e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 17:35:28 +0000 Subject: [PATCH 40/51] rescue the format tests --- r/tests/testthat/test-dplyr-funcs-type.R | 102 +++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 9570ece9c44..6c9d9ac07a4 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -874,3 +874,105 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a test_df ) }) + +test_that("format date/time", { + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + + times <- tibble( + datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA), + date = c(as.Date("2021-01-01"), NA) + ) + formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" + formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u" + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(date, format = formats_date)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats, tz = "Europe/Bucharest")) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats, tz = "EST", usetz = TRUE)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(1), + y = format(13.7, nsmall = 3)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(start_date = format(as.POSIXct("2022-01-01 01:01:00"))) %>% + collect(), + times + ) + + withr::with_timezone( + "Pacific/Marquesas", + { + compare_dplyr_binding( + .input %>% + mutate( + x = format(datetime, format = formats, tz = "EST"), + x_date = format(date, format = formats_date, tz = "EST") + ) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate( + x = format(datetime, format = formats), + x_date = format(date, format = formats_date) + ) %>% + collect(), + times + ) + } + ) +}) + +test_that("format() for unsupported types returns the input as string", { + expect_equal( + example_data %>% + record_batch() %>% + mutate(x = format(int)) %>% + collect(), + example_data %>% + record_batch() %>% + mutate(x = as.character(int)) %>% + collect() + ) + expect_equal( + example_data %>% + arrow_table() %>% + mutate(y = format(dbl)) %>% + collect(), + example_data %>% + arrow_table() %>% + mutate(y = as.character(dbl)) %>% + collect() + ) +}) From 7e8a8372794757f40ae15c44a48930fb9ce7d352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 18:50:04 +0000 Subject: [PATCH 41/51] clean-up after the rebase mess --- r/R/dplyr-funcs-datetime.R | 63 +++++++++++++------------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 6385b1a148d..7bbdb6290f9 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -191,32 +191,6 @@ register_bindings_datetime <- function() { } register_bindings_duration <- function() { - register_binding("difftime", function(time1, - time2, - tz, - units = "secs") { - if (units != "secs") { - abort("`difftime()` with units other than `secs` not supported in Arrow") - } - - if (!missing(tz)) { - warn("`tz` argument is not supported in Arrow, so it will be ignored") - } - - # cast to timestamp if time1 and time2 are not dates or timpestamp expressions - # (the subtraction of which would output a `duration`) - if (!(inherits(time1, "Expression") && - time1$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) - } - - if (!(inherits(time2, "Expression") && - time2$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) - } - - build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) - }) register_binding("make_datetime", function(year = 1970L, month = 1L, day = 1L, @@ -265,27 +239,32 @@ register_bindings_duration <- function() { tz = "UTC") { call_binding("make_datetime", year, month, day, hour, min, sec, tz) }) -} + register_binding("difftime", function(time1, + time2, + tz, + units = "secs") { + if (units != "secs") { + abort("`difftime()` with units other than `secs` not supported in Arrow") + } -binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { - if (usetz) { - format <- paste(format, "%Z") - } + if (!missing(tz)) { + warn("`tz` argument is not supported in Arrow, so it will be ignored") + } - if (call_binding("is.POSIXct", x)) { - # the casting part might not be required once - # https://issues.apache.org/jira/browse/ARROW-14442 is solved - # TODO revisit the steps below once the PR for that issue is merged - if (tz == "" && x$type()$timezone() != "") { - tz <- x$type()$timezone() - } else if (tz == "") { - tz <- Sys.timezone() + # cast to timestamp if time1 and time2 are not dates or timpestamp expressions + # (the subtraction of which would output a `duration`) + if (!(inherits(time1, "Expression") && + time1$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) } - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(x$type()$unit(), tz))) - } - build_expr("strftime", x, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + if (!(inherits(time2, "Expression") && + time2$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) + } + build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) + }) register_binding("as.difftime", function(x, format = "%X", units = "secs") { From d15b36cf6ede62db616ca514e9fcac77a4cbafb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Mar 2022 08:57:05 +0000 Subject: [PATCH 42/51] improved testing comment --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index fd16e6b61de..6328a4c8276 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1204,7 +1204,8 @@ test_that("as.difftime()", { # only integer (or integer-like) -> duration conversion supported in Arrow. # double -> duration not supported. we're not testing the content of the - # error message as it is being generated in the C++ code and it might change + # error message as it is being generated in the C++ code and it might change, + # but we want to make sure that this error is raised in our binding implementation expect_error( test_df %>% arrow_table() %>% From ae9566d0b3d42447eb25a9e5a0627f78b5c5b8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Mar 2022 13:47:18 +0000 Subject: [PATCH 43/51] trying the `subtract_checked()` route --- r/R/dplyr-funcs-datetime.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 7bbdb6290f9..5afb1cf00ed 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -253,17 +253,23 @@ register_bindings_duration <- function() { # cast to timestamp if time1 and time2 are not dates or timpestamp expressions # (the subtraction of which would output a `duration`) - if (!(inherits(time1, "Expression") && - time1$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { + # if (!(inherits(time1, "Expression") && + # time1$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { + if (!call_binding("is.instant", time1)) { time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) } - if (!(inherits(time2, "Expression") && - time2$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { + # if (!(inherits(time2, "Expression") && + # time2$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { + if (!call_binding("is.instant", time2)) { time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) } - build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) + # we need to do this instead of `time1 - time2` to prevent complaints when + # we try to subtract an R object from an Expression + subtraction_output <- build_expr("subtract_checked", time1, time2) + # build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) + build_expr("cast", subtraction_output, options = cast_options(to_type = duration("s"))) }) register_binding("as.difftime", function(x, format = "%X", From 638bf21a48669e9998125be379e1fc885fe810fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Mar 2022 14:35:46 +0000 Subject: [PATCH 44/51] clean up and replace `subtract_checked` with `-` --- r/R/dplyr-funcs-datetime.R | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5afb1cf00ed..7375fd73276 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -251,25 +251,20 @@ register_bindings_duration <- function() { warn("`tz` argument is not supported in Arrow, so it will be ignored") } - # cast to timestamp if time1 and time2 are not dates or timpestamp expressions + # cast to timestamp if time1 and time2 are not dates or timestamp expressions # (the subtraction of which would output a `duration`) - # if (!(inherits(time1, "Expression") && - # time1$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { if (!call_binding("is.instant", time1)) { time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) } - # if (!(inherits(time2, "Expression") && - # time2$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")])) { if (!call_binding("is.instant", time2)) { time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) } - # we need to do this instead of `time1 - time2` to prevent complaints when - # we try to subtract an R object from an Expression - subtraction_output <- build_expr("subtract_checked", time1, time2) - # build_expr("cast", time1 - time2, options = cast_options(to_type = duration("s"))) - build_expr("cast", subtraction_output, options = cast_options(to_type = duration("s"))) + # we need to go build the subtract expression instead of `time1 - time2` to + # prevent complaints when we try to subtract an R object from an Expression + subtract_output <- build_expr("-", time1, time2) + build_expr("cast", subtract_output, options = cast_options(to_type = duration("s"))) }) register_binding("as.difftime", function(x, format = "%X", From e4cd5bb292ffd1768787255e52edf3640858d575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 11:11:52 +0000 Subject: [PATCH 45/51] simplify the implementation when `x` is character + ignore attributes since we now return an `hms::difftime` object --- r/R/dplyr-funcs-datetime.R | 5 ++--- r/tests/testthat/test-dplyr-funcs-datetime.R | 8 ++++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 7375fd73276..379105d207d 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -280,9 +280,8 @@ register_bindings_duration <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", unit = 0L)) - diff_x_y <- call_binding("difftime", x, y, units = "secs") - return(diff_x_y) + x <- x$cast(time32(unit = "s")) + return(x) } # numeric -> duration not supported in Arrow yet so we use int64() as an diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 6328a4c8276..a25be742e2a 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1166,7 +1166,9 @@ test_that("as.difftime()", { .input %>% mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>% collect(), - test_df + test_df, + # arrow returns an hms::difftime object, as opposed to a base::difftime one + ignore_attr = TRUE ) # TODO add test with `format` mismatch returning NA once @@ -1176,7 +1178,9 @@ test_that("as.difftime()", { .input %>% mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% collect(), - test_df + test_df, + # arrow returns an hms::difftime object, as opposed to a base::difftime one + ignore_attr = TRUE ) compare_dplyr_binding( From b019b925ed3722b3809e85ff7357905eb5501dd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 11:45:03 +0000 Subject: [PATCH 46/51] use `build_expr()` --- r/R/dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 379105d207d..6e744407cb2 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -280,7 +280,7 @@ register_bindings_duration <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - x <- x$cast(time32(unit = "s")) + x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) return(x) } From 19411567855f50a2d959ef4d170f7b5141841541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 13:00:05 +0000 Subject: [PATCH 47/51] revert to subtracting `y = "0:0:0"` when `x` is an hms string --- r/R/dplyr-funcs-datetime.R | 5 +++-- r/tests/testthat/test-dplyr-funcs-datetime.R | 8 ++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 6e744407cb2..7375fd73276 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -280,8 +280,9 @@ register_bindings_duration <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - x <- build_expr("cast", x, options = cast_options(to_type = time32(unit = "s"))) - return(x) + y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", unit = 0L)) + diff_x_y <- call_binding("difftime", x, y, units = "secs") + return(diff_x_y) } # numeric -> duration not supported in Arrow yet so we use int64() as an diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a25be742e2a..6328a4c8276 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1166,9 +1166,7 @@ test_that("as.difftime()", { .input %>% mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>% collect(), - test_df, - # arrow returns an hms::difftime object, as opposed to a base::difftime one - ignore_attr = TRUE + test_df ) # TODO add test with `format` mismatch returning NA once @@ -1178,9 +1176,7 @@ test_that("as.difftime()", { .input %>% mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% collect(), - test_df, - # arrow returns an hms::difftime object, as opposed to a base::difftime one - ignore_attr = TRUE + test_df ) compare_dplyr_binding( From 9b6c7fb8033fccf599b2c527b0b372b7075aeb50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 14:10:50 +0000 Subject: [PATCH 48/51] implementation with a chain of casting --- r/R/dplyr-funcs-datetime.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 7375fd73276..137fbeea24c 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -280,9 +280,8 @@ register_bindings_duration <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - y <- build_expr("strptime", "0:0:0", options = list(format = "%H:%M:%S", unit = 0L)) - diff_x_y <- call_binding("difftime", x, y, units = "secs") - return(diff_x_y) + x <- x$cast(time64("us"))$cast(int64())$cast(duration("us"))$cast(duration("s")) + return(x) } # numeric -> duration not supported in Arrow yet so we use int64() as an From 958e4f936d8c110c861ef972eb4bf88bc93985f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 18:17:08 +0000 Subject: [PATCH 49/51] added Jon's comment for clarity --- r/R/dplyr-funcs-datetime.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 137fbeea24c..3663584d494 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -280,6 +280,8 @@ register_bindings_duration <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + # Complex casting only due to cast type restrictions: + # time64 -> int64 -> duration(us) -> duration(s) x <- x$cast(time64("us"))$cast(int64())$cast(duration("us"))$cast(duration("s")) return(x) } From 5492f71748eaeeda94b7a5ec41a0cc857bc87fc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 18:21:38 +0000 Subject: [PATCH 50/51] lint --- r/R/dplyr-funcs-datetime.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 3663584d494..c2fc1b83fb5 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -280,8 +280,7 @@ register_bindings_duration <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - # Complex casting only due to cast type restrictions: - # time64 -> int64 -> duration(us) -> duration(s) + # Complex casting only due to cast type restrictions: time64 -> int64 -> duration(us) -> duration(s) x <- x$cast(time64("us"))$cast(int64())$cast(duration("us"))$cast(duration("s")) return(x) } From a98f2f9ce1f2397a9a3d0946390855f0f3525e59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 24 Mar 2022 09:13:28 +0000 Subject: [PATCH 51/51] all branches go through the final cast to `duration("s")` step --- r/R/dplyr-funcs-datetime.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index c2fc1b83fb5..62da029c08a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -280,9 +280,9 @@ register_bindings_duration <- function() { if (call_binding("is.character", x)) { x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - # Complex casting only due to cast type restrictions: time64 -> int64 -> duration(us) -> duration(s) - x <- x$cast(time64("us"))$cast(int64())$cast(duration("us"))$cast(duration("s")) - return(x) + # complex casting only due to cast type restrictions: time64 -> int64 -> duration(us) + # and then we cast to duration ("s") at the end + x <- x$cast(time64("us"))$cast(int64())$cast(duration("us")) } # numeric -> duration not supported in Arrow yet so we use int64() as an @@ -296,8 +296,6 @@ register_bindings_duration <- function() { # if we abort for all doubles, we risk erroring in cases in which # coercion to int64() would work x <- build_expr("cast", x, options = cast_options(to_type = int64())) - x <- build_expr("cast", x, options = cast_options(to_type = duration("s"))) - return(x) } build_expr("cast", x, options = cast_options(to_type = duration(unit = "s")))