From da4158a37c75f693fdcc3424b1f8becc5e108da0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 17:02:04 +0000 Subject: [PATCH 01/39] added binding for `date()` + unit test + NEWS bullet point --- r/NEWS.md | 1 + r/R/dplyr-funcs-datetime.R | 3 +++ r/tests/testthat/test-dplyr-funcs-datetime.R | 12 ++++++++++++ 3 files changed, 16 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index d9552e89bdf..5c10ee94c33 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,6 +24,7 @@ * `tz()` to extract/get timezone * `semester()` to extract/get semester * `dst()` to get daylight savings time indicator. +* Additional `lubridate` features: `date()` # arrow 7.0.0 diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index a72df4c777a..bb4370608e8 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -164,4 +164,7 @@ register_bindings_datetime <- function() { return(semester) } }) + register_binding("date", function(x) { + x$cast(date32()) + }) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 2cc76d36f7a..94276c95469 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -797,6 +797,18 @@ test_that("dst extracts daylight savings time correctly", { compare_dplyr_binding( .input %>% mutate(dst = dst(dates)) %>% + ) +}) + +test_that("date works in arrow", { + # this date is specific since lubridate::date() is different from base::as.Date() + # since as.Date returns the UTC date and date() doesn't + test_df <- tibble( + a = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York")) + + compare_dplyr_binding( + .input %>% + mutate(a_date = date(a)) %>% collect(), test_df ) From 1bce2a845c2bf50ef103cadc98e56b06dcb7a6bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 17:30:01 +0000 Subject: [PATCH 02/39] namespace the call to `date()` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 94276c95469..8a206de09d2 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -808,7 +808,7 @@ test_that("date works in arrow", { compare_dplyr_binding( .input %>% - mutate(a_date = date(a)) %>% + mutate(a_date = lubridate::date(a)) %>% collect(), test_df ) From 3ea6bebd0deb92004b0d73740290e3afc6f9dd4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 17:44:43 +0000 Subject: [PATCH 03/39] remove namespacing for `date()` inside the `mutate()` context --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 8a206de09d2..94276c95469 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -808,7 +808,7 @@ test_that("date works in arrow", { compare_dplyr_binding( .input %>% - mutate(a_date = lubridate::date(a)) %>% + mutate(a_date = date(a)) %>% collect(), test_df ) From 96c6b3f08c785c0565cdb05f8df51763fad34696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 18:06:27 +0000 Subject: [PATCH 04/39] trying to mask `base::date()` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 94276c95469..888c3ee4ec2 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -803,6 +803,7 @@ test_that("dst extracts daylight savings time correctly", { test_that("date works in arrow", { # this date is specific since lubridate::date() is different from base::as.Date() # since as.Date returns the UTC date and date() doesn't + library(lubridate) test_df <- tibble( a = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York")) From 27cf385c1d7af02bd923d02c5b2c53be36642d81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 18:31:54 +0000 Subject: [PATCH 05/39] test 2 --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 888c3ee4ec2..a6f8af79b42 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -803,10 +803,10 @@ test_that("dst extracts daylight savings time correctly", { test_that("date works in arrow", { # this date is specific since lubridate::date() is different from base::as.Date() # since as.Date returns the UTC date and date() doesn't - library(lubridate) test_df <- tibble( a = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York")) + date <- base:date compare_dplyr_binding( .input %>% mutate(a_date = date(a)) %>% From 4ef422dfa23725fcfdb4e6d238bf9aa8b18a3938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 19:12:37 +0000 Subject: [PATCH 06/39] test 2 for the `date()` masking issue --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a6f8af79b42..4c872aff8dc 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -806,7 +806,7 @@ test_that("date works in arrow", { test_df <- tibble( a = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York")) - date <- base:date + date <- lubridate::date compare_dplyr_binding( .input %>% mutate(a_date = date(a)) %>% From 35ddfec38390ab47e9205bd794474437dcaf521f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 20:03:01 +0000 Subject: [PATCH 07/39] added comment on namespacing --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 4c872aff8dc..9fe11d14ddf 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -806,6 +806,8 @@ test_that("date works in arrow", { test_df <- tibble( a = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York")) + # we can't (for now) use namespacing, so we need to make sure lubridate::date() + # and not base::date() is being used date <- lubridate::date compare_dplyr_binding( .input %>% From 53b91ace9259b243b56cca2cb906fd7ec883cd90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 20:04:25 +0000 Subject: [PATCH 08/39] skip on windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 9fe11d14ddf..d911092ab0c 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -801,6 +801,8 @@ test_that("dst extracts daylight savings time correctly", { }) test_that("date works in arrow", { + # https://issues.apache.org/jira/browse/ARROW-13168 + skip_on_os("windows") # this date is specific since lubridate::date() is different from base::as.Date() # since as.Date returns the UTC date and date() doesn't test_df <- tibble( From 87d809c90c365e2d03bba8f35794bd573cf3094c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 17 Feb 2022 16:27:27 +0000 Subject: [PATCH 09/39] point to relevant Jira --- r/tests/testthat/test-dplyr-funcs-datetime.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index d911092ab0c..00921d92bd8 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -810,6 +810,7 @@ test_that("date works in arrow", { # we can't (for now) use namespacing, so we need to make sure lubridate::date() # and not base::date() is being used + # TODO remove once https://issues.apache.org/jira/browse/ARROW-14575 is done date <- lubridate::date compare_dplyr_binding( .input %>% From 3ea30486b1a4d4d9a17dbdcd5d3714634519d11f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Feb 2022 10:56:07 +0000 Subject: [PATCH 10/39] test some unsupported inputs --- r/tests/testthat/test-dplyr-funcs-datetime.R | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 00921d92bd8..7084c4f4447 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -818,4 +818,32 @@ test_that("date works in arrow", { collect(), test_df ) + + # a timestamp is cast correctly to date + expect_equal( + call_binding("date", Array$create(as.POSIXct("2022-02-21"))), + Array$create(as.POSIXct("2022-02-21"), type = date32()) + ) +}) + +test_that("date() errors with unsupported inputs", { + expect_error( + call_binding("date", Scalar$create("a string")), + "NotImplemented: Unsupported cast from string to date32 using function cast_date32" + ) + + expect_error( + call_binding("date", Scalar$create(32L)), + "NotImplemented: Unsupported cast from integer to date32 using function cast_date32" + ) + + expect_error( + call_binding("date", Scalar$create(32.2)), + "NotImplemented: Unsupported cast from double to date32 using function cast_date32" + ) + + expect_error( + call_binding("date", Scalar$create(TRUE)), + "NotImplemented: Unsupported cast from bool to date32 using function cast_date32" + ) }) From eea11c2f87b75a7940104faf60c9aeb95b0a9cc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Feb 2022 11:44:46 +0000 Subject: [PATCH 11/39] create an Array from vector + unit test --- r/R/dplyr-funcs-datetime.R | 3 +++ r/tests/testthat/test-dplyr-funcs-datetime.R | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index bb4370608e8..70d5973d1f0 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -165,6 +165,9 @@ register_bindings_datetime <- function() { } }) register_binding("date", function(x) { + if (inherits(x, "POSIXct")) { + x <- vec_to_Array(x, date32()) + } x$cast(date32()) }) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 7084c4f4447..beba618b351 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -824,6 +824,12 @@ test_that("date works in arrow", { call_binding("date", Array$create(as.POSIXct("2022-02-21"))), Array$create(as.POSIXct("2022-02-21"), type = date32()) ) + + # date() supports R objects + expect_equal( + call_binding("date", as.POSIXct("2022-02-21")), + Array$create(as.POSIXct("2022-02-21"), type = date32()) + ) }) test_that("date() errors with unsupported inputs", { From 0ffe5554bd72094eb970c580012f22fc784802f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Feb 2022 09:17:01 +0000 Subject: [PATCH 12/39] comment + skip integer test --- r/tests/testthat/test-dplyr-funcs-datetime.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index beba618b351..1ce36e9bca6 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -838,11 +838,6 @@ test_that("date() errors with unsupported inputs", { "NotImplemented: Unsupported cast from string to date32 using function cast_date32" ) - expect_error( - call_binding("date", Scalar$create(32L)), - "NotImplemented: Unsupported cast from integer to date32 using function cast_date32" - ) - expect_error( call_binding("date", Scalar$create(32.2)), "NotImplemented: Unsupported cast from double to date32 using function cast_date32" @@ -852,4 +847,13 @@ test_that("date() errors with unsupported inputs", { call_binding("date", Scalar$create(TRUE)), "NotImplemented: Unsupported cast from bool to date32 using function cast_date32" ) + + # if we are aiming for equivalent behaviour to lubridate this should fail, but + # it doesn't as it is supported in arrow, where integer casting to date returns + # the date x days away from epoch + skip("supported in arrow, but not in lubridate") + expect_error( + call_binding("date", Scalar$create(32L)), + "NotImplemented: Unsupported cast from integer to date32 using function cast_date32" + ) }) From 5951160a56762b7ba1da96c80c3ab94c38126424 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Feb 2022 14:53:24 +0000 Subject: [PATCH 13/39] oversight --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 1ce36e9bca6..b633074afec 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -797,6 +797,8 @@ test_that("dst extracts daylight savings time correctly", { compare_dplyr_binding( .input %>% mutate(dst = dst(dates)) %>% + collect(), + test_df ) }) From cefad69de79938e2dfeef8b6b6f70c2944527d55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 24 Feb 2022 12:39:32 +0000 Subject: [PATCH 14/39] use `build_expr()` when defining the `"date"` binding * this implies the expression created with `call_binding()` will no longer evaluate --- r/R/dplyr-funcs-datetime.R | 5 +--- r/R/dplyr-funcs-type.R | 7 ++++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 26 +++++++++++++++++++- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 70d5973d1f0..5a22c970965 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -165,9 +165,6 @@ register_bindings_datetime <- function() { } }) register_binding("date", function(x) { - if (inherits(x, "POSIXct")) { - x <- vec_to_Array(x, date32()) - } - x$cast(date32()) + build_expr("cast", x, options = list(to_type = date32())) }) } diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 2f1fa96b835..6cbbb69cf0d 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -76,6 +76,13 @@ register_bindings_type_cast <- function() { register_binding("as.numeric", function(x) { Expression$create("cast", x, options = cast_options(to_type = float64())) }) + register_binding("as.Date", function(x) { + # base::as.Date() first converts to UTC and then extracts the date, which is + # why we need to go through timestamp() first - see unit tests for the real + # life impact of the difference between lubridate::date() and base::as.Date + y <- build_expr("cast", x, options = list(to_type = timestamp())) + build_expr("cast", y, options = list(to_type = date32())) + }) register_binding("is", function(object, class2) { if (is.string(class2)) { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b633074afec..4edf01dd27e 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -821,6 +821,29 @@ test_that("date works in arrow", { test_df ) + compare_dplyr_binding( + .input %>% + mutate(a_date_base = as.Date(a)) %>% + collect(), + test_df + ) + + r_date_object <- lubridate::ymd_hms("2012-03-26 23:12:13") + compare_dplyr_binding( + .input %>% + mutate(b = date(r_date_object)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(b_base = as.Date(r_date_object)) %>% + collect(), + test_df + ) + + skip("All these will fail as we're not actually forcing evaluation") # a timestamp is cast correctly to date expect_equal( call_binding("date", Array$create(as.POSIXct("2022-02-21"))), @@ -835,6 +858,7 @@ test_that("date works in arrow", { }) test_that("date() errors with unsupported inputs", { + skip("All these will fail as we're not actually forcing evaluation") expect_error( call_binding("date", Scalar$create("a string")), "NotImplemented: Unsupported cast from string to date32 using function cast_date32" @@ -846,7 +870,7 @@ test_that("date() errors with unsupported inputs", { ) expect_error( - call_binding("date", Scalar$create(TRUE)), + arrow_eval(call_binding("date", Scalar$create(TRUE)), mask = arrow_mask(list())), "NotImplemented: Unsupported cast from bool to date32 using function cast_date32" ) From 1d965721aa69ec27ffed3c487dab68b2c1faa575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 24 Feb 2022 12:46:40 +0000 Subject: [PATCH 15/39] typo --- r/R/dplyr-funcs-type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 6cbbb69cf0d..2d2ab4f89df 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -79,7 +79,7 @@ register_bindings_type_cast <- function() { register_binding("as.Date", function(x) { # base::as.Date() first converts to UTC and then extracts the date, which is # why we need to go through timestamp() first - see unit tests for the real - # life impact of the difference between lubridate::date() and base::as.Date + # life impact of the difference between lubridate::date() and base::as.Date() y <- build_expr("cast", x, options = list(to_type = timestamp())) build_expr("cast", y, options = list(to_type = date32())) }) From 6210956f6d99471c22a9f47650e6bb411409aa43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 24 Feb 2022 13:30:11 +0000 Subject: [PATCH 16/39] using `cast_options()` to generate the `options` list --- r/R/dplyr-funcs-type.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 2d2ab4f89df..12605bcc906 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -80,8 +80,8 @@ register_bindings_type_cast <- function() { # base::as.Date() first converts to UTC and then extracts the date, which is # why we need to go through timestamp() first - see unit tests for the real # life impact of the difference between lubridate::date() and base::as.Date() - y <- build_expr("cast", x, options = list(to_type = timestamp())) - build_expr("cast", y, options = list(to_type = date32())) + y <- build_expr("cast", x, options = cast_options(to_type = timestamp())) + build_expr("cast", y, options = cast_options(to_type = date32())) }) register_binding("is", function(object, class2) { From 9d487c068874c6d46b5d7d3b1b104effd9a3caf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 25 Feb 2022 15:26:05 +0000 Subject: [PATCH 17/39] extended `as.Date()` to support multiple input types + tests --- r/R/dplyr-funcs-type.R | 46 +++++++++++- r/tests/testthat/test-dplyr-funcs-type.R | 89 ++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 12605bcc906..61dae59f8c1 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -76,12 +76,52 @@ register_bindings_type_cast <- function() { register_binding("as.numeric", function(x) { Expression$create("cast", x, options = cast_options(to_type = float64())) }) - register_binding("as.Date", function(x) { + register_binding("as.Date", function(x, + format = NULL, + origin = "1970-01-01", + tz = "UTC") { # base::as.Date() first converts to UTC and then extracts the date, which is # why we need to go through timestamp() first - see unit tests for the real # life impact of the difference between lubridate::date() and base::as.Date() - y <- build_expr("cast", x, options = cast_options(to_type = timestamp())) - build_expr("cast", y, options = cast_options(to_type = date32())) + # browser() + if (call_binding("is.Date", x)) { + # arrow_date <- build_expr("cast", x, options = cast_options(to_type = date32())) + return(x) + } else if (call_binding("is.POSIXct", x)) { + if (tz == "UTC") { + arrow_timestamp <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + return(build_expr("cast", arrow_timestamp, options = cast_options(to_type = date32()))) + } else { + abort("`as.Date()` with a timezone different to 'UTC' is not supported in Arrow") + } + } else if (call_binding("is.character", x)) { + # this could be improved with tryFormats once strptime returns NA and we + # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done + if (!is.null(format)) { + arrow_timestamp <- call_binding("strptime", x, format, unit = "s") + return(build_expr("cast", arrow_timestamp, options = cast_options(to_type = date32()))) + } else { + abort("`as.Date()` without `format` is not supported in Arrow") + } + + } else if (call_binding("is.numeric", x)) { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (!call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date so we have + # to convert to integers first - casting to int32() would error so we + # need to use round before casting + x <- call_binding("floor", x) + x <- build_expr("cast", x, options = (cast_options(to_type = int32()))) + } + if (origin == "1970-01-01") { + return(build_expr("cast", x, options = cast_options(to_type = date32()))) + } else { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + } }) register_binding("is", function(object, class2) { diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index c5fd83cb0f4..95dbdcf6398 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -19,6 +19,7 @@ skip_if_not_available("dataset") library(dplyr, warn.conflicts = FALSE) suppressPackageStartupMessages(library(bit64)) +suppressPackageStartupMessages(library(lubridate)) tbl <- example_data @@ -768,3 +769,91 @@ test_that("nested structs can be created from scalars and existing data frames", tibble(a = 1:2) ) }) + +test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + double_var = 34.56 + ) + + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as.Date(posixct_var), + date_dv = as.Date(date_var), + date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as.Date(integer_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # the way we go about it is a bit different, but the result is the same => + # testing without compare_dplyr_binding() + expect_equal( + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var)) %>% + collect(), + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + expect_equal( + test_df %>% + record_batch() %>% + mutate(date_double = as.Date(double_var)) %>% + collect(), + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + # actual and expected differ due to doubles are accounted for (floored in + # arrow and rouded to the next decimal in R) + expect_error( + compare_dplyr_binding( + .input %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect(), + test_df + ) + ) + + # currently we do not support an origin different to "1970-01-01" + expect_warning( + test_df %>% + arrow_table() %>% + mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% + collect(), + regexp = "`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow", + fixed = TRUE + + ) + + expect_warning( + test_df %>% + arrow_table() %>% + mutate(date_pv = as.Date(posixct_var, tz = "Pacific/Marquesas")) %>% + collect(), + regexp = "`as.Date()` with a timezone different to 'UTC' is not supported in Arrow", + fixed = TRUE + ) + + expect_warning( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as.Date(character_ymd_var)) %>% + collect(), + regexp = "`as.Date()` without `format` is not supported in Arrow", + fixed = TRUE + ) +}) From 42550411eabcd20fc98dbb22f8f612774a1ab2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 25 Feb 2022 16:09:23 +0000 Subject: [PATCH 18/39] cleaned-up the testing for `date()` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 65 +++++++++++--------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 4edf01dd27e..05766ab9202 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -809,6 +809,7 @@ test_that("date works in arrow", { # since as.Date returns the UTC date and date() doesn't test_df <- tibble( a = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York")) + r_date_object <- lubridate::ymd_hms("2012-03-26 23:12:13") # we can't (for now) use namespacing, so we need to make sure lubridate::date() # and not base::date() is being used @@ -828,7 +829,6 @@ test_that("date works in arrow", { test_df ) - r_date_object <- lubridate::ymd_hms("2012-03-26 23:12:13") compare_dplyr_binding( .input %>% mutate(b = date(r_date_object)) %>% @@ -842,44 +842,53 @@ test_that("date works in arrow", { collect(), test_df ) - - skip("All these will fail as we're not actually forcing evaluation") - # a timestamp is cast correctly to date - expect_equal( - call_binding("date", Array$create(as.POSIXct("2022-02-21"))), - Array$create(as.POSIXct("2022-02-21"), type = date32()) - ) - - # date() supports R objects - expect_equal( - call_binding("date", as.POSIXct("2022-02-21")), - Array$create(as.POSIXct("2022-02-21"), type = date32()) - ) }) test_that("date() errors with unsupported inputs", { - skip("All these will fail as we're not actually forcing evaluation") - expect_error( - call_binding("date", Scalar$create("a string")), - "NotImplemented: Unsupported cast from string to date32 using function cast_date32" + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + double_var = 34.56, + boolean_var = TRUE + ) + + # date from integer supported in arrow (similar to base::as.Date()), but in + # Arrow it assumes a fixed origin "1970-01-01" + expect_equal( + test_df %>% + arrow_table() %>% + select(integer_var) %>% + mutate(date_int = date(integer_var)) %>% + collect(), + tibble(integer_var = 32L, + date_int = as.Date("1970-02-02")) ) expect_error( - call_binding("date", Scalar$create(32.2)), - "NotImplemented: Unsupported cast from double to date32 using function cast_date32" + test_df %>% + arrow_table() %>% + mutate(date_char = date(character_ymd_var)) %>% + collect(), + regexp = "Unsupported cast from string to date32 using function cast_date32" ) expect_error( - arrow_eval(call_binding("date", Scalar$create(TRUE)), mask = arrow_mask(list())), - "NotImplemented: Unsupported cast from bool to date32 using function cast_date32" + test_df %>% + arrow_table() %>% + mutate(date_bool = date(boolean_var)) %>% + collect(), + regexp = "Unsupported cast from bool to date32 using function cast_date32" ) - # if we are aiming for equivalent behaviour to lubridate this should fail, but - # it doesn't as it is supported in arrow, where integer casting to date returns - # the date x days away from epoch - skip("supported in arrow, but not in lubridate") expect_error( - call_binding("date", Scalar$create(32L)), - "NotImplemented: Unsupported cast from integer to date32 using function cast_date32" + test_df %>% + arrow_table() %>% + mutate(date_double = date(double_var)) %>% + collect(), + regexp = "Unsupported cast from double to date32 using function cast_date32" ) + }) From a3b0f116b15d4ad184bbe63f3ca163a97d7acd60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 25 Feb 2022 16:18:08 +0000 Subject: [PATCH 19/39] clean-up --- r/R/dplyr-funcs-type.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 61dae59f8c1..94f7f6d24ab 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -80,12 +80,10 @@ register_bindings_type_cast <- function() { format = NULL, origin = "1970-01-01", tz = "UTC") { - # base::as.Date() first converts to UTC and then extracts the date, which is - # why we need to go through timestamp() first - see unit tests for the real - # life impact of the difference between lubridate::date() and base::as.Date() - # browser() + if (call_binding("is.Date", x)) { - # arrow_date <- build_expr("cast", x, options = cast_options(to_type = date32())) + # base::as.Date() first converts to the desired timestamp and then extracts + # the date, which is why we need to go through timestamp() first return(x) } else if (call_binding("is.POSIXct", x)) { if (tz == "UTC") { From ad49d2c18c9da0b474b94a5db0d9e05d8369b0fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Feb 2022 13:58:52 +0000 Subject: [PATCH 20/39] added `tryFormats` arg and handling for situations in which `x` is a character, but not an `Expression` --- r/R/dplyr-funcs-type.R | 20 ++++++++++++++------ r/tests/testthat/test-dplyr-funcs-type.R | 12 +++++++++++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 94f7f6d24ab..eeef65af911 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -78,11 +78,12 @@ register_bindings_type_cast <- function() { }) register_binding("as.Date", function(x, format = NULL, + tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { if (call_binding("is.Date", x)) { - # base::as.Date() first converts to the desired timestamp and then extracts + # base::as.Date() first converts to the desired timezone and then extracts # the date, which is why we need to go through timestamp() first return(x) } else if (call_binding("is.POSIXct", x)) { @@ -96,12 +97,19 @@ register_bindings_type_cast <- function() { # this could be improved with tryFormats once strptime returns NA and we # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (!is.null(format)) { - arrow_timestamp <- call_binding("strptime", x, format, unit = "s") - return(build_expr("cast", arrow_timestamp, options = cast_options(to_type = date32()))) - } else { - abort("`as.Date()` without `format` is not supported in Arrow") + if (is.null(format)) { + if (length(tryFormats) == 1) { + format <- tryFormats[1] + } else { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + } + } + # if x is not an expression (e.g. passed as filter), convert it to one + if (!inherits(x, "Expression")) { + x <- build_expr("cast", x, options = cast_options(to_type = type(x))) } + arrow_timestamp <- call_binding("strptime", x, format, unit = "s") + return(build_expr("cast", arrow_timestamp, options = cast_options(to_type = date32()))) } else if (call_binding("is.numeric", x)) { # the origin argument will be better supported once we implement temporal diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 95dbdcf6398..e809906720c 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -849,11 +849,21 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) expect_warning( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as.Date(character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% + collect(), + regexp = "`as.Date()` with multiple `tryFormats` is not supported in Arrow", + fixed = TRUE + ) + + expect_error( test_df %>% arrow_table() %>% mutate(date_char_ymd = as.Date(character_ymd_var)) %>% collect(), - regexp = "`as.Date()` without `format` is not supported in Arrow", + regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", fixed = TRUE ) }) From 2c26ef7765a2dd8aafa856398b1fa1b41b3d5156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Feb 2022 15:05:10 +0000 Subject: [PATCH 21/39] figure out which one Windows doesn't like --- r/R/dplyr-funcs-type.R | 2 +- r/tests/testthat/test-dplyr-funcs-type.R | 46 +++++++++++++++++++++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index eeef65af911..0dd6a8d4839 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -101,7 +101,7 @@ register_bindings_type_cast <- function() { if (length(tryFormats) == 1) { format <- tryFormats[1] } else { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") } } # if x is not an expression (e.g. passed as filter), convert it to one diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index e809906720c..408095e1c73 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -780,13 +780,51 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a double_var = 34.56 ) + # compare_dplyr_binding( + # .input %>% + # mutate( + # date_pv = as.Date(posixct_var), + # date_dv = as.Date(date_var), + # date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + # date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + # date_int = as.Date(integer_var, origin = "1970-01-01") + # ) %>% + # collect(), + # test_df + # ) + + compare_dplyr_binding( + .input %>% + mutate(date_pv = as.Date(posixct_var)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(date_dv = as.Date(date_var)) %>% + collect(), + test_df + ) + compare_dplyr_binding( + .input %>% + mutate( + date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S") + ) %>% + collect(), + test_df + ) + compare_dplyr_binding( + .input %>% + mutate( + date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S") + ) %>% + collect(), + test_df + ) compare_dplyr_binding( .input %>% mutate( - date_pv = as.Date(posixct_var), - date_dv = as.Date(date_var), - date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), date_int = as.Date(integer_var, origin = "1970-01-01") ) %>% collect(), From 9b6a9e2ec8fbe0dfbad3a55d72dc9d0b6bc6c7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Feb 2022 16:02:32 +0000 Subject: [PATCH 22/39] use interim step + skip test involving tzdb on Windows --- r/R/dplyr-funcs-type.R | 21 +++++---- r/tests/testthat/test-dplyr-funcs-type.R | 55 ++++++------------------ 2 files changed, 26 insertions(+), 50 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 0dd6a8d4839..ff0be6af168 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -86,13 +86,16 @@ register_bindings_type_cast <- function() { # base::as.Date() first converts to the desired timezone and then extracts # the date, which is why we need to go through timestamp() first return(x) + + # cast from POSIXct } else if (call_binding("is.POSIXct", x)) { if (tz == "UTC") { - arrow_timestamp <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - return(build_expr("cast", arrow_timestamp, options = cast_options(to_type = date32()))) + interim_x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) } else { abort("`as.Date()` with a timezone different to 'UTC' is not supported in Arrow") } + + # cast from character } else if (call_binding("is.character", x)) { # this could be improved with tryFormats once strptime returns NA and we # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 @@ -108,9 +111,9 @@ register_bindings_type_cast <- function() { if (!inherits(x, "Expression")) { x <- build_expr("cast", x, options = cast_options(to_type = type(x))) } - arrow_timestamp <- call_binding("strptime", x, format, unit = "s") - return(build_expr("cast", arrow_timestamp, options = cast_options(to_type = date32()))) + interim_x <- call_binding("strptime", x, format, unit = "s") + # cast from numeric } else if (call_binding("is.numeric", x)) { # the origin argument will be better supported once we implement temporal # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) @@ -119,15 +122,17 @@ register_bindings_type_cast <- function() { # Arrow does not support direct casting from double to date so we have # to convert to integers first - casting to int32() would error so we # need to use round before casting + # TODO revisit if arrow decides to support double -> date casting x <- call_binding("floor", x) - x <- build_expr("cast", x, options = (cast_options(to_type = int32()))) - } - if (origin == "1970-01-01") { - return(build_expr("cast", x, options = cast_options(to_type = date32()))) + interim_x <- build_expr("cast", x, options = (cast_options(to_type = int32()))) } else { + interim_x <- x + } + if (origin != "1970-01-01") { abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } } + build_expr("cast", interim_x, options = cast_options(to_type = date32())) }) register_binding("is", function(object, class2) { diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 408095e1c73..c8f6b3aef9e 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -780,51 +780,14 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a double_var = 34.56 ) - # compare_dplyr_binding( - # .input %>% - # mutate( - # date_pv = as.Date(posixct_var), - # date_dv = as.Date(date_var), - # date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - # date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - # date_int = as.Date(integer_var, origin = "1970-01-01") - # ) %>% - # collect(), - # test_df - # ) - - compare_dplyr_binding( - .input %>% - mutate(date_pv = as.Date(posixct_var)) %>% - collect(), - test_df - ) - - compare_dplyr_binding( - .input %>% - mutate(date_dv = as.Date(date_var)) %>% - collect(), - test_df - ) - compare_dplyr_binding( - .input %>% - mutate( - date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S") - ) %>% - collect(), - test_df - ) - compare_dplyr_binding( - .input %>% - mutate( - date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S") - ) %>% - collect(), - test_df - ) + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done compare_dplyr_binding( .input %>% mutate( + date_dv = as.Date(date_var), + date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), date_int = as.Date(integer_var, origin = "1970-01-01") ) %>% collect(), @@ -904,4 +867,12 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", fixed = TRUE ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate(date_pv = as.Date(posixct_var)) %>% + collect(), + test_df + ) }) From 4105b82034f9ae6c085c87ac6d274ac8676f236b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Feb 2022 17:46:34 +0000 Subject: [PATCH 23/39] update NEWS --- r/NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 5c10ee94c33..c40e8d15248 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,7 +24,8 @@ * `tz()` to extract/get timezone * `semester()` to extract/get semester * `dst()` to get daylight savings time indicator. -* Additional `lubridate` features: `date()` + * `date()` to extract date +* `as.Date()` to convert to date # arrow 7.0.0 From 6e1be4e295a18e2a80828b61b10564838b8f1566 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 1 Mar 2022 10:19:58 +0000 Subject: [PATCH 24/39] moved unit test to the success block --- r/tests/testthat/test-dplyr-funcs-datetime.R | 41 +++++++++++--------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 05766ab9202..e4bc803d6f9 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -808,40 +808,57 @@ test_that("date works in arrow", { # this date is specific since lubridate::date() is different from base::as.Date() # since as.Date returns the UTC date and date() doesn't test_df <- tibble( - a = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York")) + posixct_date = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York"), + integer_var = c(32L, NA)) + r_date_object <- lubridate::ymd_hms("2012-03-26 23:12:13") # we can't (for now) use namespacing, so we need to make sure lubridate::date() - # and not base::date() is being used + # and not base::date() is being used. This is due to the way testthat runs and + # normal use of arrow would not have to do this explicitly. # TODO remove once https://issues.apache.org/jira/browse/ARROW-14575 is done date <- lubridate::date + compare_dplyr_binding( .input %>% - mutate(a_date = date(a)) %>% + mutate(a_date = date(posixct_date)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% - mutate(a_date_base = as.Date(a)) %>% + mutate(a_date_base = as.Date(posixct_date)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% - mutate(b = date(r_date_object)) %>% + mutate(date_from_r_object = date(r_date_object)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% - mutate(b_base = as.Date(r_date_object)) %>% + mutate(as_date_from_r_object = as.Date(r_date_object)) %>% collect(), test_df ) + + # date from integer supported in arrow (similar to base::as.Date()), but in + # Arrow it assumes a fixed origin "1970-01-01". However this is not supported + # by lubridate. lubridate::date(integer_var) errors without an `origin` + expect_equal( + test_df %>% + arrow_table() %>% + select(integer_var) %>% + mutate(date_int = date(integer_var)) %>% + collect(), + tibble(integer_var = c(32L, NA), + date_int = as.Date(c("1970-02-02", NA))) + ) }) test_that("date() errors with unsupported inputs", { @@ -855,18 +872,6 @@ test_that("date() errors with unsupported inputs", { boolean_var = TRUE ) - # date from integer supported in arrow (similar to base::as.Date()), but in - # Arrow it assumes a fixed origin "1970-01-01" - expect_equal( - test_df %>% - arrow_table() %>% - select(integer_var) %>% - mutate(date_int = date(integer_var)) %>% - collect(), - tibble(integer_var = 32L, - date_int = as.Date("1970-02-02")) - ) - expect_error( test_df %>% arrow_table() %>% From 47f81199ed33a4334c0bea763f7cf140ac4492fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 1 Mar 2022 10:20:22 +0000 Subject: [PATCH 25/39] remove the `interim_x` object and the additional step it introduced --- r/R/dplyr-funcs-type.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index ff0be6af168..d20d7c1b8b8 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -90,7 +90,7 @@ register_bindings_type_cast <- function() { # cast from POSIXct } else if (call_binding("is.POSIXct", x)) { if (tz == "UTC") { - interim_x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) } else { abort("`as.Date()` with a timezone different to 'UTC' is not supported in Arrow") } @@ -111,7 +111,7 @@ register_bindings_type_cast <- function() { if (!inherits(x, "Expression")) { x <- build_expr("cast", x, options = cast_options(to_type = type(x))) } - interim_x <- call_binding("strptime", x, format, unit = "s") + x <- call_binding("strptime", x, format, unit = "s") # cast from numeric } else if (call_binding("is.numeric", x)) { @@ -124,15 +124,13 @@ register_bindings_type_cast <- function() { # need to use round before casting # TODO revisit if arrow decides to support double -> date casting x <- call_binding("floor", x) - interim_x <- build_expr("cast", x, options = (cast_options(to_type = int32()))) - } else { - interim_x <- x + x <- build_expr("cast", x, options = (cast_options(to_type = int32()))) } if (origin != "1970-01-01") { abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } } - build_expr("cast", interim_x, options = cast_options(to_type = date32())) + build_expr("cast", x, options = cast_options(to_type = date32())) }) register_binding("is", function(object, class2) { From 0b8cfe37e8a2f909acb56e98959f38cd249f30b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 1 Mar 2022 11:25:58 +0000 Subject: [PATCH 26/39] float -> integer with `safe = FALSE` & 2 additional unit tests (with tolerance) --- r/R/dplyr-funcs-type.R | 6 ++--- r/tests/testthat/test-dplyr-funcs-type.R | 30 +++++++++++++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index d20d7c1b8b8..11cdf484f15 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -120,11 +120,9 @@ register_bindings_type_cast <- function() { # TODO revisit once the above has been sorted if (!call_binding("is.integer", x)) { # Arrow does not support direct casting from double to date so we have - # to convert to integers first - casting to int32() would error so we - # need to use round before casting + # to convert to integers first # TODO revisit if arrow decides to support double -> date casting - x <- call_binding("floor", x) - x <- build_expr("cast", x, options = (cast_options(to_type = int32()))) + x <- build_expr("cast", x, options = (cast_options(to_type = int32(), safe = FALSE))) } if (origin != "1970-01-01") { abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index c8f6b3aef9e..79890f04696 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -819,7 +819,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) # actual and expected differ due to doubles are accounted for (floored in - # arrow and rouded to the next decimal in R) + # arrow and rounded to the next decimal in R) expect_error( compare_dplyr_binding( .input %>% @@ -829,6 +829,34 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) ) + expect_equal( + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect(), + test_df %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")), + # the absolute value for date_double is different due to arrow casting from + # integer and r from double => testing with a tolerance of 0.6 + # `actual$date_double`: 34.0 + # `expected$date_double`: 34.6 + tolerance = 0.6 + ) + + expect_equal( + test_df %>% + record_batch() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect(), + test_df %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")), + # the absolute value for date_double is different due to arrow casting from + # integer and r from double => testing with a tolerance of 0.6 + # `actual$date_double`: 34.0 + # `expected$date_double`: 34.6 + tolerance = 0.6 + ) + # currently we do not support an origin different to "1970-01-01" expect_warning( test_df %>% From 2be4e313a7fa33e3b8d7f596ee6cf4b8676e4b4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 1 Mar 2022 12:30:30 +0000 Subject: [PATCH 27/39] improved comments --- r/R/dplyr-funcs-type.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 11cdf484f15..bab1bb4881a 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -107,7 +107,8 @@ register_bindings_type_cast <- function() { abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") } } - # if x is not an expression (e.g. passed as filter), convert it to one + # if x is a character, but not an Expression (such as in some instances + # when passed via `filter.arrow_dplyr_query()`), convert it to one if (!inherits(x, "Expression")) { x <- build_expr("cast", x, options = cast_options(to_type = type(x))) } From a280f45c0b170da02a00173d8107272ee07ecefe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 1 Mar 2022 14:44:30 +0000 Subject: [PATCH 28/39] switch back to `floor` --- r/R/dplyr-funcs-type.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index bab1bb4881a..34d6c0d48aa 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -121,9 +121,13 @@ register_bindings_type_cast <- function() { # TODO revisit once the above has been sorted if (!call_binding("is.integer", x)) { # Arrow does not support direct casting from double to date so we have - # to convert to integers first + # to convert to integers first - casting to int32() would error so we + # need to use `floor` before casting. `floor` is also a bit safer than + # int32() with `safe = FALSE` since it doesn't switch to `ceiling` for + # negative numbers # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = (cast_options(to_type = int32(), safe = FALSE))) + x <- call_binding("floor", x) + x <- build_expr("cast", x, options = (cast_options(to_type = int32()))) } if (origin != "1970-01-01") { abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") From 81e9add76896d4c808cd1f0d9b6a946e3cb02c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 1 Mar 2022 16:27:02 +0000 Subject: [PATCH 29/39] let `build_expr()` handle conversion to `Expression` --- r/R/dplyr-funcs-type.R | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 34d6c0d48aa..da77d772824 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -107,12 +107,8 @@ register_bindings_type_cast <- function() { abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") } } - # if x is a character, but not an Expression (such as in some instances - # when passed via `filter.arrow_dplyr_query()`), convert it to one - if (!inherits(x, "Expression")) { - x <- build_expr("cast", x, options = cast_options(to_type = type(x))) - } - x <- call_binding("strptime", x, format, unit = "s") + unit <- make_valid_time_unit("s", c(valid_time64_units, valid_time32_units)) + x <- build_expr("strptime", x, options = list(format = format, unit = unit)) # cast from numeric } else if (call_binding("is.numeric", x)) { @@ -127,7 +123,7 @@ register_bindings_type_cast <- function() { # negative numbers # TODO revisit if arrow decides to support double -> date casting x <- call_binding("floor", x) - x <- build_expr("cast", x, options = (cast_options(to_type = int32()))) + x <- build_expr("cast", x, options = cast_options(to_type = int32())) } if (origin != "1970-01-01") { abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") From af3d5cf29b7c9dae2917099facafa956d82492c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 1 Mar 2022 16:38:54 +0000 Subject: [PATCH 30/39] simplify building the strptime expression (unit = O) --- r/R/dplyr-funcs-type.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index da77d772824..4c39c8739fb 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -107,8 +107,7 @@ register_bindings_type_cast <- function() { abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") } } - unit <- make_valid_time_unit("s", c(valid_time64_units, valid_time32_units)) - x <- build_expr("strptime", x, options = list(format = format, unit = unit)) + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) # cast from numeric } else if (call_binding("is.numeric", x)) { From d09bee8f697e8748139f9d795307bd8f6af8617f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 2 Mar 2022 10:43:05 +0000 Subject: [PATCH 31/39] unit tests updates --- r/tests/testthat/test-dplyr-funcs-datetime.R | 1 - r/tests/testthat/test-dplyr-funcs-type.R | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e4bc803d6f9..a221480955f 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -895,5 +895,4 @@ test_that("date() errors with unsupported inputs", { collect(), regexp = "Unsupported cast from double to date32 using function cast_date32" ) - }) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 79890f04696..ccb3ae4daae 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -883,7 +883,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a mutate(date_char_ymd = as.Date(character_ymd_var, tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% collect(), - regexp = "`as.Date()` with multiple `tryFormats` is not supported in Arrow", + regexp = "`as.Date()` with multiple `tryFormats` is not supported in Arrow yet", fixed = TRUE ) From 5da34fbd594977b3d6ce3970ce68777df1f64a5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 2 Mar 2022 10:43:58 +0000 Subject: [PATCH 32/39] simplified `format` vs `tryFormats` implementation and moved the check for `origin` at the top of the numeric block --- r/R/dplyr-funcs-type.R | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 4c39c8739fb..5a6692ead36 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -100,17 +100,17 @@ register_bindings_type_cast <- function() { # this could be improved with tryFormats once strptime returns NA and we # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (is.null(format)) { - if (length(tryFormats) == 1) { - format <- tryFormats[1] - } else { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") - } + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") } + format <- format %||% tryFormats[[1]] x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) # cast from numeric } else if (call_binding("is.numeric", x)) { + if (origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } # the origin argument will be better supported once we implement temporal # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) # TODO revisit once the above has been sorted @@ -124,9 +124,6 @@ register_bindings_type_cast <- function() { x <- call_binding("floor", x) x <- build_expr("cast", x, options = cast_options(to_type = int32())) } - if (origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } } build_expr("cast", x, options = cast_options(to_type = date32())) }) From 4f9d0c1e83339ce9c55983a57dc678c7dfb23c37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 2 Mar 2022 11:11:56 +0000 Subject: [PATCH 33/39] moved arg checking at the top and reorganised the body of the function + simplified some unit tests --- r/R/dplyr-funcs-type.R | 58 ++++++++++---------- r/tests/testthat/test-dplyr-funcs-datetime.R | 22 ++------ 2 files changed, 34 insertions(+), 46 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 5a6692ead36..4a92a56af83 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -82,48 +82,46 @@ register_bindings_type_cast <- function() { origin = "1970-01-01", tz = "UTC") { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + + if (tz != "UTC") { + abort("`as.Date()` with a timezone different to 'UTC' is not supported in Arrow") + } + # this could be improved with tryFormats once strptime returns NA and we + # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") + } + if (call_binding("is.Date", x)) { - # base::as.Date() first converts to the desired timezone and then extracts - # the date, which is why we need to go through timestamp() first return(x) # cast from POSIXct } else if (call_binding("is.POSIXct", x)) { - if (tz == "UTC") { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - } else { - abort("`as.Date()` with a timezone different to 'UTC' is not supported in Arrow") - } + # base::as.Date() first converts to the desired timezone and then extracts + # the date, which is why we need to go through timestamp() first + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = "UTC"))) # cast from character } else if (call_binding("is.character", x)) { - # this could be improved with tryFormats once strptime returns NA and we - # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") - } format <- format %||% tryFormats[[1]] x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) # cast from numeric - } else if (call_binding("is.numeric", x)) { - if (origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (!call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date so we have - # to convert to integers first - casting to int32() would error so we - # need to use `floor` before casting. `floor` is also a bit safer than - # int32() with `safe = FALSE` since it doesn't switch to `ceiling` for - # negative numbers - # TODO revisit if arrow decides to support double -> date casting - x <- call_binding("floor", x) - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date so we have to + # convert to integers first - casting to int32() would error so we need to + # use `floor` before casting. `floor` is also a bit safer than int32() with + # `safe = FALSE` since it doesn't switch to `ceiling` for negative numbers + # TODO revisit if arrow decides to support double -> date casting + x <- call_binding("floor", x) + x <- build_expr("cast", x, options = cast_options(to_type = int32())) } build_expr("cast", x, options = cast_options(to_type = date32())) }) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a221480955f..9e4cea04545 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -862,36 +862,26 @@ test_that("date works in arrow", { }) test_that("date() errors with unsupported inputs", { - test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), - date_var = as.Date("2022-02-25"), - character_ymd_var = "2022-02-25 00:00:01", - character_ydm_var = "2022/25/02 00:00:01", - integer_var = 32L, - double_var = 34.56, - boolean_var = TRUE - ) - expect_error( - test_df %>% + example_data %>% arrow_table() %>% - mutate(date_char = date(character_ymd_var)) %>% + mutate(date_char = date("2022-02-25 00:00:01")) %>% collect(), regexp = "Unsupported cast from string to date32 using function cast_date32" ) expect_error( - test_df %>% + example_data %>% arrow_table() %>% - mutate(date_bool = date(boolean_var)) %>% + mutate(date_bool = date(TRUE)) %>% collect(), regexp = "Unsupported cast from bool to date32 using function cast_date32" ) expect_error( - test_df %>% + example_data %>% arrow_table() %>% - mutate(date_double = date(double_var)) %>% + mutate(date_double = date(34.56)) %>% collect(), regexp = "Unsupported cast from double to date32 using function cast_date32" ) From 47587a93aad2de43865001a0bfa4fd52602d5977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 2 Mar 2022 11:29:29 +0000 Subject: [PATCH 34/39] removed redundant unit tests --- r/tests/testthat/test-dplyr-funcs-type.R | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index ccb3ae4daae..879de273b71 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -794,30 +794,6 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a test_df ) - # the way we go about it is a bit different, but the result is the same => - # testing without compare_dplyr_binding() - expect_equal( - test_df %>% - arrow_table() %>% - mutate(date_double = as.Date(double_var)) %>% - collect(), - test_df %>% - arrow_table() %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect() - ) - - expect_equal( - test_df %>% - record_batch() %>% - mutate(date_double = as.Date(double_var)) %>% - collect(), - test_df %>% - arrow_table() %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect() - ) - # actual and expected differ due to doubles are accounted for (floored in # arrow and rounded to the next decimal in R) expect_error( From fc7667c6424dc978fe295f7839c24beb931631db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Mar 2022 10:27:05 +0000 Subject: [PATCH 35/39] support other timezones than `"UTC"` and update tests to reflect that --- r/R/dplyr-funcs-type.R | 5 +---- r/tests/testthat/test-dplyr-funcs-type.R | 18 ++++++------------ 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 4a92a56af83..57aba18dcbf 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -89,9 +89,6 @@ register_bindings_type_cast <- function() { abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } - if (tz != "UTC") { - abort("`as.Date()` with a timezone different to 'UTC' is not supported in Arrow") - } # this could be improved with tryFormats once strptime returns NA and we # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done @@ -106,7 +103,7 @@ register_bindings_type_cast <- function() { } else if (call_binding("is.POSIXct", x)) { # base::as.Date() first converts to the desired timezone and then extracts # the date, which is why we need to go through timestamp() first - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = "UTC"))) + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) # cast from character } else if (call_binding("is.character", x)) { diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 879de273b71..2cb35b44840 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -768,7 +768,8 @@ test_that("nested structs can be created from scalars and existing data frames", collect(), tibble(a = 1:2) ) -}) + + }) test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { test_df <- tibble::tibble( @@ -841,16 +842,6 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a collect(), regexp = "`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow", fixed = TRUE - - ) - - expect_warning( - test_df %>% - arrow_table() %>% - mutate(date_pv = as.Date(posixct_var, tz = "Pacific/Marquesas")) %>% - collect(), - regexp = "`as.Date()` with a timezone different to 'UTC' is not supported in Arrow", - fixed = TRUE ) expect_warning( @@ -875,7 +866,10 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 compare_dplyr_binding( .input %>% - mutate(date_pv = as.Date(posixct_var)) %>% + mutate( + date_pv = as.Date(posixct_var), + date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") + ) %>% collect(), test_df ) From 036e3493e4dba08f14c441b00aba7100c306d1fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Mar 2022 10:27:20 +0000 Subject: [PATCH 36/39] update NEWS --- r/NEWS.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index cc912a03fad..81a23aa0318 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -25,8 +25,9 @@ * `semester()` to extract/get semester * `dst()` to get daylight savings time indicator. * `date()` to extract date - * * `epiyear()` to get epiyear -* `as.Date()` to convert to date + * `epiyear()` to get epiyear +* date-time functionality: + * `as.Date()` to convert to date # arrow 7.0.0 From 5788fd5b4fd69113935b7b0239bf8220dde735fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Mar 2022 14:03:30 +0000 Subject: [PATCH 37/39] remove suport for double and corresponding unit tests --- r/R/dplyr-funcs-type.R | 12 ++---- r/tests/testthat/test-dplyr-funcs-type.R | 52 ++++++------------------ 2 files changed, 16 insertions(+), 48 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 57aba18dcbf..e8ed6904823 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -85,7 +85,7 @@ register_bindings_type_cast <- function() { # the origin argument will be better supported once we implement temporal # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) # TODO revisit once the above has been sorted - if (origin != "1970-01-01") { + if (call_binding("is.numeric", x) & origin != "1970-01-01") { abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } @@ -93,7 +93,7 @@ register_bindings_type_cast <- function() { # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow yet") + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") } if (call_binding("is.Date", x)) { @@ -112,13 +112,9 @@ register_bindings_type_cast <- function() { # cast from numeric } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date so we have to - # convert to integers first - casting to int32() would error so we need to - # use `floor` before casting. `floor` is also a bit safer than int32() with - # `safe = FALSE` since it doesn't switch to `ceiling` for negative numbers + # Arrow does not support direct casting from double to date32() # TODO revisit if arrow decides to support double -> date casting - x <- call_binding("floor", x) - x <- build_expr("cast", x, options = cast_options(to_type = int32())) + abort("`as.Date()` with double/float is not supported in Arrow") } build_expr("cast", x, options = cast_options(to_type = date32())) }) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 2cb35b44840..e18b9784576 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -795,45 +795,6 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a test_df ) - # actual and expected differ due to doubles are accounted for (floored in - # arrow and rounded to the next decimal in R) - expect_error( - compare_dplyr_binding( - .input %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect(), - test_df - ) - ) - - expect_equal( - test_df %>% - arrow_table() %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect(), - test_df %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")), - # the absolute value for date_double is different due to arrow casting from - # integer and r from double => testing with a tolerance of 0.6 - # `actual$date_double`: 34.0 - # `expected$date_double`: 34.6 - tolerance = 0.6 - ) - - expect_equal( - test_df %>% - record_batch() %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect(), - test_df %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")), - # the absolute value for date_double is different due to arrow casting from - # integer and r from double => testing with a tolerance of 0.6 - # `actual$date_double`: 34.0 - # `expected$date_double`: 34.6 - tolerance = 0.6 - ) - # currently we do not support an origin different to "1970-01-01" expect_warning( test_df %>% @@ -844,13 +805,14 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a fixed = TRUE ) + # we do not support multiple tryFormats expect_warning( test_df %>% arrow_table() %>% mutate(date_char_ymd = as.Date(character_ymd_var, tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% collect(), - regexp = "`as.Date()` with multiple `tryFormats` is not supported in Arrow yet", + regexp = "`as.Date()` with multiple `tryFormats` is not supported in Arrow", fixed = TRUE ) @@ -863,6 +825,16 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a fixed = TRUE ) + # we do not support as.Date() with double/ float + expect_warning( + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect(), + regexp = "`as.Date()` with double/float is not supported in Arrow", + fixed = TRUE + ) + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 compare_dplyr_binding( .input %>% From a549f4c3cb431a28bfc86cd46a10798b6c598357 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Mar 2022 15:46:12 +0000 Subject: [PATCH 38/39] use `compare_dplyr_binding()` to test pull back to R --- r/tests/testthat/test-dplyr-funcs-type.R | 27 +++++++++++------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index e18b9784576..18efc5326dc 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -796,24 +796,22 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) # currently we do not support an origin different to "1970-01-01" - expect_warning( - test_df %>% - arrow_table() %>% + compare_dplyr_binding( + .input %>% mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% collect(), - regexp = "`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow", - fixed = TRUE + test_df, + warning = TRUE ) # we do not support multiple tryFormats - expect_warning( - test_df %>% - arrow_table() %>% + compare_dplyr_binding( + .input %>% mutate(date_char_ymd = as.Date(character_ymd_var, tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% collect(), - regexp = "`as.Date()` with multiple `tryFormats` is not supported in Arrow", - fixed = TRUE + test_df, + warning = TRUE ) expect_error( @@ -826,13 +824,12 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) # we do not support as.Date() with double/ float - expect_warning( - test_df %>% - arrow_table() %>% + compare_dplyr_binding( + .input %>% mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% collect(), - regexp = "`as.Date()` with double/float is not supported in Arrow", - fixed = TRUE + test_df, + warning = TRUE ) skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 From 1a9bbfd320c5bb73401c223249b4683242e41397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Mar 2022 15:52:44 +0000 Subject: [PATCH 39/39] added a couple of comments to improve quality --- r/R/dplyr-funcs-type.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index e8ed6904823..fa839269abe 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -108,11 +108,13 @@ register_bindings_type_cast <- function() { # cast from character } else if (call_binding("is.character", x)) { format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) # cast from numeric } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { # Arrow does not support direct casting from double to date32() + # https://issues.apache.org/jira/browse/ARROW-15798 # TODO revisit if arrow decides to support double -> date casting abort("`as.Date()` with double/float is not supported in Arrow") }