From 713b2ed7aa245ca6623d7e2d4fc6c4dbb6cd0a88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 14:31:47 +0000 Subject: [PATCH 01/22] added integerish test for `as.Date()` --- r/tests/testthat/test-dplyr-funcs-type.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 6c9d9ac07a4..7556947ee70 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -809,6 +809,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a character_ymd_var = "2022-02-25 00:00:01", character_ydm_var = "2022/25/02 00:00:01", integer_var = 32L, + integerish_var = 32, double_var = 34.56 ) @@ -820,7 +821,8 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a date_dv = as.Date(date_var), date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as.Date(integer_var, origin = "1970-01-01") + date_int = as.Date(integer_var, origin = "1970-01-01"), + date_integerish = as.Date(integerish_var, origin = "1970-01-01") ) %>% collect(), test_df From 256e97ae9be896f4a47a3a26ca7fc0b3f0da3660 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 14:34:08 +0000 Subject: [PATCH 02/22] add integerish support for `as.Date()` --- r/R/dplyr-funcs-type.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 1bb633d5322..d9b56181545 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -114,10 +114,11 @@ register_bindings_type_cast <- function() { # cast from numeric } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32() + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() # https://issues.apache.org/jira/browse/ARROW-15798 # TODO revisit if arrow decides to support double -> date casting - abort("`as.Date()` with double/float is not supported in Arrow") + x <- build_expr("cast", x, options = cast_options(to_type = int32())) } build_expr("cast", x, options = cast_options(to_type = date32())) }) From ec99da33539a492c171f8d644adc8aaeb09e8528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 14:39:36 +0000 Subject: [PATCH 03/22] replace `compare_dplyr_binding()` with `expect_error()` --- r/tests/testthat/test-dplyr-funcs-type.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 7556947ee70..d05366b2d20 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -856,13 +856,12 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a fixed = TRUE ) - # we do not support as.Date() with double/ float - compare_dplyr_binding( - .input %>% + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect(), - test_df, - warning = TRUE + collect() ) skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 From 7571fc080e32a7b6bd2ec1cf1bb49d48c507c5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 14:46:49 +0000 Subject: [PATCH 04/22] added the `as_date()` binding --- r/R/dplyr-funcs-type.R | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index d9b56181545..6da43a617db 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -123,6 +123,47 @@ register_bindings_type_cast <- function() { build_expr("cast", x, options = cast_options(to_type = date32())) }) + register_binding("as_date", function(x, + format = NULL, + origin = "1970-01-01", + tz = "UTC") { + + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + + if (!is.null(tz)) { + "`tz` argument is ignored by `as_date()`" + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # POSIXct is of type double -> we need this to prevent going down the + # "double" branch + x <- x + + # cast from character + } else if (call_binding("is.character", x)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) + register_binding("is", function(object, class2) { if (is.string(class2)) { switch(class2, From 11e99bdf0a809305c5334b0c610144a6d55d5f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 15:39:37 +0000 Subject: [PATCH 05/22] if `format` is unspecified, assume ISO --- r/R/dplyr-funcs-type.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 6da43a617db..038742428e4 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -82,7 +82,6 @@ register_bindings_type_cast <- function() { tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { - # the origin argument will be better supported once we implement temporal # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) # TODO revisit once the above has been sorted @@ -127,7 +126,6 @@ register_bindings_type_cast <- function() { format = NULL, origin = "1970-01-01", tz = "UTC") { - # the origin argument will be better supported once we implement temporal # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) # TODO revisit once the above has been sorted @@ -135,8 +133,9 @@ register_bindings_type_cast <- function() { abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } - if (!is.null(tz)) { - "`tz` argument is ignored by `as_date()`" + # assume format is ISO if unspecified (to align with lubridate::as_date) + if (is.null(format)) { + format <- "%Y-%m-%d" } if (call_binding("is.Date", x)) { @@ -144,6 +143,9 @@ register_bindings_type_cast <- function() { # cast from POSIXct } else if (call_binding("is.POSIXct", x)) { + if (!missing(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } # POSIXct is of type double -> we need this to prevent going down the # "double" branch x <- x From 071d68d0f919298f9675b49db72a6b3ee3299848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 15:39:55 +0000 Subject: [PATCH 06/22] unit tests for `as_date()` --- r/tests/testthat/test-dplyr-funcs-type.R | 82 ++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index d05366b2d20..1a15d173347 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -876,6 +876,88 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) }) +test_that("as_date()", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Europe/London"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + # test_df %>% + # arrow_table() %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% + collect(), + test_df + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as_date(date_var), + date_char_ymd = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as_date(integer_var, origin = "1970-01-01"), + date_integerish = as_date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # strptime does not support a partial format - + # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as_date(character_ymd_var)) %>% + collect() + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as_date(posixct_var), + date_pv_tz = as_date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) + test_that("format date/time", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 From 30ac13df4e510368e06b94046546a18e33954a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 15:53:40 +0000 Subject: [PATCH 07/22] update NEWS --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 0a7d30d2a37..78e67ca4282 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -23,6 +23,7 @@ * `lubridate`: * component extraction functions: `tz()` (timezone), `semester()` (semester), `dst()` (daylight savings time indicator), `date()` (extract date), `epiyear()` (epiyear), improvements to `month()`, which now works with integer inputs. * `make_date()` & `make_datetime()` + `ISOdatetime()` & `ISOdate()` to create date-times from numeric representations. + * `as_date()` and `as_datetime()` * date-time functionality: * `difftime` and `as.difftime()` * `as.Date()` to convert to date From c9fb23a74f3f33a2f530e494d0b1e1f3255249c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 16:10:14 +0000 Subject: [PATCH 08/22] moved the `as_date()` binding to _dplyr-funcs-datetime_ --- r/R/dplyr-funcs-datetime.R | 43 ++++++++++++++++++++++++++++++++++++++ r/R/dplyr-funcs-type.R | 42 ------------------------------------- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 62da029c08a..5aa551f6a66 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -188,6 +188,49 @@ register_bindings_datetime <- function() { register_binding("date", function(x) { build_expr("cast", x, options = list(to_type = date32())) }) + register_binding("as_date", function(x, + format = NULL, + origin = "1970-01-01", + tz = "UTC") { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + + # assume format is ISO if unspecified (to align with lubridate::as_date) + if (is.null(format)) { + format <- "%Y-%m-%d" + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + if (!missing(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + # POSIXct is of type double -> we need this to prevent going down the + # "double" branch + x <- x + + # cast from character + } else if (call_binding("is.character", x)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) } register_bindings_duration <- function() { diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 038742428e4..3ac8de99040 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -122,49 +122,7 @@ register_bindings_type_cast <- function() { build_expr("cast", x, options = cast_options(to_type = date32())) }) - register_binding("as_date", function(x, - format = NULL, - origin = "1970-01-01", - tz = "UTC") { - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # assume format is ISO if unspecified (to align with lubridate::as_date) - if (is.null(format)) { - format <- "%Y-%m-%d" - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - if (!missing(tz)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - } - # POSIXct is of type double -> we need this to prevent going down the - # "double" branch - x <- x - - # cast from character - } else if (call_binding("is.character", x)) { - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - build_expr("cast", x, options = cast_options(to_type = date32())) - }) register_binding("is", function(object, class2) { if (is.string(class2)) { From 67c56b5399fba301c9e8c51dd11bbcc316313177 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 16:10:51 +0000 Subject: [PATCH 09/22] moved `as_date()` tests to _test-dplyr-funcs-datetime_ --- r/tests/testthat/test-dplyr-funcs-datetime.R | 82 ++++++++++++++++++++ r/tests/testthat/test-dplyr-funcs-type.R | 82 -------------------- 2 files changed, 82 insertions(+), 82 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 6328a4c8276..c28f244dbe5 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1213,3 +1213,85 @@ test_that("as.difftime()", { collect() ) }) + +test_that("as_date()", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Europe/London"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + # test_df %>% + # arrow_table() %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% + collect(), + test_df + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as_date(date_var), + date_char_ymd = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as_date(integer_var, origin = "1970-01-01"), + date_integerish = as_date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # strptime does not support a partial format - + # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as_date(character_ymd_var)) %>% + collect() + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as_date(posixct_var), + date_pv_tz = as_date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 1a15d173347..d05366b2d20 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -876,88 +876,6 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) }) -test_that("as_date()", { - test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), - date_var = as.Date("2022-02-25"), - difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Europe/London"), - character_ymd_var = "2022-02-25 00:00:01", - character_ydm_var = "2022/25/02 00:00:01", - integer_var = 32L, - integerish_var = 32, - double_var = 34.56 - ) - - # difference between as.Date() and as_date(): - #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg - # to `as.Date()` - # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object - # passsed if`tz` is NULL - compare_dplyr_binding( - .input %>% - # test_df %>% - # arrow_table() %>% - transmute( - date_diff_lubridate = as_date(difference_date), - date_diff_base = as.Date(difference_date) - ) %>% - collect(), - test_df - ) - - # casting from POSIXct treated separately so we can skip on Windows - # TODO move the test for casting from POSIXct below once ARROW-13168 is done - compare_dplyr_binding( - .input %>% - mutate( - date_dv = as_date(date_var), - date_char_ymd = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as_date(integer_var, origin = "1970-01-01"), - date_integerish = as_date(integerish_var, origin = "1970-01-01") - ) %>% - collect(), - test_df - ) - - # currently we do not support an origin different to "1970-01-01" - compare_dplyr_binding( - .input %>% - mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% - collect(), - test_df, - warning = TRUE - ) - - # strptime does not support a partial format - - # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd = as_date(character_ymd_var)) %>% - collect() - ) - - # we do not support as.Date() with double/ float (error surfaced from C++) - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% - collect() - ) - - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( - .input %>% - mutate( - date_pv = as_date(posixct_var), - date_pv_tz = as_date(posixct_var, tz = "Pacific/Marquesas") - ) %>% - collect(), - test_df - ) -}) - test_that("format date/time", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 From 517af0428c79351bc81f14f840d9eab634c0e208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Mar 2022 16:28:39 +0000 Subject: [PATCH 10/22] moved `as.Date()` & tests to dplyr-funcs-datetime --- r/R/dplyr-funcs-datetime.R | 52 ++++++++++++++ r/R/dplyr-funcs-type.R | 46 ------------ r/tests/testthat/test-dplyr-funcs-datetime.R | 74 ++++++++++++++++++++ r/tests/testthat/test-dplyr-funcs-type.R | 74 -------------------- 4 files changed, 126 insertions(+), 120 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5aa551f6a66..59793d48c4a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -188,6 +188,50 @@ register_bindings_datetime <- function() { register_binding("date", function(x) { build_expr("cast", x, options = list(to_type = date32())) }) + register_binding("as.Date", function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01", + tz = "UTC") { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + + # this could be improved with tryFormats once strptime returns NA and we + # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # base::as.Date() first converts to the desired timezone and then extracts + # the date, which is why we need to go through timestamp() first + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + + # cast from character + } else if (call_binding("is.character", x)) { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) register_binding("as_date", function(x, format = NULL, origin = "1970-01-01", @@ -209,6 +253,7 @@ register_bindings_datetime <- function() { # cast from POSIXct } else if (call_binding("is.POSIXct", x)) { + # this is where as_date() differs from as.Date() if (!missing(tz)) { x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) } @@ -231,6 +276,13 @@ register_bindings_datetime <- function() { } build_expr("cast", x, options = cast_options(to_type = date32())) }) + register_binding("as_datetime", function(x, + origin, + tz){ + if (call_binding("is.numeric", x)) { + + } + }) } register_bindings_duration <- function() { diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 3ac8de99040..653719fa2cc 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -77,52 +77,6 @@ register_bindings_type_cast <- function() { register_binding("as.numeric", function(x) { build_expr("cast", x, options = cast_options(to_type = float64())) }) - register_binding("as.Date", function(x, - format = NULL, - tryFormats = "%Y-%m-%d", - origin = "1970-01-01", - tz = "UTC") { - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # this could be improved with tryFormats once strptime returns NA and we - # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() first converts to the desired timezone and then extracts - # the date, which is why we need to go through timestamp() first - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - - # cast from character - } else if (call_binding("is.character", x)) { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - build_expr("cast", x, options = cast_options(to_type = date32())) - }) - - register_binding("is", function(object, class2) { if (is.string(class2)) { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index c28f244dbe5..1ecb4bf98ab 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1214,6 +1214,80 @@ test_that("as.difftime()", { ) }) +test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as.Date(date_var), + date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as.Date(integer_var, origin = "1970-01-01"), + date_integerish = as.Date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # we do not support multiple tryFormats + compare_dplyr_binding( + .input %>% + mutate(date_char_ymd = as.Date(character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% + collect(), + test_df, + warning = TRUE + ) + + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as.Date(character_ymd_var)) %>% + collect(), + regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", + fixed = TRUE + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as.Date(posixct_var), + date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) + test_that("as_date()", { test_df <- tibble::tibble( posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index d05366b2d20..cf84026f0b9 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -800,80 +800,6 @@ test_that("nested structs can be created from scalars and existing data frames", tibble(a = 1:2) ) - }) - -test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { - test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), - date_var = as.Date("2022-02-25"), - character_ymd_var = "2022-02-25 00:00:01", - character_ydm_var = "2022/25/02 00:00:01", - integer_var = 32L, - integerish_var = 32, - double_var = 34.56 - ) - - # casting from POSIXct treated separately so we can skip on Windows - # TODO move the test for casting from POSIXct below once ARROW-13168 is done - compare_dplyr_binding( - .input %>% - mutate( - date_dv = as.Date(date_var), - date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as.Date(integer_var, origin = "1970-01-01"), - date_integerish = as.Date(integerish_var, origin = "1970-01-01") - ) %>% - collect(), - test_df - ) - - # currently we do not support an origin different to "1970-01-01" - compare_dplyr_binding( - .input %>% - mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% - collect(), - test_df, - warning = TRUE - ) - - # we do not support multiple tryFormats - compare_dplyr_binding( - .input %>% - mutate(date_char_ymd = as.Date(character_ymd_var, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% - collect(), - test_df, - warning = TRUE - ) - - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd = as.Date(character_ymd_var)) %>% - collect(), - regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", - fixed = TRUE - ) - - # we do not support as.Date() with double/ float (error surfaced from C++) - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect() - ) - - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( - .input %>% - mutate( - date_pv = as.Date(posixct_var), - date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") - ) %>% - collect(), - test_df - ) }) test_that("format date/time", { From c53a8c8b02f3bef667d75e3985bc77cb2911408c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Mar 2022 11:04:22 +0000 Subject: [PATCH 11/22] gymnastics with the location of the bindings (in prep for rebase) --- r/R/dplyr-funcs-datetime.R | 95 -------------- r/R/dplyr-funcs-type.R | 94 ++++++++++++++ r/tests/testthat/test-dplyr-funcs-type.R | 156 +++++++++++++++++++++++ 3 files changed, 250 insertions(+), 95 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 59793d48c4a..62da029c08a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -188,101 +188,6 @@ register_bindings_datetime <- function() { register_binding("date", function(x) { build_expr("cast", x, options = list(to_type = date32())) }) - register_binding("as.Date", function(x, - format = NULL, - tryFormats = "%Y-%m-%d", - origin = "1970-01-01", - tz = "UTC") { - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # this could be improved with tryFormats once strptime returns NA and we - # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() first converts to the desired timezone and then extracts - # the date, which is why we need to go through timestamp() first - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - - # cast from character - } else if (call_binding("is.character", x)) { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - build_expr("cast", x, options = cast_options(to_type = date32())) - }) - register_binding("as_date", function(x, - format = NULL, - origin = "1970-01-01", - tz = "UTC") { - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # assume format is ISO if unspecified (to align with lubridate::as_date) - if (is.null(format)) { - format <- "%Y-%m-%d" - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # this is where as_date() differs from as.Date() - if (!missing(tz)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - } - # POSIXct is of type double -> we need this to prevent going down the - # "double" branch - x <- x - - # cast from character - } else if (call_binding("is.character", x)) { - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - build_expr("cast", x, options = cast_options(to_type = date32())) - }) - register_binding("as_datetime", function(x, - origin, - tz){ - if (call_binding("is.numeric", x)) { - - } - }) } register_bindings_duration <- function() { diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 653719fa2cc..d1750bf88b9 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -77,7 +77,101 @@ register_bindings_type_cast <- function() { register_binding("as.numeric", function(x) { build_expr("cast", x, options = cast_options(to_type = float64())) }) + register_binding("as.Date", function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01", + tz = "UTC") { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + + # this could be improved with tryFormats once strptime returns NA and we + # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # base::as.Date() first converts to the desired timezone and then extracts + # the date, which is why we need to go through timestamp() first + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + + # cast from character + } else if (call_binding("is.character", x)) { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) + register_binding("as_date", function(x, + format = NULL, + origin = "1970-01-01", + tz = "UTC") { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + + # assume format is ISO if unspecified (to align with lubridate::as_date) + if (is.null(format)) { + format <- "%Y-%m-%d" + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # this is where as_date() differs from as.Date() + if (!missing(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + # POSIXct is of type double -> we need this to prevent going down the + # "double" branch + x <- x + + # cast from character + } else if (call_binding("is.character", x)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) + register_binding("as_datetime", function(x, + origin, + tz) { + if (call_binding("is.numeric", x)) { + + } + }) register_binding("is", function(object, class2) { if (is.string(class2)) { switch(class2, diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index cf84026f0b9..7ec6fb7ae4f 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -903,3 +903,159 @@ test_that("format() for unsupported types returns the input as string", { collect() ) }) + +test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as.Date(date_var), + date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as.Date(integer_var, origin = "1970-01-01"), + date_integerish = as.Date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # we do not support multiple tryFormats + compare_dplyr_binding( + .input %>% + mutate(date_char_ymd = as.Date(character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% + collect(), + test_df, + warning = TRUE + ) + + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as.Date(character_ymd_var)) %>% + collect(), + regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", + fixed = TRUE + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as.Date(posixct_var), + date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) + +test_that("as_date()", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Europe/London"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + # test_df %>% + # arrow_table() %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% + collect(), + test_df + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as_date(date_var), + date_char_ymd = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as_date(integer_var, origin = "1970-01-01"), + date_integerish = as_date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # strptime does not support a partial format - + # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as_date(character_ymd_var)) %>% + collect() + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as_date(posixct_var), + date_pv_tz = as_date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) From 08e629f7535d3dea8d55232f1faec87773528258 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Mar 2022 13:37:11 +0000 Subject: [PATCH 12/22] first pass at `as_datetime()` --- r/R/dplyr-funcs-type.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index d1750bf88b9..af6f432c3d9 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -166,11 +166,9 @@ register_bindings_type_cast <- function() { build_expr("cast", x, options = cast_options(to_type = date32())) }) register_binding("as_datetime", function(x, - origin, - tz) { - if (call_binding("is.numeric", x)) { - - } + origin = "1970-01-01", + tz = "UTC") { + build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) }) register_binding("is", function(object, class2) { if (is.string(class2)) { From 14bf8eaef88989638503266b49164dabf4b0eeed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Mar 2022 15:01:36 +0000 Subject: [PATCH 13/22] lint + reorg --- r/tests/testthat/test-dplyr-funcs-type.R | 48 ++++++++++++------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 7ec6fb7ae4f..3767af82171 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -881,29 +881,6 @@ test_that("format date/time", { ) }) -test_that("format() for unsupported types returns the input as string", { - expect_equal( - example_data %>% - record_batch() %>% - mutate(x = format(int)) %>% - collect(), - example_data %>% - record_batch() %>% - mutate(x = as.character(int)) %>% - collect() - ) - expect_equal( - example_data %>% - arrow_table() %>% - mutate(y = format(dbl)) %>% - collect(), - example_data %>% - arrow_table() %>% - mutate(y = as.character(dbl)) %>% - collect() - ) -}) - test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { test_df <- tibble::tibble( posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), @@ -982,7 +959,7 @@ test_that("as_date()", { test_df <- tibble::tibble( posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), date_var = as.Date("2022-02-25"), - difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Europe/London"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"), character_ymd_var = "2022-02-25 00:00:01", character_ydm_var = "2022/25/02 00:00:01", integer_var = 32L, @@ -1059,3 +1036,26 @@ test_that("as_date()", { test_df ) }) + +test_that("format() for unsupported types returns the input as string", { + expect_equal( + example_data %>% + record_batch() %>% + mutate(x = format(int)) %>% + collect(), + example_data %>% + record_batch() %>% + mutate(x = as.character(int)) %>% + collect() + ) + expect_equal( + example_data %>% + arrow_table() %>% + mutate(y = format(dbl)) %>% + collect(), + example_data %>% + arrow_table() %>% + mutate(y = as.character(dbl)) %>% + collect() + ) +}) From 7cd97c9aa700e143a2436a35692e09af8c685684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Mar 2022 15:02:03 +0000 Subject: [PATCH 14/22] `as_datetime()` binding --- r/R/dplyr-funcs-type.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index af6f432c3d9..e50cf2a3c26 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -82,6 +82,7 @@ register_bindings_type_cast <- function() { tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { + # the origin argument will be better supported once we implement temporal # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) # TODO revisit once the above has been sorted @@ -168,6 +169,12 @@ register_bindings_type_cast <- function() { register_binding("as_datetime", function(x, origin = "1970-01-01", tz = "UTC") { + if (call_binding("is.numeric", x) && origin != "1970-01-01") { + delta <- call_binding("difftime", origin, "1970-01-01") + output <- build_expr("+", x, delta) + output <- build_expr("cast", output, options = cast_options(to_type = timestamp(timezone = tz))) + return(output) + } build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) }) register_binding("is", function(object, class2) { From 453199c751d3a294e7f9a08564bf529063ededaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 09:50:37 +0000 Subject: [PATCH 15/22] improvements to `as_datetime()` --- r/R/dplyr-funcs-type.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index e50cf2a3c26..fa2ae698d5b 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -172,10 +172,11 @@ register_bindings_type_cast <- function() { if (call_binding("is.numeric", x) && origin != "1970-01-01") { delta <- call_binding("difftime", origin, "1970-01-01") output <- build_expr("+", x, delta) - output <- build_expr("cast", output, options = cast_options(to_type = timestamp(timezone = tz))) - return(output) + output <- build_expr("cast", output, options = cast_options(to_type = timestamp())) + } else { + output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) } - build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + build_expr("assume_timezone", output, options = list(timezone = tz)) }) register_binding("is", function(object, class2) { if (is.string(class2)) { From 99cea4a6e18151a5dea5ccbdbfb32a293931fc33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 09:51:46 +0000 Subject: [PATCH 16/22] unit test first step --- r/tests/testthat/test-dplyr-funcs-type.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 3767af82171..9b0c2138be8 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -1037,6 +1037,28 @@ test_that("as_date()", { ) }) +test_that("`as_datetime()`", { + test_df <- tibble( + date = as.Date(c("2022-03-22", "2021-07-30", NA)), + char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), + int_date = c(10L, 25L, NA), + integerish_date = c(10, 25, NA), + double_date = c(10.1, 25.2, NA) + ) + + test_df %>% + arrow_table() %>% + mutate( + # ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + # ddouble_date = as_datetime(double_date) + ) %>% + collect() +}) + test_that("format() for unsupported types returns the input as string", { expect_equal( example_data %>% From 76803ee7f5d7e2e761aecbd526f1b48b5fe35757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Mar 2022 14:00:11 +0000 Subject: [PATCH 17/22] update --- r/R/dplyr-funcs-type.R | 2 +- r/tests/testthat/test-dplyr-funcs-type.R | 26 +++++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index fa2ae698d5b..5b5a2e10c37 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -169,7 +169,7 @@ register_bindings_type_cast <- function() { register_binding("as_datetime", function(x, origin = "1970-01-01", tz = "UTC") { - if (call_binding("is.numeric", x) && origin != "1970-01-01") { + if (call_binding("is.numeric", x)) { delta <- call_binding("difftime", origin, "1970-01-01") output <- build_expr("+", x, delta) output <- build_expr("cast", output, options = cast_options(to_type = timestamp())) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 9b0c2138be8..2fd906418a8 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -1049,14 +1049,30 @@ test_that("`as_datetime()`", { test_df %>% arrow_table() %>% mutate( - # ddate = as_datetime(date), - dchar_date_no_tz = as_datetime(char_date), - dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), - dint_date = as_datetime(int_date, origin = "1970-01-02"), + # ddate = as_datetime(date)#, + # dchar_date_no_tz = as_datetime(char_date)#, + # dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas")#, + # dint_date = as_datetime(int_date, origin = "1970-01-02")#, + # dint_date2 = as_datetime(int_date, origin = "1970-01-01"), # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), - # ddouble_date = as_datetime(double_date) + # dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01"), + ddouble_date = as_datetime(double_date) ) %>% collect() + + compare_dplyr_binding( + .input %>% + mutate( + # ddate = as_datetime(date)#, + # dchar_date_no_tz = as_datetime(char_date)#, + # dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas")#, + dint_date = as_datetime(int_date, origin = "1970-01-02")#, + # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + # ddouble_date = as_datetime(double_date) + ) %>% + collect(), + test_df + ) }) test_that("format() for unsupported types returns the input as string", { From b496c12bd136c4eada66e439cfeb148cd1dc98fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 15:02:14 +0100 Subject: [PATCH 18/22] moved datetime-related tests to `test-dplyr-funcs-datetime.R` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 38 ++++++ r/tests/testthat/test-dplyr-funcs-type.R | 120 ------------------- 2 files changed, 38 insertions(+), 120 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 1ecb4bf98ab..7efa503c070 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1369,3 +1369,41 @@ test_that("as_date()", { test_df ) }) + +test_that("`as_datetime()`", { + test_df <- tibble( + date = as.Date(c("2022-03-22", "2021-07-30", NA)), + char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), + int_date = c(10L, 25L, NA), + integerish_date = c(10, 25, NA), + double_date = c(10.1, 25.2, NA) + ) + + test_df %>% + arrow_table() %>% + mutate( + # ddate = as_datetime(date)#, + # dchar_date_no_tz = as_datetime(char_date)#, + # dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas")#, + # dint_date = as_datetime(int_date, origin = "1970-01-02")#, + # dint_date2 = as_datetime(int_date, origin = "1970-01-01"), + # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + # dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01"), + ddouble_date = as_datetime(double_date) + ) %>% + collect() + + compare_dplyr_binding( + .input %>% + mutate( + # ddate = as_datetime(date)#, + # dchar_date_no_tz = as_datetime(char_date)#, + # dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas")#, + dint_date = as_datetime(int_date, origin = "1970-01-02")#, + # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + # ddouble_date = as_datetime(double_date) + ) %>% + collect(), + test_df + ) +}) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 2fd906418a8..77341bf1c22 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -955,126 +955,6 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) }) -test_that("as_date()", { - test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), - date_var = as.Date("2022-02-25"), - difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"), - character_ymd_var = "2022-02-25 00:00:01", - character_ydm_var = "2022/25/02 00:00:01", - integer_var = 32L, - integerish_var = 32, - double_var = 34.56 - ) - - # difference between as.Date() and as_date(): - #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg - # to `as.Date()` - # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object - # passsed if`tz` is NULL - compare_dplyr_binding( - .input %>% - # test_df %>% - # arrow_table() %>% - transmute( - date_diff_lubridate = as_date(difference_date), - date_diff_base = as.Date(difference_date) - ) %>% - collect(), - test_df - ) - - # casting from POSIXct treated separately so we can skip on Windows - # TODO move the test for casting from POSIXct below once ARROW-13168 is done - compare_dplyr_binding( - .input %>% - mutate( - date_dv = as_date(date_var), - date_char_ymd = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as_date(integer_var, origin = "1970-01-01"), - date_integerish = as_date(integerish_var, origin = "1970-01-01") - ) %>% - collect(), - test_df - ) - - # currently we do not support an origin different to "1970-01-01" - compare_dplyr_binding( - .input %>% - mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% - collect(), - test_df, - warning = TRUE - ) - - # strptime does not support a partial format - - # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd = as_date(character_ymd_var)) %>% - collect() - ) - - # we do not support as.Date() with double/ float (error surfaced from C++) - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% - collect() - ) - - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( - .input %>% - mutate( - date_pv = as_date(posixct_var), - date_pv_tz = as_date(posixct_var, tz = "Pacific/Marquesas") - ) %>% - collect(), - test_df - ) -}) - -test_that("`as_datetime()`", { - test_df <- tibble( - date = as.Date(c("2022-03-22", "2021-07-30", NA)), - char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), - int_date = c(10L, 25L, NA), - integerish_date = c(10, 25, NA), - double_date = c(10.1, 25.2, NA) - ) - - test_df %>% - arrow_table() %>% - mutate( - # ddate = as_datetime(date)#, - # dchar_date_no_tz = as_datetime(char_date)#, - # dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas")#, - # dint_date = as_datetime(int_date, origin = "1970-01-02")#, - # dint_date2 = as_datetime(int_date, origin = "1970-01-01"), - # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), - # dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01"), - ddouble_date = as_datetime(double_date) - ) %>% - collect() - - compare_dplyr_binding( - .input %>% - mutate( - # ddate = as_datetime(date)#, - # dchar_date_no_tz = as_datetime(char_date)#, - # dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas")#, - dint_date = as_datetime(int_date, origin = "1970-01-02")#, - # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), - # ddouble_date = as_datetime(double_date) - ) %>% - collect(), - test_df - ) -}) - test_that("format() for unsupported types returns the input as string", { expect_equal( example_data %>% From a219939239b2f0185d5098c3eac895825208b630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 15:02:46 +0100 Subject: [PATCH 19/22] move datetime bindings to ...-funcs-datetime --- r/R/dplyr-funcs-datetime.R | 194 ++++++++++++++++++++++++++++--------- r/R/dplyr-funcs-type.R | 101 ------------------- 2 files changed, 149 insertions(+), 146 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 62da029c08a..354b501bd10 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -191,6 +191,70 @@ register_bindings_datetime <- function() { } register_bindings_duration <- function() { + register_binding("difftime", function(time1, + time2, + tz, + units = "secs") { + if (units != "secs") { + abort("`difftime()` with units other than `secs` not supported in Arrow") + } + + if (!missing(tz)) { + warn("`tz` argument is not supported in Arrow, so it will be ignored") + } + + # cast to timestamp if time1 and time2 are not dates or timestamp expressions + # (the subtraction of which would output a `duration`) + if (!call_binding("is.instant", time1)) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) + } + + if (!call_binding("is.instant", time2)) { + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) + } + + # we need to go build the subtract expression instead of `time1 - time2` to + # prevent complaints when we try to subtract an R object from an Expression + subtract_output <- build_expr("-", time1, time2) + build_expr("cast", subtract_output, options = cast_options(to_type = duration("s"))) + }) + register_binding("as.difftime", function(x, + format = "%X", + units = "secs") { + # windows doesn't seem to like "%X" + if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { + format <- "%H:%M:%S" + } + + if (units != "secs") { + abort("`as.difftime()` with units other than 'secs' not supported in Arrow") + } + + if (call_binding("is.character", x)) { + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + # complex casting only due to cast type restrictions: time64 -> int64 -> duration(us) + # and then we cast to duration ("s") at the end + x <- x$cast(time64("us"))$cast(int64())$cast(duration("us")) + } + + # numeric -> duration not supported in Arrow yet so we use int64() as an + # intermediate step + # TODO revisit if https://issues.apache.org/jira/browse/ARROW-15862 results + # in numeric -> duration support + + if (call_binding("is.numeric", x)) { + # coerce x to be int64(). it should work for integer-like doubles and fail + # for pure doubles + # if we abort for all doubles, we risk erroring in cases in which + # coercion to int64() would work + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + } + + build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) + }) +} + +register_bindings_datetime_helpers <- function() { register_binding("make_datetime", function(year = 1970L, month = 1L, day = 1L, @@ -239,66 +303,106 @@ register_bindings_duration <- function() { tz = "UTC") { call_binding("make_datetime", year, month, day, hour, min, sec, tz) }) - register_binding("difftime", function(time1, - time2, - tz, - units = "secs") { - if (units != "secs") { - abort("`difftime()` with units other than `secs` not supported in Arrow") - } + register_binding("as.Date", function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01", + tz = "UTC") { - if (!missing(tz)) { - warn("`tz` argument is not supported in Arrow, so it will be ignored") + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } - # cast to timestamp if time1 and time2 are not dates or timestamp expressions - # (the subtraction of which would output a `duration`) - if (!call_binding("is.instant", time1)) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) + # this could be improved with tryFormats once strptime returns NA and we + # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") } - if (!call_binding("is.instant", time2)) { - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) - } + if (call_binding("is.Date", x)) { + return(x) - # we need to go build the subtract expression instead of `time1 - time2` to - # prevent complaints when we try to subtract an R object from an Expression - subtract_output <- build_expr("-", time1, time2) - build_expr("cast", subtract_output, options = cast_options(to_type = duration("s"))) + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # base::as.Date() first converts to the desired timezone and then extracts + # the date, which is why we need to go through timestamp() first + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + + # cast from character + } else if (call_binding("is.character", x)) { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) }) - register_binding("as.difftime", function(x, - format = "%X", - units = "secs") { - # windows doesn't seem to like "%X" - if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { - format <- "%H:%M:%S" + register_binding("as_date", function(x, + format = NULL, + origin = "1970-01-01", + tz = "UTC") { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } - if (units != "secs") { - abort("`as.difftime()` with units other than 'secs' not supported in Arrow") + # assume format is ISO if unspecified (to align with lubridate::as_date) + if (is.null(format)) { + format <- "%Y-%m-%d" } - if (call_binding("is.character", x)) { - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - # complex casting only due to cast type restrictions: time64 -> int64 -> duration(us) - # and then we cast to duration ("s") at the end - x <- x$cast(time64("us"))$cast(int64())$cast(duration("us")) - } + if (call_binding("is.Date", x)) { + return(x) - # numeric -> duration not supported in Arrow yet so we use int64() as an - # intermediate step - # TODO revisit if https://issues.apache.org/jira/browse/ARROW-15862 results - # in numeric -> duration support + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # this is where as_date() differs from as.Date() + if (!missing(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + # POSIXct is of type double -> we need this to prevent going down the + # "double" branch + x <- x + + # cast from character + } else if (call_binding("is.character", x)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) + register_binding("as_datetime", function(x, + origin = "1970-01-01", + tz = "UTC") { if (call_binding("is.numeric", x)) { - # coerce x to be int64(). it should work for integer-like doubles and fail - # for pure doubles - # if we abort for all doubles, we risk erroring in cases in which - # coercion to int64() would work - x <- build_expr("cast", x, options = cast_options(to_type = int64())) + delta <- call_binding("difftime", origin, "1970-01-01") + output <- build_expr("+", x, delta) + output <- build_expr("cast", output, options = cast_options(to_type = timestamp())) + } else { + output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) } - - build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) + build_expr("assume_timezone", output, options = list(timezone = tz)) }) } diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 5b5a2e10c37..ebfffed9eb9 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -77,107 +77,6 @@ register_bindings_type_cast <- function() { register_binding("as.numeric", function(x) { build_expr("cast", x, options = cast_options(to_type = float64())) }) - register_binding("as.Date", function(x, - format = NULL, - tryFormats = "%Y-%m-%d", - origin = "1970-01-01", - tz = "UTC") { - - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # this could be improved with tryFormats once strptime returns NA and we - # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() first converts to the desired timezone and then extracts - # the date, which is why we need to go through timestamp() first - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - - # cast from character - } else if (call_binding("is.character", x)) { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - build_expr("cast", x, options = cast_options(to_type = date32())) - }) - register_binding("as_date", function(x, - format = NULL, - origin = "1970-01-01", - tz = "UTC") { - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # assume format is ISO if unspecified (to align with lubridate::as_date) - if (is.null(format)) { - format <- "%Y-%m-%d" - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # this is where as_date() differs from as.Date() - if (!missing(tz)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - } - # POSIXct is of type double -> we need this to prevent going down the - # "double" branch - x <- x - - # cast from character - } else if (call_binding("is.character", x)) { - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - build_expr("cast", x, options = cast_options(to_type = date32())) - }) - register_binding("as_datetime", function(x, - origin = "1970-01-01", - tz = "UTC") { - if (call_binding("is.numeric", x)) { - delta <- call_binding("difftime", origin, "1970-01-01") - output <- build_expr("+", x, delta) - output <- build_expr("cast", output, options = cast_options(to_type = timestamp())) - } else { - output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) - } - build_expr("assume_timezone", output, options = list(timezone = tz)) - }) register_binding("is", function(object, class2) { if (is.string(class2)) { switch(class2, From 759c2fde10d6c44ced054f5826de0d16261994e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 15:03:05 +0100 Subject: [PATCH 20/22] create a new set of bindings (`datetime_helpers`) --- r/R/dplyr-funcs.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 01e522e537b..4b5fa2efcd0 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -107,6 +107,7 @@ create_binding_cache <- function() { register_bindings_conditional() register_bindings_datetime() register_bindings_duration() + register_bindings_datetime_helpers() register_bindings_math() register_bindings_string() register_bindings_type() From 4888ab9cf66b270236934ea94d8ca469edf006a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 15:38:15 +0100 Subject: [PATCH 21/22] casting `x` and `delta` to `int64()` + updated unit tests --- r/R/dplyr-funcs-datetime.R | 6 ++-- r/tests/testthat/test-dplyr-funcs-datetime.R | 38 ++++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 354b501bd10..3da8092d3d8 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -206,11 +206,11 @@ register_bindings_duration <- function() { # cast to timestamp if time1 and time2 are not dates or timestamp expressions # (the subtraction of which would output a `duration`) if (!call_binding("is.instant", time1)) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) } if (!call_binding("is.instant", time2)) { - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp())) } # we need to go build the subtract expression instead of `time1 - time2` to @@ -397,6 +397,8 @@ register_bindings_datetime_helpers <- function() { tz = "UTC") { if (call_binding("is.numeric", x)) { delta <- call_binding("difftime", origin, "1970-01-01") + delta <- build_expr("cast", delta, options = cast_options(to_type = int64())) + x <- build_expr("cast", x, options = cast_options(to_type = int64())) output <- build_expr("+", x, delta) output <- build_expr("cast", output, options = cast_options(to_type = timestamp())) } else { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 7efa503c070..d8436a911fd 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1379,31 +1379,29 @@ test_that("`as_datetime()`", { double_date = c(10.1, 25.2, NA) ) - test_df %>% - arrow_table() %>% - mutate( - # ddate = as_datetime(date)#, - # dchar_date_no_tz = as_datetime(char_date)#, - # dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas")#, - # dint_date = as_datetime(int_date, origin = "1970-01-02")#, - # dint_date2 = as_datetime(int_date, origin = "1970-01-01"), - # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), - # dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01"), - ddouble_date = as_datetime(double_date) - ) %>% - collect() - compare_dplyr_binding( .input %>% mutate( - # ddate = as_datetime(date)#, - # dchar_date_no_tz = as_datetime(char_date)#, - # dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas")#, - dint_date = as_datetime(int_date, origin = "1970-01-02")#, - # dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), - # ddouble_date = as_datetime(double_date) + ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") ) %>% collect(), test_df ) + + # Arrow does not support conversion of double to date + # the below should error with an error message originating in the C++ code + expect_error( + test_df %>% + arrow_table() %>% + mutate( + ddouble_date = as_datetime(double_date) + ) %>% + collect(), + regexp = "Float value 10.1 was truncated converting to int64" + ) }) From 8d8b6ab399fadf817e5dd6839db1b19d2dcd06a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 17:09:37 +0100 Subject: [PATCH 22/22] test with `Pacific/Marquesas` and skip on Windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 48 ++++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index d8436a911fd..fc4ed563bcd 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1290,9 +1290,9 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a test_that("as_date()", { test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), date_var = as.Date("2022-02-25"), - difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Europe/London"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Pacific/Marquesas"), character_ymd_var = "2022-02-25 00:00:01", character_ydm_var = "2022/25/02 00:00:01", integer_var = 32L, @@ -1300,23 +1300,6 @@ test_that("as_date()", { double_var = 34.56 ) - # difference between as.Date() and as_date(): - #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg - # to `as.Date()` - # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object - # passsed if`tz` is NULL - compare_dplyr_binding( - .input %>% - # test_df %>% - # arrow_table() %>% - transmute( - date_diff_lubridate = as_date(difference_date), - date_diff_base = as.Date(difference_date) - ) %>% - collect(), - test_df - ) - # casting from POSIXct treated separately so we can skip on Windows # TODO move the test for casting from POSIXct below once ARROW-13168 is done compare_dplyr_binding( @@ -1368,6 +1351,21 @@ test_that("as_date()", { collect(), test_df ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% + collect(), + test_df + ) }) test_that("`as_datetime()`", { @@ -1384,7 +1382,6 @@ test_that("`as_datetime()`", { mutate( ddate = as_datetime(date), dchar_date_no_tz = as_datetime(char_date), - dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), dint_date = as_datetime(int_date, origin = "1970-01-02"), dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") @@ -1404,4 +1401,15 @@ test_that("`as_datetime()`", { collect(), regexp = "Float value 10.1 was truncated converting to int64" ) + + # separate tz test so we can skip on Windows + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) })