From 3c0bd201e0a2a0631e427e17e2741234c05a442b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 21:02:19 +0100 Subject: [PATCH 01/30] `as.Date()` supports integer-like inputs --- r/R/dplyr-funcs-type.R | 5 +++-- r/tests/testthat/test-dplyr-funcs-type.R | 16 +++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 1bb633d5322..d9b56181545 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -114,10 +114,11 @@ register_bindings_type_cast <- function() { # cast from numeric } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32() + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() # https://issues.apache.org/jira/browse/ARROW-15798 # TODO revisit if arrow decides to support double -> date casting - abort("`as.Date()` with double/float is not supported in Arrow") + x <- build_expr("cast", x, options = cast_options(to_type = int32())) } build_expr("cast", x, options = cast_options(to_type = date32())) }) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 6c9d9ac07a4..101c1b0d799 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -809,6 +809,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a character_ymd_var = "2022-02-25 00:00:01", character_ydm_var = "2022/25/02 00:00:01", integer_var = 32L, + integerish_var = 32, double_var = 34.56 ) @@ -820,7 +821,8 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a date_dv = as.Date(date_var), date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as.Date(integer_var, origin = "1970-01-01") + date_int = as.Date(integer_var, origin = "1970-01-01"), + date_integerish = as.Date(integerish_var, origin = "1970-01-01") ) %>% collect(), test_df @@ -854,13 +856,13 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a fixed = TRUE ) - # we do not support as.Date() with double/ float - compare_dplyr_binding( - .input %>% + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect(), - test_df, - warning = TRUE + collect() ) skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 From e24de042a19fad7d50698807eeb0828682216e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 21:08:00 +0100 Subject: [PATCH 02/30] `as_date()` bindings + tests + NEWS --- r/NEWS.md | 1 + r/R/dplyr-funcs-type.R | 45 ++++++++++++- r/tests/testthat/test-dplyr-funcs-type.R | 82 ++++++++++++++++++++++++ 3 files changed, 127 insertions(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 0a7d30d2a37..033b8bf3b4f 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,6 +24,7 @@ * component extraction functions: `tz()` (timezone), `semester()` (semester), `dst()` (daylight savings time indicator), `date()` (extract date), `epiyear()` (epiyear), improvements to `month()`, which now works with integer inputs. * `make_date()` & `make_datetime()` + `ISOdatetime()` & `ISOdate()` to create date-times from numeric representations. * date-time functionality: + * `as_date()` and `as_datetime()` * `difftime` and `as.difftime()` * `as.Date()` to convert to date diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index d9b56181545..038742428e4 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -82,7 +82,6 @@ register_bindings_type_cast <- function() { tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { - # the origin argument will be better supported once we implement temporal # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) # TODO revisit once the above has been sorted @@ -123,6 +122,50 @@ register_bindings_type_cast <- function() { build_expr("cast", x, options = cast_options(to_type = date32())) }) + register_binding("as_date", function(x, + format = NULL, + origin = "1970-01-01", + tz = "UTC") { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + + # assume format is ISO if unspecified (to align with lubridate::as_date) + if (is.null(format)) { + format <- "%Y-%m-%d" + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + if (!missing(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + # POSIXct is of type double -> we need this to prevent going down the + # "double" branch + x <- x + + # cast from character + } else if (call_binding("is.character", x)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) + register_binding("is", function(object, class2) { if (is.string(class2)) { switch(class2, diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 101c1b0d799..89389a2c842 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -877,6 +877,88 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) }) +test_that("as_date()", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Europe/London"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + # test_df %>% + # arrow_table() %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% + collect(), + test_df + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as_date(date_var), + date_char_ymd = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as_date(integer_var, origin = "1970-01-01"), + date_integerish = as_date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # strptime does not support a partial format - + # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as_date(character_ymd_var)) %>% + collect() + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as_date(posixct_var), + date_pv_tz = as_date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) + test_that("format date/time", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 From 122865e3f1e1b29781343ffcf73d4462aad2387f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 21:25:40 +0100 Subject: [PATCH 03/30] tests for `as_datetime()` plus skipped another test involving timezones on Windows --- r/tests/testthat/test-dplyr-funcs-type.R | 84 ++++++++++++++++++------ 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 89389a2c842..2a886e1e3e2 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -879,9 +879,9 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a test_that("as_date()", { test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), date_var = as.Date("2022-02-25"), - difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Europe/London"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), character_ymd_var = "2022-02-25 00:00:01", character_ydm_var = "2022/25/02 00:00:01", integer_var = 32L, @@ -889,23 +889,6 @@ test_that("as_date()", { double_var = 34.56 ) - # difference between as.Date() and as_date(): - #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg - # to `as.Date()` - # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object - # passsed if`tz` is NULL - compare_dplyr_binding( - .input %>% - # test_df %>% - # arrow_table() %>% - transmute( - date_diff_lubridate = as_date(difference_date), - date_diff_base = as.Date(difference_date) - ) %>% - collect(), - test_df - ) - # casting from POSIXct treated separately so we can skip on Windows # TODO move the test for casting from POSIXct below once ARROW-13168 is done compare_dplyr_binding( @@ -957,6 +940,69 @@ test_that("as_date()", { collect(), test_df ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + # test_df %>% + # arrow_table() %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% + collect(), + test_df + ) +}) + +test_that("`as_datetime()`", { + test_df <- tibble( + date = as.Date(c("2022-03-22", "2021-07-30", NA)), + char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), + int_date = c(10L, 25L, NA), + integerish_date = c(10, 25, NA), + double_date = c(10.1, 25.2, NA) + ) + + compare_dplyr_binding( + .input %>% + mutate( + ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # Arrow does not support conversion of double to date + # the below should error with an error message originating in the C++ code + expect_error( + test_df %>% + arrow_table() %>% + mutate( + ddouble_date = as_datetime(double_date) + ) %>% + collect(), + regexp = "Float value 10.1 was truncated converting to int64" + ) + + # separate tz test so we can skip on Windows + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) }) test_that("format date/time", { From 3cc83658f66a4230b56fdd986250bd559f78b418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 21:25:55 +0100 Subject: [PATCH 04/30] `as_datetime()` binding --- r/R/dplyr-funcs-type.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 038742428e4..202f8b917ae 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -166,6 +166,21 @@ register_bindings_type_cast <- function() { build_expr("cast", x, options = cast_options(to_type = date32())) }) + register_binding("as_datetime", function(x, + origin = "1970-01-01", + tz = "UTC") { + if (call_binding("is.numeric", x)) { + delta <- call_binding("difftime", origin, "1970-01-01") + delta <- build_expr("cast", delta, options = cast_options(to_type = int64())) + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + output <- build_expr("+", x, delta) + output <- build_expr("cast", output, options = cast_options(to_type = timestamp())) + } else { + output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) + } + build_expr("assume_timezone", output, options = list(timezone = tz)) + }) + register_binding("is", function(object, class2) { if (is.string(class2)) { switch(class2, From 34316361af4239df228f4900e95fb849d67d6d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 28 Mar 2022 21:26:29 +0100 Subject: [PATCH 05/30] removed tz from the casting to timestamp step --- r/R/dplyr-funcs-datetime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index c583aed5472..b8692cfe360 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -263,11 +263,11 @@ register_bindings_duration <- function() { # cast to timestamp if time1 and time2 are not dates or timestamp expressions # (the subtraction of which would output a `duration`) if (!call_binding("is.instant", time1)) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) } if (!call_binding("is.instant", time2)) { - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp())) } # we need to go build the subtract expression instead of `time1 - time2` to From fd9ee4a0c3ca9acd9308c19a9c7dfa949f632147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 29 Mar 2022 14:05:57 +0100 Subject: [PATCH 06/30] simplified --- r/R/dplyr-funcs-type.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 202f8b917ae..8b2032cce31 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -173,11 +173,9 @@ register_bindings_type_cast <- function() { delta <- call_binding("difftime", origin, "1970-01-01") delta <- build_expr("cast", delta, options = cast_options(to_type = int64())) x <- build_expr("cast", x, options = cast_options(to_type = int64())) - output <- build_expr("+", x, delta) - output <- build_expr("cast", output, options = cast_options(to_type = timestamp())) - } else { - output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) + x <- build_expr("+", x, delta) } + output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) build_expr("assume_timezone", output, options = list(timezone = tz)) }) From ba1bee22d02a949332665128916fb53d814c418d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 29 Mar 2022 16:07:53 +0100 Subject: [PATCH 07/30] updated unit tests as I have added support for an `origin` different from epoch --- r/tests/testthat/test-dplyr-funcs-type.R | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 2a886e1e3e2..f57210ca1a8 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -828,13 +828,11 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a test_df ) - # currently we do not support an origin different to "1970-01-01" compare_dplyr_binding( .input %>% mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% collect(), - test_df, - warning = TRUE + test_df ) # we do not support multiple tryFormats @@ -904,13 +902,11 @@ test_that("as_date()", { test_df ) - # currently we do not support an origin different to "1970-01-01" compare_dplyr_binding( .input %>% mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% collect(), - test_df, - warning = TRUE + test_df ) # strptime does not support a partial format - From 28a12a96aa2e6f647625543ef1bc8e587ffbd725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 29 Mar 2022 16:08:49 +0100 Subject: [PATCH 08/30] added abstract function a level below `as.Date()` and `as_date()` --- r/R/dplyr-funcs-datetime.R | 53 ++++++++++++++++++++++ r/R/dplyr-funcs-type.R | 91 +++++++------------------------------- 2 files changed, 68 insertions(+), 76 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index b8692cfe360..d207a53e196 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -330,3 +330,56 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { build_expr("strftime", x, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) } + +binding_as_date <- function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01", + tz = "UTC", + base = TRUE) { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + # if (call_binding("is.numeric", x) & origin != "1970-01-01") { + # abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + # } +# browser() + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # base::as.Date() first converts to the desired timezone and then extracts + # the date, which is why we need to go through timestamp() first + if (base || !missing(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + # POSIXct is of type double -> we need this to prevent going down the + # "double" branch + x <- x + + # cast from character + } else if (call_binding("is.character", x)) { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & + (!call_binding("is.integer", x) | origin != "1970-01-01")) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + delta_in_sec <- call_binding("difftime", origin, "1970-01-01") + delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int64())) + delta_in_days <- (delta_in_sec / 86400L)$cast(int32()) + x <- build_expr("+", x, delta_in_days) + } + build_expr("cast", x, options = cast_options(to_type = date32())) +} diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 8b2032cce31..c01ead9a530 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -82,88 +82,27 @@ register_bindings_type_cast <- function() { tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # this could be improved with tryFormats once strptime returns NA and we - # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() first converts to the desired timezone and then extracts - # the date, which is why we need to go through timestamp() first - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - - # cast from character - } else if (call_binding("is.character", x)) { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - build_expr("cast", x, options = cast_options(to_type = date32())) + binding_as_date( + x = x, + format = format, + tryFormats = tryFormats, + origin = origin, + tz = tz, + base = TRUE + ) }) register_binding("as_date", function(x, format = NULL, origin = "1970-01-01", tz = "UTC") { - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # assume format is ISO if unspecified (to align with lubridate::as_date) - if (is.null(format)) { - format <- "%Y-%m-%d" - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - if (!missing(tz)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - } - # POSIXct is of type double -> we need this to prevent going down the - # "double" branch - x <- x - - # cast from character - } else if (call_binding("is.character", x)) { - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - build_expr("cast", x, options = cast_options(to_type = date32())) + binding_as_date( + x = x, + format = format, + origin = origin, + tz = tz, + base = FALSE + ) }) register_binding("as_datetime", function(x, From bfbd3b7cc106509bb04180fd280a2fb1eef9d6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 29 Mar 2022 16:21:40 +0100 Subject: [PATCH 09/30] single {testthat} block for `as_date()` and `as.Date()` --- r/tests/testthat/test-dplyr-funcs-type.R | 116 ++++++++++------------- 1 file changed, 50 insertions(+), 66 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index f57210ca1a8..b93a2e77fcd 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -802,10 +802,21 @@ test_that("nested structs can be created from scalars and existing data frames", }) -test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { +test_that("`as.Date()` and `as_date()`", { + # test_df <- tibble::tibble( + # posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + # date_var = as.Date("2022-02-25"), + # character_ymd_var = "2022-02-25 00:00:01", + # character_ydm_var = "2022/25/02 00:00:01", + # integer_var = 32L, + # integerish_var = 32, + # double_var = 34.56 + # ) + test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), date_var = as.Date("2022-02-25"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), character_ymd_var = "2022-02-25 00:00:01", character_ydm_var = "2022/25/02 00:00:01", integer_var = 32L, @@ -818,22 +829,29 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a compare_dplyr_binding( .input %>% mutate( - date_dv = as.Date(date_var), - date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as.Date(integer_var, origin = "1970-01-01"), - date_integerish = as.Date(integerish_var, origin = "1970-01-01") + date_dv1 = as.Date(date_var), + date_char_ymd1 = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm1 = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int1 = as.Date(integer_var, origin = "1970-01-01"), + date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"), + date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"), + date_dv2 = as_date(date_var), + date_char_ymd2 = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm2 = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int2 = as_date(integer_var, origin = "1970-01-01"), + date_int_origin2 = as_date(integer_var, origin = "1970-01-03"), + date_integerish2 = as_date(integerish_var, origin = "1970-01-01") ) %>% collect(), test_df ) - compare_dplyr_binding( - .input %>% - mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% - collect(), - test_df - ) + # compare_dplyr_binding( + # .input %>% + # mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% + # collect(), + # test_df + # ) # we do not support multiple tryFormats compare_dplyr_binding( @@ -845,6 +863,15 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a warning = TRUE ) + # strptime does not support a partial format - + # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as_date(character_ymd_var)) %>% + collect() + ) + expect_error( test_df %>% arrow_table() %>% @@ -856,6 +883,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a # we do not support as.Date() with double/ float (error surfaced from C++) + # TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798 expect_error( test_df %>% arrow_table() %>% @@ -863,6 +891,15 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a collect() ) + # we do not support as_date with double/ float (error surfaced from C++) + # TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% + collect() + ) + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 compare_dplyr_binding( .input %>% @@ -873,60 +910,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a collect(), test_df ) -}) - -test_that("as_date()", { - test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), - date_var = as.Date("2022-02-25"), - difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), - character_ymd_var = "2022-02-25 00:00:01", - character_ydm_var = "2022/25/02 00:00:01", - integer_var = 32L, - integerish_var = 32, - double_var = 34.56 - ) - - # casting from POSIXct treated separately so we can skip on Windows - # TODO move the test for casting from POSIXct below once ARROW-13168 is done - compare_dplyr_binding( - .input %>% - mutate( - date_dv = as_date(date_var), - date_char_ymd = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as_date(integer_var, origin = "1970-01-01"), - date_integerish = as_date(integerish_var, origin = "1970-01-01") - ) %>% - collect(), - test_df - ) - compare_dplyr_binding( - .input %>% - mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% - collect(), - test_df - ) - - # strptime does not support a partial format - - # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd = as_date(character_ymd_var)) %>% - collect() - ) - - # we do not support as.Date() with double/ float (error surfaced from C++) - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% - collect() - ) - - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 compare_dplyr_binding( .input %>% mutate( From 48b498e641eebb80d8b8fd4d0552ea221ac19c56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 29 Mar 2022 16:26:10 +0100 Subject: [PATCH 10/30] tidy-up --- r/R/dplyr-funcs-datetime.R | 8 +------- r/tests/testthat/test-dplyr-funcs-type.R | 17 ----------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index d207a53e196..4656044a876 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -337,13 +337,7 @@ binding_as_date <- function(x, origin = "1970-01-01", tz = "UTC", base = TRUE) { - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - # if (call_binding("is.numeric", x) & origin != "1970-01-01") { - # abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - # } -# browser() + if (is.null(format) && length(tryFormats) > 1) { abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") } diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index b93a2e77fcd..585c630f50b 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -803,16 +803,6 @@ test_that("nested structs can be created from scalars and existing data frames", }) test_that("`as.Date()` and `as_date()`", { - # test_df <- tibble::tibble( - # posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), - # date_var = as.Date("2022-02-25"), - # character_ymd_var = "2022-02-25 00:00:01", - # character_ydm_var = "2022/25/02 00:00:01", - # integer_var = 32L, - # integerish_var = 32, - # double_var = 34.56 - # ) - test_df <- tibble::tibble( posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), date_var = as.Date("2022-02-25"), @@ -846,13 +836,6 @@ test_that("`as.Date()` and `as_date()`", { test_df ) - # compare_dplyr_binding( - # .input %>% - # mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% - # collect(), - # test_df - # ) - # we do not support multiple tryFormats compare_dplyr_binding( .input %>% From af69bf09a240825c5d8fda8a3e92e147f8481d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 29 Mar 2022 16:26:49 +0100 Subject: [PATCH 11/30] some more tidying up --- r/tests/testthat/test-dplyr-funcs-type.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 585c630f50b..b3190986fc7 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -911,8 +911,6 @@ test_that("`as.Date()` and `as_date()`", { # passsed if`tz` is NULL compare_dplyr_binding( .input %>% - # test_df %>% - # arrow_table() %>% transmute( date_diff_lubridate = as_date(difference_date), date_diff_base = as.Date(difference_date) From 2fbba5ff364dea4e49f049b8c861af434ba349bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Apr 2022 15:04:39 +0300 Subject: [PATCH 12/30] default value for `tz` with `as_date()` is now `NULL` and `as_datetime()` supports `format` --- r/R/dplyr-funcs-type.R | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index c01ead9a530..2689633499d 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -95,7 +95,7 @@ register_bindings_type_cast <- function() { register_binding("as_date", function(x, format = NULL, origin = "1970-01-01", - tz = "UTC") { + tz = NULL) { binding_as_date( x = x, format = format, @@ -107,13 +107,23 @@ register_bindings_type_cast <- function() { register_binding("as_datetime", function(x, origin = "1970-01-01", - tz = "UTC") { + tz = "UTC", + format = NULL) { if (call_binding("is.numeric", x)) { delta <- call_binding("difftime", origin, "1970-01-01") delta <- build_expr("cast", delta, options = cast_options(to_type = int64())) x <- build_expr("cast", x, options = cast_options(to_type = int64())) x <- build_expr("+", x, delta) } + + if (call_binding("is.character", x) && !is.null(format)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr( + "strptime", + x, + options = list(format = format, unit = 0L, error_is_null = TRUE) + ) + } output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) build_expr("assume_timezone", output, options = list(timezone = tz)) }) From 9b3d6c4946be8d2230be30ead863c6884da2feca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Apr 2022 15:07:33 +0300 Subject: [PATCH 13/30] support for `NULL` `tz` in `binding_as_date()` --- r/R/dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index c325057180c..5650c6cdbea 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -391,7 +391,7 @@ binding_as_date <- function(x, } else if (call_binding("is.POSIXct", x)) { # base::as.Date() first converts to the desired timezone and then extracts # the date, which is why we need to go through timestamp() first - if (base || !missing(tz)) { + if (base || !is.null(tz)) { x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) } # POSIXct is of type double -> we need this to prevent going down the From 97f59b41dc53ac8fd8b904f84d32115967119c0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Apr 2022 15:10:10 +0300 Subject: [PATCH 14/30] added unit test to better show the difference between `as_date()` and `as.Date()` + unit test for `format` + not skipping tests on windows since we now have tzdb --- r/tests/testthat/test-dplyr-funcs-type.R | 44 +++++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index b3190986fc7..92e9795fcd8 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -805,6 +805,8 @@ test_that("nested structs can be created from scalars and existing data frames", test_that("`as.Date()` and `as_date()`", { test_df <- tibble::tibble( posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), + dt_europe = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"), + dt_utc = ymd_hms("2010-08-03 00:50:50"), date_var = as.Date("2022-02-25"), difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), character_ymd_var = "2022-02-25 00:00:01", @@ -820,12 +822,20 @@ test_that("`as.Date()` and `as_date()`", { .input %>% mutate( date_dv1 = as.Date(date_var), + date_pv1 = as.Date(posixct_var), + date_pv_tz1 = as.Date(posixct_var, tz = "Pacific/Marquesas"), + date_utc1 = as.Date(dt_utc), + date_europe1 = as.Date(dt_europe), date_char_ymd1 = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), date_char_ydm1 = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), date_int1 = as.Date(integer_var, origin = "1970-01-01"), date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"), date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"), date_dv2 = as_date(date_var), + date_pv2 = as_date(posixct_var), + date_pv_tz2 = as_date(posixct_var, tz = "Pacific/Marquesas"), + date_utc2 = as_date(dt_utc), + date_europe2 = as_date(dt_europe), date_char_ymd2 = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), date_char_ydm2 = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), date_int2 = as_date(integer_var, origin = "1970-01-01"), @@ -883,27 +893,6 @@ test_that("`as.Date()` and `as_date()`", { collect() ) - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( - .input %>% - mutate( - date_pv = as.Date(posixct_var), - date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") - ) %>% - collect(), - test_df - ) - - compare_dplyr_binding( - .input %>% - mutate( - date_pv = as_date(posixct_var), - date_pv_tz = as_date(posixct_var, tz = "Pacific/Marquesas") - ) %>% - collect(), - test_df - ) - # difference between as.Date() and as_date(): #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg # to `as.Date()` @@ -924,11 +913,24 @@ test_that("`as_datetime()`", { test_df <- tibble( date = as.Date(c("2022-03-22", "2021-07-30", NA)), char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), + char_date_non_iso = c("2022-22-03 12:34:56", "2021-30-07 14:32:47", NA), int_date = c(10L, 25L, NA), integerish_date = c(10, 25, NA), double_date = c(10.1, 25.2, NA) ) + test_df %>% + arrow_table() %>% + mutate( + ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dchar_date_non_iso = as_datetime(char_date_non_iso, format = "%Y-%d-%m %H:%M:%S"), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") + ) %>% + collect() + compare_dplyr_binding( .input %>% mutate( From fd8ba19afd1ee22abba900d04a4b15acb209e499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Apr 2022 10:58:25 +0100 Subject: [PATCH 15/30] remove self-asignment --- r/R/dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5650c6cdbea..c8a4972056e 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -396,7 +396,7 @@ binding_as_date <- function(x, } # POSIXct is of type double -> we need this to prevent going down the # "double" branch - x <- x + x # cast from character } else if (call_binding("is.character", x)) { From 110fa15979389049289dbc7f3865345e91efc3b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Apr 2022 11:01:29 +0100 Subject: [PATCH 16/30] colon --- r/tests/testthat/test-dplyr-funcs-type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 92e9795fcd8..31c02052ae4 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -885,7 +885,7 @@ test_that("`as.Date()` and `as_date()`", { ) # we do not support as_date with double/ float (error surfaced from C++) - # TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798 + # TODO: revisit after https://issues.apache.org/jira/browse/ARROW-15798 expect_error( test_df %>% arrow_table() %>% From 02c4df2c8c02a57a80d2d6fd03b667c038838b6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Apr 2022 11:02:44 +0100 Subject: [PATCH 17/30] no longer skipping on Windows --- r/tests/testthat/test-dplyr-funcs-type.R | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 31c02052ae4..6dcfb993925 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -936,6 +936,7 @@ test_that("`as_datetime()`", { mutate( ddate = as_datetime(date), dchar_date_no_tz = as_datetime(char_date), + dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), dint_date = as_datetime(int_date, origin = "1970-01-02"), dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") @@ -955,17 +956,6 @@ test_that("`as_datetime()`", { collect(), regexp = "Float value 10.1 was truncated converting to int64" ) - - # separate tz test so we can skip on Windows - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( - .input %>% - mutate( - dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas") - ) %>% - collect(), - test_df - ) }) test_that("format date/time", { From 9a1b0eccd79797c5ba5b10fd980ed98fbb5bbc6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Apr 2022 11:46:51 +0100 Subject: [PATCH 18/30] separated the double and origin different from `"1970-01-01"` logic --- r/R/dplyr-funcs-datetime.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index c8a4972056e..6de81545bfe 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -404,18 +404,21 @@ binding_as_date <- function(x, # unit = 0L is the identifier for seconds in valid_time32_units x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - # cast from numeric - } else if (call_binding("is.numeric", x) & - (!call_binding("is.integer", x) | origin != "1970-01-01")) { + # cast from float/ double + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { # Arrow does not support direct casting from double to date32(), but for # integer-like values we can go via int32() # https://issues.apache.org/jira/browse/ARROW-15798 # TODO revisit if arrow decides to support double -> date casting x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + + if (origin != "1970-01-01") { delta_in_sec <- call_binding("difftime", origin, "1970-01-01") - delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int64())) - delta_in_days <- (delta_in_sec / 86400L)$cast(int32()) + delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int32())) + delta_in_days <- delta_in_sec / 86400L x <- build_expr("+", x, delta_in_days) } + build_expr("cast", x, options = cast_options(to_type = date32())) } From 634863e66b0e0252cc93e2715ec27e8d106f81bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Apr 2022 13:15:05 +0100 Subject: [PATCH 19/30] revert to casting `delta_in_sec` to `int64()` and `delta_in_days` to `int32()` --- r/R/dplyr-funcs-datetime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 6de81545bfe..2ae7617b940 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -415,8 +415,8 @@ binding_as_date <- function(x, if (origin != "1970-01-01") { delta_in_sec <- call_binding("difftime", origin, "1970-01-01") - delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int32())) - delta_in_days <- delta_in_sec / 86400L + delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int64())) + delta_in_days <- (delta_in_sec / 86400L)$cast(int32()) x <- build_expr("+", x, delta_in_days) } From 054d92611f466bd1b79ee1337d67291d32de282f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Apr 2022 18:10:47 +0100 Subject: [PATCH 20/30] rename `tz` -> `use_tz` and added some clarifying comments --- r/R/dplyr-funcs-datetime.R | 14 +++++++++----- r/R/dplyr-funcs-type.R | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9f38f4b21fc..5ed0de534fd 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -468,7 +468,7 @@ binding_as_date <- function(x, format = NULL, tryFormats = "%Y-%m-%d", origin = "1970-01-01", - tz = "UTC", + use_tz = "UTC", base = TRUE) { if (is.null(format) && length(tryFormats) > 1) { @@ -480,10 +480,14 @@ binding_as_date <- function(x, # cast from POSIXct } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() first converts to the desired timezone and then extracts - # the date, which is why we need to go through timestamp() first - if (base || !is.null(tz)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + # base::as.Date() and lubridate::as_date() differ in the way they use the `tz` + # argument both cast to the desired timezone, if present. The difference + # appears when the `tz` argument is not set: `as.Date()` uses the default + # value ("UTC"), while `as_date()` keeps the original attribute => we only + # cast when we want the behaviour of the base version or when `use_tz` is + # set (i.e. not NULL) + if (base || !is.null(use_tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = use_tz))) } # POSIXct is of type double -> we need this to prevent going down the # "double" branch diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 2689633499d..5695a60bda8 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -87,7 +87,7 @@ register_bindings_type_cast <- function() { format = format, tryFormats = tryFormats, origin = origin, - tz = tz, + use_tz = tz, base = TRUE ) }) @@ -100,7 +100,7 @@ register_bindings_type_cast <- function() { x = x, format = format, origin = origin, - tz = tz, + use_tz = tz, base = FALSE ) }) From f2aa90427334bf93f657ef52ce15f88d64ede9bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Apr 2022 18:26:13 +0100 Subject: [PATCH 21/30] removed `x` no-op and added comments --- r/R/dplyr-funcs-datetime.R | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5ed0de534fd..eada36ed62d 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -480,18 +480,15 @@ binding_as_date <- function(x, # cast from POSIXct } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() and lubridate::as_date() differ in the way they use the `tz` - # argument both cast to the desired timezone, if present. The difference - # appears when the `tz` argument is not set: `as.Date()` uses the default - # value ("UTC"), while `as_date()` keeps the original attribute => we only - # cast when we want the behaviour of the base version or when `use_tz` is - # set (i.e. not NULL) + # base::as.Date() and lubridate::as_date() differ in the way they use the + # `tz` argument both cast to the desired timezone, if present. The + # difference appears when the `tz` argument is not set: `as.Date()` uses the + # default value ("UTC"), while `as_date()` keeps the original attribute + # => we only cast when we want the behaviour of the base version or when + # `use_tz` is set (i.e. not NULL) if (base || !is.null(use_tz)) { x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = use_tz))) } - # POSIXct is of type double -> we need this to prevent going down the - # "double" branch - x # cast from character } else if (call_binding("is.character", x)) { From 5a258b969badc4f4ab5220de04bdc8eb2ac2aebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Apr 2022 09:24:48 +0100 Subject: [PATCH 22/30] docs --- r/man/arrow-package.Rd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd index 122f7682e17..2a0143d02e5 100644 --- a/r/man/arrow-package.Rd +++ b/r/man/arrow-package.Rd @@ -24,8 +24,10 @@ Authors: \itemize{ \item Ian Cook \email{ianmcook@gmail.com} \item Nic Crane \email{thisisnic@gmail.com} - \item Jonathan Keane \email{jkeane@gmail.com} + \item Dewey Dunnington \email{dewey@fishandwhistle.net} (\href{https://orcid.org/0000-0002-9415-4582}{ORCID}) \item Romain François \email{romain@rstudio.com} (\href{https://orcid.org/0000-0002-2444-4226}{ORCID}) + \item Jonathan Keane \email{jkeane@gmail.com} + \item Dragoș Moldovan-Grünfeld \email{dragos.mold@gmail.com} \item Jeroen Ooms \email{jeroen@berkeley.edu} \item Apache Arrow \email{dev@arrow.apache.org} [copyright holder] } From 7576460befedb5816f3077b8f57f630429c4d15f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Apr 2022 09:25:31 +0100 Subject: [PATCH 23/30] remove the `base` and `tz` arguments --- r/R/dplyr-funcs-datetime.R | 11 +++-------- r/R/dplyr-funcs-type.R | 15 +++++++++------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index eada36ed62d..7243e1dfbcc 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -467,9 +467,7 @@ duration_from_chunks <- function(chunks) { binding_as_date <- function(x, format = NULL, tryFormats = "%Y-%m-%d", - origin = "1970-01-01", - use_tz = "UTC", - base = TRUE) { + origin = "1970-01-01") { if (is.null(format) && length(tryFormats) > 1) { abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") @@ -481,14 +479,11 @@ binding_as_date <- function(x, # cast from POSIXct } else if (call_binding("is.POSIXct", x)) { # base::as.Date() and lubridate::as_date() differ in the way they use the - # `tz` argument both cast to the desired timezone, if present. The + # `tz` argument. Both cast to the desired timezone, if present. The # difference appears when the `tz` argument is not set: `as.Date()` uses the # default value ("UTC"), while `as_date()` keeps the original attribute # => we only cast when we want the behaviour of the base version or when - # `use_tz` is set (i.e. not NULL) - if (base || !is.null(use_tz)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = use_tz))) - } + # `tz` is set (i.e. not NULL) # cast from character } else if (call_binding("is.character", x)) { diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 5695a60bda8..e223636e1d0 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -82,13 +82,15 @@ register_bindings_type_cast <- function() { tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { + if (call_binding("is.POSIXct", x)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + binding_as_date( x = x, format = format, tryFormats = tryFormats, - origin = origin, - use_tz = tz, - base = TRUE + origin = origin ) }) @@ -96,12 +98,13 @@ register_bindings_type_cast <- function() { format = NULL, origin = "1970-01-01", tz = NULL) { + if (call_binding("is.POSIXct", x) && !is.null(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } binding_as_date( x = x, format = format, - origin = origin, - use_tz = tz, - base = FALSE + origin = origin ) }) From 1b45fdaa30fec212b9e37f242313574370f16bac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Apr 2022 09:29:50 +0100 Subject: [PATCH 24/30] removed comments referencing ARROW-13168 --- r/tests/testthat/test-dplyr-funcs-type.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 4bc40d917e6..5874b7c582c 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -816,8 +816,6 @@ test_that("`as.Date()` and `as_date()`", { double_var = 34.56 ) - # casting from POSIXct treated separately so we can skip on Windows - # TODO move the test for casting from POSIXct below once ARROW-13168 is done compare_dplyr_binding( .input %>% mutate( From 728139517000701037398b70d6efae801d35f9c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Apr 2022 10:51:07 +0100 Subject: [PATCH 25/30] added comments on the as.Date() vs as_date() differece --- r/R/dplyr-funcs-type.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index e223636e1d0..e3700cf35b7 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -82,6 +82,12 @@ register_bindings_type_cast <- function() { tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { + # base::as.Date() and lubridate::as_date() differ in the way they use the + # `tz` argument. Both cast to the desired timezone, if present. The + # difference appears when the `tz` argument is not set: `as.Date()` uses the + # default value ("UTC"), while `as_date()` keeps the original attribute + # => we only cast when we want the behaviour of the base version or when + # `tz` is set (i.e. not NULL) if (call_binding("is.POSIXct", x)) { x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) } @@ -98,6 +104,12 @@ register_bindings_type_cast <- function() { format = NULL, origin = "1970-01-01", tz = NULL) { + # base::as.Date() and lubridate::as_date() differ in the way they use the + # `tz` argument. Both cast to the desired timezone, if present. The + # difference appears when the `tz` argument is not set: `as.Date()` uses the + # default value ("UTC"), while `as_date()` keeps the original attribute + # => we only cast when we want the behaviour of the base version or when + # `tz` is set (i.e. not NULL) if (call_binding("is.POSIXct", x) && !is.null(tz)) { x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) } From c1ed1cc55853b1ab50eefa0eb974eaa0d1bedb0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Apr 2022 10:51:28 +0100 Subject: [PATCH 26/30] refactored `binding_as_date()` --- r/R/dplyr-funcs-datetime.R | 47 ++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 7243e1dfbcc..73f9dc865f5 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -476,27 +476,34 @@ binding_as_date <- function(x, if (call_binding("is.Date", x)) { return(x) - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() and lubridate::as_date() differ in the way they use the - # `tz` argument. Both cast to the desired timezone, if present. The - # difference appears when the `tz` argument is not set: `as.Date()` uses the - # default value ("UTC"), while `as_date()` keeps the original attribute - # => we only cast when we want the behaviour of the base version or when - # `tz` is set (i.e. not NULL) - # cast from character } else if (call_binding("is.character", x)) { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from float/ double - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting + x <- binding_as_date_character(x, format, tryFormats) + + # cast from numeric + } else if (call_binding("is.numeric", x)) { + x <- binding_as_date_numeric(x, origin) + } + + build_expr("cast", x, options = cast_options(to_type = date32())) +} + +binding_as_date_character <- function(x, + format = NULL, + tryFormats = "%Y-%m-%d") { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + build_expr("strptime", x, options = list(format = format, unit = 0L)) +} + +binding_as_date_numeric <- function(x, + origin = "1970-01-01") { + + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + if (!call_binding("is.integer", x)) { x <- build_expr("cast", x, options = cast_options(to_type = int32())) } @@ -507,5 +514,5 @@ binding_as_date <- function(x, x <- build_expr("+", x, delta_in_days) } - build_expr("cast", x, options = cast_options(to_type = date32())) + x } From 833c432dca13f995df5a544f051ea789033824bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Apr 2022 09:16:38 +0100 Subject: [PATCH 27/30] added links to relevant Jiras --- r/R/dplyr-funcs-datetime.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 73f9dc865f5..2ff08effc90 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -509,6 +509,10 @@ binding_as_date_numeric <- function(x, if (origin != "1970-01-01") { delta_in_sec <- call_binding("difftime", origin, "1970-01-01") + # TODO: revisit once either of these issues is addressed: + # https://issues.apache.org/jira/browse/ARROW-16253 (helper function) or + # https://issues.apache.org/jira/browse/ARROW-15862 (casting from int32 + # -> duration or double -> duration) delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int64())) delta_in_days <- (delta_in_sec / 86400L)$cast(int32()) x <- build_expr("+", x, delta_in_days) From 1ab2e2d3773e46bf8a3fce0a7383a28e22263f78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Apr 2022 09:21:24 +0100 Subject: [PATCH 28/30] added comment on why we're not testing the content of the error message --- r/tests/testthat/test-dplyr-funcs-type.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 5874b7c582c..6a07d36e818 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -854,7 +854,8 @@ test_that("`as.Date()` and `as_date()`", { warning = TRUE ) - # strptime does not support a partial format - + # strptime does not support a partial format - testing an error surfaced from + # C++ (hence not testing the content of the error message) # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 expect_error( test_df %>% From 06b7f86cad26a9776c62407ecf5d6fe5f80c6be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Apr 2022 09:23:02 +0100 Subject: [PATCH 29/30] style --- r/R/dplyr-funcs-datetime.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 2ff08effc90..b5be484d5a9 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -496,8 +496,7 @@ binding_as_date_character <- function(x, build_expr("strptime", x, options = list(format = format, unit = 0L)) } -binding_as_date_numeric <- function(x, - origin = "1970-01-01") { +binding_as_date_numeric <- function(x, origin = "1970-01-01") { # Arrow does not support direct casting from double to date32(), but for # integer-like values we can go via int32() From 3e721b46379f70ca6e4a563df08c5440c9802aac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Apr 2022 13:16:12 +0100 Subject: [PATCH 30/30] edit comment --- r/R/dplyr-funcs-datetime.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index b5be484d5a9..221de52c059 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -509,7 +509,8 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { if (origin != "1970-01-01") { delta_in_sec <- call_binding("difftime", origin, "1970-01-01") # TODO: revisit once either of these issues is addressed: - # https://issues.apache.org/jira/browse/ARROW-16253 (helper function) or + # https://issues.apache.org/jira/browse/ARROW-16253 (helper function for + # casting from double to duration) or # https://issues.apache.org/jira/browse/ARROW-15862 (casting from int32 # -> duration or double -> duration) delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int64()))