diff --git a/r/NEWS.md b/r/NEWS.md index 9083249a99b..f99fe3edada 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -26,6 +26,7 @@ * Added `decimal_date()` and `date_decimal()` * Added `make_difftime()` (duration constructor) * date-time functionality: + * Added `as_date()` and `as_datetime()` * Added `difftime` and `as.difftime()` * Added `as.Date()` to convert to date * `median()` and `quantile()` will warn once about approximate calculations regardless of interactivity. diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 7657a79271b..221de52c059 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -263,11 +263,11 @@ register_bindings_duration <- function() { # cast to timestamp if time1 and time2 are not dates or timestamp expressions # (the subtraction of which would output a `duration`) if (!call_binding("is.instant", time1)) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) } if (!call_binding("is.instant", time2)) { - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp())) } # if time1 or time2 are timestamps they cannot be expressed in "s" /seconds @@ -463,3 +463,60 @@ duration_from_chunks <- function(chunks) { } duration } + +binding_as_date <- function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01") { + + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from character + } else if (call_binding("is.character", x)) { + x <- binding_as_date_character(x, format, tryFormats) + + # cast from numeric + } else if (call_binding("is.numeric", x)) { + x <- binding_as_date_numeric(x, origin) + } + + build_expr("cast", x, options = cast_options(to_type = date32())) +} + +binding_as_date_character <- function(x, + format = NULL, + tryFormats = "%Y-%m-%d") { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + build_expr("strptime", x, options = list(format = format, unit = 0L)) +} + +binding_as_date_numeric <- function(x, origin = "1970-01-01") { + + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + if (!call_binding("is.integer", x)) { + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + + if (origin != "1970-01-01") { + delta_in_sec <- call_binding("difftime", origin, "1970-01-01") + # TODO: revisit once either of these issues is addressed: + # https://issues.apache.org/jira/browse/ARROW-16253 (helper function for + # casting from double to duration) or + # https://issues.apache.org/jira/browse/ARROW-15862 (casting from int32 + # -> duration or double -> duration) + delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int64())) + delta_in_days <- (delta_in_sec / 86400L)$cast(int32()) + x <- build_expr("+", x, delta_in_days) + } + + x +} diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 1bb633d5322..e3700cf35b7 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -82,44 +82,65 @@ register_bindings_type_cast <- function() { tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { - - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # this could be improved with tryFormats once strptime returns NA and we - # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + # base::as.Date() and lubridate::as_date() differ in the way they use the + # `tz` argument. Both cast to the desired timezone, if present. The + # difference appears when the `tz` argument is not set: `as.Date()` uses the + # default value ("UTC"), while `as_date()` keeps the original attribute + # => we only cast when we want the behaviour of the base version or when + # `tz` is set (i.e. not NULL) + if (call_binding("is.POSIXct", x)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) } - if (call_binding("is.Date", x)) { - return(x) + binding_as_date( + x = x, + format = format, + tryFormats = tryFormats, + origin = origin + ) + }) - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() first converts to the desired timezone and then extracts - # the date, which is why we need to go through timestamp() first + register_binding("as_date", function(x, + format = NULL, + origin = "1970-01-01", + tz = NULL) { + # base::as.Date() and lubridate::as_date() differ in the way they use the + # `tz` argument. Both cast to the desired timezone, if present. The + # difference appears when the `tz` argument is not set: `as.Date()` uses the + # default value ("UTC"), while `as_date()` keeps the original attribute + # => we only cast when we want the behaviour of the base version or when + # `tz` is set (i.e. not NULL) + if (call_binding("is.POSIXct", x) && !is.null(tz)) { x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + binding_as_date( + x = x, + format = format, + origin = origin + ) + }) - # cast from character - } else if (call_binding("is.character", x)) { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + register_binding("as_datetime", function(x, + origin = "1970-01-01", + tz = "UTC", + format = NULL) { + if (call_binding("is.numeric", x)) { + delta <- call_binding("difftime", origin, "1970-01-01") + delta <- build_expr("cast", delta, options = cast_options(to_type = int64())) + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + x <- build_expr("+", x, delta) + } - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - abort("`as.Date()` with double/float is not supported in Arrow") + if (call_binding("is.character", x) && !is.null(format)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr( + "strptime", + x, + options = list(format = format, unit = 0L, error_is_null = TRUE) + ) } - build_expr("cast", x, options = cast_options(to_type = date32())) + output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) + build_expr("assume_timezone", output, options = list(timezone = tz)) }) register_binding("is", function(object, class2) { diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd index 122f7682e17..2a0143d02e5 100644 --- a/r/man/arrow-package.Rd +++ b/r/man/arrow-package.Rd @@ -24,8 +24,10 @@ Authors: \itemize{ \item Ian Cook \email{ianmcook@gmail.com} \item Nic Crane \email{thisisnic@gmail.com} - \item Jonathan Keane \email{jkeane@gmail.com} + \item Dewey Dunnington \email{dewey@fishandwhistle.net} (\href{https://orcid.org/0000-0002-9415-4582}{ORCID}) \item Romain François \email{romain@rstudio.com} (\href{https://orcid.org/0000-0002-2444-4226}{ORCID}) + \item Jonathan Keane \email{jkeane@gmail.com} + \item Dragoș Moldovan-Grünfeld \email{dragos.mold@gmail.com} \item Jeroen Ooms \email{jeroen@berkeley.edu} \item Apache Arrow \email{dev@arrow.apache.org} [copyright holder] } diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index aa6667420c0..6a07d36e818 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -802,39 +802,48 @@ test_that("nested structs can be created from scalars and existing data frames", }) -test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { +test_that("`as.Date()` and `as_date()`", { test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), + dt_europe = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"), + dt_utc = ymd_hms("2010-08-03 00:50:50"), date_var = as.Date("2022-02-25"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), character_ymd_var = "2022-02-25 00:00:01", character_ydm_var = "2022/25/02 00:00:01", integer_var = 32L, + integerish_var = 32, double_var = 34.56 ) - # casting from POSIXct treated separately so we can skip on Windows - # TODO move the test for casting from POSIXct below once ARROW-13168 is done compare_dplyr_binding( .input %>% mutate( - date_dv = as.Date(date_var), - date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as.Date(integer_var, origin = "1970-01-01") + date_dv1 = as.Date(date_var), + date_pv1 = as.Date(posixct_var), + date_pv_tz1 = as.Date(posixct_var, tz = "Pacific/Marquesas"), + date_utc1 = as.Date(dt_utc), + date_europe1 = as.Date(dt_europe), + date_char_ymd1 = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm1 = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int1 = as.Date(integer_var, origin = "1970-01-01"), + date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"), + date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"), + date_dv2 = as_date(date_var), + date_pv2 = as_date(posixct_var), + date_pv_tz2 = as_date(posixct_var, tz = "Pacific/Marquesas"), + date_utc2 = as_date(dt_utc), + date_europe2 = as_date(dt_europe), + date_char_ymd2 = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm2 = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int2 = as_date(integer_var, origin = "1970-01-01"), + date_int_origin2 = as_date(integer_var, origin = "1970-01-03"), + date_integerish2 = as_date(integerish_var, origin = "1970-01-01") ) %>% collect(), test_df ) - # currently we do not support an origin different to "1970-01-01" - compare_dplyr_binding( - .input %>% - mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% - collect(), - test_df, - warning = TRUE - ) - # we do not support multiple tryFormats compare_dplyr_binding( .input %>% @@ -845,6 +854,16 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a warning = TRUE ) + # strptime does not support a partial format - testing an error surfaced from + # C++ (hence not testing the content of the error message) + # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as_date(character_ymd_var)) %>% + collect() + ) + expect_error( test_df %>% arrow_table() %>% @@ -854,25 +873,88 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a fixed = TRUE ) - # we do not support as.Date() with double/ float - compare_dplyr_binding( - .input %>% + + # we do not support as.Date() with double/ float (error surfaced from C++) + # TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798 + expect_error( + test_df %>% + arrow_table() %>% mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + # we do not support as_date with double/ float (error surfaced from C++) + # TODO: revisit after https://issues.apache.org/jira/browse/ARROW-15798 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% collect(), - test_df, - warning = TRUE + test_df ) +}) + +test_that("`as_datetime()`", { + test_df <- tibble( + date = as.Date(c("2022-03-22", "2021-07-30", NA)), + char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), + char_date_non_iso = c("2022-22-03 12:34:56", "2021-30-07 14:32:47", NA), + int_date = c(10L, 25L, NA), + integerish_date = c(10, 25, NA), + double_date = c(10.1, 25.2, NA) + ) + + test_df %>% + arrow_table() %>% + mutate( + ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dchar_date_non_iso = as_datetime(char_date_non_iso, format = "%Y-%d-%m %H:%M:%S"), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") + ) %>% + collect() - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 compare_dplyr_binding( .input %>% mutate( - date_pv = as.Date(posixct_var), - date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") + ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") ) %>% collect(), test_df ) + + # Arrow does not support conversion of double to date + # the below should error with an error message originating in the C++ code + expect_error( + test_df %>% + arrow_table() %>% + mutate( + ddouble_date = as_datetime(double_date) + ) %>% + collect(), + regexp = "Float value 10.1 was truncated converting to int64" + ) }) test_that("format date/time", {