diff --git a/r/NEWS.md b/r/NEWS.md index 4ad8393528b..81a23aa0318 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,7 +24,10 @@ * `tz()` to extract/get timezone * `semester()` to extract/get semester * `dst()` to get daylight savings time indicator. + * `date()` to extract date * `epiyear()` to get epiyear +* date-time functionality: + * `as.Date()` to convert to date # arrow 7.0.0 diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index a72df4c777a..5a22c970965 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -164,4 +164,7 @@ register_bindings_datetime <- function() { return(semester) } }) + register_binding("date", function(x) { + build_expr("cast", x, options = list(to_type = date32())) + }) } diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 2f1fa96b835..fa839269abe 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -76,6 +76,50 @@ register_bindings_type_cast <- function() { register_binding("as.numeric", function(x) { Expression$create("cast", x, options = cast_options(to_type = float64())) }) + register_binding("as.Date", function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01", + tz = "UTC") { + + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") + } + + # this could be improved with tryFormats once strptime returns NA and we + # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # base::as.Date() first converts to the desired timezone and then extracts + # the date, which is why we need to go through timestamp() first + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + + # cast from character + } else if (call_binding("is.character", x)) { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + abort("`as.Date()` with double/float is not supported in Arrow") + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) register_binding("is", function(object, class2) { if (is.string(class2)) { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 5ac64059e2b..31f944e74b6 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -819,3 +819,88 @@ test_that("dst extracts daylight savings time correctly", { test_df ) }) + +test_that("date works in arrow", { + # https://issues.apache.org/jira/browse/ARROW-13168 + skip_on_os("windows") + # this date is specific since lubridate::date() is different from base::as.Date() + # since as.Date returns the UTC date and date() doesn't + test_df <- tibble( + posixct_date = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York"), + integer_var = c(32L, NA)) + + r_date_object <- lubridate::ymd_hms("2012-03-26 23:12:13") + + # we can't (for now) use namespacing, so we need to make sure lubridate::date() + # and not base::date() is being used. This is due to the way testthat runs and + # normal use of arrow would not have to do this explicitly. + # TODO remove once https://issues.apache.org/jira/browse/ARROW-14575 is done + date <- lubridate::date + + compare_dplyr_binding( + .input %>% + mutate(a_date = date(posixct_date)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(a_date_base = as.Date(posixct_date)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(date_from_r_object = date(r_date_object)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(as_date_from_r_object = as.Date(r_date_object)) %>% + collect(), + test_df + ) + + # date from integer supported in arrow (similar to base::as.Date()), but in + # Arrow it assumes a fixed origin "1970-01-01". However this is not supported + # by lubridate. lubridate::date(integer_var) errors without an `origin` + expect_equal( + test_df %>% + arrow_table() %>% + select(integer_var) %>% + mutate(date_int = date(integer_var)) %>% + collect(), + tibble(integer_var = c(32L, NA), + date_int = as.Date(c("1970-02-02", NA))) + ) +}) + +test_that("date() errors with unsupported inputs", { + expect_error( + example_data %>% + arrow_table() %>% + mutate(date_char = date("2022-02-25 00:00:01")) %>% + collect(), + regexp = "Unsupported cast from string to date32 using function cast_date32" + ) + + expect_error( + example_data %>% + arrow_table() %>% + mutate(date_bool = date(TRUE)) %>% + collect(), + regexp = "Unsupported cast from bool to date32 using function cast_date32" + ) + + expect_error( + example_data %>% + arrow_table() %>% + mutate(date_double = date(34.56)) %>% + collect(), + regexp = "Unsupported cast from double to date32 using function cast_date32" + ) +}) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index c5fd83cb0f4..18efc5326dc 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -19,6 +19,7 @@ skip_if_not_available("dataset") library(dplyr, warn.conflicts = FALSE) suppressPackageStartupMessages(library(bit64)) +suppressPackageStartupMessages(library(lubridate)) tbl <- example_data @@ -767,4 +768,78 @@ test_that("nested structs can be created from scalars and existing data frames", collect(), tibble(a = 1:2) ) + + }) + +test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + double_var = 34.56 + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as.Date(date_var), + date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as.Date(integer_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # we do not support multiple tryFormats + compare_dplyr_binding( + .input %>% + mutate(date_char_ymd = as.Date(character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% + collect(), + test_df, + warning = TRUE + ) + + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as.Date(character_ymd_var)) %>% + collect(), + regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", + fixed = TRUE + ) + + # we do not support as.Date() with double/ float + compare_dplyr_binding( + .input %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect(), + test_df, + warning = TRUE + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as.Date(posixct_var), + date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) })