From a10b1b23181ffc284e99c323dfc6c6de5ed9d636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 5 May 2022 10:58:39 +0100 Subject: [PATCH 01/36] 1st working draft. some finessing needed --- r/R/dplyr-datetime-helpers.R | 32 +++++++++++++++----- r/tests/testthat/test-dplyr-funcs-datetime.R | 29 +++++++++++++----- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 60771c86241..aedf1912780 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -104,9 +104,10 @@ binding_as_date <- function(x, format = NULL, tryFormats = "%Y-%m-%d", origin = "1970-01-01") { - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") - } + + # if (is.null(format) && length(tryFormats) > 1) { + # abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + # } if (call_binding("is.Date", x)) { return(x) @@ -125,10 +126,27 @@ binding_as_date <- function(x, binding_as_date_character <- function(x, format = NULL, - tryFormats = "%Y-%m-%d") { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - build_expr("strptime", x, options = list(format = format, unit = 0L)) + tryFormats = c("%Y-%m-%d", "%Y/%m/%d")) { + + # format <- format %||% tryFormats[[1]] + # # unit = 0L is the identifier for seconds in valid_time32_units + # build_expr("strptime", x, options = list(format = format, unit = 0L)) + if (missing(format)) { + parse_attempts <- list() + for (i in seq_along(tryFormats)) { + parse_attempts[[i]] <- + build_expr( + "strptime", x, + options = list(format = tryFormats[[i]], unit = 0L, error_is_null = TRUE) + ) + } + # coalesce_output <- build_expr("coalesce", parse_attempts[[1]], parse_attempts[[2]]) + coalesce_output <- build_expr("coalesce", args = parse_attempts) + return(coalesce_output) + # purrr::lift_dl(build_expr("coalesce"))(parse_attempts) + } else { + build_expr("strptime", x, options = list(format = format, unit = 0L)) + } } binding_as_date_numeric <- function(x, origin = "1970-01-01") { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 15af0c9f8db..a72c5dd912b 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1513,8 +1513,10 @@ test_that("`as.Date()` and `as_date()`", { dt_utc = ymd_hms("2010-08-03 00:50:50"), date_var = as.Date("2022-02-25"), difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), - character_ymd_var = "2022-02-25 00:00:01", - character_ydm_var = "2022/25/02 00:00:01", + character_ymd_hms_var = "2022-02-25 00:00:01", + character_ydm_hms_var = "2022/25/02 00:00:01", + character_ymd_var = "2022-02-25", + character_ydm_var = "2022/25/02", integer_var = 32L, integerish_var = 32, double_var = 34.56 @@ -1528,8 +1530,8 @@ test_that("`as.Date()` and `as_date()`", { date_pv_tz1 = as.Date(posixct_var, tz = "Pacific/Marquesas"), date_utc1 = as.Date(dt_utc), date_europe1 = as.Date(dt_europe), - date_char_ymd1 = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm1 = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_char_ymd_hms1 = as.Date(character_ymd_hms_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm_hms1 = as.Date(character_ydm_hms_var, format = "%Y/%d/%m %H:%M:%S"), date_int1 = as.Date(integer_var, origin = "1970-01-01"), date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"), date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"), @@ -1538,8 +1540,8 @@ test_that("`as.Date()` and `as_date()`", { date_pv_tz2 = as_date(posixct_var, tz = "Pacific/Marquesas"), date_utc2 = as_date(dt_utc), date_europe2 = as_date(dt_europe), - date_char_ymd2 = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm2 = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_char_ymd2 = as_date(character_ymd_hms_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm2 = as_date(character_ydm_hms_var, format = "%Y/%d/%m %H:%M:%S"), date_int2 = as_date(integer_var, origin = "1970-01-01"), date_int_origin2 = as_date(integer_var, origin = "1970-01-03"), date_integerish2 = as_date(integerish_var, origin = "1970-01-01") @@ -1548,6 +1550,17 @@ test_that("`as.Date()` and `as_date()`", { test_df ) + # test_df %>% + # arrow_table() %>% + # mutate( + # date_char_ymd = as.Date( + # character_ymd_var, + # tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + # ), + # .keep = "used" + # ) %>% + # collect() + # we do not support multiple tryFormats compare_dplyr_binding( .input %>% @@ -1565,14 +1578,14 @@ test_that("`as.Date()` and `as_date()`", { expect_error( test_df %>% arrow_table() %>% - mutate(date_char_ymd = as_date(character_ymd_var)) %>% + mutate(date_char_ymd = as_date(character_ymd_hms_var), .keep = "used") %>% collect() ) expect_error( test_df %>% arrow_table() %>% - mutate(date_char_ymd = as.Date(character_ymd_var)) %>% + mutate(date_char_ymd = as.Date(character_ymd_hms_var)) %>% collect(), regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", fixed = TRUE From 2c7b8d1cdf82713b946cd568e98c1b79a1d0490b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 5 May 2022 13:26:03 +0100 Subject: [PATCH 02/36] use `is.null` instead of missing + comment --- r/R/dplyr-datetime-helpers.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index aedf1912780..9b9621a0e13 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -131,7 +131,10 @@ binding_as_date_character <- function(x, # format <- format %||% tryFormats[[1]] # # unit = 0L is the identifier for seconds in valid_time32_units # build_expr("strptime", x, options = list(format = format, unit = 0L)) - if (missing(format)) { + + # we need to split the logic between as_date() and as.Date() since `tryFormats` + # is an argument only for `as.Date()` + if (is.null(format)) { parse_attempts <- list() for (i in seq_along(tryFormats)) { parse_attempts[[i]] <- From 3b4af684fd80476b26ad6cd3221266d75c01b105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 5 May 2022 13:28:12 +0100 Subject: [PATCH 03/36] temp --- r/tests/testthat/test-dplyr-funcs-datetime.R | 26 ++++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a72c5dd912b..6097cd77eac 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1550,16 +1550,16 @@ test_that("`as.Date()` and `as_date()`", { test_df ) - # test_df %>% - # arrow_table() %>% - # mutate( - # date_char_ymd = as.Date( - # character_ymd_var, - # tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - # ), - # .keep = "used" - # ) %>% - # collect() + test_df %>% + arrow_table() %>% + mutate( + date_char_ymd = as.Date( + character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + ), + .keep = "used" + ) %>% + collect() # we do not support multiple tryFormats compare_dplyr_binding( @@ -1568,8 +1568,8 @@ test_that("`as.Date()` and `as_date()`", { tryFormats = c("%Y-%m-%d", "%Y/%m/%d") )) %>% collect(), - test_df, - warning = TRUE + test_df#, + # warning = TRUE ) # strptime does not support a partial format - testing an error surfaced from @@ -1577,7 +1577,7 @@ test_that("`as.Date()` and `as_date()`", { # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 expect_error( test_df %>% - arrow_table() %>% + # arrow_table() %>% mutate(date_char_ymd = as_date(character_ymd_hms_var), .keep = "used") %>% collect() ) From a64e4e739eb54568dee11d788b884dc4107ccd29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 5 May 2022 15:55:08 +0100 Subject: [PATCH 04/36] clean-up `binding_as_date_character()` --- r/R/dplyr-datetime-helpers.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 9b9621a0e13..12d04ea930b 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -135,18 +135,16 @@ binding_as_date_character <- function(x, # we need to split the logic between as_date() and as.Date() since `tryFormats` # is an argument only for `as.Date()` if (is.null(format)) { - parse_attempts <- list() + parse_attempt_expressions <- list() for (i in seq_along(tryFormats)) { - parse_attempts[[i]] <- + parse_attempt_expressions[[i]] <- build_expr( "strptime", x, options = list(format = tryFormats[[i]], unit = 0L, error_is_null = TRUE) ) } - # coalesce_output <- build_expr("coalesce", parse_attempts[[1]], parse_attempts[[2]]) - coalesce_output <- build_expr("coalesce", args = parse_attempts) + coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) return(coalesce_output) - # purrr::lift_dl(build_expr("coalesce"))(parse_attempts) } else { build_expr("strptime", x, options = list(format = format, unit = 0L)) } From 17ab14399abff95631a5e0c9c6c776b6d71095ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 5 May 2022 15:56:54 +0100 Subject: [PATCH 05/36] update unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 26 ++++++-------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 6097cd77eac..63f8b5cf11c 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1550,26 +1550,16 @@ test_that("`as.Date()` and `as_date()`", { test_df ) - test_df %>% - arrow_table() %>% - mutate( - date_char_ymd = as.Date( - character_ymd_var, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - ), - .keep = "used" - ) %>% - collect() - - # we do not support multiple tryFormats + # we now support multiple tryFormats compare_dplyr_binding( .input %>% - mutate(date_char_ymd = as.Date(character_ymd_var, + mutate(date_char_ymd = as.Date( + character_ymd_var, tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - )) %>% + ) + ) %>% collect(), - test_df#, - # warning = TRUE + test_df ) # strptime does not support a partial format - testing an error surfaced from @@ -1577,8 +1567,8 @@ test_that("`as.Date()` and `as_date()`", { # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 expect_error( test_df %>% - # arrow_table() %>% - mutate(date_char_ymd = as_date(character_ymd_hms_var), .keep = "used") %>% + arrow_table() %>% + mutate(date_char_ymd_hms = as_date(character_ymd_hms_var), .keep = "used") %>% collect() ) From 19838213bbad878c2006cccb33945f517235549e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 5 May 2022 16:16:01 +0100 Subject: [PATCH 06/36] small change --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 63f8b5cf11c..738978fedf5 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1575,7 +1575,7 @@ test_that("`as.Date()` and `as_date()`", { expect_error( test_df %>% arrow_table() %>% - mutate(date_char_ymd = as.Date(character_ymd_hms_var)) %>% + mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var), .keep = "used") %>% collect(), regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", fixed = TRUE From 93a74188130e7e5cc893e6bbcd7b4011f130a763 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 10 May 2022 09:44:20 +0100 Subject: [PATCH 07/36] use `parse_date_time()` for `as_date()` and `as.Date()` bindings + updated unit tests --- r/R/dplyr-datetime-helpers.R | 22 +++---------- r/tests/testthat/test-dplyr-funcs-datetime.R | 34 ++++++++++++-------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 12d04ea930b..338d7f688d1 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -128,26 +128,14 @@ binding_as_date_character <- function(x, format = NULL, tryFormats = c("%Y-%m-%d", "%Y/%m/%d")) { - # format <- format %||% tryFormats[[1]] - # # unit = 0L is the identifier for seconds in valid_time32_units - # build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # we need to split the logic between as_date() and as.Date() since `tryFormats` - # is an argument only for `as.Date()` + # `tryFormats` is an argument only for `as.Date()`, but `as_date()` behaves in + # a similar fashion, with its `.parse_iso_dt()` if (is.null(format)) { - parse_attempt_expressions <- list() - for (i in seq_along(tryFormats)) { - parse_attempt_expressions[[i]] <- - build_expr( - "strptime", x, - options = list(format = tryFormats[[i]], unit = 0L, error_is_null = TRUE) - ) - } - coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) - return(coalesce_output) + parsed_date <- call_binding("parse_date_time", x, orders = tryFormats) } else { - build_expr("strptime", x, options = list(format = format, unit = 0L)) + parsed_date <- build_expr("strptime", x, options = list(format = format, unit = 0L)) } + parsed_date } binding_as_date_numeric <- function(x, origin = "1970-01-01") { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 738978fedf5..bd37b261e91 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1562,26 +1562,34 @@ test_that("`as.Date()` and `as_date()`", { test_df ) - # strptime does not support a partial format - testing an error surfaced from - # C++ (hence not testing the content of the error message) + # strptime does not support a partial format - Arrow returns NA, while + # lubridate parses correctly # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd_hms = as_date(character_ymd_hms_var), .keep = "used") %>% - collect() + expect_equal( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd_hms = as_date(character_ymd_hms_var), .keep = "used") %>% + collect(), + test_df %>% + mutate(date_char_ymd_hms = as_date(character_ymd_hms_var), .keep = "used") %>% + collect() + ) ) + # same as above expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var), .keep = "used") %>% - collect(), - regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", - fixed = TRUE + expect_equal( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var), .keep = "used") %>% + collect(), + test_df %>% + mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var), .keep = "used") %>% + collect() + ) ) - # we do not support as.Date() with double/ float (error surfaced from C++) # TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798 expect_error( From 26fb334a003177a3bf00969fe679bd1076374aea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 10 May 2022 10:05:46 +0100 Subject: [PATCH 08/36] clean-up and use ellipsis `...` --- r/R/dplyr-datetime-helpers.R | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 338d7f688d1..a9c2d92d4ce 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -101,24 +101,18 @@ duration_from_chunks <- function(chunks) { binding_as_date <- function(x, - format = NULL, - tryFormats = "%Y-%m-%d", - origin = "1970-01-01") { - - # if (is.null(format) && length(tryFormats) > 1) { - # abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") - # } + ...) { if (call_binding("is.Date", x)) { return(x) # cast from character } else if (call_binding("is.character", x)) { - x <- binding_as_date_character(x, format, tryFormats) + x <- binding_as_date_character(x, ...) # cast from numeric } else if (call_binding("is.numeric", x)) { - x <- binding_as_date_numeric(x, origin) + x <- binding_as_date_numeric(x, ...) } build_expr("cast", x, options = cast_options(to_type = date32())) @@ -126,7 +120,8 @@ binding_as_date <- function(x, binding_as_date_character <- function(x, format = NULL, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d")) { + tryFormats = c("%Y-%m-%d", "%Y/%m/%d"), + ...) { # `tryFormats` is an argument only for `as.Date()`, but `as_date()` behaves in # a similar fashion, with its `.parse_iso_dt()` @@ -138,7 +133,9 @@ binding_as_date_character <- function(x, parsed_date } -binding_as_date_numeric <- function(x, origin = "1970-01-01") { +binding_as_date_numeric <- function(x, + origin = "1970-01-01", + ...) { # Arrow does not support direct casting from double to date32(), but for # integer-like values we can go via int32() From 8ba5dc726324f38d70244c128a9d5d6249944954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 10 May 2022 11:09:47 +0100 Subject: [PATCH 09/36] support R objects as inputs + additional unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index bd37b261e91..5bf02a0d22d 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1550,7 +1550,7 @@ test_that("`as.Date()` and `as_date()`", { test_df ) - # we now support multiple tryFormats + # as.Date() supports multiple tryFormats compare_dplyr_binding( .input %>% mutate(date_char_ymd = as.Date( @@ -1562,6 +1562,23 @@ test_that("`as.Date()` and `as_date()`", { test_df ) + # as_date() and as.Date() work with R objects + compare_dplyr_binding( + .input %>% + mutate( + date1 = as.Date("2022-05-10"), + date2 = as.Date(12, origin = "2022-05-01"), + date3 = as.Date("2022-10-03", tryFormats = "%Y-%m-%d"), + date4 = as_date("2022-05-10"), + date5 = as_date(12, origin = "2022-05-01"), + date6 = as_date("2022-10-03") + ) %>% + collect(), + tibble( + a = 1 + ) + ) + # strptime does not support a partial format - Arrow returns NA, while # lubridate parses correctly # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 From ec0b385ab5f446e9cb212275430b556faf88cec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 10 May 2022 11:33:05 +0100 Subject: [PATCH 10/36] add unit test for `parse_date_time()` with regular R object --- r/tests/testthat/test-dplyr-funcs-datetime.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 5bf02a0d22d..5f63fe37b75 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1721,6 +1721,18 @@ test_that("parse_date_time() works with year, month, and date components", { ) ) + # parse_date_time() works with R objects + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time("2021/09///2", orders = "ymd") + ) %>% + collect(), + tibble::tibble( + a = 1 + ) + ) + # locale (affecting "%b% and "%B" formats) does not work properly on Windows # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done skip_on_os("windows") From 99e0147b7c824ea9506a7744220c168fc382a2bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 10 May 2022 12:56:59 +0100 Subject: [PATCH 11/36] separate the unit test involving string processing and skip it when RE2 is not available --- r/tests/testthat/test-dplyr-funcs-datetime.R | 38 +++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 5f63fe37b75..e2b9ff140cd 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1562,23 +1562,6 @@ test_that("`as.Date()` and `as_date()`", { test_df ) - # as_date() and as.Date() work with R objects - compare_dplyr_binding( - .input %>% - mutate( - date1 = as.Date("2022-05-10"), - date2 = as.Date(12, origin = "2022-05-01"), - date3 = as.Date("2022-10-03", tryFormats = "%Y-%m-%d"), - date4 = as_date("2022-05-10"), - date5 = as_date(12, origin = "2022-05-01"), - date6 = as_date("2022-10-03") - ) %>% - collect(), - tibble( - a = 1 - ) - ) - # strptime does not support a partial format - Arrow returns NA, while # lubridate parses correctly # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 @@ -1641,6 +1624,27 @@ test_that("`as.Date()` and `as_date()`", { ) }) +test_that("`as_date()` and `as.Date()` work with R objects", { + # some string processing (which depends on the RE2 library) is involved when + # parsing as date with multiple formats. RE2 is not available in R 3.6 on Windows + skip_if_not_available("re2") + compare_dplyr_binding( + .input %>% + mutate( + date1 = as.Date("2022-05-10"), + date2 = as.Date(12, origin = "2022-05-01"), + date3 = as.Date("2022-10-03", tryFormats = "%Y-%m-%d"), + date4 = as_date("2022-05-10"), + date5 = as_date(12, origin = "2022-05-01"), + date6 = as_date("2022-10-03") + ) %>% + collect(), + tibble( + a = 1 + ) + ) +}) + test_that("`as_datetime()`", { test_df <- tibble( date = as.Date(c("2022-03-22", "2021-07-30", NA)), From ab0a84345793bcd720cf8e4512e7873811fa23ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 10 May 2022 13:47:48 +0100 Subject: [PATCH 12/36] skip parsing with R object test on R 3.6 on windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e2b9ff140cd..c6162a45558 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1725,18 +1725,6 @@ test_that("parse_date_time() works with year, month, and date components", { ) ) - # parse_date_time() works with R objects - compare_dplyr_binding( - .input %>% - mutate( - parsed_date_ymd = parse_date_time("2021/09///2", orders = "ymd") - ) %>% - collect(), - tibble::tibble( - a = 1 - ) - ) - # locale (affecting "%b% and "%B" formats) does not work properly on Windows # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done skip_on_os("windows") @@ -1784,6 +1772,18 @@ test_that("parse_date_time() works with a mix of formats and orders", { collect(), test_df ) + + # parse_date_time() works with R objects + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time("2021/09///2", orders = "ymd") + ) %>% + collect(), + tibble::tibble( + a = 1 + ) + ) }) test_that("year, month, day date/time parsers", { From 613bed032c340c9075f1e4dcc5105860527889e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 10 May 2022 15:46:22 +0100 Subject: [PATCH 13/36] skip tests on r 3.6 & windows (re2 not available) --- r/tests/testthat/test-dataset-dplyr.R | 21 +++++++++------ r/tests/testthat/test-dplyr-funcs-datetime.R | 28 +++++++++++--------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index fb1ef802e0a..b6c5c459145 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -80,26 +80,29 @@ test_that("filter() on timestamp columns", { df1[5:10, c("ts")], ) - # Now with Date + # Now with bare string date + skip("Implement more aggressive implicit casting for scalars (ARROW-11402)") expect_equal( ds %>% - filter(ts >= as.Date("2015-05-04")) %>% + filter(ts >= "2015-05-04") %>% filter(part == 1) %>% select(ts) %>% collect(), df1[5:10, c("ts")], ) - # Now with bare string date - skip("Implement more aggressive implicit casting for scalars (ARROW-11402)") + # string processing requires RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + # Now with Date expect_equal( ds %>% - filter(ts >= "2015-05-04") %>% + filter(ts >= as.Date("2015-05-04")) %>% filter(part == 1) %>% select(ts) %>% collect(), df1[5:10, c("ts")], ) + }) test_that("filter() on date32 columns", { @@ -108,18 +111,20 @@ test_that("filter() on date32 columns", { df <- data.frame(date = as.Date(c("2020-02-02", "2020-02-03"))) write_parquet(df, file.path(tmp, "file.parquet")) + # Also with timestamp scalar expect_equal( open_dataset(tmp) %>% - filter(date > as.Date("2020-02-02")) %>% + filter(date > lubridate::ymd_hms("2020-02-02 00:00:00")) %>% collect() %>% nrow(), 1L ) - # Also with timestamp scalar + # string processing requires RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") expect_equal( open_dataset(tmp) %>% - filter(date > lubridate::ymd_hms("2020-02-02 00:00:00")) %>% + filter(date > as.Date("2020-02-02")) %>% collect() %>% nrow(), 1L diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index c6162a45558..ccf8e889fb9 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1550,18 +1550,6 @@ test_that("`as.Date()` and `as_date()`", { test_df ) - # as.Date() supports multiple tryFormats - compare_dplyr_binding( - .input %>% - mutate(date_char_ymd = as.Date( - character_ymd_var, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - ) - ) %>% - collect(), - test_df - ) - # strptime does not support a partial format - Arrow returns NA, while # lubridate parses correctly # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 @@ -1622,6 +1610,22 @@ test_that("`as.Date()` and `as_date()`", { collect(), test_df ) + + # some string processing (which depends on the RE2 library) is involved when + # parsing as date with multiple formats. RE2 is not available in R 3.6 on Windows + skip_if_not_available("re2") + # as.Date() supports multiple tryFormats + compare_dplyr_binding( + .input %>% + mutate(date_char_ymd = as.Date( + character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + ) + ) %>% + collect(), + test_df + ) + }) test_that("`as_date()` and `as.Date()` work with R objects", { From d6d45a2a6bc04073d05162153cb69bbec4c88eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 13 May 2022 09:13:09 +0100 Subject: [PATCH 14/36] new, failing test to test tryFormats not coalescing --- r/R/dplyr-datetime-helpers.R | 8 ++++++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 19 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index a9c2d92d4ce..b5552156457 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -122,6 +122,14 @@ binding_as_date_character <- function(x, format = NULL, tryFormats = c("%Y-%m-%d", "%Y/%m/%d"), ...) { +browser() + force(x) + for (i in 1:x$length()) { + if (x$IsValid(i -1)) { + xx <- x$Slice(i -1 , 1) + break() + } + } # `tryFormats` is an argument only for `as.Date()`, but `as_date()` behaves in # a similar fashion, with its `.parse_iso_dt()` diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index ccf8e889fb9..36a2c37ae86 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1513,6 +1513,7 @@ test_that("`as.Date()` and `as_date()`", { dt_utc = ymd_hms("2010-08-03 00:50:50"), date_var = as.Date("2022-02-25"), difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), + try_formats_string = c(NA, "2022-01-01", "2022/01/01"), character_ymd_hms_var = "2022-02-25 00:00:01", character_ydm_hms_var = "2022/25/02 00:00:01", character_ymd_var = "2022-02-25", @@ -1596,6 +1597,17 @@ test_that("`as.Date()` and `as_date()`", { collect() ) + test_df %>% + arrow_table() %>% + mutate( + date_try_formats_base = as.Date( + try_formats_string, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + ), + .keep = "used" + ) %>% + collect() + # difference between as.Date() and as_date(): # `as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg # to `as.Date()` @@ -1605,7 +1617,12 @@ test_that("`as.Date()` and `as_date()`", { .input %>% transmute( date_diff_lubridate = as_date(difference_date), - date_diff_base = as.Date(difference_date) + date_diff_base = as.Date(difference_date), + date_try_formats_base = as.Date( + try_formats_string, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + ), + date_try_formats_lubridate = as_date(try_formats_string) ) %>% collect(), test_df From 7a21e92d3b74f9bce06e3a7f64abbe2395f14f74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 20 May 2022 13:56:47 +0100 Subject: [PATCH 15/36] update `test_df` for `as.Date()` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 36a2c37ae86..6e3ccfcab29 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1508,19 +1508,19 @@ test_that("make_difftime()", { test_that("`as.Date()` and `as_date()`", { test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), - dt_europe = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"), - dt_utc = ymd_hms("2010-08-03 00:50:50"), - date_var = as.Date("2022-02-25"), - difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), + posixct_var = as.POSIXct(c("2022-02-25 00:00:01", "1987-11-24 12:34:56", NA), tz = "Pacific/Marquesas"), + dt_europe = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA, tz = "Europe/London"), + dt_utc = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA), + date_var = as.Date(c("2022-02-25", "1987-11-24", NA)), + difference_date = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA, tz = "Pacific/Marquesas"), try_formats_string = c(NA, "2022-01-01", "2022/01/01"), - character_ymd_hms_var = "2022-02-25 00:00:01", - character_ydm_hms_var = "2022/25/02 00:00:01", - character_ymd_var = "2022-02-25", - character_ydm_var = "2022/25/02", - integer_var = 32L, - integerish_var = 32, - double_var = 34.56 + character_ymd_hms_var = c("2022-02-25 00:00:01", "1987-11-24 12:34:56", NA), + character_ydm_hms_var = c("2022/25/02 00:00:01", "1987/24/11 12:34:56", NA), + character_ymd_var = c("2022-02-25", "1987-11-24", NA), + character_ydm_var = c("2022/25/02", "1987/24/11", NA), + integer_var = c(21L, 32L, NA), + integerish_var = c(21, 32, NA), + double_var = c(12.34, 56.78, NA) ) compare_dplyr_binding( From 0479b696ee268d2de9b6029a623e1cdd00776f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 20 May 2022 14:00:02 +0100 Subject: [PATCH 16/36] remove `browser()` --- r/R/dplyr-datetime-helpers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index b5552156457..bfe41a951a6 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -122,7 +122,7 @@ binding_as_date_character <- function(x, format = NULL, tryFormats = c("%Y-%m-%d", "%Y/%m/%d"), ...) { -browser() + force(x) for (i in 1:x$length()) { if (x$IsValid(i -1)) { From 96c44d3a0565355b546ea4c2bd83033c3b83262f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 14:05:55 +0100 Subject: [PATCH 17/36] undo some trials --- r/R/dplyr-datetime-helpers.R | 8 -------- 1 file changed, 8 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index bfe41a951a6..a9c2d92d4ce 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -123,14 +123,6 @@ binding_as_date_character <- function(x, tryFormats = c("%Y-%m-%d", "%Y/%m/%d"), ...) { - force(x) - for (i in 1:x$length()) { - if (x$IsValid(i -1)) { - xx <- x$Slice(i -1 , 1) - break() - } - } - # `tryFormats` is an argument only for `as.Date()`, but `as_date()` behaves in # a similar fashion, with its `.parse_iso_dt()` if (is.null(format)) { From 2492c00af6405e6c775c90e7e0b4f0c93a6c5a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 26 May 2022 11:36:03 +0100 Subject: [PATCH 18/36] scaffolding for `tryFormats` --- r/R/dplyr-datetime-helpers.R | 27 +++++++++++++++++++++++---- r/R/dplyr-funcs-datetime.R | 6 ++++-- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index a9c2d92d4ce..70b3684c46a 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -121,13 +121,32 @@ binding_as_date <- function(x, binding_as_date_character <- function(x, format = NULL, tryFormats = c("%Y-%m-%d", "%Y/%m/%d"), + coalesce = TRUE, ...) { - # `tryFormats` is an argument only for `as.Date()`, but `as_date()` behaves in - # a similar fashion, with its `.parse_iso_dt()` - if (is.null(format)) { + if (is.null(format) && coalesce) { parsed_date <- call_binding("parse_date_time", x, orders = tryFormats) - } else { + } + + if (is.null(format) && !coalesce) { + attempts <- map( + tryFormats, + ~ build_expr( + "strptime", + x, + options = list(format = .x, unit = 0L, error_is_null = TRUE) + ) + ) + n <- length(tryFormats) + results <- vector("list", n) + mask <- caller_env(n = 3) + for (i in seq_len(n)) { + results[[i]] <- arrow_eval(attempts[[i]], mask) + } + parsed_date <- results[[1]] + } + + if (!is.null(format)) { parsed_date <- build_expr("strptime", x, options = list(format = format, unit = 0L)) } parsed_date diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 8ecb80b6b45..879b745a47f 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -311,7 +311,8 @@ register_bindings_datetime_conversion <- function() { x = x, format = format, tryFormats = tryFormats, - origin = origin + origin = origin, + coalesce = FALSE ) }) @@ -331,7 +332,8 @@ register_bindings_datetime_conversion <- function() { binding_as_date( x = x, format = format, - origin = origin + origin = origin, + coalesce = TRUE ) }) From ec59d1f603bf6126bd75374fdbff2f047fe07bcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 15:17:46 +0100 Subject: [PATCH 19/36] revert `binding_as_character()` --- r/R/dplyr-datetime-helpers.R | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 70b3684c46a..1b2edbdc026 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -120,36 +120,10 @@ binding_as_date <- function(x, binding_as_date_character <- function(x, format = NULL, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d"), - coalesce = TRUE, - ...) { - - if (is.null(format) && coalesce) { - parsed_date <- call_binding("parse_date_time", x, orders = tryFormats) - } - - if (is.null(format) && !coalesce) { - attempts <- map( - tryFormats, - ~ build_expr( - "strptime", - x, - options = list(format = .x, unit = 0L, error_is_null = TRUE) - ) - ) - n <- length(tryFormats) - results <- vector("list", n) - mask <- caller_env(n = 3) - for (i in seq_len(n)) { - results[[i]] <- arrow_eval(attempts[[i]], mask) - } - parsed_date <- results[[1]] - } - - if (!is.null(format)) { - parsed_date <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - } - parsed_date + tryFormats = "%Y-%m-%d") { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + build_expr("strptime", x, options = list(format = format, unit = 0L)) } binding_as_date_numeric <- function(x, From a0ad6fa8c334e7fb3dcf20c69f457411475f1358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 15:18:58 +0100 Subject: [PATCH 20/36] revert `binding_as_date_numeric()` --- r/R/dplyr-datetime-helpers.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 1b2edbdc026..dfd37035c79 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -127,8 +127,7 @@ binding_as_date_character <- function(x, } binding_as_date_numeric <- function(x, - origin = "1970-01-01", - ...) { + origin = "1970-01-01") { # Arrow does not support direct casting from double to date32(), but for # integer-like values we can go via int32() From be71950970ad9bc9a418077a505adf0b589921d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 15:20:40 +0100 Subject: [PATCH 21/36] revert `binding_as_date()` --- r/R/dplyr-datetime-helpers.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index dfd37035c79..7f839ce42ae 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -102,6 +102,9 @@ duration_from_chunks <- function(chunks) { binding_as_date <- function(x, ...) { + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + } if (call_binding("is.Date", x)) { return(x) @@ -126,8 +129,7 @@ binding_as_date_character <- function(x, build_expr("strptime", x, options = list(format = format, unit = 0L)) } -binding_as_date_numeric <- function(x, - origin = "1970-01-01") { +binding_as_date_numeric <- function(x, origin = "1970-01-01") { # Arrow does not support direct casting from double to date32(), but for # integer-like values we can go via int32() From 17ea2f5186ca6589fed49d0acee6e329830ed80f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 15:22:45 +0100 Subject: [PATCH 22/36] revert `as.Date` and `as_date` bindings definitions --- r/R/dplyr-funcs-datetime.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 879b745a47f..8ecb80b6b45 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -311,8 +311,7 @@ register_bindings_datetime_conversion <- function() { x = x, format = format, tryFormats = tryFormats, - origin = origin, - coalesce = FALSE + origin = origin ) }) @@ -332,8 +331,7 @@ register_bindings_datetime_conversion <- function() { binding_as_date( x = x, format = format, - origin = origin, - coalesce = TRUE + origin = origin ) }) From 764f2d0835bd56529718c1c5e3f96e7baa6cc8b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 15:31:02 +0100 Subject: [PATCH 23/36] revert `tryFormats` test --- r/tests/testthat/test-dplyr-funcs-datetime.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 6e3ccfcab29..ebc03af223f 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1551,6 +1551,17 @@ test_that("`as.Date()` and `as_date()`", { test_df ) + # we do not support multiple tryFormats + compare_dplyr_binding( + .input %>% + mutate(date_char_ymd = as.Date(character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + )) %>% + collect(), + test_df, + warning = TRUE + ) + # strptime does not support a partial format - Arrow returns NA, while # lubridate parses correctly # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 From 130d426cf712a087f3112b3581aebd355f9732f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 15:37:38 +0100 Subject: [PATCH 24/36] test_fix --- r/tests/testthat/test-dplyr-funcs-datetime.R | 26 +++++--------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index ebc03af223f..308e3c472c9 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1554,9 +1554,11 @@ test_that("`as.Date()` and `as_date()`", { # we do not support multiple tryFormats compare_dplyr_binding( .input %>% - mutate(date_char_ymd = as.Date(character_ymd_var, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - )) %>% + mutate(date_char_ymd = as.Date( + character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + ) + ) %>% collect(), test_df, warning = TRUE @@ -1608,17 +1610,6 @@ test_that("`as.Date()` and `as_date()`", { collect() ) - test_df %>% - arrow_table() %>% - mutate( - date_try_formats_base = as.Date( - try_formats_string, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - ), - .keep = "used" - ) %>% - collect() - # difference between as.Date() and as_date(): # `as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg # to `as.Date()` @@ -1628,12 +1619,7 @@ test_that("`as.Date()` and `as_date()`", { .input %>% transmute( date_diff_lubridate = as_date(difference_date), - date_diff_base = as.Date(difference_date), - date_try_formats_base = as.Date( - try_formats_string, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - ), - date_try_formats_lubridate = as_date(try_formats_string) + date_diff_base = as.Date(difference_date) ) %>% collect(), test_df From 16a5849748425c0e6ac8d9c21fa562b6e2582bd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 15:38:42 +0100 Subject: [PATCH 25/36] revert tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 308e3c472c9..2059ec1f0b6 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1624,22 +1624,6 @@ test_that("`as.Date()` and `as_date()`", { collect(), test_df ) - - # some string processing (which depends on the RE2 library) is involved when - # parsing as date with multiple formats. RE2 is not available in R 3.6 on Windows - skip_if_not_available("re2") - # as.Date() supports multiple tryFormats - compare_dplyr_binding( - .input %>% - mutate(date_char_ymd = as.Date( - character_ymd_var, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - ) - ) %>% - collect(), - test_df - ) - }) test_that("`as_date()` and `as.Date()` work with R objects", { From d929b8bde699213d8ce61496f231a2366a73db79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 15:41:26 +0100 Subject: [PATCH 26/36] revert tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 2059ec1f0b6..222b0065111 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1627,9 +1627,6 @@ test_that("`as.Date()` and `as_date()`", { }) test_that("`as_date()` and `as.Date()` work with R objects", { - # some string processing (which depends on the RE2 library) is involved when - # parsing as date with multiple formats. RE2 is not available in R 3.6 on Windows - skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( @@ -1774,18 +1771,6 @@ test_that("parse_date_time() works with a mix of formats and orders", { collect(), test_df ) - - # parse_date_time() works with R objects - compare_dplyr_binding( - .input %>% - mutate( - parsed_date_ymd = parse_date_time("2021/09///2", orders = "ymd") - ) %>% - collect(), - tibble::tibble( - a = 1 - ) - ) }) test_that("year, month, day date/time parsers", { From ecb8f9998aca8a033d1b13a1101846774185309f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 16:57:52 +0100 Subject: [PATCH 27/36] revert `binding_as_date()` --- r/R/dplyr-datetime-helpers.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 7f839ce42ae..60771c86241 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -101,7 +101,9 @@ duration_from_chunks <- function(chunks) { binding_as_date <- function(x, - ...) { + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01") { if (is.null(format) && length(tryFormats) > 1) { abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") } @@ -111,11 +113,11 @@ binding_as_date <- function(x, # cast from character } else if (call_binding("is.character", x)) { - x <- binding_as_date_character(x, ...) + x <- binding_as_date_character(x, format, tryFormats) # cast from numeric } else if (call_binding("is.numeric", x)) { - x <- binding_as_date_numeric(x, ...) + x <- binding_as_date_numeric(x, origin) } build_expr("cast", x, options = cast_options(to_type = date32())) From 7e42b13bd47c043f67e6b7e8d2f9fe8c10947efd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 1 Jul 2022 09:43:46 +0100 Subject: [PATCH 28/36] not skipping since `as.Date()` no longer uses the `parse_date_time()` binding --- r/tests/testthat/test-dataset-dplyr.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index b6c5c459145..b34eb7e9740 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -91,8 +91,6 @@ test_that("filter() on timestamp columns", { df1[5:10, c("ts")], ) - # string processing requires RE2 library (not available on Windows with R 3.6) - skip_if_not_available("re2") # Now with Date expect_equal( ds %>% @@ -102,7 +100,6 @@ test_that("filter() on timestamp columns", { collect(), df1[5:10, c("ts")], ) - }) test_that("filter() on date32 columns", { From 0dcfe8caf2d9575c710728910dbc67941c2eca26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 1 Jul 2022 09:44:38 +0100 Subject: [PATCH 29/36] re-order --- r/tests/testthat/test-dataset-dplyr.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index b34eb7e9740..8ae9b92e089 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -80,21 +80,21 @@ test_that("filter() on timestamp columns", { df1[5:10, c("ts")], ) - # Now with bare string date - skip("Implement more aggressive implicit casting for scalars (ARROW-11402)") + # Now with Date expect_equal( ds %>% - filter(ts >= "2015-05-04") %>% + filter(ts >= as.Date("2015-05-04")) %>% filter(part == 1) %>% select(ts) %>% collect(), df1[5:10, c("ts")], ) - # Now with Date + # Now with bare string date + skip("Implement more aggressive implicit casting for scalars (ARROW-11402)") expect_equal( ds %>% - filter(ts >= as.Date("2015-05-04")) %>% + filter(ts >= "2015-05-04") %>% filter(part == 1) %>% select(ts) %>% collect(), From fe64d7b912c0a64d50b96e08208ad2e6cb1f0f61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 1 Jul 2022 09:48:43 +0100 Subject: [PATCH 30/36] reorder --- r/tests/testthat/test-dataset-dplyr.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 8ae9b92e089..fb1ef802e0a 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -108,20 +108,18 @@ test_that("filter() on date32 columns", { df <- data.frame(date = as.Date(c("2020-02-02", "2020-02-03"))) write_parquet(df, file.path(tmp, "file.parquet")) - # Also with timestamp scalar expect_equal( open_dataset(tmp) %>% - filter(date > lubridate::ymd_hms("2020-02-02 00:00:00")) %>% + filter(date > as.Date("2020-02-02")) %>% collect() %>% nrow(), 1L ) - # string processing requires RE2 library (not available on Windows with R 3.6) - skip_if_not_available("re2") + # Also with timestamp scalar expect_equal( open_dataset(tmp) %>% - filter(date > as.Date("2020-02-02")) %>% + filter(date > lubridate::ymd_hms("2020-02-02 00:00:00")) %>% collect() %>% nrow(), 1L From 948e8d35c4e31bc286631ab3e974a4da87566bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 4 Jul 2022 10:49:10 +0100 Subject: [PATCH 31/36] improve error message for `as.Date()` with multiple `tryFormats` + unit tests --- r/R/dplyr-datetime-helpers.R | 7 +++- r/tests/testthat/test-dplyr-funcs-datetime.R | 36 +++++++++++++++----- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 60771c86241..f8e67a34ece 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -105,7 +105,12 @@ binding_as_date <- function(x, tryFormats = "%Y-%m-%d", origin = "1970-01-01") { if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + abort( + paste( + "`as.Date()` with multiple `tryFormats` is not supported in Arrow,", + "consider using the specialised parsing functions" + ) + ) } if (call_binding("is.Date", x)) { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 222b0065111..3d177c1b8a6 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1552,16 +1552,36 @@ test_that("`as.Date()` and `as_date()`", { ) # we do not support multiple tryFormats - compare_dplyr_binding( - .input %>% - mutate(date_char_ymd = as.Date( - character_ymd_var, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - ) + # this is not a simple warning, therefore we cannot use compare_dplyr_binding() + # with `warning = TRUE` + # arrow_table test + expect_warning( + test_df %>% + arrow_table() %>% + mutate( + date_char_ymd = as.Date( + character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + ), + .keep = "used" ) %>% collect(), - test_df, - warning = TRUE + regexp = "consider using the specialised parsing functions" + ) + + # record batch test + expect_warning( + test_df %>% + record_batch() %>% + mutate( + date_char_ymd = as.Date( + character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d") + ), + .keep = "used" + ) %>% + collect(), + regexp = "consider using the specialised parsing functions" ) # strptime does not support a partial format - Arrow returns NA, while From a6243553250210f83afd86f5d23302fe49e792c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 4 Jul 2022 11:40:42 +0100 Subject: [PATCH 32/36] update error message --- r/R/dplyr-datetime-helpers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index f8e67a34ece..db0ed735b49 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -108,7 +108,7 @@ binding_as_date <- function(x, abort( paste( "`as.Date()` with multiple `tryFormats` is not supported in Arrow,", - "consider using the specialised parsing functions" + "consider using the specialised parsing functions in lubridate, such as, `ymd()`" ) ) } From 29c3251417a48ca95471ac7fcadc57d6b937272a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 4 Jul 2022 12:46:24 +0100 Subject: [PATCH 33/36] clearer error message --- r/R/dplyr-datetime-helpers.R | 2 +- r/tests/testthat/test-dplyr-funcs-datetime.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index db0ed735b49..76161431eb2 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -108,7 +108,7 @@ binding_as_date <- function(x, abort( paste( "`as.Date()` with multiple `tryFormats` is not supported in Arrow,", - "consider using the specialised parsing functions in lubridate, such as, `ymd()`" + "consider using the lubridate specialised parsing functions such as, `ymd()`, `ymd()`, etc." ) ) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 3d177c1b8a6..a6e5233726b 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1566,7 +1566,7 @@ test_that("`as.Date()` and `as_date()`", { .keep = "used" ) %>% collect(), - regexp = "consider using the specialised parsing functions" + regexp = "consider using the lubridate specialised parsing functions" ) # record batch test @@ -1581,7 +1581,7 @@ test_that("`as.Date()` and `as_date()`", { .keep = "used" ) %>% collect(), - regexp = "consider using the specialised parsing functions" + regexp = "consider using the lubridate specialised parsing functions" ) # strptime does not support a partial format - Arrow returns NA, while From 39ee8f2962941a9316ffa91b80b412e379e137a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 4 Jul 2022 13:48:55 +0100 Subject: [PATCH 34/36] move the `tryFormats` error message in `as.Date()` --- r/R/dplyr-datetime-helpers.R | 8 -------- r/R/dplyr-funcs-datetime.R | 10 ++++++++++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 76161431eb2..7099d79c785 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -104,14 +104,6 @@ binding_as_date <- function(x, format = NULL, tryFormats = "%Y-%m-%d", origin = "1970-01-01") { - if (is.null(format) && length(tryFormats) > 1) { - abort( - paste( - "`as.Date()` with multiple `tryFormats` is not supported in Arrow,", - "consider using the lubridate specialised parsing functions such as, `ymd()`, `ymd()`, etc." - ) - ) - } if (call_binding("is.Date", x)) { return(x) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 8ecb80b6b45..f7d948fdf27 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -297,6 +297,16 @@ register_bindings_datetime_conversion <- function() { tryFormats = "%Y-%m-%d", origin = "1970-01-01", tz = "UTC") { + + if (is.null(format) && length(tryFormats) > 1) { + abort( + paste( + "`as.Date()` with multiple `tryFormats` is not supported in Arrow,", + "consider using the lubridate specialised parsing functions such as, `ymd()`, `ymd()`, etc." + ) + ) + } + # base::as.Date() and lubridate::as_date() differ in the way they use the # `tz` argument. Both cast to the desired timezone, if present. The # difference appears when the `tz` argument is not set: `as.Date()` uses the From 104ebb2605738d2c1a9d7f645a7e95feda4a70e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 4 Jul 2022 15:01:46 +0100 Subject: [PATCH 35/36] remove `.keep = "used"` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a6e5233726b..f5e0eaaa8d3 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1562,8 +1562,7 @@ test_that("`as.Date()` and `as_date()`", { date_char_ymd = as.Date( character_ymd_var, tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - ), - .keep = "used" + ) ) %>% collect(), regexp = "consider using the lubridate specialised parsing functions" @@ -1577,8 +1576,7 @@ test_that("`as.Date()` and `as_date()`", { date_char_ymd = as.Date( character_ymd_var, tryFormats = c("%Y-%m-%d", "%Y/%m/%d") - ), - .keep = "used" + ) ) %>% collect(), regexp = "consider using the lubridate specialised parsing functions" From 1d96d85c7823c4d7cf10b201f8c3f00297a9659f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 4 Jul 2022 15:53:16 +0100 Subject: [PATCH 36/36] removed `.keep = "used"` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index f5e0eaaa8d3..8899f051ad7 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1589,10 +1589,10 @@ test_that("`as.Date()` and `as_date()`", { expect_equal( test_df %>% arrow_table() %>% - mutate(date_char_ymd_hms = as_date(character_ymd_hms_var), .keep = "used") %>% + mutate(date_char_ymd_hms = as_date(character_ymd_hms_var)) %>% collect(), test_df %>% - mutate(date_char_ymd_hms = as_date(character_ymd_hms_var), .keep = "used") %>% + mutate(date_char_ymd_hms = as_date(character_ymd_hms_var)) %>% collect() ) ) @@ -1602,10 +1602,10 @@ test_that("`as.Date()` and `as_date()`", { expect_equal( test_df %>% arrow_table() %>% - mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var), .keep = "used") %>% + mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var)) %>% collect(), test_df %>% - mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var), .keep = "used") %>% + mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var)) %>% collect() ) )