From 652a332afe6eddfcf38194e2fef990fc56f41007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 17 May 2022 15:19:24 +0100 Subject: [PATCH 1/7] `fast_strptime` binding + unit tests --- r/R/dplyr-funcs-datetime.R | 32 +++++++++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 59 ++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index e0c65d64cc0..e1a528667e6 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -546,4 +546,36 @@ register_bindings_datetime_parsers <- function() { for (ymd_order in ymd_parser_vec) { register_binding(ymd_order, ymd_parser_map_factory(ymd_order)) } + + register_binding("fast_strptime", function( x, + format, + tz = "UTC", + lt = FALSE, + cutoff_2000 = 68L) { + browser() + # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play + # well with mutate, for example) + if (lt) { + arrow_not_supported("`lt = TRUE` argument") + } + + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16596 is done + if (cutoff_2000 != 68L) { + arrow_not_supported("`cutoff_2000` != 68L argument") + } + + parse_attempt_expressions <- list() + + for (i in seq_along(format)) { + parse_attempt_expressions[[i]] <- build_expr( + "strptime", + x, + options = list(format = format[[i]], unit = 0L, error_is_null = TRUE) + ) + } + + coalesce_output <- call_binding("strptime", args = parse_attempt_expressions) + + build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) + }) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 42448e8243f..ca6380b0e31 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1776,3 +1776,62 @@ test_that("year, month, day date/time parsers work", { test_df ) }) + + +test_that("lubridate's fast_strptime", { + t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) + t_string_y <- tibble(x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)) + t_string_2formats <- tibble(x = c("2018-10-07 19:04:05", "68-10-07 19:04:05")) + t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) + + compare_dplyr_binding( + .input %>% + mutate(y = fast_strptime(x, format = "%Y-%m-%d %H:%M:%S", lt = FALSE)) %>% + collect(), + t_string, + # arrow does not preserve the `tzone` attribute + ignore_attr = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate(y = + fast_strptime("68-10-07 19:04:05", format = "%y-%m-%d %H:%M:%S", lt = FALSE) + ) %>% + collect(), + t_string, + ignore_attr = TRUE + ) + + # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is + # implicitly set to 68L both in base R and in Arrow + compare_dplyr_binding( + .input %>% + mutate(date_short_year = fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE)) %>% + collect(), + t_string_y, + # arrow does not preserve the `tzone` attribute + ignore_attr = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate( + date_short_year = + fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE, cutoff_2000 = 69L) + ) %>% + collect(), + t_string_y, + warning = TRUE + ) + + formats <- c("%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S") + compare_dplyr_binding( + .input %>% + mutate(date_multi_formats = + fast_strptime(x, format = formats, lt = FALSE)) %>% + collect(), + t_string_2formats, + warning = TRUE + ) +}) From af4f2e1f121ecf02024770c3c11cc405240e8aeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 17 May 2022 21:25:18 +0100 Subject: [PATCH 2/7] implement `fast_strptime` with `map` instead of a `for` loop + update tests --- r/R/dplyr-funcs-datetime.R | 16 ++-- r/tests/testthat/test-dplyr-funcs-datetime.R | 81 ++++++++++++++------ 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index e1a528667e6..f59d7ff56da 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -552,7 +552,6 @@ register_bindings_datetime_parsers <- function() { tz = "UTC", lt = FALSE, cutoff_2000 = 68L) { - browser() # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play # well with mutate, for example) if (lt) { @@ -566,15 +565,20 @@ register_bindings_datetime_parsers <- function() { parse_attempt_expressions <- list() - for (i in seq_along(format)) { - parse_attempt_expressions[[i]] <- build_expr( + parse_attempt_expressions <- map( + format, + ~ build_expr( "strptime", x, - options = list(format = format[[i]], unit = 0L, error_is_null = TRUE) + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) ) - } + ) - coalesce_output <- call_binding("strptime", args = parse_attempt_expressions) + coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) }) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index ca6380b0e31..e3fa4f13217 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1779,59 +1779,96 @@ test_that("year, month, day date/time parsers work", { test_that("lubridate's fast_strptime", { - t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) - t_string_y <- tibble(x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)) - t_string_2formats <- tibble(x = c("2018-10-07 19:04:05", "68-10-07 19:04:05")) - t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) compare_dplyr_binding( .input %>% - mutate(y = fast_strptime(x, format = "%Y-%m-%d %H:%M:%S", lt = FALSE)) %>% + mutate( + y = + fast_strptime( + x, + format = "%Y-%m-%d %H:%M:%S", + lt = FALSE + ) + ) %>% collect(), - t_string, + tibble( + x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA) + ), # arrow does not preserve the `tzone` attribute ignore_attr = TRUE ) + # R object compare_dplyr_binding( .input %>% - mutate(y = - fast_strptime("68-10-07 19:04:05", format = "%y-%m-%d %H:%M:%S", lt = FALSE) + mutate( + y = + fast_strptime( + "68-10-07 19:04:05", + format = "%y-%m-%d %H:%M:%S", + lt = FALSE + ) ) %>% collect(), - t_string, + tibble( + x = c("2018-10-07 19:04:05", NA) + ), ignore_attr = TRUE ) - # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is - # implicitly set to 68L both in base R and in Arrow compare_dplyr_binding( .input %>% - mutate(date_short_year = fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE)) %>% + mutate( + date_multi_formats = + fast_strptime( + x, + format = c("%Y-%m-%d %H:%M:%S", "%m-%d-%Y %H:%M:%S"), + lt = FALSE + ) + ) %>% collect(), - t_string_y, - # arrow does not preserve the `tzone` attribute - ignore_attr = TRUE + tibble( + x = c("2018-10-07 19:04:05", "10-07-1968 19:04:05") + ) ) + # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is + # implicitly set to 68L both in lubridate and in Arrow compare_dplyr_binding( .input %>% mutate( date_short_year = - fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE, cutoff_2000 = 69L) + fast_strptime( + x, + format = "%y-%m-%d %H:%M:%S", + lt = FALSE + ) ) %>% collect(), - t_string_y, - warning = TRUE + tibble( + x = + c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) + ), + # arrow does not preserve the `tzone` attribute + ignore_attr = TRUE ) - formats <- c("%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S") + # the arrow binding errors for a value different from 68L for `cutoff_2000` compare_dplyr_binding( .input %>% - mutate(date_multi_formats = - fast_strptime(x, format = formats, lt = FALSE)) %>% + mutate( + date_short_year = + fast_strptime( + x, + format = "%y-%m-%d %H:%M:%S", + lt = FALSE, + cutoff_2000 = 69L + ) + ) %>% collect(), - t_string_2formats, + tibble( + x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) + ), warning = TRUE ) }) From 989aafb3ec5c4167d1b71b4b08972da8d39f5d5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 17 May 2022 22:19:37 +0100 Subject: [PATCH 3/7] lint --- r/R/dplyr-funcs-datetime.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index f59d7ff56da..5b41be70dc1 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -547,11 +547,11 @@ register_bindings_datetime_parsers <- function() { register_binding(ymd_order, ymd_parser_map_factory(ymd_order)) } - register_binding("fast_strptime", function( x, - format, - tz = "UTC", - lt = FALSE, - cutoff_2000 = 68L) { + register_binding("fast_strptime", function(x, + format, + tz = "UTC", + lt = FALSE, + cutoff_2000 = 68L) { # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play # well with mutate, for example) if (lt) { From a238766e4341ef8d5a1e83425f679d1d9af3257a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 18 May 2022 16:02:05 +0100 Subject: [PATCH 4/7] test and comment for `lt = TRUE` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 39507060290..72073df32bc 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1906,4 +1906,22 @@ test_that("lubridate's fast_strptime", { ), warning = TRUE ) + + # compare_dplyr_binding would not work here since lt = TRUE returns a list + # and it also errors in regular dplyr pipelines + expect_warning( + tibble( + x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) + ) %>% + arrow_table() %>% + mutate( + date_short_year = + fast_strptime( + x, + format = "%y-%m-%d %H:%M:%S", + lt = TRUE + ) + ) %>% + collect() + ) }) From 513e3590cb6bd0ec6e7c65e37200b3ed04604d13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 09:33:00 +0100 Subject: [PATCH 5/7] test --- r/tests/testthat/test-dplyr-funcs-datetime.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 72073df32bc..a5268a626f4 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1828,9 +1828,9 @@ test_that("lubridate's fast_strptime", { collect(), tibble( x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA) - ), + )#, # arrow does not preserve the `tzone` attribute - ignore_attr = TRUE + # test ignore_attr = TRUE ) # R object @@ -1847,8 +1847,8 @@ test_that("lubridate's fast_strptime", { collect(), tibble( x = c("2018-10-07 19:04:05", NA) - ), - ignore_attr = TRUE + )#, + # test ignore_attr = TRUE ) compare_dplyr_binding( @@ -1883,9 +1883,9 @@ test_that("lubridate's fast_strptime", { tibble( x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) - ), + )#, # arrow does not preserve the `tzone` attribute - ignore_attr = TRUE + # test ignore_attr = TRUE ) # the arrow binding errors for a value different from 68L for `cutoff_2000` From d713a21817122f6cd0e51ef8750f3dd753e1c282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 09:40:34 +0100 Subject: [PATCH 6/7] test with a different timezone --- r/tests/testthat/test-dplyr-funcs-datetime.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a5268a626f4..41015f47e22 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1867,6 +1867,24 @@ test_that("lubridate's fast_strptime", { ) ) + compare_dplyr_binding( + .input %>% + mutate( + dttm_with_tz = fast_strptime( + dttm_as_string, + format = "%Y-%m-%d %H:%M:%S", + tz = "Pacific/Marquesas", + lt = FALSE + ) + ) %>% + collect(), + tibble( + dttm_as_string = + c("2018-10-07 19:04:05", "1969-10-07 19:04:05", NA) + ) + ) + + # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is # implicitly set to 68L both in lubridate and in Arrow compare_dplyr_binding( From 346c2482f7a901cd4dc93b67bfe069c9732a0bd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 09:47:14 +0100 Subject: [PATCH 7/7] delete empty row --- r/tests/testthat/test-dplyr-funcs-datetime.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 41015f47e22..86b1862ab4c 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1884,7 +1884,6 @@ test_that("lubridate's fast_strptime", { ) ) - # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is # implicitly set to 68L both in lubridate and in Arrow compare_dplyr_binding(