From 91b64733e82404227fd8d01eeac759a8c5c809d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 1 Apr 2022 12:17:24 +0100 Subject: [PATCH 01/44] unit tests for `parse_date_time()` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 44 ++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b9277c08c40..cf3bd86ee07 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1253,3 +1253,47 @@ test_that("`decimal_date()` and `date_decimal()`", { ignore_attr = "tzone" ) }) + +test_that("parse_date_time()", { + test_df <- tibble( + dates = c("09-01-17", "02-Sep-17") + ) + + test_dates <- tibble::tibble( + string_ymd = c( + "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", + "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", + "21:09:11", "2021 Sep 12", "2021 September 13", "21 Sep 14", + "21 September 15", + # not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19 + # no separators and %b or %B are even more complicated (and they work in + # lubridate). not to mention locale + NA + ), + string_dmy = c( + "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", + "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", + "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", + # not yet working for strings with no separators, like "10092021", "100921", + NA + ), + string_mdy = c( + "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", + "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", + "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", + # not yet working for strings with no separators, like "09102021", "091021", + NA + ) + ) + + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"), + parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"), + parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") + ) %>% + collect(), + test_dates + ) +}) From f9c7a37051b4aa103606414b687cb29be21b3ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 1 Apr 2022 12:18:21 +0100 Subject: [PATCH 02/44] first pass at implementing `parse_date_time()` --- r/R/dplyr-funcs-datetime.R | 51 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 1ca485f56e1..4fdd98919b0 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -351,6 +351,33 @@ register_bindings_duration <- function() { delta <- delta$cast(int64()) start + delta$cast(duration("s")) }) + register_binding("parse_date_time", function(x, + orders, + tz = "UTC") { + + # make all separators (non-letters and non-numbers) into "-" + x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) + # collapse multiple separators into a single one + x <- call_binding("gsub", "-{2,}", "-", x) + + # TODO figure out how to parse strings that have no separators) + # we could insert separators at the "likely" positions, but it might be + # tricky given the possible combinations between dmy formats + locale + + # each order is translated into 6 possible formats + formats <- build_formats(orders) + coalesce_output <- build_expr( + "coalesce", + build_expr("strptime", x, options = list(format = formats[1], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[2], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[3], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[4], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[5], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[6], unit = 0L, error_is_null = TRUE)) + ) + + build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) + }) } binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { @@ -372,3 +399,27 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { build_expr("strftime", x, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) } + +build_formats <- function(orders) { + year_chars <- sprintf("%%%s", c("y", "Y")) + month_chars <- sprintf("%%%s",c("m", "B", "b")) + day_chars <- sprintf("%%%s",c("d")) + + outcome <- switch( + orders, + "ymd" = expand.grid(year_chars, month_chars, day_chars), + "ydm" = expand.grid(year_chars, day_chars, month_chars), + "mdy" = expand.grid(month_chars, day_chars, year_chars), + "myd" = expand.grid(month_chars, year_chars, day_chars), + "dmy" = expand.grid(day_chars, month_chars, year_chars), + "dym" = expand.grid(day_chars, year_chars, month_chars) + ) + outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-") + outcome$format +} + +instert_at_position <- function(string, positions, replacement) { + if (positions < 0) { + pattern <- paste0("^(.{", nchar(string) - positions, "})(.*)$") + } +} From b63b4e7580135186b77eabad1a3402c3acba23ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 14 Apr 2022 11:52:35 +0300 Subject: [PATCH 03/44] added changes back in --- r/R/dplyr-funcs-datetime.R | 29 ++++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 102 +++++++++++++++++++ 2 files changed, 131 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 4fdd98919b0..4527f2e5995 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -351,6 +351,35 @@ register_bindings_duration <- function() { delta <- delta$cast(int64()) start + delta$cast(duration("s")) }) + register_binding("fast_strptime", function(x, + format, + tz = "UTC", + lt = TRUE, + cutoff_2000 = 68L, + unit = "s") { + # TODO support multiple formats once + # https://issues.apache.org/jira/browse/ARROW-15665 is done + if (length(format) > 1) { + arrow_not_supported("multiple values for `format`") + } + + if (!missing(tz)) { + arrow_not_supported("Time zone argument") + } + # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play + # well with mutate, for example) + if (lt) { + arrow_not_supported("`lt = TRUE` argument") + } + + if (cutoff_2000 != 68L) { + arrow_not_supported("`cutoff_2000` != 68L argument") + } + + unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) + + build_expr("strptime", x, options = list(format = format, unit = unit)) + }) register_binding("parse_date_time", function(x, orders, tz = "UTC") { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index cf3bd86ee07..7fc5a0f1e1f 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1297,3 +1297,105 @@ test_that("parse_date_time()", { test_dates ) }) + +test_that("lubridate's fast_strptime", { + t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) + t_string_y <- tibble(x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)) + t_string_2formats <- tibble(x = c("2018-10-07 19:04:05", "68-10-07 19:04:05")) + t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) + + compare_dplyr_binding( + .input %>% + mutate(y = fast_strptime(x, format = "%Y-%m-%d %H:%M:%S", lt = FALSE)) %>% + collect(), + t_string, + # arrow does not preserve the `tzone` attribute + ignore_attr = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate(y = + fast_strptime("68-10-07 19:04:05", format = "%y-%m-%d %H:%M:%S", lt = FALSE) + ) %>% + collect(), + t_string, + ignore_attr = TRUE + ) + + # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is + # implicitly set to 68L both in base R and in Arrow + compare_dplyr_binding( + .input %>% + mutate(date_short_year = fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE)) %>% + collect(), + t_string_y, + # arrow does not preserve the `tzone` attribute + ignore_attr = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate( + date_short_year = + fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE, cutoff_2000 = 69L) + ) %>% + collect(), + t_string_y, + warning = TRUE + ) + + formats <- c("%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S") + compare_dplyr_binding( + .input %>% + mutate(date_multi_formats = + fast_strptime(x, format = formats, lt = FALSE)) %>% + collect(), + t_string_2formats, + warning = TRUE + ) +}) + +test_that("parse_date_time()", { + test_df <- tibble( + dates = c("09-01-17", "02-Sep-17") + ) + + test_dates <- tibble::tibble( + string_ymd = c( + "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", + "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", + "21:09:11", "2021 Sep 12", "2021 September 13", "21 Sep 14", + "21 September 15", + # not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19 + # no separators and %b or %B are even more complicated (and they work in + # lubridate). not to mention locale + NA + ), + string_dmy = c( + "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", + "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", + "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", + # not yet working for strings with no separators, like "10092021", "100921", + NA + ), + string_mdy = c( + "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", + "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", + "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", + # not yet working for strings with no separators, like "09102021", "091021", + NA + ) + ) + + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"), + parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"), + parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") + ) %>% + collect(), + test_dates + ) +}) From cc01f243254e8c90310c12984e45946cca64ab33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 14 Apr 2022 11:56:39 +0300 Subject: [PATCH 04/44] NEWS update --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 2e6a993507c..76b7cc402a7 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -27,6 +27,7 @@ * date-time functionality: * Added `difftime` and `as.difftime()` * Added `as.Date()` to convert to date + * Added `parse_date_time()` & `fast_strptime()` datetime parsers * `median()` and `quantile()` will warn once about approximate calculations regardless of interactivity. # arrow 7.0.0 From 35fb2935abc9c5dd1b3206e8f5fd01ffd4de456c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 14 Apr 2022 13:08:20 +0300 Subject: [PATCH 05/44] lints --- r/R/dplyr-funcs-datetime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 4527f2e5995..cdd557c2087 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -431,8 +431,8 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { build_formats <- function(orders) { year_chars <- sprintf("%%%s", c("y", "Y")) - month_chars <- sprintf("%%%s",c("m", "B", "b")) - day_chars <- sprintf("%%%s",c("d")) + month_chars <- sprintf("%%%s", c("m", "B", "b")) + day_chars <- sprintf("%%%s", c("d")) outcome <- switch( orders, From e7821c62c17f35455395446cfff7ca3adfa21b35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 14 Apr 2022 17:46:56 +0300 Subject: [PATCH 06/44] removed `insert_at_position()` and added a TODO --- r/R/dplyr-funcs-datetime.R | 8 +------- r/tests/testthat/test-dplyr-funcs-datetime.R | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index cdd557c2087..4f7ba98b07b 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -432,7 +432,7 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { build_formats <- function(orders) { year_chars <- sprintf("%%%s", c("y", "Y")) month_chars <- sprintf("%%%s", c("m", "B", "b")) - day_chars <- sprintf("%%%s", c("d")) + day_chars <- sprintf("%%%s", "d") outcome <- switch( orders, @@ -446,9 +446,3 @@ build_formats <- function(orders) { outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-") outcome$format } - -instert_at_position <- function(string, positions, replacement) { - if (positions < 0) { - pattern <- paste0("^(.{", nchar(string) - positions, "})(.*)$") - } -} diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 7fc5a0f1e1f..47e67a346eb 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1286,6 +1286,7 @@ test_that("parse_date_time()", { ) ) + # TODO add unit tests and support for hms parts too compare_dplyr_binding( .input %>% mutate( From 34383443f37ad1717a3588a4ce8f6efb2d595573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 14 Apr 2022 18:46:47 +0300 Subject: [PATCH 07/44] trying out bypassing the windows custom logic --- r/tests/testthat/test-dplyr-funcs-datetime.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 47e67a346eb..44fd76dc9db 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -26,9 +26,9 @@ library(dplyr, warn.conflicts = FALSE) # TODO: consider reevaluating this workaround after ARROW-12980 withr::local_timezone("UTC") -if (tolower(Sys.info()[["sysname"]]) == "windows") { - withr::local_locale(LC_TIME = "C") -} +# if (tolower(Sys.info()[["sysname"]]) == "windows") { +# withr::local_locale(LC_TIME = "C") +# } test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "Pacific/Marquesas") From 3cbbc9dd82954ce5cbbb12a214847e2f5790301f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 28 Apr 2022 12:05:20 +0100 Subject: [PATCH 08/44] clean-up after merging master --- r/tests/testthat/test-dplyr-funcs-datetime.R | 60 +++----------------- 1 file changed, 8 insertions(+), 52 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 21eda8494a3..0186dc3337e 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -28,9 +28,9 @@ library(dplyr, warn.conflicts = FALSE) # TODO: consider reevaluating this workaround after ARROW-12980 withr::local_timezone("UTC") -# if (tolower(Sys.info()[["sysname"]]) == "windows") { -# withr::local_locale(LC_TIME = "C") -# } +if (tolower(Sys.info()[["sysname"]]) == "windows") { + withr::local_locale(LC_TIME = "C") +} test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "Pacific/Marquesas") @@ -1434,7 +1434,7 @@ test_that("make_difftime()", { ) %>% collect(), paste0("named `difftime` units other than: `second`, `minute`, `hour`,", - " `day`, and `week` not supported in Arrow.") + " `day`, and `week` not supported in Arrow.") ) ) @@ -1529,12 +1529,12 @@ test_that("lubridate's fast_strptime", { collect(), t_string, # arrow does not preserve the `tzone` attribute - ignore_attr = TRUE + ignore_attr = TRUE ) compare_dplyr_binding( .input %>% - mutate(y = + mutate(y = fast_strptime("68-10-07 19:04:05", format = "%y-%m-%d %H:%M:%S", lt = FALSE) ) %>% collect(), @@ -1550,13 +1550,13 @@ test_that("lubridate's fast_strptime", { collect(), t_string_y, # arrow does not preserve the `tzone` attribute - ignore_attr = TRUE + ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate( - date_short_year = + date_short_year = fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE, cutoff_2000 = 69L) ) %>% collect(), @@ -1574,47 +1574,3 @@ test_that("lubridate's fast_strptime", { warning = TRUE ) }) - -test_that("parse_date_time()", { - test_df <- tibble( - dates = c("09-01-17", "02-Sep-17") - ) - - test_dates <- tibble::tibble( - string_ymd = c( - "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", - "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", - "21:09:11", "2021 Sep 12", "2021 September 13", "21 Sep 14", - "21 September 15", - # not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19 - # no separators and %b or %B are even more complicated (and they work in - # lubridate). not to mention locale - NA - ), - string_dmy = c( - "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", - "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", - "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", - # not yet working for strings with no separators, like "10092021", "100921", - NA - ), - string_mdy = c( - "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", - "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", - "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", - # not yet working for strings with no separators, like "09102021", "091021", - NA - ) - ) - - compare_dplyr_binding( - .input %>% - mutate( - parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"), - parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"), - parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") - ) %>% - collect(), - test_dates - ) -}) \ No newline at end of file From f97fc0c092d26ef0a872f6f3b6d51634f6908c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 28 Apr 2022 12:06:02 +0100 Subject: [PATCH 09/44] add newline --- r/R/dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 3a9995066e3..15db0dcccbb 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -612,4 +612,4 @@ build_formats <- function(orders) { ) outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-") outcome$format -} \ No newline at end of file +} From 70204d12e3d5649e247d16f8b6c3c722acd66522 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 28 Apr 2022 13:22:11 +0100 Subject: [PATCH 10/44] separate locale-dependent unit tests on Windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 32 ++++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 0186dc3337e..b477be3da19 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1480,9 +1480,7 @@ test_that("parse_date_time()", { test_dates <- tibble::tibble( string_ymd = c( "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", - "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", - "21:09:11", "2021 Sep 12", "2021 September 13", "21 Sep 14", - "21 September 15", + "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", "21:09:11", # not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19 # no separators and %b or %B are even more complicated (and they work in # lubridate). not to mention locale @@ -1491,20 +1489,29 @@ test_that("parse_date_time()", { string_dmy = c( "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", - "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", # not yet working for strings with no separators, like "10092021", "100921", NA ), string_mdy = c( "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", - "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", # not yet working for strings with no separators, like "09102021", "091021", NA ) ) - # TODO add unit tests and support for hms parts too + test_dates_locale <- tibble::tibble( + string_ymd = c( + "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", NA + ), + string_dmy = c( + "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", NA + ), + string_mdy = c( + "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", NA + ) + ) + compare_dplyr_binding( .input %>% mutate( @@ -1515,6 +1522,19 @@ test_that("parse_date_time()", { collect(), test_dates ) + + # locale not working properly on windows + skip_on_os("windows") + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"), + parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"), + parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") + ) %>% + collect(), + test_dates_locale + ) }) test_that("lubridate's fast_strptime", { From 9da29c05bc9e1f11b57fbb05b5076c365bf52013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 28 Apr 2022 13:25:18 +0100 Subject: [PATCH 11/44] add reference to locale Jira --- r/tests/testthat/test-dplyr-funcs-datetime.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b477be3da19..a994c88aadf 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1524,6 +1524,7 @@ test_that("parse_date_time()", { ) # locale not working properly on windows + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-13133 is done skip_on_os("windows") compare_dplyr_binding( .input %>% From 395594cb32156c78348580588e199a6b1f8771fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 07:34:52 +0100 Subject: [PATCH 12/44] refer to the locale on Windows Jira --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a994c88aadf..becd47df257 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1524,7 +1524,7 @@ test_that("parse_date_time()", { ) # locale not working properly on windows - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-13133 is done + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16399 is done skip_on_os("windows") compare_dplyr_binding( .input %>% From f77246da084264a5a09812cbdbb5011bcc5ba57a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 08:50:51 +0100 Subject: [PATCH 13/44] skip tests on windows when the R version is older than 4.0 --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index becd47df257..40f48e9fd08 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1512,6 +1512,9 @@ test_that("parse_date_time()", { ) ) + if (R.version$major < 4) { + skip_on_os("windows") + } compare_dplyr_binding( .input %>% mutate( From ffa667fe4b30284f096f37fc64168f9f432f8ee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 12:09:08 +0100 Subject: [PATCH 14/44] replace skipping on older R on windows with skip if RE2 not available --- r/tests/testthat/test-dplyr-funcs-datetime.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 40f48e9fd08..9515809dd79 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1512,9 +1512,7 @@ test_that("parse_date_time()", { ) ) - if (R.version$major < 4) { - skip_on_os("windows") - } + skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( From 1c97ca6382cdcb820e32700855d0cf6332af3023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 15:08:09 +0100 Subject: [PATCH 15/44] error early when the user supplies `orders` we don't yet support --- r/R/dplyr-funcs-datetime.R | 14 ++++++++++++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 19 ++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 15db0dcccbb..94cc1682b38 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -390,6 +390,20 @@ register_bindings_duration <- function() { orders, tz = "UTC") { + supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") + unsupported_passed_orders <- setdiff(orders, supported_orders) + + if (length(unsupported_passed_orders) > 0) { + arrow_not_supported( + paste0( + oxford_paste( + unsupported_passed_orders + ), + " `orders`" + ) + ) + } + # make all separators (non-letters and non-numbers) into "-" x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) # collapse multiple separators into a single one diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 9515809dd79..116113a790f 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1472,11 +1472,7 @@ test_that("make_difftime()", { ) }) -test_that("parse_date_time()", { - test_df <- tibble( - dates = c("09-01-17", "02-Sep-17") - ) - +test_that("parse_date_time() with year, month, and date components", { test_dates <- tibble::tibble( string_ymd = c( "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", @@ -1512,6 +1508,19 @@ test_that("parse_date_time()", { ) ) + test_dates_times <- tibble( + date_times = c("09-01-17 12:34:56", NA) + ) + + expect_warning( + test_dates_times %>% + arrow_table() %>% + mutate(parsed_date_ymd = parse_date_time(date_times, orders = "ymd_HMS")) %>% + collect(), + '"ymd_HMS" `orders` not supported in Arrow' + ) + + skip_if_not_available("re2") compare_dplyr_binding( .input %>% From 23adca4ce26da8d674b874b830a2c03515e5041d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 15:12:28 +0100 Subject: [PATCH 16/44] simplify `build_formats()` --- r/R/dplyr-funcs-datetime.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 94cc1682b38..3dd4ce89a5e 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -611,9 +611,9 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { } build_formats <- function(orders) { - year_chars <- sprintf("%%%s", c("y", "Y")) - month_chars <- sprintf("%%%s", c("m", "B", "b")) - day_chars <- sprintf("%%%s", "d") + year_chars <- c("%y", "%Y") + month_chars <- c("%m", "%B", "%b") + day_chars <- "%d" outcome <- switch( orders, From 94bef2d84fd035af14920efcc7dfa7f06e45213f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 15:27:06 +0100 Subject: [PATCH 17/44] simplify the `fast_strptime()` implementation --- r/R/dplyr-funcs-datetime.R | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 3dd4ce89a5e..1ddf8c43482 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -369,9 +369,6 @@ register_bindings_duration <- function() { arrow_not_supported("multiple values for `format`") } - if (!missing(tz)) { - arrow_not_supported("Time zone argument") - } # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play # well with mutate, for example) if (lt) { @@ -382,9 +379,7 @@ register_bindings_duration <- function() { arrow_not_supported("`cutoff_2000` != 68L argument") } - unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) - - build_expr("strptime", x, options = list(format = format, unit = unit)) + call_binding("strptime", x, format = format, unit = unit) }) register_binding("parse_date_time", function(x, orders, From a5e2b1ac1a353cb329020bd1ef796e188f24221e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 09:26:41 +0100 Subject: [PATCH 18/44] trigger a CI failure on windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 116113a790f..5bb51f2a436 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1535,7 +1535,7 @@ test_that("parse_date_time() with year, month, and date components", { # locale not working properly on windows # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16399 is done - skip_on_os("windows") + # test skip_on_os("windows") compare_dplyr_binding( .input %>% mutate( From 93f69449a60d1ceb6efecd467b05bbfc106f71b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 09:40:19 +0100 Subject: [PATCH 19/44] remove binding and unit tests for `fast_strptime()` --- r/NEWS.md | 2 +- r/R/dplyr-funcs-datetime.R | 23 -------- r/tests/testthat/test-dplyr-funcs-datetime.R | 58 -------------------- 3 files changed, 1 insertion(+), 82 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 97d605706e2..5125477ef2d 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -30,7 +30,7 @@ * Added `as_date()` and `as_datetime()` * Added `difftime` and `as.difftime()` * Added `as.Date()` to convert to date - * Added `parse_date_time()` & `fast_strptime()` datetime parsers + * Added `parse_date_time()` datetime parser for year, month, and day components. * `median()` and `quantile()` will warn once about approximate calculations regardless of interactivity. * Removed Solaris workarounds, libarrow is now required. diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 1ddf8c43482..924ca95c1f4 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -357,30 +357,7 @@ register_bindings_duration <- function() { delta <- delta$cast(int64()) start + delta$cast(duration("s")) }) - register_binding("fast_strptime", function(x, - format, - tz = "UTC", - lt = TRUE, - cutoff_2000 = 68L, - unit = "s") { - # TODO support multiple formats once - # https://issues.apache.org/jira/browse/ARROW-15665 is done - if (length(format) > 1) { - arrow_not_supported("multiple values for `format`") - } - - # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play - # well with mutate, for example) - if (lt) { - arrow_not_supported("`lt = TRUE` argument") - } - if (cutoff_2000 != 68L) { - arrow_not_supported("`cutoff_2000` != 68L argument") - } - - call_binding("strptime", x, format = format, unit = unit) - }) register_binding("parse_date_time", function(x, orders, tz = "UTC") { diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 5bb51f2a436..d76b6308f63 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1547,61 +1547,3 @@ test_that("parse_date_time() with year, month, and date components", { test_dates_locale ) }) - -test_that("lubridate's fast_strptime", { - t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) - t_string_y <- tibble(x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)) - t_string_2formats <- tibble(x = c("2018-10-07 19:04:05", "68-10-07 19:04:05")) - t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) - - compare_dplyr_binding( - .input %>% - mutate(y = fast_strptime(x, format = "%Y-%m-%d %H:%M:%S", lt = FALSE)) %>% - collect(), - t_string, - # arrow does not preserve the `tzone` attribute - ignore_attr = TRUE - ) - - compare_dplyr_binding( - .input %>% - mutate(y = - fast_strptime("68-10-07 19:04:05", format = "%y-%m-%d %H:%M:%S", lt = FALSE) - ) %>% - collect(), - t_string, - ignore_attr = TRUE - ) - - # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is - # implicitly set to 68L both in base R and in Arrow - compare_dplyr_binding( - .input %>% - mutate(date_short_year = fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE)) %>% - collect(), - t_string_y, - # arrow does not preserve the `tzone` attribute - ignore_attr = TRUE - ) - - compare_dplyr_binding( - .input %>% - mutate( - date_short_year = - fast_strptime(x, format = "%y-%m-%d %H:%M:%S", lt = FALSE, cutoff_2000 = 69L) - ) %>% - collect(), - t_string_y, - warning = TRUE - ) - - formats <- c("%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S") - compare_dplyr_binding( - .input %>% - mutate(date_multi_formats = - fast_strptime(x, format = formats, lt = FALSE)) %>% - collect(), - t_string_2formats, - warning = TRUE - ) -}) From e223f515bc9045b786b06f78a10196d88fb20e3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 10:33:16 +0100 Subject: [PATCH 20/44] move `HMS` failing (expected) tests to a separate block --- r/tests/testthat/test-dplyr-funcs-datetime.R | 31 ++++++++++---------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index d76b6308f63..5cd2b2d01ae 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1472,7 +1472,7 @@ test_that("make_difftime()", { ) }) -test_that("parse_date_time() with year, month, and date components", { +test_that("parse_date_time() works with year, month, and date components", { test_dates <- tibble::tibble( string_ymd = c( "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", @@ -1508,19 +1508,6 @@ test_that("parse_date_time() with year, month, and date components", { ) ) - test_dates_times <- tibble( - date_times = c("09-01-17 12:34:56", NA) - ) - - expect_warning( - test_dates_times %>% - arrow_table() %>% - mutate(parsed_date_ymd = parse_date_time(date_times, orders = "ymd_HMS")) %>% - collect(), - '"ymd_HMS" `orders` not supported in Arrow' - ) - - skip_if_not_available("re2") compare_dplyr_binding( .input %>% @@ -1535,7 +1522,7 @@ test_that("parse_date_time() with year, month, and date components", { # locale not working properly on windows # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16399 is done - # test skip_on_os("windows") + skip_on_os("windows") compare_dplyr_binding( .input %>% mutate( @@ -1547,3 +1534,17 @@ test_that("parse_date_time() with year, month, and date components", { test_dates_locale ) }) + +test_that("parse_date_time() doesn't work with hour, minutes, and second components", { + test_dates_times <- tibble( + date_times = c("09-01-17 12:34:56", NA) + ) + + expect_warning( + test_dates_times %>% + arrow_table() %>% + mutate(parsed_date_ymd = parse_date_time(date_times, orders = "ymd_HMS")) %>% + collect(), + '"ymd_HMS" `orders` not supported in Arrow' + ) +}) From 16c634d03911538be1df68c478b292ccc0162dd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 10:56:32 +0100 Subject: [PATCH 21/44] test to figure out the CI `LC_TIME` locale --- r/tests/testthat/test-dplyr-funcs-datetime.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 5cd2b2d01ae..698483d5629 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1508,6 +1508,8 @@ test_that("parse_date_time() works with year, month, and date components", { ) ) + Sys.getlocale(category = "LC_TIME") + skip_if_not_available("re2") compare_dplyr_binding( .input %>% @@ -1522,7 +1524,7 @@ test_that("parse_date_time() works with year, month, and date components", { # locale not working properly on windows # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16399 is done - skip_on_os("windows") + # test skip_on_os("windows") compare_dplyr_binding( .input %>% mutate( From e0a4db612823c42ee652e356f2326aa07c03fbaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 10:59:17 +0100 Subject: [PATCH 22/44] test --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 698483d5629..5382f56f801 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1510,6 +1510,9 @@ test_that("parse_date_time() works with year, month, and date components", { Sys.getlocale(category = "LC_TIME") + a <- Array$create("2021 September 13") + call_function("strptime", a, options = list(format = "%Y %B %d", unit = 0L)) + skip_if_not_available("re2") compare_dplyr_binding( .input %>% From c89cca85a9541d8174d1573f5029a224aa9d29ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 11:40:14 +0100 Subject: [PATCH 23/44] set locale to `"en_US.UTF-8"` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 5382f56f801..204ac5353ea 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1508,11 +1508,6 @@ test_that("parse_date_time() works with year, month, and date components", { ) ) - Sys.getlocale(category = "LC_TIME") - - a <- Array$create("2021 September 13") - call_function("strptime", a, options = list(format = "%Y %B %d", unit = 0L)) - skip_if_not_available("re2") compare_dplyr_binding( .input %>% @@ -1526,8 +1521,11 @@ test_that("parse_date_time() works with year, month, and date components", { ) # locale not working properly on windows - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16399 is done - # test skip_on_os("windows") + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done + # skip_on_os("windows") + if (tolower(Sys.info()[["sysname"]]) == "windows") { + withr::local_locale(LC_TIME = "en_US.UTF-8") + } compare_dplyr_binding( .input %>% mutate( From 683f6ad37d42975ff5acbb43b413a184956e6d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 11:43:56 +0100 Subject: [PATCH 24/44] `LC_TIME="C"` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 204ac5353ea..f1c479cf6fd 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1524,7 +1524,7 @@ test_that("parse_date_time() works with year, month, and date components", { # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done # skip_on_os("windows") if (tolower(Sys.info()[["sysname"]]) == "windows") { - withr::local_locale(LC_TIME = "en_US.UTF-8") + withr::local_locale(LC_TIME = "C") } compare_dplyr_binding( .input %>% From 203f0fa24c83074cbd591977dd7a47b5034c5050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 11:53:47 +0100 Subject: [PATCH 25/44] final version with skip on Windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index f1c479cf6fd..96dc14c538a 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1522,10 +1522,7 @@ test_that("parse_date_time() works with year, month, and date components", { # locale not working properly on windows # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done - # skip_on_os("windows") - if (tolower(Sys.info()[["sysname"]]) == "windows") { - withr::local_locale(LC_TIME = "C") - } + skip_on_os("windows") compare_dplyr_binding( .input %>% mutate( From 5774f1000a0f14d010bb528c6f11be6c50a893aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 14:31:39 +0100 Subject: [PATCH 26/44] move test data frames definition inside `compare_dplyr_binding()` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 72 +++++++++----------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 96dc14c538a..2a3de490e14 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1473,41 +1473,6 @@ test_that("make_difftime()", { }) test_that("parse_date_time() works with year, month, and date components", { - test_dates <- tibble::tibble( - string_ymd = c( - "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", - "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", "21:09:11", - # not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19 - # no separators and %b or %B are even more complicated (and they work in - # lubridate). not to mention locale - NA - ), - string_dmy = c( - "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", - "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", - # not yet working for strings with no separators, like "10092021", "100921", - NA - ), - string_mdy = c( - "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", - "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", - # not yet working for strings with no separators, like "09102021", "091021", - NA - ) - ) - - test_dates_locale <- tibble::tibble( - string_ymd = c( - "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", NA - ), - string_dmy = c( - "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", NA - ), - string_mdy = c( - "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", NA - ) - ) - skip_if_not_available("re2") compare_dplyr_binding( .input %>% @@ -1517,10 +1482,31 @@ test_that("parse_date_time() works with year, month, and date components", { parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") ) %>% collect(), - test_dates + tibble::tibble( + string_ymd = c( + "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", + "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", "21:09:11", + # not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19 + # no separators and %b or %B are even more complicated (and they work in + # lubridate). not to mention locale + NA + ), + string_dmy = c( + "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", + "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", + # not yet working for strings with no separators, like "10092021", "100921", + NA + ), + string_mdy = c( + "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", + "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", + # not yet working for strings with no separators, like "09102021", "091021", + NA + ) + ) ) - # locale not working properly on windows + # locale (affecting "%b% and "%B" formats) does not work properly on Windows # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done skip_on_os("windows") compare_dplyr_binding( @@ -1531,7 +1517,17 @@ test_that("parse_date_time() works with year, month, and date components", { parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") ) %>% collect(), - test_dates_locale + tibble::tibble( + string_ymd = c( + "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", NA + ), + string_dmy = c( + "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", NA + ), + string_mdy = c( + "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", NA + ) + ) ) }) From ad2596bc7e69f4dac1c801dca0918a46f5376f50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 3 May 2022 14:52:50 +0100 Subject: [PATCH 27/44] added a Jira to update `parse_date_time` to accept strings without separators --- r/R/dplyr-funcs-datetime.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 924ca95c1f4..37234d50304 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -381,7 +381,8 @@ register_bindings_duration <- function() { # collapse multiple separators into a single one x <- call_binding("gsub", "-{2,}", "-", x) - # TODO figure out how to parse strings that have no separators) + # TODO figure out how to parse strings that have no separators + # https://issues.apache.org/jira/browse/ARROW-16446 # we could insert separators at the "likely" positions, but it might be # tricky given the possible combinations between dmy formats + locale From 9b6309a852aa15ffa2ee149e57b2c19091a0ff3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 4 May 2022 14:53:19 +0100 Subject: [PATCH 28/44] moved `build_formats()` to helpers --- r/R/dplyr-datetime-helpers.R | 18 ++++++++++++++++++ r/R/dplyr-funcs-datetime.R | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 9acf8b18435..fbd2f8d48ff 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -154,3 +154,21 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { x } + +build_formats <- function(orders) { + year_chars <- c("%y", "%Y") + month_chars <- c("%m", "%B", "%b") + day_chars <- "%d" + + outcome <- switch( + orders, + "ymd" = expand.grid(year_chars, month_chars, day_chars), + "ydm" = expand.grid(year_chars, day_chars, month_chars), + "mdy" = expand.grid(month_chars, day_chars, year_chars), + "myd" = expand.grid(month_chars, year_chars, day_chars), + "dmy" = expand.grid(day_chars, month_chars, year_chars), + "dym" = expand.grid(day_chars, year_chars, month_chars) + ) + outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-") + outcome$format +} diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 7b8910729ad..0025171d765 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -528,21 +528,3 @@ register_bindings_duration_helpers <- function() { abort("Duration in picoseconds not supported in Arrow.") }) } - -build_formats <- function(orders) { - year_chars <- c("%y", "%Y") - month_chars <- c("%m", "%B", "%b") - day_chars <- "%d" - - outcome <- switch( - orders, - "ymd" = expand.grid(year_chars, month_chars, day_chars), - "ydm" = expand.grid(year_chars, day_chars, month_chars), - "mdy" = expand.grid(month_chars, day_chars, year_chars), - "myd" = expand.grid(month_chars, year_chars, day_chars), - "dmy" = expand.grid(day_chars, month_chars, year_chars), - "dym" = expand.grid(day_chars, year_chars, month_chars) - ) - outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-") - outcome$format -} From 88beefbc4149140add7de04dc1118555ad12439c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 4 May 2022 14:57:33 +0100 Subject: [PATCH 29/44] register datetime parsers separately --- r/R/dplyr-funcs-datetime.R | 89 ++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 43 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 0025171d765..e64d4d1b270 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -24,6 +24,7 @@ register_bindings_datetime <- function() { register_bindings_duration() register_bindings_duration_constructor() register_bindings_duration_helpers() + register_bindings_datetime_parsers() } register_bindings_datetime_utility <- function() { @@ -432,49 +433,6 @@ register_bindings_duration <- function() { build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) }) - - register_binding("parse_date_time", function(x, - orders, - tz = "UTC") { - - supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") - unsupported_passed_orders <- setdiff(orders, supported_orders) - - if (length(unsupported_passed_orders) > 0) { - arrow_not_supported( - paste0( - oxford_paste( - unsupported_passed_orders - ), - " `orders`" - ) - ) - } - - # make all separators (non-letters and non-numbers) into "-" - x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) - # collapse multiple separators into a single one - x <- call_binding("gsub", "-{2,}", "-", x) - - # TODO figure out how to parse strings that have no separators - # https://issues.apache.org/jira/browse/ARROW-16446 - # we could insert separators at the "likely" positions, but it might be - # tricky given the possible combinations between dmy formats + locale - - # each order is translated into 6 possible formats - formats <- build_formats(orders) - coalesce_output <- build_expr( - "coalesce", - build_expr("strptime", x, options = list(format = formats[1], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[2], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[3], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[4], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[5], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[6], unit = 0L, error_is_null = TRUE)) - ) - - build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) - }) } register_bindings_duration_constructor <- function() { @@ -528,3 +486,48 @@ register_bindings_duration_helpers <- function() { abort("Duration in picoseconds not supported in Arrow.") }) } + +register_bindings_datetime_parsers <- function() { + register_binding("parse_date_time", function(x, + orders, + tz = "UTC") { + + supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") + unsupported_passed_orders <- setdiff(orders, supported_orders) + + if (length(unsupported_passed_orders) > 0) { + arrow_not_supported( + paste0( + oxford_paste( + unsupported_passed_orders + ), + " `orders`" + ) + ) + } + + # make all separators (non-letters and non-numbers) into "-" + x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) + # collapse multiple separators into a single one + x <- call_binding("gsub", "-{2,}", "-", x) + + # TODO figure out how to parse strings that have no separators + # https://issues.apache.org/jira/browse/ARROW-16446 + # we could insert separators at the "likely" positions, but it might be + # tricky given the possible combinations between dmy formats + locale + + # each order is translated into 6 possible formats + formats <- build_formats(orders) + coalesce_output <- build_expr( + "coalesce", + build_expr("strptime", x, options = list(format = formats[1], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[2], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[3], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[4], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[5], unit = 0L, error_is_null = TRUE)), + build_expr("strptime", x, options = list(format = formats[6], unit = 0L, error_is_null = TRUE)) + ) + + build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) + }) +} From 3717d49676da13c7ff92e3fc54aab4de3b96f713 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 4 May 2022 15:26:49 +0100 Subject: [PATCH 30/44] style --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index de2dba48d64..b718d0735f3 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1692,5 +1692,5 @@ test_that("parse_date_time() doesn't work with hour, minutes, and second compone mutate(parsed_date_ymd = parse_date_time(date_times, orders = "ymd_HMS")) %>% collect(), '"ymd_HMS" `orders` not supported in Arrow' - ) + ) }) From c6f9e0037e180e121c63749a676e576011c71ce7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 5 May 2022 11:04:41 +0100 Subject: [PATCH 31/44] a more elegant implementation (no hardcoding) --- r/R/dplyr-funcs-datetime.R | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index e64d4d1b270..96e7ff56e0c 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -518,15 +518,19 @@ register_bindings_datetime_parsers <- function() { # each order is translated into 6 possible formats formats <- build_formats(orders) - coalesce_output <- build_expr( - "coalesce", - build_expr("strptime", x, options = list(format = formats[1], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[2], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[3], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[4], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[5], unit = 0L, error_is_null = TRUE)), - build_expr("strptime", x, options = list(format = formats[6], unit = 0L, error_is_null = TRUE)) - ) + + # build a list of expressions for each format + parse_attempt_expressions <- list() + + for (i in seq_along(formats)) { + parse_attempt_expressions[[i]] <- build_expr( + "strptime", + x, + options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE) + ) + } + + coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) }) From 98890c5b43ec1f6b81b4908bbe35a6040c2914ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 5 May 2022 16:29:30 +0100 Subject: [PATCH 32/44] accept `orders` longer than 1 --- r/R/dplyr-datetime-helpers.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index fbd2f8d48ff..b8397ddfb0b 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -156,12 +156,17 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { } build_formats <- function(orders) { + formats_list <- map(orders, build_format_from_order) + purrr::flatten_chr(formats_list) +} + +build_format_from_order <- function(order) { year_chars <- c("%y", "%Y") month_chars <- c("%m", "%B", "%b") day_chars <- "%d" outcome <- switch( - orders, + order, "ymd" = expand.grid(year_chars, month_chars, day_chars), "ydm" = expand.grid(year_chars, day_chars, month_chars), "mdy" = expand.grid(month_chars, day_chars, year_chars), From 5dd5247677901eafa7f3de99f5bfaf339fc0b0e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 09:50:25 +0100 Subject: [PATCH 33/44] updated NEWS with `parse_date_time()` limitations --- r/NEWS.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 8e6af2efa94..f8e05045ee6 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -16,8 +16,15 @@ specific language governing permissions and limitations under the License. --> +# arrow 8.0.0.9000 -# arrow 7.0.0.9000 + * `lubridate::parse_date_time()` datetime parser (only for year, month, and day components with separators). Not all functionality has been implemented: + * the `orders` argument in the Arrow binding works slightly different than in `lubridate::parse_date_time()` and closer to `lubridate::parse_date_time2()` and `lubridate::fast_strptime()`: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place. `lubridate::parse_date_time()` usually "trains" formats on a subset of the input vector and then applies them according to performance on the training set. + * currently only `orders` containing the year, month, and day components are supported. In a future release `orders` containing other datetime components (such as hours, minutes, seconds, etc) will be supported. + * `orders` currently only works with shorter formats (without the `%`). Supported formats: `"ymd"`, `"ydm"`, `"mdy"`, `"myd"`, `"dmy"`, and `"dym"`. + * strings with no separators (e.g. 20210917) will likely fail to parse so you might want to try using the `strptime()` binding instead. + +# arrow 8.0.0 ## Enhancements to dplyr and datasets @@ -62,7 +69,6 @@ * `?lubridate::duration` helper functions, such as `dyears()`, `dhours()`, `dseconds()`. * `lubridate::leap_year()` * `lubridate::as_date()` and `lubridate::as_datetime()` - * `lubridate::parse_date_time()` datetime parser (only for year, month, and day components with separators). * Also for Arrow dplyr queries, added support and fixes for base date and time functions: * `base::difftime` and `base::as.difftime()` * `base::as.Date()` to convert to date From db274ce331ae407b62fbfc29f7977eeca01044bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 10:00:54 +0100 Subject: [PATCH 34/44] switch back to 7.0.0.9000 --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index f8e05045ee6..1f2d430eca5 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,7 +24,7 @@ * `orders` currently only works with shorter formats (without the `%`). Supported formats: `"ymd"`, `"ydm"`, `"mdy"`, `"myd"`, `"dmy"`, and `"dym"`. * strings with no separators (e.g. 20210917) will likely fail to parse so you might want to try using the `strptime()` binding instead. -# arrow 8.0.0 +# arrow 7.0.0.9000 ## Enhancements to dplyr and datasets From dc36fd0464db4bd40eaa238a1cf885d78b54b4b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 10:09:10 +0100 Subject: [PATCH 35/44] one more try --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 1f2d430eca5..7be7adf1f48 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License. --> -# arrow 8.0.0.9000 +# development version * `lubridate::parse_date_time()` datetime parser (only for year, month, and day components with separators). Not all functionality has been implemented: * the `orders` argument in the Arrow binding works slightly different than in `lubridate::parse_date_time()` and closer to `lubridate::parse_date_time2()` and `lubridate::fast_strptime()`: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place. `lubridate::parse_date_time()` usually "trains" formats on a subset of the input vector and then applies them according to performance on the training set. From de5252497c9d066317d90cbd87f5007e1e8a6dcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 10:26:26 +0100 Subject: [PATCH 36/44] allow orders to have `"%"` (which get stripped out) and move checks on orders to `build_formats()` --- r/NEWS.md | 1 - r/R/dplyr-datetime-helpers.R | 20 ++++++++++++++++++++ r/R/dplyr-funcs-datetime.R | 18 ++---------------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 7be7adf1f48..2b8e71226e9 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -21,7 +21,6 @@ * `lubridate::parse_date_time()` datetime parser (only for year, month, and day components with separators). Not all functionality has been implemented: * the `orders` argument in the Arrow binding works slightly different than in `lubridate::parse_date_time()` and closer to `lubridate::parse_date_time2()` and `lubridate::fast_strptime()`: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place. `lubridate::parse_date_time()` usually "trains" formats on a subset of the input vector and then applies them according to performance on the training set. * currently only `orders` containing the year, month, and day components are supported. In a future release `orders` containing other datetime components (such as hours, minutes, seconds, etc) will be supported. - * `orders` currently only works with shorter formats (without the `%`). Supported formats: `"ymd"`, `"ydm"`, `"mdy"`, `"myd"`, `"dmy"`, and `"dym"`. * strings with no separators (e.g. 20210917) will likely fail to parse so you might want to try using the `strptime()` binding instead. # arrow 7.0.0.9000 diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index b8397ddfb0b..f11b2ddcaa4 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -156,6 +156,26 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { } build_formats <- function(orders) { + # only keep the letters and the underscore as separator -> allow the users to + # pass strptime-like formats (with "%") + orders <- str_remove_all(orders, "[^A-Za-z_]") + + supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") + unsupported_passed_orders <- setdiff(orders, supported_orders) + supported_passed_orders <- intersect(orders, supported_orders) + + # error only if there isn't at least one valid order we can try + if (length(supported_passed_orders) == 0) { + arrow_not_supported( + paste0( + oxford_paste( + unsupported_passed_orders + ), + " `orders`" + ) + ) + } + formats_list <- map(orders, build_format_from_order) purrr::flatten_chr(formats_list) } diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 96e7ff56e0c..1e56dcf8615 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -492,19 +492,8 @@ register_bindings_datetime_parsers <- function() { orders, tz = "UTC") { - supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") - unsupported_passed_orders <- setdiff(orders, supported_orders) - - if (length(unsupported_passed_orders) > 0) { - arrow_not_supported( - paste0( - oxford_paste( - unsupported_passed_orders - ), - " `orders`" - ) - ) - } + # each order is translated into possible formats + formats <- build_formats(orders) # make all separators (non-letters and non-numbers) into "-" x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) @@ -516,9 +505,6 @@ register_bindings_datetime_parsers <- function() { # we could insert separators at the "likely" positions, but it might be # tricky given the possible combinations between dmy formats + locale - # each order is translated into 6 possible formats - formats <- build_formats(orders) - # build a list of expressions for each format parse_attempt_expressions <- list() From 0d4cc6a04b40a86bba8d024eb0bdda335d06f7dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 12:22:58 +0100 Subject: [PATCH 37/44] stop processing formats passed via orders. assume they are correct as passed --- r/R/dplyr-datetime-helpers.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index f11b2ddcaa4..015c0f07fdd 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -156,13 +156,14 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { } build_formats <- function(orders) { - # only keep the letters and the underscore as separator -> allow the users to - # pass strptime-like formats (with "%") - orders <- str_remove_all(orders, "[^A-Za-z_]") + # allow the users to pass strptime-like formats (with "%") + # separate formats (strings containing "%") from orders + formats_passsed_via_orders <- grep("%", orders, value = TRUE) + pure_orders <- grep("%", orders, value = TRUE, invert = TRUE) supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") - unsupported_passed_orders <- setdiff(orders, supported_orders) - supported_passed_orders <- intersect(orders, supported_orders) + unsupported_passed_orders <- setdiff(pure_orders, supported_orders) + supported_passed_orders <- intersect(pure_orders, supported_orders) # error only if there isn't at least one valid order we can try if (length(supported_passed_orders) == 0) { @@ -177,7 +178,8 @@ build_formats <- function(orders) { } formats_list <- map(orders, build_format_from_order) - purrr::flatten_chr(formats_list) + formats_from_order <- purrr::flatten_chr(formats_list) + c(formats_passsed_via_orders, formats_from_order) } build_format_from_order <- function(order) { From a6b242b38c534e4e2e71816824008d4d4868e77d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 12:48:46 +0100 Subject: [PATCH 38/44] return to processing `orders` with `"%"` into `formats` --- r/R/dplyr-datetime-helpers.R | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 015c0f07fdd..47714ffb084 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -156,14 +156,16 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { } build_formats <- function(orders) { - # allow the users to pass strptime-like formats (with "%") - # separate formats (strings containing "%") from orders - formats_passsed_via_orders <- grep("%", orders, value = TRUE) - pure_orders <- grep("%", orders, value = TRUE, invert = TRUE) + + # only keep the letters and the underscore as separator -> allow the users to + # pass strptime-like formats (with "%"). Processing is neede (instead of passing + # formats as-is) due to the processing of the character vector in parse_date_time() + orders <- gsub("[^A-Za-z_]", "", orders) + orders <- gsub("Y", "y", orders) supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") - unsupported_passed_orders <- setdiff(pure_orders, supported_orders) - supported_passed_orders <- intersect(pure_orders, supported_orders) + unsupported_passed_orders <- setdiff(orders, supported_orders) + supported_passed_orders <- intersect(orders, supported_orders) # error only if there isn't at least one valid order we can try if (length(supported_passed_orders) == 0) { @@ -178,8 +180,7 @@ build_formats <- function(orders) { } formats_list <- map(orders, build_format_from_order) - formats_from_order <- purrr::flatten_chr(formats_list) - c(formats_passsed_via_orders, formats_from_order) + purrr::flatten_chr(formats_list) } build_format_from_order <- function(order) { From 96220c4cbb23e7b62dc1aad61ea529b241db26b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 12:49:14 +0100 Subject: [PATCH 39/44] add a test with combi formats and orders + combi strings to be parsed --- r/tests/testthat/test-dplyr-funcs-datetime.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b718d0735f3..e8cb4437070 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1681,6 +1681,24 @@ test_that("parse_date_time() works with year, month, and date components", { ) }) +test_that("parse_date_time() works with a mix of formats and orders", { + test_df <- tibble( + string_combi = c("2021-09-1", "2/09//2021", "09.3.2021") + ) + + compare_dplyr_binding( + .input %>% + mutate( + date_from_string = parse_date_time( + string_combi, + orders = c("ymd", "%d/%m//%Y", "%m.%d.%Y") + ) + ) %>% + collect(), + test_df + ) +}) + test_that("parse_date_time() doesn't work with hour, minutes, and second components", { test_dates_times <- tibble( date_times = c("09-01-17 12:34:56", NA) From db777bffdb00b56342e98fa9bcd773e4c496a7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 13:59:07 +0100 Subject: [PATCH 40/44] typo --- r/R/dplyr-datetime-helpers.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 47714ffb084..22fd4b1173e 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -156,9 +156,8 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { } build_formats <- function(orders) { - # only keep the letters and the underscore as separator -> allow the users to - # pass strptime-like formats (with "%"). Processing is neede (instead of passing + # pass strptime-like formats (with "%"). Processing is needed (instead of passing # formats as-is) due to the processing of the character vector in parse_date_time() orders <- gsub("[^A-Za-z_]", "", orders) orders <- gsub("Y", "y", orders) From 9471e5f482352d2477b434353a5f48afddd35a40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 6 May 2022 14:53:12 +0100 Subject: [PATCH 41/44] skip when RE2 not available and add clarifying comments --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e8cb4437070..602f616bb9a 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1623,6 +1623,7 @@ test_that("`as_datetime()`", { }) test_that("parse_date_time() works with year, month, and date components", { + # string processing requires RE2 library (not available on Windows with R 3.6) skip_if_not_available("re2") compare_dplyr_binding( .input %>% @@ -1682,6 +1683,8 @@ test_that("parse_date_time() works with year, month, and date components", { }) test_that("parse_date_time() works with a mix of formats and orders", { + # string processing requires RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") test_df <- tibble( string_combi = c("2021-09-1", "2/09//2021", "09.3.2021") ) From d1da8b2cd3ac7fddba5c185c76a95bb08bd12ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 9 May 2022 15:49:53 +0100 Subject: [PATCH 42/44] hopefully made NEWS entry a bit clearer --- r/NEWS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 2b8e71226e9..40c70a7e64a 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -18,10 +18,10 @@ --> # development version - * `lubridate::parse_date_time()` datetime parser (only for year, month, and day components with separators). Not all functionality has been implemented: - * the `orders` argument in the Arrow binding works slightly different than in `lubridate::parse_date_time()` and closer to `lubridate::parse_date_time2()` and `lubridate::fast_strptime()`: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place. `lubridate::parse_date_time()` usually "trains" formats on a subset of the input vector and then applies them according to performance on the training set. - * currently only `orders` containing the year, month, and day components are supported. In a future release `orders` containing other datetime components (such as hours, minutes, seconds, etc) will be supported. - * strings with no separators (e.g. 20210917) will likely fail to parse so you might want to try using the `strptime()` binding instead. + * `lubridate::parse_date_time()` datetime parser: + * currently only `orders` containing the year, month, and day components are supported. In a future release `orders` containing other datetime components (such as hours, minutes, seconds, etc) will be added. + * strings with no separators (e.g. `"20210917"`) could be ambiguous and are not yet supported. + * the `orders` argument in the Arrow binding worksas follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`). # arrow 7.0.0.9000 From 362681da63ff6b1f484321e2433d4171a5413222 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 9 May 2022 15:55:07 +0100 Subject: [PATCH 43/44] another minor NEWS update --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 40c70a7e64a..4fe1933ab78 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,7 +19,7 @@ # development version * `lubridate::parse_date_time()` datetime parser: - * currently only `orders` containing the year, month, and day components are supported. In a future release `orders` containing other datetime components (such as hours, minutes, seconds, etc) will be added. + * currently parses only `orders` with year, month, and day components. In a future release `orders` support for other datetime components (such as hours, minutes, seconds, etc) will be added. * strings with no separators (e.g. `"20210917"`) could be ambiguous and are not yet supported. * the `orders` argument in the Arrow binding worksas follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`). From 904a90e78fa26c3cc9097c9e460cd12653088fe4 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 9 May 2022 14:29:03 -0500 Subject: [PATCH 44/44] Update r/NEWS.md --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 4fe1933ab78..ed9b600050f 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -21,7 +21,7 @@ * `lubridate::parse_date_time()` datetime parser: * currently parses only `orders` with year, month, and day components. In a future release `orders` support for other datetime components (such as hours, minutes, seconds, etc) will be added. * strings with no separators (e.g. `"20210917"`) could be ambiguous and are not yet supported. - * the `orders` argument in the Arrow binding worksas follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`). + * the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`). # arrow 7.0.0.9000