From 582d2950185f34dd2446739d07954e5220a70abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 11:15:48 +0100 Subject: [PATCH 01/47] a nicer, mode extensible version of `build_format_from_order()` (less hardcoding) --- r/R/dplyr-datetime-helpers.R | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index bc1a13075dc..8b4b62dea02 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -201,19 +201,15 @@ build_formats <- function(orders) { } build_format_from_order <- function(order) { - year_chars <- c("%y", "%Y") - month_chars <- c("%m", "%B", "%b") - day_chars <- "%d" - - outcome <- switch( - order, - "ymd" = expand.grid(year_chars, month_chars, day_chars), - "ydm" = expand.grid(year_chars, day_chars, month_chars), - "mdy" = expand.grid(month_chars, day_chars, year_chars), - "myd" = expand.grid(month_chars, year_chars, day_chars), - "dmy" = expand.grid(day_chars, month_chars, year_chars), - "dym" = expand.grid(day_chars, year_chars, month_chars) + char_list <- list( + "y" = c("%y", "%Y"), + "m" = c("%m", "%B", "%b"), + "d" = "%d" ) - outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-") - outcome$format + + split_order <- strsplit(order, split = "")[[1]] + + outcome <- expand.grid(char_list[split_order]) + format <- do.call(paste, c(outcome, sep = "-")) + format } From c636bcb151afdf7521aea6ba99d17d112318feb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 12:09:12 +0100 Subject: [PATCH 02/47] extend the supported orders + unit tests --- r/R/dplyr-datetime-helpers.R | 12 ++++++++++-- r/tests/testthat/test-dplyr-funcs-datetime.R | 14 -------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 8b4b62dea02..41a0fd31a3a 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -179,7 +179,11 @@ build_formats <- function(orders) { orders <- unique(c(orders1, orders2)) } - supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") + supported_orders <- c( + "ymd", "ydm", "mdy", "myd", "dmy", "dym", + "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS", + "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H") + unsupported_passed_orders <- setdiff(orders, supported_orders) supported_passed_orders <- intersect(orders, supported_orders) @@ -204,9 +208,13 @@ build_format_from_order <- function(order) { char_list <- list( "y" = c("%y", "%Y"), "m" = c("%m", "%B", "%b"), - "d" = "%d" + "d" = "%d", + "H" = "%H", + "M" = "%M", + "S" = "%S" ) + order <- gsub("_", "", order) split_order <- strsplit(order, split = "")[[1]] outcome <- expand.grid(char_list[split_order]) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index bcea41b0521..8a864cd5177 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1744,20 +1744,6 @@ test_that("parse_date_time() works with a mix of formats and orders", { ) }) -test_that("parse_date_time() doesn't work with hour, minutes, and second components", { - test_dates_times <- tibble( - date_times = c("09-01-17 12:34:56", NA) - ) - - expect_warning( - test_dates_times %>% - arrow_table() %>% - mutate(parsed_date_ymd = parse_date_time(date_times, orders = "ymd_HMS")) %>% - collect(), - '"ymd_HMS" `orders` not supported in Arrow' - ) -}) - test_that("year, month, day date/time parsers", { test_df <- tibble::tibble( ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"), From c38a993ebd8209d24a07fbd537c85b528d8ddb2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 14:16:24 +0100 Subject: [PATCH 03/47] add support for `truncated` and `quiet` arguments --- r/R/dplyr-datetime-helpers.R | 8 +++++--- r/R/dplyr-funcs-datetime.R | 12 +++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 41a0fd31a3a..c8b9b7b6e97 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -179,10 +179,12 @@ build_formats <- function(orders) { orders <- unique(c(orders1, orders2)) } - supported_orders <- c( - "ymd", "ydm", "mdy", "myd", "dmy", "dym", + ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") + ymd_hms_orders <- c( "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS", - "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H") + "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H" + ) + supported_orders <- c(ymd_orders, ymd_hms_orders) unsupported_passed_orders <- setdiff(orders, supported_orders) supported_passed_orders <- intersect(orders, supported_orders) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index bc0ddc4eed1..b12fdfac685 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -520,7 +520,17 @@ register_bindings_duration_helpers <- function() { register_bindings_datetime_parsers <- function() { register_binding("parse_date_time", function(x, orders, - tz = "UTC") { + tz = "UTC", + truncated = 0, + quiet = TRUE) { + if (!quiet) { + arrow_not_supported("`quiet = FALSE`") + } + + if (truncated != 0) { + # build several orders for truncated formats + orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop = nchar(orders) - .x)) + } # each order is translated into possible formats formats <- build_formats(orders) From 5b2445a7e46c3cc80f7d699e7fca7dbc30c36945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 14:16:50 +0100 Subject: [PATCH 04/47] more unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 78 ++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 8a864cd5177..492041e05bd 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1876,6 +1876,10 @@ test_that("lubridate's fast_strptime", { ) ) + # these functions' internals use some string processing which requires the + # RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + compare_dplyr_binding( .input %>% mutate( @@ -1950,4 +1954,78 @@ test_that("lubridate's fast_strptime", { ) %>% collect() ) + + test_dates_times2 <- tibble( + ymd_hms_string = c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", NA), + ymd_hm_string = c("67-Jan-09 12:34", "1970-June-22 20:13", NA), + ymd_h_string = c("67-Jan-09 12", "1970-June-22 20", NA), + dmy_hms_string = c("09-Jan-67 12:34:56", "22-June-1970 20:13:59", NA), + dmy_hm_string = c("09-Jan-67 12:34", "22-June-1970 20:13", NA), + dmy_h_string = c("09-Jan-67 12", "22-June-1970 20", NA), + mdy_hms_string = c("Jan-09-67 12:34:56", "June-22-1970 20:13:59", NA), + mdy_hm_string = c("Jan-09-67 12:34", "June-22-1970 20:13", NA), + mdy_h_string = c("Jan-09-67 12", "June-22-1970 20", NA), + ydm_hms_string = c("67-09-Jan 12:34:56", "1970-22-June 20:13:59", NA), + ydm_hm_string = c("67-09-Jan 12:34", "1970-22-June 20:13", NA), + ydm_h_string = c("67-09-Jan 12", "1970-22-June 20", NA) + ) + + compare_dplyr_binding( + .input %>% + mutate( + ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"), + ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM"), + ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"), + dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"), + dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM"), + dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"), + mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"), + mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM"), + mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), + ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), + ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM"), + ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H"), + .keep = "used" + ) %>% + collect(), + test_dates_times2 + ) + + # test truncated formats + compare_dplyr_binding( + .input %>% + mutate( + dttm = + parse_date_time( + truncated_ymd_string, + orders = "ymd_HMS", + truncated = 3 + ) + ) %>% + collect(), + tibble( + truncated_ymd_string = + c( + "2022-05-19 13:46:51", + "2022-05-18 13:46", + "2022-05-17 13", + "2022-05-16" + ) + ) + ) + + # we need expect_warning twice as both the arrow pipeline (because quiet = + # FALSE is not supported) and the fallback dplyr/lubridate one throw + # warnings (the lubridate one because quiet is FALSE) + expect_warning( + expect_warning( + tibble(x = c("2022-05-19 13:46:51")) %>% + arrow_table() %>% + mutate( + x_dttm = parse_date_time(x, orders = "dmy_HMS", quiet = FALSE) + ) %>% + collect(), + "`quiet = FALSE` not supported in Arrow" + ) + ) }) From c5e275225963d700aefcb5864b0d3d5a191bf2fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 14:31:09 +0100 Subject: [PATCH 05/47] all tests back in --- r/tests/testthat/test-dplyr-funcs-datetime.R | 75 +++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 492041e05bd..b28df7141c0 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1954,6 +1954,79 @@ test_that("lubridate's fast_strptime", { ) %>% collect() ) +}) + +test_that("parse_date_time with hours, minutes and seconds components", { + test_dates_times <- tibble( + ymd_hms_string = c("67-01-09 12:34:56", "1970-05-22 20:13:59", NA), + ymd_hm_string = c("67-01-09 12:34", "1970-05-22 20:13", NA), + ymd_h_string = c("67-01-09 12", "1970-05-22 20", NA), + dmy_hms_string = c("09-01-67 12:34:56", "22-05-1970 20:13:59", NA), + dmy_hm_string = c("09-01-67 12:34", "22-05-1970 20:13", NA), + dmy_h_string = c("09-01-67 12", "22-05-1970 20", NA), + mdy_hms_string = c("01-09-67 12:34:56", "05-22-1970 20:13:59", NA), + mdy_hm_string = c("01-09-67 12:34", "05-22-1970 20:13", NA), + mdy_h_string = c("01-09-67 12", "05-22-1970 20", NA), + ydm_hms_string = c("67-09-01 12:34:56", "1970-22-05 20:13:59", NA), + ydm_hm_string = c("67-09-01 12:34", "1970-22-05 20:13", NA), + ydm_h_string = c("67-09-01 12", "1970-22-05 20", NA) + ) + + # these functions' internals use some string processing which requires the + # RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + + compare_dplyr_binding( + .input %>% + mutate( + ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"), + ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM"), + ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"), + dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"), + dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM"), + dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"), + mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"), + mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM"), + mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), + ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), + ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM"), + ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H") + ) %>% + collect(), + test_dates_times + ) + + compare_dplyr_binding( + .input %>% + mutate( + ymd_hms_dttm = + parse_date_time(ymd_hms_string, orders = "ymd_HMS", tz = "Pacific/Marquesas"), + ymd_hm_dttm = + parse_date_time(ymd_hm_string, orders = "ymd_HM", tz = "Pacific/Marquesas"), + ymd_h_dttm = + parse_date_time(ymd_h_string, orders = "ymd_H", tz = "Pacific/Marquesas"), + dmy_hms_dttm = + parse_date_time(dmy_hms_string, orders = "dmy_HMS", tz = "Pacific/Marquesas"), + dmy_hm_dttm = + parse_date_time(dmy_hm_string, orders = "dmy_HM", tz = "Pacific/Marquesas"), + dmy_h_dttm = + parse_date_time(dmy_h_string, orders = "dmy_H", tz = "Pacific/Marquesas"), + mdy_hms_dttm = + parse_date_time(mdy_hms_string, orders = "mdy_HMS", tz = "Pacific/Marquesas"), + mdy_hm_dttm = + parse_date_time(mdy_hm_string, orders = "mdy_HM", tz = "Pacific/Marquesas"), + mdy_h_dttm = + parse_date_time(mdy_h_string, orders = "mdy_H", tz = "Pacific/Marquesas"), + ydm_hms_dttm = + parse_date_time(ydm_hms_string, orders = "ydm_HMS", tz = "Pacific/Marquesas"), + ydm_hm_dttm = + parse_date_time(ydm_hm_string, orders = "ydm_HM", tz = "Pacific/Marquesas"), + ydm_h_dttm = + parse_date_time(ydm_h_string, orders = "ydm_H", tz = "Pacific/Marquesas") + ) %>% + collect(), + test_dates_times + ) test_dates_times2 <- tibble( ymd_hms_string = c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", NA), @@ -2010,7 +2083,7 @@ test_that("lubridate's fast_strptime", { "2022-05-18 13:46", "2022-05-17 13", "2022-05-16" - ) + ) ) ) From fa47c9c4dcfec57a1bc922838b95fa6f85722be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 14:47:07 +0100 Subject: [PATCH 06/47] support `ymd HMS` and `ymdHMS` orders --- r/R/dplyr-datetime-helpers.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index c8b9b7b6e97..e59cd025f00 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -156,7 +156,7 @@ build_formats <- function(orders) { # only keep the letters and the underscore as separator -> allow the users to # pass strptime-like formats (with "%"). Processing is needed (instead of passing # formats as-is) due to the processing of the character vector in parse_date_time() - orders <- gsub("[^A-Za-z_]", "", orders) + orders <- gsub("[^A-Za-z]", "", orders) orders <- gsub("Y", "y", orders) # we separate "ym', "my", and "yq" from the rest of the `orders` vector and @@ -184,7 +184,13 @@ build_formats <- function(orders) { "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS", "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H" ) - supported_orders <- c(ymd_orders, ymd_hms_orders) + + supported_orders <- c( + ymd_orders, + ymd_hms_orders, + gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as separators + gsub("_", "", ymd_hms_orders) + ) unsupported_passed_orders <- setdiff(orders, supported_orders) supported_passed_orders <- intersect(orders, supported_orders) @@ -216,7 +222,6 @@ build_format_from_order <- function(order) { "S" = "%S" ) - order <- gsub("_", "", order) split_order <- strsplit(order, split = "")[[1]] outcome <- expand.grid(char_list[split_order]) From 8ea9845e5b62db5199d7b9a0b6e66e9af3f9c3e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 14:48:05 +0100 Subject: [PATCH 07/47] test `ydmHM` and `ydmH` orders --- r/tests/testthat/test-dplyr-funcs-datetime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b28df7141c0..bb968f17e17 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1989,8 +1989,8 @@ test_that("parse_date_time with hours, minutes and seconds components", { mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM"), mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), - ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM"), - ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H") + ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydmHM"), + ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydmH") ) %>% collect(), test_dates_times From 5e44e06d17025c39283abf15fca48e2c71a77e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 15:37:23 +0100 Subject: [PATCH 08/47] reorganised tests so we can skip locale-dependent ones on windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 76 ++++++++++---------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index bb968f17e17..e84cb9437a6 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2028,42 +2028,6 @@ test_that("parse_date_time with hours, minutes and seconds components", { test_dates_times ) - test_dates_times2 <- tibble( - ymd_hms_string = c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", NA), - ymd_hm_string = c("67-Jan-09 12:34", "1970-June-22 20:13", NA), - ymd_h_string = c("67-Jan-09 12", "1970-June-22 20", NA), - dmy_hms_string = c("09-Jan-67 12:34:56", "22-June-1970 20:13:59", NA), - dmy_hm_string = c("09-Jan-67 12:34", "22-June-1970 20:13", NA), - dmy_h_string = c("09-Jan-67 12", "22-June-1970 20", NA), - mdy_hms_string = c("Jan-09-67 12:34:56", "June-22-1970 20:13:59", NA), - mdy_hm_string = c("Jan-09-67 12:34", "June-22-1970 20:13", NA), - mdy_h_string = c("Jan-09-67 12", "June-22-1970 20", NA), - ydm_hms_string = c("67-09-Jan 12:34:56", "1970-22-June 20:13:59", NA), - ydm_hm_string = c("67-09-Jan 12:34", "1970-22-June 20:13", NA), - ydm_h_string = c("67-09-Jan 12", "1970-22-June 20", NA) - ) - - compare_dplyr_binding( - .input %>% - mutate( - ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"), - ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM"), - ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"), - dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"), - dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM"), - dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"), - mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"), - mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM"), - mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), - ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), - ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM"), - ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H"), - .keep = "used" - ) %>% - collect(), - test_dates_times2 - ) - # test truncated formats compare_dplyr_binding( .input %>% @@ -2101,4 +2065,44 @@ test_that("parse_date_time with hours, minutes and seconds components", { "`quiet = FALSE` not supported in Arrow" ) ) + + test_dates_times2 <- tibble( + ymd_hms_string = c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", NA), + ymd_hm_string = c("67-Jan-09 12:34", "1970-June-22 20:13", NA), + ymd_h_string = c("67-Jan-09 12", "1970-June-22 20", NA), + dmy_hms_string = c("09-Jan-67 12:34:56", "22-June-1970 20:13:59", NA), + dmy_hm_string = c("09-Jan-67 12:34", "22-June-1970 20:13", NA), + dmy_h_string = c("09-Jan-67 12", "22-June-1970 20", NA), + mdy_hms_string = c("Jan-09-67 12:34:56", "June-22-1970 20:13:59", NA), + mdy_hm_string = c("Jan-09-67 12:34", "June-22-1970 20:13", NA), + mdy_h_string = c("Jan-09-67 12", "June-22-1970 20", NA), + ydm_hms_string = c("67-09-Jan 12:34:56", "1970-22-June 20:13:59", NA), + ydm_hm_string = c("67-09-Jan 12:34", "1970-22-June 20:13", NA), + ydm_h_string = c("67-09-Jan 12", "1970-22-June 20", NA) + ) + + # locale (affecting "%b% and "%B" formats) does not work properly on Windows + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done + skip_on_os("windows") + + compare_dplyr_binding( + .input %>% + mutate( + ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"), + ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM"), + ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"), + dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"), + dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM"), + dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"), + mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"), + mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM"), + mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), + ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), + ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM"), + ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H"), + .keep = "used" + ) %>% + collect(), + test_dates_times2 + ) }) From 93346ee5eb33d25c11b451728f51bf0c5c96b794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 19 May 2022 19:15:47 +0100 Subject: [PATCH 09/47] removed commented code referenced #13174 --- r/tests/testthat/test-dplyr-funcs-datetime.R | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e84cb9437a6..a41c78a06c1 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1837,9 +1837,7 @@ test_that("lubridate's fast_strptime", { collect(), tibble( x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA) - )#, - # arrow does not preserve the `tzone` attribute - # test ignore_attr = TRUE + ) ) # R object @@ -1856,8 +1854,7 @@ test_that("lubridate's fast_strptime", { collect(), tibble( x = c("2018-10-07 19:04:05", NA) - )#, - # test ignore_attr = TRUE + ) ) compare_dplyr_binding( @@ -1913,9 +1910,7 @@ test_that("lubridate's fast_strptime", { tibble( x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) - )#, - # arrow does not preserve the `tzone` attribute - # test ignore_attr = TRUE + ) ) # the arrow binding errors for a value different from 68L for `cutoff_2000` From 3b213f98985b1cefee8ae1d8ef5fe92e2a5fea46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 20 May 2022 09:40:26 +0100 Subject: [PATCH 10/47] added support (and unit tests) for unseparated strings --- r/R/dplyr-datetime-helpers.R | 5 +- r/R/dplyr-funcs-datetime.R | 28 +++-- r/tests/testthat/test-dplyr-funcs-datetime.R | 101 ++++++++++++------- 3 files changed, 87 insertions(+), 47 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index e59cd025f00..fd53e79d50c 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -225,6 +225,7 @@ build_format_from_order <- function(order) { split_order <- strsplit(order, split = "")[[1]] outcome <- expand.grid(char_list[split_order]) - format <- do.call(paste, c(outcome, sep = "-")) - format + formats_with_sep <- do.call(paste, c(outcome, sep = "-")) + formats_without_sep <- do.call(paste, c(outcome, sep = "")) + c(formats_with_sep, formats_without_sep) } diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index b12fdfac685..9f279add2b1 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -546,7 +546,13 @@ register_bindings_datetime_parsers <- function() { # for `ym` and `my` orders we add a day ("01") augmented_x <- NULL if (any(orders %in% c("ym", "my"))) { - augmented_x <- call_binding("paste0", x, "-01") + # add day as "-01" if there is a "-" separator and as "01" if not + augmented_x <- call_binding( + "if_else", + call_binding("grepl", "-", x), + call_binding("paste0", x, "-01"), + call_binding("paste0", x, "01") + ) } # for `yq` we need to transform the quarter into the start month (lubridate @@ -622,11 +628,21 @@ register_bindings_datetime_parsers <- function() { } # combine all attempts expressions in prep for coalesce - parse_attempt_expressions <- c( - parse_attempt_expressions, - parse_attempt_exp_augmented_x, - parse_attempt_exp_augmented_x2 - ) + # if the users passes only a short order (`ym`, `my` or `yq`) then only use + # the corresponding augmented_x + if (all(orders == "ym") || all(orders == "my")) { + parse_attempt_expressions <- parse_attempt_exp_augmented_x + } else if (all(orders == "yq")) { + parse_attempt_expressions <- parse_attempt_exp_augmented_x2 + } else { + parse_attempt_expressions <- c( + # if we have an augmented x give preference to the corresponding + # parsing attempts + parse_attempt_exp_augmented_x, + parse_attempt_exp_augmented_x2, + parse_attempt_expressions + ) + } coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a41c78a06c1..b1b5c589c13 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1678,22 +1678,17 @@ test_that("parse_date_time() works with year, month, and date components", { string_ymd = c( "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", "21:09:11", - # not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19 - # no separators and %b or %B are even more complicated (and they work in - # lubridate). not to mention locale - NA + "20210912", "210913", NA ), string_dmy = c( "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", - # not yet working for strings with no separators, like "10092021", "100921", - NA + "12092021", "130921", NA ), string_mdy = c( "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", - # not yet working for strings with no separators, like "09102021", "091021", - NA + "09122021", "091321", NA ) ) ) @@ -1711,13 +1706,16 @@ test_that("parse_date_time() works with year, month, and date components", { collect(), tibble::tibble( string_ymd = c( - "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", NA + "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", + "2021Sep16", NA ), string_dmy = c( - "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", NA + "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", + "16Sep2021",NA ), string_mdy = c( - "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", NA + "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", + "Sep1621", NA ) ) ) @@ -1953,19 +1951,32 @@ test_that("lubridate's fast_strptime", { test_that("parse_date_time with hours, minutes and seconds components", { test_dates_times <- tibble( - ymd_hms_string = c("67-01-09 12:34:56", "1970-05-22 20:13:59", NA), - ymd_hm_string = c("67-01-09 12:34", "1970-05-22 20:13", NA), - ymd_h_string = c("67-01-09 12", "1970-05-22 20", NA), - dmy_hms_string = c("09-01-67 12:34:56", "22-05-1970 20:13:59", NA), - dmy_hm_string = c("09-01-67 12:34", "22-05-1970 20:13", NA), - dmy_h_string = c("09-01-67 12", "22-05-1970 20", NA), - mdy_hms_string = c("01-09-67 12:34:56", "05-22-1970 20:13:59", NA), - mdy_hm_string = c("01-09-67 12:34", "05-22-1970 20:13", NA), - mdy_h_string = c("01-09-67 12", "05-22-1970 20", NA), - ydm_hms_string = c("67-09-01 12:34:56", "1970-22-05 20:13:59", NA), - ydm_hm_string = c("67-09-01 12:34", "1970-22-05 20:13", NA), - ydm_h_string = c("67-09-01 12", "1970-22-05 20", NA) - ) + ymd_hms_string = + c("67-01-09 12:34:56", "1970-05-22 20:13:59", "870822201359", NA), + ymd_hm_string = + c("67-01-09 12:34", "1970-05-22 20:13", "8708222013", NA), + ymd_h_string = + c("67-01-09 12", "1970-05-22 20", "87082220", NA), + dmy_hms_string = + c("09-01-67 12:34:56", "22-05-1970 20:13:59", "220887201359", NA), + dmy_hm_string = + c("09-01-67 12:34", "22-05-1970 20:13", "2208872013", NA), + dmy_h_string = + c("09-01-67 12", "22-05-1970 20", "22088720", NA), + mdy_hms_string = + c("01-09-67 12:34:56", "05-22-1970 20:13:59", "082287201359", NA), + mdy_hm_string = + c("01-09-67 12:34", "05-22-1970 20:13", "0822872013", NA), + mdy_h_string = + c("01-09-67 12", "05-22-1970 20", "08228720", NA), + ydm_hms_string = + c("67-09-01 12:34:56", "1970-22-05 20:13:59", "872208201359", NA), + ydm_hm_string = + c("67-09-01 12:34", "1970-22-05 20:13", "8722082013", NA), + ydm_h_string = + c("67-09-01 12", "1970-22-05 20", "87220820", NA) + ) + # the unseparated strings are versions of "1987-08-22 20:13:59" (with %y) # these functions' internals use some string processing which requires the # RE2 library (not available on Windows with R 3.6) @@ -2062,19 +2073,32 @@ test_that("parse_date_time with hours, minutes and seconds components", { ) test_dates_times2 <- tibble( - ymd_hms_string = c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", NA), - ymd_hm_string = c("67-Jan-09 12:34", "1970-June-22 20:13", NA), - ymd_h_string = c("67-Jan-09 12", "1970-June-22 20", NA), - dmy_hms_string = c("09-Jan-67 12:34:56", "22-June-1970 20:13:59", NA), - dmy_hm_string = c("09-Jan-67 12:34", "22-June-1970 20:13", NA), - dmy_h_string = c("09-Jan-67 12", "22-June-1970 20", NA), - mdy_hms_string = c("Jan-09-67 12:34:56", "June-22-1970 20:13:59", NA), - mdy_hm_string = c("Jan-09-67 12:34", "June-22-1970 20:13", NA), - mdy_h_string = c("Jan-09-67 12", "June-22-1970 20", NA), - ydm_hms_string = c("67-09-Jan 12:34:56", "1970-22-June 20:13:59", NA), - ydm_hm_string = c("67-09-Jan 12:34", "1970-22-June 20:13", NA), - ydm_h_string = c("67-09-Jan 12", "1970-22-June 20", NA) - ) + ymd_hms_string = + c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", "87Aug22201359", NA), + ymd_hm_string = + c("67-Jan-09 12:34", "1970-June-22 20:13", "87Aug222013", NA), + ymd_h_string = + c("67-Jan-09 12", "1970-June-22 20", "87Aug2220", NA), + dmy_hms_string = + c("09-Jan-67 12:34:56", "22-June-1970 20:13:59", "22Aug87201359", NA), + dmy_hm_string = + c("09-Jan-67 12:34", "22-June-1970 20:13", "22Aug872013", NA), + dmy_h_string = + c("09-Jan-67 12", "22-June-1970 20", "22Aug8720", NA), + mdy_hms_string = + c("Jan-09-67 12:34:56", "June-22-1970 20:13:59", "Aug2287201359", NA), + mdy_hm_string = + c("Jan-09-67 12:34", "June-22-1970 20:13", "Aug22872013", NA), + mdy_h_string = + c("Jan-09-67 12", "June-22-1970 20", "Aug228720", NA), + ydm_hms_string = + c("67-09-Jan 12:34:56", "1970-22-June 20:13:59", "8722Aug201359", NA), + ydm_hm_string = + c("67-09-Jan 12:34", "1970-22-June 20:13", "8722Aug2013", NA), + ydm_h_string = + c("67-09-Jan 12", "1970-22-June 20", "8722Aug20", NA) + ) + # the unseparated strings are versions of "1987-08-22 20:13:59" (with %y) # locale (affecting "%b% and "%B" formats) does not work properly on Windows # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done @@ -2094,8 +2118,7 @@ test_that("parse_date_time with hours, minutes and seconds components", { mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM"), - ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H"), - .keep = "used" + ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H") ) %>% collect(), test_dates_times2 From 715201d071c0d745a9cb0a43c3e058648599eb4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 20 May 2022 09:42:22 +0100 Subject: [PATCH 11/47] update NEWS --- r/NEWS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 6d25aa2154f..d88be229640 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -20,8 +20,7 @@ # arrow 8.0.0.9000 * `lubridate::parse_date_time()` datetime parser: - * currently parses only `orders` with year, month, and day components. In a future release `orders` support for other datetime components (such as hours, minutes, seconds, etc) will be added. - * strings with no separators (e.g. `"20210917"`) could be ambiguous and are not yet supported. + * `orders` with year, month, day, hours, minutes, and seconds components are supported. * the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`). # arrow 8.0.0 From 5353532b9b52efe16f6d9e902924116fbc3e37cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 20 May 2022 09:47:50 +0100 Subject: [PATCH 12/47] orders without `"_"` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b1b5c589c13..bd983cba91e 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2108,16 +2108,16 @@ test_that("parse_date_time with hours, minutes and seconds components", { .input %>% mutate( ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"), - ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM"), + ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymdHM"), ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"), dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"), - dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM"), + dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmyHM"), dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"), mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"), - mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM"), + mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdyHM"), mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), - ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM"), + ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydmHM"), ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H") ) %>% collect(), From 57c02d67a1a2b9e85dbab1c0a4e22a125b535d92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 20 May 2022 10:46:10 +0100 Subject: [PATCH 13/47] lint --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index bd983cba91e..ab564376d0c 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1711,7 +1711,7 @@ test_that("parse_date_time() works with year, month, and date components", { ), string_dmy = c( "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", - "16Sep2021",NA + "16Sep2021", NA ), string_mdy = c( "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", From ecc62ba28e43955f96aaf557203b16d76afec761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 20 May 2022 12:26:27 +0100 Subject: [PATCH 14/47] unit test for `exact = TRUE` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 27 ++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index ab564376d0c..134ba57a259 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2124,3 +2124,30 @@ test_that("parse_date_time with hours, minutes and seconds components", { test_dates_times2 ) }) + +test_that("parse_date_time & `exact = TRUE`", { + test_df <- tibble( + x = c("2022-12-31 12:59:59", "2022-01-01 12:11", "2022-01-01 12", "2022-01-01", NA), + y = c("11/23/1998 07:00:00", "6/18/1952 0135", "2/25/1974 0523", "9/07/1985 01", NA) + ) + + compare_dplyr_binding( + .input %>% + mutate( + parsed_x = + parse_date_time( + x, + c("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%d %H", "%Y-%m-%d"), + exact = TRUE + ), + parsed_y = + parse_date_time( + y, + c("%m/%d/%Y %I:%M:%S", "%m/%d/%Y %H%M", "%m/%d/%Y %H"), + exact = TRUE + ) + ) %>% + collect(), + test_df + ) +}) From f4897c1a30fb37c965c6300e36e5e0f89184e90b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 20 May 2022 12:27:41 +0100 Subject: [PATCH 15/47] refactored `parse_date_time()` with several helper functions --- r/R/dplyr-datetime-helpers.R | 111 ++++++++++++++++++++++++++++++++ r/R/dplyr-funcs-datetime.R | 119 +++-------------------------------- 2 files changed, 119 insertions(+), 111 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index fd53e79d50c..ac7209c88d7 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -229,3 +229,114 @@ build_format_from_order <- function(order) { formats_without_sep <- do.call(paste, c(outcome, sep = "")) c(formats_with_sep, formats_without_sep) } + +process_data_for_parsing <- function(x, + orders) { + + processed_x <- x$cast(string()) + + # make all separators (non-letters and non-numbers) into "-" + processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x) + # collapse multiple separators into a single one + processed_x <- call_binding("gsub", "-{2,}", "-", processed_x) + + # we need to transform `x` when orders are `ym`, `my`, and `yq` + # for `ym` and `my` orders we add a day ("01") + augmented_x_ym <- NULL + if (any(orders %in% c("ym", "my"))) { + # add day as "-01" if there is a "-" separator and as "01" if not + augmented_x_ym <- call_binding( + "if_else", + call_binding("grepl", "-", processed_x), + call_binding("paste0", processed_x, "-01"), + call_binding("paste0", processed_x, "01") + ) + } + + # for `yq` we need to transform the quarter into the start month (lubridate + # behaviour) and then add 01 to parse to the first day of the quarter + augmented_x_yq <- NULL + if (any(orders == "yq")) { + # extract everything that comes after the `-` separator, i.e. the quarter + # (e.g. 4 from 2022-4) + quarter_x <- call_binding("gsub", "^.*?-", "", processed_x) + # we should probably error if quarter is not in 1:4 + # extract everything that comes before the `-`, i.e. the year (e.g. 2002 + # in 2002-4) + year_x <- call_binding("gsub", "-.*$", "", processed_x) + quarter_x <- quarter_x$cast(int32()) + month_x <- (quarter_x - 1) * 3 + 1 + augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01") + } + + list( + "augmented_x_ym" = augmented_x_ym, + "augmented_x_yq" = augmented_x_yq, + "processed_x" = processed_x + ) +} + +attempt_parsing <- function(x, + orders, + formats = NULL) { + if (is.null(formats)) { + # this is the situation in which orders were passed with `exact = TRUE` + # no data processing takes place + # we don't derive formats as the orders are assumed to be formats + parse_attempt_expressions <- build_strptime_exps(x, orders) + return(parse_attempt_expressions) + } + + processed_data <- process_data_for_parsing(x, orders) + + processed_x <- processed_data[["processed_x"]] + augmented_x_ym <- processed_data[["augmented_x_ym"]] + augmented_x_yq <- processed_data[["augmented_x_yq"]] + + # build a list of expressions for each format + parse_attempt_exp_processed_x <- build_strptime_exps(processed_x, formats) + + # build separate expression lists of parsing attempts for the orders that + # need an augmented `x` + # list for attempts when orders %in% c("ym", "my") + parse_attempt_exp_augmented_x_ym <- list() + + if (!is.null(augmented_x_ym)) { + parse_attempt_exp_augmented_x_ym <- build_strptime_exps(augmented_x_ym, formats) + } + + # list for attempts when orders %in% c("yq") + parse_attempt_exp_augmented_x_yq <- list() + if (!is.null(augmented_x_yq)) { + parse_attempt_exp_augmented_x_yq <- build_strptime_exps(augmented_x_yq, formats) + } + + # combine all attempts expressions in prep for coalesce + # if the users passes only a short order (`ym`, `my` or `yq`) then only use + # the corresponding augmented_x + if (all(orders == "ym") || all(orders == "my")) { + parse_attempt_expressions <- parse_attempt_exp_augmented_x_ym + } else if (all(orders == "yq")) { + parse_attempt_expressions <- parse_attempt_exp_augmented_x_yq + } else { + parse_attempt_expressions <- c( + # if we have an augmented x give preference to the corresponding + # parsing attempts + parse_attempt_exp_augmented_x_ym, + parse_attempt_exp_augmented_x_yq, + parse_attempt_exp_processed_x + ) + } + parse_attempt_expressions +} + +build_strptime_exps <- function(x, formats) { + map( + formats, + ~ build_expr( + "strptime", + x, + options = list(format = .x, unit = 0L, error_is_null = TRUE) + ) + ) +} diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9f279add2b1..bd46527f2ca 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -522,7 +522,8 @@ register_bindings_datetime_parsers <- function() { orders, tz = "UTC", truncated = 0, - quiet = TRUE) { + quiet = TRUE, + exact = FALSE) { if (!quiet) { arrow_not_supported("`quiet = FALSE`") } @@ -532,119 +533,15 @@ register_bindings_datetime_parsers <- function() { orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop = nchar(orders) - .x)) } - # each order is translated into possible formats - formats <- build_formats(orders) - - x <- x$cast(string()) - - # make all separators (non-letters and non-numbers) into "-" - x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) - # collapse multiple separators into a single one - x <- call_binding("gsub", "-{2,}", "-", x) - - # we need to transform `x` when orders are `ym`, `my`, and `yq` - # for `ym` and `my` orders we add a day ("01") - augmented_x <- NULL - if (any(orders %in% c("ym", "my"))) { - # add day as "-01" if there is a "-" separator and as "01" if not - augmented_x <- call_binding( - "if_else", - call_binding("grepl", "-", x), - call_binding("paste0", x, "-01"), - call_binding("paste0", x, "01") - ) - } - - # for `yq` we need to transform the quarter into the start month (lubridate - # behaviour) and then add 01 to parse to the first day of the quarter - augmented_x2 <- NULL - if (any(orders == "yq")) { - # extract everything that comes after the `-` separator, i.e. the quarter - # (e.g. 4 from 2022-4) - quarter_x <- call_binding("gsub", "^.*?-", "", x) - # we should probably error if quarter is not in 1:4 - # extract everything that comes before the `-`, i.e. the year (e.g. 2002 - # in 2002-4) - year_x <- call_binding("gsub", "-.*$", "", x) - quarter_x <- quarter_x$cast(int32()) - month_x <- (quarter_x - 1) * 3 + 1 - augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01") - } - - # TODO figure out how to parse strings that have no separators - # https://issues.apache.org/jira/browse/ARROW-16446 - # we could insert separators at the "likely" positions, but it might be - # tricky given the possible combinations between dmy formats + locale - - # build a list of expressions for each format - parse_attempt_expressions <- map( - formats, - ~ build_expr( - "strptime", - x, - options = list( - format = .x, - unit = 0L, - error_is_null = TRUE - ) - ) - ) - - # build separate expression lists of parsing attempts for the orders that - # need an augmented `x` - # list for attempts when orders %in% c("ym", "my") - parse_attempt_exp_augmented_x <- list() - - if (!is.null(augmented_x)) { - parse_attempt_exp_augmented_x <- map( - formats, - ~ build_expr( - "strptime", - augmented_x, - options = list( - format = .x, - unit = 0L, - error_is_null = TRUE - ) - ) - ) - } - - # list for attempts when orders %in% c("yq") - parse_attempt_exp_augmented_x2 <- list() - if (!is.null(augmented_x2)) { - parse_attempt_exp_augmented_x2 <- map( - formats, - ~ build_expr( - "strptime", - augmented_x2, - options = list( - format = .x, - unit = 0L, - error_is_null = TRUE - ) - ) - ) - } - - # combine all attempts expressions in prep for coalesce - # if the users passes only a short order (`ym`, `my` or `yq`) then only use - # the corresponding augmented_x - if (all(orders == "ym") || all(orders == "my")) { - parse_attempt_expressions <- parse_attempt_exp_augmented_x - } else if (all(orders == "yq")) { - parse_attempt_expressions <- parse_attempt_exp_augmented_x2 + if (exact == TRUE) { + parse_attempts <- attempt_parsing(x, orders = orders) } else { - parse_attempt_expressions <- c( - # if we have an augmented x give preference to the corresponding - # parsing attempts - parse_attempt_exp_augmented_x, - parse_attempt_exp_augmented_x2, - parse_attempt_expressions - ) + # each order is translated into possible formats + formats <- build_formats(orders) + parse_attempts <- attempt_parsing(x, orders = orders, formats = formats) } - coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) + coalesce_output <- build_expr("coalesce", args = parse_attempts) # we need this binding to be able to handle a NULL `tz`, which will then be # used by bindings such as `ymd` to return, based on whether tz is NULL or From 81468f3291b461b2768beaf58020937f25236fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Sat, 21 May 2022 11:31:43 +0100 Subject: [PATCH 16/47] move `build_formats` inside `attempt_parsing`, since it won't be used outside it --- r/R/dplyr-datetime-helpers.R | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index ac7209c88d7..a0908b30c9d 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -277,15 +277,10 @@ process_data_for_parsing <- function(x, } attempt_parsing <- function(x, - orders, - formats = NULL) { - if (is.null(formats)) { - # this is the situation in which orders were passed with `exact = TRUE` - # no data processing takes place - # we don't derive formats as the orders are assumed to be formats - parse_attempt_expressions <- build_strptime_exps(x, orders) - return(parse_attempt_expressions) - } + orders) { + + # translate orders into possible formats + formats <- build_formats(orders) processed_data <- process_data_for_parsing(x, orders) From e9cd0e9aa121a3ede88c97cedca063696548a262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Sat, 21 May 2022 11:32:21 +0100 Subject: [PATCH 17/47] when `exact = TRUE` go directly to `build_strptime_exps()` --- r/R/dplyr-funcs-datetime.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index bd46527f2ca..6b4a2a406f8 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -534,11 +534,10 @@ register_bindings_datetime_parsers <- function() { } if (exact == TRUE) { - parse_attempts <- attempt_parsing(x, orders = orders) + # no data processing takes place & we don't derive formats + parse_attempts <- build_strptime_exps(x, orders) } else { - # each order is translated into possible formats - formats <- build_formats(orders) - parse_attempts <- attempt_parsing(x, orders = orders, formats = formats) + parse_attempts <- attempt_parsing(x, orders = orders) } coalesce_output <- build_expr("coalesce", args = parse_attempts) From a72659fb3a4a9b048e30aa897b9958a8689cc678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Sat, 21 May 2022 11:39:31 +0100 Subject: [PATCH 18/47] assert warning message originating in the dplyr pipeline --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 134ba57a259..9572269aad7 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2069,7 +2069,8 @@ test_that("parse_date_time with hours, minutes and seconds components", { ) %>% collect(), "`quiet = FALSE` not supported in Arrow" - ) + ), + "All formats failed to parse" ) test_dates_times2 <- tibble( From edb76ccab9730fcb58a5f30d28238a170c046c23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 23 May 2022 12:38:29 +0100 Subject: [PATCH 19/47] limit `truncated` to `nchar(orders) - 3` + unit test --- r/R/dplyr-funcs-datetime.R | 3 ++ r/tests/testthat/test-dplyr-funcs-datetime.R | 37 +++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 6b4a2a406f8..9dcc563fef6 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -529,6 +529,9 @@ register_bindings_datetime_parsers <- function() { } if (truncated != 0) { + if (truncated > (nchar(orders) - 3)) { + arrow_not_supported(paste0("a value for `truncated` > ", nchar(orders) - 3)) + } # build several orders for truncated formats orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop = nchar(orders) - .x)) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 9572269aad7..52ba6f901f7 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2035,6 +2035,16 @@ test_that("parse_date_time with hours, minutes and seconds components", { ) # test truncated formats + test_truncation_df <- tibble( + truncated_ymd_string = + c( + "2022-05-19 13:46:51", + "2022-05-18 13:46", + "2022-05-17 13", + "2022-05-16" + ) + ) + compare_dplyr_binding( .input %>% mutate( @@ -2046,17 +2056,26 @@ test_that("parse_date_time with hours, minutes and seconds components", { ) ) %>% collect(), - tibble( - truncated_ymd_string = - c( - "2022-05-19 13:46:51", - "2022-05-18 13:46", - "2022-05-17 13", - "2022-05-16" - ) - ) + test_truncation_df + ) + + # values for truncated greater than nchar(orders) - 3 not supported in Arrow + compare_dplyr_binding( + .input %>% + mutate( + dttm = + parse_date_time( + truncated_ymd_string, + orders = "ymd_HMS", + truncated = 5 + ) + ) %>% + collect(), + test_truncation_df, + warning = "a value for `truncated` > 4 not supported in Arrow" ) + # we need expect_warning twice as both the arrow pipeline (because quiet = # FALSE is not supported) and the fallback dplyr/lubridate one throw # warnings (the lubridate one because quiet is FALSE) From 1c39c7cf29aee7dca65aa3a57aaeb851b9ecd176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 23 May 2022 13:12:21 +0100 Subject: [PATCH 20/47] support hours as `"%I"` + unit tests --- r/R/dplyr-datetime-helpers.R | 10 ++++++++-- r/tests/testthat/test-dplyr-funcs-datetime.R | 11 +++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index a0908b30c9d..1698d4a296b 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -184,12 +184,16 @@ build_formats <- function(orders) { "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS", "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H" ) + ymd_ims_orders <- gsub("H", "I", ymd_hms_orders) supported_orders <- c( ymd_orders, ymd_hms_orders, gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as separators - gsub("_", "", ymd_hms_orders) + gsub("_", "", ymd_hms_orders), + ymd_ims_orders, + gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as separators + gsub("_", "", ymd_ims_orders) ) unsupported_passed_orders <- setdiff(orders, supported_orders) @@ -219,7 +223,8 @@ build_format_from_order <- function(order) { "d" = "%d", "H" = "%H", "M" = "%M", - "S" = "%S" + "S" = "%S", + "I" = "%I" ) split_order <- strsplit(order, split = "")[[1]] @@ -242,6 +247,7 @@ process_data_for_parsing <- function(x, # we need to transform `x` when orders are `ym`, `my`, and `yq` # for `ym` and `my` orders we add a day ("01") + # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627 augmented_x_ym <- NULL if (any(orders %in% c("ym", "my"))) { # add day as "-01" if there is a "-" separator and as "01" if not diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 52ba6f901f7..76ba7034d75 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2034,6 +2034,17 @@ test_that("parse_date_time with hours, minutes and seconds components", { test_dates_times ) + # test ymd_ims + compare_dplyr_binding( + .input %>% + mutate(ymd_ims_dttm = parse_date_time(ymd_ims_string, orders = "ymd_IMS")) %>% + collect(), + tibble( + ymd_ims_string = + c("67-01-09 12:34:56", "1970-05-22 11:13:59", "870822101359", NA) + ) + ) + # test truncated formats test_truncation_df <- tibble( truncated_ymd_string = From ad12187a329bcb47b6cd0497464012223989c612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 23 May 2022 14:07:31 +0100 Subject: [PATCH 21/47] simplified `attempt_parsing()` --- r/R/dplyr-datetime-helpers.R | 47 +++++++++--------------------------- r/R/dplyr-funcs-datetime.R | 2 +- 2 files changed, 12 insertions(+), 37 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 1698d4a296b..29b0a7a423e 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -184,6 +184,7 @@ build_formats <- function(orders) { "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS", "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H" ) + # support "%I" hour formats ymd_ims_orders <- gsub("H", "I", ymd_hms_orders) supported_orders <- c( @@ -290,48 +291,22 @@ attempt_parsing <- function(x, processed_data <- process_data_for_parsing(x, orders) - processed_x <- processed_data[["processed_x"]] - augmented_x_ym <- processed_data[["augmented_x_ym"]] - augmented_x_yq <- processed_data[["augmented_x_yq"]] + parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs, formats) - # build a list of expressions for each format - parse_attempt_exp_processed_x <- build_strptime_exps(processed_x, formats) - - # build separate expression lists of parsing attempts for the orders that - # need an augmented `x` - # list for attempts when orders %in% c("ym", "my") - parse_attempt_exp_augmented_x_ym <- list() - - if (!is.null(augmented_x_ym)) { - parse_attempt_exp_augmented_x_ym <- build_strptime_exps(augmented_x_ym, formats) + # if all orders are c("ym", "my", "yq") only attempt to parse the augmented_x + if (all(orders %in% c("ym", "my", "yq"))) { + parse_attempt_exprs_list$processed_x <- list() } - # list for attempts when orders %in% c("yq") - parse_attempt_exp_augmented_x_yq <- list() - if (!is.null(augmented_x_yq)) { - parse_attempt_exp_augmented_x_yq <- build_strptime_exps(augmented_x_yq, formats) - } + purrr::flatten(parse_attempt_exprs_list) +} - # combine all attempts expressions in prep for coalesce - # if the users passes only a short order (`ym`, `my` or `yq`) then only use - # the corresponding augmented_x - if (all(orders == "ym") || all(orders == "my")) { - parse_attempt_expressions <- parse_attempt_exp_augmented_x_ym - } else if (all(orders == "yq")) { - parse_attempt_expressions <- parse_attempt_exp_augmented_x_yq - } else { - parse_attempt_expressions <- c( - # if we have an augmented x give preference to the corresponding - # parsing attempts - parse_attempt_exp_augmented_x_ym, - parse_attempt_exp_augmented_x_yq, - parse_attempt_exp_processed_x - ) +build_strptime_exprs <- function(x, formats) { + # returning an empty list helps when iterating with build_strptime_exprs + if (is.null(x)) { + return(list()) } - parse_attempt_expressions -} -build_strptime_exps <- function(x, formats) { map( formats, ~ build_expr( diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9dcc563fef6..a9d89e283e8 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -538,7 +538,7 @@ register_bindings_datetime_parsers <- function() { if (exact == TRUE) { # no data processing takes place & we don't derive formats - parse_attempts <- build_strptime_exps(x, orders) + parse_attempts <- build_strptime_exprs(x, orders) } else { parse_attempts <- attempt_parsing(x, orders = orders) } From 384bc0a093acb488a68daf791139d69403b220b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 23 May 2022 14:40:51 +0100 Subject: [PATCH 22/47] minor --- r/R/dplyr-datetime-helpers.R | 5 ++--- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 29b0a7a423e..abc8801d512 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -285,7 +285,6 @@ process_data_for_parsing <- function(x, attempt_parsing <- function(x, orders) { - # translate orders into possible formats formats <- build_formats(orders) @@ -293,7 +292,7 @@ attempt_parsing <- function(x, parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs, formats) - # if all orders are c("ym", "my", "yq") only attempt to parse the augmented_x + # if all orders are in c("ym", "my", "yq") only attempt to parse the augmented_x if (all(orders %in% c("ym", "my", "yq"))) { parse_attempt_exprs_list$processed_x <- list() } @@ -302,7 +301,7 @@ attempt_parsing <- function(x, } build_strptime_exprs <- function(x, formats) { - # returning an empty list helps when iterating with build_strptime_exprs + # returning an empty list helps when iterating if (is.null(x)) { return(list()) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 76ba7034d75..348326da084 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2041,7 +2041,7 @@ test_that("parse_date_time with hours, minutes and seconds components", { collect(), tibble( ymd_ims_string = - c("67-01-09 12:34:56", "1970-05-22 11:13:59", "870822101359", NA) + c("67-01-09 9:34:56", "1970-05-22 10:13:59", "870822111359", NA) ) ) From 0e539d898fb048621bec6849738125b2bbc408e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 24 May 2022 13:54:51 +0100 Subject: [PATCH 23/47] add support and unit tests for `"qy"` order + document `process_data_for_parsing()` --- r/R/dplyr-datetime-helpers.R | 55 ++++++++++++++++++-- r/R/dplyr-funcs-datetime.R | 2 +- r/tests/testthat/test-dplyr-funcs-datetime.R | 20 ++++++- 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index abc8801d512..767da018d6f 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -154,7 +154,9 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { build_formats <- function(orders) { # only keep the letters and the underscore as separator -> allow the users to - # pass strptime-like formats (with "%"). Processing is needed (instead of passing + # pass strptime-like formats (with "%"). We process the data -> we need to + # process the `orders` (even if supplied in the desired format) + # Processing is needed (instead of passing # formats as-is) due to the processing of the character vector in parse_date_time() orders <- gsub("[^A-Za-z]", "", orders) orders <- gsub("Y", "y", orders) @@ -179,6 +181,12 @@ build_formats <- function(orders) { orders <- unique(c(orders1, orders2)) } + if (any(orders == "qy")) { + orders1 <- setdiff(orders, "qy") + orders2 <- "ymd" + orders <- unique(c(orders1, orders2)) + } + ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") ymd_hms_orders <- c( "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS", @@ -190,10 +198,10 @@ build_formats <- function(orders) { supported_orders <- c( ymd_orders, ymd_hms_orders, - gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as separators + gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as order separators gsub("_", "", ymd_hms_orders), ymd_ims_orders, - gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as separators + gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as order separators gsub("_", "", ymd_ims_orders) ) @@ -236,9 +244,36 @@ build_format_from_order <- function(order) { c(formats_with_sep, formats_without_sep) } +#' Process data in preparation for parsing +#' +#' `process_data_for_parsing()` takes a data column and a vector of `orders` and +#' prepares several versions of the input data: +#' * `processed_x` is a version of `x` where all separators were replaced with +#' `"-"` and multiple separators were collapsed into a single one. This element +#' is only set to an empty list when the `orders` argument indicate we're only +#' interested in parsing the augmented version of `x`. +#' * each of the other 3 elements augment `x` in some way +#' * `augmented_x_ym` - augments the `ym` and `my` formats by adding `"01"` +#' (to indicate the first day of the month) +#' * `augmented_x_yq` - transforms the `yq` format to `ymd`, by deriving the +#' first month of the quarter and adding `"01"` to indicate the first day +#' * `augmented_x_qy` - transforms the `qy` format to `ymd` in a similar +#' manner to `"yq"` +#' +#' @param x an Expression corresponding to a character or numeric vector of +#' dates to be parsed. +#' @param orders a character vector of date-time formats. +#' +#' @return a list made up of 4 list, each a different version of x: +#' * `processed_x` +#' * `augmented_x_ym` +#' * `augmented_x_yq` +#' * `augmented_x_qy` +#' @noRd process_data_for_parsing <- function(x, orders) { + # browser() processed_x <- x$cast(string()) # make all separators (non-letters and non-numbers) into "-" @@ -276,9 +311,21 @@ process_data_for_parsing <- function(x, augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01") } + augmented_x_qy <- NULL + if (any(orders == "qy")) { + quarter_x <- call_binding("gsub", "-.*$", "", processed_x) + quarter_x <- quarter_x$cast(int32()) + year_x <- call_binding("gsub", "^.*?-", "", processed_x) + # year might be missing the final 0s when extracted from a float + year_x <- call_binding("str_pad", year_x, width = 4, side = "right", pad = "0") + month_x <- (quarter_x - 1) * 3 + 1 + augmented_x_qy <- call_binding("paste0", year_x, "-", month_x, "-01") + } + list( "augmented_x_ym" = augmented_x_ym, "augmented_x_yq" = augmented_x_yq, + "augmented_x_qy" = augmented_x_qy, "processed_x" = processed_x ) } @@ -293,7 +340,7 @@ attempt_parsing <- function(x, parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs, formats) # if all orders are in c("ym", "my", "yq") only attempt to parse the augmented_x - if (all(orders %in% c("ym", "my", "yq"))) { + if (all(orders %in% c("ym", "my", "yq", "qy"))) { parse_attempt_exprs_list$processed_x <- list() } diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index a9d89e283e8..33c02f781c7 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -528,7 +528,7 @@ register_bindings_datetime_parsers <- function() { arrow_not_supported("`quiet = FALSE`") } - if (truncated != 0) { + if (truncated > 0) { if (truncated > (nchar(orders) - 3)) { arrow_not_supported(paste0("a value for `truncated` > ", nchar(orders) - 3)) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 348326da084..567c59a7748 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1790,9 +1790,22 @@ test_that("ym, my & yq parsers", { my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA), yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA), yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA), - yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA) + yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA), + qy_string = c("3.2007", "2.1970", "1.2020", "4.2009", "1.1975", NA), + qy_numeric = c(3.2007, 2.1970, 1.2020, 4.2009, 1.1975, NA), + qy_space = c("3 2007", "2 1970", "1 2020", "4 2009", "1 1975", NA) ) + test_df %>% + # arrow_table() %>% + mutate( + # qy_date_from_string = parse_date_time(qy_string, orders = "qy"), + qy_date_from_numeric = parse_date_time(as.character(qy_numeric), orders = "qy"), + # qy_date_from_string_with_space = parse_date_time(qy_space, orders = "qy"), + .keep = "used" + ) %>% + collect() + # these functions' internals use some string processing which requires the # RE2 library (not available on Windows with R 3.6) skip_if_not_available("re2") @@ -1813,7 +1826,10 @@ test_that("ym, my & yq parsers", { my_date2 = parse_date_time(my_string, orders = c("my", "myd")), yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"), yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"), - yq_date_from_string_with_space2 = parse_date_time(yq_space, orders = "yq") + yq_date_from_string_with_space2 = parse_date_time(yq_space, orders = "yq"), + qy_date_from_string = parse_date_time(qy_string, orders = "qy"), + qy_date_from_numeric = parse_date_time(qy_numeric, orders = "qy"), + qy_date_from_string_with_space = parse_date_time(qy_space, orders = "qy") ) %>% collect(), test_df From 63acafc2cb79693e244f02e9d3379ea9fa80bbcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 24 May 2022 13:57:12 +0100 Subject: [PATCH 24/47] unit test clean-up --- r/tests/testthat/test-dplyr-funcs-datetime.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 567c59a7748..0f6e9587cf7 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1796,16 +1796,6 @@ test_that("ym, my & yq parsers", { qy_space = c("3 2007", "2 1970", "1 2020", "4 2009", "1 1975", NA) ) - test_df %>% - # arrow_table() %>% - mutate( - # qy_date_from_string = parse_date_time(qy_string, orders = "qy"), - qy_date_from_numeric = parse_date_time(as.character(qy_numeric), orders = "qy"), - # qy_date_from_string_with_space = parse_date_time(qy_space, orders = "qy"), - .keep = "used" - ) %>% - collect() - # these functions' internals use some string processing which requires the # RE2 library (not available on Windows with R 3.6) skip_if_not_available("re2") From b492c30bf400650013ab02e40b122bf78cd92a94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 24 May 2022 14:53:22 +0100 Subject: [PATCH 25/47] comments --- r/R/dplyr-datetime-helpers.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 767da018d6f..2e0b22d9b55 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -316,7 +316,8 @@ process_data_for_parsing <- function(x, quarter_x <- call_binding("gsub", "-.*$", "", processed_x) quarter_x <- quarter_x$cast(int32()) year_x <- call_binding("gsub", "^.*?-", "", processed_x) - # year might be missing the final 0s when extracted from a float + # year might be missing the final 0s when extracted from a float, hence the + # need to pad year_x <- call_binding("str_pad", year_x, width = 4, side = "right", pad = "0") month_x <- (quarter_x - 1) * 3 + 1 augmented_x_qy <- call_binding("paste0", year_x, "-", month_x, "-01") @@ -339,7 +340,8 @@ attempt_parsing <- function(x, parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs, formats) - # if all orders are in c("ym", "my", "yq") only attempt to parse the augmented_x + # if all orders are in c("ym", "my", "yq", "qy") only attempt to parse the + # augmented version(s) of x if (all(orders %in% c("ym", "my", "yq", "qy"))) { parse_attempt_exprs_list$processed_x <- list() } From 163d895f7f999621a3f0b4d2c083221179874a98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 24 May 2022 14:55:02 +0100 Subject: [PATCH 26/47] comment + removed `browser()` --- r/R/dplyr-datetime-helpers.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 2e0b22d9b55..6a13cb08990 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -273,7 +273,6 @@ build_format_from_order <- function(order) { process_data_for_parsing <- function(x, orders) { - # browser() processed_x <- x$cast(string()) # make all separators (non-letters and non-numbers) into "-" @@ -311,6 +310,8 @@ process_data_for_parsing <- function(x, augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01") } + # same as for `yq`, we need to derive the month from the quarter and add a + # "01" to give us the first day of the month augmented_x_qy <- NULL if (any(orders == "qy")) { quarter_x <- call_binding("gsub", "-.*$", "", processed_x) From 856f4027f531ec7a6666608ae2b4f20172764832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 24 May 2022 17:00:34 +0100 Subject: [PATCH 27/47] test `"IMS"` with hour greater than 12 --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 0f6e9587cf7..59babb4f213 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2047,7 +2047,7 @@ test_that("parse_date_time with hours, minutes and seconds components", { collect(), tibble( ymd_ims_string = - c("67-01-09 9:34:56", "1970-05-22 10:13:59", "870822111359", NA) + c("67-01-09 9:34:56", "1970-05-22 10:13:59", "870822171359", NA) ) ) From 56ad69c22b08e7c400a857975ee3fc4db1ebf3a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 24 May 2022 17:06:09 +0100 Subject: [PATCH 28/47] comment + reflow --- r/tests/testthat/test-dplyr-funcs-datetime.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 59babb4f213..a9f6aedc122 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2043,7 +2043,15 @@ test_that("parse_date_time with hours, minutes and seconds components", { # test ymd_ims compare_dplyr_binding( .input %>% - mutate(ymd_ims_dttm = parse_date_time(ymd_ims_string, orders = "ymd_IMS")) %>% + mutate( + ymd_ims_dttm = + parse_date_time( + ymd_ims_string, + orders = "ymd_IMS", + # we add this since lubridate is chatty and will warn 1 format failed to parse + quiet = TRUE + ) + ) %>% collect(), tibble( ymd_ims_string = From 8aefd2c94502e641794ace80d94b7dfaee0fbea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 09:05:27 +0100 Subject: [PATCH 29/47] make the ims string unanbiguous --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a9f6aedc122..fa1f4d807f8 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2055,7 +2055,7 @@ test_that("parse_date_time with hours, minutes and seconds components", { collect(), tibble( ymd_ims_string = - c("67-01-09 9:34:56", "1970-05-22 10:13:59", "870822171359", NA) + c("67-01-09 9:34:56", "1970-05-22 10:13:59", "19870822171359", NA) ) ) From 600a6ff491e520cfe82bcdd82db16bf72321bbbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 09:07:11 +0100 Subject: [PATCH 30/47] comment --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index fa1f4d807f8..8fd3e2d9cba 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2048,7 +2048,7 @@ test_that("parse_date_time with hours, minutes and seconds components", { parse_date_time( ymd_ims_string, orders = "ymd_IMS", - # we add this since lubridate is chatty and will warn 1 format failed to parse + # lubridate is chatty and will warn 1 format failed to parse quiet = TRUE ) ) %>% From c5ee8ad1b9bc87bda5e9ec84e759a63ba2bd6062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 10:37:09 +0100 Subject: [PATCH 31/47] docs + change in approach if `x` has or doesn't have a `"-"` separator --- r/R/dplyr-datetime-helpers.R | 69 ++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 6a13cb08990..f1ec81d2fdd 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -239,9 +239,10 @@ build_format_from_order <- function(order) { split_order <- strsplit(order, split = "")[[1]] outcome <- expand.grid(char_list[split_order]) + # return only formats with "-" separator, which will be removed during parsing + # if the string to be parsed does not contain a separator formats_with_sep <- do.call(paste, c(outcome, sep = "-")) - formats_without_sep <- do.call(paste, c(outcome, sep = "")) - c(formats_with_sep, formats_without_sep) + formats_with_sep } #' Process data in preparation for parsing @@ -253,7 +254,7 @@ build_format_from_order <- function(order) { #' is only set to an empty list when the `orders` argument indicate we're only #' interested in parsing the augmented version of `x`. #' * each of the other 3 elements augment `x` in some way -#' * `augmented_x_ym` - augments the `ym` and `my` formats by adding `"01"` +#' * `augmented_x_ym` - builds the `ym` and `my` formats by adding `"01"` #' (to indicate the first day of the month) #' * `augmented_x_yq` - transforms the `yq` format to `ymd`, by deriving the #' first month of the quarter and adding `"01"` to indicate the first day @@ -332,13 +333,30 @@ process_data_for_parsing <- function(x, ) } + +#' Attempt parsing +#' +#' This function does several things: +#' * builds all possible `formats` from the supplied `orders` +#' * processes the data with `process_data_for_parsing()` +#' * build a list of the possible `strptime` Expressions for the data & formats +#' combinations +#' +#' @inheritParams process_data_for_parsing +#' +#' @return a list of `strptime` Expressions we can use with `coalesce` +#' @noRd attempt_parsing <- function(x, orders) { # translate orders into possible formats formats <- build_formats(orders) + # depending on the orders argument we need to do some processing to the input + # data processed_data <- process_data_for_parsing(x, orders) + # build a list of expressions for parsing each processed_data element and + # format combination parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs, formats) # if all orders are in c("ym", "my", "yq", "qy") only attempt to parse the @@ -347,21 +365,58 @@ attempt_parsing <- function(x, parse_attempt_exprs_list$processed_x <- list() } + # we need the output to be a list of expressions (currently it is a list of + # lists of expressions due to the shape of the processed data. we have one list + # of expressions for each element of/ list in processed_data) -> we need to + # remove a level of hierarchy from the list purrr::flatten(parse_attempt_exprs_list) } +#' Build `strptime` expressions +#' +#' This function takes several `formats`, iterates over them and builds a +#' `strptime` Expression for each of them. Given these Expressions are evaluated +#' row-wise we can leverage this behaviour and introduce a condition. If `x` has +#' a separator, use the `format` as is, if it doesn't have a separator, remove +#' the `"-"` separator from the `format`. +#' +#' @param x an Expression corresponding to a character or numeric vector of +#' dates to be parsed. +#' @param formats a vector of formats as returned by `build_format_from_order` +#' +#' @return a list of Expressions +#' @noRd build_strptime_exprs <- function(x, formats) { # returning an empty list helps when iterating if (is.null(x)) { return(list()) } + # if x has a separator ("-") use the format as-is (i.e. with separator) + # if not, remove the separator map( formats, - ~ build_expr( - "strptime", - x, - options = list(format = .x, unit = 0L, error_is_null = TRUE) + ~ call_binding( + "if_else", + call_binding("grepl", "-", x), + build_expr( + "strptime", + x, + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) + ), + build_expr( + "strptime", + x, + options = list( + format = gsub("-", "", .x), + unit = 0L, + error_is_null = TRUE + ) + ) ) ) } From e8bc62ddf7ff82598929614cca2c19b7e4240554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 12:44:57 +0100 Subject: [PATCH 32/47] update tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 8fd3e2d9cba..24a74592e26 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2176,6 +2176,9 @@ test_that("parse_date_time & `exact = TRUE`", { y = c("11/23/1998 07:00:00", "6/18/1952 0135", "2/25/1974 0523", "9/07/1985 01", NA) ) + # these functions' internals use some string processing which requires the + # RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( From aa616e543d9f714e44f0f31681e4b9027253308a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 26 May 2022 11:42:58 +0100 Subject: [PATCH 33/47] regular R objects as input + tests --- r/R/dplyr-funcs-datetime.R | 4 ++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 33c02f781c7..5c3f3f4c764 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -536,6 +536,10 @@ register_bindings_datetime_parsers <- function() { orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop = nchar(orders) - .x)) } + if (!inherits(x, "Expression")) { + x <- Expression$scalar(x) + } + if (exact == TRUE) { # no data processing takes place & we don't derive formats parse_attempts <- build_strptime_exprs(x, orders) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 24a74592e26..8b19a6e9f11 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2199,3 +2199,16 @@ test_that("parse_date_time & `exact = TRUE`", { test_df ) }) + +test_that("parse_date_time with R objects", { + compare_dplyr_binding( + .input %>% + mutate( + b = parse_date_time("2022-12-31 12:59:59", orders = "ymd_HMS") + ) %>% + collect(), + tibble( + a = 1 + ) + ) +}) From 2ddcd5e4c2987cb4884e148c63a718c0f1abff5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 26 May 2022 11:45:39 +0100 Subject: [PATCH 34/47] merged test_that blocks --- r/tests/testthat/test-dplyr-funcs-datetime.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 8b19a6e9f11..7de48927efa 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2170,7 +2170,7 @@ test_that("parse_date_time with hours, minutes and seconds components", { ) }) -test_that("parse_date_time & `exact = TRUE`", { +test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { test_df <- tibble( x = c("2022-12-31 12:59:59", "2022-01-01 12:11", "2022-01-01 12", "2022-01-01", NA), y = c("11/23/1998 07:00:00", "6/18/1952 0135", "2/25/1974 0523", "9/07/1985 01", NA) @@ -2198,9 +2198,6 @@ test_that("parse_date_time & `exact = TRUE`", { collect(), test_df ) -}) - -test_that("parse_date_time with R objects", { compare_dplyr_binding( .input %>% mutate( From ecfd70ab63f7c0b0ee94fee76521d349270c3f71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 6 Jun 2022 10:15:31 +0100 Subject: [PATCH 35/47] added some tests for `build_formats()` (not finished) --- r/tests/testthat/test-dplyr-funcs-datetime.R | 88 ++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 7de48927efa..64489645afa 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2209,3 +2209,91 @@ test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { ) ) }) + +test_that("build_formats", { + # TODO finish adding tests for build_formats + expect_equal( + build_formats(c("ym", "myd", "%Y-%d-%m")), + c("%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", + "%b-%Y-%d", "%y-%d-%m", "%Y-%d-%m", "%y-%d-%B", "%Y-%d-%B", "%y-%d-%b", + "%Y-%d-%b", "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", + "%Y-%b-%d") + ) + expect_error(build_formats("abd")) +# a +# Abbreviated weekday name in the current locale. (Also matches full name) +# +# A +# Full weekday name in the current locale. (Also matches abbreviated name). +# +# You don't need to specify a and A formats explicitly. Wday is automatically handled if preproc_wday = TRUE +# +# b (!) +# Abbreviated or full month name in the current locale. The C parser currently understands only English month names. +# +# B (!) +# Same as b. +# +# d (!) +# Day of the month as decimal number (01–31 or 0–31) +# +# H (!) +# Hours as decimal number (00–24 or 0–24). +# +# I (!) +# Hours as decimal number (01–12 or 1–12). +# +# j +# Day of year as decimal number (001–366 or 1–366). +# +# q (!*) +# Quarter (1–4). The quarter month is added to the parsed month if m element is present. +# +# m (!*) +# Month as decimal number (01–12 or 1–12). For parse_date_time also matches abbreviated and full months names as b and B formats. C parser understands only English month names. +# +# M (!) +# Minute as decimal number (00–59 or 0–59). +# +# p (!) +# AM/PM indicator in the locale. Commonly used in conjunction with I and not with H. But lubridate's C parser accepts H format as long as hour is not greater than 12. C parser understands only English locale AM/PM indicator. +# +# S (!) +# Second as decimal number (00–61 or 0–61), allowing for up to two leap-seconds (but POSIX-compliant implementations will ignore leap seconds). +# +# OS +# Fractional second. +# +# U +# Week of the year as decimal number (00–53 or 0–53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention. +# +# w +# Weekday as decimal number (0–6, Sunday is 0). +# +# W +# Week of the year as decimal number (00–53 or 0–53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention. +# +# y (!*) +# Year without century (00–99 or 0–99). In parse_date_time() also matches year with century (Y format). +# +# Y (!) +# Year with century. +# +# z (!*) +# ISO8601 signed offset in hours and minutes from UTC. For example -0800, -08:00 or -08, all represent 8 hours behind UTC. This format also matches the Z (Zulu) UTC indicator. Because base::strptime() doesn't fully support ISO8601 this format is implemented as an union of 4 formats: Ou (Z), Oz (-0800), OO (-08:00) and Oo (-08). You can use these formats as any other but it is rarely necessary. parse_date_time2() and fast_strptime() support all of these formats. +# +# Om (!*) +# Matches numeric month and English alphabetic months (Both, long and abbreviated forms). +# +# Op (!*) +# Matches AM/PM English indicator. +# +# r (*) +# Matches Ip and H orders. +# +# R (*) +# Matches HM andIMp orders. +# +# T (*) +# Matches IMSp, HMS, and HMOS orders. +}) From f93e038b011854f8c487fc6b08efeef6de330cf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 20 Jun 2022 16:14:03 +0100 Subject: [PATCH 36/47] update comment --- r/R/dplyr-funcs-datetime.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5c3f3f4c764..8ecb80b6b45 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -549,9 +549,9 @@ register_bindings_datetime_parsers <- function() { coalesce_output <- build_expr("coalesce", args = parse_attempts) - # we need this binding to be able to handle a NULL `tz`, which will then be - # used by bindings such as `ymd` to return, based on whether tz is NULL or - # not, a date or timestamp + # we need this binding to be able to handle a NULL `tz`, which, in turn, + # will be used by bindings such as `ymd()` to return a date or timestamp, + # based on whether tz is NULL or not if (!is.null(tz)) { build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) } else { From 323fe4feeef0d28fa75b549a71c0268a26b3fc11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 20 Jun 2022 16:18:35 +0100 Subject: [PATCH 37/47] add a first battery of tests for `build_formats()` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 39 +++++++++++++++++++- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 64489645afa..f1360f0a8b3 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2211,7 +2211,6 @@ test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { }) test_that("build_formats", { - # TODO finish adding tests for build_formats expect_equal( build_formats(c("ym", "myd", "%Y-%d-%m")), c("%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", @@ -2219,7 +2218,43 @@ test_that("build_formats", { "%Y-%d-%b", "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d") ) - expect_error(build_formats("abd")) + + # when order is one of "yq", "qy", "ym" or"my" the data is augmented to "ymd" + # or "ydm" and the formats are built accordingly + ymd_formats <- c( + "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d" + ) + expect_equal( + build_formats("yq"), + ymd_formats + ) + + expect_equal( + build_formats("ym"), + ymd_formats + ) + + expect_equal( + build_formats("qy"), + ymd_formats + ) + + # build formats will output unique formats + expect_equal( + build_formats(c("yq", "ym", "qy")), + ymd_formats + ) + + expect_equal( + build_formats("my"), + c("%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", "%b-%Y-%d") + ) + + # ab not supported yet + expect_error( + build_formats("abd"), + '"abd" `orders` not supported in Arrow' + ) # a # Abbreviated weekday name in the current locale. (Also matches full name) # From 4115f41cd34dde9cdcc9d4f7973375fe49f7ea46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 20 Jun 2022 16:33:49 +0100 Subject: [PATCH 38/47] handle `Yq`, `qY`, `mY` and `Ym` + unit tests --- r/R/dplyr-datetime-helpers.R | 6 +++--- r/tests/testthat/test-dplyr-funcs-datetime.R | 19 ++++++++++++++++++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index f1ec81d2fdd..7cba7ac7068 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -285,7 +285,7 @@ process_data_for_parsing <- function(x, # for `ym` and `my` orders we add a day ("01") # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627 augmented_x_ym <- NULL - if (any(orders %in% c("ym", "my"))) { + if (any(orders %in% c("ym", "my", "Ym", "mY"))) { # add day as "-01" if there is a "-" separator and as "01" if not augmented_x_ym <- call_binding( "if_else", @@ -298,7 +298,7 @@ process_data_for_parsing <- function(x, # for `yq` we need to transform the quarter into the start month (lubridate # behaviour) and then add 01 to parse to the first day of the quarter augmented_x_yq <- NULL - if (any(orders == "yq")) { + if (any(orders %in% c("yq", "Yq"))) { # extract everything that comes after the `-` separator, i.e. the quarter # (e.g. 4 from 2022-4) quarter_x <- call_binding("gsub", "^.*?-", "", processed_x) @@ -314,7 +314,7 @@ process_data_for_parsing <- function(x, # same as for `yq`, we need to derive the month from the quarter and add a # "01" to give us the first day of the month augmented_x_qy <- NULL - if (any(orders == "qy")) { + if (any(orders %in% c("qy", "qY"))) { quarter_x <- call_binding("gsub", "-.*$", "", processed_x) quarter_x <- quarter_x$cast(int32()) year_x <- call_binding("gsub", "^.*?-", "", processed_x) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index f1360f0a8b3..60279b580f4 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1788,6 +1788,8 @@ test_that("ym, my & yq parsers", { test_df <- tibble::tibble( ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA), my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA), + Ym_string = c("2022-05", "2022/02", "2022.03", "1979//12", "1988.09", NA), + mY_string = c("05-2022", "02/2022", "03.2022", "12//1979", "09.1988", NA), yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA), yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA), yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA), @@ -1804,8 +1806,12 @@ test_that("ym, my & yq parsers", { mutate( ym_date = ym(ym_string), ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"), + Ym_date = ym(Ym_string), + Ym_datetime = ym(Ym_string, tz = "Pacific/Marquesas"), my_date = my(my_string), my_datetime = my(my_string, tz = "Pacific/Marquesas"), + mY_date = my(mY_string), + mY_datetime = my(mY_string, tz = "Pacific/Marquesas"), yq_date_from_string = yq(yq_string), yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"), yq_date_from_numeric = yq(yq_numeric), @@ -1814,12 +1820,23 @@ test_that("ym, my & yq parsers", { yq_datetime_from_string_with_space = yq(yq_space, tz = "Pacific/Marquesas"), ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")), my_date2 = parse_date_time(my_string, orders = c("my", "myd")), + Ym_date2 = parse_date_time(Ym_string, orders = c("Ym", "ymd")), + mY_date2 = parse_date_time(mY_string, orders = c("mY", "myd")), yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"), yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"), yq_date_from_string_with_space2 = parse_date_time(yq_space, orders = "yq"), + # testing with Yq + yq_date_from_string3 = parse_date_time(yq_string, orders = "Yq"), + yq_date_from_numeric3 = parse_date_time(yq_numeric, orders = "Yq"), + yq_date_from_string_with_space3 = parse_date_time(yq_space, orders = "Yq"), + # testing with qy qy_date_from_string = parse_date_time(qy_string, orders = "qy"), qy_date_from_numeric = parse_date_time(qy_numeric, orders = "qy"), - qy_date_from_string_with_space = parse_date_time(qy_space, orders = "qy") + qy_date_from_string_with_space = parse_date_time(qy_space, orders = "qy"), + # testing with qY + qy_date_from_string2 = parse_date_time(qy_string, orders = "qY"), + qy_date_from_numeric2 = parse_date_time(qy_numeric, orders = "qY"), + qy_date_from_string_with_space2 = parse_date_time(qy_space, orders = "qY") ) %>% collect(), test_df From c48425ff79bac74e11253787cab1289d35e034da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 20 Jun 2022 17:13:31 +0100 Subject: [PATCH 39/47] clean-up --- r/tests/testthat/test-dplyr-funcs-datetime.R | 76 -------------------- 1 file changed, 76 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 60279b580f4..0e707bd3871 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2272,80 +2272,4 @@ test_that("build_formats", { build_formats("abd"), '"abd" `orders` not supported in Arrow' ) -# a -# Abbreviated weekday name in the current locale. (Also matches full name) -# -# A -# Full weekday name in the current locale. (Also matches abbreviated name). -# -# You don't need to specify a and A formats explicitly. Wday is automatically handled if preproc_wday = TRUE -# -# b (!) -# Abbreviated or full month name in the current locale. The C parser currently understands only English month names. -# -# B (!) -# Same as b. -# -# d (!) -# Day of the month as decimal number (01–31 or 0–31) -# -# H (!) -# Hours as decimal number (00–24 or 0–24). -# -# I (!) -# Hours as decimal number (01–12 or 1–12). -# -# j -# Day of year as decimal number (001–366 or 1–366). -# -# q (!*) -# Quarter (1–4). The quarter month is added to the parsed month if m element is present. -# -# m (!*) -# Month as decimal number (01–12 or 1–12). For parse_date_time also matches abbreviated and full months names as b and B formats. C parser understands only English month names. -# -# M (!) -# Minute as decimal number (00–59 or 0–59). -# -# p (!) -# AM/PM indicator in the locale. Commonly used in conjunction with I and not with H. But lubridate's C parser accepts H format as long as hour is not greater than 12. C parser understands only English locale AM/PM indicator. -# -# S (!) -# Second as decimal number (00–61 or 0–61), allowing for up to two leap-seconds (but POSIX-compliant implementations will ignore leap seconds). -# -# OS -# Fractional second. -# -# U -# Week of the year as decimal number (00–53 or 0–53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention. -# -# w -# Weekday as decimal number (0–6, Sunday is 0). -# -# W -# Week of the year as decimal number (00–53 or 0–53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention. -# -# y (!*) -# Year without century (00–99 or 0–99). In parse_date_time() also matches year with century (Y format). -# -# Y (!) -# Year with century. -# -# z (!*) -# ISO8601 signed offset in hours and minutes from UTC. For example -0800, -08:00 or -08, all represent 8 hours behind UTC. This format also matches the Z (Zulu) UTC indicator. Because base::strptime() doesn't fully support ISO8601 this format is implemented as an union of 4 formats: Ou (Z), Oz (-0800), OO (-08:00) and Oo (-08). You can use these formats as any other but it is rarely necessary. parse_date_time2() and fast_strptime() support all of these formats. -# -# Om (!*) -# Matches numeric month and English alphabetic months (Both, long and abbreviated forms). -# -# Op (!*) -# Matches AM/PM English indicator. -# -# r (*) -# Matches Ip and H orders. -# -# R (*) -# Matches HM andIMp orders. -# -# T (*) -# Matches IMSp, HMS, and HMOS orders. }) From b8c690095a93facec883e510cfa58cbf844348af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 21 Jun 2022 13:04:13 +0100 Subject: [PATCH 40/47] switch to the previous implementation, where `build_format_from_order()` returns both "-" separated and non-separated formats in a single vector --- r/R/dplyr-datetime-helpers.R | 44 ++++++++++++------------------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 7cba7ac7068..84a87de76a7 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -172,7 +172,7 @@ build_formats <- function(orders) { orders1 <- setdiff(orders, short_orders) orders2 <- intersect(orders, short_orders) orders2 <- paste0(orders2, "d") - orders <- unique(c(orders1, orders2)) + orders <- unique(c(orders2, orders1)) } if (any(orders == "yq")) { @@ -239,10 +239,14 @@ build_format_from_order <- function(order) { split_order <- strsplit(order, split = "")[[1]] outcome <- expand.grid(char_list[split_order]) - # return only formats with "-" separator, which will be removed during parsing - # if the string to be parsed does not contain a separator + # we combine formats with and without the "-" separator, we will later + # coalesce through all of them (benchmarking indicated this is a more + # computationally efficient approach rather than figuring out if a string has + # separators or not and applying only ) + # during parsing if the string to be parsed does not contain a separator formats_with_sep <- do.call(paste, c(outcome, sep = "-")) - formats_with_sep + formats_without_sep <- do.call(paste, c(outcome, sep = "")) + c(formats_with_sep, formats_without_sep) } #' Process data in preparation for parsing @@ -352,7 +356,8 @@ attempt_parsing <- function(x, formats <- build_formats(orders) # depending on the orders argument we need to do some processing to the input - # data + # data. `process_data_for_parsing()` uses the passed `orders` and not the + # derived `formats` processed_data <- process_data_for_parsing(x, orders) # build a list of expressions for parsing each processed_data element and @@ -361,7 +366,7 @@ attempt_parsing <- function(x, # if all orders are in c("ym", "my", "yq", "qy") only attempt to parse the # augmented version(s) of x - if (all(orders %in% c("ym", "my", "yq", "qy"))) { + if (all(orders %in% c("ym", "Ym", "my", "mY", "yq", "Yq", "qy", "qY"))) { parse_attempt_exprs_list$processed_x <- list() } @@ -392,31 +397,12 @@ build_strptime_exprs <- function(x, formats) { return(list()) } - # if x has a separator ("-") use the format as-is (i.e. with separator) - # if not, remove the separator map( formats, - ~ call_binding( - "if_else", - call_binding("grepl", "-", x), - build_expr( - "strptime", - x, - options = list( - format = .x, - unit = 0L, - error_is_null = TRUE - ) - ), - build_expr( - "strptime", - x, - options = list( - format = gsub("-", "", .x), - unit = 0L, - error_is_null = TRUE - ) - ) + ~ build_expr( + "strptime", + x, + options = list(format = .x, unit = 0L, error_is_null = TRUE) ) ) } From 13eac86e466beb3b745e9c29cf6e8465103bd796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 21 Jun 2022 13:05:29 +0100 Subject: [PATCH 41/47] updated `build_formats()` unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 23 +++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 0e707bd3871..9865f068d70 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1786,7 +1786,7 @@ test_that("year, month, day date/time parsers", { test_that("ym, my & yq parsers", { test_df <- tibble::tibble( - ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA), + ym_string = c("2022-05", "2022/02", "22.3", "1979//12", "88.09", NA), my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA), Ym_string = c("2022-05", "2022/02", "2022.03", "1979//12", "1988.09", NA), mY_string = c("05-2022", "02/2022", "03.2022", "12//1979", "09.1988", NA), @@ -2230,17 +2230,23 @@ test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { test_that("build_formats", { expect_equal( build_formats(c("ym", "myd", "%Y-%d-%m")), - c("%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", - "%b-%Y-%d", "%y-%d-%m", "%Y-%d-%m", "%y-%d-%B", "%Y-%d-%B", "%y-%d-%b", - "%Y-%d-%b", "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", - "%Y-%b-%d") + c( + # formats from "ym" order + "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", + "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d", + # formats from "myd" order + "%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", "%b-%Y-%d", + "%m%y%d", "%B%y%d", "%b%y%d", "%m%Y%d", "%B%Y%d", "%b%Y%d", + # formats from "%Y-%d-%m" format + "%y-%d-%m", "%Y-%d-%m", "%y-%d-%B", "%Y-%d-%B", "%y-%d-%b", "%Y-%d-%b", + "%y%d%m", "%Y%d%m", "%y%d%B", "%Y%d%B", "%y%d%b", "%Y%d%b") ) # when order is one of "yq", "qy", "ym" or"my" the data is augmented to "ymd" # or "ydm" and the formats are built accordingly ymd_formats <- c( - "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d" - ) + "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", + "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d") expect_equal( build_formats("yq"), ymd_formats @@ -2264,7 +2270,8 @@ test_that("build_formats", { expect_equal( build_formats("my"), - c("%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", "%b-%Y-%d") + c("%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", "%b-%Y-%d", + "%m%y%d", "%B%y%d", "%b%y%d", "%m%Y%d", "%B%Y%d", "%b%Y%d") ) # ab not supported yet From 7d2d8568a68b38a9b74cbc8c492d0fde676f5468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 21 Jun 2022 13:27:35 +0100 Subject: [PATCH 42/47] document `build_formats()` and `build_format_from_order()` --- r/R/dplyr-datetime-helpers.R | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 84a87de76a7..fb8e724448d 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -152,6 +152,16 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { x } +#' Build formats from multiple orders +#' +#' This function is a vectorisation of `build_format_from_order()`. It also +#' checks if the supplied orders are currently supported. +#' +#' @inheritParams process_data_for_parsing +#' +#' @return a vector of unique formats +#' +#' @noRd build_formats <- function(orders) { # only keep the letters and the underscore as separator -> allow the users to # pass strptime-like formats (with "%"). We process the data -> we need to @@ -225,6 +235,14 @@ build_formats <- function(orders) { unique(formats) } +#' Build formats from a single order +#' +#' @param order a single string date-time format, such as `"ymd"` or `"ymd_hms"` +#' +#' @return a vector of all possible formats derived from the input +#' order +#' +#' @noRd build_format_from_order <- function(order) { char_list <- list( "y" = c("%y", "%Y"), @@ -269,7 +287,7 @@ build_format_from_order <- function(order) { #' dates to be parsed. #' @param orders a character vector of date-time formats. #' -#' @return a list made up of 4 list, each a different version of x: +#' @return a list made up of 4 lists, each a different version of x: #' * `processed_x` #' * `augmented_x_ym` #' * `augmented_x_yq` @@ -387,7 +405,8 @@ attempt_parsing <- function(x, #' #' @param x an Expression corresponding to a character or numeric vector of #' dates to be parsed. -#' @param formats a vector of formats as returned by `build_format_from_order` +#' @param formats a character vector of formats as returned by +#' `build_format_from_order` #' #' @return a list of Expressions #' @noRd From 4311ae6e15250ee094c5a55fcafa5880a29361ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 21 Jun 2022 13:28:09 +0100 Subject: [PATCH 43/47] more unit tests for `build_formats()` and `build_format_from_order()` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 29 +++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 9865f068d70..77975929ec3 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2227,7 +2227,7 @@ test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { ) }) -test_that("build_formats", { +test_that("build_formats() and build_format_from_order()", { expect_equal( build_formats(c("ym", "myd", "%Y-%d-%m")), c( @@ -2242,6 +2242,14 @@ test_that("build_formats", { "%y%d%m", "%Y%d%m", "%y%d%B", "%Y%d%B", "%y%d%b", "%Y%d%b") ) + expect_equal( + build_formats("ymd_HMS"), + c("%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", + "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", + "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", + "%y%b%d%H%M%S", "%Y%b%d%H%M%S") + ) + # when order is one of "yq", "qy", "ym" or"my" the data is augmented to "ymd" # or "ydm" and the formats are built accordingly ymd_formats <- c( @@ -2279,4 +2287,23 @@ test_that("build_formats", { build_formats("abd"), '"abd" `orders` not supported in Arrow' ) + + expect_error( + build_formats("vup"), + '"vup" `orders` not supported in Arrow' + ) + + expect_equal( + build_format_from_order("ymd"), + c("%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", + "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d") + ) + + expect_equal( + build_format_from_order("ymdHMS"), + c("%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", + "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", + "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", + "%y%b%d%H%M%S", "%Y%b%d%H%M%S") + ) }) From 423eb50f8b233202f2f4a6949f8da553ee358d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 21 Jun 2022 13:30:35 +0100 Subject: [PATCH 44/47] update --- r/R/dplyr-datetime-helpers.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index fb8e724448d..83d8ae97683 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -154,8 +154,9 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { #' Build formats from multiple orders #' -#' This function is a vectorisation of `build_format_from_order()`. It also -#' checks if the supplied orders are currently supported. +#' This function is a vectorised version of `build_format_from_order()`. In +#' addition to `build_format_from_order()`, it also checks if the supplied +#' orders are currently supported. #' #' @inheritParams process_data_for_parsing #' From 4a5e62ff1891af2dae848d9370f563be999d9ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 23 Jun 2022 14:18:33 +0100 Subject: [PATCH 45/47] style --- r/R/dplyr-datetime-helpers.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 83d8ae97683..60771c86241 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -294,8 +294,7 @@ build_format_from_order <- function(order) { #' * `augmented_x_yq` #' * `augmented_x_qy` #' @noRd -process_data_for_parsing <- function(x, - orders) { +process_data_for_parsing <- function(x, orders) { processed_x <- x$cast(string()) @@ -369,8 +368,7 @@ process_data_for_parsing <- function(x, #' #' @return a list of `strptime` Expressions we can use with `coalesce` #' @noRd -attempt_parsing <- function(x, - orders) { +attempt_parsing <- function(x, orders) { # translate orders into possible formats formats <- build_formats(orders) From b1f35ffed7ea02b46768d95520f1eca85b5446cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 23 Jun 2022 14:29:53 +0100 Subject: [PATCH 46/47] reorganised unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 167 +++++++++---------- 1 file changed, 80 insertions(+), 87 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 77975929ec3..5ec613c2f89 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2025,33 +2025,23 @@ test_that("parse_date_time with hours, minutes and seconds components", { test_dates_times ) + # parse_date_time with timezone + pm_tz <- "Pacific/Marquesas" compare_dplyr_binding( .input %>% mutate( - ymd_hms_dttm = - parse_date_time(ymd_hms_string, orders = "ymd_HMS", tz = "Pacific/Marquesas"), - ymd_hm_dttm = - parse_date_time(ymd_hm_string, orders = "ymd_HM", tz = "Pacific/Marquesas"), - ymd_h_dttm = - parse_date_time(ymd_h_string, orders = "ymd_H", tz = "Pacific/Marquesas"), - dmy_hms_dttm = - parse_date_time(dmy_hms_string, orders = "dmy_HMS", tz = "Pacific/Marquesas"), - dmy_hm_dttm = - parse_date_time(dmy_hm_string, orders = "dmy_HM", tz = "Pacific/Marquesas"), - dmy_h_dttm = - parse_date_time(dmy_h_string, orders = "dmy_H", tz = "Pacific/Marquesas"), - mdy_hms_dttm = - parse_date_time(mdy_hms_string, orders = "mdy_HMS", tz = "Pacific/Marquesas"), - mdy_hm_dttm = - parse_date_time(mdy_hm_string, orders = "mdy_HM", tz = "Pacific/Marquesas"), - mdy_h_dttm = - parse_date_time(mdy_h_string, orders = "mdy_H", tz = "Pacific/Marquesas"), - ydm_hms_dttm = - parse_date_time(ydm_hms_string, orders = "ydm_HMS", tz = "Pacific/Marquesas"), - ydm_hm_dttm = - parse_date_time(ydm_hm_string, orders = "ydm_HM", tz = "Pacific/Marquesas"), - ydm_h_dttm = - parse_date_time(ydm_h_string, orders = "ydm_H", tz = "Pacific/Marquesas") + ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS", tz = pm_tz), + ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM", tz = pm_tz), + ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H", tz = pm_tz), + dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS", tz = pm_tz), + dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM", tz = pm_tz), + dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H", tz = pm_tz), + mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS", tz = pm_tz), + mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM", tz = pm_tz), + mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H", tz = pm_tz), + ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS", tz = pm_tz), + ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM", tz = pm_tz), + ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H", tz = pm_tz) ) %>% collect(), test_dates_times @@ -2075,65 +2065,12 @@ test_that("parse_date_time with hours, minutes and seconds components", { c("67-01-09 9:34:56", "1970-05-22 10:13:59", "19870822171359", NA) ) ) +}) - # test truncated formats - test_truncation_df <- tibble( - truncated_ymd_string = - c( - "2022-05-19 13:46:51", - "2022-05-18 13:46", - "2022-05-17 13", - "2022-05-16" - ) - ) - - compare_dplyr_binding( - .input %>% - mutate( - dttm = - parse_date_time( - truncated_ymd_string, - orders = "ymd_HMS", - truncated = 3 - ) - ) %>% - collect(), - test_truncation_df - ) - - # values for truncated greater than nchar(orders) - 3 not supported in Arrow - compare_dplyr_binding( - .input %>% - mutate( - dttm = - parse_date_time( - truncated_ymd_string, - orders = "ymd_HMS", - truncated = 5 - ) - ) %>% - collect(), - test_truncation_df, - warning = "a value for `truncated` > 4 not supported in Arrow" - ) - - - # we need expect_warning twice as both the arrow pipeline (because quiet = - # FALSE is not supported) and the fallback dplyr/lubridate one throw - # warnings (the lubridate one because quiet is FALSE) - expect_warning( - expect_warning( - tibble(x = c("2022-05-19 13:46:51")) %>% - arrow_table() %>% - mutate( - x_dttm = parse_date_time(x, orders = "dmy_HMS", quiet = FALSE) - ) %>% - collect(), - "`quiet = FALSE` not supported in Arrow" - ), - "All formats failed to parse" - ) - +test_that("parse_date_time with month names and HMS", { + # locale (affecting "%b% and "%B" formats) does not work properly on Windows + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done + skip_on_os("windows") test_dates_times2 <- tibble( ymd_hms_string = c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", "87Aug22201359", NA), @@ -2160,11 +2097,7 @@ test_that("parse_date_time with hours, minutes and seconds components", { ydm_h_string = c("67-09-Jan 12", "1970-22-June 20", "8722Aug20", NA) ) - # the unseparated strings are versions of "1987-08-22 20:13:59" (with %y) - - # locale (affecting "%b% and "%B" formats) does not work properly on Windows - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done - skip_on_os("windows") + # the un-separated strings are versions of "1987-08-22 20:13:59" (with %y) compare_dplyr_binding( .input %>% @@ -2187,6 +2120,66 @@ test_that("parse_date_time with hours, minutes and seconds components", { ) }) +test_that("parse_date_time with `quiet = FALSE` not supported", { + # we need expect_warning twice as both the arrow pipeline (because quiet = + # FALSE is not supported) and the fallback dplyr/lubridate one throw + # warnings (the lubridate one because quiet is FALSE) + expect_warning( + expect_warning( + tibble(x = c("2022-05-19 13:46:51")) %>% + arrow_table() %>% + mutate( + x_dttm = parse_date_time(x, orders = "dmy_HMS", quiet = FALSE) + ) %>% + collect(), + "`quiet = FALSE` not supported in Arrow" + ), + "All formats failed to parse" + ) +}) + +test_that("parse_date_time with truncated formats", { + test_truncation_df <- tibble( + truncated_ymd_string = + c( + "2022-05-19 13:46:51", + "2022-05-18 13:46", + "2022-05-17 13", + "2022-05-16" + ) + ) + + compare_dplyr_binding( + .input %>% + mutate( + dttm = + parse_date_time( + truncated_ymd_string, + orders = "ymd_HMS", + truncated = 3 + ) + ) %>% + collect(), + test_truncation_df + ) + + # values for truncated greater than nchar(orders) - 3 not supported in Arrow + compare_dplyr_binding( + .input %>% + mutate( + dttm = + parse_date_time( + truncated_ymd_string, + orders = "ymd_HMS", + truncated = 5 + ) + ) %>% + collect(), + test_truncation_df, + warning = "a value for `truncated` > 4 not supported in Arrow" + ) +}) + test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { test_df <- tibble( x = c("2022-12-31 12:59:59", "2022-01-01 12:11", "2022-01-01 12", "2022-01-01", NA), From 5fc561fdc6b5bfc83ba707353ac7d406b9e42b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 23 Jun 2022 15:22:50 +0100 Subject: [PATCH 47/47] skip test on win & R 3.6 --- r/tests/testthat/test-dplyr-funcs-datetime.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 5ec613c2f89..15af0c9f8db 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2139,6 +2139,10 @@ test_that("parse_date_time with `quiet = FALSE` not supported", { }) test_that("parse_date_time with truncated formats", { + # these functions' internals use some string processing which requires the + # RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + test_truncation_df <- tibble( truncated_ymd_string = c(