From ed0478cc63ea95e138a62b92d1e8c5cf8f1ea5f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 16 May 2022 11:16:35 +0100 Subject: [PATCH 01/11] `"ym"` and `"my"` parsers + unit tests --- r/R/dplyr-datetime-helpers.R | 4 ++++ r/R/dplyr-funcs-datetime.R | 7 ++++++- r/tests/testthat/test-dplyr-funcs-datetime.R | 21 +++++++++++++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 1855a4a46ea..5bf6cceea01 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -159,6 +159,10 @@ build_formats <- function(orders) { orders <- gsub("[^A-Za-z_]", "", orders) orders <- gsub("Y", "y", orders) + if (orders %in% c("ym", "my")) { + orders <- paste0(orders, "d") + } + supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") unsupported_passed_orders <- setdiff(orders, supported_orders) supported_passed_orders <- intersect(orders, supported_orders) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index e0c65d64cc0..525cb8a2413 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -498,6 +498,11 @@ register_bindings_datetime_parsers <- function() { # collapse multiple separators into a single one x <- call_binding("gsub", "-{2,}", "-", x) + # add a day (01) for "ym" and "my" orders + if (orders %in% c("ym", "my")) { + x <- call_binding("paste0", x, "-01") + } + # TODO figure out how to parse strings that have no separators # https://issues.apache.org/jira/browse/ARROW-16446 # we could insert separators at the "likely" positions, but it might be @@ -527,7 +532,7 @@ register_bindings_datetime_parsers <- function() { }) - ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") + ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my") ymd_parser_map_factory <- function(order) { force(order) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 42448e8243f..89cd05d68e4 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1735,7 +1735,7 @@ test_that("parse_date_time() doesn't work with hour, minutes, and second compone ) }) -test_that("year, month, day date/time parsers work", { +test_that("year, month, day date/time parsers", { test_df <- tibble::tibble( ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"), ydm_string = c("2022-11-05", "2022/12/05", "22.13-05"), @@ -1776,3 +1776,22 @@ test_that("year, month, day date/time parsers work", { test_df ) }) + +test_that("ym, my & yq parsers", { + test_df <- tibble::tibble( + ym_string = c("2022-05", "2022/02", "22.03", NA), + my_string = c("05-2022", "02/2022", "03.22", NA) + ) + + compare_dplyr_binding( + .input %>% + mutate( + ym_date = ym(ym_string), + ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"), + my_date = my(my_string), + my_datetime = my(my_string, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) From 20e5cabfcaaa1177dfb8e946ae5aa59edb03e8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 16 May 2022 15:08:05 +0100 Subject: [PATCH 02/11] x can be either a numeric or string; added logic for implementing parsing cases when we augment x (from ym and yq -> ymd, and from my -> myd) --- r/R/dplyr-funcs-datetime.R | 49 +++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 525cb8a2413..d2568dbcd3e 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -493,14 +493,27 @@ register_bindings_datetime_parsers <- function() { # each order is translated into possible formats formats <- build_formats(orders) + x <- x$cast(string()) + # make all separators (non-letters and non-numbers) into "-" x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) # collapse multiple separators into a single one x <- call_binding("gsub", "-{2,}", "-", x) # add a day (01) for "ym" and "my" orders - if (orders %in% c("ym", "my")) { - x <- call_binding("paste0", x, "-01") + augmented_x <- NULL + if (any(orders %in% c("ym", "my"))) { + augmented_x <- call_binding("paste0", x, "-01") + } + + augmented_x2 <- NULL + if (any(orders == "yq")) { + quarter_x <- call_binding("gsub", "^.*?-", "", x) + # we should probably error if quarter is not in 1:4 + year_x <- call_binding("gsub", "-.*$", "", x) + quarter_x <- quarter_x$cast(int32()) + month_x <- (quarter_x - 1) * 3 + 1 + augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01") } # TODO figure out how to parse strings that have no separators @@ -519,6 +532,36 @@ register_bindings_datetime_parsers <- function() { ) } + parse_attempt_exp_augmented_x <- list() + + if (!is.null(augmented_x)) { + for (i in seq_along(formats)) { + parse_attempt_expressions[[i]] <- build_expr( + "strptime", + augmented_x, + options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE) + ) + } + } + + parse_attempt_exp_augmented_x2 <- list() + + if (!is.null(augmented_x2)) { + for (i in seq_along(formats)) { + parse_attempt_expressions[[i]] <- build_expr( + "strptime", + augmented_x2, + options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE) + ) + } + } + + parse_attempt_expressions <- c( + parse_attempt_expressions, + parse_attempt_exp_augmented_x, + parse_attempt_exp_augmented_x2 + ) + coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) # we need this binding to be able to handle a NULL `tz`, which will then be @@ -532,7 +575,7 @@ register_bindings_datetime_parsers <- function() { }) - ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my") + ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my", "yq") ymd_parser_map_factory <- function(order) { force(order) From 0051609a0b040bb5de5b6e68261128033d408d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 16 May 2022 15:08:35 +0100 Subject: [PATCH 03/11] build_formats builds correctly the formats for ym, my and yq orders --- r/R/dplyr-datetime-helpers.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 5bf6cceea01..ee0d7ceb597 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -159,8 +159,19 @@ build_formats <- function(orders) { orders <- gsub("[^A-Za-z_]", "", orders) orders <- gsub("Y", "y", orders) - if (orders %in% c("ym", "my")) { - orders <- paste0(orders, "d") + short_orders <- c("ym", "my") + + if (any(orders %in% short_orders)) { + orders1 <- setdiff(orders, short_orders) + orders2 <- intersect(orders, short_orders) + orders2 <- paste0(orders2, "d") + orders <- unique(c(orders1, orders2)) + } + + if (any(orders == "yq")) { + orders1 <- setdiff(orders, "yq") + orders2 <- "ymd" + orders <- unique(c(orders1, orders2)) } supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") From d172d82ac8bcd26755768b5e8ace3f8816100b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 16 May 2022 15:08:45 +0100 Subject: [PATCH 04/11] more unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 89cd05d68e4..c19e2095c23 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1779,8 +1779,10 @@ test_that("year, month, day date/time parsers", { test_that("ym, my & yq parsers", { test_df <- tibble::tibble( - ym_string = c("2022-05", "2022/02", "22.03", NA), - my_string = c("05-2022", "02/2022", "03.22", NA) + ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA), + my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA), + yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA), + yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA), ) compare_dplyr_binding( @@ -1789,7 +1791,15 @@ test_that("ym, my & yq parsers", { ym_date = ym(ym_string), ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"), my_date = my(my_string), - my_datetime = my(my_string, tz = "Pacific/Marquesas") + my_datetime = my(my_string, tz = "Pacific/Marquesas"), + yq_date_from_string = yq(yq_string), + yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"), + yq_date_from_numeric = yq(yq_numeric), + yq_datetime_from_numeric = yq(yq_numeric, tz = "Pacific/Marquesas"), + ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")), + my_date2 = parse_date_time(my_string, orders = c("my", "myd")), + yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"), + yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq") ) %>% collect(), test_df From bdaf97ea79857c6ad150c29dfabdc1af214005ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 16 May 2022 15:09:34 +0100 Subject: [PATCH 05/11] skip tests if RE2 not available --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index c19e2095c23..82d37d2d418 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1785,6 +1785,9 @@ test_that("ym, my & yq parsers", { yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA), ) + # these functions' internals use some string processing which requires the + # RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( From 39b670906427455cbde5f65ad79f290b09b77b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 16 May 2022 17:42:20 +0100 Subject: [PATCH 06/11] remove duplicate formats --- r/R/dplyr-datetime-helpers.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index ee0d7ceb597..46e2d1b9b0e 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -191,7 +191,8 @@ build_formats <- function(orders) { } formats_list <- map(orders, build_format_from_order) - purrr::flatten_chr(formats_list) + formats <- purrr::flatten_chr(formats_list) + unique(formats) } build_format_from_order <- function(order) { From cbe518103625e36f407cbaee17f8fe97d30ab9d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 17 May 2022 13:37:47 +0100 Subject: [PATCH 07/11] added comment for `build_formats` with `ym`, `my`, and `yq` orders --- r/R/dplyr-datetime-helpers.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 46e2d1b9b0e..bbc364ec840 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -159,6 +159,12 @@ build_formats <- function(orders) { orders <- gsub("[^A-Za-z_]", "", orders) orders <- gsub("Y", "y", orders) + # we need a different logic in order to deal with "ym', "my", and "yq" orders + # we separate them from the rest of the `orders` vector and transform them. + # `ym` and `yq` become `ymd` & `my` becomes `myd` + # this is needed because strptime does not parse "2022-05", so we add "-01", + # thus changing the format, and for equivalence with lubridate, which parses + # `ym` to the first day of the month short_orders <- c("ym", "my") if (any(orders %in% short_orders)) { From 78251baff27f9196412d12df19af5967fae02a54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 17 May 2022 13:57:58 +0100 Subject: [PATCH 08/11] added comments in `parse_date_time()` --- r/R/dplyr-funcs-datetime.R | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index d2568dbcd3e..15d6d42864f 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -500,16 +500,23 @@ register_bindings_datetime_parsers <- function() { # collapse multiple separators into a single one x <- call_binding("gsub", "-{2,}", "-", x) - # add a day (01) for "ym" and "my" orders + # we need to transform `x` when orders are `ym`, `my`, and `yq` + # for `ym` and `my` orders we add a day ("01") augmented_x <- NULL if (any(orders %in% c("ym", "my"))) { augmented_x <- call_binding("paste0", x, "-01") } + # for `yq` we need to transform the quarter into the start month (lubridate + # behaviour) and then add 01 to parse to the first day of the quarter augmented_x2 <- NULL if (any(orders == "yq")) { + # extract everything that comes after the `-` separator, i.e. the quarter + # (e.g. 4 from 2022-4) quarter_x <- call_binding("gsub", "^.*?-", "", x) # we should probably error if quarter is not in 1:4 + # extract everything that comes before the `-`, i.e. the year (e.g. 2002 + # in 2002-4) year_x <- call_binding("gsub", "-.*$", "", x) quarter_x <- quarter_x$cast(int32()) month_x <- (quarter_x - 1) * 3 + 1 @@ -532,6 +539,9 @@ register_bindings_datetime_parsers <- function() { ) } + # build separate expression lists of parsing attempts for the orders that + # need an augmented `x` + # list for attempts when orders %in% c("ym", "my") parse_attempt_exp_augmented_x <- list() if (!is.null(augmented_x)) { @@ -544,6 +554,7 @@ register_bindings_datetime_parsers <- function() { } } + # list for attempts when orders %in% c("yq") parse_attempt_exp_augmented_x2 <- list() if (!is.null(augmented_x2)) { @@ -556,6 +567,7 @@ register_bindings_datetime_parsers <- function() { } } + # combine all attempts expressions in prep for coalesce parse_attempt_expressions <- c( parse_attempt_expressions, parse_attempt_exp_augmented_x, From bca48d0c4515dd027cf6afccc7c0e890e4c75eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 17 May 2022 16:41:49 +0100 Subject: [PATCH 09/11] add unit test for `yq` with space --- r/tests/testthat/test-dplyr-funcs-datetime.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 82d37d2d418..b1223630153 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1783,6 +1783,7 @@ test_that("ym, my & yq parsers", { my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA), yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA), yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA), + yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA) ) # these functions' internals use some string processing which requires the @@ -1799,10 +1800,13 @@ test_that("ym, my & yq parsers", { yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"), yq_date_from_numeric = yq(yq_numeric), yq_datetime_from_numeric = yq(yq_numeric, tz = "Pacific/Marquesas"), + yq_date_from_string_with_space = yq(yq_space), + yq_datetime_from_string_with_space = yq(yq_space, tz = "Pacific/Marquesas"), ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")), my_date2 = parse_date_time(my_string, orders = c("my", "myd")), yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"), - yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq") + yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"), + yq_date_from_string_with_space2 = parse_date_time(yq_space, orders = "yq") ) %>% collect(), test_df From 5af2721857c663f0cbc382799010efd97d084f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 17 May 2022 20:36:23 +0100 Subject: [PATCH 10/11] update `build_formats` comment --- r/R/dplyr-datetime-helpers.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index bbc364ec840..607104d7ce5 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -159,12 +159,11 @@ build_formats <- function(orders) { orders <- gsub("[^A-Za-z_]", "", orders) orders <- gsub("Y", "y", orders) - # we need a different logic in order to deal with "ym', "my", and "yq" orders - # we separate them from the rest of the `orders` vector and transform them. - # `ym` and `yq` become `ymd` & `my` becomes `myd` - # this is needed because strptime does not parse "2022-05", so we add "-01", - # thus changing the format, and for equivalence with lubridate, which parses - # `ym` to the first day of the month + # we separate "ym', "my", and "yq" from the rest of the `orders` vector and + # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd` + # this is needed for 2 reasons: + # 1. strptime does not parse "2022-05" -> we add "-01", thus changing the format, + # 2. for equivalence to lubridate, which parses `ym` to the first day of the month short_orders <- c("ym", "my") if (any(orders %in% short_orders)) { From 9368c88facc3bc84af6e4147892adeb7364da7b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 17 May 2022 20:50:30 +0100 Subject: [PATCH 11/11] use `purrr::map()` instead of a `for` loop --- r/R/dplyr-funcs-datetime.R | 42 ++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 15d6d42864f..02ec35bda26 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -529,15 +529,18 @@ register_bindings_datetime_parsers <- function() { # tricky given the possible combinations between dmy formats + locale # build a list of expressions for each format - parse_attempt_expressions <- list() - - for (i in seq_along(formats)) { - parse_attempt_expressions[[i]] <- build_expr( + parse_attempt_expressions <- map( + formats, + ~ build_expr( "strptime", x, - options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE) + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) ) - } + ) # build separate expression lists of parsing attempts for the orders that # need an augmented `x` @@ -545,26 +548,35 @@ register_bindings_datetime_parsers <- function() { parse_attempt_exp_augmented_x <- list() if (!is.null(augmented_x)) { - for (i in seq_along(formats)) { - parse_attempt_expressions[[i]] <- build_expr( + parse_attempt_exp_augmented_x <- map( + formats, + ~ build_expr( "strptime", augmented_x, - options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE) + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) ) - } + ) } # list for attempts when orders %in% c("yq") parse_attempt_exp_augmented_x2 <- list() - if (!is.null(augmented_x2)) { - for (i in seq_along(formats)) { - parse_attempt_expressions[[i]] <- build_expr( + parse_attempt_exp_augmented_x2 <- map( + formats, + ~ build_expr( "strptime", augmented_x2, - options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE) + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) ) - } + ) } # combine all attempts expressions in prep for coalesce