diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 1855a4a46ea..607104d7ce5 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -159,6 +159,26 @@ build_formats <- function(orders) { orders <- gsub("[^A-Za-z_]", "", orders) orders <- gsub("Y", "y", orders) + # we separate "ym', "my", and "yq" from the rest of the `orders` vector and + # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd` + # this is needed for 2 reasons: + # 1. strptime does not parse "2022-05" -> we add "-01", thus changing the format, + # 2. for equivalence to lubridate, which parses `ym` to the first day of the month + short_orders <- c("ym", "my") + + if (any(orders %in% short_orders)) { + orders1 <- setdiff(orders, short_orders) + orders2 <- intersect(orders, short_orders) + orders2 <- paste0(orders2, "d") + orders <- unique(c(orders1, orders2)) + } + + if (any(orders == "yq")) { + orders1 <- setdiff(orders, "yq") + orders2 <- "ymd" + orders <- unique(c(orders1, orders2)) + } + supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") unsupported_passed_orders <- setdiff(orders, supported_orders) supported_passed_orders <- intersect(orders, supported_orders) @@ -176,7 +196,8 @@ build_formats <- function(orders) { } formats_list <- map(orders, build_format_from_order) - purrr::flatten_chr(formats_list) + formats <- purrr::flatten_chr(formats_list) + unique(formats) } build_format_from_order <- function(order) { diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index e0c65d64cc0..02ec35bda26 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -493,27 +493,99 @@ register_bindings_datetime_parsers <- function() { # each order is translated into possible formats formats <- build_formats(orders) + x <- x$cast(string()) + # make all separators (non-letters and non-numbers) into "-" x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) # collapse multiple separators into a single one x <- call_binding("gsub", "-{2,}", "-", x) + # we need to transform `x` when orders are `ym`, `my`, and `yq` + # for `ym` and `my` orders we add a day ("01") + augmented_x <- NULL + if (any(orders %in% c("ym", "my"))) { + augmented_x <- call_binding("paste0", x, "-01") + } + + # for `yq` we need to transform the quarter into the start month (lubridate + # behaviour) and then add 01 to parse to the first day of the quarter + augmented_x2 <- NULL + if (any(orders == "yq")) { + # extract everything that comes after the `-` separator, i.e. the quarter + # (e.g. 4 from 2022-4) + quarter_x <- call_binding("gsub", "^.*?-", "", x) + # we should probably error if quarter is not in 1:4 + # extract everything that comes before the `-`, i.e. the year (e.g. 2002 + # in 2002-4) + year_x <- call_binding("gsub", "-.*$", "", x) + quarter_x <- quarter_x$cast(int32()) + month_x <- (quarter_x - 1) * 3 + 1 + augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01") + } + # TODO figure out how to parse strings that have no separators # https://issues.apache.org/jira/browse/ARROW-16446 # we could insert separators at the "likely" positions, but it might be # tricky given the possible combinations between dmy formats + locale # build a list of expressions for each format - parse_attempt_expressions <- list() - - for (i in seq_along(formats)) { - parse_attempt_expressions[[i]] <- build_expr( + parse_attempt_expressions <- map( + formats, + ~ build_expr( "strptime", x, - options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE) + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) + ) + ) + + # build separate expression lists of parsing attempts for the orders that + # need an augmented `x` + # list for attempts when orders %in% c("ym", "my") + parse_attempt_exp_augmented_x <- list() + + if (!is.null(augmented_x)) { + parse_attempt_exp_augmented_x <- map( + formats, + ~ build_expr( + "strptime", + augmented_x, + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) + ) ) } + # list for attempts when orders %in% c("yq") + parse_attempt_exp_augmented_x2 <- list() + if (!is.null(augmented_x2)) { + parse_attempt_exp_augmented_x2 <- map( + formats, + ~ build_expr( + "strptime", + augmented_x2, + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) + ) + ) + } + + # combine all attempts expressions in prep for coalesce + parse_attempt_expressions <- c( + parse_attempt_expressions, + parse_attempt_exp_augmented_x, + parse_attempt_exp_augmented_x2 + ) + coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) # we need this binding to be able to handle a NULL `tz`, which will then be @@ -527,7 +599,7 @@ register_bindings_datetime_parsers <- function() { }) - ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") + ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my", "yq") ymd_parser_map_factory <- function(order) { force(order) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 42448e8243f..b1223630153 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1735,7 +1735,7 @@ test_that("parse_date_time() doesn't work with hour, minutes, and second compone ) }) -test_that("year, month, day date/time parsers work", { +test_that("year, month, day date/time parsers", { test_df <- tibble::tibble( ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"), ydm_string = c("2022-11-05", "2022/12/05", "22.13-05"), @@ -1776,3 +1776,39 @@ test_that("year, month, day date/time parsers work", { test_df ) }) + +test_that("ym, my & yq parsers", { + test_df <- tibble::tibble( + ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA), + my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA), + yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA), + yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA), + yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA) + ) + + # these functions' internals use some string processing which requires the + # RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + compare_dplyr_binding( + .input %>% + mutate( + ym_date = ym(ym_string), + ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"), + my_date = my(my_string), + my_datetime = my(my_string, tz = "Pacific/Marquesas"), + yq_date_from_string = yq(yq_string), + yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"), + yq_date_from_numeric = yq(yq_numeric), + yq_datetime_from_numeric = yq(yq_numeric, tz = "Pacific/Marquesas"), + yq_date_from_string_with_space = yq(yq_space), + yq_datetime_from_string_with_space = yq(yq_space, tz = "Pacific/Marquesas"), + ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")), + my_date2 = parse_date_time(my_string, orders = c("my", "myd")), + yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"), + yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"), + yq_date_from_string_with_space2 = parse_date_time(yq_space, orders = "yq") + ) %>% + collect(), + test_df + ) +})