diff --git a/r/NEWS.md b/r/NEWS.md index c7a71ca956a..ed9b600050f 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -16,6 +16,12 @@ specific language governing permissions and limitations under the License. --> +# development version + + * `lubridate::parse_date_time()` datetime parser: + * currently parses only `orders` with year, month, and day components. In a future release `orders` support for other datetime components (such as hours, minutes, seconds, etc) will be added. + * strings with no separators (e.g. `"20210917"`) could be ambiguous and are not yet supported. + * the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`). # arrow 7.0.0.9000 diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 9acf8b18435..22fd4b1173e 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -154,3 +154,48 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { x } + +build_formats <- function(orders) { + # only keep the letters and the underscore as separator -> allow the users to + # pass strptime-like formats (with "%"). Processing is needed (instead of passing + # formats as-is) due to the processing of the character vector in parse_date_time() + orders <- gsub("[^A-Za-z_]", "", orders) + orders <- gsub("Y", "y", orders) + + supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") + unsupported_passed_orders <- setdiff(orders, supported_orders) + supported_passed_orders <- intersect(orders, supported_orders) + + # error only if there isn't at least one valid order we can try + if (length(supported_passed_orders) == 0) { + arrow_not_supported( + paste0( + oxford_paste( + unsupported_passed_orders + ), + " `orders`" + ) + ) + } + + formats_list <- map(orders, build_format_from_order) + purrr::flatten_chr(formats_list) +} + +build_format_from_order <- function(order) { + year_chars <- c("%y", "%Y") + month_chars <- c("%m", "%B", "%b") + day_chars <- "%d" + + outcome <- switch( + order, + "ymd" = expand.grid(year_chars, month_chars, day_chars), + "ydm" = expand.grid(year_chars, day_chars, month_chars), + "mdy" = expand.grid(month_chars, day_chars, year_chars), + "myd" = expand.grid(month_chars, year_chars, day_chars), + "dmy" = expand.grid(day_chars, month_chars, year_chars), + "dym" = expand.grid(day_chars, year_chars, month_chars) + ) + outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-") + outcome$format +} diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 4d7ea050a0a..1e56dcf8615 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -24,6 +24,7 @@ register_bindings_datetime <- function() { register_bindings_duration() register_bindings_duration_constructor() register_bindings_duration_helpers() + register_bindings_datetime_parsers() } register_bindings_datetime_utility <- function() { @@ -485,3 +486,38 @@ register_bindings_duration_helpers <- function() { abort("Duration in picoseconds not supported in Arrow.") }) } + +register_bindings_datetime_parsers <- function() { + register_binding("parse_date_time", function(x, + orders, + tz = "UTC") { + + # each order is translated into possible formats + formats <- build_formats(orders) + + # make all separators (non-letters and non-numbers) into "-" + x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) + # collapse multiple separators into a single one + x <- call_binding("gsub", "-{2,}", "-", x) + + # TODO figure out how to parse strings that have no separators + # https://issues.apache.org/jira/browse/ARROW-16446 + # we could insert separators at the "likely" positions, but it might be + # tricky given the possible combinations between dmy formats + locale + + # build a list of expressions for each format + parse_attempt_expressions <- list() + + for (i in seq_along(formats)) { + parse_attempt_expressions[[i]] <- build_expr( + "strptime", + x, + options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE) + ) + } + + coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) + + build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) + }) +} diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a7afe4c5b97..602f616bb9a 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1429,7 +1429,7 @@ test_that("make_difftime()", { ) %>% collect(), paste0("named `difftime` units other than: `second`, `minute`, `hour`,", - " `day`, and `week` not supported in Arrow.") + " `day`, and `week` not supported in Arrow.") ) ) @@ -1621,3 +1621,97 @@ test_that("`as_datetime()`", { regexp = "Float value 10.1 was truncated converting to int64" ) }) + +test_that("parse_date_time() works with year, month, and date components", { + # string processing requires RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"), + parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"), + parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") + ) %>% + collect(), + tibble::tibble( + string_ymd = c( + "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", + "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", "21:09:11", + # not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19 + # no separators and %b or %B are even more complicated (and they work in + # lubridate). not to mention locale + NA + ), + string_dmy = c( + "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", + "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", + # not yet working for strings with no separators, like "10092021", "100921", + NA + ), + string_mdy = c( + "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", + "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", + # not yet working for strings with no separators, like "09102021", "091021", + NA + ) + ) + ) + + # locale (affecting "%b% and "%B" formats) does not work properly on Windows + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done + skip_on_os("windows") + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"), + parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"), + parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") + ) %>% + collect(), + tibble::tibble( + string_ymd = c( + "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", NA + ), + string_dmy = c( + "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", NA + ), + string_mdy = c( + "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", NA + ) + ) + ) +}) + +test_that("parse_date_time() works with a mix of formats and orders", { + # string processing requires RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + test_df <- tibble( + string_combi = c("2021-09-1", "2/09//2021", "09.3.2021") + ) + + compare_dplyr_binding( + .input %>% + mutate( + date_from_string = parse_date_time( + string_combi, + orders = c("ymd", "%d/%m//%Y", "%m.%d.%Y") + ) + ) %>% + collect(), + test_df + ) +}) + +test_that("parse_date_time() doesn't work with hour, minutes, and second components", { + test_dates_times <- tibble( + date_times = c("09-01-17 12:34:56", NA) + ) + + expect_warning( + test_dates_times %>% + arrow_table() %>% + mutate(parsed_date_ymd = parse_date_time(date_times, orders = "ymd_HMS")) %>% + collect(), + '"ymd_HMS" `orders` not supported in Arrow' + ) +})