From 7aa91c62a889c8559c6cabd8c576d19311c348ea Mon Sep 17 00:00:00 2001 From: Rok Date: Sat, 2 Jul 2022 12:21:44 +0200 Subject: [PATCH 01/11] Initial commit. --- r/tests/testthat/test-dplyr-funcs-datetime.R | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b8bd28e970c..7ee3584c60e 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -150,6 +150,30 @@ test_that("strptime", { as.POSIXct(tstamp), ignore_attr = "tzone" ) + + tz <- "Pacific/Marquesas" + times <- seq(as.POSIXct("1999-02-07", tz = tz), as.POSIXct("2000-01-01", tz = tz), by = "hour") + times <- c(as.POSIXct("1999-12-31T12:34:56.01", tz = "UTC")) + + # The following formats are currently not supported by strptime: %q %Op + formats <- c("%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", + "%S", "%q", "%M", "%p", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z") + base_format <- c("%Y-%m-%d") + + for (fmt in formats) { + fmt <- paste(base_format, fmt) + test_df <- tibble::tibble(x = strftime(times, format = fmt)) + expect_equal( + test_df %>% + arrow_table() %>% + mutate(x = strptime(x, format = fmt)) %>% + collect(), + test_df %>% + mutate(x = strptime(x, format = fmt)) %>% + mutate(x = as.POSIXct(x)) %>% + collect() + ) + } }) test_that("strptime returns NA when format doesn't match the data", { From 39d0087959d3d55a9dd0bc8c1919d9229c4e4fad Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 22 Jul 2022 14:35:22 +0200 Subject: [PATCH 02/11] Apply suggestions from code review Co-authored-by: Dewey Dunnington --- r/tests/testthat/test-dplyr-funcs-datetime.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 7ee3584c60e..4ad1149e778 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -156,8 +156,10 @@ test_that("strptime", { times <- c(as.POSIXct("1999-12-31T12:34:56.01", tz = "UTC")) # The following formats are currently not supported by strptime: %q %Op - formats <- c("%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", - "%S", "%q", "%M", "%p", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z") + formats <- c( + "%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", + "%S", "%q", "%M", "%p", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z" + ) base_format <- c("%Y-%m-%d") for (fmt in formats) { @@ -169,8 +171,7 @@ test_that("strptime", { mutate(x = strptime(x, format = fmt)) %>% collect(), test_df %>% - mutate(x = strptime(x, format = fmt)) %>% - mutate(x = as.POSIXct(x)) %>% + mutate(x = as.POSIXct(strptime(x, format = fmt))) %>% collect() ) } From 1f9bd399b1ca8c2e6c516a2a917a941885db738e Mon Sep 17 00:00:00 2001 From: Rok Date: Fri, 22 Jul 2022 15:31:17 +0200 Subject: [PATCH 03/11] Review feedback --- r/tests/testthat/test-dplyr-funcs-datetime.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 4ad1149e778..afe190f8738 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -152,13 +152,14 @@ test_that("strptime", { ) tz <- "Pacific/Marquesas" - times <- seq(as.POSIXct("1999-02-07", tz = tz), as.POSIXct("2000-01-01", tz = tz), by = "hour") - times <- c(as.POSIXct("1999-12-31T12:34:56.01", tz = "UTC")) + set.seed(42) + times <- seq(as.POSIXct("1999-02-07", tz = tz), as.POSIXct("2000-01-01", tz = tz), by = "sec") + times <- sample(times, 100) # The following formats are currently not supported by strptime: %q %Op formats <- c( "%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", - "%S", "%q", "%M", "%p", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z" + "%S", "%q", "%M", "%I%p", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z" ) base_format <- c("%Y-%m-%d") From 18ac094eb05798b47d008e551c3dddd3371d3cb3 Mon Sep 17 00:00:00 2001 From: Rok Date: Fri, 22 Jul 2022 19:07:07 +0200 Subject: [PATCH 04/11] Review feedback --- r/R/dplyr-datetime-helpers.R | 45 ++++++-------------- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 2 files changed, 13 insertions(+), 34 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index efcc62ff4ef..3d1918a8f5d 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -166,6 +166,18 @@ build_formats <- function(orders) { orders <- gsub("[^A-Za-z]", "", orders) orders <- gsub("Y", "y", orders) + invalid_orders <- nchar(gsub("[ymdqIHMS]", "", orders)) > 0 + if (any(invalid_orders)) { + arrow_not_supported( + paste0( + oxford_paste( + orders[invalid_orders] + ), + " `orders`" + ) + ) + } + # we separate "ym', "my", and "yq" from the rest of the `orders` vector and # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd` # this is needed for 2 reasons: @@ -192,39 +204,6 @@ build_formats <- function(orders) { orders <- unique(c(orders1, orders2)) } - ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym") - ymd_hms_orders <- c( - "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS", - "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H" - ) - # support "%I" hour formats - ymd_ims_orders <- gsub("H", "I", ymd_hms_orders) - - supported_orders <- c( - ymd_orders, - ymd_hms_orders, - gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as order separators - gsub("_", "", ymd_hms_orders), - ymd_ims_orders, - gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as order separators - gsub("_", "", ymd_ims_orders) - ) - - unsupported_passed_orders <- setdiff(orders, supported_orders) - supported_passed_orders <- intersect(orders, supported_orders) - - # error only if there isn't at least one valid order we can try - if (length(supported_passed_orders) == 0) { - arrow_not_supported( - paste0( - oxford_paste( - unsupported_passed_orders - ), - " `orders`" - ) - ) - } - formats_list <- map(orders, build_format_from_order) formats <- purrr::flatten_chr(formats_list) unique(formats) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index afe190f8738..7e754129b2b 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2671,7 +2671,7 @@ test_that("build_formats() and build_format_from_order()", { ) expect_error( - build_formats("vup"), + build_formats(c("vup", "ymd")), '"vup" `orders` not supported in Arrow' ) From 2b1cebed221c07cdc4d0a2260b3c4bc57c53e9ab Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 25 Jul 2022 07:05:24 +0200 Subject: [PATCH 05/11] Allowing more formats --- r/R/dplyr-datetime-helpers.R | 71 +++++++++----------- r/tests/testthat/test-dplyr-funcs-datetime.R | 56 +++++++++++++-- 2 files changed, 83 insertions(+), 44 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 3d1918a8f5d..0afd38f5875 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -158,32 +158,13 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { #' #' @noRd build_formats <- function(orders) { - # only keep the letters and the underscore as separator -> allow the users to - # pass strptime-like formats (with "%"). We process the data -> we need to - # process the `orders` (even if supplied in the desired format) - # Processing is needed (instead of passing - # formats as-is) due to the processing of the character vector in parse_date_time() - orders <- gsub("[^A-Za-z]", "", orders) - orders <- gsub("Y", "y", orders) - - invalid_orders <- nchar(gsub("[ymdqIHMS]", "", orders)) > 0 - if (any(invalid_orders)) { - arrow_not_supported( - paste0( - oxford_paste( - orders[invalid_orders] - ), - " `orders`" - ) - ) - } - # we separate "ym', "my", and "yq" from the rest of the `orders` vector and # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd` # this is needed for 2 reasons: # 1. strptime does not parse "2022-05" -> we add "-01", thus changing the format, # 2. for equivalence to lubridate, which parses `ym` to the first day of the month - short_orders <- c("ym", "my") + short_orders <- c("ym", "my", "Ym", "mY") + quarter_orders <- c("yq", "Yq", "qy", "qY") if (any(orders %in% short_orders)) { orders1 <- setdiff(orders, short_orders) @@ -191,20 +172,26 @@ build_formats <- function(orders) { orders2 <- paste0(orders2, "d") orders <- unique(c(orders2, orders1)) } - - if (any(orders == "yq")) { - orders1 <- setdiff(orders, "yq") - orders2 <- "ymd" - orders <- unique(c(orders1, orders2)) + if (any(orders %in% quarter_orders)) { + orders <- c(setdiff(orders, quarter_orders), "ymd") } + orders <- unique(orders) + + formats_list <- map(orders, build_format_from_order) + formats_length <- map(map(formats_list, nchar), max) + invalid_orders <- formats_length < 6 - if (any(orders == "qy")) { - orders1 <- setdiff(orders, "qy") - orders2 <- "ymd" - orders <- unique(c(orders1, orders2)) + if (any(invalid_orders)) { + arrow_not_supported( + paste0( + oxford_paste( + orders[invalid_orders] + ), + " `orders`" + ) + ) } - formats_list <- map(orders, build_format_from_order) formats <- purrr::flatten_chr(formats_list) unique(formats) } @@ -219,25 +206,29 @@ build_formats <- function(orders) { #' @noRd build_format_from_order <- function(order) { char_list <- list( + "T" = "%H-%M-%S", + "%T" = "%H-%M-%S", "y" = c("%y", "%Y"), + "Y" = c("%y", "%Y"), "m" = c("%m", "%B", "%b"), - "d" = "%d", - "H" = "%H", - "M" = "%M", - "S" = "%S", - "I" = "%I" + "%Y" = c("%y", "%Y"), + "%m" = c("%m", "%B", "%b") ) - split_order <- strsplit(order, split = "")[[1]] + available_formats <- strsplit("aAbBdHjmTpSqMIUwWyYrRz", "")[[1]] + formats <- stringr::str_extract_all(order, + "(%[a|A|b|B|d|H|j|m|T|p|S|q|M|I|U|w|W|y|Y|r|R|z])|([aAbBdHjmTpSqMIUwWyYrRz])")[[1]] + formats <- ifelse(formats %in% names(char_list), char_list[formats], formats) + formats <- ifelse(formats %in% available_formats, paste0("%", formats, ""), formats) + formats <- expand.grid(formats) - outcome <- expand.grid(char_list[split_order]) # we combine formats with and without the "-" separator, we will later # coalesce through all of them (benchmarking indicated this is a more # computationally efficient approach rather than figuring out if a string has # separators or not and applying only ) # during parsing if the string to be parsed does not contain a separator - formats_with_sep <- do.call(paste, c(outcome, sep = "-")) - formats_without_sep <- do.call(paste, c(outcome, sep = "")) + formats_with_sep <- do.call(paste, c(formats, sep = "-")) + formats_without_sep <- gsub("-", "", formats_with_sep) c(formats_with_sep, formats_without_sep) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 7e754129b2b..f7e207ad932 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -161,7 +161,12 @@ test_that("strptime", { "%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", "%S", "%q", "%M", "%I%p", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z" ) + formats2 <- c( + "a", "A", "b", "B", "d", "H", "j", "m", "Om", "T", "OS", "Ip", + "S", "q", "M", "Ip", "U", "w", "W", "y", "Y", "r", "R", "Tz" + ) base_format <- c("%Y-%m-%d") + base_format2 <- c("ymd") for (fmt in formats) { fmt <- paste(base_format, fmt) @@ -176,6 +181,31 @@ test_that("strptime", { collect() ) } + + for (fmt in formats2) { + fmt2 <- paste(base_format2, fmt) + fmt <- paste(base_format, paste0("%", fmt)) + test_df <- tibble::tibble(x = strftime(times, format = fmt)) + expect_equal( + test_df %>% + arrow_table() %>% + mutate(x = strptime(x, format = fmt2)) %>% + collect(), + test_df %>% + mutate(x = as.POSIXct(strptime(x, format = fmt2))) %>% + collect() + ) + } + + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time(string_1, orders = "Y-%b-d-%T") + ) %>% + collect(), + tibble::tibble(string_1 = c("2022-Feb-11-12:23:45", NA)) + ) + }) test_that("strptime returns NA when format doesn't match the data", { @@ -2664,10 +2694,14 @@ test_that("build_formats() and build_format_from_order()", { ) ) - # ab not supported yet - expect_error( - build_formats("abd"), - '"abd" `orders` not supported in Arrow' + expect_equal( + build_format_from_order("abp"), + c("%a", "%b","%p", "%a", "%b", "%p") + ) + + expect_equal( + build_format_from_order("uYM"), + c("%y-%M", "%Y-%M", "%y%M", "%Y%M") ) expect_error( @@ -2712,6 +2746,20 @@ test_that("build_formats() and build_format_from_order()", { "%y%b%d%H", "%Y%b%d%H" ) ) + + expect_equal( + build_formats("y-%b-d-%T"), + c("%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S") + ) + + expect_equal( + build_formats("%YdmH%p"), + c( + "%y-%d-%m-%H-%p", "%Y-%d-%m-%H-%p", "%y-%d-%B-%H-%p", "%Y-%d-%B-%H-%p", + "%y-%d-%b-%H-%p", "%Y-%d-%b-%H-%p", "%y%d%m%H%p", "%Y%d%m%H%p", + "%y%d%B%H%p", "%Y%d%B%H%p", "%y%d%b%H%p", "%Y%d%b%H%p" + ) + ) }) From fb5a63ac11241a1f544155650c4e73ed033b54a4 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 26 Jul 2022 17:24:38 +0200 Subject: [PATCH 06/11] Review feedback --- r/R/dplyr-datetime-helpers.R | 31 +++++++++++--------- r/tests/testthat/test-dplyr-funcs-datetime.R | 20 ++++++++----- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 0afd38f5875..ff6b331c50a 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -158,13 +158,21 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") { #' #' @noRd build_formats <- function(orders) { + # only keep the letters and the underscore as separator -> allow the users to + # pass strptime-like formats (with "%"). We process the data -> we need to + # process the `orders` (even if supplied in the desired format) + # Processing is needed (instead of passing + # formats as-is) due to the processing of the character vector in parse_date_time() + orders <- gsub("[^A-Za-z]", "", orders) + orders <- gsub("Y", "y", orders) + # we separate "ym', "my", and "yq" from the rest of the `orders` vector and # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd` # this is needed for 2 reasons: # 1. strptime does not parse "2022-05" -> we add "-01", thus changing the format, # 2. for equivalence to lubridate, which parses `ym` to the first day of the month - short_orders <- c("ym", "my", "Ym", "mY") - quarter_orders <- c("yq", "Yq", "qy", "qY") + short_orders <- c("ym", "my", "yOm", "Omy") + quarter_orders <- c("yq", "qy") if (any(orders %in% short_orders)) { orders1 <- setdiff(orders, short_orders) @@ -206,28 +214,23 @@ build_formats <- function(orders) { #' @noRd build_format_from_order <- function(order) { char_list <- list( - "T" = "%H-%M-%S", "%T" = "%H-%M-%S", - "y" = c("%y", "%Y"), - "Y" = c("%y", "%Y"), - "m" = c("%m", "%B", "%b"), - "%Y" = c("%y", "%Y"), - "%m" = c("%m", "%B", "%b") + "%y" = c("%y", "%Y"), + "%m" = c("%m", "%B", "%b"), + "%b" = c("%m", "%B", "%b") ) - available_formats <- strsplit("aAbBdHjmTpSqMIUwWyYrRz", "")[[1]] - formats <- stringr::str_extract_all(order, - "(%[a|A|b|B|d|H|j|m|T|p|S|q|M|I|U|w|W|y|Y|r|R|z])|([aAbBdHjmTpSqMIUwWyYrRz])")[[1]] + formats <- stringr::str_extract_all(order, "%{0,1}(O*[a-zA-Z])")[[1]] + formats <- paste0("%", formats) formats <- ifelse(formats %in% names(char_list), char_list[formats], formats) - formats <- ifelse(formats %in% available_formats, paste0("%", formats, ""), formats) - formats <- expand.grid(formats) + outcome <- expand.grid(formats) # we combine formats with and without the "-" separator, we will later # coalesce through all of them (benchmarking indicated this is a more # computationally efficient approach rather than figuring out if a string has # separators or not and applying only ) # during parsing if the string to be parsed does not contain a separator - formats_with_sep <- do.call(paste, c(formats, sep = "-")) + formats_with_sep <- do.call(paste, c(outcome, sep = "-")) formats_without_sep <- gsub("-", "", formats_with_sep) c(formats_with_sep, formats_without_sep) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index f7e207ad932..ba1c6cdd9d6 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2659,7 +2659,7 @@ test_that("build_formats() and build_format_from_order()", { ) ) - # when order is one of "yq", "qy", "ym" or"my" the data is augmented to "ymd" + # when order is one of "yq", "qy", "ym" or "my" the data is augmented to "ymd" # or "ydm" and the formats are built accordingly ymd_formats <- c( "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", @@ -2686,6 +2686,11 @@ test_that("build_formats() and build_format_from_order()", { ymd_formats ) + expect_equal( + build_formats(c("yOm", "yOmd")), + c("%y-%Om-%d", "%Y-%Om-%d", "%y%Om%d", "%Y%Om%d") + ) + expect_equal( build_formats("my"), c( @@ -2696,12 +2701,7 @@ test_that("build_formats() and build_format_from_order()", { expect_equal( build_format_from_order("abp"), - c("%a", "%b","%p", "%a", "%b", "%p") - ) - - expect_equal( - build_format_from_order("uYM"), - c("%y-%M", "%Y-%M", "%y%M", "%Y%M") + c("%a-%m-%p", "%a-%B-%p", "%a-%b-%p", "%a%m%p", "%a%B%p", "%a%b%p") ) expect_error( @@ -2749,7 +2749,11 @@ test_that("build_formats() and build_format_from_order()", { expect_equal( build_formats("y-%b-d-%T"), - c("%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S") + c( + "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", "%Y-%B-%d-%H-%M-%S", + "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", "%y%m%d%H%M%S", "%Y%m%d%H%M%S", + "%y%B%d%H%M%S", "%Y%B%d%H%M%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S" + ) ) expect_equal( From 126808293633fa65ef69782d47aac757f6026cd3 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 26 Jul 2022 18:46:21 +0200 Subject: [PATCH 07/11] Review feedback --- r/R/dplyr-datetime-helpers.R | 2 +- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index ff6b331c50a..74585e5da6c 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -220,7 +220,7 @@ build_format_from_order <- function(order) { "%b" = c("%m", "%B", "%b") ) - formats <- stringr::str_extract_all(order, "%{0,1}(O*[a-zA-Z])")[[1]] + formats <- regmatches(order, gregexpr("(O{0,1}[a-zA-Z])", order))[[1]] formats <- paste0("%", formats) formats <- ifelse(formats %in% names(char_list), char_list[formats], formats) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index ba1c6cdd9d6..005642d2480 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -156,7 +156,7 @@ test_that("strptime", { times <- seq(as.POSIXct("1999-02-07", tz = tz), as.POSIXct("2000-01-01", tz = tz), by = "sec") times <- sample(times, 100) - # The following formats are currently not supported by strptime: %q %Op + # %Op format is currently not supported by strptime formats <- c( "%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", "%S", "%q", "%M", "%I%p", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z" From f1000f51ac6d8a502608a937f2fb8dc94ba55103 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 26 Jul 2022 20:20:59 +0200 Subject: [PATCH 08/11] Subset of formats on windows --- r/tests/testthat/test-dplyr-funcs-datetime.R | 29 +++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 005642d2480..a12e356388f 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -157,14 +157,23 @@ test_that("strptime", { times <- sample(times, 100) # %Op format is currently not supported by strptime - formats <- c( - "%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", - "%S", "%q", "%M", "%I%p", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z" - ) - formats2 <- c( - "a", "A", "b", "B", "d", "H", "j", "m", "Om", "T", "OS", "Ip", - "S", "q", "M", "Ip", "U", "w", "W", "y", "Y", "r", "R", "Tz" - ) + # We support a subset of flags on Windows + if (tolower(Sys.info()[["sysname"]]) == "windows") { + formats <- c( + "%d", "%H", "%j", "%m", "%T", "%S", "%q", "%M", "%U", "%w", "%W", "%y", "%Y", "%R" + ) + formats2 <- c("d", "H", "j", "m", "T", "S", "q", "M", "U", "w", "W", "y", "Y", "R") + } else { + formats <- c( + "%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", + "%S", "%q", "%M", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z" + ) + formats2 <- c( + "a", "A", "b", "B", "d", "H", "j", "m", "Om", "T", "OS", "Ip", + "S", "q", "M", "U", "w", "W", "y", "Y", "r", "R", "Tz" + ) + } + base_format <- c("%Y-%m-%d") base_format2 <- c("ymd") @@ -200,10 +209,10 @@ test_that("strptime", { compare_dplyr_binding( .input %>% mutate( - parsed_date_ymd = parse_date_time(string_1, orders = "Y-%b-d-%T") + parsed_date_ymd = parse_date_time(string_1, orders = "Y-%m-d-%T") ) %>% collect(), - tibble::tibble(string_1 = c("2022-Feb-11-12:23:45", NA)) + tibble::tibble(string_1 = c("2022-02-11-12:23:45", NA)) ) }) From 363c9fb877ec896e501d804f46abaa68be1d4c6d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 27 Jul 2022 14:32:00 +0200 Subject: [PATCH 09/11] Apply suggestions from code review Co-authored-by: Dewey Dunnington --- r/R/dplyr-datetime-helpers.R | 8 ++++---- r/tests/testthat/test-dplyr-funcs-datetime.R | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 74585e5da6c..742313a8f88 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -186,7 +186,7 @@ build_formats <- function(orders) { orders <- unique(orders) formats_list <- map(orders, build_format_from_order) - formats_length <- map(map(formats_list, nchar), max) + formats_length <- map_int(map_int(formats_list, nchar), max) invalid_orders <- formats_length < 6 if (any(invalid_orders)) { @@ -215,9 +215,9 @@ build_formats <- function(orders) { build_format_from_order <- function(order) { char_list <- list( "%T" = "%H-%M-%S", - "%y" = c("%y", "%Y"), - "%m" = c("%m", "%B", "%b"), - "%b" = c("%m", "%B", "%b") + "%y" = "%y", "%Y", + "%m" = "%m", "%B", "%b", + "%b" = "%m", "%B", "%b" ) formats <- regmatches(order, gregexpr("(O{0,1}[a-zA-Z])", order))[[1]] diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a12e356388f..4ac56fe1324 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -174,8 +174,8 @@ test_that("strptime", { ) } - base_format <- c("%Y-%m-%d") - base_format2 <- c("ymd") + base_format <- "%Y-%m-%d" + base_format2 <- "ymd" for (fmt in formats) { fmt <- paste(base_format, fmt) From 4ecaa52b8db01d32af2d6052871eb5a0bf1133a3 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 28 Jul 2022 00:11:57 +0200 Subject: [PATCH 10/11] Review feedback --- r/R/dplyr-datetime-helpers.R | 70 ++++-- r/tests/testthat/test-dplyr-funcs-datetime.R | 219 ++++++++++++++----- 2 files changed, 216 insertions(+), 73 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 742313a8f88..06b42ab9048 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -163,6 +163,21 @@ build_formats <- function(orders) { # process the `orders` (even if supplied in the desired format) # Processing is needed (instead of passing # formats as-is) due to the processing of the character vector in parse_date_time() + + valid_formats <- "[a|A|b|B|d|H|I|j|m|Om|M|Op|p|q|OS|S|U|w|W|y|Y|r|R|T|z]" + invalid_orders <- !grepl(valid_formats, orders) + + if (any(invalid_orders)) { + arrow_not_supported( + paste0( + oxford_paste( + orders[invalid_orders] + ), + " `orders`" + ) + ) + } + orders <- gsub("[^A-Za-z]", "", orders) orders <- gsub("Y", "y", orders) @@ -186,20 +201,6 @@ build_formats <- function(orders) { orders <- unique(orders) formats_list <- map(orders, build_format_from_order) - formats_length <- map_int(map_int(formats_list, nchar), max) - invalid_orders <- formats_length < 6 - - if (any(invalid_orders)) { - arrow_not_supported( - paste0( - oxford_paste( - orders[invalid_orders] - ), - " `orders`" - ) - ) - } - formats <- purrr::flatten_chr(formats_list) unique(formats) } @@ -213,23 +214,44 @@ build_formats <- function(orders) { #' #' @noRd build_format_from_order <- function(order) { + month_formats <- c("%m", "%B", "%b") + week_formats <- c("%a", "%A") + year_formats <- c("%y", "%Y") char_list <- list( - "%T" = "%H-%M-%S", - "%y" = "%y", "%Y", - "%m" = "%m", "%B", "%b", - "%b" = "%m", "%B", "%b" + "%y" = year_formats, + "%Y" = year_formats, + "%m" = month_formats, + "%Om" = month_formats, + "%b" = month_formats, + "%B" = month_formats, + "%a" = week_formats, + "%A" = week_formats, + "%d" = "%d", + "%H" = "%H", + "%j" = "%j", + "%OS" = "%OS", + "%I" = "%I", + "%S" = "%S", + "%q" = "%q", + "%M" = "%M", + "%U" = "%U", + "%w" = "%w", + "%W" = "%W", + "%p" = "%p", + "%z" = "%z", + "%r" = c("%H", "%I-%p"), + "%R" = c("%H-%M", "%I-%M-%p"), + "%T" = c("%I-%M-%S-%p", "%H-%M-%S", "%H-%M-%OS") ) - formats <- regmatches(order, gregexpr("(O{0,1}[a-zA-Z])", order))[[1]] - formats <- paste0("%", formats) - formats <- ifelse(formats %in% names(char_list), char_list[formats], formats) + split_order <- regmatches(order, gregexpr("(O{0,1}[a-zA-Z])", order))[[1]] + split_order <- paste0("%", split_order) + outcome <- expand.grid(char_list[split_order]) - outcome <- expand.grid(formats) # we combine formats with and without the "-" separator, we will later # coalesce through all of them (benchmarking indicated this is a more # computationally efficient approach rather than figuring out if a string has - # separators or not and applying only ) - # during parsing if the string to be parsed does not contain a separator + # separators or not and applying the relevant order afterwards) formats_with_sep <- do.call(paste, c(outcome, sep = "-")) formats_without_sep <- gsub("-", "", formats_with_sep) c(formats_with_sep, formats_without_sep) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 4ac56fe1324..b72b7a5598c 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -151,32 +151,32 @@ test_that("strptime", { ignore_attr = "tzone" ) + # these functions' internals use some string processing which requires the + # RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + tz <- "Pacific/Marquesas" set.seed(42) times <- seq(as.POSIXct("1999-02-07", tz = tz), as.POSIXct("2000-01-01", tz = tz), by = "sec") times <- sample(times, 100) - # %Op format is currently not supported by strptime - # We support a subset of flags on Windows - if (tolower(Sys.info()[["sysname"]]) == "windows") { - formats <- c( - "%d", "%H", "%j", "%m", "%T", "%S", "%q", "%M", "%U", "%w", "%W", "%y", "%Y", "%R" - ) - formats2 <- c("d", "H", "j", "m", "T", "S", "q", "M", "U", "w", "W", "y", "Y", "R") - } else { - formats <- c( - "%a", "%A", "%b", "%B", "%d", "%H", "%j", "%m", "%Om", "%T", "%OS", "%I%p", - "%S", "%q", "%M", "%U", "%w", "%W", "%y", "%Y", "%r", "%R", "%T%z" - ) - formats2 <- c( - "a", "A", "b", "B", "d", "H", "j", "m", "Om", "T", "OS", "Ip", - "S", "q", "M", "U", "w", "W", "y", "Y", "r", "R", "Tz" - ) - } - + # Op format is currently not supported by strptime + formats <- c( + "%d", "%H", "%j", "%m", "%T", + "%S", "%q", "%M", "%U", "%w", "%W", "%y", "%Y", "%R", "%T" + ) + formats2 <- c( + "a", "A", "b", "B", "d", "H", "j", "m", "Om", "T", "OS", "Ip", + "S", "q", "M", "U", "w", "W", "y", "Y", "r", "R", "Tz" + ) base_format <- "%Y-%m-%d" base_format2 <- "ymd" + # Some formats are not supported on Windows + if (!tolower(Sys.info()[["sysname"]]) == "windows") { + formats <- c(formats, "%a", "%A", "%b", "%B", "%Om", "%OS", "%I%p", "%r", "%T%z") + } + for (fmt in formats) { fmt <- paste(base_format, fmt) test_df <- tibble::tibble(x = strftime(times, format = fmt)) @@ -2110,6 +2110,118 @@ test_that("ym, my & yq parsers", { ) }) +test_that("parse_date_time's other formats", { + # these functions' internals use some string processing which requires the + # RE2 library (not available on Windows with R 3.6) + skip_if_not_available("re2") + + # q, OS, Op, z formats are currently not supported by strptime + test_df <- tibble( + string_a = c("2023-12-30-Sat", NA), + string_A = c("2023-12-30-Saturday", NA), + string_b = c("2023-12-30-Dec", NA), + string_B = c("2023-12-30-December", NA), + string_H = c("2023-12-30-01", NA), + string_I = c("2023-12-30-01", NA), + string_j = c("2023-12-30-364", NA), + string_M = c("2023-12-30-00", NA), + string_p = c("2023-12-30-AM", NA), + string_S = c("2023-12-30-00", NA), + string_U = c("2023-12-30-52", NA), + string_w = c("2023-12-30-6", NA), + string_W = c("2023-12-30-52", NA), + string_y = c("23-12-30", NA), + string_Y = c("2023-12-30", NA), + string_Om = c("2023-01-30", NA), + string_r = c("2023-12-30-01", NA), + string_R = c("2023-12-30-01:00", NA), + string_T = c("2023-12-30-01:00:00", NA) + ) + + compare_dplyr_binding( + .input %>% + mutate( + parsed_H = parse_date_time(string_H, orders = "%Y-%m-%d-%H"), + parsed_I = parse_date_time(string_I, orders = "%Y-%m-%d-%I"), + parsed_j = parse_date_time(string_j, orders = "%Y-%m-%d-%j"), + parsed_M = parse_date_time(string_M, orders = "%Y-%m-%d-%M"), + parsed_S = parse_date_time(string_S, orders = "%Y-%m-%d-%S"), + parsed_U = parse_date_time(string_U, orders = "%Y-%m-%d-%U"), + parsed_w = parse_date_time(string_w, orders = "%Y-%m-%d-%w"), + parsed_W = parse_date_time(string_W, orders = "%Y-%m-%d-%W"), + parsed_y = parse_date_time(string_y, orders = "%y-%m-%d"), + parsed_Y = parse_date_time(string_Y, orders = "%Y-%m-%d"), + parsed_R = parse_date_time(string_R, orders = "%Y-%m-%d-%R"), + parsed_T = parse_date_time(string_T, orders = "%Y-%m-%d-%T") + ) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate( + parsed_H = parse_date_time(string_H, orders = "ymdH"), + parsed_I = parse_date_time(string_I, orders = "ymdI"), + parsed_j = parse_date_time(string_j, orders = "ymdj"), + parsed_M = parse_date_time(string_M, orders = "ymdM"), + parsed_S = parse_date_time(string_S, orders = "ymdS"), + parsed_U = parse_date_time(string_U, orders = "ymdU"), + parsed_w = parse_date_time(string_w, orders = "ymdw"), + parsed_W = parse_date_time(string_W, orders = "ymdW"), + parsed_y = parse_date_time(string_y, orders = "ymd"), + parsed_Y = parse_date_time(string_Y, orders = "Ymd"), + parsed_R = parse_date_time(string_R, orders = "ymdR"), + parsed_T = parse_date_time(string_T, orders = "ymdT") + ) %>% + collect(), + test_df + ) + + # Some formats are not supported on Windows + if (!tolower(Sys.info()[["sysname"]]) == "windows") { + compare_dplyr_binding( + .input %>% + mutate( + parsed_a = parse_date_time(string_a, orders = "%Y-%m-%d-%a"), + parsed_A = parse_date_time(string_A, orders = "%Y-%m-%d-%A"), + parsed_b = parse_date_time(string_b, orders = "%Y-%m-%d-%b"), + parsed_B = parse_date_time(string_B, orders = "%Y-%m-%d-%B"), + parsed_Om = parse_date_time(string_Om, orders = "%Y-%Om-%d"), + parsed_p = parse_date_time(string_p, orders = "%Y-%m-%d-%p"), + parsed_r = parse_date_time(string_r, orders = "%Y-%m-%d-%r") + ) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate( + parsed_a = parse_date_time(string_a, orders = "ymda"), + parsed_A = parse_date_time(string_A, orders = "ymdA"), + parsed_b = parse_date_time(string_b, orders = "ymdb"), + parsed_B = parse_date_time(string_B, orders = "ymdB"), + parsed_Om = parse_date_time(string_Om, orders = "yOmd"), + parsed_p = parse_date_time(string_p, orders = "ymdp"), + parsed_r = parse_date_time(string_r, orders = "ymdr") + ) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate( + parsed_date_ymd = parse_date_time(string_1, orders = "Y-%b-d-%T") + ) %>% + collect(), + tibble::tibble(string_1 = c("2022-Feb-11-12:23:45", NA)) + ) + } + +}) + test_that("lubridate's fast_strptime", { compare_dplyr_binding( .input %>% @@ -2643,6 +2755,19 @@ test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { }) test_that("build_formats() and build_format_from_order()", { + + ymd_formats <- c( + "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", + "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d" + ) + + ymd_hms_formats <- c( + "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", + "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", + "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", + "%y%b%d%H%M%S", "%Y%b%d%H%M%S" + ) + expect_equal( build_formats(c("ym", "myd", "%Y-%d-%m")), c( @@ -2660,20 +2785,11 @@ test_that("build_formats() and build_format_from_order()", { expect_equal( build_formats("ymd_HMS"), - c( - "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", - "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", - "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", - "%y%b%d%H%M%S", "%Y%b%d%H%M%S" - ) + ymd_hms_formats ) # when order is one of "yq", "qy", "ym" or "my" the data is augmented to "ymd" # or "ydm" and the formats are built accordingly - ymd_formats <- c( - "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", - "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d" - ) expect_equal( build_formats("yq"), ymd_formats @@ -2695,11 +2811,6 @@ test_that("build_formats() and build_format_from_order()", { ymd_formats ) - expect_equal( - build_formats(c("yOm", "yOmd")), - c("%y-%Om-%d", "%Y-%Om-%d", "%y%Om%d", "%Y%Om%d") - ) - expect_equal( build_formats("my"), c( @@ -2710,30 +2821,35 @@ test_that("build_formats() and build_format_from_order()", { expect_equal( build_format_from_order("abp"), - c("%a-%m-%p", "%a-%B-%p", "%a-%b-%p", "%a%m%p", "%a%B%p", "%a%b%p") + c( + "%a-%m-%p", "%A-%m-%p", "%a-%B-%p", "%A-%B-%p", "%a-%b-%p", "%A-%b-%p", + "%a%m%p", "%A%m%p", "%a%B%p", "%A%B%p", "%a%b%p", "%A%b%p" + ) ) expect_error( - build_formats(c("vup", "ymd")), - '"vup" `orders` not supported in Arrow' + build_formats(c("vu", "ymd")), + '"vu" `orders` not supported in Arrow' + ) + + expect_equal( + build_formats("wIpz"), + c("%w-%I-%p-%z", "%w%I%p%z") + ) + + expect_equal( + build_formats("yOmd"), + ymd_formats ) expect_equal( build_format_from_order("ymd"), - c( - "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", - "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d" - ) + ymd_formats ) expect_equal( build_format_from_order("ymdHMS"), - c( - "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", - "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", - "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", - "%y%b%d%H%M%S", "%Y%b%d%H%M%S" - ) + ymd_hms_formats ) expect_equal( @@ -2759,9 +2875,14 @@ test_that("build_formats() and build_format_from_order()", { expect_equal( build_formats("y-%b-d-%T"), c( - "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", "%Y-%B-%d-%H-%M-%S", - "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", "%y%m%d%H%M%S", "%Y%m%d%H%M%S", - "%y%B%d%H%M%S", "%Y%B%d%H%M%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S" + "%y-%m-%d-%I-%M-%S-%p", "%Y-%m-%d-%I-%M-%S-%p", "%y-%B-%d-%I-%M-%S-%p", "%Y-%B-%d-%I-%M-%S-%p", + "%y-%b-%d-%I-%M-%S-%p", "%Y-%b-%d-%I-%M-%S-%p", "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", + "%y-%B-%d-%H-%M-%S", "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", + "%y-%m-%d-%H-%M-%OS", "%Y-%m-%d-%H-%M-%OS", "%y-%B-%d-%H-%M-%OS", "%Y-%B-%d-%H-%M-%OS", + "%y-%b-%d-%H-%M-%OS", "%Y-%b-%d-%H-%M-%OS", "%y%m%d%I%M%S%p", "%Y%m%d%I%M%S%p", + "%y%B%d%I%M%S%p", "%Y%B%d%I%M%S%p", "%y%b%d%I%M%S%p", "%Y%b%d%I%M%S%p", "%y%m%d%H%M%S", + "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S", "%y%m%d%H%M%OS", + "%Y%m%d%H%M%OS", "%y%B%d%H%M%OS", "%Y%B%d%H%M%OS", "%y%b%d%H%M%OS", "%Y%b%d%H%M%OS" ) ) From 17b46ab264beb29e1cc73db322c09990e8965df7 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 28 Jul 2022 17:36:30 +0200 Subject: [PATCH 11/11] Review feedback --- r/R/dplyr-datetime-helpers.R | 9 +++++---- r/tests/testthat/test-dplyr-funcs-datetime.R | 5 +++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 06b42ab9048..4c9a8d1bf05 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -164,8 +164,11 @@ build_formats <- function(orders) { # Processing is needed (instead of passing # formats as-is) due to the processing of the character vector in parse_date_time() + orders <- gsub("[^A-Za-z]", "", orders) + orders <- gsub("Y", "y", orders) + valid_formats <- "[a|A|b|B|d|H|I|j|m|Om|M|Op|p|q|OS|S|U|w|W|y|Y|r|R|T|z]" - invalid_orders <- !grepl(valid_formats, orders) + invalid_orders <- nchar(gsub(valid_formats, "", orders)) > 0 if (any(invalid_orders)) { arrow_not_supported( @@ -178,9 +181,6 @@ build_formats <- function(orders) { ) } - orders <- gsub("[^A-Za-z]", "", orders) - orders <- gsub("Y", "y", orders) - # we separate "ym', "my", and "yq" from the rest of the `orders` vector and # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd` # this is needed for 2 reasons: @@ -238,6 +238,7 @@ build_format_from_order <- function(order) { "%w" = "%w", "%W" = "%W", "%p" = "%p", + "%Op" = "%Op", "%z" = "%z", "%r" = c("%H", "%I-%p"), "%R" = c("%H-%M", "%I-%M-%p"), diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b72b7a5598c..25fe23a28db 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -2832,6 +2832,11 @@ test_that("build_formats() and build_format_from_order()", { '"vu" `orders` not supported in Arrow' ) + expect_error( + build_formats(c("abc")), + '"abc" `orders` not supported in Arrow' + ) + expect_equal( build_formats("wIpz"), c("%w-%I-%p-%z", "%w%I%p%z")