diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 4e66c227bea..264c4929f72 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -543,16 +543,28 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) { arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) { function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) { - + assert_that(is.string(split)) - - # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex + + # The Arrow C++ library does not support splitting a string by a regular + # expression pattern (ARROW-12608) but the default behavior of + # base::strsplit() is to interpret the split pattern as a regex + # (fixed = FALSE). R users commonly pass non-regex split patterns to + # strsplit() without bothering to set fixed = TRUE. It would be annoying if + # that didn't work here. So: if fixed = FALSE, let's check the split pattern + # to see if it is a regex (if it contains any regex metacharacters). If not, + # then allow to proceed. if (!fixed && contains_regex(split)) { stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE) } + # warn when the user specifies both fixed = TRUE and perl = TRUE, for + # consistency with the behavior of base::strsplit() if (fixed && perl) { warning("Argument 'perl = TRUE' will be ignored", call. = FALSE) } + # since split is not a regex, proceed without any warnings or errors + # regardless of the value of perl, for consistency with the behavior of + # base::strsplit() FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits)) } } @@ -575,6 +587,10 @@ arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) { if (simplify) { warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE) } + # The max_splits option in the Arrow C++ library controls the maximum number + # of places at which the string is split, whereas the argument n to + # str_split() controls the maximum number of pieces to return. So we must + # subtract 1 from n to get max_splits. FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1L)) } } @@ -1148,7 +1164,7 @@ not_implemented_for_dataset <- function(method) { } #' Does this string contain regex metacharacters? -#' +#' #' @param string String to be tested #' @keywords internal #' @return Logical: does `string` contain regex metacharacters? diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 64351a83ea7..d7df83cc7a6 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -212,21 +212,21 @@ test_that("str_replace and str_replace_all", { collect(), df ) - + expect_dplyr_equal( input %>% transmute(x = str_replace_all(x, regex("^F"), "baz")) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>% collect(), df ) - + expect_dplyr_equal( input %>% transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>% @@ -307,6 +307,7 @@ test_that("arrow_*_split_whitespace functions", { df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux"))) + # use default option values expect_equivalent( df_ascii %>% Table$create() %>% @@ -322,6 +323,26 @@ test_that("arrow_*_split_whitespace functions", { df_split ) + # specify non-default option values + expect_equivalent( + df_ascii %>% + Table$create() %>% + mutate( + x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) + ) %>% + collect(), + tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))) + ) + expect_equivalent( + df_utf8 %>% + Table$create() %>% + mutate( + x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) + ) %>% + collect(), + tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))) + ) + }) test_that("errors and warnings in string splitting", {