From b60655cde546081df4515c5cea070d3ca03a13eb Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 7 May 2021 17:28:22 -0400 Subject: [PATCH 1/4] Improve comments in string split functions --- r/R/dplyr.R | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 4e66c227bea..6055cbca5fa 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -543,16 +543,28 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) { arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) { function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) { - + assert_that(is.string(split)) - - # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex + + # The Arrow C++ library does not support splitting a string by a regular + # expression pattern (ARROW-12608) but the default behavior of + # base::strsplit() is to interpret the split pattern as a regex + # (fixed = FALSE). R users commonly pass non-regex split patterns to + # strsplit() without bothering to set fixed = FALSE. It would be annoying if + # that didn't work here. So: if fixed = FALSE, let's check the split pattern + # to see if it is a regex (if it contains any regex metacharacters). If not, + # then allow to proceed. if (!fixed && contains_regex(split)) { stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE) } + # warn when the user specifies both fixed = TRUE and perl = TRUE, for + # consistency with the behavior of base::strsplit() if (fixed && perl) { warning("Argument 'perl = TRUE' will be ignored", call. = FALSE) } + # since split is not a regex, proceed without any warnings or errors + # regardless of the value of perl, for consistency with the behavior of + # base::strsplit() FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits)) } } @@ -575,6 +587,10 @@ arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) { if (simplify) { warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE) } + # The max_splits option in the Arrow C++ library controls the maximum number + # of places at which the string is split, whereas the argument n to + # str_split() controls the maximum number of pieces to return. So we must + # subtract 1 from n to get max_splits. FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1L)) } } @@ -1148,7 +1164,7 @@ not_implemented_for_dataset <- function(method) { } #' Does this string contain regex metacharacters? -#' +#' #' @param string String to be tested #' @keywords internal #' @return Logical: does `string` contain regex metacharacters? From 749a520b8f25f17c967b47cadcc7e322ad72367d Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 7 May 2021 18:27:28 -0400 Subject: [PATCH 2/4] Fix typo --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 6055cbca5fa..84ccc65675b 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -550,7 +550,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) # expression pattern (ARROW-12608) but the default behavior of # base::strsplit() is to interpret the split pattern as a regex # (fixed = FALSE). R users commonly pass non-regex split patterns to - # strsplit() without bothering to set fixed = FALSE. It would be annoying if + # strsplit() without bothering to set fixed = TRUE It would be annoying if # that didn't work here. So: if fixed = FALSE, let's check the split pattern # to see if it is a regex (if it contains any regex metacharacters). If not, # then allow to proceed. From 05272e5c37fc5028e4cadc02f1b0d733a17dc392 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 7 May 2021 18:28:09 -0400 Subject: [PATCH 3/4] Fix typo --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 84ccc65675b..264c4929f72 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -550,7 +550,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) # expression pattern (ARROW-12608) but the default behavior of # base::strsplit() is to interpret the split pattern as a regex # (fixed = FALSE). R users commonly pass non-regex split patterns to - # strsplit() without bothering to set fixed = TRUE It would be annoying if + # strsplit() without bothering to set fixed = TRUE. It would be annoying if # that didn't work here. So: if fixed = FALSE, let's check the split pattern # to see if it is a regex (if it contains any regex metacharacters). If not, # then allow to proceed. From c54ac3760f0feb6b2a5d9815506a6b0ca838e96f Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 7 May 2021 19:08:26 -0400 Subject: [PATCH 4/4] Exercise arrow_*_split_whitespace options --- .../testthat/test-dplyr-string-functions.R | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 64351a83ea7..d7df83cc7a6 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -212,21 +212,21 @@ test_that("str_replace and str_replace_all", { collect(), df ) - + expect_dplyr_equal( input %>% transmute(x = str_replace_all(x, regex("^F"), "baz")) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>% collect(), df ) - + expect_dplyr_equal( input %>% transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>% @@ -307,6 +307,7 @@ test_that("arrow_*_split_whitespace functions", { df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux"))) + # use default option values expect_equivalent( df_ascii %>% Table$create() %>% @@ -322,6 +323,26 @@ test_that("arrow_*_split_whitespace functions", { df_split ) + # specify non-default option values + expect_equivalent( + df_ascii %>% + Table$create() %>% + mutate( + x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) + ) %>% + collect(), + tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))) + ) + expect_equivalent( + df_utf8 %>% + Table$create() %>% + mutate( + x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) + ) %>% + collect(), + tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))) + ) + }) test_that("errors and warnings in string splitting", {