From b60655cde546081df4515c5cea070d3ca03a13eb Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 7 May 2021 17:28:22 -0400
Subject: [PATCH 1/4] Improve comments in string split functions

---
 r/R/dplyr.R | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 4e66c227bea..6055cbca5fa 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -543,16 +543,28 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) {
 
 arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) {
   function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) {
-    
+
     assert_that(is.string(split))
-    
-    # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
+
+    # The Arrow C++ library does not support splitting a string by a regular
+    # expression pattern (ARROW-12608) but the default behavior of
+    # base::strsplit() is to interpret the split pattern as a regex
+    # (fixed = FALSE). R users commonly pass non-regex split patterns to
+    # strsplit() without bothering to set fixed = FALSE. It would be annoying if
+    # that didn't work here. So: if fixed = FALSE, let's check the split pattern
+    # to see if it is a regex (if it contains any regex metacharacters). If not,
+    # then allow to proceed.
     if (!fixed && contains_regex(split)) {
       stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE)
     }
+    # warn when the user specifies both fixed = TRUE and perl = TRUE, for
+    # consistency with the behavior of base::strsplit()
     if (fixed && perl) {
       warning("Argument 'perl = TRUE' will be ignored", call. = FALSE)
     }
+    # since split is not a regex, proceed without any warnings or errors
+    # regardless of the value of perl, for consistency with the behavior of
+    # base::strsplit()
     FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits))
   }
 }
@@ -575,6 +587,10 @@ arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) {
     if (simplify) {
       warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE)
     }
+    # The max_splits option in the Arrow C++ library controls the maximum number
+    # of places at which the string is split, whereas the argument n to
+    # str_split() controls the maximum number of pieces to return. So we must
+    # subtract 1 from n to get max_splits.
     FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1L))
   }
 }
@@ -1148,7 +1164,7 @@ not_implemented_for_dataset <- function(method) {
 }
 
 #' Does this string contain regex metacharacters?
-#' 
+#'
 #' @param string String to be tested
 #' @keywords internal
 #' @return Logical: does `string` contain regex metacharacters?

From 749a520b8f25f17c967b47cadcc7e322ad72367d Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 7 May 2021 18:27:28 -0400
Subject: [PATCH 2/4] Fix typo

---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 6055cbca5fa..84ccc65675b 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -550,7 +550,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
     # expression pattern (ARROW-12608) but the default behavior of
     # base::strsplit() is to interpret the split pattern as a regex
     # (fixed = FALSE). R users commonly pass non-regex split patterns to
-    # strsplit() without bothering to set fixed = FALSE. It would be annoying if
+    # strsplit() without bothering to set fixed = TRUE It would be annoying if
     # that didn't work here. So: if fixed = FALSE, let's check the split pattern
     # to see if it is a regex (if it contains any regex metacharacters). If not,
     # then allow to proceed.

From 05272e5c37fc5028e4cadc02f1b0d733a17dc392 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 7 May 2021 18:28:09 -0400
Subject: [PATCH 3/4] Fix typo

---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 84ccc65675b..264c4929f72 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -550,7 +550,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
     # expression pattern (ARROW-12608) but the default behavior of
     # base::strsplit() is to interpret the split pattern as a regex
     # (fixed = FALSE). R users commonly pass non-regex split patterns to
-    # strsplit() without bothering to set fixed = TRUE It would be annoying if
+    # strsplit() without bothering to set fixed = TRUE. It would be annoying if
     # that didn't work here. So: if fixed = FALSE, let's check the split pattern
     # to see if it is a regex (if it contains any regex metacharacters). If not,
     # then allow to proceed.

From c54ac3760f0feb6b2a5d9815506a6b0ca838e96f Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 7 May 2021 19:08:26 -0400
Subject: [PATCH 4/4] Exercise arrow_*_split_whitespace options

---
 .../testthat/test-dplyr-string-functions.R    | 27 ++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 64351a83ea7..d7df83cc7a6 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -212,21 +212,21 @@ test_that("str_replace and str_replace_all", {
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>%
@@ -307,6 +307,7 @@ test_that("arrow_*_split_whitespace functions", {
 
   df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux")))
 
+  # use default option values
   expect_equivalent(
     df_ascii %>%
       Table$create() %>%
@@ -322,6 +323,26 @@ test_that("arrow_*_split_whitespace functions", {
     df_split
   )
 
+  # specify non-default option values
+  expect_equivalent(
+    df_ascii %>%
+      Table$create() %>%
+      mutate(
+        x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
+      ) %>%
+      collect(),
+    tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux")))
+  )
+  expect_equivalent(
+    df_utf8 %>%
+      Table$create() %>%
+      mutate(
+        x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
+      ) %>%
+      collect(),
+    tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux")))
+  )
+
 })
 
 test_that("errors and warnings in string splitting", {