From 4710e2ec56cb845ea6b0bf575aa5d5cf8f4bc357 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Wed, 28 Apr 2021 14:47:24 +0100
Subject: [PATCH 01/37] Add split_pattern function

---
 r/NAMESPACE                                  |  1 +
 r/R/compute.R                                | 17 +++++
 r/_pkgdown.yml                               |  1 +
 r/man/split_pattern.Rd                       | 23 ++++++
 r/src/compute.cpp                            | 10 +++
 r/tests/testthat/test-compute-string-split.R | 79 ++++++++++++++++++++
 6 files changed, 131 insertions(+)
 create mode 100644 r/man/split_pattern.Rd
 create mode 100644 r/tests/testthat/test-compute-string-split.R
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 607177235e9..08a5e9c532b 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -263,6 +263,7 @@ export(record_batch)
 export(s3_bucket)
 export(schema)
 export(set_cpu_count)
+export(split_pattern)
 export(starts_with)
 export(string)
 export(struct)
diff --git a/r/R/compute.R b/r/R/compute.R
index 0641bf1615c..aea92009508 100644
--- a/r/R/compute.R
+++ b/r/R/compute.R
@@ -284,3 +284,20 @@ cast_options <- function(safe = TRUE, ...) {
   )
   modifyList(opts, list(...))
 }
+
+#' String splitting
+#' 
+#' Split a string into a list of strings
+#' 
+#' @param x `Scalar`, `Array` or `ChunkedArray`
+#' @param pattern The exact substring to look for inside input values.
+#' @param max_splits Maximum number of splits. Default -1 means no limit.
+#' @param reverse When true, the splitting is done from the end of the string.
+#' 
+#' @return An object (same type as `x`) containing lists of strings
+#' 
+#' @export
+split_pattern <- function(x, pattern, max_splits = -1L, reverse = FALSE){
+  split_options <- list(pattern = pattern, max_splits = max_splits, reverse = reverse)
+  call_function("split_pattern", x, options = split_options)
+}
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index bb77b416aab..822573d080e 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -162,6 +162,7 @@ reference:
       - match_arrow
       - value_counts
       - list_compute_functions
+      - split_pattern
   - title: Configuration
     contents:
       - arrow_info
diff --git a/r/man/split_pattern.Rd b/r/man/split_pattern.Rd
new file mode 100644
index 00000000000..179ce866b1a
--- /dev/null
+++ b/r/man/split_pattern.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute.R
+\name{split_pattern}
+\alias{split_pattern}
+\title{String splitting}
+\usage{
+split_pattern(x, pattern, max_splits = -1L, reverse = FALSE)
+}
+\arguments{
+\item{x}{\code{Scalar}, \code{Array} or \code{ChunkedArray}}
+
+\item{pattern}{The exact substring to look for inside input values.}
+
+\item{max_splits}{Maximum number of splits. Default -1 means no limit.}
+
+\item{reverse}{When true, the splitting is done from the end of the string.}
+}
+\value{
+An object (same type as \code{x}) containing lists of strings
+}
+\description{
+Split a string into a list of strings
+}
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 34bc3bea456..80895fddd8a 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -233,6 +233,16 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
                                      max_replacements);
   }
 
+  if (func_name == "split_pattern") {
+    using Options = arrow::compute::SplitPatternOptions;
+    int64_t max_splits = -1;
+    if (!Rf_isNull(options["max_splits"])) {
+      max_splits = cpp11::as_cpp<int64_t>(options["max_splits"]);
+    }
+    return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]),
+                                     max_splits, cpp11::as_cpp<bool>(options["reverse"]));
+  }
+
   return nullptr;
 }
 
diff --git a/r/tests/testthat/test-compute-string-split.R b/r/tests/testthat/test-compute-string-split.R
new file mode 100644
index 00000000000..b2fd9a44f79
--- /dev/null
+++ b/r/tests/testthat/test-compute-string-split.R
@@ -0,0 +1,79 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_str <-c("foobar, foo, bar", "baz, qux, quux.")
+
+test_that("split_pattern", {
+  
+  test_scalar <- Scalar$create(test_str[1])
+  test_array <- Array$create(test_str)
+  test_charray <- chunked_array(test_str[1], test_str[2])
+  
+  expect_equal(
+    split_pattern(test_scalar, pattern = ","),
+    Scalar$create(list(c("foobar", " foo", " bar")))
+  )
+  
+  expect_equal(
+    split_pattern(test_array, pattern = ","),
+    Array$create(list(c("foobar", " foo", " bar"), c("baz", " qux", " quux.")))
+  )
+  
+  expect_equal(
+    split_pattern(test_charray, pattern = ","),
+    ChunkedArray$create(list(c("foobar", " foo", " bar"), c("baz", " qux", " quux.")))
+  )
+  
+})
+
+test_that("split_pattern with max splits and reverse", {
+  
+  test_scalar <- Scalar$create(test_str[1])
+  test_array <- Array$create(test_str)
+  test_charray <- chunked_array(test_str[1], test_str[2])
+  
+  expect_equal(
+    split_pattern(test_scalar, pattern = ",", max_split = 1),
+    Scalar$create(list(c("foobar", " foo, bar")))
+  )
+  
+  expect_equal(
+    split_pattern(test_array, pattern = ",", max_split = 1),
+    Array$create(list(c("foobar", " foo, bar"), c("baz", " qux, quux.")))
+  )
+  
+  expect_equal(
+    split_pattern(test_charray, pattern = ",", max_split = 1),
+    ChunkedArray$create(list(c("foobar", " foo, bar"), c("baz", " qux, quux.")))
+  )
+  
+  expect_equal(
+    split_pattern(test_scalar, pattern = ",", max_split = 1, reverse = TRUE),
+    Scalar$create(list(c("foobar, foo", " bar")))
+  )
+  
+  expect_equal(
+    split_pattern(test_array, pattern = ",", max_split = 1, reverse = TRUE),
+    Array$create(list(c("foobar, foo", " bar"), c("baz, qux", " quux.")))
+  )
+  
+  expect_equal(
+    split_pattern(test_charray, pattern = ",", max_split = 1, reverse = TRUE),
+    ChunkedArray$create(list(c("foobar, foo", " bar"), c("baz, qux", " quux.")))
+  )
+  
+})

From 33a4447afbce05ed579e7f52b9bebe3b595a4308 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Wed, 28 Apr 2021 19:03:37 +0100
Subject: [PATCH 02/37] Remove references to split_pattern arguments in favour
 of strsplit arguments

---
 r/R/compute.R | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/r/R/compute.R b/r/R/compute.R
index aea92009508..470e768c133 100644
--- a/r/R/compute.R
+++ b/r/R/compute.R
@@ -285,19 +285,8 @@ cast_options <- function(safe = TRUE, ...) {
   modifyList(opts, list(...))
 }
 
-#' String splitting
-#' 
-#' Split a string into a list of strings
-#' 
-#' @param x `Scalar`, `Array` or `ChunkedArray`
-#' @param pattern The exact substring to look for inside input values.
-#' @param max_splits Maximum number of splits. Default -1 means no limit.
-#' @param reverse When true, the splitting is done from the end of the string.
-#' 
-#' @return An object (same type as `x`) containing lists of strings
-#' 
 #' @export
-split_pattern <- function(x, pattern, max_splits = -1L, reverse = FALSE){
-  split_options <- list(pattern = pattern, max_splits = max_splits, reverse = reverse)
+strsplit.ArrowDatum <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE){
+  split_options <- list(pattern = split)
   call_function("split_pattern", x, options = split_options)
 }

From a370b9ab8ca2c7b5e42ec4dbe570626fb8ccd849 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Wed, 28 Apr 2021 19:03:58 +0100
Subject: [PATCH 03/37] Remove reference to split_pattern from pkgdown

---
 r/_pkgdown.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 822573d080e..bb77b416aab 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -162,7 +162,6 @@ reference:
       - match_arrow
       - value_counts
       - list_compute_functions
-      - split_pattern
   - title: Configuration
     contents:
       - arrow_info

From 4677f3173ccb1887ceeb4e62af75731ffe898d59 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Wed, 28 Apr 2021 19:50:55 +0100
Subject: [PATCH 04/37] Update tests for strsplit

---
 r/NAMESPACE                                  |  4 +-
 r/R/compute.R                                | 11 +++-
 r/man/split_pattern.Rd                       | 23 --------
 r/tests/testthat/test-compute-string-split.R | 61 +++-----------------
 4 files changed, 18 insertions(+), 81 deletions(-)
 delete mode 100644 r/man/split_pattern.Rd

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 08a5e9c532b..e17901e43c3 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -91,6 +91,8 @@ S3method(read_message,default)
 S3method(row.names,ArrowTabular)
 S3method(sort,ArrowDatum)
 S3method(sort,Scalar)
+S3method(strsplit,ArrowDatum)
+S3method(strsplit,default)
 S3method(sum,ArrowDatum)
 S3method(tail,ArrowDatum)
 S3method(tail,ArrowTabular)
@@ -263,9 +265,9 @@ export(record_batch)
 export(s3_bucket)
 export(schema)
 export(set_cpu_count)
-export(split_pattern)
 export(starts_with)
 export(string)
+export(strsplit)
 export(struct)
 export(time32)
 export(time64)
diff --git a/r/R/compute.R b/r/R/compute.R
index 470e768c133..0ce835655a5 100644
--- a/r/R/compute.R
+++ b/r/R/compute.R
@@ -286,7 +286,12 @@ cast_options <- function(safe = TRUE, ...) {
 }
 
 #' @export
-strsplit.ArrowDatum <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE){
-  split_options <- list(pattern = split)
-  call_function("split_pattern", x, options = split_options)
+strsplit <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE, ...) UseMethod("strsplit")
+
+#' @export
+strsplit.default <- base::strsplit
+
+#' @export
+strsplit.ArrowDatum <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE, ...){
+  call_function("split_pattern", x, options = list(pattern = split, reverse = FALSE))
 }
diff --git a/r/man/split_pattern.Rd b/r/man/split_pattern.Rd
deleted file mode 100644
index 179ce866b1a..00000000000
--- a/r/man/split_pattern.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compute.R
-\name{split_pattern}
-\alias{split_pattern}
-\title{String splitting}
-\usage{
-split_pattern(x, pattern, max_splits = -1L, reverse = FALSE)
-}
-\arguments{
-\item{x}{\code{Scalar}, \code{Array} or \code{ChunkedArray}}
-
-\item{pattern}{The exact substring to look for inside input values.}
-
-\item{max_splits}{Maximum number of splits. Default -1 means no limit.}
-
-\item{reverse}{When true, the splitting is done from the end of the string.}
-}
-\value{
-An object (same type as \code{x}) containing lists of strings
-}
-\description{
-Split a string into a list of strings
-}
diff --git a/r/tests/testthat/test-compute-string-split.R b/r/tests/testthat/test-compute-string-split.R
index b2fd9a44f79..d8869034e02 100644
--- a/r/tests/testthat/test-compute-string-split.R
+++ b/r/tests/testthat/test-compute-string-split.R
@@ -17,63 +17,16 @@
 
 test_str <-c("foobar, foo, bar", "baz, qux, quux.")
 
-test_that("split_pattern", {
-  
-  test_scalar <- Scalar$create(test_str[1])
-  test_array <- Array$create(test_str)
-  test_charray <- chunked_array(test_str[1], test_str[2])
-  
-  expect_equal(
-    split_pattern(test_scalar, pattern = ","),
-    Scalar$create(list(c("foobar", " foo", " bar")))
-  )
-  
-  expect_equal(
-    split_pattern(test_array, pattern = ","),
-    Array$create(list(c("foobar", " foo", " bar"), c("baz", " qux", " quux.")))
-  )
-  
-  expect_equal(
-    split_pattern(test_charray, pattern = ","),
-    ChunkedArray$create(list(c("foobar", " foo", " bar"), c("baz", " qux", " quux.")))
-  )
-  
+test_that("strsplit for Array and ChunkedArray ", {
+  expect_vector_equal(strsplit(input, split = ","), test_str, ignore_attr = TRUE)
 })
 
-test_that("split_pattern with max splits and reverse", {
-  
+test_that("strsplit for Scalar", {
   test_scalar <- Scalar$create(test_str[1])
-  test_array <- Array$create(test_str)
-  test_charray <- chunked_array(test_str[1], test_str[2])
-  
   expect_equal(
-    split_pattern(test_scalar, pattern = ",", max_split = 1),
-    Scalar$create(list(c("foobar", " foo, bar")))
-  )
-  
-  expect_equal(
-    split_pattern(test_array, pattern = ",", max_split = 1),
-    Array$create(list(c("foobar", " foo, bar"), c("baz", " qux, quux.")))
-  )
-  
-  expect_equal(
-    split_pattern(test_charray, pattern = ",", max_split = 1),
-    ChunkedArray$create(list(c("foobar", " foo, bar"), c("baz", " qux, quux.")))
-  )
-  
-  expect_equal(
-    split_pattern(test_scalar, pattern = ",", max_split = 1, reverse = TRUE),
-    Scalar$create(list(c("foobar, foo", " bar")))
-  )
-  
-  expect_equal(
-    split_pattern(test_array, pattern = ",", max_split = 1, reverse = TRUE),
-    Array$create(list(c("foobar, foo", " bar"), c("baz, qux", " quux.")))
-  )
-  
-  expect_equal(
-    split_pattern(test_charray, pattern = ",", max_split = 1, reverse = TRUE),
-    ChunkedArray$create(list(c("foobar, foo", " bar"), c("baz, qux", " quux.")))
+    strsplit(test_scalar, split = ","),
+    Scalar$create(list(c("foobar", " foo", " bar")))
   )
-  
 })
+
+

From e1aa64f19fb6537b48219ef793c8bbad69e15614 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 12:13:33 +0100
Subject: [PATCH 05/37] Remove test for strsplit

---
 r/tests/testthat/test-compute-string-split.R | 32 --------------------
 1 file changed, 32 deletions(-)
 delete mode 100644 r/tests/testthat/test-compute-string-split.R

diff --git a/r/tests/testthat/test-compute-string-split.R b/r/tests/testthat/test-compute-string-split.R
deleted file mode 100644
index d8869034e02..00000000000
--- a/r/tests/testthat/test-compute-string-split.R
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-test_str <-c("foobar, foo, bar", "baz, qux, quux.")
-
-test_that("strsplit for Array and ChunkedArray ", {
-  expect_vector_equal(strsplit(input, split = ","), test_str, ignore_attr = TRUE)
-})
-
-test_that("strsplit for Scalar", {
-  test_scalar <- Scalar$create(test_str[1])
-  expect_equal(
-    strsplit(test_scalar, split = ","),
-    Scalar$create(list(c("foobar", " foo", " bar")))
-  )
-})
-
-

From a3f073838eccafab8e8408b5ce2313fd9865be7a Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 12:14:12 +0100
Subject: [PATCH 06/37] Remove s3 methods for strsplit

---
 r/NAMESPACE | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index e17901e43c3..607177235e9 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -91,8 +91,6 @@ S3method(read_message,default)
 S3method(row.names,ArrowTabular)
 S3method(sort,ArrowDatum)
 S3method(sort,Scalar)
-S3method(strsplit,ArrowDatum)
-S3method(strsplit,default)
 S3method(sum,ArrowDatum)
 S3method(tail,ArrowDatum)
 S3method(tail,ArrowTabular)
@@ -267,7 +265,6 @@ export(schema)
 export(set_cpu_count)
 export(starts_with)
 export(string)
-export(strsplit)
 export(struct)
 export(time32)
 export(time64)

From 43120a6124c3ac5c0cac9fee6fb67acb394f7a3d Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 13:38:50 +0100
Subject: [PATCH 07/37] Remove strsplit from compute.R

---
 r/R/compute.R | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/r/R/compute.R b/r/R/compute.R
index 0ce835655a5..0641bf1615c 100644
--- a/r/R/compute.R
+++ b/r/R/compute.R
@@ -284,14 +284,3 @@ cast_options <- function(safe = TRUE, ...) {
   )
   modifyList(opts, list(...))
 }
-
-#' @export
-strsplit <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE, ...) UseMethod("strsplit")
-
-#' @export
-strsplit.default <- base::strsplit
-
-#' @export
-strsplit.ArrowDatum <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE, ...){
-  call_function("split_pattern", x, options = list(pattern = split, reverse = FALSE))
-}

From 0ecbab41402303adad1be308f8826da81255cd26 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 14:09:49 +0100
Subject: [PATCH 08/37] Implement strsplit

---
 r/R/dplyr.R | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 845cb3a1815..e367d530c0b 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -410,6 +410,18 @@ build_function_list <- function(FUN) {
         options = list(null_encoding_behavior = null_encoding_behavior)
       )
     },
+    strsplit = function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){
+      
+      regex_metachars <- c(".", "\\", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?")
+      is_regex <- map_lgl(regex_metachars, ~grepl(.x, split, fixed = TRUE))
+      
+      # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
+      if(!fixed && is_regex || perl){
+        stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE)
+      }
+      
+      FUN("split_pattern", x, options = list(pattern = split))
+    }
     # as.factor() is mapped in expression.R
     as.character = function(x) {
       FUN("cast", x, options = cast_options(to_type = string()))

From 5615486fd43e3312d5402596d049457258181a70 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 14:23:35 +0100
Subject: [PATCH 09/37] Fix typo

---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index e367d530c0b..0e6ac164b74 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -421,7 +421,7 @@ build_function_list <- function(FUN) {
       }
       
       FUN("split_pattern", x, options = list(pattern = split))
-    }
+    },
     # as.factor() is mapped in expression.R
     as.character = function(x) {
       FUN("cast", x, options = cast_options(to_type = string()))

From 9ceff3ce0f41e2c0910686e47da367185b2f5d7c Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 14:40:28 +0100
Subject: [PATCH 10/37] Add extra args to split_pattern

---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 0e6ac164b74..b2bfd977a8e 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -420,7 +420,7 @@ build_function_list <- function(FUN) {
         stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE)
       }
       
-      FUN("split_pattern", x, options = list(pattern = split))
+      FUN("split_pattern", x, options = list(pattern = split, reverse = FALSE, max_splits = -1))
     },
     # as.factor() is mapped in expression.R
     as.character = function(x) {

From d140155f624301ea038d07ff0ecb4510cd2fa702 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 14:56:12 +0100
Subject: [PATCH 11/37] Whitespace fix

---
 r/R/dplyr.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index b2bfd977a8e..61d199072aa 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -419,7 +419,6 @@ build_function_list <- function(FUN) {
       if(!fixed && is_regex || perl){
         stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE)
       }
-      
       FUN("split_pattern", x, options = list(pattern = split, reverse = FALSE, max_splits = -1))
     },
     # as.factor() is mapped in expression.R

From 8f5dcc6a1791365629e73af2194427cee7d1bb21 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 14:58:19 +0100
Subject: [PATCH 12/37] Add in 'any' to regex check

---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 61d199072aa..1ccf775ad85 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -413,7 +413,7 @@ build_function_list <- function(FUN) {
     strsplit = function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){
       
       regex_metachars <- c(".", "\\", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?")
-      is_regex <- map_lgl(regex_metachars, ~grepl(.x, split, fixed = TRUE))
+      is_regex <- any(map_lgl(regex_metachars, ~grepl(.x, split, fixed = TRUE)))
       
       # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
       if(!fixed && is_regex || perl){

From 6a7b756d593b97f97fc14da44d5edf8af07ff76f Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 15:47:02 +0100
Subject: [PATCH 13/37] Assert split length and shotern regex metachar search

---
 r/R/dplyr.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 1ccf775ad85..d6e79e432db 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -412,8 +412,9 @@ build_function_list <- function(FUN) {
     },
     strsplit = function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){
       
-      regex_metachars <- c(".", "\\", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?")
-      is_regex <- any(map_lgl(regex_metachars, ~grepl(.x, split, fixed = TRUE)))
+      assert_that(length(split) == 1)
+      
+      is_regex <- grepl("[.\\|()[{^$*+?]", split)
       
       # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
       if(!fixed && is_regex || perl){

From 983cb9c26427f222f643ed1a6ff052e743b1299f Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 30 Apr 2021 15:51:50 +0100
Subject: [PATCH 14/37] Ignore Perl

---
 r/R/dplyr.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index d6e79e432db..5409ba24dd9 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -417,9 +417,12 @@ build_function_list <- function(FUN) {
       is_regex <- grepl("[.\\|()[{^$*+?]", split)
       
       # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
-      if(!fixed && is_regex || perl){
+      if(!fixed && is_regex){
         stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE)
       }
+      if(fixed && perl){
+        warning("argument 'perl = TRUE' will be ignored")
+      }
       FUN("split_pattern", x, options = list(pattern = split, reverse = FALSE, max_splits = -1))
     },
     # as.factor() is mapped in expression.R

From 48e8917678cd3d97634832d83ef6c92eccdc1d75 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Tue, 4 May 2021 15:22:55 +0100
Subject: [PATCH 15/37] Add bdingin for str_split, and add tests

---
 r/R/dplyr.R                                   | 51 +++++++++++++------
 .../testthat/test-dplyr-string-functions.R    | 47 +++++++++++++++++
 2 files changed, 83 insertions(+), 15 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 5409ba24dd9..c2ccebea9cf 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -410,21 +410,6 @@ build_function_list <- function(FUN) {
         options = list(null_encoding_behavior = null_encoding_behavior)
       )
     },
-    strsplit = function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){
-      
-      assert_that(length(split) == 1)
-      
-      is_regex <- grepl("[.\\|()[{^$*+?]", split)
-      
-      # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
-      if(!fixed && is_regex){
-        stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE)
-      }
-      if(fixed && perl){
-        warning("argument 'perl = TRUE' will be ignored")
-      }
-      FUN("split_pattern", x, options = list(pattern = split, reverse = FALSE, max_splits = -1))
-    },
     # as.factor() is mapped in expression.R
     as.character = function(x) {
       FUN("cast", x, options = cast_options(to_type = string()))
@@ -492,6 +477,8 @@ build_function_list <- function(FUN) {
     gsub = arrow_r_string_replace_function(FUN, -1L),
     str_replace = arrow_stringr_string_replace_function(FUN, 1L),
     str_replace_all = arrow_stringr_string_replace_function(FUN, -1L),
+    strsplit = arrow_r_string_split_function(FUN),
+    str_split = arrow_stringr_string_split_function(FUN),
     between = function(x, left, right) {
       x >= left & x <= right
     },
@@ -555,6 +542,31 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) {
   }
 }
 
+arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1){
+  function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){
+    
+    assert_that(length(split) == 1)
+    
+    # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
+    if(!fixed && contains_regex(split)){
+      stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE)
+    }
+    if(fixed && perl){
+      warning("argument 'perl = TRUE' will be ignored")
+    }
+    FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits))
+  }
+}
+
+arrow_stringr_string_split_function <- function(FUN, reverse = FALSE){
+  function(string, pattern, n = 0){
+    if(contains_regex(pattern)){
+      stop("regular expression matching not supported in str_split for Arrow", call. = FALSE)
+    }
+    FUN("split_pattern", string, options = list(pattern = pattern, reverse = reverse, max_splits = n-1))
+  }
+}
+
 # format `pattern` as needed for case insensitivity and literal matching by RE2
 format_string_pattern <- function(pattern, ignore.case, fixed) {
   # Arrow lacks native support for case-insensitive literal string matching and
@@ -1114,3 +1126,12 @@ not_implemented_for_dataset <- function(method) {
     call. = FALSE
   )
 }
+
+#' Does this string contain regex metacharacters?
+#' 
+#' @param string String to be tested
+#' @keywords internal
+#' @return Logical: does `string` contain regex metacharacters?
+contains_regex <- function(string){
+  grepl("[.\\|()[{^$*+?]", string)
+}
\ No newline at end of file
diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 5faf2436f55..811e7d7babf 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -239,6 +239,53 @@ test_that("str_replace and str_replace_all", {
 
 })
 
+test_that("strsplit and str_split", {
+  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+  
+  expect_dplyr_equal(
+    input %>%
+      transmute(x = strsplit(x, "and")) %>%
+      collect(),
+    df
+  )
+  
+  expect_warning(
+    df %>%
+    Table$create() %>%
+      mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>%
+      collect(),
+    regexp = "not supported in Arrow"
+  )
+  
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
+      collect(),
+    df
+  )
+  
+  expect_dplyr_equal(
+    input %>%
+      transmute(x = str_split(x, "and")) %>%
+      collect(),
+    df
+  )
+  
+  expect_dplyr_equal(
+    input %>%
+      transmute(x = str_split(x, "and", n = 2)) %>%
+      collect(),
+    df
+  )
+  
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      transmute(x = str_split(x, "and.?")) %>%
+      collect()
+  )
+})
+
 test_that("backreferences in pattern", {
   skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)")
   df <- tibble(x = c("Foo", "bar"))

From 45f36c4a7866443b3b7f919a56057b220b388806 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Tue, 4 May 2021 15:35:08 +0100
Subject: [PATCH 16/37] Update test to use mutate

---
 r/tests/testthat/test-dplyr-string-functions.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 811e7d7babf..363debca917 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -248,7 +248,7 @@ test_that("strsplit and str_split", {
       collect(),
     df
   )
-  
+
   expect_warning(
     df %>%
     Table$create() %>%
@@ -256,32 +256,32 @@ test_that("strsplit and str_split", {
       collect(),
     regexp = "not supported in Arrow"
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       transmute(x = str_split(x, "and")) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       transmute(x = str_split(x, "and", n = 2)) %>%
       collect(),
     df
   )
-  
+
   expect_warning(
     df %>%
       Table$create() %>%
-      transmute(x = str_split(x, "and.?")) %>%
+      mutate(x = str_split(x, "and.?")) %>%
       collect()
   )
 })

From abeef7c1cee7bf968e9d13e113fa1f11413af277 Mon Sep 17 00:00:00 2001
From: Nic <thisisnic@gmail.com>
Date: Tue, 4 May 2021 19:18:44 +0100
Subject: [PATCH 17/37] Update r/R/dplyr.R

Co-authored-by: Ian Cook <ianmcook@gmail.com>
---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index c2ccebea9cf..0b5eaa31bff 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -1134,4 +1134,4 @@ not_implemented_for_dataset <- function(method) {
 #' @return Logical: does `string` contain regex metacharacters?
 contains_regex <- function(string){
   grepl("[.\\|()[{^$*+?]", string)
-}
\ No newline at end of file
+}

From 1f77d66d9ab1bf5451cdbffbd931636ba7ab9aed Mon Sep 17 00:00:00 2001
From: Nic <thisisnic@gmail.com>
Date: Tue, 4 May 2021 19:23:06 +0100
Subject: [PATCH 18/37] use is.string instead of length == 1

Co-authored-by: Ian Cook <ianmcook@gmail.com>
---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 0b5eaa31bff..78d43565e58 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -545,7 +545,7 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) {
 arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1){
   function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){
     
-    assert_that(length(split) == 1)
+    assert_that(is.string(split))
     
     # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
     if(!fixed && contains_regex(split)){

From 2448450b4f6be375e6af95b1076d5cd8db2b6734 Mon Sep 17 00:00:00 2001
From: Nic <thisisnic@gmail.com>
Date: Tue, 4 May 2021 19:23:50 +0100
Subject: [PATCH 19/37] Capitalise

Co-authored-by: Ian Cook <ianmcook@gmail.com>
---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 78d43565e58..945a8dff71a 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -549,7 +549,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
     
     # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
     if(!fixed && contains_regex(split)){
-      stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE)
+      stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE)
     }
     if(fixed && perl){
       warning("argument 'perl = TRUE' will be ignored")

From 14cf037608c8dc6148e689837921912e8d5b800a Mon Sep 17 00:00:00 2001
From: Nic <thisisnic@gmail.com>
Date: Tue, 4 May 2021 19:23:58 +0100
Subject: [PATCH 20/37] Update r/R/dplyr.R

Co-authored-by: Ian Cook <ianmcook@gmail.com>
---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 945a8dff71a..52b54852d3b 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -561,7 +561,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
 arrow_stringr_string_split_function <- function(FUN, reverse = FALSE){
   function(string, pattern, n = 0){
     if(contains_regex(pattern)){
-      stop("regular expression matching not supported in str_split for Arrow", call. = FALSE)
+      stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE)
     }
     FUN("split_pattern", string, options = list(pattern = pattern, reverse = reverse, max_splits = n-1))
   }

From fc743e73098f5a5a76bad093c124d9cf45f9fe99 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Tue, 4 May 2021 19:36:12 +0100
Subject: [PATCH 21/37] Update spacing

---
 r/R/dplyr.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 52b54852d3b..17bdbd75afa 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -542,8 +542,8 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) {
   }
 }
 
-arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1){
-  function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){
+arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) {
+  function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) {
     
     assert_that(is.string(split))
     
@@ -551,7 +551,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
     if(!fixed && contains_regex(split)){
       stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE)
     }
-    if(fixed && perl){
+    if (fixed && perl) {
       warning("argument 'perl = TRUE' will be ignored")
     }
     FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits))
@@ -1132,6 +1132,6 @@ not_implemented_for_dataset <- function(method) {
 #' @param string String to be tested
 #' @keywords internal
 #' @return Logical: does `string` contain regex metacharacters?
-contains_regex <- function(string){
+contains_regex <- function(string) {
   grepl("[.\\|()[{^$*+?]", string)
 }

From 22b9256e9246306a50d0c2cf5f6b84a567fd20b7 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Tue, 4 May 2021 19:46:29 +0100
Subject: [PATCH 22/37] Add spacing

---
 r/R/dplyr.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 17bdbd75afa..fd0643fb58a 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -548,7 +548,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
     assert_that(is.string(split))
     
     # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex
-    if(!fixed && contains_regex(split)){
+    if (!fixed && contains_regex(split)) {
       stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE)
     }
     if (fixed && perl) {
@@ -558,9 +558,9 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
   }
 }
 
-arrow_stringr_string_split_function <- function(FUN, reverse = FALSE){
-  function(string, pattern, n = 0){
-    if(contains_regex(pattern)){
+arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) {
+  function(string, pattern, n = 0) {
+    if (contains_regex(pattern)) {
       stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE)
     }
     FUN("split_pattern", string, options = list(pattern = pattern, reverse = reverse, max_splits = n-1))

From 51d250fab03ef9701ee7148f6f09e6f72f3599dc Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Thu, 6 May 2021 16:38:04 +0100
Subject: [PATCH 23/37] Use get_stringr_pattern

---
 r/R/dplyr.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index fd0643fb58a..8c58459f245 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -560,10 +560,14 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
 
 arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) {
   function(string, pattern, n = 0) {
-    if (contains_regex(pattern)) {
+    opts <- get_stringr_pattern_options(enexpr(pattern))
+    if (!opts$fixed && contains_regex(opts$pattern)) {
       stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE)
     }
-    FUN("split_pattern", string, options = list(pattern = pattern, reverse = reverse, max_splits = n-1))
+    if (opts$ignore_case) {
+      stop("Case-insensitive string splitting not supported in Arrow", call. = FALSE)
+    }
+    FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1))
   }
 }
 

From a521405d4a6f21d073daa283c01abf8f1ca0e9b1 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Thu, 6 May 2021 17:09:44 +0100
Subject: [PATCH 24/37] Document get_stringr_pattern_options function, change
 default value of fixed to FALSE (to match stringr),

---
 r/R/dplyr.R | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 8c58459f245..4cef4161177 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -603,9 +603,16 @@ format_string_replacement <- function(replacement, ignore.case, fixed) {
   replacement
 }
 
-# this function assigns definitions for the stringr pattern modifier functions
-# (fixed, regex, etc.) in itself, and uses them to evaluate the quoted
-# expression `pattern`
+#' Get `stringr` pattern options
+#' 
+#' This function assigns definitions for the stringr pattern modifier functions
+#' (fixed, regex, etc.) in itself, and uses them to evaluate the quoted
+#' expression `pattern`
+#' 
+#' @param pattern Unevaluated pattern to be evaluated
+#'
+#' @return List containing elements `pattern`, `fixed`, and `ignore_case`
+#' @keywords internal
 get_stringr_pattern_options <- function(pattern) {
   fixed <- function(pattern, ignore_case = FALSE, ...) {
     check_dots(...)
@@ -637,7 +644,7 @@ get_stringr_pattern_options <- function(pattern) {
   }
   ensure_opts <- function(opts) {
     if (is.character(opts)) {
-      opts <- list(pattern = opts, fixed = TRUE, ignore_case = FALSE)
+      opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE)
     }
     opts
   }

From b758b5520dba31645de8a46d5ef36dff8516db5a Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Thu, 6 May 2021 17:15:48 +0100
Subject: [PATCH 25/37] Fix minor style issues, add extra tests

---
 .../testthat/test-dplyr-string-functions.R    | 49 +++++++++++++++++--
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 363debca917..067204d5dd4 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -52,7 +52,7 @@ skip_if_not_available("re2")
 test_that("grepl", {
   df <- tibble(x = c("Foo", "bar"))
 
-  for(fixed in c(TRUE, FALSE)) {
+  for (fixed in c(TRUE, FALSE)) {
 
     expect_dplyr_equal(
       input %>%
@@ -150,7 +150,7 @@ test_that("str_detect", {
 test_that("sub and gsub", {
   df <- tibble(x = c("Foo", "bar"))
 
-  for(fixed in c(TRUE, FALSE)) {
+  for (fixed in c(TRUE, FALSE)) {
 
     expect_dplyr_equal(
       input %>%
@@ -206,12 +206,27 @@ test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", {
 test_that("str_replace and str_replace_all", {
   df <- tibble(x = c("Foo", "bar"))
 
+  expect_dplyr_equal(
+    input %>%
+      transmute(x = str_replace_all(x, "^F", "baz")) %>%
+      collect(),
+    df
+  )
+  
   expect_dplyr_equal(
     input %>%
       transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
       collect(),
     df
   )
+  
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
+      collect(),
+    df
+  )
+  
   expect_dplyr_equal(
     input %>%
       transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>%
@@ -240,11 +255,12 @@ test_that("str_replace and str_replace_all", {
 })
 
 test_that("strsplit and str_split", {
+  
   df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
   
   expect_dplyr_equal(
     input %>%
-      transmute(x = strsplit(x, "and")) %>%
+      mutate(x = strsplit(x, "and")) %>%
       collect(),
     df
   )
@@ -266,17 +282,39 @@ test_that("strsplit and str_split", {
 
   expect_dplyr_equal(
     input %>%
-      transmute(x = str_split(x, "and")) %>%
+      mutate(x = str_split(x, "and")) %>%
       collect(),
     df
   )
 
   expect_dplyr_equal(
     input %>%
-      transmute(x = str_split(x, "and", n = 2)) %>%
+      mutate(x = str_split(x, "and", n = 2)) %>%
       collect(),
     df
   )
+  
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = str_split(x, fixed("and"), n = 2)) %>%
+      collect(),
+    df
+  )
+  
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = str_split(x, regex("and"), n = 2)) %>%
+      collect(),
+    df
+  )
+  
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, regex("and.*"), n = 2)) %>%
+      collect(),
+    regexp = "not supported"
+  )
 
   expect_warning(
     df %>%
@@ -284,6 +322,7 @@ test_that("strsplit and str_split", {
       mutate(x = str_split(x, "and.?")) %>%
       collect()
   )
+  
 })
 
 test_that("backreferences in pattern", {

From 3d4f9ce069701d66bc1226cea0f14b40b6baf69f Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Thu, 6 May 2021 17:39:03 +0100
Subject: [PATCH 26/37] Reorganise warning/error generating tests into one
 section for consistency, and add extra tests when no helper function used

---
 .../testthat/test-dplyr-string-functions.R    | 69 ++++++++++++-------
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 067204d5dd4..e29f25218a9 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -264,15 +264,7 @@ test_that("strsplit and str_split", {
       collect(),
     df
   )
-
-  expect_warning(
-    df %>%
-    Table$create() %>%
-      mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>%
-      collect(),
-    regexp = "not supported in Arrow"
-  )
-
+  
   expect_dplyr_equal(
     input %>%
       mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
@@ -307,22 +299,6 @@ test_that("strsplit and str_split", {
       collect(),
     df
   )
-  
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      mutate(x = str_split(x, regex("and.*"), n = 2)) %>%
-      collect(),
-    regexp = "not supported"
-  )
-
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      mutate(x = str_split(x, "and.?")) %>%
-      collect()
-  )
-  
 })
 
 test_that("backreferences in pattern", {
@@ -351,6 +327,12 @@ test_that("backreferences (substitutions) in replacement", {
       collect(),
     tibble(url = "https://arrow.apache.org/docs/r/")
   )
+  expect_dplyr_equal(
+    input %>%
+      transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>%
+      collect(),
+    df
+  )
   expect_dplyr_equal(
     input %>%
       transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
@@ -408,6 +390,41 @@ test_that("errors and warnings", {
 
   # These conditions generate an error, but abandon_ship() catches the error,
   # issues a warning, and pulls the data into R
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>%
+      collect(),
+    regexp = "not supported in Arrow"
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, "and.?")) %>%
+      collect()
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, regex("and.?"), n = 2)) %>%
+      collect(),
+    regexp = "not supported"
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, coll("and.?"))) %>%
+      collect(),
+    regexp = "not supported"
+  )
+  
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, boundary(type = "word"))) %>%
+      collect(),
+    regexp = "not supported"
+  )
   expect_warning(
     df %>%
       Table$create() %>%
@@ -422,7 +439,6 @@ test_that("errors and warnings", {
       collect(),
     "not supported"
   )
-
   # This condition generates a warning
   expect_warning(
     df %>%
@@ -430,4 +446,5 @@ test_that("errors and warnings", {
       transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")),
     "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
   )
+  
 })

From f8cfff5b2cc34c7ef7ebf9812c977cb90263ec29 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 13:14:17 -0400
Subject: [PATCH 27/37] Fix missing }

---
 r/src/compute.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 58890c3785a..7c58c4a3a0a 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -241,7 +241,8 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
     }
     return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]),
                                      max_splits, cpp11::as_cpp<bool>(options["reverse"]));
-
+  }
+  
   if (func_name == "variance" || func_name == "stddev") {
     using Options = arrow::compute::VarianceOptions;
     return std::make_shared<Options>(cpp11::as_cpp<int64_t>(options["ddof"]));

From 3b66f4f08e7ab2c84e1a93707ae37302ab677e70 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 13:17:29 -0400
Subject: [PATCH 28/37] Re-lint

---
 r/src/compute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 7c58c4a3a0a..6a82d5af3b3 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -242,7 +242,7 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
     return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]),
                                      max_splits, cpp11::as_cpp<bool>(options["reverse"]));
   }
-  
+
   if (func_name == "variance" || func_name == "stddev") {
     using Options = arrow::compute::VarianceOptions;
     return std::make_shared<Options>(cpp11::as_cpp<int64_t>(options["ddof"]));

From 245c99fdddea5e4a627081e2e98fd97f85988fe0 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 17:23:56 -0400
Subject: [PATCH 29/37] Make reverse optional in split_pattern options

---
 r/src/compute.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 6a82d5af3b3..205f6068565 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -239,8 +239,12 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
     if (!Rf_isNull(options["max_splits"])) {
       max_splits = cpp11::as_cpp<int64_t>(options["max_splits"]);
     }
+    bool reverse = false;
+    if (!Rf_isNull(options["reverse"])) {
+      reverse = cpp11::as_cpp<bool>(options["reverse"]);
+    }
     return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]),
-                                     max_splits, cpp11::as_cpp<bool>(options["reverse"]));
+                                     max_splits, reverse);
   }
 
   if (func_name == "variance" || func_name == "stddev") {

From b3884f26c2f23f59b2504f9f2301e6b6f5696626 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 17:24:55 -0400
Subject: [PATCH 30/37] Add handling for utf8_split_whitespace,
 ascii_split_whitespace options

---
 r/src/compute.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 205f6068565..0ffe53578c4 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -247,6 +247,19 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
                                      max_splits, reverse);
   }
 
+  if (func_name == "utf8_split_whitespace" || func_name == "ascii_split_whitespace") {
+    using Options = arrow::compute::SplitOptions;
+    int64_t max_splits = -1;
+    if (!Rf_isNull(options["max_splits"])) {
+      max_splits = cpp11::as_cpp<int64_t>(options["max_splits"]);
+    }
+    bool reverse = false;
+    if (!Rf_isNull(options["reverse"])) {
+      reverse = cpp11::as_cpp<bool>(options["reverse"]);
+    }
+    return std::make_shared<Options>(max_splits, reverse);
+  }
+
   if (func_name == "variance" || func_name == "stddev") {
     using Options = arrow::compute::VarianceOptions;
     return std::make_shared<Options>(cpp11::as_cpp<int64_t>(options["ddof"]));

From 8dfc213c2d8aedc6621259c9cd8667c97c60d48b Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 18:26:27 -0400
Subject: [PATCH 31/37] Tweak str_split() args for consistency with stringr

---
 r/R/dplyr.R | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 3ff11734ffd..bcc315dacb2 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -558,7 +558,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
 }
 
 arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) {
-  function(string, pattern, n = 0) {
+  function(string, pattern, n = Inf, simplify = FALSE) {
     opts <- get_stringr_pattern_options(enexpr(pattern))
     if (!opts$fixed && contains_regex(opts$pattern)) {
       stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE)
@@ -566,7 +566,16 @@ arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) {
     if (opts$ignore_case) {
       stop("Case-insensitive string splitting not supported in Arrow", call. = FALSE)
     }
-    FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1))
+    if (n == 0) {
+      stop("Splitting strings into zero parts not supported in Arrow" , call. = FALSE)
+    }
+    if (identical(n, Inf)) {
+      n <- 0L
+    }
+    if (simplify) {
+      warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE)
+    }
+    FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1L))
   }
 }
 

From cbc0e78bf3aa5d47b036d9f1397dd3a866642f6a Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 18:27:19 -0400
Subject: [PATCH 32/37] Add and reorganize tests

---
 .../testthat/test-dplyr-string-functions.R    | 175 ++++++++++--------
 1 file changed, 101 insertions(+), 74 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index e29f25218a9..844fa97d4af 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -255,53 +255,144 @@ test_that("str_replace and str_replace_all", {
 })
 
 test_that("strsplit and str_split", {
-  
+
   df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = strsplit(x, "and")) %>%
       collect(),
     df
   )
-  
   expect_dplyr_equal(
     input %>%
       mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
       collect(),
     df
   )
-
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, "and")) %>%
       collect(),
     df
   )
-
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, "and", n = 2)) %>%
       collect(),
     df
   )
-  
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, fixed("and"), n = 2)) %>%
       collect(),
     df
   )
-  
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, regex("and"), n = 2)) %>%
       collect(),
     df
   )
+
+})
+
+test_that("errors and warnings in string splitting", {
+  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+  # These conditions generate an error, but abandon_ship() catches the error,
+  # issues a warning, and pulls the data into R
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>%
+      collect(),
+    regexp = "not supported"
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, "and.?")) %>%
+      collect()
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, regex("and.?"), n = 2)) %>%
+      collect(),
+    regexp = "not supported"
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, fixed("and", ignore_case = TRUE))) %>%
+      collect(),
+    "not supported"
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, coll("and.?"))) %>%
+      collect(),
+    regexp = "not supported"
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, boundary(type = "word"))) %>%
+      collect(),
+    regexp = "not supported"
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, "and", n = 0)) %>%
+      collect(),
+    regexp = "not supported"
+  )
+
+  # This condition generates a warning
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_split(x, fixed("and"), simplify = TRUE)) %>%
+      collect(),
+    "ignored"
+  )
+
 })
 
-test_that("backreferences in pattern", {
+test_that("errors and warnings in string detection and replacement", {
+  df <- tibble(x = c("Foo", "bar"))
+
+  # These conditions generate an error, but abandon_ship() catches the error,
+  # issues a warning, and pulls the data into R
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      filter(str_detect(x, boundary(type = "character"))) %>%
+      collect(),
+    regexp = "not implemented"
+  )
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_replace_all(x, coll("o", locale = "en"), "ó")) %>%
+      collect(),
+    regexp = "not supported"
+  )
+
+  # This condition generates a warning
+  expect_warning(
+    df %>%
+      Table$create() %>%
+      transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")),
+    "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
+  )
+
+})
+
+test_that("backreferences in pattern in string detection", {
   skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)")
   df <- tibble(x = c("Foo", "bar"))
 
@@ -313,7 +404,7 @@ test_that("backreferences in pattern", {
   )
 })
 
-test_that("backreferences (substitutions) in replacement", {
+test_that("backreferences (substitutions) in string replacement", {
   df <- tibble(x = c("Foo", "bar"))
 
   expect_dplyr_equal(
@@ -347,7 +438,7 @@ test_that("backreferences (substitutions) in replacement", {
   )
 })
 
-test_that("edge cases", {
+test_that("edge cases in string detection and replacement", {
 
   # in case-insensitive fixed match/replace, test that "\\E" in the search
   # string and backslashes in the replacement string are interpreted literally.
@@ -384,67 +475,3 @@ test_that("edge cases", {
   )
 
 })
-
-test_that("errors and warnings", {
-  df <- tibble(x = c("Foo", "bar"))
-
-  # These conditions generate an error, but abandon_ship() catches the error,
-  # issues a warning, and pulls the data into R
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>%
-      collect(),
-    regexp = "not supported in Arrow"
-  )
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      mutate(x = str_split(x, "and.?")) %>%
-      collect()
-  )
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      mutate(x = str_split(x, regex("and.?"), n = 2)) %>%
-      collect(),
-    regexp = "not supported"
-  )
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      mutate(x = str_split(x, coll("and.?"))) %>%
-      collect(),
-    regexp = "not supported"
-  )
-  
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      mutate(x = str_split(x, boundary(type = "word"))) %>%
-      collect(),
-    regexp = "not supported"
-  )
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      filter(str_detect(x, boundary(type = "character"))) %>%
-      collect(),
-    "not implemented"
-  )
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      mutate(x = str_replace_all(x, coll("o", locale = "en"), "ó")) %>%
-      collect(),
-    "not supported"
-  )
-  # This condition generates a warning
-  expect_warning(
-    df %>%
-      Table$create() %>%
-      transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")),
-    "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
-  )
-  
-})

From 4b96d07370e6f4e761eda7e30703e880fbf74469 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 18:28:42 -0400
Subject: [PATCH 33/37] Add missing call. = FALSE

---
 r/R/dplyr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index bcc315dacb2..d53b5ee57c9 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -551,7 +551,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1)
       stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE)
     }
     if (fixed && perl) {
-      warning("argument 'perl = TRUE' will be ignored")
+      warning("Argument 'perl = TRUE' will be ignored", call. = FALSE)
     }
     FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits))
   }

From a567ce1ad22e2b74d34cd5b89c21f0cb8f2ec343 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 18:30:29 -0400
Subject: [PATCH 34/37] Tweak get_stringr_pattern_options() docs

---
 r/R/dplyr.R | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index d53b5ee57c9..4e66c227bea 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -612,12 +612,14 @@ format_string_replacement <- function(replacement, ignore.case, fixed) {
 }
 
 #' Get `stringr` pattern options
-#' 
-#' This function assigns definitions for the stringr pattern modifier functions
-#' (fixed, regex, etc.) in itself, and uses them to evaluate the quoted
-#' expression `pattern`
-#' 
-#' @param pattern Unevaluated pattern to be evaluated
+#'
+#' This function assigns definitions for the `stringr` pattern modifier
+#' functions (`fixed()`, `regex()`, etc.) inside itself, and uses them to
+#' evaluate the quoted expression `pattern`, returning a list that is used
+#' to control pattern matching behavior in internal `arrow` functions.
+#'
+#' @param pattern Unevaluated expression containing a call to a `stringr`
+#' pattern modifier function
 #'
 #' @return List containing elements `pattern`, `fixed`, and `ignore_case`
 #' @keywords internal

From 0a30a69ab43ef2d21316e0601e5fa53580a6061a Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 18:31:13 -0400
Subject: [PATCH 35/37] Build new internal functions docs

---
 r/man/contains_regex.Rd              | 18 ++++++++++++++++++
 r/man/get_stringr_pattern_options.Rd | 22 ++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 r/man/contains_regex.Rd
 create mode 100644 r/man/get_stringr_pattern_options.Rd

diff --git a/r/man/contains_regex.Rd b/r/man/contains_regex.Rd
new file mode 100644
index 00000000000..d8fee96d99b
--- /dev/null
+++ b/r/man/contains_regex.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr.R
+\name{contains_regex}
+\alias{contains_regex}
+\title{Does this string contain regex metacharacters?}
+\usage{
+contains_regex(string)
+}
+\arguments{
+\item{string}{String to be tested}
+}
+\value{
+Logical: does \code{string} contain regex metacharacters?
+}
+\description{
+Does this string contain regex metacharacters?
+}
+\keyword{internal}
diff --git a/r/man/get_stringr_pattern_options.Rd b/r/man/get_stringr_pattern_options.Rd
new file mode 100644
index 00000000000..79a9a72b7cf
--- /dev/null
+++ b/r/man/get_stringr_pattern_options.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr.R
+\name{get_stringr_pattern_options}
+\alias{get_stringr_pattern_options}
+\title{Get \code{stringr} pattern options}
+\usage{
+get_stringr_pattern_options(pattern)
+}
+\arguments{
+\item{pattern}{Unevaluated expression containing a call to a \code{stringr}
+pattern modifier function}
+}
+\value{
+List containing elements \code{pattern}, \code{fixed}, and \code{ignore_case}
+}
+\description{
+This function assigns definitions for the \code{stringr} pattern modifier
+functions (\code{fixed()}, \code{regex()}, etc.) inside itself, and uses them to
+evaluate the quoted expression \code{pattern}, returning a list that is used
+to control pattern matching behavior in internal \code{arrow} functions.
+}
+\keyword{internal}

From 25ec2f8a5a1af060010656630c1f13bbdf9f1e90 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 18:32:18 -0400
Subject: [PATCH 36/37] Add unrelated note about 0-based indices in
 call_function() docs

---
 r/R/compute.R          | 2 ++
 r/man/call_function.Rd | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/r/R/compute.R b/r/R/compute.R
index 0641bf1615c..c3783ba3295 100644
--- a/r/R/compute.R
+++ b/r/R/compute.R
@@ -27,6 +27,8 @@
 #' `RecordBatch`, or `Table`.
 #' @param args list arguments as an alternative to specifying in `...`
 #' @param options named list of C++ function options.
+#' @details When passing indices in `...`, `args`, or `options`, express them as
+#' 0-based integers (consistent with C++).
 #' @return An `Array`, `ChunkedArray`, `Scalar`, `RecordBatch`, or `Table`, whatever the compute function results in.
 #' @seealso [Arrow C++ documentation](https://arrow.apache.org/docs/cpp/compute.html) for the functions and their respective options.
 #' @examples
diff --git a/r/man/call_function.Rd b/r/man/call_function.Rd
index 4ab9fd7e942..e89fd00576e 100644
--- a/r/man/call_function.Rd
+++ b/r/man/call_function.Rd
@@ -31,6 +31,10 @@ Many Arrow compute functions are mapped to R methods,
 and in a \code{dplyr} evaluation context, \link[=list_compute_functions]{all Arrow functions}
 are callable with an \code{arrow_} prefix.
 }
+\details{
+When passing indices in \code{...}, \code{args}, or \code{options}, express them as
+0-based integers (consistent with C++).
+}
 \examples{
 \donttest{
 a <- Array$create(c(1L, 2L, 3L, NA, 5L))

From 15042000daa742af911ea915a3903317b488ca2f Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 6 May 2021 22:49:20 -0400
Subject: [PATCH 37/37] Test arrow_ascii_split_whitespace,
 arrow_utf8_split_whitespace

---
 .../testthat/test-dplyr-string-functions.R    | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 844fa97d4af..64351a83ea7 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -297,6 +297,33 @@ test_that("strsplit and str_split", {
 
 })
 
+test_that("arrow_*_split_whitespace functions", {
+
+  # use only ASCII whitespace characters
+  df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
+
+  # use only non-ASCII whitespace characters
+  df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux"))
+
+  df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux")))
+
+  expect_equivalent(
+    df_ascii %>%
+      Table$create() %>%
+      mutate(x = arrow_ascii_split_whitespace(x)) %>%
+      collect(),
+    df_split
+  )
+  expect_equivalent(
+    df_utf8 %>%
+      Table$create() %>%
+      mutate(x = arrow_utf8_split_whitespace(x)) %>%
+      collect(),
+    df_split
+  )
+
+})
+
 test_that("errors and warnings in string splitting", {
   df <- tibble(x = c("Foo and bar", "baz and qux and quux"))