From 4710e2ec56cb845ea6b0bf575aa5d5cf8f4bc357 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 28 Apr 2021 14:47:24 +0100 Subject: [PATCH 01/37] Add split_pattern function --- r/NAMESPACE | 1 + r/R/compute.R | 17 +++++ r/_pkgdown.yml | 1 + r/man/split_pattern.Rd | 23 ++++++ r/src/compute.cpp | 10 +++ r/tests/testthat/test-compute-string-split.R | 79 ++++++++++++++++++++ 6 files changed, 131 insertions(+) create mode 100644 r/man/split_pattern.Rd create mode 100644 r/tests/testthat/test-compute-string-split.R diff --git a/r/NAMESPACE b/r/NAMESPACE index 607177235e9..08a5e9c532b 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -263,6 +263,7 @@ export(record_batch) export(s3_bucket) export(schema) export(set_cpu_count) +export(split_pattern) export(starts_with) export(string) export(struct) diff --git a/r/R/compute.R b/r/R/compute.R index 0641bf1615c..aea92009508 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -284,3 +284,20 @@ cast_options <- function(safe = TRUE, ...) { ) modifyList(opts, list(...)) } + +#' String splitting +#' +#' Split a string into a list of strings +#' +#' @param x `Scalar`, `Array` or `ChunkedArray` +#' @param pattern The exact substring to look for inside input values. +#' @param max_splits Maximum number of splits. Default -1 means no limit. +#' @param reverse When true, the splitting is done from the end of the string. +#' +#' @return An object (same type as `x`) containing lists of strings +#' +#' @export +split_pattern <- function(x, pattern, max_splits = -1L, reverse = FALSE){ + split_options <- list(pattern = pattern, max_splits = max_splits, reverse = reverse) + call_function("split_pattern", x, options = split_options) +} diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index bb77b416aab..822573d080e 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -162,6 +162,7 @@ reference: - match_arrow - value_counts - list_compute_functions + - split_pattern - title: Configuration contents: - arrow_info diff --git a/r/man/split_pattern.Rd b/r/man/split_pattern.Rd new file mode 100644 index 00000000000..179ce866b1a --- /dev/null +++ b/r/man/split_pattern.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute.R +\name{split_pattern} +\alias{split_pattern} +\title{String splitting} +\usage{ +split_pattern(x, pattern, max_splits = -1L, reverse = FALSE) +} +\arguments{ +\item{x}{\code{Scalar}, \code{Array} or \code{ChunkedArray}} + +\item{pattern}{The exact substring to look for inside input values.} + +\item{max_splits}{Maximum number of splits. Default -1 means no limit.} + +\item{reverse}{When true, the splitting is done from the end of the string.} +} +\value{ +An object (same type as \code{x}) containing lists of strings +} +\description{ +Split a string into a list of strings +} diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 34bc3bea456..80895fddd8a 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -233,6 +233,16 @@ std::shared_ptr make_compute_options( max_replacements); } + if (func_name == "split_pattern") { + using Options = arrow::compute::SplitPatternOptions; + int64_t max_splits = -1; + if (!Rf_isNull(options["max_splits"])) { + max_splits = cpp11::as_cpp(options["max_splits"]); + } + return std::make_shared(cpp11::as_cpp(options["pattern"]), + max_splits, cpp11::as_cpp(options["reverse"])); + } + return nullptr; } diff --git a/r/tests/testthat/test-compute-string-split.R b/r/tests/testthat/test-compute-string-split.R new file mode 100644 index 00000000000..b2fd9a44f79 --- /dev/null +++ b/r/tests/testthat/test-compute-string-split.R @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_str <-c("foobar, foo, bar", "baz, qux, quux.") + +test_that("split_pattern", { + + test_scalar <- Scalar$create(test_str[1]) + test_array <- Array$create(test_str) + test_charray <- chunked_array(test_str[1], test_str[2]) + + expect_equal( + split_pattern(test_scalar, pattern = ","), + Scalar$create(list(c("foobar", " foo", " bar"))) + ) + + expect_equal( + split_pattern(test_array, pattern = ","), + Array$create(list(c("foobar", " foo", " bar"), c("baz", " qux", " quux."))) + ) + + expect_equal( + split_pattern(test_charray, pattern = ","), + ChunkedArray$create(list(c("foobar", " foo", " bar"), c("baz", " qux", " quux."))) + ) + +}) + +test_that("split_pattern with max splits and reverse", { + + test_scalar <- Scalar$create(test_str[1]) + test_array <- Array$create(test_str) + test_charray <- chunked_array(test_str[1], test_str[2]) + + expect_equal( + split_pattern(test_scalar, pattern = ",", max_split = 1), + Scalar$create(list(c("foobar", " foo, bar"))) + ) + + expect_equal( + split_pattern(test_array, pattern = ",", max_split = 1), + Array$create(list(c("foobar", " foo, bar"), c("baz", " qux, quux."))) + ) + + expect_equal( + split_pattern(test_charray, pattern = ",", max_split = 1), + ChunkedArray$create(list(c("foobar", " foo, bar"), c("baz", " qux, quux."))) + ) + + expect_equal( + split_pattern(test_scalar, pattern = ",", max_split = 1, reverse = TRUE), + Scalar$create(list(c("foobar, foo", " bar"))) + ) + + expect_equal( + split_pattern(test_array, pattern = ",", max_split = 1, reverse = TRUE), + Array$create(list(c("foobar, foo", " bar"), c("baz, qux", " quux."))) + ) + + expect_equal( + split_pattern(test_charray, pattern = ",", max_split = 1, reverse = TRUE), + ChunkedArray$create(list(c("foobar, foo", " bar"), c("baz, qux", " quux."))) + ) + +}) From 33a4447afbce05ed579e7f52b9bebe3b595a4308 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 28 Apr 2021 19:03:37 +0100 Subject: [PATCH 02/37] Remove references to split_pattern arguments in favour of strsplit arguments --- r/R/compute.R | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index aea92009508..470e768c133 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -285,19 +285,8 @@ cast_options <- function(safe = TRUE, ...) { modifyList(opts, list(...)) } -#' String splitting -#' -#' Split a string into a list of strings -#' -#' @param x `Scalar`, `Array` or `ChunkedArray` -#' @param pattern The exact substring to look for inside input values. -#' @param max_splits Maximum number of splits. Default -1 means no limit. -#' @param reverse When true, the splitting is done from the end of the string. -#' -#' @return An object (same type as `x`) containing lists of strings -#' #' @export -split_pattern <- function(x, pattern, max_splits = -1L, reverse = FALSE){ - split_options <- list(pattern = pattern, max_splits = max_splits, reverse = reverse) +strsplit.ArrowDatum <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE){ + split_options <- list(pattern = split) call_function("split_pattern", x, options = split_options) } From a370b9ab8ca2c7b5e42ec4dbe570626fb8ccd849 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 28 Apr 2021 19:03:58 +0100 Subject: [PATCH 03/37] Remove reference to split_pattern from pkgdown --- r/_pkgdown.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 822573d080e..bb77b416aab 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -162,7 +162,6 @@ reference: - match_arrow - value_counts - list_compute_functions - - split_pattern - title: Configuration contents: - arrow_info From 4677f3173ccb1887ceeb4e62af75731ffe898d59 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 28 Apr 2021 19:50:55 +0100 Subject: [PATCH 04/37] Update tests for strsplit --- r/NAMESPACE | 4 +- r/R/compute.R | 11 +++- r/man/split_pattern.Rd | 23 -------- r/tests/testthat/test-compute-string-split.R | 61 +++----------------- 4 files changed, 18 insertions(+), 81 deletions(-) delete mode 100644 r/man/split_pattern.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 08a5e9c532b..e17901e43c3 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -91,6 +91,8 @@ S3method(read_message,default) S3method(row.names,ArrowTabular) S3method(sort,ArrowDatum) S3method(sort,Scalar) +S3method(strsplit,ArrowDatum) +S3method(strsplit,default) S3method(sum,ArrowDatum) S3method(tail,ArrowDatum) S3method(tail,ArrowTabular) @@ -263,9 +265,9 @@ export(record_batch) export(s3_bucket) export(schema) export(set_cpu_count) -export(split_pattern) export(starts_with) export(string) +export(strsplit) export(struct) export(time32) export(time64) diff --git a/r/R/compute.R b/r/R/compute.R index 470e768c133..0ce835655a5 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -286,7 +286,12 @@ cast_options <- function(safe = TRUE, ...) { } #' @export -strsplit.ArrowDatum <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE){ - split_options <- list(pattern = split) - call_function("split_pattern", x, options = split_options) +strsplit <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE, ...) UseMethod("strsplit") + +#' @export +strsplit.default <- base::strsplit + +#' @export +strsplit.ArrowDatum <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE, ...){ + call_function("split_pattern", x, options = list(pattern = split, reverse = FALSE)) } diff --git a/r/man/split_pattern.Rd b/r/man/split_pattern.Rd deleted file mode 100644 index 179ce866b1a..00000000000 --- a/r/man/split_pattern.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compute.R -\name{split_pattern} -\alias{split_pattern} -\title{String splitting} -\usage{ -split_pattern(x, pattern, max_splits = -1L, reverse = FALSE) -} -\arguments{ -\item{x}{\code{Scalar}, \code{Array} or \code{ChunkedArray}} - -\item{pattern}{The exact substring to look for inside input values.} - -\item{max_splits}{Maximum number of splits. Default -1 means no limit.} - -\item{reverse}{When true, the splitting is done from the end of the string.} -} -\value{ -An object (same type as \code{x}) containing lists of strings -} -\description{ -Split a string into a list of strings -} diff --git a/r/tests/testthat/test-compute-string-split.R b/r/tests/testthat/test-compute-string-split.R index b2fd9a44f79..d8869034e02 100644 --- a/r/tests/testthat/test-compute-string-split.R +++ b/r/tests/testthat/test-compute-string-split.R @@ -17,63 +17,16 @@ test_str <-c("foobar, foo, bar", "baz, qux, quux.") -test_that("split_pattern", { - - test_scalar <- Scalar$create(test_str[1]) - test_array <- Array$create(test_str) - test_charray <- chunked_array(test_str[1], test_str[2]) - - expect_equal( - split_pattern(test_scalar, pattern = ","), - Scalar$create(list(c("foobar", " foo", " bar"))) - ) - - expect_equal( - split_pattern(test_array, pattern = ","), - Array$create(list(c("foobar", " foo", " bar"), c("baz", " qux", " quux."))) - ) - - expect_equal( - split_pattern(test_charray, pattern = ","), - ChunkedArray$create(list(c("foobar", " foo", " bar"), c("baz", " qux", " quux."))) - ) - +test_that("strsplit for Array and ChunkedArray ", { + expect_vector_equal(strsplit(input, split = ","), test_str, ignore_attr = TRUE) }) -test_that("split_pattern with max splits and reverse", { - +test_that("strsplit for Scalar", { test_scalar <- Scalar$create(test_str[1]) - test_array <- Array$create(test_str) - test_charray <- chunked_array(test_str[1], test_str[2]) - expect_equal( - split_pattern(test_scalar, pattern = ",", max_split = 1), - Scalar$create(list(c("foobar", " foo, bar"))) - ) - - expect_equal( - split_pattern(test_array, pattern = ",", max_split = 1), - Array$create(list(c("foobar", " foo, bar"), c("baz", " qux, quux."))) - ) - - expect_equal( - split_pattern(test_charray, pattern = ",", max_split = 1), - ChunkedArray$create(list(c("foobar", " foo, bar"), c("baz", " qux, quux."))) - ) - - expect_equal( - split_pattern(test_scalar, pattern = ",", max_split = 1, reverse = TRUE), - Scalar$create(list(c("foobar, foo", " bar"))) - ) - - expect_equal( - split_pattern(test_array, pattern = ",", max_split = 1, reverse = TRUE), - Array$create(list(c("foobar, foo", " bar"), c("baz, qux", " quux."))) - ) - - expect_equal( - split_pattern(test_charray, pattern = ",", max_split = 1, reverse = TRUE), - ChunkedArray$create(list(c("foobar, foo", " bar"), c("baz, qux", " quux."))) + strsplit(test_scalar, split = ","), + Scalar$create(list(c("foobar", " foo", " bar"))) ) - }) + + From e1aa64f19fb6537b48219ef793c8bbad69e15614 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 12:13:33 +0100 Subject: [PATCH 05/37] Remove test for strsplit --- r/tests/testthat/test-compute-string-split.R | 32 -------------------- 1 file changed, 32 deletions(-) delete mode 100644 r/tests/testthat/test-compute-string-split.R diff --git a/r/tests/testthat/test-compute-string-split.R b/r/tests/testthat/test-compute-string-split.R deleted file mode 100644 index d8869034e02..00000000000 --- a/r/tests/testthat/test-compute-string-split.R +++ /dev/null @@ -1,32 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -test_str <-c("foobar, foo, bar", "baz, qux, quux.") - -test_that("strsplit for Array and ChunkedArray ", { - expect_vector_equal(strsplit(input, split = ","), test_str, ignore_attr = TRUE) -}) - -test_that("strsplit for Scalar", { - test_scalar <- Scalar$create(test_str[1]) - expect_equal( - strsplit(test_scalar, split = ","), - Scalar$create(list(c("foobar", " foo", " bar"))) - ) -}) - - From a3f073838eccafab8e8408b5ce2313fd9865be7a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 12:14:12 +0100 Subject: [PATCH 06/37] Remove s3 methods for strsplit --- r/NAMESPACE | 3 --- 1 file changed, 3 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index e17901e43c3..607177235e9 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -91,8 +91,6 @@ S3method(read_message,default) S3method(row.names,ArrowTabular) S3method(sort,ArrowDatum) S3method(sort,Scalar) -S3method(strsplit,ArrowDatum) -S3method(strsplit,default) S3method(sum,ArrowDatum) S3method(tail,ArrowDatum) S3method(tail,ArrowTabular) @@ -267,7 +265,6 @@ export(schema) export(set_cpu_count) export(starts_with) export(string) -export(strsplit) export(struct) export(time32) export(time64) From 43120a6124c3ac5c0cac9fee6fb67acb394f7a3d Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 13:38:50 +0100 Subject: [PATCH 07/37] Remove strsplit from compute.R --- r/R/compute.R | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 0ce835655a5..0641bf1615c 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -284,14 +284,3 @@ cast_options <- function(safe = TRUE, ...) { ) modifyList(opts, list(...)) } - -#' @export -strsplit <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE, ...) UseMethod("strsplit") - -#' @export -strsplit.default <- base::strsplit - -#' @export -strsplit.ArrowDatum <- function(x, split, fixed = TRUE, perl = FALSE, useBytes = FALSE, ...){ - call_function("split_pattern", x, options = list(pattern = split, reverse = FALSE)) -} From 0ecbab41402303adad1be308f8826da81255cd26 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 14:09:49 +0100 Subject: [PATCH 08/37] Implement strsplit --- r/R/dplyr.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 845cb3a1815..e367d530c0b 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -410,6 +410,18 @@ build_function_list <- function(FUN) { options = list(null_encoding_behavior = null_encoding_behavior) ) }, + strsplit = function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){ + + regex_metachars <- c(".", "\\", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?") + is_regex <- map_lgl(regex_metachars, ~grepl(.x, split, fixed = TRUE)) + + # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex + if(!fixed && is_regex || perl){ + stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE) + } + + FUN("split_pattern", x, options = list(pattern = split)) + } # as.factor() is mapped in expression.R as.character = function(x) { FUN("cast", x, options = cast_options(to_type = string())) From 5615486fd43e3312d5402596d049457258181a70 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 14:23:35 +0100 Subject: [PATCH 09/37] Fix typo --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index e367d530c0b..0e6ac164b74 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -421,7 +421,7 @@ build_function_list <- function(FUN) { } FUN("split_pattern", x, options = list(pattern = split)) - } + }, # as.factor() is mapped in expression.R as.character = function(x) { FUN("cast", x, options = cast_options(to_type = string())) From 9ceff3ce0f41e2c0910686e47da367185b2f5d7c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 14:40:28 +0100 Subject: [PATCH 10/37] Add extra args to split_pattern --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 0e6ac164b74..b2bfd977a8e 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -420,7 +420,7 @@ build_function_list <- function(FUN) { stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE) } - FUN("split_pattern", x, options = list(pattern = split)) + FUN("split_pattern", x, options = list(pattern = split, reverse = FALSE, max_splits = -1)) }, # as.factor() is mapped in expression.R as.character = function(x) { From d140155f624301ea038d07ff0ecb4510cd2fa702 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 14:56:12 +0100 Subject: [PATCH 11/37] Whitespace fix --- r/R/dplyr.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index b2bfd977a8e..61d199072aa 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -419,7 +419,6 @@ build_function_list <- function(FUN) { if(!fixed && is_regex || perl){ stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE) } - FUN("split_pattern", x, options = list(pattern = split, reverse = FALSE, max_splits = -1)) }, # as.factor() is mapped in expression.R From 8f5dcc6a1791365629e73af2194427cee7d1bb21 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 14:58:19 +0100 Subject: [PATCH 12/37] Add in 'any' to regex check --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 61d199072aa..1ccf775ad85 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -413,7 +413,7 @@ build_function_list <- function(FUN) { strsplit = function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){ regex_metachars <- c(".", "\\", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?") - is_regex <- map_lgl(regex_metachars, ~grepl(.x, split, fixed = TRUE)) + is_regex <- any(map_lgl(regex_metachars, ~grepl(.x, split, fixed = TRUE))) # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex if(!fixed && is_regex || perl){ From 6a7b756d593b97f97fc14da44d5edf8af07ff76f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 15:47:02 +0100 Subject: [PATCH 13/37] Assert split length and shotern regex metachar search --- r/R/dplyr.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 1ccf775ad85..d6e79e432db 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -412,8 +412,9 @@ build_function_list <- function(FUN) { }, strsplit = function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){ - regex_metachars <- c(".", "\\", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?") - is_regex <- any(map_lgl(regex_metachars, ~grepl(.x, split, fixed = TRUE))) + assert_that(length(split) == 1) + + is_regex <- grepl("[.\\|()[{^$*+?]", split) # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex if(!fixed && is_regex || perl){ From 983cb9c26427f222f643ed1a6ff052e743b1299f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 15:51:50 +0100 Subject: [PATCH 14/37] Ignore Perl --- r/R/dplyr.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index d6e79e432db..5409ba24dd9 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -417,9 +417,12 @@ build_function_list <- function(FUN) { is_regex <- grepl("[.\\|()[{^$*+?]", split) # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex - if(!fixed && is_regex || perl){ + if(!fixed && is_regex){ stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE) } + if(fixed && perl){ + warning("argument 'perl = TRUE' will be ignored") + } FUN("split_pattern", x, options = list(pattern = split, reverse = FALSE, max_splits = -1)) }, # as.factor() is mapped in expression.R From 48e8917678cd3d97634832d83ef6c92eccdc1d75 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 May 2021 15:22:55 +0100 Subject: [PATCH 15/37] Add bdingin for str_split, and add tests --- r/R/dplyr.R | 51 +++++++++++++------ .../testthat/test-dplyr-string-functions.R | 47 +++++++++++++++++ 2 files changed, 83 insertions(+), 15 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 5409ba24dd9..c2ccebea9cf 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -410,21 +410,6 @@ build_function_list <- function(FUN) { options = list(null_encoding_behavior = null_encoding_behavior) ) }, - strsplit = function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){ - - assert_that(length(split) == 1) - - is_regex <- grepl("[.\\|()[{^$*+?]", split) - - # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex - if(!fixed && is_regex){ - stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE) - } - if(fixed && perl){ - warning("argument 'perl = TRUE' will be ignored") - } - FUN("split_pattern", x, options = list(pattern = split, reverse = FALSE, max_splits = -1)) - }, # as.factor() is mapped in expression.R as.character = function(x) { FUN("cast", x, options = cast_options(to_type = string())) @@ -492,6 +477,8 @@ build_function_list <- function(FUN) { gsub = arrow_r_string_replace_function(FUN, -1L), str_replace = arrow_stringr_string_replace_function(FUN, 1L), str_replace_all = arrow_stringr_string_replace_function(FUN, -1L), + strsplit = arrow_r_string_split_function(FUN), + str_split = arrow_stringr_string_split_function(FUN), between = function(x, left, right) { x >= left & x <= right }, @@ -555,6 +542,31 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) { } } +arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1){ + function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){ + + assert_that(length(split) == 1) + + # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex + if(!fixed && contains_regex(split)){ + stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE) + } + if(fixed && perl){ + warning("argument 'perl = TRUE' will be ignored") + } + FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits)) + } +} + +arrow_stringr_string_split_function <- function(FUN, reverse = FALSE){ + function(string, pattern, n = 0){ + if(contains_regex(pattern)){ + stop("regular expression matching not supported in str_split for Arrow", call. = FALSE) + } + FUN("split_pattern", string, options = list(pattern = pattern, reverse = reverse, max_splits = n-1)) + } +} + # format `pattern` as needed for case insensitivity and literal matching by RE2 format_string_pattern <- function(pattern, ignore.case, fixed) { # Arrow lacks native support for case-insensitive literal string matching and @@ -1114,3 +1126,12 @@ not_implemented_for_dataset <- function(method) { call. = FALSE ) } + +#' Does this string contain regex metacharacters? +#' +#' @param string String to be tested +#' @keywords internal +#' @return Logical: does `string` contain regex metacharacters? +contains_regex <- function(string){ + grepl("[.\\|()[{^$*+?]", string) +} \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 5faf2436f55..811e7d7babf 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -239,6 +239,53 @@ test_that("str_replace and str_replace_all", { }) +test_that("strsplit and str_split", { + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) + + expect_dplyr_equal( + input %>% + transmute(x = strsplit(x, "and")) %>% + collect(), + df + ) + + expect_warning( + df %>% + Table$create() %>% + mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>% + collect(), + regexp = "not supported in Arrow" + ) + + expect_dplyr_equal( + input %>% + mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + transmute(x = str_split(x, "and")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + transmute(x = str_split(x, "and", n = 2)) %>% + collect(), + df + ) + + expect_warning( + df %>% + Table$create() %>% + transmute(x = str_split(x, "and.?")) %>% + collect() + ) +}) + test_that("backreferences in pattern", { skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)") df <- tibble(x = c("Foo", "bar")) From 45f36c4a7866443b3b7f919a56057b220b388806 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 May 2021 15:35:08 +0100 Subject: [PATCH 16/37] Update test to use mutate --- r/tests/testthat/test-dplyr-string-functions.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 811e7d7babf..363debca917 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -248,7 +248,7 @@ test_that("strsplit and str_split", { collect(), df ) - + expect_warning( df %>% Table$create() %>% @@ -256,32 +256,32 @@ test_that("strsplit and str_split", { collect(), regexp = "not supported in Arrow" ) - + expect_dplyr_equal( input %>% mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% collect(), df ) - + expect_dplyr_equal( input %>% transmute(x = str_split(x, "and")) %>% collect(), df ) - + expect_dplyr_equal( input %>% transmute(x = str_split(x, "and", n = 2)) %>% collect(), df ) - + expect_warning( df %>% Table$create() %>% - transmute(x = str_split(x, "and.?")) %>% + mutate(x = str_split(x, "and.?")) %>% collect() ) }) From abeef7c1cee7bf968e9d13e113fa1f11413af277 Mon Sep 17 00:00:00 2001 From: Nic Date: Tue, 4 May 2021 19:18:44 +0100 Subject: [PATCH 17/37] Update r/R/dplyr.R Co-authored-by: Ian Cook --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index c2ccebea9cf..0b5eaa31bff 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -1134,4 +1134,4 @@ not_implemented_for_dataset <- function(method) { #' @return Logical: does `string` contain regex metacharacters? contains_regex <- function(string){ grepl("[.\\|()[{^$*+?]", string) -} \ No newline at end of file +} From 1f77d66d9ab1bf5451cdbffbd931636ba7ab9aed Mon Sep 17 00:00:00 2001 From: Nic Date: Tue, 4 May 2021 19:23:06 +0100 Subject: [PATCH 18/37] use is.string instead of length == 1 Co-authored-by: Ian Cook --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 0b5eaa31bff..78d43565e58 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -545,7 +545,7 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) { arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1){ function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){ - assert_that(length(split) == 1) + assert_that(is.string(split)) # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex if(!fixed && contains_regex(split)){ From 2448450b4f6be375e6af95b1076d5cd8db2b6734 Mon Sep 17 00:00:00 2001 From: Nic Date: Tue, 4 May 2021 19:23:50 +0100 Subject: [PATCH 19/37] Capitalise Co-authored-by: Ian Cook --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 78d43565e58..945a8dff71a 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -549,7 +549,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex if(!fixed && contains_regex(split)){ - stop("regular expression matching not supported in strsplit for Arrow", call. = FALSE) + stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE) } if(fixed && perl){ warning("argument 'perl = TRUE' will be ignored") From 14cf037608c8dc6148e689837921912e8d5b800a Mon Sep 17 00:00:00 2001 From: Nic Date: Tue, 4 May 2021 19:23:58 +0100 Subject: [PATCH 20/37] Update r/R/dplyr.R Co-authored-by: Ian Cook --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 945a8dff71a..52b54852d3b 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -561,7 +561,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) arrow_stringr_string_split_function <- function(FUN, reverse = FALSE){ function(string, pattern, n = 0){ if(contains_regex(pattern)){ - stop("regular expression matching not supported in str_split for Arrow", call. = FALSE) + stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE) } FUN("split_pattern", string, options = list(pattern = pattern, reverse = reverse, max_splits = n-1)) } From fc743e73098f5a5a76bad093c124d9cf45f9fe99 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 May 2021 19:36:12 +0100 Subject: [PATCH 21/37] Update spacing --- r/R/dplyr.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 52b54852d3b..17bdbd75afa 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -542,8 +542,8 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) { } } -arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1){ - function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE){ +arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) { + function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) { assert_that(is.string(split)) @@ -551,7 +551,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) if(!fixed && contains_regex(split)){ stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE) } - if(fixed && perl){ + if (fixed && perl) { warning("argument 'perl = TRUE' will be ignored") } FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits)) @@ -1132,6 +1132,6 @@ not_implemented_for_dataset <- function(method) { #' @param string String to be tested #' @keywords internal #' @return Logical: does `string` contain regex metacharacters? -contains_regex <- function(string){ +contains_regex <- function(string) { grepl("[.\\|()[{^$*+?]", string) } From 22b9256e9246306a50d0c2cf5f6b84a567fd20b7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 May 2021 19:46:29 +0100 Subject: [PATCH 22/37] Add spacing --- r/R/dplyr.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 17bdbd75afa..fd0643fb58a 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -548,7 +548,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) assert_that(is.string(split)) # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex - if(!fixed && contains_regex(split)){ + if (!fixed && contains_regex(split)) { stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE) } if (fixed && perl) { @@ -558,9 +558,9 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) } } -arrow_stringr_string_split_function <- function(FUN, reverse = FALSE){ - function(string, pattern, n = 0){ - if(contains_regex(pattern)){ +arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) { + function(string, pattern, n = 0) { + if (contains_regex(pattern)) { stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE) } FUN("split_pattern", string, options = list(pattern = pattern, reverse = reverse, max_splits = n-1)) From 51d250fab03ef9701ee7148f6f09e6f72f3599dc Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 May 2021 16:38:04 +0100 Subject: [PATCH 23/37] Use get_stringr_pattern --- r/R/dplyr.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index fd0643fb58a..8c58459f245 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -560,10 +560,14 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) { function(string, pattern, n = 0) { - if (contains_regex(pattern)) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + if (!opts$fixed && contains_regex(opts$pattern)) { stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE) } - FUN("split_pattern", string, options = list(pattern = pattern, reverse = reverse, max_splits = n-1)) + if (opts$ignore_case) { + stop("Case-insensitive string splitting not supported in Arrow", call. = FALSE) + } + FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1)) } } From a521405d4a6f21d073daa283c01abf8f1ca0e9b1 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 May 2021 17:09:44 +0100 Subject: [PATCH 24/37] Document get_stringr_pattern_options function, change default value of fixed to FALSE (to match stringr), --- r/R/dplyr.R | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 8c58459f245..4cef4161177 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -603,9 +603,16 @@ format_string_replacement <- function(replacement, ignore.case, fixed) { replacement } -# this function assigns definitions for the stringr pattern modifier functions -# (fixed, regex, etc.) in itself, and uses them to evaluate the quoted -# expression `pattern` +#' Get `stringr` pattern options +#' +#' This function assigns definitions for the stringr pattern modifier functions +#' (fixed, regex, etc.) in itself, and uses them to evaluate the quoted +#' expression `pattern` +#' +#' @param pattern Unevaluated pattern to be evaluated +#' +#' @return List containing elements `pattern`, `fixed`, and `ignore_case` +#' @keywords internal get_stringr_pattern_options <- function(pattern) { fixed <- function(pattern, ignore_case = FALSE, ...) { check_dots(...) @@ -637,7 +644,7 @@ get_stringr_pattern_options <- function(pattern) { } ensure_opts <- function(opts) { if (is.character(opts)) { - opts <- list(pattern = opts, fixed = TRUE, ignore_case = FALSE) + opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE) } opts } From b758b5520dba31645de8a46d5ef36dff8516db5a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 May 2021 17:15:48 +0100 Subject: [PATCH 25/37] Fix minor style issues, add extra tests --- .../testthat/test-dplyr-string-functions.R | 49 +++++++++++++++++-- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 363debca917..067204d5dd4 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -52,7 +52,7 @@ skip_if_not_available("re2") test_that("grepl", { df <- tibble(x = c("Foo", "bar")) - for(fixed in c(TRUE, FALSE)) { + for (fixed in c(TRUE, FALSE)) { expect_dplyr_equal( input %>% @@ -150,7 +150,7 @@ test_that("str_detect", { test_that("sub and gsub", { df <- tibble(x = c("Foo", "bar")) - for(fixed in c(TRUE, FALSE)) { + for (fixed in c(TRUE, FALSE)) { expect_dplyr_equal( input %>% @@ -206,12 +206,27 @@ test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", { test_that("str_replace and str_replace_all", { df <- tibble(x = c("Foo", "bar")) + expect_dplyr_equal( + input %>% + transmute(x = str_replace_all(x, "^F", "baz")) %>% + collect(), + df + ) + expect_dplyr_equal( input %>% transmute(x = str_replace_all(x, regex("^F"), "baz")) %>% collect(), df ) + + expect_dplyr_equal( + input %>% + mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>% + collect(), + df + ) + expect_dplyr_equal( input %>% transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>% @@ -240,11 +255,12 @@ test_that("str_replace and str_replace_all", { }) test_that("strsplit and str_split", { + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) expect_dplyr_equal( input %>% - transmute(x = strsplit(x, "and")) %>% + mutate(x = strsplit(x, "and")) %>% collect(), df ) @@ -266,17 +282,39 @@ test_that("strsplit and str_split", { expect_dplyr_equal( input %>% - transmute(x = str_split(x, "and")) %>% + mutate(x = str_split(x, "and")) %>% collect(), df ) expect_dplyr_equal( input %>% - transmute(x = str_split(x, "and", n = 2)) %>% + mutate(x = str_split(x, "and", n = 2)) %>% collect(), df ) + + expect_dplyr_equal( + input %>% + mutate(x = str_split(x, fixed("and"), n = 2)) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(x = str_split(x, regex("and"), n = 2)) %>% + collect(), + df + ) + + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, regex("and.*"), n = 2)) %>% + collect(), + regexp = "not supported" + ) expect_warning( df %>% @@ -284,6 +322,7 @@ test_that("strsplit and str_split", { mutate(x = str_split(x, "and.?")) %>% collect() ) + }) test_that("backreferences in pattern", { From 3d4f9ce069701d66bc1226cea0f14b40b6baf69f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 May 2021 17:39:03 +0100 Subject: [PATCH 26/37] Reorganise warning/error generating tests into one section for consistency, and add extra tests when no helper function used --- .../testthat/test-dplyr-string-functions.R | 69 ++++++++++++------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 067204d5dd4..e29f25218a9 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -264,15 +264,7 @@ test_that("strsplit and str_split", { collect(), df ) - - expect_warning( - df %>% - Table$create() %>% - mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>% - collect(), - regexp = "not supported in Arrow" - ) - + expect_dplyr_equal( input %>% mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% @@ -307,22 +299,6 @@ test_that("strsplit and str_split", { collect(), df ) - - expect_warning( - df %>% - Table$create() %>% - mutate(x = str_split(x, regex("and.*"), n = 2)) %>% - collect(), - regexp = "not supported" - ) - - expect_warning( - df %>% - Table$create() %>% - mutate(x = str_split(x, "and.?")) %>% - collect() - ) - }) test_that("backreferences in pattern", { @@ -351,6 +327,12 @@ test_that("backreferences (substitutions) in replacement", { collect(), tibble(url = "https://arrow.apache.org/docs/r/") ) + expect_dplyr_equal( + input %>% + transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>% + collect(), + df + ) expect_dplyr_equal( input %>% transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>% @@ -408,6 +390,41 @@ test_that("errors and warnings", { # These conditions generate an error, but abandon_ship() catches the error, # issues a warning, and pulls the data into R + expect_warning( + df %>% + Table$create() %>% + mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>% + collect(), + regexp = "not supported in Arrow" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, "and.?")) %>% + collect() + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, regex("and.?"), n = 2)) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, coll("and.?"))) %>% + collect(), + regexp = "not supported" + ) + + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, boundary(type = "word"))) %>% + collect(), + regexp = "not supported" + ) expect_warning( df %>% Table$create() %>% @@ -422,7 +439,6 @@ test_that("errors and warnings", { collect(), "not supported" ) - # This condition generates a warning expect_warning( df %>% @@ -430,4 +446,5 @@ test_that("errors and warnings", { transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")), "Ignoring pattern modifier argument not supported in Arrow: \"multiline\"" ) + }) From f8cfff5b2cc34c7ef7ebf9812c977cb90263ec29 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 13:14:17 -0400 Subject: [PATCH 27/37] Fix missing } --- r/src/compute.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 58890c3785a..7c58c4a3a0a 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -241,7 +241,8 @@ std::shared_ptr make_compute_options( } return std::make_shared(cpp11::as_cpp(options["pattern"]), max_splits, cpp11::as_cpp(options["reverse"])); - + } + if (func_name == "variance" || func_name == "stddev") { using Options = arrow::compute::VarianceOptions; return std::make_shared(cpp11::as_cpp(options["ddof"])); From 3b66f4f08e7ab2c84e1a93707ae37302ab677e70 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 13:17:29 -0400 Subject: [PATCH 28/37] Re-lint --- r/src/compute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 7c58c4a3a0a..6a82d5af3b3 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -242,7 +242,7 @@ std::shared_ptr make_compute_options( return std::make_shared(cpp11::as_cpp(options["pattern"]), max_splits, cpp11::as_cpp(options["reverse"])); } - + if (func_name == "variance" || func_name == "stddev") { using Options = arrow::compute::VarianceOptions; return std::make_shared(cpp11::as_cpp(options["ddof"])); From 245c99fdddea5e4a627081e2e98fd97f85988fe0 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 17:23:56 -0400 Subject: [PATCH 29/37] Make reverse optional in split_pattern options --- r/src/compute.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 6a82d5af3b3..205f6068565 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -239,8 +239,12 @@ std::shared_ptr make_compute_options( if (!Rf_isNull(options["max_splits"])) { max_splits = cpp11::as_cpp(options["max_splits"]); } + bool reverse = false; + if (!Rf_isNull(options["reverse"])) { + reverse = cpp11::as_cpp(options["reverse"]); + } return std::make_shared(cpp11::as_cpp(options["pattern"]), - max_splits, cpp11::as_cpp(options["reverse"])); + max_splits, reverse); } if (func_name == "variance" || func_name == "stddev") { From b3884f26c2f23f59b2504f9f2301e6b6f5696626 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 17:24:55 -0400 Subject: [PATCH 30/37] Add handling for utf8_split_whitespace, ascii_split_whitespace options --- r/src/compute.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 205f6068565..0ffe53578c4 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -247,6 +247,19 @@ std::shared_ptr make_compute_options( max_splits, reverse); } + if (func_name == "utf8_split_whitespace" || func_name == "ascii_split_whitespace") { + using Options = arrow::compute::SplitOptions; + int64_t max_splits = -1; + if (!Rf_isNull(options["max_splits"])) { + max_splits = cpp11::as_cpp(options["max_splits"]); + } + bool reverse = false; + if (!Rf_isNull(options["reverse"])) { + reverse = cpp11::as_cpp(options["reverse"]); + } + return std::make_shared(max_splits, reverse); + } + if (func_name == "variance" || func_name == "stddev") { using Options = arrow::compute::VarianceOptions; return std::make_shared(cpp11::as_cpp(options["ddof"])); From 8dfc213c2d8aedc6621259c9cd8667c97c60d48b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 18:26:27 -0400 Subject: [PATCH 31/37] Tweak str_split() args for consistency with stringr --- r/R/dplyr.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 3ff11734ffd..bcc315dacb2 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -558,7 +558,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) } arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) { - function(string, pattern, n = 0) { + function(string, pattern, n = Inf, simplify = FALSE) { opts <- get_stringr_pattern_options(enexpr(pattern)) if (!opts$fixed && contains_regex(opts$pattern)) { stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE) @@ -566,7 +566,16 @@ arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) { if (opts$ignore_case) { stop("Case-insensitive string splitting not supported in Arrow", call. = FALSE) } - FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1)) + if (n == 0) { + stop("Splitting strings into zero parts not supported in Arrow" , call. = FALSE) + } + if (identical(n, Inf)) { + n <- 0L + } + if (simplify) { + warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE) + } + FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1L)) } } From cbc0e78bf3aa5d47b036d9f1397dd3a866642f6a Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 18:27:19 -0400 Subject: [PATCH 32/37] Add and reorganize tests --- .../testthat/test-dplyr-string-functions.R | 175 ++++++++++-------- 1 file changed, 101 insertions(+), 74 deletions(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index e29f25218a9..844fa97d4af 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -255,53 +255,144 @@ test_that("str_replace and str_replace_all", { }) test_that("strsplit and str_split", { - + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) - + expect_dplyr_equal( input %>% mutate(x = strsplit(x, "and")) %>% collect(), df ) - expect_dplyr_equal( input %>% mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% collect(), df ) - expect_dplyr_equal( input %>% mutate(x = str_split(x, "and")) %>% collect(), df ) - expect_dplyr_equal( input %>% mutate(x = str_split(x, "and", n = 2)) %>% collect(), df ) - expect_dplyr_equal( input %>% mutate(x = str_split(x, fixed("and"), n = 2)) %>% collect(), df ) - expect_dplyr_equal( input %>% mutate(x = str_split(x, regex("and"), n = 2)) %>% collect(), df ) + +}) + +test_that("errors and warnings in string splitting", { + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) + + # These conditions generate an error, but abandon_ship() catches the error, + # issues a warning, and pulls the data into R + expect_warning( + df %>% + Table$create() %>% + mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, "and.?")) %>% + collect() + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, regex("and.?"), n = 2)) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, fixed("and", ignore_case = TRUE))) %>% + collect(), + "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, coll("and.?"))) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, boundary(type = "word"))) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, "and", n = 0)) %>% + collect(), + regexp = "not supported" + ) + + # This condition generates a warning + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, fixed("and"), simplify = TRUE)) %>% + collect(), + "ignored" + ) + }) -test_that("backreferences in pattern", { +test_that("errors and warnings in string detection and replacement", { + df <- tibble(x = c("Foo", "bar")) + + # These conditions generate an error, but abandon_ship() catches the error, + # issues a warning, and pulls the data into R + expect_warning( + df %>% + Table$create() %>% + filter(str_detect(x, boundary(type = "character"))) %>% + collect(), + regexp = "not implemented" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_replace_all(x, coll("o", locale = "en"), "ó")) %>% + collect(), + regexp = "not supported" + ) + + # This condition generates a warning + expect_warning( + df %>% + Table$create() %>% + transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")), + "Ignoring pattern modifier argument not supported in Arrow: \"multiline\"" + ) + +}) + +test_that("backreferences in pattern in string detection", { skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)") df <- tibble(x = c("Foo", "bar")) @@ -313,7 +404,7 @@ test_that("backreferences in pattern", { ) }) -test_that("backreferences (substitutions) in replacement", { +test_that("backreferences (substitutions) in string replacement", { df <- tibble(x = c("Foo", "bar")) expect_dplyr_equal( @@ -347,7 +438,7 @@ test_that("backreferences (substitutions) in replacement", { ) }) -test_that("edge cases", { +test_that("edge cases in string detection and replacement", { # in case-insensitive fixed match/replace, test that "\\E" in the search # string and backslashes in the replacement string are interpreted literally. @@ -384,67 +475,3 @@ test_that("edge cases", { ) }) - -test_that("errors and warnings", { - df <- tibble(x = c("Foo", "bar")) - - # These conditions generate an error, but abandon_ship() catches the error, - # issues a warning, and pulls the data into R - expect_warning( - df %>% - Table$create() %>% - mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>% - collect(), - regexp = "not supported in Arrow" - ) - expect_warning( - df %>% - Table$create() %>% - mutate(x = str_split(x, "and.?")) %>% - collect() - ) - expect_warning( - df %>% - Table$create() %>% - mutate(x = str_split(x, regex("and.?"), n = 2)) %>% - collect(), - regexp = "not supported" - ) - expect_warning( - df %>% - Table$create() %>% - mutate(x = str_split(x, coll("and.?"))) %>% - collect(), - regexp = "not supported" - ) - - expect_warning( - df %>% - Table$create() %>% - mutate(x = str_split(x, boundary(type = "word"))) %>% - collect(), - regexp = "not supported" - ) - expect_warning( - df %>% - Table$create() %>% - filter(str_detect(x, boundary(type = "character"))) %>% - collect(), - "not implemented" - ) - expect_warning( - df %>% - Table$create() %>% - mutate(x = str_replace_all(x, coll("o", locale = "en"), "ó")) %>% - collect(), - "not supported" - ) - # This condition generates a warning - expect_warning( - df %>% - Table$create() %>% - transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")), - "Ignoring pattern modifier argument not supported in Arrow: \"multiline\"" - ) - -}) From 4b96d07370e6f4e761eda7e30703e880fbf74469 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 18:28:42 -0400 Subject: [PATCH 33/37] Add missing call. = FALSE --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index bcc315dacb2..d53b5ee57c9 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -551,7 +551,7 @@ arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE) } if (fixed && perl) { - warning("argument 'perl = TRUE' will be ignored") + warning("Argument 'perl = TRUE' will be ignored", call. = FALSE) } FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits)) } From a567ce1ad22e2b74d34cd5b89c21f0cb8f2ec343 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 18:30:29 -0400 Subject: [PATCH 34/37] Tweak get_stringr_pattern_options() docs --- r/R/dplyr.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index d53b5ee57c9..4e66c227bea 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -612,12 +612,14 @@ format_string_replacement <- function(replacement, ignore.case, fixed) { } #' Get `stringr` pattern options -#' -#' This function assigns definitions for the stringr pattern modifier functions -#' (fixed, regex, etc.) in itself, and uses them to evaluate the quoted -#' expression `pattern` -#' -#' @param pattern Unevaluated pattern to be evaluated +#' +#' This function assigns definitions for the `stringr` pattern modifier +#' functions (`fixed()`, `regex()`, etc.) inside itself, and uses them to +#' evaluate the quoted expression `pattern`, returning a list that is used +#' to control pattern matching behavior in internal `arrow` functions. +#' +#' @param pattern Unevaluated expression containing a call to a `stringr` +#' pattern modifier function #' #' @return List containing elements `pattern`, `fixed`, and `ignore_case` #' @keywords internal From 0a30a69ab43ef2d21316e0601e5fa53580a6061a Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 18:31:13 -0400 Subject: [PATCH 35/37] Build new internal functions docs --- r/man/contains_regex.Rd | 18 ++++++++++++++++++ r/man/get_stringr_pattern_options.Rd | 22 ++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 r/man/contains_regex.Rd create mode 100644 r/man/get_stringr_pattern_options.Rd diff --git a/r/man/contains_regex.Rd b/r/man/contains_regex.Rd new file mode 100644 index 00000000000..d8fee96d99b --- /dev/null +++ b/r/man/contains_regex.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr.R +\name{contains_regex} +\alias{contains_regex} +\title{Does this string contain regex metacharacters?} +\usage{ +contains_regex(string) +} +\arguments{ +\item{string}{String to be tested} +} +\value{ +Logical: does \code{string} contain regex metacharacters? +} +\description{ +Does this string contain regex metacharacters? +} +\keyword{internal} diff --git a/r/man/get_stringr_pattern_options.Rd b/r/man/get_stringr_pattern_options.Rd new file mode 100644 index 00000000000..79a9a72b7cf --- /dev/null +++ b/r/man/get_stringr_pattern_options.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr.R +\name{get_stringr_pattern_options} +\alias{get_stringr_pattern_options} +\title{Get \code{stringr} pattern options} +\usage{ +get_stringr_pattern_options(pattern) +} +\arguments{ +\item{pattern}{Unevaluated expression containing a call to a \code{stringr} +pattern modifier function} +} +\value{ +List containing elements \code{pattern}, \code{fixed}, and \code{ignore_case} +} +\description{ +This function assigns definitions for the \code{stringr} pattern modifier +functions (\code{fixed()}, \code{regex()}, etc.) inside itself, and uses them to +evaluate the quoted expression \code{pattern}, returning a list that is used +to control pattern matching behavior in internal \code{arrow} functions. +} +\keyword{internal} From 25ec2f8a5a1af060010656630c1f13bbdf9f1e90 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 18:32:18 -0400 Subject: [PATCH 36/37] Add unrelated note about 0-based indices in call_function() docs --- r/R/compute.R | 2 ++ r/man/call_function.Rd | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/r/R/compute.R b/r/R/compute.R index 0641bf1615c..c3783ba3295 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -27,6 +27,8 @@ #' `RecordBatch`, or `Table`. #' @param args list arguments as an alternative to specifying in `...` #' @param options named list of C++ function options. +#' @details When passing indices in `...`, `args`, or `options`, express them as +#' 0-based integers (consistent with C++). #' @return An `Array`, `ChunkedArray`, `Scalar`, `RecordBatch`, or `Table`, whatever the compute function results in. #' @seealso [Arrow C++ documentation](https://arrow.apache.org/docs/cpp/compute.html) for the functions and their respective options. #' @examples diff --git a/r/man/call_function.Rd b/r/man/call_function.Rd index 4ab9fd7e942..e89fd00576e 100644 --- a/r/man/call_function.Rd +++ b/r/man/call_function.Rd @@ -31,6 +31,10 @@ Many Arrow compute functions are mapped to R methods, and in a \code{dplyr} evaluation context, \link[=list_compute_functions]{all Arrow functions} are callable with an \code{arrow_} prefix. } +\details{ +When passing indices in \code{...}, \code{args}, or \code{options}, express them as +0-based integers (consistent with C++). +} \examples{ \donttest{ a <- Array$create(c(1L, 2L, 3L, NA, 5L)) From 15042000daa742af911ea915a3903317b488ca2f Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 6 May 2021 22:49:20 -0400 Subject: [PATCH 37/37] Test arrow_ascii_split_whitespace, arrow_utf8_split_whitespace --- .../testthat/test-dplyr-string-functions.R | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 844fa97d4af..64351a83ea7 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -297,6 +297,33 @@ test_that("strsplit and str_split", { }) +test_that("arrow_*_split_whitespace functions", { + + # use only ASCII whitespace characters + df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux")) + + # use only non-ASCII whitespace characters + df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux")) + + df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux"))) + + expect_equivalent( + df_ascii %>% + Table$create() %>% + mutate(x = arrow_ascii_split_whitespace(x)) %>% + collect(), + df_split + ) + expect_equivalent( + df_utf8 %>% + Table$create() %>% + mutate(x = arrow_utf8_split_whitespace(x)) %>% + collect(), + df_split + ) + +}) + test_that("errors and warnings in string splitting", { df <- tibble(x = c("Foo and bar", "baz and qux and quux"))