diff --git a/r/R/compute.R b/r/R/compute.R index 0641bf1615c..c3783ba3295 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -27,6 +27,8 @@ #' `RecordBatch`, or `Table`. #' @param args list arguments as an alternative to specifying in `...` #' @param options named list of C++ function options. +#' @details When passing indices in `...`, `args`, or `options`, express them as +#' 0-based integers (consistent with C++). #' @return An `Array`, `ChunkedArray`, `Scalar`, `RecordBatch`, or `Table`, whatever the compute function results in. #' @seealso [Arrow C++ documentation](https://arrow.apache.org/docs/cpp/compute.html) for the functions and their respective options. #' @examples diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 21266d39b26..4e66c227bea 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -476,6 +476,8 @@ build_function_list <- function(FUN) { gsub = arrow_r_string_replace_function(FUN, -1L), str_replace = arrow_stringr_string_replace_function(FUN, 1L), str_replace_all = arrow_stringr_string_replace_function(FUN, -1L), + strsplit = arrow_r_string_split_function(FUN), + str_split = arrow_stringr_string_split_function(FUN), between = function(x, left, right) { x >= left & x <= right }, @@ -539,6 +541,44 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) { } } +arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) { + function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) { + + assert_that(is.string(split)) + + # if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex + if (!fixed && contains_regex(split)) { + stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE) + } + if (fixed && perl) { + warning("Argument 'perl = TRUE' will be ignored", call. = FALSE) + } + FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits)) + } +} + +arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) { + function(string, pattern, n = Inf, simplify = FALSE) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + if (!opts$fixed && contains_regex(opts$pattern)) { + stop("Regular expression matching not supported in str_split() for Arrow", call. = FALSE) + } + if (opts$ignore_case) { + stop("Case-insensitive string splitting not supported in Arrow", call. = FALSE) + } + if (n == 0) { + stop("Splitting strings into zero parts not supported in Arrow" , call. = FALSE) + } + if (identical(n, Inf)) { + n <- 0L + } + if (simplify) { + warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE) + } + FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1L)) + } +} + # format `pattern` as needed for case insensitivity and literal matching by RE2 format_string_pattern <- function(pattern, ignore.case, fixed) { # Arrow lacks native support for case-insensitive literal string matching and @@ -571,9 +611,18 @@ format_string_replacement <- function(replacement, ignore.case, fixed) { replacement } -# this function assigns definitions for the stringr pattern modifier functions -# (fixed, regex, etc.) in itself, and uses them to evaluate the quoted -# expression `pattern` +#' Get `stringr` pattern options +#' +#' This function assigns definitions for the `stringr` pattern modifier +#' functions (`fixed()`, `regex()`, etc.) inside itself, and uses them to +#' evaluate the quoted expression `pattern`, returning a list that is used +#' to control pattern matching behavior in internal `arrow` functions. +#' +#' @param pattern Unevaluated expression containing a call to a `stringr` +#' pattern modifier function +#' +#' @return List containing elements `pattern`, `fixed`, and `ignore_case` +#' @keywords internal get_stringr_pattern_options <- function(pattern) { fixed <- function(pattern, ignore_case = FALSE, ...) { check_dots(...) @@ -605,7 +654,7 @@ get_stringr_pattern_options <- function(pattern) { } ensure_opts <- function(opts) { if (is.character(opts)) { - opts <- list(pattern = opts, fixed = TRUE, ignore_case = FALSE) + opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE) } opts } @@ -1097,3 +1146,12 @@ not_implemented_for_dataset <- function(method) { call. = FALSE ) } + +#' Does this string contain regex metacharacters? +#' +#' @param string String to be tested +#' @keywords internal +#' @return Logical: does `string` contain regex metacharacters? +contains_regex <- function(string) { + grepl("[.\\|()[{^$*+?]", string) +} diff --git a/r/man/call_function.Rd b/r/man/call_function.Rd index 4ab9fd7e942..e89fd00576e 100644 --- a/r/man/call_function.Rd +++ b/r/man/call_function.Rd @@ -31,6 +31,10 @@ Many Arrow compute functions are mapped to R methods, and in a \code{dplyr} evaluation context, \link[=list_compute_functions]{all Arrow functions} are callable with an \code{arrow_} prefix. } +\details{ +When passing indices in \code{...}, \code{args}, or \code{options}, express them as +0-based integers (consistent with C++). +} \examples{ \donttest{ a <- Array$create(c(1L, 2L, 3L, NA, 5L)) diff --git a/r/man/contains_regex.Rd b/r/man/contains_regex.Rd new file mode 100644 index 00000000000..d8fee96d99b --- /dev/null +++ b/r/man/contains_regex.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr.R +\name{contains_regex} +\alias{contains_regex} +\title{Does this string contain regex metacharacters?} +\usage{ +contains_regex(string) +} +\arguments{ +\item{string}{String to be tested} +} +\value{ +Logical: does \code{string} contain regex metacharacters? +} +\description{ +Does this string contain regex metacharacters? +} +\keyword{internal} diff --git a/r/man/get_stringr_pattern_options.Rd b/r/man/get_stringr_pattern_options.Rd new file mode 100644 index 00000000000..79a9a72b7cf --- /dev/null +++ b/r/man/get_stringr_pattern_options.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr.R +\name{get_stringr_pattern_options} +\alias{get_stringr_pattern_options} +\title{Get \code{stringr} pattern options} +\usage{ +get_stringr_pattern_options(pattern) +} +\arguments{ +\item{pattern}{Unevaluated expression containing a call to a \code{stringr} +pattern modifier function} +} +\value{ +List containing elements \code{pattern}, \code{fixed}, and \code{ignore_case} +} +\description{ +This function assigns definitions for the \code{stringr} pattern modifier +functions (\code{fixed()}, \code{regex()}, etc.) inside itself, and uses them to +evaluate the quoted expression \code{pattern}, returning a list that is used +to control pattern matching behavior in internal \code{arrow} functions. +} +\keyword{internal} diff --git a/r/src/compute.cpp b/r/src/compute.cpp index c215d661e3a..0ffe53578c4 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -233,6 +233,33 @@ std::shared_ptr make_compute_options( max_replacements); } + if (func_name == "split_pattern") { + using Options = arrow::compute::SplitPatternOptions; + int64_t max_splits = -1; + if (!Rf_isNull(options["max_splits"])) { + max_splits = cpp11::as_cpp(options["max_splits"]); + } + bool reverse = false; + if (!Rf_isNull(options["reverse"])) { + reverse = cpp11::as_cpp(options["reverse"]); + } + return std::make_shared(cpp11::as_cpp(options["pattern"]), + max_splits, reverse); + } + + if (func_name == "utf8_split_whitespace" || func_name == "ascii_split_whitespace") { + using Options = arrow::compute::SplitOptions; + int64_t max_splits = -1; + if (!Rf_isNull(options["max_splits"])) { + max_splits = cpp11::as_cpp(options["max_splits"]); + } + bool reverse = false; + if (!Rf_isNull(options["reverse"])) { + reverse = cpp11::as_cpp(options["reverse"]); + } + return std::make_shared(max_splits, reverse); + } + if (func_name == "variance" || func_name == "stddev") { using Options = arrow::compute::VarianceOptions; return std::make_shared(cpp11::as_cpp(options["ddof"])); diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 5faf2436f55..64351a83ea7 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -52,7 +52,7 @@ skip_if_not_available("re2") test_that("grepl", { df <- tibble(x = c("Foo", "bar")) - for(fixed in c(TRUE, FALSE)) { + for (fixed in c(TRUE, FALSE)) { expect_dplyr_equal( input %>% @@ -150,7 +150,7 @@ test_that("str_detect", { test_that("sub and gsub", { df <- tibble(x = c("Foo", "bar")) - for(fixed in c(TRUE, FALSE)) { + for (fixed in c(TRUE, FALSE)) { expect_dplyr_equal( input %>% @@ -206,12 +206,27 @@ test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", { test_that("str_replace and str_replace_all", { df <- tibble(x = c("Foo", "bar")) + expect_dplyr_equal( + input %>% + transmute(x = str_replace_all(x, "^F", "baz")) %>% + collect(), + df + ) + expect_dplyr_equal( input %>% transmute(x = str_replace_all(x, regex("^F"), "baz")) %>% collect(), df ) + + expect_dplyr_equal( + input %>% + mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>% + collect(), + df + ) + expect_dplyr_equal( input %>% transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>% @@ -239,7 +254,172 @@ test_that("str_replace and str_replace_all", { }) -test_that("backreferences in pattern", { +test_that("strsplit and str_split", { + + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) + + expect_dplyr_equal( + input %>% + mutate(x = strsplit(x, "and")) %>% + collect(), + df + ) + expect_dplyr_equal( + input %>% + mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% + collect(), + df + ) + expect_dplyr_equal( + input %>% + mutate(x = str_split(x, "and")) %>% + collect(), + df + ) + expect_dplyr_equal( + input %>% + mutate(x = str_split(x, "and", n = 2)) %>% + collect(), + df + ) + expect_dplyr_equal( + input %>% + mutate(x = str_split(x, fixed("and"), n = 2)) %>% + collect(), + df + ) + expect_dplyr_equal( + input %>% + mutate(x = str_split(x, regex("and"), n = 2)) %>% + collect(), + df + ) + +}) + +test_that("arrow_*_split_whitespace functions", { + + # use only ASCII whitespace characters + df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux")) + + # use only non-ASCII whitespace characters + df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux")) + + df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux"))) + + expect_equivalent( + df_ascii %>% + Table$create() %>% + mutate(x = arrow_ascii_split_whitespace(x)) %>% + collect(), + df_split + ) + expect_equivalent( + df_utf8 %>% + Table$create() %>% + mutate(x = arrow_utf8_split_whitespace(x)) %>% + collect(), + df_split + ) + +}) + +test_that("errors and warnings in string splitting", { + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) + + # These conditions generate an error, but abandon_ship() catches the error, + # issues a warning, and pulls the data into R + expect_warning( + df %>% + Table$create() %>% + mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, "and.?")) %>% + collect() + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, regex("and.?"), n = 2)) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, fixed("and", ignore_case = TRUE))) %>% + collect(), + "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, coll("and.?"))) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, boundary(type = "word"))) %>% + collect(), + regexp = "not supported" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, "and", n = 0)) %>% + collect(), + regexp = "not supported" + ) + + # This condition generates a warning + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_split(x, fixed("and"), simplify = TRUE)) %>% + collect(), + "ignored" + ) + +}) + +test_that("errors and warnings in string detection and replacement", { + df <- tibble(x = c("Foo", "bar")) + + # These conditions generate an error, but abandon_ship() catches the error, + # issues a warning, and pulls the data into R + expect_warning( + df %>% + Table$create() %>% + filter(str_detect(x, boundary(type = "character"))) %>% + collect(), + regexp = "not implemented" + ) + expect_warning( + df %>% + Table$create() %>% + mutate(x = str_replace_all(x, coll("o", locale = "en"), "ó")) %>% + collect(), + regexp = "not supported" + ) + + # This condition generates a warning + expect_warning( + df %>% + Table$create() %>% + transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")), + "Ignoring pattern modifier argument not supported in Arrow: \"multiline\"" + ) + +}) + +test_that("backreferences in pattern in string detection", { skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)") df <- tibble(x = c("Foo", "bar")) @@ -251,7 +431,7 @@ test_that("backreferences in pattern", { ) }) -test_that("backreferences (substitutions) in replacement", { +test_that("backreferences (substitutions) in string replacement", { df <- tibble(x = c("Foo", "bar")) expect_dplyr_equal( @@ -265,6 +445,12 @@ test_that("backreferences (substitutions) in replacement", { collect(), tibble(url = "https://arrow.apache.org/docs/r/") ) + expect_dplyr_equal( + input %>% + transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>% + collect(), + df + ) expect_dplyr_equal( input %>% transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>% @@ -279,7 +465,7 @@ test_that("backreferences (substitutions) in replacement", { ) }) -test_that("edge cases", { +test_that("edge cases in string detection and replacement", { # in case-insensitive fixed match/replace, test that "\\E" in the search # string and backslashes in the replacement string are interpreted literally. @@ -316,32 +502,3 @@ test_that("edge cases", { ) }) - -test_that("errors and warnings", { - df <- tibble(x = c("Foo", "bar")) - - # These conditions generate an error, but abandon_ship() catches the error, - # issues a warning, and pulls the data into R - expect_warning( - df %>% - Table$create() %>% - filter(str_detect(x, boundary(type = "character"))) %>% - collect(), - "not implemented" - ) - expect_warning( - df %>% - Table$create() %>% - mutate(x = str_replace_all(x, coll("o", locale = "en"), "ó")) %>% - collect(), - "not supported" - ) - - # This condition generates a warning - expect_warning( - df %>% - Table$create() %>% - transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")), - "Ignoring pattern modifier argument not supported in Arrow: \"multiline\"" - ) -})