Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions r/R/dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -543,16 +543,28 @@ arrow_stringr_string_replace_function <- function(FUN, max_replacements) {

arrow_r_string_split_function <- function(FUN, reverse = FALSE, max_splits = -1) {
function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) {

assert_that(is.string(split))

# if !fixed but no regex metachars in split pattern, allow to proceed as split isn't regex

# The Arrow C++ library does not support splitting a string by a regular
# expression pattern (ARROW-12608) but the default behavior of
# base::strsplit() is to interpret the split pattern as a regex
# (fixed = FALSE). R users commonly pass non-regex split patterns to
# strsplit() without bothering to set fixed = TRUE. It would be annoying if
# that didn't work here. So: if fixed = FALSE, let's check the split pattern
# to see if it is a regex (if it contains any regex metacharacters). If not,
# then allow to proceed.
if (!fixed && contains_regex(split)) {
stop("Regular expression matching not supported in strsplit for Arrow", call. = FALSE)
}
# warn when the user specifies both fixed = TRUE and perl = TRUE, for
# consistency with the behavior of base::strsplit()
if (fixed && perl) {
warning("Argument 'perl = TRUE' will be ignored", call. = FALSE)
}
# since split is not a regex, proceed without any warnings or errors
# regardless of the value of perl, for consistency with the behavior of
# base::strsplit()
FUN("split_pattern", x, options = list(pattern = split, reverse = reverse, max_splits = max_splits))
}
}
Expand All @@ -575,6 +587,10 @@ arrow_stringr_string_split_function <- function(FUN, reverse = FALSE) {
if (simplify) {
warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE)
}
# The max_splits option in the Arrow C++ library controls the maximum number
# of places at which the string is split, whereas the argument n to
# str_split() controls the maximum number of pieces to return. So we must
# subtract 1 from n to get max_splits.
FUN("split_pattern", string, options = list(pattern = opts$pattern, reverse = reverse, max_splits = n - 1L))
}
}
Expand Down Expand Up @@ -1148,7 +1164,7 @@ not_implemented_for_dataset <- function(method) {
}

#' Does this string contain regex metacharacters?
#'
#'
#' @param string String to be tested
#' @keywords internal
#' @return Logical: does `string` contain regex metacharacters?
Expand Down
27 changes: 24 additions & 3 deletions r/tests/testthat/test-dplyr-string-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -212,21 +212,21 @@ test_that("str_replace and str_replace_all", {
collect(),
df
)

expect_dplyr_equal(
input %>%
transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
collect(),
df
)

expect_dplyr_equal(
input %>%
mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
collect(),
df
)

expect_dplyr_equal(
input %>%
transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>%
Expand Down Expand Up @@ -307,6 +307,7 @@ test_that("arrow_*_split_whitespace functions", {

df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux")))

# use default option values
expect_equivalent(
df_ascii %>%
Table$create() %>%
Expand All @@ -322,6 +323,26 @@ test_that("arrow_*_split_whitespace functions", {
df_split
)

# specify non-default option values
expect_equivalent(
df_ascii %>%
Table$create() %>%
mutate(
x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
) %>%
collect(),
tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux")))
)
expect_equivalent(
df_utf8 %>%
Table$create() %>%
mutate(
x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
) %>%
collect(),
tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux")))
)

})

test_that("errors and warnings in string splitting", {
Expand Down