diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index dbb9d5f46f6..717cdae9662 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -645,6 +645,27 @@ nse_funcs$str_ends <- function(string, pattern, negate = FALSE) {
   out
 }
 
+# Binding for str_count(): builds an Arrow expression that counts occurrences
+# of `pattern` in `string`.
+#
+# `pattern` is captured unevaluated with enexpr() and passed to
+# get_stringr_pattern_options(); the `pattern`, `fixed` and `ignore_case`
+# fields of its result select the kernel and its options. `pattern` itself
+# must be a single string.
+nse_funcs$str_count <- function(string, pattern) {
+  opts <- get_stringr_pattern_options(enexpr(pattern))
+  if (!is.string(pattern)) {
+    arrow_not_supported("`pattern` must be a length 1 character vector; other values")
+  }
+  # literal (fixed) patterns and regexes are handled by different kernels
+  arrow_fun <- if (opts$fixed) "count_substring" else "count_substring_regex"
+  Expression$create(
+    arrow_fun,
+    string,
+    options = list(pattern = opts$pattern, ignore_case = opts$ignore_case)
+  )
+}
+
 # String function helpers
 
 # format `pattern` as needed for case insensitivity and literal matching by RE2
diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R
index dd59b5ac55d..333735be4f0 100644
--- a/r/tests/testthat/test-dplyr-funcs-string.R
+++ b/r/tests/testthat/test-dplyr-funcs-string.R
@@ -1336,3 +1336,68 @@ test_that("str_starts, str_ends, startsWith, endsWith", {
     df
   )
 })
+
+test_that("str_count", {
+  df <- tibble(
+    cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"),
+    dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")
+  )
+
+  # plain string patterns
+  expect_dplyr_equal(
+    input %>%
+      mutate(a_count = str_count(cities, pattern = "a")) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(p_count = str_count(cities, pattern = "d")) %>%
+      collect(),
+    df
+  )
+
+  # regex() modifier with case-insensitive matching
+  expect_dplyr_equal(
+    input %>%
+      mutate(p_count = str_count(cities,
+        pattern = regex("d", ignore_case = TRUE)
+      )) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(e_count = str_count(cities, pattern = "u")) %>%
+      collect(),
+    df
+  )
+
+  # nse_funcs$str_count() is not vectorised over pattern
+  # (expect_dplyr_equal() is told to expect a warning here)
+  expect_dplyr_equal(
+    input %>%
+      mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>%
+      collect(),
+    df,
+    warning = TRUE
+  )
+
+  # an unescaped "." is a regular expression that matches any character
+  expect_dplyr_equal(
+    input %>%
+      mutate(dots_count = str_count(dots, ".")) %>%
+      collect(),
+    df
+  )
+
+  # fixed(".") counts literal dots only
+  expect_dplyr_equal(
+    input %>%
+      mutate(dots_count = str_count(dots, fixed("."))) %>%
+      collect(),
+    df
+  )
+})