From 7d961bf0c1c9d863383f5f631b971724a4446dfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Oct 2021 11:06:54 +0100 Subject: [PATCH 01/12] added tests for str_count --- r/tests/testthat/test-dplyr-funcs-string.R | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index dd59b5ac55d..a68de7f5dad 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -1336,3 +1336,36 @@ test_that("str_starts, str_ends, startsWith, endsWith", { df ) }) + +test_that("str_count", { + df <- tibble(fruit = c("apple", "banana", "pear", "pineapple")) + + expect_dplyr_equal( + input %>% + mutate(a_count = str_count(fruit, pattern = "a")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(p_count = str_count(fruit, pattern = "p")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(e_count = str_count(fruit, pattern = "e")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(let_count = str_count(fruit, pattern = c("a", "b", "p", "n"))) %>% + collect(), + df + ) + +}) From 87cbba94b55a181e02bfbce56d57f466f0bdf74b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 09:35:37 +0100 Subject: [PATCH 02/12] more tests and replacing fruits with cities --- r/tests/testthat/test-dplyr-funcs-string.R | 40 ++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index a68de7f5dad..ca712f76e5e 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -126,7 +126,7 @@ test_that("paste, paste0, and str_c", { ) # emits null in str_c() (consistent with stringr::str_c()) expect_dplyr_equal( - input %>% + df %>% transmute(str_c(x, y, sep = NA_character_)) %>% collect(), df @@ -1338,34 +1338,62 @@ test_that("str_starts, str_ends, startsWith, endsWith", { }) test_that("str_count", { - df <- tibble(fruit = c("apple", "banana", "pear", "pineapple")) + df <- tibble( + cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", + "Cluj Napoca", "Bern", "Bogota"), + dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")) expect_dplyr_equal( input %>% - mutate(a_count = str_count(fruit, pattern = "a")) %>% + mutate(a_count = str_count(cities, pattern = "a")) %>% collect(), df ) expect_dplyr_equal( input %>% - mutate(p_count = str_count(fruit, pattern = "p")) %>% + mutate(p_count = str_count(cities, pattern = "d")) %>% collect(), df ) expect_dplyr_equal( input %>% - mutate(e_count = str_count(fruit, pattern = "e")) %>% + mutate(p_count = str_count(cities, + pattern = regex("d", ignore_case = TRUE))) %>% collect(), df ) expect_dplyr_equal( input %>% - mutate(let_count = str_count(fruit, pattern = c("a", "b", "p", "n"))) %>% + mutate(e_count = str_count(cities, pattern = "u")) %>% collect(), df ) + # nse_funcs$str_count() is not vectorised over pattern + expect_dplyr_equal( + input %>% + mutate(let_count = str_count( + cities, + pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>% + collect(), + df, + warning = TRUE + ) + + expect_dplyr_equal( + input %>% + mutate(dots_count = str_count(dots, ".")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(dots_count = str_count(dots, fixed("."))) %>% + collect(), + df + ) }) From 32ecb358f7f4de018386559780decb729d220a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 09:35:51 +0100 Subject: [PATCH 03/12] added the binding for srt_count --- r/R/dplyr-functions.R | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index dbb9d5f46f6..518b658927b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -645,6 +645,27 @@ nse_funcs$str_ends <- function(string, pattern, negate = FALSE) { out } +nse_funcs$str_count <- function(string, pattern) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + if (length(pattern) > 1) { + arrow_not_supported("Pattern argument longer than 1") + } + if (opts$fixed) { + out <- Expression$create( + "count_substring", + string, + options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + ) + } else { + out <- Expression$create( + "count_substring_regex", + string, + options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + ) + } + out +} + # String function helpers # format `pattern` as needed for case insensitivity and literal matching by RE2 From e8f7801525ebb2a42361353c641e1b6702c16dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 09:39:06 +0100 Subject: [PATCH 04/12] updated NEWS --- r/NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index 065a5d792df..43230899787 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,8 @@ # arrow 5.0.0.9000 +* added bindings for `str_count()`. (ARROW-13156, @dragosmg) + There are now two ways to query Arrow data: ## 1. Expanded Arrow-native queries: aggregation and joins From c678cdfb6ae165ea39cf10245956b2eb416caa78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 09:45:54 +0100 Subject: [PATCH 05/12] undo --- r/tests/testthat/test-dplyr-funcs-string.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index ca712f76e5e..a945b883d94 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -126,7 +126,7 @@ test_that("paste, paste0, and str_c", { ) # emits null in str_c() (consistent with stringr::str_c()) expect_dplyr_equal( - df %>% + input %>% transmute(str_c(x, y, sep = NA_character_)) %>% collect(), df From 829bccbe30fc44cf020301a727579fffc64ff873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 10:45:31 +0100 Subject: [PATCH 06/12] Update r/R/dplyr-functions.R Co-authored-by: Nic --- r/R/dplyr-functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 518b658927b..320afe3aa69 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -647,8 +647,8 @@ nse_funcs$str_ends <- function(string, pattern, negate = FALSE) { nse_funcs$str_count <- function(string, pattern) { opts <- get_stringr_pattern_options(enexpr(pattern)) - if (length(pattern) > 1) { - arrow_not_supported("Pattern argument longer than 1") + if !is.string(pattern) { + arrow_not_supported("`pattern` must be a length 1 character vector; other values") } if (opts$fixed) { out <- Expression$create( From f819ee42cd984900b737ff244db4f18d8c936bb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 10:52:43 +0100 Subject: [PATCH 07/12] small changes --- r/R/dplyr-functions.R | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 320afe3aa69..84a769ba861 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -647,23 +647,26 @@ nse_funcs$str_ends <- function(string, pattern, negate = FALSE) { nse_funcs$str_count <- function(string, pattern) { opts <- get_stringr_pattern_options(enexpr(pattern)) - if !is.string(pattern) { + if (!is.string(pattern)) { arrow_not_supported("`pattern` must be a length 1 character vector; other values") } if (opts$fixed) { - out <- Expression$create( - "count_substring", - string, - options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + return( + Expression$create( + "count_substring", + string, + options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + ) ) } else { - out <- Expression$create( - "count_substring_regex", - string, - options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + return( + Expression$create( + "count_substring_regex", + string, + options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + ) ) } - out } # String function helpers From 5acad04a9457bde985a5940cd61eec19eafc37cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 14:45:02 +0100 Subject: [PATCH 08/12] Update r/R/dplyr-functions.R Co-authored-by: Ian Cook --- r/R/dplyr-functions.R | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 84a769ba861..717cdae9662 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -650,23 +650,12 @@ nse_funcs$str_count <- function(string, pattern) { if (!is.string(pattern)) { arrow_not_supported("`pattern` must be a length 1 character vector; other values") } - if (opts$fixed) { - return( - Expression$create( - "count_substring", - string, - options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) - ) - ) - } else { - return( - Expression$create( - "count_substring_regex", - string, - options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) - ) - ) - } + arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex") + Expression$create( + arrow_fun, + string, + options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + ) } # String function helpers From a4c75d2e85805a868106f00126fe22497885ffd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Oct 2021 10:13:27 +0100 Subject: [PATCH 09/12] styled the test file --- r/tests/testthat/test-dplyr-funcs-string.R | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index a945b883d94..c649c90c67d 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -791,7 +791,8 @@ test_that("strftime", { ) withr::with_timezone( - "Pacific/Marquesas", { + "Pacific/Marquesas", + { expect_dplyr_equal( input %>% mutate( @@ -1339,9 +1340,12 @@ test_that("str_starts, str_ends, startsWith, endsWith", { test_that("str_count", { df <- tibble( - cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", - "Cluj Napoca", "Bern", "Bogota"), - dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")) + cities = c( + "Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", + "Cluj Napoca", "Bern", "Bogota" + ), + dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..") + ) expect_dplyr_equal( input %>% @@ -1360,7 +1364,8 @@ test_that("str_count", { expect_dplyr_equal( input %>% mutate(p_count = str_count(cities, - pattern = regex("d", ignore_case = TRUE))) %>% + pattern = regex("d", ignore_case = TRUE) + )) %>% collect(), df ) @@ -1377,14 +1382,15 @@ test_that("str_count", { input %>% mutate(let_count = str_count( cities, - pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>% + pattern = c("a", "b", "e", "g", "p", "n", "s") + )) %>% collect(), df, warning = TRUE ) expect_dplyr_equal( - input %>% + input %>% mutate(dots_count = str_count(dots, ".")) %>% collect(), df From a408dd86f3f1023320adf36d52843d9cd4ac00d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Oct 2021 10:15:03 +0100 Subject: [PATCH 10/12] tweaked the styling a bit --- r/tests/testthat/test-dplyr-funcs-string.R | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index c649c90c67d..f7dcc02cc25 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -1340,10 +1340,7 @@ test_that("str_starts, str_ends, startsWith, endsWith", { test_that("str_count", { df <- tibble( - cities = c( - "Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", - "Cluj Napoca", "Bern", "Bogota" - ), + cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"), dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..") ) @@ -1380,10 +1377,7 @@ test_that("str_count", { # nse_funcs$str_count() is not vectorised over pattern expect_dplyr_equal( input %>% - mutate(let_count = str_count( - cities, - pattern = c("a", "b", "e", "g", "p", "n", "s") - )) %>% + mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>% collect(), df, warning = TRUE From 606a623f8db356d844eaeb2c57a9b3d9b08514f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Oct 2021 17:25:08 +0100 Subject: [PATCH 11/12] Update r/NEWS.md remove NEWS.md bullet point as we're in the immediate pre-release period Co-authored-by: Ian Cook --- r/NEWS.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 43230899787..065a5d792df 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,8 +19,6 @@ # arrow 5.0.0.9000 -* added bindings for `str_count()`. (ARROW-13156, @dragosmg) - There are now two ways to query Arrow data: ## 1. Expanded Arrow-native queries: aggregation and joins From 51060e8c0f458e96fecf75a096a0cefa378b99e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Oct 2021 17:25:44 +0100 Subject: [PATCH 12/12] Update r/tests/testthat/test-dplyr-funcs-string.R Co-authored-by: Ian Cook --- r/tests/testthat/test-dplyr-funcs-string.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index f7dcc02cc25..333735be4f0 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -791,8 +791,7 @@ test_that("strftime", { ) withr::with_timezone( - "Pacific/Marquesas", - { + "Pacific/Marquesas", { expect_dplyr_equal( input %>% mutate(