From ccbd3c8024af4baf90cdb7ebf889bf37e04f70f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 13:42:17 +0100 Subject: [PATCH 1/3] added test for str_to_sentence & binding to utf8_capitalize --- r/R/dplyr-functions.R | 5 +++++ r/tests/testthat/test-dplyr-funcs-string.R | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index dbb9d5f46f6..b314f97a450 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -367,6 +367,11 @@ nse_funcs$str_to_title <- function(string, locale = "en") { Expression$create("utf8_title", string) } +nse_funcs$str_to_sentence <- function(string, locale = "en") { + stop_if_locale_provided(locale) + Expression$create("utf8_capitalize", string) +} + nse_funcs$str_trim <- function(string, side = c("both", "left", "right")) { side <- match.arg(side) trim_fun <- switch(side, diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index dd59b5ac55d..b0ae394307b 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -1336,3 +1336,14 @@ test_that("str_starts, str_ends, startsWith, endsWith", { df ) }) + +test_that("str_to_sentence", { + df <- tibble(x = c("first word", "the second word", "the third word")) + + expect_dplyr_equal( + input %>% + mutate(sentence_case = str_to_sentence(x)) %>% + collect(), + df + ) +}) From e00f21286ecfddef1cadc04abd3b6f3dd3e2b15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 14:46:00 +0100 Subject: [PATCH 2/3] added more tests and comments (in expression) --- r/R/dplyr-functions.R | 1 + r/R/expression.R | 1 + r/tests/testthat/test-dplyr-funcs-string.R | 14 ++++++++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index b314f97a450..1a2bf4d9f65 100644 --- a/r/R/dplyr-functions.R +++ 
b/r/R/dplyr-functions.R @@ -341,6 +341,7 @@ arrow_string_join_function <- function(null_handling, null_replacement = NULL) { # str_to_lower # str_to_upper # str_to_title +# str_to_sentence # # Arrow locale will be supported with ARROW-14126 stop_if_locale_provided <- function(locale) { diff --git a/r/R/expression.R b/r/R/expression.R index b1b6635f538..f3110f40ef0 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -53,6 +53,7 @@ # str_to_lower is defined in dplyr-functions.R # str_to_title is defined in dplyr-functions.R # str_to_upper is defined in dplyr-functions.R + # str_to_sentence is defined in dplyr-functions.R # str_trim is defined in dplyr-functions.R "stri_reverse" = "utf8_reverse", # substr is defined in dplyr-functions.R diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index b0ae394307b..640fb9fa011 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -1338,11 +1338,21 @@ test_that("str_starts, str_ends, startsWith, endsWith", { }) test_that("str_to_sentence", { - df <- tibble(x = c("first word", "the second word", "the third word")) + df <- tibble( + one_sent = c("first word", "the second word", "the third word"), + two_sent = c("first sent. 
second sent", "second word", "third word") + ) + + expect_dplyr_equal( + input %>% + mutate(sentence_case = str_to_sentence(one_sent)) %>% + collect(), + df + ) expect_dplyr_equal( input %>% - mutate(sentence_case = str_to_sentence(x)) %>% + mutate(sentence_case_two = str_to_sentence(two_sent)) %>% collect(), df ) From 836fdf897a05bb9870b7360027653567348989a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Oct 2021 14:58:35 +0100 Subject: [PATCH 3/3] added a failing test to capture the weird stringr behaviour --- r/tests/testthat/test-dplyr-funcs-string.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index 640fb9fa011..f6fc5f313c4 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -1340,7 +1340,8 @@ test_that("str_starts, str_ends, startsWith, endsWith", { test_that("str_to_sentence", { df <- tibble( one_sent = c("first word", "the second word", "the third word"), - two_sent = c("first sent. second sent", "second word", "third word") + two_sent = c("first word. second word? third word! fourth word", + "second word", "third word") ) expect_dplyr_equal( @@ -1350,7 +1351,9 @@ test_that("str_to_sentence", { df ) - expect_dplyr_equal( + # stringr::str_to_sentence() behaves unexpectedly on multi-sentence input: + # it does not recognise `.` as a sentence end + expect_dplyr_error( input %>% mutate(sentence_case_two = str_to_sentence(two_sent)) %>% collect(),