From 00dfaf08d0f45a944b41853905de3aefc50020a3 Mon Sep 17 00:00:00 2001
From: Nic Crane
Date: Thu, 24 Jun 2021 06:46:29 +0100
Subject: [PATCH 1/5] Add tests

---
 .../testthat/test-dplyr-string-functions.R    | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 4afb88e5732..87c5c62c6bd 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -724,3 +724,42 @@ test_that("errors in strptime", {
     'Time zone argument not supported by Arrow'
   )
 })
+
+test_that("arrow_utf8_reverse and arrow_ascii_reverse functions", {
+
+  df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
+
+  df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux"))
+
+  expect_equivalent(
+    df_ascii %>%
+      Table$create() %>%
+      mutate(x = arrow_utf8_reverse(x)) %>%
+      collect(),
+    tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
+  )
+
+  expect_equivalent(
+    df_utf8 %>%
+      Table$create() %>%
+      mutate(x = arrow_utf8_reverse(x)) %>%
+      collect(),
+    tibble(x = c("rab\u00A0dn\u0061\u00A0ooF", "xuuq\u00A0dna\u3000xuq\u00A0dna\u00A0za\u0062"))
+  )
+
+  expect_equivalent(
+    df_ascii %>%
+      Table$create() %>%
+      mutate(x = arrow_ascii_reverse(x)) %>%
+      collect(),
+    tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
+  )
+
+  expect_error(
+    df_utf8 %>%
+      Table$create() %>%
+      mutate(x = arrow_ascii_reverse(x)) %>%
+      collect(),
+    "Invalid: Non-ASCII sequence in input"
+  )
+})

From bcc72953eac8b6048c73b64487369f5f6b2c1962 Mon Sep 17 00:00:00 2001
From: Nic Crane
Date: Thu, 24 Jun 2021 11:06:33 +0100
Subject: [PATCH 2/5] Add string reverse functions and stringi to suggests

---
Review note: the binding key is `stri_reverse`, the stringi function that
the tests in this patch call; stringr exports no `str_reverse`.

 r/DESCRIPTION                                 |  1 +
 r/R/expression.R                              |  3 ++-
 .../testthat/test-dplyr-string-functions.R    | 21 +++++++++----------
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index a6536015530..3ad9472a209 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -48,6 +48,7 @@ Suggests:
     pkgload,
     reticulate,
     rmarkdown,
+    stringi,
     stringr,
     testthat,
     tibble,
diff --git a/r/R/expression.R b/r/R/expression.R
index ba542339ff8..9533f75fe76 100644
--- a/r/R/expression.R
+++ b/r/R/expression.R
@@ -29,7 +29,8 @@
   # stringr spellings of those
   "str_length" = "utf8_length",
   "str_to_lower" = "utf8_lower",
-  "str_to_upper" = "utf8_upper"
+  "str_to_upper" = "utf8_upper",
+  "stri_reverse" = "utf8_reverse"
   # str_trim is defined in dplyr.R
 )
 
diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 87c5c62c6bd..0f6031a13e2 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -20,6 +20,7 @@ skip_if_not_available("utf8proc")
 
 library(dplyr)
 library(stringr)
+library(stringi)
 
 test_that("paste, paste0, and str_c", {
   df <- tibble(
@@ -725,26 +726,24 @@ test_that("errors in strptime", {
   )
 })
 
-test_that("arrow_utf8_reverse and arrow_ascii_reverse functions", {
+test_that("stri_reverse and arrow_ascii_reverse functions", {
 
   df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
 
   df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux"))
 
-  expect_equivalent(
-    df_ascii %>%
-      Table$create() %>%
-      mutate(x = arrow_utf8_reverse(x)) %>%
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = stri_reverse(x)) %>%
       collect(),
-    tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
+    df_utf8
   )
 
-  expect_equivalent(
-    df_utf8 %>%
-      Table$create() %>%
-      mutate(x = arrow_utf8_reverse(x)) %>%
+  expect_dplyr_equal(
+    input %>%
+      mutate(x = stri_reverse(x)) %>%
       collect(),
-    tibble(x = c("rab\u00A0dn\u0061\u00A0ooF", "xuuq\u00A0dna\u3000xuq\u00A0dna\u00A0za\u0062"))
+    df_ascii
   )
 
   expect_equivalent(

From 9caba5d909ad1923d2ce6f7e493f97aeea8942f1 Mon Sep 17 00:00:00 2001
From: Ian Cook
Date: Thu, 24 Jun 2021 12:35:23 -0400
Subject: [PATCH 3/5] Add missing comma

---
 r/R/expression.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/expression.R b/r/R/expression.R
index dcd7b162723..bfbaa2f0ab1 100644
--- a/r/R/expression.R
+++ b/r/R/expression.R
@@ -30,7 +30,7 @@
   "str_length" = "utf8_length",
   "str_to_lower" = "utf8_lower",
   "str_to_upper" = "utf8_upper",
-  "stri_reverse" = "utf8_reverse"
+  "stri_reverse" = "utf8_reverse",
   # str_trim is defined in dplyr-functions.R
   "year" = "year",
   "isoyear" = "iso_year",

From a7a9f9b15b39f17ac0fb88c14ca4cdad2b646eb4 Mon Sep 17 00:00:00 2001
From: Ian Cook
Date: Thu, 24 Jun 2021 13:58:41 -0400
Subject: [PATCH 4/5] Remove errant comma

---
 r/tests/testthat/test-dplyr-string-functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 76a50d716b1..377ba47fe27 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -819,6 +819,6 @@ test_that("str_like", {
     input %>%
       mutate(x = str_like(x, "%baz%")) %>%
       collect(),
-    df,
+    df
   )
 })

From 83e402710f39b8d3fbb3024398755d632250d98f Mon Sep 17 00:00:00 2001
From: Ian Cook
Date: Thu, 24 Jun 2021 15:02:56 -0400
Subject: [PATCH 5/5] Fix missing brackets in tests

---
 r/tests/testthat/test-dplyr-string-functions.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 377ba47fe27..4cb07c9e39d 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -713,6 +713,7 @@ test_that("strptime", {
     tstamp,
     check.tzone = FALSE
   )
+})
 
 test_that("errors in strptime", {
   # Error when tz is passed