From 216b1a4384c59ff5dc43a31511966cd4bb8b808c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 1 Jul 2021 11:05:27 +0100 Subject: [PATCH 01/10] Add unit tests for str_pad --- .../testthat/test-dplyr-string-functions.R | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index ecbe2f00f2d..37c1bd4ffa0 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -866,3 +866,44 @@ test_that("str_like", { df ) }) + +test_that("str_pad", { + + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) + + expect_dplyr_equal( + input %>% + mutate(x = str_pad(x, width = 30)) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(x = str_pad(x, width = 30, side = "right")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(x = str_pad(x, width = 30, side = "both")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(x = str_pad(x, width = 30, side = "both", pad = "+")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(x = str_pad(x, width = 10, side = "both", pad = "+")) %>% + collect(), + df + ) + +}) From 308461329a0c0bb98d9b9f67cfbb787ccf876835 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 1 Jul 2021 11:18:39 +0100 Subject: [PATCH 02/10] fix merge conflict --- r/R/dplyr-functions.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 27d6e889199..600a2f8d991 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -414,6 +414,25 @@ nse_funcs$pmax <- function(..., na.rm = FALSE) { ) } +nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") { + + side <- match.arg(side) + + if (side == "left") { + pad_func = "utf8_lpad_doc" + } else if (side == "right") { + pad_func = "utf8_rpad_doc" + } else if (side == "both") { + pad_func = "utf8_center_doc" + } + + Expression$create( + pad_func, + string, + options = list(width = width, padding = pad) + ) +} + # String function helpers # format `pattern` as needed for case insensitivity and literal matching by RE2 From 6ee7ef2566a88131fafa83fe9a28fd8a1cf5dbca Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 1 Jul 2021 11:19:33 +0100 Subject: [PATCH 03/10] Expose pad options --- r/src/compute.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 458e0e386e9..a74cce177ba 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -285,6 +285,14 @@ std::shared_ptr make_compute_options( max_splits, reverse); } + if (func_name == "utf8_lpad" || func_name == "utf8_rpad" || + func_name == "utf8_center" || func_name == "ascii_lpad" || + func_name == "ascii_rpad" || func_name == "ascii_center") { + using Options = arrow::compute::PadOptions; + return std::make_shared(cpp11::as_cpp(options["width"]), + cpp11::as_cpp(options["padding"])); + } + if (func_name == "utf8_split_whitespace" || func_name == "ascii_split_whitespace") { using Options = arrow::compute::SplitOptions; int64_t max_splits = -1; From 8dd8c9e977fa6e4e724a68cda16ed103e7e45fe8 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 1 Jul 2021 11:19:47 +0100 Subject: [PATCH 04/10] Mention where defined --- r/R/expression.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/expression.R b/r/R/expression.R index de140832374..9b4b79e458a 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -30,6 +30,7 @@ "str_length" = "utf8_length", "str_to_lower" = "utf8_lower", "str_to_upper" = "utf8_upper", + # str_pad is defined in dplyr-functions.R "str_reverse" = "utf8_reverse", # str_trim is defined in dplyr-functions.R "year" = "year", From 1e2e79dc456d02de95ae49704785aa8d9ba2f431 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 1 Jul 2021 12:04:54 +0100 Subject: [PATCH 05/10] Update options param to int type --- r/src/compute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index a74cce177ba..8d0f0793549 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -289,7 +289,7 @@ std::shared_ptr make_compute_options( func_name == "utf8_center" || func_name == "ascii_lpad" || func_name == "ascii_rpad" || func_name == "ascii_center") { using Options = arrow::compute::PadOptions; - return std::make_shared(cpp11::as_cpp(options["width"]), + return std::make_shared(cpp11::as_cpp(options["width"]), cpp11::as_cpp(options["padding"])); } From 6e33c28f4541aa9f4e28ada083ecbe1ff41ba5c7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 1 Jul 2021 12:05:07 +0100 Subject: [PATCH 06/10] Comment with open JIRA ticket --- r/tests/testthat/test-dplyr-string-functions.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 37c1bd4ffa0..5e9851014e5 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -887,21 +887,22 @@ test_that("str_pad", { expect_dplyr_equal( input %>% - mutate(x = str_pad(x, width = 30, side = "both")) %>% + mutate(x = str_pad(x, width = 30, side = "left", pad = "+")) %>% collect(), df ) expect_dplyr_equal( input %>% - mutate(x = str_pad(x, width = 30, side = "both", pad = "+")) %>% + mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>% collect(), df ) + skip("Extra space added to opposite side in C++ - ARROW-13234") expect_dplyr_equal( input %>% - mutate(x = str_pad(x, width = 10, side = "both", pad = "+")) %>% + mutate(x = str_pad(x, width = 30, side = "both")) %>% collect(), df ) From 3b2bcaae46689d29bf2a3fe09c33dc54d3bca6c3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 1 Jul 2021 12:05:18 +0100 Subject: [PATCH 07/10] Use correct compute function --- r/R/dplyr-functions.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 600a2f8d991..58f327c145e 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -419,11 +419,11 @@ nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), side <- match.arg(side) if (side == "left") { - pad_func = "utf8_lpad_doc" + pad_func = "utf8_lpad" } else if (side == "right") { - pad_func = "utf8_rpad_doc" + pad_func = "utf8_rpad" } else if (side == "both") { - pad_func = "utf8_center_doc" + pad_func = "utf8_center" } Expression$create( From 59f5f3cd32c5006b25cf292516c96ee87def12da Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 1 Jul 2021 16:39:15 +0100 Subject: [PATCH 08/10] Don't skip test as extra space added on right now --- r/tests/testthat/test-dplyr-string-functions.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 5e9851014e5..132d386d5b0 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -899,7 +899,6 @@ test_that("str_pad", { df ) - skip("Extra space added to opposite side in C++ - ARROW-13234") expect_dplyr_equal( input %>% mutate(x = str_pad(x, width = 30, side = "both")) %>% From 7bbca0c495da4c801b5377f09c6d8ad959db80b6 Mon Sep 17 00:00:00 2001 From: Nic Date: Thu, 8 Jul 2021 14:08:13 +0100 Subject: [PATCH 09/10] Update r/R/dplyr-functions.R Co-authored-by: Ian Cook --- r/R/dplyr-functions.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 58f327c145e..92181296b3f 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -416,7 +416,9 @@ nse_funcs$pmax <- function(..., na.rm = FALSE) { nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") { + assert_that(is_integerish(width)) side <- match.arg(side) + assert_that(is.string(pad)) if (side == "left") { pad_func = "utf8_lpad" From 048d90077df8873dc2d009915be593446ca6d184 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 8 Jul 2021 14:25:36 +0100 Subject: [PATCH 10/10] Add odd numbered padding --- r/tests/testthat/test-dplyr-string-functions.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 132d386d5b0..438f1038e57 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -873,7 +873,7 @@ test_that("str_pad", { expect_dplyr_equal( input %>% - mutate(x = str_pad(x, width = 30)) %>% + mutate(x = str_pad(x, width = 31)) %>% collect(), df ) @@ -887,7 +887,7 @@ test_that("str_pad", { expect_dplyr_equal( input %>% - mutate(x = str_pad(x, width = 30, side = "left", pad = "+")) %>% + mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>% collect(), df ) @@ -901,7 +901,7 @@ test_that("str_pad", { expect_dplyr_equal( input %>% - mutate(x = str_pad(x, width = 30, side = "both")) %>% + mutate(x = str_pad(x, width = 31, side = "both")) %>% collect(), df )