From 557f106924466ae3a0922694a2278e0b18b39f8d Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Wed, 14 Sep 2022 13:44:45 +0000 Subject: [PATCH 01/10] Implement dplyr::across() inside group_by() Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-group-by.R | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index c650799e8d0..0d2f570a657 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -24,32 +24,23 @@ group_by.arrow_dplyr_query <- function(.data, add = .add, .drop = dplyr::group_by_drop_default(.data)) { .data <- as_adq(.data) - new_groups <- enquos(...) - # ... can contain expressions (i.e. can add (or rename?) columns) and so we - # need to identify those and add them on to the query with mutate. Specifically, - # we want to mark as new: - # * expressions (named or otherwise) - # * variables that have new names - # All others (i.e. simple references to variables) should not be (re)-added + expression_list <- expand_across(.data, quos(...)) + new_groups <- ensure_named_exprs(expression_list) - # Identify any groups with names which aren't in names of .data - new_group_ind <- map_lgl(new_groups, ~ !(quo_name(.x) %in% names(.data))) - # Identify any groups which don't have names - named_group_ind <- map_lgl(names(new_groups), nzchar) - # Retain any new groups identified above - new_groups <- new_groups[new_group_ind | named_group_ind] if (length(new_groups)) { - # now either use the name that was given in ... or if that is "" then use the expr - names(new_groups) <- imap_chr(new_groups, ~ ifelse(.y == "", quo_name(.x), .y)) - # Add them to the data .data <- dplyr::mutate(.data, !!!new_groups) } - if (".add" %in% names(formals(dplyr::group_by))) { - # For compatibility with dplyr >= 1.0 - gv <- dplyr::group_by_prepare(.data, ..., .add = .add)$group_names + + if (!".add" %in% names(formals(dplyr::group_by))) { + # For compatibility with dplyr < 1.0 + .add <- add + } + + if (.add) { + gv <- dplyr::union(.data$group_by_vars, names(new_groups)) } else { - gv <- dplyr::group_by_prepare(.data, ..., add = add)$group_names + gv <- names(new_groups) } .data$group_by_vars <- gv .data$drop_empty_groups <- ifelse(length(gv), .drop, dplyr::group_by_drop_default(.data)) From e64da009183b83004016a2f4fe70a1c99a2fc90d Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Wed, 14 Sep 2022 13:53:33 +0000 Subject: [PATCH 02/10] add tests Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index c7380e96ec3..5e31c41c691 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -166,3 +166,36 @@ test_that("group_by() with namespaced functions", { tbl ) }) + +test_that("Can use across() within group_by()", { + test_groups <- c("starting_a_fight", "consoling_a_child", "petting_a_dog") + compare_dplyr_binding( + .input %>% + group_by(across()) %>% + collect(), + example_with_logical_factors + ) + compare_dplyr_binding( + .input %>% + group_by(across(starts_with("s"))) %>% + collect(), + example_with_logical_factors + ) + compare_dplyr_binding( + .input %>% + group_by(across({{test_groups}})) %>% + collect(), + example_with_logical_factors + ) + + # ARROW-12778 - `where()` is not yet supported + expect_error( + compare_dplyr_binding( + .input %>% + group_by(across(where(is.double))) %>% + collect(), + example_data + ), + "Unsupported selection helper" + ) +}) From 52617adce3dde7677daf7cd719e4c80561859760 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Wed, 14 Sep 2022 14:12:06 +0000 Subject: [PATCH 03/10] add tests for the .add option Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index 5e31c41c691..40f5aa7b124 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -167,6 +167,37 @@ test_that("group_by() with namespaced functions", { ) }) +test_that("group_by() with .add", { + compare_dplyr_binding( + .input %>% + group_by(dbl2) %>% + group_by() %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(dbl2) %>% + group_by(.add = TRUE) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(dbl2) %>% + group_by(chr) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(dbl2) %>% + group_by(chr, .add = TRUE) %>% + collect(), + tbl + ) +}) + test_that("Can use across() within group_by()", { test_groups <- c("starting_a_fight", "consoling_a_child", "petting_a_dog") compare_dplyr_binding( From 7c879441f9738ed822b911f013e7afd7a9e0830f Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Wed, 14 Sep 2022 14:21:24 +0000 Subject: [PATCH 04/10] fix for empty case Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-group-by.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index 0d2f570a657..c5a1eb7356b 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -38,11 +38,11 @@ group_by.arrow_dplyr_query <- function(.data, } if (.add) { - gv <- dplyr::union(.data$group_by_vars, names(new_groups)) + gv <- dplyr::union(dplyr::group_vars(.data), names(new_groups)) } else { gv <- names(new_groups) } - .data$group_by_vars <- gv + .data$group_by_vars <- gv %||% character() .data$drop_empty_groups <- ifelse(length(gv), .drop, dplyr::group_by_drop_default(.data)) .data } From 58cbc1ae3ab1be6182bb1c91e80dc3fc21efcc7f Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 15 Sep 2022 10:31:05 +0000 Subject: [PATCH 05/10] update add handling and use base::union Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-group-by.R | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index c5a1eb7356b..f0349235926 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -21,8 +21,18 @@ group_by.arrow_dplyr_query <- function(.data, ..., .add = FALSE, - add = .add, + add = NULL, .drop = dplyr::group_by_drop_default(.data)) { + if (!missing(add)) { + .Deprecated( + msg = paste( + "The `add` argument of `group_by()` is deprecated.", + "Please use the `.add` argument instead." + ) + ) + .add <- add + } + .data <- as_adq(.data) expression_list <- expand_across(.data, quos(...)) new_groups <- ensure_named_exprs(expression_list) @@ -32,13 +42,8 @@ group_by.arrow_dplyr_query <- function(.data, .data <- dplyr::mutate(.data, !!!new_groups) } - if (!".add" %in% names(formals(dplyr::group_by))) { - # For compatibility with dplyr < 1.0 - .add <- add - } - if (.add) { - gv <- dplyr::union(dplyr::group_vars(.data), names(new_groups)) + gv <- union(dplyr::group_vars(.data), names(new_groups)) } else { gv <- names(new_groups) } From 977bb9ef89edaac31164f08e7b299852f4a2d8e9 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 15 Sep 2022 10:54:00 +0000 Subject: [PATCH 06/10] update tests Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index 40f5aa7b124..e19d96bdda4 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -199,33 +199,33 @@ test_that("group_by() with .add", { }) test_that("Can use across() within group_by()", { - test_groups <- c("starting_a_fight", "consoling_a_child", "petting_a_dog") + test_groups <- c("dbl", "int", "chr") compare_dplyr_binding( .input %>% group_by(across()) %>% collect(), - example_with_logical_factors + tbl ) compare_dplyr_binding( .input %>% - group_by(across(starts_with("s"))) %>% + group_by(across(starts_with("d"))) %>% collect(), - example_with_logical_factors + tbl ) compare_dplyr_binding( .input %>% group_by(across({{test_groups}})) %>% collect(), - example_with_logical_factors + tbl ) # ARROW-12778 - `where()` is not yet supported expect_error( compare_dplyr_binding( .input %>% - group_by(across(where(is.double))) %>% + group_by(across(where(is.numeric))) %>% collect(), - example_data + tbl ), "Unsupported selection helper" ) From 8a8d7fbf753ac75f2cc3892492ddd98a2db991bd Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 15 Sep 2022 11:12:55 +0000 Subject: [PATCH 07/10] add more test for add Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index e19d96bdda4..f13cd6bc8de 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -171,7 +171,7 @@ test_that("group_by() with .add", { compare_dplyr_binding( .input %>% group_by(dbl2) %>% - group_by() %>% + group_by(.add = FALSE) %>% collect(), tbl ) @@ -185,7 +185,7 @@ test_that("group_by() with .add", { compare_dplyr_binding( .input %>% group_by(dbl2) %>% - group_by(chr) %>% + group_by(chr, .add = FALSE) %>% collect(), tbl ) @@ -196,6 +196,22 @@ test_that("group_by() with .add", { collect(), tbl ) + expect_warning( + tbl %>% + arrow_table() %>% + group_by(add = TRUE) %>% + collect(), + "The `add` argument of `group_by\\(\\)` is deprecated" + ) + expect_error( + suppressWarnings( + tbl %>% + arrow_table() %>% + group_by(add = dbl2) %>% + collect() + ), + "object 'dbl2' not found" + ) }) test_that("Can use across() within group_by()", { @@ -214,7 +230,7 @@ test_that("Can use across() within group_by()", { ) compare_dplyr_binding( .input %>% - group_by(across({{test_groups}})) %>% + group_by(across({{ test_groups }})) %>% collect(), tbl ) From 6a962e2a7b82dcf183b26979f84bd9a7d579203e Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 15 Sep 2022 12:13:40 +0000 Subject: [PATCH 08/10] formatting Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-group-by.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index f0349235926..57cf417c9ad 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -25,10 +25,7 @@ group_by.arrow_dplyr_query <- function(.data, .drop = dplyr::group_by_drop_default(.data)) { if (!missing(add)) { .Deprecated( - msg = paste( - "The `add` argument of `group_by()` is deprecated.", - "Please use the `.add` argument instead." - ) + msg = paste("The `add` argument of `group_by()` is deprecated. Please use the `.add` argument instead.") ) .add <- add } @@ -47,6 +44,7 @@ group_by.arrow_dplyr_query <- function(.data, } else { gv <- names(new_groups) } + .data$group_by_vars <- gv %||% character() .data$drop_empty_groups <- ifelse(length(gv), .drop, dplyr::group_by_drop_default(.data)) .data From f119bb429a67b11b619362673f88ff635cf3eba7 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 15 Sep 2022 12:34:15 +0000 Subject: [PATCH 09/10] more tests Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index f13cd6bc8de..b211e419fea 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -196,6 +196,22 @@ test_that("group_by() with .add", { collect(), tbl ) + suppressWarnings(compare_dplyr_binding( + .input %>% + group_by(dbl2) %>% + group_by(add = FALSE) %>% + collect(), + tbl, + warning = "deprecated" + )) + suppressWarnings(compare_dplyr_binding( + .input %>% + group_by(dbl2) %>% + group_by(add = TRUE) %>% + collect(), + tbl, + warning = "deprecated" + )) expect_warning( tbl %>% arrow_table() %>% From 9128544e9db88b816f4fb6d96ae681246b763b2e Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 15 Sep 2022 13:11:47 +0000 Subject: [PATCH 10/10] more tests Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index b211e419fea..9bb6aa9600d 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -196,6 +196,20 @@ test_that("group_by() with .add", { collect(), tbl ) + compare_dplyr_binding( + .input %>% + group_by(chr, .add = FALSE) %>% + collect(), + tbl %>% + group_by(dbl2) + ) + compare_dplyr_binding( + .input %>% + group_by(chr, .add = TRUE) %>% + collect(), + tbl %>% + group_by(dbl2) + ) suppressWarnings(compare_dplyr_binding( .input %>% group_by(dbl2) %>%