From 66590805c91259bf4abc955d274b3239a1f1ba9d Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Sat, 1 Oct 2022 05:08:28 +0000 Subject: [PATCH 01/22] add tests for compute Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-collapse.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index 1809cb6e388..424d382a079 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -276,3 +276,17 @@ test_that("collapse doesn't unnecessarily add ProjectNodes", { ) expect_length(grep("ProjectNode", plan), 1) }) + +test_that("compute", { + tab1 <- tbl %>% + arrow_table() %>% + group_by(int) %>% + compute() + expect_true(inherits(tab1, "Table")) + expect_equal( + as.data.frame(tab1), + tbl %>% + group_by(int) %>% + collect() + ) +}) From 55d803d121e604a4dda37bdf0a91fc0910f30246 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Sat, 1 Oct 2022 05:08:28 +0000 Subject: [PATCH 02/22] rename file to match name of file to be tested Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/{test-dplyr-collapse.R => test-dplyr-collect.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename r/tests/testthat/{test-dplyr-collapse.R => test-dplyr-collect.R} (100%) diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collect.R similarity index 100% rename from r/tests/testthat/test-dplyr-collapse.R rename to r/tests/testthat/test-dplyr-collect.R From cd8fa1ddf3bf0881bd97fbbc6b7eb1253fb076fe Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Sat, 1 Oct 2022 05:08:29 +0000 Subject: [PATCH 03/22] fix tests Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-collect.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-collect.R b/r/tests/testthat/test-dplyr-collect.R index 424d382a079..78d542a8ea1 100644 --- a/r/tests/testthat/test-dplyr-collect.R +++ b/r/tests/testthat/test-dplyr-collect.R @@ -282,7 +282,7 @@ test_that("compute", { arrow_table() %>% group_by(int) %>% compute() - expect_true(inherits(tab1, "Table")) + expect_r6_class(tab1, "Table") expect_equal( as.data.frame(tab1), tbl %>% From 9e1c73b65fb25e131eb8cfac4d93e919ba1430ab Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Sat, 1 Oct 2022 05:08:29 +0000 Subject: [PATCH 04/22] remove unused line Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-collect.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-collect.R b/r/tests/testthat/test-dplyr-collect.R index 78d542a8ea1..63d15970df6 100644 --- a/r/tests/testthat/test-dplyr-collect.R +++ b/r/tests/testthat/test-dplyr-collect.R @@ -286,7 +286,6 @@ test_that("compute", { expect_equal( as.data.frame(tab1), tbl %>% - group_by(int) %>% - collect() + group_by(int) ) }) From 71c170f1bf198c37e3eeda4c29a15190c4d5c3ef Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Sat, 1 Oct 2022 05:08:29 +0000 Subject: [PATCH 05/22] Revert "rename file to match name of file to be tested" This reverts commit 83eafbeec3b3879df5a0280bf37ba3500944f07f. --- r/tests/testthat/{test-dplyr-collect.R => test-dplyr-collapse.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename r/tests/testthat/{test-dplyr-collect.R => test-dplyr-collapse.R} (100%) diff --git a/r/tests/testthat/test-dplyr-collect.R b/r/tests/testthat/test-dplyr-collapse.R similarity index 100% rename from r/tests/testthat/test-dplyr-collect.R rename to r/tests/testthat/test-dplyr-collapse.R From fd4b15e06e8024459f9159260d8b4e36b3a16120 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Sat, 1 Oct 2022 05:08:29 +0000 Subject: [PATCH 06/22] move test to the other file and rename the test case Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-collapse.R | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index 63d15970df6..1809cb6e388 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -276,16 +276,3 @@ test_that("collapse doesn't unnecessarily add ProjectNodes", { ) expect_length(grep("ProjectNode", plan), 1) }) - -test_that("compute", { - tab1 <- tbl %>% - arrow_table() %>% - group_by(int) %>% - compute() - expect_r6_class(tab1, "Table") - expect_equal( - as.data.frame(tab1), - tbl %>% - group_by(int) - ) -}) From 2fab6349af1795d8a82e255773bc168aa7624265 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 6 Oct 2022 10:48:55 +0000 Subject: [PATCH 07/22] add tests for compute and collect Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-query.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index db9a3bb30d0..bd261874375 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -620,3 +620,25 @@ test_that("collect() is identical to compute() %>% collect()", { collect() ) }) + +test_that("collect() is identical to compute() %>% collect()", { + tab1 <- tbl %>% + arrow_table() + adq1 <- tab1 %>% + group_by(int) + + expect_equal( + tab1 %>% + compute() %>% + collect(), + tab1 %>% + collect() + ) + expect_equal( + adq1 %>% + compute() %>% + collect(), + adq1 %>% + collect() + ) +}) From 7830d2cfb2822d460ddd2117e382915c0fccfc20 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 6 Oct 2022 10:56:02 +0000 Subject: [PATCH 08/22] more test Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-query.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index bd261874375..6a8cd851ca6 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -597,6 +597,11 @@ test_that("compute() on a grouped query returns a Table with groups in metadata" tbl %>% group_by(int) ) + expect_equal( + collect(tab1), + tbl %>% + group_by(int) + ) }) test_that("collect() is identical to compute() %>% collect()", { From dc19476a9a5ea639435aabf98cc6c1bce31f6a59 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 09/22] use NULL for empty group vars, and remove group vars metadata from .data Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-group-by.R | 2 +- r/R/dplyr.R | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index 57cf417c9ad..9524331cfbc 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -54,7 +54,7 @@ group_by.Dataset <- group_by.ArrowTabular <- group_by.RecordBatchReader <- group groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x)) groups.Dataset <- groups.ArrowTabular <- groups.RecordBatchReader <- groups.arrow_dplyr_query -group_vars.arrow_dplyr_query <- function(x) x$group_by_vars +group_vars.arrow_dplyr_query <- function(x) x$group_by_vars %||% character() group_vars.Dataset <- function(x) character() group_vars.RecordBatchReader <- function(x) character() group_vars.ArrowTabular <- function(x) { diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 1f529066dd3..c85b2714025 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -43,12 +43,16 @@ arrow_dplyr_query <- function(.data) { # If dplyr is not available, or if the input doesn't have a group_vars # method, assume no group vars dplyr::group_vars(.data), - error = function(e) character() + error = function(e) NULL ) if (inherits(.data, "data.frame")) { .data <- Table$create(.data) } + + # Remove group vars metadata from the Table + .data$metadata$r$attributes$.group_vars <- NULL + # Evaluating expressions on a dataset with duplicated fieldnames will error dupes <- duplicated(names(.data)) if (any(dupes)) { From cae1f26bea4fe4d048de03eec9cf5b5fef5d502c Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 10/22] restore group vars from arrow dplyr query to Table Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-collect.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index deb26859e8b..6244b4f02b2 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -62,18 +62,18 @@ restore_dplyr_features <- function(df, query) { # An arrow_dplyr_query holds some attributes that Arrow doesn't know about # After calling collect(), make sure these features are carried over - if (length(query$group_by_vars) > 0) { + if (is.data.frame(df)) { # Preserve groupings, if present - if (is.data.frame(df)) { + if (length(query$group_by_vars) > 0) { df <- dplyr::grouped_df( df, dplyr::group_vars(query), drop = dplyr::group_by_drop_default(query) ) - } else { - # This is a Table, via compute() or collect(as_data_frame = FALSE) - df$metadata$r$attributes$.group_vars <- query$group_by_vars } + } else { + # This is a Table, via compute() or collect(as_data_frame = FALSE) + df$metadata$r$attributes$.group_vars <- query$group_by_vars } df } From 73edf1238370529dae1817dc5fe55017898b8e16 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 11/22] arrow_dplyr_query$group_by_vars should character, and metadata$r$attributes$.group_vars should not character(0) Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-collect.R | 7 ++++++- r/R/dplyr-group-by.R | 2 +- r/R/dplyr.R | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 6244b4f02b2..1b17a297a87 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -73,7 +73,12 @@ restore_dplyr_features <- function(df, query) { } } else { # This is a Table, via compute() or collect(as_data_frame = FALSE) - df$metadata$r$attributes$.group_vars <- query$group_by_vars + if (length(query$group_by_vars) > 0) { + gv <- query$group_by_vars + } else { + gv <- NULL + } + df$metadata$r$attributes$.group_vars <- gv } df } diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index 9524331cfbc..57cf417c9ad 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -54,7 +54,7 @@ group_by.Dataset <- group_by.ArrowTabular <- group_by.RecordBatchReader <- group groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x)) groups.Dataset <- groups.ArrowTabular <- groups.RecordBatchReader <- groups.arrow_dplyr_query -group_vars.arrow_dplyr_query <- function(x) x$group_by_vars %||% character() +group_vars.arrow_dplyr_query <- function(x) x$group_by_vars group_vars.Dataset <- function(x) character() group_vars.RecordBatchReader <- function(x) character() group_vars.ArrowTabular <- function(x) { diff --git a/r/R/dplyr.R b/r/R/dplyr.R index c85b2714025..0d516022736 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -43,7 +43,7 @@ arrow_dplyr_query <- function(.data) { # If dplyr is not available, or if the input doesn't have a group_vars # method, assume no group vars dplyr::group_vars(.data), - error = function(e) NULL + error = function(e) character() ) if (inherits(.data, "data.frame")) { From f6d2a0d4d1490621d3aa7330d376b3dc67611155 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 12/22] fix Signed-off-by: SHIMA Tatsuya --- r/R/dplyr.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 0d516022736..2db3c89d3ce 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -50,9 +50,6 @@ arrow_dplyr_query <- function(.data) { .data <- Table$create(.data) } - # Remove group vars metadata from the Table - .data$metadata$r$attributes$.group_vars <- NULL - # Evaluating expressions on a dataset with duplicated fieldnames will error dupes <- duplicated(names(.data)) if (any(dupes)) { From da80d7657cfa92b4ec4dae29e87724c454e64382 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 13/22] use as_arrow_table instead of collect because of Table with 0 columns handling Signed-off-by: SHIMA Tatsuya --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 2db3c89d3ce..1cb0c8a91bd 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -183,7 +183,7 @@ dim.arrow_dplyr_query <- function(x) { # Query on in-memory Table, so evaluate the filter # Don't need any columns x <- select.arrow_dplyr_query(x, NULL) - rows <- nrow(compute.arrow_dplyr_query(x)) + rows <- nrow(as_arrow_table(x)) } c(rows, cols) } From 3012a54430f699967eb9fc4f93aafd0cbea2b698 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 14/22] fix typo Signed-off-by: SHIMA Tatsuya --- r/R/dplyr.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 1cb0c8a91bd..0351afe0d45 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -49,7 +49,6 @@ arrow_dplyr_query <- function(.data) { if (inherits(.data, "data.frame")) { .data <- Table$create(.data) } - # Evaluating expressions on a dataset with duplicated fieldnames will error dupes <- duplicated(names(.data)) if (any(dupes)) { From d0c545fa1944c2715f02849f8bf67781ec77f861 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 15/22] add tests Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index 9bb6aa9600d..601db1911d7 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -196,6 +196,20 @@ test_that("group_by() with .add", { collect(), tbl ) + compare_dplyr_binding( + .input %>% + group_by(.add = FALSE) %>% + collect(), + tbl %>% + group_by(dbl2) + ) + compare_dplyr_binding( + .input %>% + group_by(.add = TRUE) %>% + collect(), + tbl %>% + group_by(dbl2) + ) compare_dplyr_binding( .input %>% group_by(chr, .add = FALSE) %>% From ce5d581611ec4a7b873f6346393ae69d25f90049 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 16/22] add tests Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index 601db1911d7..82401801c16 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -62,6 +62,22 @@ test_that("ungroup", { collect(), tbl ) + compare_dplyr_binding( + .input %>% + group_by(chr, .add = FALSE) %>% + ungroup() %>% + collect(), + tbl %>% + group_by(int) + ) + compare_dplyr_binding( + .input %>% + group_by(chr, .add = TRUE) %>% + ungroup() %>% + collect(), + tbl %>% + group_by(int) + ) # to confirm that the above expectation is actually testing what we think it's # testing, verify that compare_dplyr_binding() distinguishes between grouped and From e117e8ca2e2fa527af8982415c24505f6e05715b Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 17/22] ensure not to convert to a grouped_df Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-collect.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 1b17a297a87..8098c99ccd4 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -27,6 +27,8 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { augment_io_error_msg(e, call, schema = x$.data$schema) } ) + # Remove grouping metadata from the Table before converting to a data frame + out$metadata$r$attributes$.group_vars <- NULL if (as_data_frame) { out <- as.data.frame(out) From 36674d1f5563ef8c82ccda4261719c458f3d4449 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 18/22] change to manipulate groups only when necessary Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-collect.R | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 8098c99ccd4..a97428a19b4 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -27,8 +27,10 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { augment_io_error_msg(e, call, schema = x$.data$schema) } ) - # Remove grouping metadata from the Table before converting to a data frame - out$metadata$r$attributes$.group_vars <- NULL + # Ungroup the Table before converting this to a data frame + if (length(group_vars.ArrowTabular(out))) { + out <- ungroup.ArrowTabular(out) + } if (as_data_frame) { out <- as.data.frame(out) @@ -64,23 +66,22 @@ restore_dplyr_features <- function(df, query) { # An arrow_dplyr_query holds some attributes that Arrow doesn't know about # After calling collect(), make sure these features are carried over - if (is.data.frame(df)) { - # Preserve groupings, if present - if (length(query$group_by_vars) > 0) { - df <- dplyr::grouped_df( + if (length(dplyr::group_vars(df))) { + stop("df must be ungrouped", call. = FALSE) + } + + if (length(dplyr::group_vars(query))) { + if (is.data.frame(df)) { + # Preserve groupings, if present + df <- dplyr::group_by( df, - dplyr::group_vars(query), - drop = dplyr::group_by_drop_default(query) + !!!syms(dplyr::group_vars(query)), + .drop = dplyr::group_by_drop_default(query) ) - } - } else { - # This is a Table, via compute() or collect(as_data_frame = FALSE) - if (length(query$group_by_vars) > 0) { - gv <- query$group_by_vars } else { - gv <- NULL + # This is a Table, via compute() or collect(as_data_frame = FALSE) + df$metadata$r$attributes$.group_vars <- dplyr::group_vars(query) } - df$metadata$r$attributes$.group_vars <- gv } df } From 8b5ef460d9eea4ff9f392c4e118c56e57d3f9bff Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:01:07 +0000 Subject: [PATCH 19/22] separate the test case Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-group-by.R | 33 ++++++++++++++------------ 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index 82401801c16..319eec25ed7 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -62,6 +62,24 @@ test_that("ungroup", { collect(), tbl ) + + # to confirm that the above expectation is actually testing what we think it's + # testing, verify that compare_dplyr_binding() distinguishes between grouped and + # ungrouped tibbles + expect_error( + compare_dplyr_binding( + .input %>% + group_by(chr) %>% + select(int, chr) %>% + (function(x) if (inherits(x, "tbl_df")) ungroup(x) else x) %>% + filter(int > 5) %>% + collect(), + tbl + ) + ) +}) + +test_that("Groups before conversion to a Table must not be restored after collect() (ARROW-17737)", { compare_dplyr_binding( .input %>% group_by(chr, .add = FALSE) %>% @@ -78,21 +96,6 @@ test_that("ungroup", { tbl %>% group_by(int) ) - - # to confirm that the above expectation is actually testing what we think it's - # testing, verify that compare_dplyr_binding() distinguishes between grouped and - # ungrouped tibbles - expect_error( - compare_dplyr_binding( - .input %>% - group_by(chr) %>% - select(int, chr) %>% - (function(x) if (inherits(x, "tbl_df")) ungroup(x) else x) %>% - filter(int > 5) %>% - collect(), - tbl - ) - ) }) test_that("group_by then rename", { From 8a1f7cdb7996fa5ca49b474138ca6f87e67f88df Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Fri, 7 Oct 2022 22:22:42 +0000 Subject: [PATCH 20/22] fix rebasing Signed-off-by: SHIMA Tatsuya --- r/tests/testthat/test-dplyr-query.R | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 6a8cd851ca6..db9a3bb30d0 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -597,33 +597,6 @@ test_that("compute() on a grouped query returns a Table with groups in metadata" tbl %>% group_by(int) ) - expect_equal( - collect(tab1), - tbl %>% - group_by(int) - ) -}) - -test_that("collect() is identical to compute() %>% collect()", { - tab1 <- tbl %>% - arrow_table() - adq1 <- tab1 %>% - group_by(int) - - expect_equal( - tab1 %>% - compute() %>% - collect(), - tab1 %>% - collect() - ) - expect_equal( - adq1 %>% - compute() %>% - collect(), - adq1 %>% - collect() - ) }) test_that("collect() is identical to compute() %>% collect()", { From 72a5ab6cd10ed4975e4db0f1db3b0ff64e9904b4 Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Tue, 11 Oct 2022 12:43:19 +0000 Subject: [PATCH 21/22] ensure ungroup .data if it is a Table Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-collect.R | 4 ---- r/R/dplyr.R | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index a97428a19b4..8a08f7f0448 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -27,10 +27,6 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { augment_io_error_msg(e, call, schema = x$.data$schema) } ) - # Ungroup the Table before converting this to a data frame - if (length(group_vars.ArrowTabular(out))) { - out <- ungroup.ArrowTabular(out) - } if (as_data_frame) { out <- as.data.frame(out) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 0351afe0d45..010cb7c5ce1 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -49,6 +49,11 @@ arrow_dplyr_query <- function(.data) { if (inherits(.data, "data.frame")) { .data <- Table$create(.data) } + # If .data is a Table, it must be ungrouped (ARROW-17737) + if (length(group_vars.ArrowTabular(.data))) { + .data <- ungroup.ArrowTabular(.data) + } + # Evaluating expressions on a dataset with duplicated fieldnames will error dupes <- duplicated(names(.data)) if (any(dupes)) { From a7062684aa827387c0237bae6e14c159c0ece92b Mon Sep 17 00:00:00 2001 From: SHIMA Tatsuya Date: Thu, 13 Oct 2022 10:45:32 +0000 Subject: [PATCH 22/22] more simplify Signed-off-by: SHIMA Tatsuya --- r/R/dplyr-collect.R | 7 ++----- r/R/dplyr.R | 5 +++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 8a08f7f0448..5640565b153 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -62,17 +62,14 @@ restore_dplyr_features <- function(df, query) { # An arrow_dplyr_query holds some attributes that Arrow doesn't know about # After calling collect(), make sure these features are carried over - if (length(dplyr::group_vars(df))) { - stop("df must be ungrouped", call. = FALSE) - } - if (length(dplyr::group_vars(query))) { if (is.data.frame(df)) { # Preserve groupings, if present df <- dplyr::group_by( df, !!!syms(dplyr::group_vars(query)), - .drop = dplyr::group_by_drop_default(query) + .drop = dplyr::group_by_drop_default(query), + .add = FALSE ) } else { # This is a Table, via compute() or collect(as_data_frame = FALSE) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 010cb7c5ce1..c7b58bc0275 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -49,8 +49,9 @@ arrow_dplyr_query <- function(.data) { if (inherits(.data, "data.frame")) { .data <- Table$create(.data) } - # If .data is a Table, it must be ungrouped (ARROW-17737) - if (length(group_vars.ArrowTabular(.data))) { + # ARROW-17737: If .data is a Table, remove groups from metadata + # (we've already grabbed the groups above) + if (inherits(.data, "ArrowTabular")) { .data <- ungroup.ArrowTabular(.data) }