From d44975117557e07c376eda0fe852bdf2b42f5160 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 17 Sep 2021 15:37:25 -0400 Subject: [PATCH 1/3] Support .groups argument to summarize() --- r/R/dplyr-summarize.R | 35 +++++++++++++--- r/tests/testthat/helper-expectation.R | 6 +-- r/tests/testthat/test-dplyr-summarize.R | 53 ++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 459b7435a87..23ed270100a 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -51,10 +51,6 @@ summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query # This is the Arrow summarize implementation do_arrow_summarize <- function(.data, ..., .groups = NULL) { - if (!is.null(.groups)) { - # ARROW-13550 - abort("`summarize()` with `.groups` argument not supported in Arrow") - } exprs <- ensure_named_exprs(quos(...)) # Create a stateful environment for recording our evaluated expressions @@ -97,6 +93,35 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { ctx$post_mutate )[c(.data$group_by_vars, names(exprs))] } + + # Handle .groups argument + if (length(.data$group_by_vars)) { + if (is.null(.groups)) { + # dplyr docs say: + # When ‘.groups’ is not specified, it is chosen based on the + # number of rows of the results: + # • If all the results have 1 row, you get "drop_last". + # • If the number of rows varies, you get "keep". + # + # But we don't support anything that returns multiple rows now + .groups <- "drop_last" + } else { + assert_that(is.string(.groups)) + } + if (.groups == "drop_last") { + out$group_by_vars <- head(.data$group_by_vars, -1) + } else if (.groups == "keep") { + out$group_by_vars <- .data$group_by_vars + } else if (.groups == "rowwise") { + stop(arrow_not_supported('.groups = "rowwise"')) + } else if (.groups != "drop") { + # Drop means don't group by anything so there's nothing to do. + # Anything else is invalid + stop(paste("Invalid .groups argument:", .groups)) + } + # TODO: should we be doing something with `drop_empty_groups` in summarize? + out$drop_empty_groups <- .data$drop_empty_groups + } out } @@ -225,4 +250,4 @@ extract_aggregations <- function(expr, ctx) { expr <- as.symbol(tmpname) } expr -} +} \ No newline at end of file diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 72f07f32c96..932e4052d86 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -105,7 +105,7 @@ expect_dplyr_equal <- function(expr, ), warning ) - expect_equivalent(via_batch, expected, ...) + expect_equal(via_batch, expected, ...) } else { skip_msg <- c(skip_msg, skip_record_batch) } @@ -118,7 +118,7 @@ expect_dplyr_equal <- function(expr, ), warning ) - expect_equivalent(via_table, expected, ...) + expect_equal(via_table, expected, ...) } else { skip_msg <- c(skip_msg, skip_table) } @@ -281,4 +281,4 @@ split_vector_as_list <- function(vec) { vec1 <- vec[seq(from = min(1, length(vec) - 1), to = min(length(vec) - 1, vec_split), by = 1)] vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)] list(vec1, vec2) -} +} \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index fa4bffe30d7..d1ebe1e36c9 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -390,7 +390,7 @@ test_that("Expressions on aggregations", { ) # Aggregate on an aggregate (trivial but dplyr allows) - skip("Not supported") + skip("Aggregate on an aggregate not supported") expect_dplyr_equal( input %>% group_by(some_grouping) %>% @@ -468,3 +468,54 @@ test_that("Not (yet) supported: implicit join", { warning = "Expression dbl - int not supported in Arrow; pulling data into R" ) }) + +test_that(".groups argument", { + expect_dplyr_equal( + input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n()) %>% + collect(), + tbl + ) + expect_dplyr_equal( + input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "drop_last") %>% + collect(), + tbl + ) + expect_dplyr_equal( + input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "keep") %>% + collect(), + tbl + ) + expect_dplyr_equal( + input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "drop") %>% + collect(), + tbl + ) + expect_dplyr_equal( + input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "rowwise") %>% + collect(), + tbl, + warning = TRUE + ) + + # abandon_ship() raises the warning, then dplyr itself errors + # This isn't ideal but it's fine and won't be an issue on Datasets + expect_error( + expect_warning( + Table$create(tbl) %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "NOTVALID"), + "Invalid .groups argument" + ), + "NOTVALID" + ) +}) \ No newline at end of file From b4accfad9bc83e6a764d33d44d6db6a41e0b8b17 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 17 Sep 2021 15:45:13 -0400 Subject: [PATCH 2/3] Loosen strsplit attribute checks now that expect_dplyr_equal checks them --- r/R/dplyr-summarize.R | 2 +- r/tests/testthat/helper-expectation.R | 2 +- r/tests/testthat/test-chunked-array.R | 10 ++++---- r/tests/testthat/test-compute-no-bindings.R | 2 -- .../testthat/test-dplyr-string-functions.R | 24 ++++++++++++------- r/tests/testthat/test-dplyr-summarize.R | 2 +- 6 files changed, 25 insertions(+), 17 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 23ed270100a..b57b05da70a 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -250,4 +250,4 @@ extract_aggregations <- function(expr, ctx) { expr <- as.symbol(tmpname) } expr -} \ No newline at end of file +} diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 932e4052d86..e765bd6cf54 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -281,4 +281,4 @@ split_vector_as_list <- function(vec) { vec1 <- vec[seq(from = min(1, length(vec) - 1), to = min(length(vec) - 1, vec_split), by = 1)] vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)] list(vec1, vec2) -} \ No newline at end of file +} diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index 8ec8952a129..8ff4e6684c4 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -203,10 +203,12 @@ test_that("ChunkedArray supports difftime", { }) test_that("ChunkedArray supports empty arrays (ARROW-13761)", { - types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), - uint64(), float32(), float64(), timestamp("ns"), binary(), - large_binary(), fixed_size_binary(32), date32(), date64(), - decimal(4, 2)) + types <- c( + int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), + uint64(), float32(), float64(), timestamp("ns"), binary(), + large_binary(), fixed_size_binary(32), date32(), date64(), + decimal(4, 2) + ) empty_filter <- ChunkedArray$create(type = bool()) for (type in types) { diff --git a/r/tests/testthat/test-compute-no-bindings.R b/r/tests/testthat/test-compute-no-bindings.R index 05beb924d77..0546b98c0af 100644 --- a/r/tests/testthat/test-compute-no-bindings.R +++ b/r/tests/testthat/test-compute-no-bindings.R @@ -121,7 +121,6 @@ test_that("non-bound compute kernels using ModeOptions", { }) test_that("non-bound compute kernels using PartitionNthOptions", { - result <- call_function( "partition_nth_indices", Array$create(c(11:20)), @@ -131,7 +130,6 @@ test_that("non-bound compute kernels using PartitionNthOptions", { # (depends on C++ standard library implementation) expect_true(all(as.vector(result[1:3]) < 3)) expect_true(all(as.vector(result[4:10]) >= 3)) - }) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index b6b8f5a714a..05485bb24e1 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -410,49 +410,57 @@ test_that("strsplit and str_split", { input %>% mutate(x = strsplit(x, "and")) %>% collect(), - df + df, + check.attributes = FALSE ) expect_dplyr_equal( input %>% mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% collect(), - df + df, + check.attributes = FALSE ) expect_dplyr_equal( input %>% mutate(x = strsplit(x, " +and +")) %>% collect(), - df + df, + check.attributes = FALSE ) expect_dplyr_equal( input %>% mutate(x = str_split(x, "and")) %>% collect(), - df + df, + check.attributes = FALSE ) expect_dplyr_equal( input %>% mutate(x = str_split(x, "and", n = 2)) %>% collect(), - df + df, + check.attributes = FALSE ) expect_dplyr_equal( input %>% mutate(x = str_split(x, fixed("and"), n = 2)) %>% collect(), - df + df, + check.attributes = FALSE ) expect_dplyr_equal( input %>% mutate(x = str_split(x, regex("and"), n = 2)) %>% collect(), - df + df, + check.attributes = FALSE ) expect_dplyr_equal( input %>% mutate(x = str_split(x, "Foo|bar", n = 2)) %>% collect(), - df + df, + check.attributes = FALSE ) }) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index d1ebe1e36c9..12bb50fb3d5 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -518,4 +518,4 @@ test_that(".groups argument", { ), "NOTVALID" ) -}) \ No newline at end of file +}) From 67c4f89188352b693d9f1fd6903b095cec4db844 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 20 Sep 2021 15:06:01 -0400 Subject: [PATCH 3/3] Comments and jira xrefs --- r/R/dplyr-summarize.R | 2 +- r/tests/testthat/test-dplyr-string-functions.R | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index b57b05da70a..beb18e82039 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -119,7 +119,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # Anything else is invalid stop(paste("Invalid .groups argument:", .groups)) } - # TODO: should we be doing something with `drop_empty_groups` in summarize? + # TODO: shouldn't we be doing something with `drop_empty_groups` in summarize? (ARROW-14044) out$drop_empty_groups <- .data$drop_empty_groups } out diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 05485bb24e1..1098619f3a5 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -411,6 +411,12 @@ test_that("strsplit and str_split", { mutate(x = strsplit(x, "and")) %>% collect(), df, + # Pass check.attributes = FALSE through to expect_equal + # (which gives you expect_equivalent() behavior). + # This is because the vctr that comes back from arrow (ListArray) + # has type information in it, but it's just a bare list from R/dplyr. + # Note also that whenever we bump up to testthat 3rd edition (ARROW-12871), + # the parameter is called `ignore_attr = TRUE` check.attributes = FALSE ) expect_dplyr_equal(