From d44975117557e07c376eda0fe852bdf2b42f5160 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal.p.richardson@gmail.com>
Date: Fri, 17 Sep 2021 15:37:25 -0400
Subject: [PATCH 1/3] Support .groups argument to summarize()

---
 r/R/dplyr-summarize.R                   | 35 +++++++++++++---
 r/tests/testthat/helper-expectation.R   |  6 +--
 r/tests/testthat/test-dplyr-summarize.R | 53 ++++++++++++++++++++++++-
 3 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index 459b7435a87..23ed270100a 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -51,10 +51,6 @@ summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query
 
 # This is the Arrow summarize implementation
 do_arrow_summarize <- function(.data, ..., .groups = NULL) {
-  if (!is.null(.groups)) {
-    # ARROW-13550
-    abort("`summarize()` with `.groups` argument not supported in Arrow")
-  }
   exprs <- ensure_named_exprs(quos(...))
 
   # Create a stateful environment for recording our evaluated expressions
@@ -97,6 +93,35 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) {
       ctx$post_mutate
     )[c(.data$group_by_vars, names(exprs))]
   }
+
+  # Handle .groups argument
+  if (length(.data$group_by_vars)) {
+    if (is.null(.groups)) {
+      # dplyr docs say:
+      # When ‘.groups’ is not specified, it is chosen based on the
+      # number of rows of the results:
+      # • If all the results have 1 row, you get "drop_last".
+      # • If the number of rows varies, you get "keep".
+      #
+      # But we don't support anything that returns multiple rows now
+      .groups <- "drop_last"
+    } else {
+      assert_that(is.string(.groups))
+    }
+    if (.groups == "drop_last") {
+      out$group_by_vars <- head(.data$group_by_vars, -1)
+    } else if (.groups == "keep") {
+      out$group_by_vars <- .data$group_by_vars
+    } else if (.groups == "rowwise") {
+      stop(arrow_not_supported('.groups = "rowwise"'))
+    } else if (.groups != "drop") {
+      # Drop means don't group by anything so there's nothing to do.
+      # Anything else is invalid
+      stop(paste("Invalid .groups argument:", .groups))
+    }
+    # TODO: should we be doing something with `drop_empty_groups` in summarize?
+    out$drop_empty_groups <- .data$drop_empty_groups
+  }
   out
 }
 
@@ -225,4 +250,4 @@ extract_aggregations <- function(expr, ctx) {
     expr <- as.symbol(tmpname)
   }
   expr
-}
+}
\ No newline at end of file
diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R
index 72f07f32c96..932e4052d86 100644
--- a/r/tests/testthat/helper-expectation.R
+++ b/r/tests/testthat/helper-expectation.R
@@ -105,7 +105,7 @@ expect_dplyr_equal <- function(expr,
       ),
       warning
     )
-    expect_equivalent(via_batch, expected, ...)
+    expect_equal(via_batch, expected, ...)
   } else {
     skip_msg <- c(skip_msg, skip_record_batch)
   }
@@ -118,7 +118,7 @@ expect_dplyr_equal <- function(expr,
       ),
       warning
     )
-    expect_equivalent(via_table, expected, ...)
+    expect_equal(via_table, expected, ...)
   } else {
     skip_msg <- c(skip_msg, skip_table)
   }
@@ -281,4 +281,4 @@ split_vector_as_list <- function(vec) {
   vec1 <- vec[seq(from = min(1, length(vec) - 1), to = min(length(vec) - 1, vec_split), by = 1)]
   vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)]
   list(vec1, vec2)
-}
+}
\ No newline at end of file
diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index fa4bffe30d7..d1ebe1e36c9 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -390,7 +390,7 @@ test_that("Expressions on aggregations", {
   )
 
   # Aggregate on an aggregate (trivial but dplyr allows)
-  skip("Not supported")
+  skip("Aggregate on an aggregate not supported")
   expect_dplyr_equal(
     input %>%
       group_by(some_grouping) %>%
@@ -468,3 +468,54 @@ test_that("Not (yet) supported: implicit join", {
     warning = "Expression dbl - int not supported in Arrow; pulling data into R"
   )
 })
+
+test_that(".groups argument", {
+  expect_dplyr_equal(
+    input %>%
+      group_by(some_grouping, int < 6) %>%
+      summarize(count = n()) %>%
+      collect(),
+    tbl
+  )
+  expect_dplyr_equal(
+    input %>%
+      group_by(some_grouping, int < 6) %>%
+      summarize(count = n(), .groups = "drop_last") %>%
+      collect(),
+    tbl
+  )
+  expect_dplyr_equal(
+    input %>%
+      group_by(some_grouping, int < 6) %>%
+      summarize(count = n(), .groups = "keep") %>%
+      collect(),
+    tbl
+  )
+  expect_dplyr_equal(
+    input %>%
+      group_by(some_grouping, int < 6) %>%
+      summarize(count = n(), .groups = "drop") %>%
+      collect(),
+    tbl
+  )
+  expect_dplyr_equal(
+    input %>%
+      group_by(some_grouping, int < 6) %>%
+      summarize(count = n(), .groups = "rowwise") %>%
+      collect(),
+    tbl,
+    warning = TRUE
+  )
+
+  # abandon_ship() raises the warning, then dplyr itself errors
+  # This isn't ideal but it's fine and won't be an issue on Datasets
+  expect_error(
+    expect_warning(
+      Table$create(tbl) %>%
+        group_by(some_grouping, int < 6) %>%
+        summarize(count = n(), .groups = "NOTVALID"),
+      "Invalid .groups argument"
+    ),
+    "NOTVALID"
+  )
+})
\ No newline at end of file

From b4accfad9bc83e6a764d33d44d6db6a41e0b8b17 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal.p.richardson@gmail.com>
Date: Fri, 17 Sep 2021 15:45:13 -0400
Subject: [PATCH 2/3] Loosen strsplit attribute checks now that
 expect_dplyr_equal checks them

---
 r/R/dplyr-summarize.R                         |  2 +-
 r/tests/testthat/helper-expectation.R         |  2 +-
 r/tests/testthat/test-chunked-array.R         | 10 ++++----
 r/tests/testthat/test-compute-no-bindings.R   |  2 --
 .../testthat/test-dplyr-string-functions.R    | 24 ++++++++++++-------
 r/tests/testthat/test-dplyr-summarize.R       |  2 +-
 6 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index 23ed270100a..b57b05da70a 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -250,4 +250,4 @@ extract_aggregations <- function(expr, ctx) {
     expr <- as.symbol(tmpname)
   }
   expr
-}
\ No newline at end of file
+}
diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R
index 932e4052d86..e765bd6cf54 100644
--- a/r/tests/testthat/helper-expectation.R
+++ b/r/tests/testthat/helper-expectation.R
@@ -281,4 +281,4 @@ split_vector_as_list <- function(vec) {
   vec1 <- vec[seq(from = min(1, length(vec) - 1), to = min(length(vec) - 1, vec_split), by = 1)]
   vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)]
   list(vec1, vec2)
-}
\ No newline at end of file
+}
diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R
index 8ec8952a129..8ff4e6684c4 100644
--- a/r/tests/testthat/test-chunked-array.R
+++ b/r/tests/testthat/test-chunked-array.R
@@ -203,10 +203,12 @@ test_that("ChunkedArray supports difftime", {
 })
 
 test_that("ChunkedArray supports empty arrays (ARROW-13761)", {
-  types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
-            uint64(), float32(), float64(), timestamp("ns"), binary(),
-            large_binary(), fixed_size_binary(32), date32(), date64(),
-            decimal(4, 2))
+  types <- c(
+    int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
+    uint64(), float32(), float64(), timestamp("ns"), binary(),
+    large_binary(), fixed_size_binary(32), date32(), date64(),
+    decimal(4, 2)
+  )
 
   empty_filter <- ChunkedArray$create(type = bool())
   for (type in types) {
diff --git a/r/tests/testthat/test-compute-no-bindings.R b/r/tests/testthat/test-compute-no-bindings.R
index 05beb924d77..0546b98c0af 100644
--- a/r/tests/testthat/test-compute-no-bindings.R
+++ b/r/tests/testthat/test-compute-no-bindings.R
@@ -121,7 +121,6 @@ test_that("non-bound compute kernels using ModeOptions", {
 })
 
 test_that("non-bound compute kernels using PartitionNthOptions", {
-
   result <- call_function(
     "partition_nth_indices",
     Array$create(c(11:20)),
@@ -131,7 +130,6 @@ test_that("non-bound compute kernels using PartitionNthOptions", {
   # (depends on C++ standard library implementation)
   expect_true(all(as.vector(result[1:3]) < 3))
   expect_true(all(as.vector(result[4:10]) >= 3))
-
 })
 
 
diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index b6b8f5a714a..05485bb24e1 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -410,49 +410,57 @@ test_that("strsplit and str_split", {
     input %>%
       mutate(x = strsplit(x, "and")) %>%
       collect(),
-    df
+    df,
+    check.attributes = FALSE
   )
   expect_dplyr_equal(
     input %>%
       mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
       collect(),
-    df
+    df,
+    check.attributes = FALSE
   )
   expect_dplyr_equal(
     input %>%
       mutate(x = strsplit(x, " +and +")) %>%
       collect(),
-    df
+    df,
+    check.attributes = FALSE
   )
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, "and")) %>%
       collect(),
-    df
+    df,
+    check.attributes = FALSE
   )
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, "and", n = 2)) %>%
       collect(),
-    df
+    df,
+    check.attributes = FALSE
   )
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, fixed("and"), n = 2)) %>%
       collect(),
-    df
+    df,
+    check.attributes = FALSE
   )
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, regex("and"), n = 2)) %>%
       collect(),
-    df
+    df,
+    check.attributes = FALSE
   )
   expect_dplyr_equal(
     input %>%
       mutate(x = str_split(x, "Foo|bar", n = 2)) %>%
       collect(),
-    df
+    df,
+    check.attributes = FALSE
   )
 })
 
diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index d1ebe1e36c9..12bb50fb3d5 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -518,4 +518,4 @@ test_that(".groups argument", {
     ),
     "NOTVALID"
   )
-})
\ No newline at end of file
+})

From 67c4f89188352b693d9f1fd6903b095cec4db844 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal.p.richardson@gmail.com>
Date: Mon, 20 Sep 2021 15:06:01 -0400
Subject: [PATCH 3/3] Comments and jira xrefs

---
 r/R/dplyr-summarize.R                          | 2 +-
 r/tests/testthat/test-dplyr-string-functions.R | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index b57b05da70a..beb18e82039 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -119,7 +119,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) {
       # Anything else is invalid
       stop(paste("Invalid .groups argument:", .groups))
     }
-    # TODO: should we be doing something with `drop_empty_groups` in summarize?
+    # TODO: shouldn't we be doing something with `drop_empty_groups` in summarize? (ARROW-14044)
     out$drop_empty_groups <- .data$drop_empty_groups
   }
   out
diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 05485bb24e1..1098619f3a5 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -411,6 +411,12 @@ test_that("strsplit and str_split", {
       mutate(x = strsplit(x, "and")) %>%
       collect(),
     df,
+    # Pass check.attributes = FALSE through to expect_equal
+    # (which gives you expect_equivalent() behavior).
+    # This is because the vctr that comes back from arrow (ListArray)
+    # has type information in it, but it's just a bare list from R/dplyr.
+    # Note also that whenever we bump up to testthat 3rd edition (ARROW-12871),
+    # the parameter is called `ignore_attr = TRUE`
     check.attributes = FALSE
   )
   expect_dplyr_equal(