From bf60dc230f5634929abed82b1dbe5843e5ecd4fd Mon Sep 17 00:00:00 2001
From: Bryce Mecum <petridish@gmail.com>
Date: Wed, 5 Oct 2022 14:09:58 -0800
Subject: [PATCH 1/4] Change behavior of pull to compute instead of collect

Ref https://issues.apache.org/jira/browse/ARROW-17439
---
 r/NAMESPACE                                  |  1 +
 r/R/arrow-tabular.R                          |  5 +++
 r/R/dplyr-collect.R                          |  4 +-
 r/tests/testthat/test-dataset-write.R        |  4 +-
 r/tests/testthat/test-dataset.R              | 41 ++++++++++++++------
 r/tests/testthat/test-dplyr-arrange.R        |  4 +-
 r/tests/testthat/test-dplyr-funcs-datetime.R |  3 +-
 r/tests/testthat/test-dplyr-query.R          |  9 +++--
 8 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index e20e61c0e32..5641a665a23 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -344,6 +344,7 @@ export(null)
 export(num_range)
 export(one_of)
 export(open_dataset)
+export(pull.ArrowTabular)
 export(read_csv_arrow)
 export(read_delim_arrow)
 export(read_feather)
diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R
index ae68cc2118f..d580dd98a6e 100644
--- a/r/R/arrow-tabular.R
+++ b/r/R/arrow-tabular.R
@@ -259,3 +259,8 @@ na.omit.ArrowTabular <- function(object, ...) {
 
 #' @export
 na.exclude.ArrowTabular <- na.omit.ArrowTabular
+
+#' @export
+pull.ArrowTabular <- function(x, var = -1) {
+  x[[vars_pull(names(x), !!enquo(var))]]
+}
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index 1714af8a509..0515a22e544 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -50,9 +50,9 @@ pull.arrow_dplyr_query <- function(.data, var = -1) {
   .data <- as_adq(.data)
   var <- vars_pull(names(.data), !!enquo(var))
   .data$selected_columns <- set_names(.data$selected_columns[var], var)
-  dplyr::collect(.data)[[1]]
+  dplyr::compute(.data)[[1]]
 }
-pull.Dataset <- pull.ArrowTabular <- pull.RecordBatchReader <- pull.arrow_dplyr_query
+pull.Dataset <- pull.RecordBatchReader <- pull.arrow_dplyr_query
 
 restore_dplyr_features <- function(df, query) {
   # An arrow_dplyr_query holds some attributes that Arrow doesn't know about
diff --git a/r/tests/testthat/test-dataset-write.R b/r/tests/testthat/test-dataset-write.R
index 7a5f861ca57..468d11676bb 100644
--- a/r/tests/testthat/test-dataset-write.R
+++ b/r/tests/testthat/test-dataset-write.R
@@ -739,7 +739,8 @@ test_that("Dataset min_rows_per_group", {
 
   row_group_sizes <- ds %>%
     map_batches(~ record_batch(nrows = .$num_rows)) %>%
-    pull(nrows)
+    pull(nrows) %>%
+    as.vector()
   index <- 1
 
   # We expect there to be 3 row groups since 11/5 = 2.2 and 11/4 = 2.75
@@ -778,6 +779,7 @@ test_that("Dataset write max rows per group", {
   row_group_sizes <- ds %>%
     map_batches(~ record_batch(nrows = .$num_rows)) %>%
     pull(nrows) %>%
+    as.vector() %>%
     sort()
 
   expect_equal(row_group_sizes, c(12, 18))
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index ea3e1365d6b..5e28a3ddd5b 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -67,7 +67,7 @@ test_that("IPC/Feather format data", {
 
   # Collecting virtual partition column works
   expect_equal(
-    ds %>% arrange(part) %>% pull(part),
+    ds %>% arrange(part) %>% pull(part) %>% as.vector(),
     c(rep(3, 10), rep(4, 10))
   )
 })
@@ -306,7 +306,7 @@ test_that("Simple interface for datasets", {
 
   # Collecting virtual partition column works
   expect_equal(
-    ds %>% arrange(part) %>% pull(part),
+    ds %>% arrange(part) %>% pull(part) %>% as.vector(),
     c(rep(1, 10), rep(2, 10))
   )
 })
@@ -625,8 +625,16 @@ test_that("scalar aggregates with many batches (ARROW-16904)", {
   ds <- open_dataset(tf)
   replicate(100, ds %>% summarize(min(x)) %>% pull())
 
-  expect_true(all(replicate(100, ds %>% summarize(min(x)) %>% pull()) == 1))
-  expect_true(all(replicate(100, ds %>% summarize(max(x)) %>% pull()) == 100))
+  expect_true(
+    all(
+      replicate(100, ds %>% summarize(min(x)) %>% pull() %>% as.vector()) == 1
+    )
+  )
+  expect_true(
+    all(
+      replicate(100, ds %>% summarize(max(x)) %>% pull() %>% as.vector()) == 100
+    )
+  )
 })
 
 test_that("map_batches", {
@@ -650,6 +658,7 @@ test_that("map_batches", {
       select(int, lgl) %>%
       map_batches(~ record_batch(nrows = .$num_rows)) %>%
       pull(nrows) %>%
+      as.vector() %>%
       sort(),
     c(5, 10)
   )
@@ -1170,7 +1179,8 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", {
   expect_equal(
     ds %>%
       arrange(cyl) %>%
-      pull(cyl),
+      pull(cyl) %>%
+      as.vector(),
     sort(mtcars$cyl)
   )
 
@@ -1188,7 +1198,8 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", {
   expect_equal(
     ds %>%
       arrange(cyl) %>%
-      pull(cyl),
+      pull(cyl) %>%
+      as.vector(),
     sort(mtcars$cyl)
   )
 
@@ -1204,7 +1215,8 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", {
   expect_equal(
     ds %>%
       arrange(cyl) %>%
-      pull(cyl),
+      pull(cyl) %>%
+      as.vector(),
     sort(mtcars$cyl)
   )
 
@@ -1222,7 +1234,8 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", {
   expect_equal(
     ds %>%
       arrange(cyl) %>%
-      pull(cyl),
+      pull(cyl) %>%
+      as.vector(),
     sort(mtcars$cyl)
   )
 
@@ -1256,7 +1269,8 @@ test_that("FileSystemFactoryOptions with HivePartitioning", {
   expect_equal(
     ds %>%
       arrange(cyl) %>%
-      pull(cyl),
+      pull(cyl) %>%
+      as.vector(),
     sort(mtcars$cyl)
   )
 
@@ -1272,7 +1286,8 @@ test_that("FileSystemFactoryOptions with HivePartitioning", {
   expect_equal(
     ds %>%
       arrange(cyl) %>%
-      pull(cyl),
+      pull(cyl) %>%
+      as.vector(),
     sort(mtcars$cyl)
   )
 
@@ -1286,7 +1301,8 @@ test_that("FileSystemFactoryOptions with HivePartitioning", {
   expect_equal(
     ds %>%
       arrange(cyl) %>%
-      pull(cyl),
+      pull(cyl) %>%
+      as.vector(),
     sort(mtcars$cyl)
   )
 
@@ -1302,7 +1318,8 @@ test_that("FileSystemFactoryOptions with HivePartitioning", {
   expect_equal(
     ds %>%
       arrange(cyl) %>%
-      pull(cyl),
+      pull(cyl) %>%
+      as.vector(),
     sort(mtcars$cyl)
   )
 })
diff --git a/r/tests/testthat/test-dplyr-arrange.R b/r/tests/testthat/test-dplyr-arrange.R
index d8afcc5d4a8..e4452991dc7 100644
--- a/r/tests/testthat/test-dplyr-arrange.R
+++ b/r/tests/testthat/test-dplyr-arrange.R
@@ -118,7 +118,9 @@ test_that("arrange() on integer, double, and character columns", {
     .input %>%
       group_by(grp) %>%
       arrange(.by_group = TRUE) %>%
-      pull(grp),
+      ungroup() %>%
+      pull(grp) %>%
+      as.vector(),
     tbl
   )
   compare_dplyr_binding(
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R
index 25a844e0333..2608f9d6545 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -168,7 +168,8 @@ test_that("strptime", {
       mutate(
         x = strptime(x, format = "%m-%d-%Y")
       ) %>%
-      pull(),
+      pull() %>%
+      as.vector(),
     # R's strptime returns POSIXlt (list type)
     as.POSIXct(tstamp),
     ignore_attr = "tzone"
diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R
index c40815df69d..db9a3bb30d0 100644
--- a/r/tests/testthat/test-dplyr-query.R
+++ b/r/tests/testthat/test-dplyr-query.R
@@ -70,22 +70,23 @@ See $.data for the source Arrow object',
 
 test_that("pull", {
   compare_dplyr_binding(
-    .input %>% pull(),
+    .input %>% pull() %>% as.vector(),
     tbl
   )
   compare_dplyr_binding(
-    .input %>% pull(1),
+    .input %>% pull(1) %>% as.vector(),
     tbl
   )
   compare_dplyr_binding(
-    .input %>% pull(chr),
+    .input %>% pull(chr) %>% as.vector(),
     tbl
   )
   compare_dplyr_binding(
     .input %>%
       filter(int > 4) %>%
       rename(strng = chr) %>%
-      pull(strng),
+      pull(strng) %>%
+      as.vector(),
     tbl
   )
 })

From d1af67688fcc5afa455997a0f92ef82144d27643 Mon Sep 17 00:00:00 2001
From: Bryce Mecum <petridish@gmail.com>
Date: Fri, 7 Oct 2022 18:04:58 -0800
Subject: [PATCH 2/4] Remove now-unneeded ungroup() call

No longer needed thanks to https://github.com/apache/arrow/pull/14160/ and @eitsupi's work
---
 r/tests/testthat/test-dplyr-arrange.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/r/tests/testthat/test-dplyr-arrange.R b/r/tests/testthat/test-dplyr-arrange.R
index e4452991dc7..3444e3ace5f 100644
--- a/r/tests/testthat/test-dplyr-arrange.R
+++ b/r/tests/testthat/test-dplyr-arrange.R
@@ -118,7 +118,6 @@ test_that("arrange() on integer, double, and character columns", {
     .input %>%
       group_by(grp) %>%
       arrange(.by_group = TRUE) %>%
-      ungroup() %>%
       pull(grp) %>%
       as.vector(),
     tbl

From c34de522562cbf8b4a3674605cab1eaf8fc114cc Mon Sep 17 00:00:00 2001
From: Bryce Mecum <petridish@gmail.com>
Date: Fri, 7 Oct 2022 18:05:25 -0800
Subject: [PATCH 3/4] Remove @export from pull.ArrowTabular def

---
 r/NAMESPACE         | 1 -
 r/R/arrow-tabular.R | 1 -
 2 files changed, 2 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 5641a665a23..e20e61c0e32 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -344,7 +344,6 @@ export(null)
 export(num_range)
 export(one_of)
 export(open_dataset)
-export(pull.ArrowTabular)
 export(read_csv_arrow)
 export(read_delim_arrow)
 export(read_feather)
diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R
index d580dd98a6e..d011c961428 100644
--- a/r/R/arrow-tabular.R
+++ b/r/R/arrow-tabular.R
@@ -260,7 +260,6 @@ na.omit.ArrowTabular <- function(object, ...) {
 #' @export
 na.exclude.ArrowTabular <- na.omit.ArrowTabular
 
-#' @export
 pull.ArrowTabular <- function(x, var = -1) {
   x[[vars_pull(names(x), !!enquo(var))]]
 }

From 9c8691bb0f430c1e4a0a62003547fa4e57d62d70 Mon Sep 17 00:00:00 2001
From: Bryce Mecum <petridish@gmail.com>
Date: Tue, 11 Oct 2022 14:46:21 -0800
Subject: [PATCH 4/4] Move pull.ArrowTabular to dplyr-collect.R

---
 r/R/arrow-tabular.R | 4 ----
 r/R/dplyr-collect.R | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R
index d011c961428..ae68cc2118f 100644
--- a/r/R/arrow-tabular.R
+++ b/r/R/arrow-tabular.R
@@ -259,7 +259,3 @@ na.omit.ArrowTabular <- function(object, ...) {
 
 #' @export
 na.exclude.ArrowTabular <- na.omit.ArrowTabular
-
-pull.ArrowTabular <- function(x, var = -1) {
-  x[[vars_pull(names(x), !!enquo(var))]]
-}
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index 0515a22e544..deb26859e8b 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -54,6 +54,10 @@ pull.arrow_dplyr_query <- function(.data, var = -1) {
 }
 pull.Dataset <- pull.RecordBatchReader <- pull.arrow_dplyr_query
 
+pull.ArrowTabular <- function(x, var = -1) {
+  x[[vars_pull(names(x), !!enquo(var))]]
+}
+
 restore_dplyr_features <- function(df, query) {
   # An arrow_dplyr_query holds some attributes that Arrow doesn't know about
   # After calling collect(), make sure these features are carried over