From bf60dc230f5634929abed82b1dbe5843e5ecd4fd Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 5 Oct 2022 14:09:58 -0800 Subject: [PATCH 1/4] Change behavior of pull to compute instead of collect Ref https://issues.apache.org/jira/browse/ARROW-17439 --- r/NAMESPACE | 1 + r/R/arrow-tabular.R | 5 +++ r/R/dplyr-collect.R | 4 +- r/tests/testthat/test-dataset-write.R | 4 +- r/tests/testthat/test-dataset.R | 41 ++++++++++++++------ r/tests/testthat/test-dplyr-arrange.R | 4 +- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 +- r/tests/testthat/test-dplyr-query.R | 9 +++-- 8 files changed, 50 insertions(+), 21 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index e20e61c0e32..5641a665a23 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -344,6 +344,7 @@ export(null) export(num_range) export(one_of) export(open_dataset) +export(pull.ArrowTabular) export(read_csv_arrow) export(read_delim_arrow) export(read_feather) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index ae68cc2118f..d580dd98a6e 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -259,3 +259,8 @@ na.omit.ArrowTabular <- function(object, ...) { #' @export na.exclude.ArrowTabular <- na.omit.ArrowTabular + +#' @export +pull.ArrowTabular <- function(x, var = -1) { + x[[vars_pull(names(x), !!enquo(var))]] +} diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 1714af8a509..0515a22e544 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -50,9 +50,9 @@ pull.arrow_dplyr_query <- function(.data, var = -1) { .data <- as_adq(.data) var <- vars_pull(names(.data), !!enquo(var)) .data$selected_columns <- set_names(.data$selected_columns[var], var) - dplyr::collect(.data)[[1]] + dplyr::compute(.data)[[1]] } -pull.Dataset <- pull.ArrowTabular <- pull.RecordBatchReader <- pull.arrow_dplyr_query +pull.Dataset <- pull.RecordBatchReader <- pull.arrow_dplyr_query restore_dplyr_features <- function(df, query) { # An arrow_dplyr_query holds some attributes that Arrow doesn't know about diff --git a/r/tests/testthat/test-dataset-write.R b/r/tests/testthat/test-dataset-write.R index 7a5f861ca57..468d11676bb 100644 --- a/r/tests/testthat/test-dataset-write.R +++ b/r/tests/testthat/test-dataset-write.R @@ -739,7 +739,8 @@ test_that("Dataset min_rows_per_group", { row_group_sizes <- ds %>% map_batches(~ record_batch(nrows = .$num_rows)) %>% - pull(nrows) + pull(nrows) %>% + as.vector() index <- 1 # We expect there to be 3 row groups since 11/5 = 2.2 and 11/4 = 2.75 @@ -778,6 +779,7 @@ test_that("Dataset write max rows per group", { row_group_sizes <- ds %>% map_batches(~ record_batch(nrows = .$num_rows)) %>% pull(nrows) %>% + as.vector() %>% sort() expect_equal(row_group_sizes, c(12, 18)) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index ea3e1365d6b..5e28a3ddd5b 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -67,7 +67,7 @@ test_that("IPC/Feather format data", { # Collecting virtual partition column works expect_equal( - ds %>% arrange(part) %>% pull(part), + ds %>% arrange(part) %>% pull(part) %>% as.vector(), c(rep(3, 10), rep(4, 10)) ) }) @@ -306,7 +306,7 @@ test_that("Simple interface for datasets", { # Collecting virtual partition column works expect_equal( - ds %>% arrange(part) %>% pull(part), + ds %>% arrange(part) %>% pull(part) %>% as.vector(), c(rep(1, 10), rep(2, 10)) ) }) @@ -625,8 +625,16 @@ test_that("scalar aggregates with many batches (ARROW-16904)", { ds <- open_dataset(tf) replicate(100, ds %>% summarize(min(x)) %>% pull()) - expect_true(all(replicate(100, ds %>% summarize(min(x)) %>% pull()) == 1)) - expect_true(all(replicate(100, ds %>% summarize(max(x)) %>% pull()) == 100)) + expect_true( + all( + replicate(100, ds %>% summarize(min(x)) %>% pull() %>% as.vector()) == 1 + ) + ) + expect_true( + all( + replicate(100, ds %>% summarize(max(x)) %>% pull() %>% as.vector()) == 100 + ) + ) }) test_that("map_batches", { @@ -650,6 +658,7 @@ test_that("map_batches", { select(int, lgl) %>% map_batches(~ record_batch(nrows = .$num_rows)) %>% pull(nrows) %>% + as.vector() %>% sort(), c(5, 10) ) @@ -1170,7 +1179,8 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", { expect_equal( ds %>% arrange(cyl) %>% - pull(cyl), + pull(cyl) %>% + as.vector(), sort(mtcars$cyl) ) @@ -1188,7 +1198,8 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", { expect_equal( ds %>% arrange(cyl) %>% - pull(cyl), + pull(cyl) %>% + as.vector(), sort(mtcars$cyl) ) @@ -1204,7 +1215,8 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", { expect_equal( ds %>% arrange(cyl) %>% - pull(cyl), + pull(cyl) %>% + as.vector(), sort(mtcars$cyl) ) @@ -1222,7 +1234,8 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", { expect_equal( ds %>% arrange(cyl) %>% - pull(cyl), + pull(cyl) %>% + as.vector(), sort(mtcars$cyl) ) @@ -1256,7 +1269,8 @@ test_that("FileSystemFactoryOptions with HivePartitioning", { expect_equal( ds %>% arrange(cyl) %>% - pull(cyl), + pull(cyl) %>% + as.vector(), sort(mtcars$cyl) ) @@ -1272,7 +1286,8 @@ test_that("FileSystemFactoryOptions with HivePartitioning", { expect_equal( ds %>% arrange(cyl) %>% - pull(cyl), + pull(cyl) %>% + as.vector(), sort(mtcars$cyl) ) @@ -1286,7 +1301,8 @@ test_that("FileSystemFactoryOptions with HivePartitioning", { expect_equal( ds %>% arrange(cyl) %>% - pull(cyl), + pull(cyl) %>% + as.vector(), sort(mtcars$cyl) ) @@ -1302,7 +1318,8 @@ test_that("FileSystemFactoryOptions with HivePartitioning", { expect_equal( ds %>% arrange(cyl) %>% - pull(cyl), + pull(cyl) %>% + as.vector(), sort(mtcars$cyl) ) }) diff --git a/r/tests/testthat/test-dplyr-arrange.R b/r/tests/testthat/test-dplyr-arrange.R index d8afcc5d4a8..e4452991dc7 100644 --- a/r/tests/testthat/test-dplyr-arrange.R +++ b/r/tests/testthat/test-dplyr-arrange.R @@ -118,7 +118,9 @@ test_that("arrange() on integer, double, and character columns", { .input %>% group_by(grp) %>% arrange(.by_group = TRUE) %>% - pull(grp), + ungroup() %>% + pull(grp) %>% + as.vector(), tbl ) compare_dplyr_binding( diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 25a844e0333..2608f9d6545 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -168,7 +168,8 @@ test_that("strptime", { mutate( x = strptime(x, format = "%m-%d-%Y") ) %>% - pull(), + pull() %>% + as.vector(), # R's strptime returns POSIXlt (list type) as.POSIXct(tstamp), ignore_attr = "tzone" diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index c40815df69d..db9a3bb30d0 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -70,22 +70,23 @@ See $.data for the source Arrow object', test_that("pull", { compare_dplyr_binding( - .input %>% pull(), + .input %>% pull() %>% as.vector(), tbl ) compare_dplyr_binding( - .input %>% pull(1), + .input %>% pull(1) %>% as.vector(), tbl ) compare_dplyr_binding( - .input %>% pull(chr), + .input %>% pull(chr) %>% as.vector(), tbl ) compare_dplyr_binding( .input %>% filter(int > 4) %>% rename(strng = chr) %>% - pull(strng), + pull(strng) %>% + as.vector(), tbl ) }) From d1af67688fcc5afa455997a0f92ef82144d27643 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 7 Oct 2022 18:04:58 -0800 Subject: [PATCH 2/4] Remove now-unneeded ungroup() call No longer needed thanks to https://github.com/apache/arrow/pull/14160/ and @eitsupi's work --- r/tests/testthat/test-dplyr-arrange.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-arrange.R b/r/tests/testthat/test-dplyr-arrange.R index e4452991dc7..3444e3ace5f 100644 --- a/r/tests/testthat/test-dplyr-arrange.R +++ b/r/tests/testthat/test-dplyr-arrange.R @@ -118,7 +118,6 @@ test_that("arrange() on integer, double, and character columns", { .input %>% group_by(grp) %>% arrange(.by_group = TRUE) %>% - ungroup() %>% pull(grp) %>% as.vector(), tbl From c34de522562cbf8b4a3674605cab1eaf8fc114cc Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 7 Oct 2022 18:05:25 -0800 Subject: [PATCH 3/4] Remove @export from pull.ArrowTabular def --- r/NAMESPACE | 1 - r/R/arrow-tabular.R | 1 - 2 files changed, 2 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 5641a665a23..e20e61c0e32 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -344,7 +344,6 @@ export(null) export(num_range) export(one_of) export(open_dataset) -export(pull.ArrowTabular) export(read_csv_arrow) export(read_delim_arrow) export(read_feather) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index d580dd98a6e..d011c961428 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -260,7 +260,6 @@ na.omit.ArrowTabular <- function(object, ...) { #' @export na.exclude.ArrowTabular <- na.omit.ArrowTabular -#' @export pull.ArrowTabular <- function(x, var = -1) { x[[vars_pull(names(x), !!enquo(var))]] } From 9c8691bb0f430c1e4a0a62003547fa4e57d62d70 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 11 Oct 2022 14:46:21 -0800 Subject: [PATCH 4/4] Move pull.ArrowTabular to dplyr-collect.R --- r/R/arrow-tabular.R | 4 ---- r/R/dplyr-collect.R | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index d011c961428..ae68cc2118f 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -259,7 +259,3 @@ na.omit.ArrowTabular <- function(object, ...) { #' @export na.exclude.ArrowTabular <- na.omit.ArrowTabular - -pull.ArrowTabular <- function(x, var = -1) { - x[[vars_pull(names(x), !!enquo(var))]] -} diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 0515a22e544..deb26859e8b 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -54,6 +54,10 @@ pull.arrow_dplyr_query <- function(.data, var = -1) { } pull.Dataset <- pull.RecordBatchReader <- pull.arrow_dplyr_query +pull.ArrowTabular <- function(x, var = -1) { + x[[vars_pull(names(x), !!enquo(var))]] +} + restore_dplyr_features <- function(df, query) { # An arrow_dplyr_query holds some attributes that Arrow doesn't know about # After calling collect(), make sure these features are carried over