Skip to content
9 changes: 5 additions & 4 deletions r/R/dplyr-collect.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,7 @@ restore_dplyr_features <- function(df, query) {
)
} else {
# This is a Table, via compute() or collect(as_data_frame = FALSE)
df <- as_adq(df)
df$group_by_vars <- query$group_by_vars
df$drop_empty_groups <- query$drop_empty_groups
df$metadata$r$attributes$.group_vars <- query$group_by_vars
}
}
df
Expand All @@ -80,7 +78,10 @@ collapse.arrow_dplyr_query <- function(x, ...) {
# Figure out what schema will result from the query
x$schema <- implicit_schema(x)
# Nest inside a new arrow_dplyr_query (and keep groups)
restore_dplyr_features(arrow_dplyr_query(x), x)
out <- arrow_dplyr_query(x)
out$group_by_vars <- x$group_by_vars
out$drop_empty_groups <- x$drop_empty_groups
out
}
collapse.Dataset <- collapse.ArrowTabular <- collapse.RecordBatchReader <- function(x, ...) {
arrow_dplyr_query(x)
Expand Down
10 changes: 3 additions & 7 deletions r/tests/testthat/test-dataset-dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -284,13 +284,9 @@ test_that("compute()/collect(as_data_frame=FALSE)", {
group_by(fct) %>%
compute()

# the group_by() prevents compute() from returning a Table...
expect_s3_class(tab5, "arrow_dplyr_query")

# ... but $.data is a Table...
expect_r6_class(tab5$.data, "Table")
# ... and the mutate() was evaluated
expect_true("negint" %in% names(tab5$.data))
expect_r6_class(tab5, "Table")
# mutate() was evaluated
expect_true("negint" %in% names(tab5))
})

test_that("head/tail on query on dataset", {
Expand Down
44 changes: 42 additions & 2 deletions r/tests/testthat/test-dplyr-query.R
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ test_that("collect(as_data_frame=FALSE)", {
filter(int > 5) %>%
group_by(int) %>%
collect(as_data_frame = FALSE)
expect_s3_class(b4, "arrow_dplyr_query")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need a test (and matching code change) that a Table with groups in the $metadata prints that it is grouped. Currently, the print method does not reveal that you have groupings, even though they get restored when you as.data.frame():

> tab <- arrow_table(mtcars)
> tab$metadata$r$attributes$.group_vars <- "cyl"
> tab
Table
32 rows x 11 columns
$mpg <double>
$cyl <double>
$disp <double>
$hp <double>
$drat <double>
$wt <double>
$qsec <double>
$vs <double>
$am <double>
$gear <double>
$carb <double>

See $metadata for additional Schema metadata

I wonder what else is lost or concealed by this change. This feature of reading groups from $metadata$r$attributes$.group_vars was a bit of a hack and not something we're generally relying on, so I don't think it's a simple change. Or at least, it's not something we can just rely that the test suite already covers everything we care about. (Search for ".group_vars" in the code, it's not touched in many places, and only in one test.)

Not a reason not to do it, just need to be thorough.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your review.

I think we need a test (and matching code change) that a Table with groups in the $metadata prints that it is grouped. Currently, the print method does not reveal that you have groupings, even though they get restored when you as.data.frame():

I agree that the print results need improvement.
But would it be better to create a separate ticket and do it there?
This occurs even in arrow 9.0.0, regardless of the behavior with compute.

mtcars |> dplyr::group_by(cyl) |> arrow::arrow_table()
#> Table
#> 32 rows x 11 columns
#> $mpg <double>
#> $cyl <double>
#> $disp <double>
#> $hp <double>
#> $drat <double>
#> $wt <double>
#> $qsec <double>
#> $vs <double>
#> $am <double>
#> $gear <double>
#> $carb <double>
#>
#> See $metadata for additional Schema metadata

Created on 2022-09-20 with reprex v2.0.2

This feature of reading groups from $metadata$r$attributes$.group_vars was a bit of a hack and not something we're generally relying on, so I don't think it's a simple change. Or at least, it's not something we can just rely that the test suite already covers everything we care about.

Since dplyr::group_vars reads this field, isn't this function generally used?

expect_r6_class(b4, "Table")
expect_equal(
as.data.frame(b4),
expected %>%
Expand Down Expand Up @@ -156,7 +156,7 @@ test_that("compute()", {
filter(int > 5) %>%
group_by(int) %>%
compute()
expect_s3_class(b4, "arrow_dplyr_query")
expect_r6_class(b4, "Table")
expect_equal(
as.data.frame(b4),
expected %>%
Expand Down Expand Up @@ -579,3 +579,43 @@ test_that("needs_projection unit tests", {
tab %>% relocate(lgl)
))
})

test_that("compute() on a grouped query returns a Table with groups in metadata", {
tab1 <- tbl %>%
arrow_table() %>%
group_by(int) %>%
compute()
expect_r6_class(tab1, "Table")
expect_equal(
as.data.frame(tab1),
tbl %>%
group_by(int)
)
expect_equal(
collect(tab1),
tbl %>%
group_by(int)
)
})

test_that("collect() is identical to compute() %>% collect()", {
tab1 <- tbl %>%
arrow_table()
adq1 <- tab1 %>%
group_by(int)

expect_equal(
tab1 %>%
compute() %>%
collect(),
tab1 %>%
collect()
)
expect_equal(
adq1 %>%
compute() %>%
collect(),
adq1 %>%
collect()
)
})