Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions r/R/dplyr-collect.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,17 +62,18 @@ restore_dplyr_features <- function(df, query) {
# An arrow_dplyr_query holds some attributes that Arrow doesn't know about
# After calling collect(), make sure these features are carried over

if (length(query$group_by_vars) > 0) {
# Preserve groupings, if present
if (length(dplyr::group_vars(query))) {
if (is.data.frame(df)) {
df <- dplyr::grouped_df(
# Preserve groupings, if present
df <- dplyr::group_by(
df,
dplyr::group_vars(query),
drop = dplyr::group_by_drop_default(query)
!!!syms(dplyr::group_vars(query)),
.drop = dplyr::group_by_drop_default(query),
.add = FALSE
)
} else {
# This is a Table, via compute() or collect(as_data_frame = FALSE)
df$metadata$r$attributes$.group_vars <- query$group_by_vars
df$metadata$r$attributes$.group_vars <- dplyr::group_vars(query)
}
}
df
Expand Down
8 changes: 7 additions & 1 deletion r/R/dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ arrow_dplyr_query <- function(.data) {
if (inherits(.data, "data.frame")) {
.data <- Table$create(.data)
}
# ARROW-17737: If .data is a Table, remove groups from metadata
# (we've already grabbed the groups above)
if (inherits(.data, "ArrowTabular")) {
.data <- ungroup.ArrowTabular(.data)
}

# Evaluating expressions on a dataset with duplicated fieldnames will error
dupes <- duplicated(names(.data))
if (any(dupes)) {
Expand Down Expand Up @@ -182,7 +188,7 @@ dim.arrow_dplyr_query <- function(x) {
# Query on in-memory Table, so evaluate the filter
# Don't need any columns
x <- select.arrow_dplyr_query(x, NULL)
rows <- nrow(compute.arrow_dplyr_query(x))
rows <- nrow(as_arrow_table(x))
Copy link
Contributor Author

@eitsupi eitsupi Oct 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is because manipulating metadata for a table with no rows will cause the size to be updated to 0 x 0.

mtcars |> arrow::arrow_table() |> dplyr::select(NULL) |> arrow::as_arrow_table()
#> Table
#> 32 rows x 0 columns
#>
#>
#> See $metadata for additional Schema metadata
mtcars |> arrow::arrow_table() |> dplyr::select(NULL) |> arrow::as_arrow_table() |> dplyr::ungroup()
#> Table
#> 0 rows x 0 columns
#>
#>
#> See $metadata for additional Schema metadata

Created on 2022-10-07 with reprex v2.0.2

I don't know if this (handling of tables with no rows) is a problem.
A table with 0 rows and multiple columns appears to be quite exceptional, since creating a table from a data frame with no rows results in 0 x 0.

mtcars |> dplyr::select(NULL) |> arrow::arrow_table()
#> Table
#> 0 rows x 0 columns
#>
#>
#> See $metadata for additional Schema metadata

Created on 2022-10-07 with reprex v2.0.2

}
c(rows, cols)
}
Expand Down
33 changes: 33 additions & 0 deletions r/tests/testthat/test-dplyr-group-by.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,25 @@ test_that("ungroup", {
)
})

test_that("Groups before conversion to a Table must not be restored after collect() (ARROW-17737)", {
compare_dplyr_binding(
.input %>%
group_by(chr, .add = FALSE) %>%
ungroup() %>%
collect(),
tbl %>%
group_by(int)
)
compare_dplyr_binding(
.input %>%
group_by(chr, .add = TRUE) %>%
ungroup() %>%
collect(),
tbl %>%
group_by(int)
)
})

test_that("group_by then rename", {
compare_dplyr_binding(
.input %>%
Expand Down Expand Up @@ -196,6 +215,20 @@ test_that("group_by() with .add", {
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
group_by(.add = FALSE) %>%
collect(),
tbl %>%
group_by(dbl2)
)
compare_dplyr_binding(
.input %>%
group_by(.add = TRUE) %>%
collect(),
tbl %>%
group_by(dbl2)
)
compare_dplyr_binding(
.input %>%
group_by(chr, .add = FALSE) %>%
Expand Down