Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions r/R/dplyr-summarize.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,6 @@ summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query

# This is the Arrow summarize implementation
do_arrow_summarize <- function(.data, ..., .groups = NULL) {
if (!is.null(.groups)) {
# ARROW-13550
abort("`summarize()` with `.groups` argument not supported in Arrow")
}
exprs <- ensure_named_exprs(quos(...))

# Create a stateful environment for recording our evaluated expressions
Expand Down Expand Up @@ -97,6 +93,35 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) {
ctx$post_mutate
)[c(.data$group_by_vars, names(exprs))]
}

# Handle .groups argument
if (length(.data$group_by_vars)) {
if (is.null(.groups)) {
# dplyr docs say:
# When ‘.groups’ is not specified, it is chosen based on the
# number of rows of the results:
# • If all the results have 1 row, you get "drop_last".
# • If the number of rows varies, you get "keep".
#
# But we don't support anything that returns multiple rows now
.groups <- "drop_last"
} else {
assert_that(is.string(.groups))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the length of .groups is > 1, it's caught here, which raises an error and drops to the R level, which also raises an error.

However, the error message from dplyr isn't quite fit for our purposes - it mentions that rowwise is a valid value for .groups, which is true in dplyr but not in arrow. Is it worth catching this case ourselves and providing our own error message, which doesn't mention rowwise?

[Edited after reading the tests below] Or is this something worth living with, as the alternative would require significant work to implement for little actual benefit?

mtcars %>% 
   Table$create() %>%
   group_by(mpg) %>% 
   summarise(.groups = c("drop", "keep")) %>%
   collect()
# Warning: Error : .groups is not a string (a length one character vector).; pulling data into R
# Error: `.groups` can't be <chr>
# ℹ Possible values are NULL (default), "drop_last", "drop", "keep", and "rowwise"
# Run `rlang::last_error()` to see where the error occurred.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm open to suggestion, and I'll think some more about whether this is a problem we have anywhere else, but my thinking for leaving it as it is: (1) when querying on a dataset, it will just error and tell you to pull into R (where rowwise would work); (2) it's an experimental feature in dplyr so IDK how much we should worry about the smoothest experience when you go off the default.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, that makes a lot of sense and good point about it being an experimental feature - sounds like something to leave for now but revisit if it comes up again elsewhere.

}
if (.groups == "drop_last") {
out$group_by_vars <- head(.data$group_by_vars, -1)
} else if (.groups == "keep") {
out$group_by_vars <- .data$group_by_vars
} else if (.groups == "rowwise") {
stop(arrow_not_supported('.groups = "rowwise"'))
} else if (.groups != "drop") {
# Drop means don't group by anything so there's nothing to do.
# Anything else is invalid
stop(paste("Invalid .groups argument:", .groups))
}
# TODO: shouldn't we be doing something with `drop_empty_groups` in summarize? (ARROW-14044)
out$drop_empty_groups <- .data$drop_empty_groups
}
out
}

Expand Down
4 changes: 2 additions & 2 deletions r/tests/testthat/helper-expectation.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ expect_dplyr_equal <- function(expr,
),
warning
)
expect_equivalent(via_batch, expected, ...)
expect_equal(via_batch, expected, ...)
} else {
skip_msg <- c(skip_msg, skip_record_batch)
}
Expand All @@ -118,7 +118,7 @@ expect_dplyr_equal <- function(expr,
),
warning
)
expect_equivalent(via_table, expected, ...)
expect_equal(via_table, expected, ...)
} else {
skip_msg <- c(skip_msg, skip_table)
}
Expand Down
10 changes: 6 additions & 4 deletions r/tests/testthat/test-chunked-array.R
Original file line number Diff line number Diff line change
Expand Up @@ -203,10 +203,12 @@ test_that("ChunkedArray supports difftime", {
})

test_that("ChunkedArray supports empty arrays (ARROW-13761)", {
types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
uint64(), float32(), float64(), timestamp("ns"), binary(),
large_binary(), fixed_size_binary(32), date32(), date64(),
decimal(4, 2))
types <- c(
int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
uint64(), float32(), float64(), timestamp("ns"), binary(),
large_binary(), fixed_size_binary(32), date32(), date64(),
decimal(4, 2)
)

empty_filter <- ChunkedArray$create(type = bool())
for (type in types) {
Expand Down
2 changes: 0 additions & 2 deletions r/tests/testthat/test-compute-no-bindings.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ test_that("non-bound compute kernels using ModeOptions", {
})

test_that("non-bound compute kernels using PartitionNthOptions", {

result <- call_function(
"partition_nth_indices",
Array$create(c(11:20)),
Expand All @@ -131,7 +130,6 @@ test_that("non-bound compute kernels using PartitionNthOptions", {
# (depends on C++ standard library implementation)
expect_true(all(as.vector(result[1:3]) < 3))
expect_true(all(as.vector(result[4:10]) >= 3))

})


Expand Down
30 changes: 22 additions & 8 deletions r/tests/testthat/test-dplyr-string-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -410,49 +410,63 @@ test_that("strsplit and str_split", {
input %>%
mutate(x = strsplit(x, "and")) %>%
collect(),
df
df,
# Pass check.attributes = FALSE through to expect_equal
# (which gives you expect_equivalent() behavior).
# This is because the vctr that comes back from arrow (ListArray)
# has type information in it, but it's just a bare list from R/dplyr.
# Note also that whenever we bump up to testthat 3rd edition (ARROW-12871),
# the parameter is called `ignore_attr = TRUE`
check.attributes = FALSE
)
expect_dplyr_equal(
input %>%
mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
collect(),
df
df,
check.attributes = FALSE
)
expect_dplyr_equal(
input %>%
mutate(x = strsplit(x, " +and +")) %>%
collect(),
df
df,
check.attributes = FALSE
)
expect_dplyr_equal(
input %>%
mutate(x = str_split(x, "and")) %>%
collect(),
df
df,
check.attributes = FALSE
)
expect_dplyr_equal(
input %>%
mutate(x = str_split(x, "and", n = 2)) %>%
collect(),
df
df,
check.attributes = FALSE
)
expect_dplyr_equal(
input %>%
mutate(x = str_split(x, fixed("and"), n = 2)) %>%
collect(),
df
df,
check.attributes = FALSE
)
expect_dplyr_equal(
input %>%
mutate(x = str_split(x, regex("and"), n = 2)) %>%
collect(),
df
df,
check.attributes = FALSE
)
expect_dplyr_equal(
input %>%
mutate(x = str_split(x, "Foo|bar", n = 2)) %>%
collect(),
df
df,
check.attributes = FALSE
)
})

Expand Down
53 changes: 52 additions & 1 deletion r/tests/testthat/test-dplyr-summarize.R
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ test_that("Expressions on aggregations", {
)

# Aggregate on an aggregate (trivial but dplyr allows)
skip("Not supported")
skip("Aggregate on an aggregate not supported")
expect_dplyr_equal(
input %>%
group_by(some_grouping) %>%
Expand Down Expand Up @@ -468,3 +468,54 @@ test_that("Not (yet) supported: implicit join", {
warning = "Expression dbl - int not supported in Arrow; pulling data into R"
)
})

test_that(".groups argument", {
expect_dplyr_equal(
input %>%
group_by(some_grouping, int < 6) %>%
summarize(count = n()) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
group_by(some_grouping, int < 6) %>%
summarize(count = n(), .groups = "drop_last") %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
group_by(some_grouping, int < 6) %>%
summarize(count = n(), .groups = "keep") %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
group_by(some_grouping, int < 6) %>%
summarize(count = n(), .groups = "drop") %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
group_by(some_grouping, int < 6) %>%
summarize(count = n(), .groups = "rowwise") %>%
collect(),
tbl,
warning = TRUE
)

# abandon_ship() raises the warning, then dplyr itself errors
# This isn't ideal but it's fine and won't be an issue on Datasets
expect_error(
expect_warning(
Table$create(tbl) %>%
group_by(some_grouping, int < 6) %>%
summarize(count = n(), .groups = "NOTVALID"),
"Invalid .groups argument"
),
"NOTVALID"
)
})