From 4493c6c480b2d7a47c48511a107aa2db6dd451f9 Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Mon, 12 Aug 2024 09:39:10 -0700
Subject: [PATCH] [R] Fix summarize() performance regression (pushdown)

---
 r/R/dplyr-summarize.R | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index f4fda0f13aa..a9ad750de7c 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -43,6 +43,15 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) {
     hash = length(.data$group_by_vars) > 0
   )
 
+  # Do a projection here to keep only the columns we need in summarize().
+  # If possible, this will push down the column selection into the SourceNode,
+  # saving lots of wasted processing for columns we don't need. (GH-43627)
+  vars_to_keep <- unique(c(
+    unlist(lapply(exprs, all.vars)), # vars referenced in summarize
+    dplyr::group_vars(.data) # vars needed for grouping
+  ))
+  .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data)))
+
   # nolint start
   # summarize() is complicated because you can do a mixture of scalar operations
   # and aggregations, but that's not how Acero works. For example, for us to do