From 4493c6c480b2d7a47c48511a107aa2db6dd451f9 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal.p.richardson@gmail.com>
Date: Mon, 12 Aug 2024 09:39:10 -0700
Subject: [PATCH] [R] Fix summarize() performance regression (pushdown)

---
 r/R/dplyr-summarize.R | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index f4fda0f13aa..a9ad750de7c 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -43,6 +43,15 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) {
     hash = length(.data$group_by_vars) > 0
   )
 
+  # Project to only the columns summarize() needs. Where possible, this pushes
+  # the selection down into the SourceNode, saving wasted processing on unused
+  # columns (GH-43627). intersect() drops any names that are not columns.
+  vars_to_keep <- unique(c(
+    unlist(lapply(exprs, all.vars)), # vars referenced in summarize
+    dplyr::group_vars(.data) # vars needed for grouping
+  ))
+  .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data)))
+
   # nolint start
   # summarize() is complicated because you can do a mixture of scalar operations
   # and aggregations, but that's not how Acero works. For example, for us to do