From b678ac49f0bc8ccda570dd76a92b747c10f349c5 Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 27 Aug 2021 11:42:51 +0100
Subject: [PATCH 01/19] Rebase

---
 r/R/dplyr-functions.R                   | 10 ++++++++++
 r/src/compute.cpp                       |  9 ++++++++-
 r/tests/testthat/test-dplyr-summarize.R |  1 +
 3 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index 765438ac908..27d6d3a9e33 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -889,6 +889,16 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) {
     options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof)
   )
 }
+
+# na.rm here doesn't work yet - see ARROW-13782
+agg_funcs$median <- function(x, na.rm = FALSE) {
+  list(
+    fun = "tdigest",
+    data = x,
+    options = list(na.rm = na.rm, q = 0.5)
+  )
+}
+
 agg_funcs$n_distinct <- function(x, na.rm = FALSE) {
   list(
     fun = "count_distinct",
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 8268fd10eee..61eb3575f79 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -186,7 +186,14 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
     }
     return out;
   }
-
+  
+  if (func_name == "tdigest" || func_name == "hash_tdigest") {
+    using Options = arrow::compute::TDigestOptions;
+    auto out = std::make_shared<Options>(Options::Defaults());
+    out->q = cpp11::as_cpp<std::vector<double>>(options["q"]);
+    return out;
+  }
+  
   if (func_name == "count") {
     using Options = arrow::compute::CountOptions;
     auto out = std::make_shared<Options>(Options::Defaults());
diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index c74ed6aa938..fef84420aad 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -621,4 +621,5 @@ test_that(".groups argument", {
     ),
     "NOTVALID"
   )
+
 })

From 08c515ab76d08b60c9baec75c8396be4618914ad Mon Sep 17 00:00:00 2001
From: Nic Crane <thisisnic@gmail.com>
Date: Fri, 27 Aug 2021 12:04:33 +0100
Subject: [PATCH 02/19] Run linter

---
 r/src/compute.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 61eb3575f79..16f7ae83a5e 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -186,14 +186,14 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
     }
     return out;
   }
-  
+
   if (func_name == "tdigest" || func_name == "hash_tdigest") {
     using Options = arrow::compute::TDigestOptions;
     auto out = std::make_shared<Options>(Options::Defaults());
     out->q = cpp11::as_cpp<std::vector<double>>(options["q"]);
     return out;
   }
-  
+
   if (func_name == "count") {
     using Options = arrow::compute::CountOptions;
     auto out = std::make_shared<Options>(Options::Defaults());

From a2717b7b3d489d59c90c1289dc74fe64164cf0e7 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 16 Sep 2021 21:03:31 -0400
Subject: [PATCH 03/19] Pass na.rm --> skip_nulls

---
 r/R/dplyr-functions.R | 3 +--
 r/src/compute.cpp     | 7 ++++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index 27d6d3a9e33..abcbae57664 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -890,12 +890,11 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) {
   )
 }
 
-# na.rm here doesn't work yet - see ARROW-13782
 agg_funcs$median <- function(x, na.rm = FALSE) {
   list(
     fun = "tdigest",
     data = x,
-    options = list(na.rm = na.rm, q = 0.5)
+    options = list(skip_nulls = na.rm, q = 0.5)
   )
 }
 
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 16f7ae83a5e..830d164376b 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -190,7 +190,12 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
   if (func_name == "tdigest" || func_name == "hash_tdigest") {
     using Options = arrow::compute::TDigestOptions;
     auto out = std::make_shared<Options>(Options::Defaults());
-    out->q = cpp11::as_cpp<std::vector<double>>(options["q"]);
+    if (!Rf_isNull(options["q"])) {
+      out->q = cpp11::as_cpp<std::vector<double>>(options["q"]);
+    }
+    if (!Rf_isNull(options["skip_nulls"])) {
+      out->skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
+    }
     return out;
   }
 

From 602bbcb6b43f77150be7807a9e12a1a691e5d77c Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 16 Sep 2021 21:45:31 -0400
Subject: [PATCH 04/19] Add TODOs

---
 r/R/dplyr-functions.R | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index abcbae57664..9014c4fe794 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -891,6 +891,13 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) {
 }
 
 agg_funcs$median <- function(x, na.rm = FALSE) {
+
+  # TODO: after ARROW-12669 is merged, use the list_element function
+  # to unnest the ListArray returned by tdigest.
+
+  # TODO: issue a warning (only once per session if possible) saying
+  # that this returns only an approximate median
+
   list(
     fun = "tdigest",
     data = x,

From 069bcc35f4d842e49077aa21d9488001ae2ebf0b Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 16 Sep 2021 22:36:55 -0400
Subject: [PATCH 05/19] Warn once per session that median is approx

---
 r/NAMESPACE                             |  1 +
 r/R/arrow-package.R                     |  2 +-
 r/R/dplyr-functions.R                   |  3 +--
 r/R/dplyr-summarize.R                   | 10 ++++++++++
 r/tests/testthat/test-dplyr-summarize.R |  6 ++++--
 5 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 61ca5d8fdc4..f89a7352ec9 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -323,6 +323,7 @@ importFrom(rlang,is_bare_character)
 importFrom(rlang,is_character)
 importFrom(rlang,is_false)
 importFrom(rlang,is_integerish)
+importFrom(rlang,is_interactive)
 importFrom(rlang,is_quosure)
 importFrom(rlang,list2)
 importFrom(rlang,new_data_mask)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index e2d3ec18846..ec9c0500494 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -21,7 +21,7 @@
 #' @importFrom assertthat assert_that is.string
 #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos
 #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind as_label set_names exec
-#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2
+#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
 #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
 #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
 #' @useDynLib arrow, .registration = TRUE
diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index 9014c4fe794..728d4fdc41e 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -895,8 +895,7 @@ agg_funcs$median <- function(x, na.rm = FALSE) {
   # TODO: after ARROW-12669 is merged, use the list_element function
   # to unnest the ListArray returned by tdigest.
 
-  # TODO: issue a warning (only once per session if possible) saying
-  # that this returns only an approximate median
+  # TODO: Bind to the Arrow function that returns an exact median (ARROW-14021)
 
   list(
     fun = "tdigest",
diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index beb18e82039..641603c05f4 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -161,6 +161,16 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) {
     return()
   }
 
+  # TODO: Remove this warning after median() returns an exact median
+  # (ARROW-14021)
+  if ("median" %in% funs_in_expr) {
+    warn(
+      "median() currently returns an approximate median in Arrow",
+      .frequency = ifelse(is_interactive(), "once", "always"),
+      .frequency_id = "arrow.median.approximate"
+    )
+  }
+
   # Start inspecting the expr to see what aggregations it involves
   agg_funs <- names(agg_funcs)
   outer_agg <- funs_in_expr[1] %in% agg_funs
diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index fef84420aad..881215abc68 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -578,7 +578,8 @@ test_that(".groups argument", {
       group_by(some_grouping, int < 6) %>%
       summarize(count = n()) %>%
       collect(),
-    tbl
+    tbl,
+    warning = "median\\(\\) currently returns an approximate median in Arrow"
   )
   expect_dplyr_equal(
     input %>%
@@ -592,7 +593,8 @@ test_that(".groups argument", {
       group_by(some_grouping, int < 6) %>%
       summarize(count = n(), .groups = "keep") %>%
       collect(),
-    tbl
+    tbl,
+    warning = "median\\(\\) currently returns an approximate median in Arrow"
   )
   expect_dplyr_equal(
     input %>%

From c2347dbfb59c45dcbe053a5fa33f72849b99b582 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Thu, 16 Sep 2021 23:57:28 -0400
Subject: [PATCH 06/19] median(...) --> arrow_list_element(median(...), 0)

---
 r/R/dplyr-functions.R |  5 -----
 r/R/dplyr-summarize.R | 16 ++++++++++++++++
 r/R/util.R            |  7 +++++++
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index 728d4fdc41e..168e855a978 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -891,12 +891,7 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) {
 }
 
 agg_funcs$median <- function(x, na.rm = FALSE) {
-
-  # TODO: after ARROW-12669 is merged, use the list_element function
-  # to unnest the ListArray returned by tdigest.
-
   # TODO: Bind to the Arrow function that returns an exact median (ARROW-14021)
-
   list(
     fun = "tdigest",
     data = x,
diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index 641603c05f4..6e0757aea06 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -164,6 +164,7 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) {
   # TODO: Remove this warning after median() returns an exact median
   # (ARROW-14021)
   if ("median" %in% funs_in_expr) {
+    expr <- wrap_median(expr)
     warn(
       "median() currently returns an approximate median in Arrow",
       .frequency = ifelse(is_interactive(), "once", "always"),
@@ -261,3 +262,18 @@ extract_aggregations <- function(expr, ctx) {
   }
   expr
 }
+
+# This function recurses through expr and wraps each call to median() with a
+# call to arrow_list_element()
+# TODO: test that this works after ARROW-12669 is merged
+wrap_median <- function(expr) {
+  if (length(expr) == 1) {
+    return(expr)
+  } else {
+    if (is.call(expr) && expr[[1]] == quote(median)) {
+      return(str2lang(paste0("arrow_list_element(", deparse1(expr),", 0)")))
+    } else {
+      return(as.call(lapply(expr, wrap_median)))
+    }
+  }
+}
diff --git a/r/R/util.R b/r/R/util.R
index 94d3a78782f..918dba07eae 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -22,6 +22,13 @@ if (!exists("deparse1")) {
   }
 }
 
+# for compatibility with R versions earlier than 3.6.0
+if (!exists("str2lang")) {
+  str2lang <- function(s) {
+    parse(text = s, keep.source = FALSE)[[1]]
+  }
+}
+
 oxford_paste <- function(x, conjunction = "and", quote = TRUE) {
   if (quote && is.character(x)) {
     x <- paste0('"', x, '"')

From 2f5efae1af1070a6881674a62599d220a0630a62 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 17 Sep 2021 00:00:23 -0400
Subject: [PATCH 07/19] Move comment

---
 r/R/dplyr-summarize.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index 6e0757aea06..1ff377b771c 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -161,10 +161,10 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) {
     return()
   }
 
-  # TODO: Remove this warning after median() returns an exact median
-  # (ARROW-14021)
   if ("median" %in% funs_in_expr) {
     expr <- wrap_median(expr)
+    # TODO: Remove this warning after median() returns an exact median
+    # (ARROW-14021)
     warn(
       "median() currently returns an approximate median in Arrow",
       .frequency = ifelse(is_interactive(), "once", "always"),

From cc63551a5e08043a6db7c4faad62b2237d7c2194 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 17 Sep 2021 10:39:08 -0400
Subject: [PATCH 08/19] Make the linter happy

---
 r/R/dplyr-summarize.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index 1ff377b771c..27681322b44 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -271,7 +271,7 @@ wrap_median <- function(expr) {
     return(expr)
   } else {
     if (is.call(expr) && expr[[1]] == quote(median)) {
-      return(str2lang(paste0("arrow_list_element(", deparse1(expr),", 0)")))
+      return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0)")))
     } else {
       return(as.call(lapply(expr, wrap_median)))
     }

From 4a75d8ba7607f2e5d8120214311d37e230800088 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Mon, 20 Sep 2021 22:20:00 -0400
Subject: [PATCH 09/19] Handle tdigest output differently for hash vs. no hash

---
 r/R/dplyr-collect.R                     |  3 +-
 r/R/dplyr-functions.R                   |  8 +++-
 r/R/dplyr-summarize.R                   | 23 +++++++----
 r/tests/testthat/test-dplyr-summarize.R | 53 ++++++++++++++++++++++---
 4 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index 8a5488bf599..06914abe072 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -96,9 +96,10 @@ implicit_schema <- function(.data) {
     #   and they get projected to this order after aggregation)
     # * Infer the output types from the aggregations
     group_fields <- new_fields[.data$group_by_vars]
+    hash <- length(.data$group_by_vars) > 0
     agg_fields <- imap(
       new_fields[setdiff(names(new_fields), .data$group_by_vars)],
-      ~ output_type(.data$aggregations[[.y]][["fun"]], .x)
+      ~ output_type(.data$aggregations[[.y]][["fun"]], .x, hash)
     )
     new_fields <- c(group_fields, agg_fields)
   }
diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index 168e855a978..e4041c5abc5 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -936,7 +936,7 @@ agg_funcs$max <- function(..., na.rm = FALSE) {
   )
 }
 
-output_type <- function(fun, input_type) {
+output_type <- function(fun, input_type, hash) {
   # These are quick and dirty heuristics.
   if (fun %in% c("any", "all")) {
     bool()
@@ -945,6 +945,12 @@ output_type <- function(fun, input_type) {
     input_type
   } else if (fun %in% c("mean", "stddev", "variance")) {
     float64()
+  } else if (fun %in% "tdigest") {
+    if (hash) {
+      fixed_size_list_of(float64(), 1L)
+    } else {
+      float64()
+    }
   } else {
     # Just so things don't error, assume the resulting type is the same
     input_type
diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index 27681322b44..d145cc0efb4 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -65,7 +65,12 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) {
   for (i in seq_along(exprs)) {
     # Iterate over the indices and not the names because names may be repeated
     # (which overwrites the previous name)
-    summarize_eval(names(exprs)[i], exprs[[i]], ctx)
+    summarize_eval(
+      names(exprs)[i],
+      exprs[[i]],
+      ctx,
+      length(.data$group_by_vars) > 0
+    )
   }
 
   # Apply the results to the .data object.
@@ -150,7 +155,7 @@ format_aggregation <- function(x) {
 # appropriate combination of (1) aggregations (possibly temporary) and
 # (2) post-aggregation transformations (mutate)
 # The function returns nothing: it assigns into the `ctx` environment
-summarize_eval <- function(name, quosure, ctx, recurse = FALSE) {
+summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) {
   expr <- quo_get_expr(quosure)
   ctx$quo_env <- quo_get_env(quosure)
 
@@ -162,7 +167,7 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) {
   }
 
   if ("median" %in% funs_in_expr) {
-    expr <- wrap_median(expr)
+    expr <- wrap_median(expr, hash)
     # TODO: Remove this warning after median() returns an exact median
     # (ARROW-14021)
     warn(
@@ -170,6 +175,7 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) {
       .frequency = ifelse(is_interactive(), "once", "always"),
       .frequency_id = "arrow.median.approximate"
     )
+    funs_in_expr <- all_funs(expr)
   }
 
   # Start inspecting the expr to see what aggregations it involves
@@ -265,15 +271,18 @@ extract_aggregations <- function(expr, ctx) {
 
 # This function recurses through expr and wraps each call to median() with a
 # call to arrow_list_element()
-# TODO: test that this works after ARROW-12669 is merged
-wrap_median <- function(expr) {
+wrap_median <- function(expr, hash) {
   if (length(expr) == 1) {
     return(expr)
   } else {
     if (is.call(expr) && expr[[1]] == quote(median)) {
-      return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0)")))
+      if (hash) {
+        return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)")))
+      } else {
+        return(expr)
+      }
     } else {
-      return(as.call(lapply(expr, wrap_median)))
+      return(as.call(lapply(expr, wrap_median, hash)))
     }
   }
 }
diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index 881215abc68..ebec84c2a25 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -354,6 +354,52 @@ test_that("summarise() with !!sym()", {
   )
 })
 
+test_that("median()", {
+  # with groups
+  expect_dplyr_equal(
+    input %>%
+      group_by(some_grouping) %>%
+      summarize(
+        med_dbl = median(dbl),
+        med_int = median(int),
+        med_dbl_narmf = median(dbl, FALSE),
+        med_int_narmf = median(int, na.rm = F),
+        med_dbl_narmt = median(dbl, na.rm = TRUE),
+        med_int_narmt = median(int, T)
+      ) %>%
+      arrange(some_grouping) %>%
+      collect(),
+    tbl,
+    warning = "median\\(\\) currently returns an approximate median in Arrow"
+  )
+  # without groups, with na.rm = TRUE
+  expect_dplyr_equal(
+    input %>%
+      summarize(
+        med_dbl_narmt = median(dbl, na.rm = TRUE),
+        med_int_narmt = median(int, T)
+      ) %>%
+      collect(),
+    tbl,
+    warning = "median\\(\\) currently returns an approximate median in Arrow"
+  )
+
+  skip("Error on median(, na.rm = FALSE) with no groups (ARROW-14050)")
+  # without groups, with na.rm = FALSE (the default)
+  expect_dplyr_equal(
+    input %>%
+      summarize(
+        med_dbl = median(dbl),
+        med_int = median(int),
+        med_dbl_narmf = median(dbl, F),
+        med_int_narmf = median(int, na.rm = FALSE)
+      ) %>%
+      collect(),
+    tbl,
+    warning = "median\\(\\) currently returns an approximate median in Arrow"
+  )
+})
+
 test_that("Filter and aggregate", {
   expect_dplyr_equal(
     input %>%
@@ -578,8 +624,7 @@ test_that(".groups argument", {
       group_by(some_grouping, int < 6) %>%
       summarize(count = n()) %>%
       collect(),
-    tbl,
-    warning = "median\\(\\) currently returns an approximate median in Arrow"
+    tbl
   )
   expect_dplyr_equal(
     input %>%
@@ -593,8 +638,7 @@ test_that(".groups argument", {
       group_by(some_grouping, int < 6) %>%
       summarize(count = n(), .groups = "keep") %>%
       collect(),
-    tbl,
-    warning = "median\\(\\) currently returns an approximate median in Arrow"
+    tbl
   )
   expect_dplyr_equal(
     input %>%
@@ -623,5 +667,4 @@ test_that(".groups argument", {
     ),
     "NOTVALID"
   )
-
 })

From 64fb80bc712618308a68949c6e690beab75f8a70 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Tue, 21 Sep 2021 09:07:47 -0400
Subject: [PATCH 10/19] Make styler happy

---
 r/tests/testthat/test-dplyr-summarize.R | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index ebec84c2a25..45cf72d4e09 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -363,9 +363,13 @@ test_that("median()", {
         med_dbl = median(dbl),
         med_int = median(int),
         med_dbl_narmf = median(dbl, FALSE),
+        # styler: off
         med_int_narmf = median(int, na.rm = F),
+        # styler: on
         med_dbl_narmt = median(dbl, na.rm = TRUE),
+        # styler: off
         med_int_narmt = median(int, T)
+        # styler: on
       ) %>%
       arrange(some_grouping) %>%
       collect(),
@@ -377,7 +381,9 @@ test_that("median()", {
     input %>%
       summarize(
         med_dbl_narmt = median(dbl, na.rm = TRUE),
+        # styler: off
         med_int_narmt = median(int, T)
+        # styler: on
       ) %>%
       collect(),
     tbl,
@@ -391,7 +397,9 @@ test_that("median()", {
       summarize(
         med_dbl = median(dbl),
         med_int = median(int),
+        # styler: off
         med_dbl_narmf = median(dbl, F),
+        # styler: on
         med_int_narmf = median(int, na.rm = FALSE)
       ) %>%
       collect(),

From d1be7c2c4fb4ca6c644d1077580a14353152f02b Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Tue, 21 Sep 2021 09:27:58 -0400
Subject: [PATCH 11/19] Fix merge error

---
 r/tests/testthat/test-dplyr-summarize.R | 57 +++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index 45cf72d4e09..28e322150cc 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -229,6 +229,63 @@ test_that("Group by n_distinct() on dataset", {
   )
 })
 
+<<<<<<< HEAD
+=======
+test_that("median()", {
+  # with groups
+  expect_dplyr_equal(
+    input %>%
+      group_by(some_grouping) %>%
+      summarize(
+        med_dbl = median(dbl),
+        med_int = median(int),
+        med_dbl_narmf = median(dbl, FALSE),
+        # styler: off
+        med_int_narmf = median(int, na.rm = F),
+        # styler: on
+        med_dbl_narmt = median(dbl, na.rm = TRUE),
+        # styler: off
+        med_int_narmt = median(int, T)
+        # styler: on
+      ) %>%
+      arrange(some_grouping) %>%
+      collect(),
+    tbl,
+    warning = "median\\(\\) currently returns an approximate median in Arrow"
+  )
+  # without groups, with na.rm = TRUE
+  expect_dplyr_equal(
+    input %>%
+      summarize(
+        med_dbl_narmt = median(dbl, na.rm = TRUE),
+        # styler: off
+        med_int_narmt = median(int, T)
+        # styler: on
+      ) %>%
+      collect(),
+    tbl,
+    warning = "median\\(\\) currently returns an approximate median in Arrow"
+  )
+
+  skip("Error on median(, na.rm = FALSE) with no groups (ARROW-14050)")
+  # without groups, with na.rm = FALSE (the default)
+  expect_dplyr_equal(
+    input %>%
+      summarize(
+        med_dbl = median(dbl),
+        med_int = median(int),
+        # styler: off
+        med_dbl_narmf = median(dbl, F),
+        # styler: on
+        med_int_narmf = median(int, na.rm = FALSE)
+      ) %>%
+      collect(),
+    tbl,
+    warning = "median\\(\\) currently returns an approximate median in Arrow"
+  )
+})
+
+>>>>>>> b5fffd2e2 (Fix merge error)
 test_that("summarize() with min() and max()", {
   expect_dplyr_equal(
     input %>%

From b191be640cfc1739210b4d44542223b08aa40047 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Tue, 21 Sep 2021 16:51:57 -0400
Subject: [PATCH 12/19] Use approximate_median for median(); add quantile()
 binding

---
 r/R/dplyr-functions.R | 33 +++++++++++++++++++++++++++------
 r/R/dplyr-summarize.R | 19 ++++++-------------
 r/src/compute.cpp     |  9 +++++++++
 3 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index e4041c5abc5..e72842e1d2b 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -889,16 +889,37 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) {
     options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof)
   )
 }
-
-agg_funcs$median <- function(x, na.rm = FALSE) {
-  # TODO: Bind to the Arrow function that returns an exact median (ARROW-14021)
+agg_funcs$quantile <- function(x, probs, na.rm = FALSE) {
+  if (length(probs) != 1) {
+    arrow_not_supported("quantile() with length(probs) != 1")
+  }
+  # TODO: Bind to the Arrow function that returns an exact quantile and remove
+  # this warning (ARROW-14021)
+  warn(
+    "quantile() currently returns an approximate quantile in Arrow",
+    .frequency = ifelse(is_interactive(), "once", "always"),
+    .frequency_id = "arrow.quantile.approximate"
+  )
   list(
     fun = "tdigest",
     data = x,
-    options = list(skip_nulls = na.rm, q = 0.5)
+    options = list(skip_nulls = na.rm, q = probs)
+  )
+}
+agg_funcs$median <- function(x, na.rm = FALSE) {
+  # TODO: Bind to the Arrow function that returns an exact median and remove
+  # this warning (ARROW-14021)
+  warn(
+    "median() currently returns an approximate median in Arrow",
+    .frequency = ifelse(is_interactive(), "once", "always"),
+    .frequency_id = "arrow.median.approximate"
+  )
+  list(
+    fun = "approximate_median",
+    data = x,
+    options = list(skip_nulls = na.rm)
   )
 }
-
 agg_funcs$n_distinct <- function(x, na.rm = FALSE) {
   list(
     fun = "count_distinct",
@@ -943,7 +964,7 @@ output_type <- function(fun, input_type, hash) {
   } else if (fun %in% "sum") {
     # It may upcast to a bigger type but this is close enough
     input_type
-  } else if (fun %in% c("mean", "stddev", "variance")) {
+  } else if (fun %in% c("mean", "stddev", "variance", "approximate_median")) {
     float64()
   } else if (fun %in% "tdigest") {
     if (hash) {
diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index d145cc0efb4..a1dc514bc97 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -166,15 +166,8 @@ summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) {
     return()
   }
 
-  if ("median" %in% funs_in_expr) {
-    expr <- wrap_median(expr, hash)
-    # TODO: Remove this warning after median() returns an exact median
-    # (ARROW-14021)
-    warn(
-      "median() currently returns an approximate median in Arrow",
-      .frequency = ifelse(is_interactive(), "once", "always"),
-      .frequency_id = "arrow.median.approximate"
-    )
+  if ("quantile" %in% funs_in_expr) {
+    expr <- wrap_quantile(expr, hash)
     funs_in_expr <- all_funs(expr)
   }
 
@@ -269,20 +262,20 @@ extract_aggregations <- function(expr, ctx) {
   expr
 }
 
-# This function recurses through expr and wraps each call to median() with a
+# This function recurses through expr and wraps each call to quantile() with a
 # call to arrow_list_element()
-wrap_median <- function(expr, hash) {
+wrap_quantile <- function(expr, hash) {
   if (length(expr) == 1) {
     return(expr)
   } else {
-    if (is.call(expr) && expr[[1]] == quote(median)) {
+    if (is.call(expr) && expr[[1]] == quote(quantile)) {
       if (hash) {
         return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)")))
       } else {
         return(expr)
       }
     } else {
-      return(as.call(lapply(expr, wrap_median, hash)))
+      return(as.call(lapply(expr, wrap_quantile, hash)))
     }
   }
 }
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 830d164376b..6b8a6f17b53 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -199,6 +199,15 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
     return out;
   }
 
+  if (func_name == "approximate_median" || func_name == "hash_approximate_median") {
+    using Options = arrow::compute::TDigestOptions;
+    auto out = std::make_shared<Options>(Options::Defaults());
+    if (!Rf_isNull(options["skip_nulls"])) {
+      out->skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
+    }
+    return out;
+  }
+
   if (func_name == "count") {
     using Options = arrow::compute::CountOptions;
     auto out = std::make_shared<Options>(Options::Defaults());

From f4bbfd21d5fc88eaff19a5931fd1f5aebb33c6d2 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Tue, 21 Sep 2021 18:03:09 -0400
Subject: [PATCH 13/19] Fix approx_median options

---
 r/src/compute.cpp | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 6b8a6f17b53..952a7cc3a9a 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -172,10 +172,11 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
   }
 
   if (func_name == "all" || func_name == "hash_all" || func_name == "any" ||
-      func_name == "hash_any" || func_name == "mean" || func_name == "hash_mean" ||
-      func_name == "min_max" || func_name == "hash_min_max" || func_name == "min" ||
-      func_name == "hash_min" || func_name == "max" || func_name == "hash_max" ||
-      func_name == "sum" || func_name == "hash_sum") {
+      func_name == "hash_any" || func_name == "approximate_median" ||
+      func_name == "hash_approximate_median" || func_name == "mean" ||
+      func_name == "hash_mean" || func_name == "min_max" || func_name == "hash_min_max" ||
+      func_name == "min" || func_name == "hash_min" || func_name == "max" ||
+      func_name == "hash_max" || func_name == "sum" || func_name == "hash_sum") {
     using Options = arrow::compute::ScalarAggregateOptions;
     auto out = std::make_shared<Options>(Options::Defaults());
     if (!Rf_isNull(options["min_count"])) {
@@ -199,15 +200,6 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
     return out;
   }
 
-  if (func_name == "approximate_median" || func_name == "hash_approximate_median") {
-    using Options = arrow::compute::TDigestOptions;
-    auto out = std::make_shared<Options>(Options::Defaults());
-    if (!Rf_isNull(options["skip_nulls"])) {
-      out->skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
-    }
-    return out;
-  }
-
   if (func_name == "count") {
     using Options = arrow::compute::CountOptions;
     auto out = std::make_shared<Options>(Options::Defaults());

From 9e5858d234024084ae3dd705a260981c82be3383 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Tue, 21 Sep 2021 18:41:20 -0400
Subject: [PATCH 14/19] Fix tests

---
 r/tests/testthat/test-dplyr-summarize.R | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index 28e322150cc..ef3019dcd58 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -238,15 +238,11 @@ test_that("median()", {
       group_by(some_grouping) %>%
       summarize(
         med_dbl = median(dbl),
-        med_int = median(int),
+        med_int = as.double(median(int)),
         med_dbl_narmf = median(dbl, FALSE),
-        # styler: off
-        med_int_narmf = median(int, na.rm = F),
-        # styler: on
+        med_int_narmf = as.double(median(int, na.rm = FALSE)),
         med_dbl_narmt = median(dbl, na.rm = TRUE),
-        # styler: off
-        med_int_narmt = median(int, T)
-        # styler: on
+        med_int_narmt = as.double(median(int, TRUE))
       ) %>%
       arrange(some_grouping) %>%
       collect(),
@@ -258,26 +254,20 @@ test_that("median()", {
     input %>%
       summarize(
         med_dbl_narmt = median(dbl, na.rm = TRUE),
-        # styler: off
-        med_int_narmt = median(int, T)
-        # styler: on
+        med_int_narmt = as.double(median(int, TRUE))
       ) %>%
       collect(),
     tbl,
     warning = "median\\(\\) currently returns an approximate median in Arrow"
   )
-
-  skip("Error on median(, na.rm = FALSE) with no groups (ARROW-14050)")
   # without groups, with na.rm = FALSE (the default)
   expect_dplyr_equal(
     input %>%
       summarize(
         med_dbl = median(dbl),
-        med_int = median(int),
-        # styler: off
-        med_dbl_narmf = median(dbl, F),
-        # styler: on
-        med_int_narmf = median(int, na.rm = FALSE)
+        med_int = as.double(median(int)),
+        med_dbl_narmf = median(dbl, FALSE),
+        med_int_narmf = as.double(median(int, na.rm = FALSE))
       ) %>%
       collect(),
     tbl,

From ccfec557c2d01e60ced261149039c012254d2cf0 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Tue, 21 Sep 2021 18:41:32 -0400
Subject: [PATCH 15/19] Add quantile() tests

---
 r/tests/testthat/test-dplyr-summarize.R | 86 ++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 3 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index ef3019dcd58..97ea5196330 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -229,8 +229,6 @@ test_that("Group by n_distinct() on dataset", {
   )
 })
 
-<<<<<<< HEAD
-=======
 test_that("median()", {
   # with groups
   expect_dplyr_equal(
@@ -275,7 +273,89 @@ test_that("median()", {
   )
 })
 
->>>>>>> b5fffd2e2 (Fix merge error)
+test_that("quantile()", {
+  # The default S3 method for stats::quantile() method throws an error when
+  # numeric input contains NA or NaN and na.rm is not TRUE. So the tests below
+  # only compare R and Arrow output on data with NAs or NaNs with na.rm = TRUE.
+
+  # Because the default S3 method defaults to names = TRUE, the Arrow binding
+  # does not accept a names argument, and the presence of the names causes
+  # expect_equal() to fail, we don't use expect_dplyr_equal() in the tests
+  # below.
+
+  # The tests below all use probs = 0.5 because other values cause differences
+  # between the exact quantiles returned by R and the approximate quantiles
+  # returned by Arrow.
+
+  # with groups
+  expect_warning(
+    expect_equal(
+      tbl %>%
+        group_by(some_grouping) %>%
+        summarize(
+          q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE),
+          q_int = quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+        ) %>%
+        arrange(some_grouping),
+      Table$create(tbl) %>%
+        group_by(some_grouping) %>%
+        summarize(
+          q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE),
+          q_int = quantile(int, probs = 0.5, na.rm = TRUE)
+        ) %>%
+        arrange(some_grouping) %>%
+        collect()
+    ),
+    "quantile() currently returns an approximate quantile in Arrow",
+    fixed = TRUE
+  )
+
+  # without groups
+  expect_warning(
+    expect_equal(
+      tbl %>%
+        summarize(
+          q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE),
+          q_int = quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+        ),
+      Table$create(tbl) %>%
+        summarize(
+          q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE),
+          q_int = quantile(int, probs = 0.5, na.rm = TRUE)
+        ) %>%
+        collect()
+    ),
+    "quantile() currently returns an approximate quantile in Arrow",
+    fixed = TRUE
+  )
+
+  # with missing values and na.rm = FALSE
+  expect_warning(
+    expect_equal(
+      tibble(
+        q_dbl = NA_real_,
+        q_int = NA_real_
+      ),
+      Table$create(tbl) %>%
+        summarize(
+          q_dbl = quantile(dbl, probs = 0.5, na.rm = FALSE),
+          q_int = quantile(int, probs = 0.5, na.rm = FALSE)
+        ) %>%
+        collect()
+    ),
+    "quantile() currently returns an approximate quantile in Arrow",
+    fixed = TRUE
+  )
+
+  # with a vector of 2+ probs
+  expect_warning(
+    Table$create(tbl) %>%
+      summarize(q = quantile(dbl, probs = c(0.2, 0.8), na.rm = FALSE)),
+    "quantile() with length(probs) != 1 not supported by Arrow",
+    fixed = TRUE
+  )
+})
+
 test_that("summarize() with min() and max()", {
   expect_dplyr_equal(
     input %>%

From 10f0bddefd07480311fc6f7fbf49dd58137213cc Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Tue, 21 Sep 2021 19:01:30 -0400
Subject: [PATCH 16/19] Improve AST modifying function and add comment

---
 r/R/dplyr-summarize.R | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index a1dc514bc97..6158026603f 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -166,8 +166,12 @@ summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) {
     return()
   }
 
-  if ("quantile" %in% funs_in_expr) {
-    expr <- wrap_quantile(expr, hash)
+  # For the quantile() binding in the hash aggregation case, we need to mutate
+  # the list output from the Arrow hash_tdigest kernel to flatten it into a
+  # column of type float64. We do that by modifying the unevaluated expression
+  # to replace quantile(...) with arrow_list_element(quantile(...), 0L)
+  if (hash && "quantile" %in% funs_in_expr) {
+    expr <- wrap_hash_quantile(expr)
     funs_in_expr <- all_funs(expr)
   }
 
@@ -264,18 +268,14 @@ extract_aggregations <- function(expr, ctx) {
 
 # This function recurses through expr and wraps each call to quantile() with a
 # call to arrow_list_element()
-wrap_quantile <- function(expr, hash) {
+wrap_hash_quantile <- function(expr) {
   if (length(expr) == 1) {
     return(expr)
   } else {
     if (is.call(expr) && expr[[1]] == quote(quantile)) {
-      if (hash) {
-        return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)")))
-      } else {
-        return(expr)
-      }
+      return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)")))
     } else {
-      return(as.call(lapply(expr, wrap_quantile, hash)))
+      return(as.call(lapply(expr, wrap_hash_quantile)))
     }
   }
 }

From a837785f5a2d9e67935ffba884603c20b42accda Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Wed, 22 Sep 2021 11:10:56 -0400
Subject: [PATCH 17/19] Improve tests and comments in tests

---
 r/tests/testthat/test-dplyr-summarize.R | 44 +++++++++++++++++--------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index 97ea5196330..39e60ddd3ca 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -230,6 +230,11 @@ test_that("Group by n_distinct() on dataset", {
 })
 
 test_that("median()", {
+  # When medians are integer-valued, stats::median() sometimes returns output of
+  # type integer, whereas whereas the Arrow approx_median kernels always return
+  # output of type float64. The calls to median(int, ...) in the tests below
+  # are enclosed in as.double() to work around this known difference.
+
   # with groups
   expect_dplyr_equal(
     input %>%
@@ -274,19 +279,28 @@ test_that("median()", {
 })
 
 test_that("quantile()", {
-  # The default S3 method for stats::quantile() method throws an error when
-  # numeric input contains NA or NaN and na.rm is not TRUE. So the tests below
-  # only compare R and Arrow output on data with NAs or NaNs with na.rm = TRUE.
-
-  # Because the default S3 method defaults to names = TRUE, the Arrow binding
-  # does not accept a names argument, and the presence of the names causes
-  # expect_equal() to fail, we don't use expect_dplyr_equal() in the tests
-  # below.
+  # The default method for stats::quantile() throws an error when na.rm = FALSE
+  # and the input contains NA or NaN, whereas the Arrow tdigest kernels return
+  # null in this situation. To work around this known difference, the tests
+  # below always use na.rm = TRUE when the data contains NA or NaN.
+
+  # The default method for stats::quantile() has an argument `names` that
+  # controls whether the result has a names attribute. It defaults to
+  # names = TRUE. With Arrow, it is not possible to give the result a names
+  # attribute, so the quantile() binding in Arrow does not accept a `names`
+  # argument. Differences in this names attribute cause expect_dplyr_equal() to
+  # report that the objects are not equal, so we do not use expect_dplyr_equal()
+  # in the tests below.
 
   # The tests below all use probs = 0.5 because other values cause differences
   # between the exact quantiles returned by R and the approximate quantiles
   # returned by Arrow.
 
+  # When quantiles are integer-valued, stats::quantile() sometimes returns
+  # output of type integer, whereas whereas the Arrow tdigest kernels always
+  # return output of type float64. The calls to quantile(int, ...) in the tests
+  # below are enclosed in as.double() to work around this known difference.
+
   # with groups
   expect_warning(
     expect_equal(
@@ -294,14 +308,16 @@ test_that("quantile()", {
         group_by(some_grouping) %>%
         summarize(
           q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE),
-          q_int = quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+          q_int = as.double(
+            quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+          )
         ) %>%
         arrange(some_grouping),
       Table$create(tbl) %>%
         group_by(some_grouping) %>%
         summarize(
           q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE),
-          q_int = quantile(int, probs = 0.5, na.rm = TRUE)
+          q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE))
         ) %>%
         arrange(some_grouping) %>%
         collect()
@@ -316,12 +332,14 @@ test_that("quantile()", {
       tbl %>%
         summarize(
           q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE),
-          q_int = quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+          q_int = as.double(
+            quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+          )
         ),
       Table$create(tbl) %>%
         summarize(
           q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE),
-          q_int = quantile(int, probs = 0.5, na.rm = TRUE)
+          q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE))
         ) %>%
         collect()
     ),
@@ -339,7 +357,7 @@ test_that("quantile()", {
       Table$create(tbl) %>%
         summarize(
           q_dbl = quantile(dbl, probs = 0.5, na.rm = FALSE),
-          q_int = quantile(int, probs = 0.5, na.rm = FALSE)
+          q_int = as.double(quantile(int, probs = 0.5, na.rm = FALSE))
         ) %>%
         collect()
     ),

From a3402a59d9e5d067e627df1fbfb5ddf6a01866d9 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 24 Sep 2021 11:04:49 -0400
Subject: [PATCH 18/19] Fix bad merge

---
 r/tests/testthat/test-dplyr-summarize.R | 54 -------------------------
 1 file changed, 54 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index 39e60ddd3ca..8739c70bbda 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -499,60 +499,6 @@ test_that("summarise() with !!sym()", {
   )
 })
 
-test_that("median()", {
-  # with groups
-  expect_dplyr_equal(
-    input %>%
-      group_by(some_grouping) %>%
-      summarize(
-        med_dbl = median(dbl),
-        med_int = median(int),
-        med_dbl_narmf = median(dbl, FALSE),
-        # styler: off
-        med_int_narmf = median(int, na.rm = F),
-        # styler: on
-        med_dbl_narmt = median(dbl, na.rm = TRUE),
-        # styler: off
-        med_int_narmt = median(int, T)
-        # styler: on
-      ) %>%
-      arrange(some_grouping) %>%
-      collect(),
-    tbl,
-    warning = "median\\(\\) currently returns an approximate median in Arrow"
-  )
-  # without groups, with na.rm = TRUE
-  expect_dplyr_equal(
-    input %>%
-      summarize(
-        med_dbl_narmt = median(dbl, na.rm = TRUE),
-        # styler: off
-        med_int_narmt = median(int, T)
-        # styler: on
-      ) %>%
-      collect(),
-    tbl,
-    warning = "median\\(\\) currently returns an approximate median in Arrow"
-  )
-
-  skip("Error on median(, na.rm = FALSE) with no groups (ARROW-14050)")
-  # without groups, with na.rm = FALSE (the default)
-  expect_dplyr_equal(
-    input %>%
-      summarize(
-        med_dbl = median(dbl),
-        med_int = median(int),
-        # styler: off
-        med_dbl_narmf = median(dbl, F),
-        # styler: on
-        med_int_narmf = median(int, na.rm = FALSE)
-      ) %>%
-      collect(),
-    tbl,
-    warning = "median\\(\\) currently returns an approximate median in Arrow"
-  )
-})
-
 test_that("Filter and aggregate", {
   expect_dplyr_equal(
     input %>%

From 950769da7cf6d7619ba3e07563e3c49734123505 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 24 Sep 2021 11:56:22 -0400
Subject: [PATCH 19/19] Update NEWS.md

---
 r/NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/NEWS.md b/r/NEWS.md
index 381b6334909..bbefb33bbc0 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -23,7 +23,7 @@ There are now two ways to query Arrow data:
 
 ## 1. Grouped aggregation in Arrow
 
-`dplyr::summarize()`, both grouped and ungrouped, is now implemented for Arrow Datasets, Tables, and RecordBatches. Because data is scanned in chunks, you can aggregate over larger-than-memory datasets backed by many files. Supported aggregation functions include `n()`, `n_distinct()`, `sum()`, `mean()`, `var()`, `sd()`, `any()`, and `all()`.
+`dplyr::summarize()`, both grouped and ungrouped, is now implemented for Arrow Datasets, Tables, and RecordBatches. Because data is scanned in chunks, you can aggregate over larger-than-memory datasets backed by many files. Supported aggregation functions include `n()`, `n_distinct()`, `min(),` `max()`, `sum()`, `mean()`, `var()`, `sd()`, `any()`, and `all()`. `median()` and `quantile()` with one probability are also supported and currently return approximate results using the t-digest algorithm.
 
 This enhancement does change the behavior of `summarize()` and `collect()` in some cases: see "Breaking changes" below for details.