From b678ac49f0bc8ccda570dd76a92b747c10f349c5 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 27 Aug 2021 11:42:51 +0100 Subject: [PATCH 01/19] Rebase --- r/R/dplyr-functions.R | 10 ++++++++++ r/src/compute.cpp | 9 ++++++++- r/tests/testthat/test-dplyr-summarize.R | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 765438ac908..27d6d3a9e33 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -889,6 +889,16 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) } + +# na.rm here doesn't work yet - see ARROW-13782 +agg_funcs$median <- function(x, na.rm = FALSE) { + list( + fun = "tdigest", + data = x, + options = list(na.rm = na.rm, q = 0.5) + ) +} + agg_funcs$n_distinct <- function(x, na.rm = FALSE) { list( fun = "count_distinct", diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 8268fd10eee..61eb3575f79 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -186,7 +186,14 @@ std::shared_ptr make_compute_options( } return out; } - + + if (func_name == "tdigest" || func_name == "hash_tdigest") { + using Options = arrow::compute::TDigestOptions; + auto out = std::make_shared(Options::Defaults()); + out->q = cpp11::as_cpp>(options["q"]); + return out; + } + if (func_name == "count") { using Options = arrow::compute::CountOptions; auto out = std::make_shared(Options::Defaults()); diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index c74ed6aa938..fef84420aad 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -621,4 +621,5 @@ test_that(".groups argument", { ), "NOTVALID" ) + }) From 08c515ab76d08b60c9baec75c8396be4618914ad Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 27 Aug 2021 12:04:33 +0100 Subject: [PATCH 02/19] Run linter --- r/src/compute.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 61eb3575f79..16f7ae83a5e 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -186,14 +186,14 @@ std::shared_ptr make_compute_options( } return out; } - + if (func_name == "tdigest" || func_name == "hash_tdigest") { using Options = arrow::compute::TDigestOptions; auto out = std::make_shared(Options::Defaults()); out->q = cpp11::as_cpp>(options["q"]); return out; } - + if (func_name == "count") { using Options = arrow::compute::CountOptions; auto out = std::make_shared(Options::Defaults()); From a2717b7b3d489d59c90c1289dc74fe64164cf0e7 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 16 Sep 2021 21:03:31 -0400 Subject: [PATCH 03/19] Pass na.rm --> skip_nulls --- r/R/dplyr-functions.R | 3 +-- r/src/compute.cpp | 7 ++++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 27d6d3a9e33..abcbae57664 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -890,12 +890,11 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { ) } -# na.rm here doesn't work yet - see ARROW-13782 agg_funcs$median <- function(x, na.rm = FALSE) { list( fun = "tdigest", data = x, - options = list(na.rm = na.rm, q = 0.5) + options = list(skip_nulls = na.rm, q = 0.5) ) } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 16f7ae83a5e..830d164376b 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -190,7 +190,12 @@ std::shared_ptr make_compute_options( if (func_name == "tdigest" || func_name == "hash_tdigest") { using Options = arrow::compute::TDigestOptions; auto out = std::make_shared(Options::Defaults()); - out->q = cpp11::as_cpp>(options["q"]); + if (!Rf_isNull(options["q"])) { + out->q = cpp11::as_cpp>(options["q"]); + } + if (!Rf_isNull(options["skip_nulls"])) { + out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); + } return out; } From 602bbcb6b43f77150be7807a9e12a1a691e5d77c Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 16 Sep 2021 21:45:31 -0400 Subject: [PATCH 04/19] Add TODOs --- r/R/dplyr-functions.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index abcbae57664..9014c4fe794 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -891,6 +891,13 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { } agg_funcs$median <- function(x, na.rm = FALSE) { + + # TODO: after ARROW-12669 is merged, use the list_element function + # to unnest the ListArray returned by tdigest. + + # TODO: issue a warning (only once per session if possible) saying + # that this returns only an approximate median + list( fun = "tdigest", data = x, From 069bcc35f4d842e49077aa21d9488001ae2ebf0b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 16 Sep 2021 22:36:55 -0400 Subject: [PATCH 05/19] Warn once per session that median is approx --- r/NAMESPACE | 1 + r/R/arrow-package.R | 2 +- r/R/dplyr-functions.R | 3 +-- r/R/dplyr-summarize.R | 10 ++++++++++ r/tests/testthat/test-dplyr-summarize.R | 6 ++++-- 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 61ca5d8fdc4..f89a7352ec9 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -323,6 +323,7 @@ importFrom(rlang,is_bare_character) importFrom(rlang,is_character) importFrom(rlang,is_false) importFrom(rlang,is_integerish) +importFrom(rlang,is_interactive) importFrom(rlang,is_quosure) importFrom(rlang,list2) importFrom(rlang,new_data_mask) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index e2d3ec18846..ec9c0500494 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -21,7 +21,7 @@ #' @importFrom assertthat assert_that is.string #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind as_label set_names exec -#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 +#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @useDynLib arrow, .registration = TRUE diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 9014c4fe794..728d4fdc41e 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -895,8 +895,7 @@ agg_funcs$median <- function(x, na.rm = FALSE) { # TODO: after ARROW-12669 is merged, use the list_element function # to unnest the ListArray returned by tdigest. - # TODO: issue a warning (only once per session if possible) saying - # that this returns only an approximate median + # TODO: Bind to the Arrow function that returns an exact median (ARROW-14021) list( fun = "tdigest", diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index beb18e82039..641603c05f4 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -161,6 +161,16 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) { return() } + # TODO: Remove this warning after median() returns an exact median + # (ARROW-14021) + if ("median" %in% funs_in_expr) { + warn( + "median() currently returns an approximate median in Arrow", + .frequency = ifelse(is_interactive(), "once", "always"), + .frequency_id = "arrow.median.approximate" + ) + } + # Start inspecting the expr to see what aggregations it involves agg_funs <- names(agg_funcs) outer_agg <- funs_in_expr[1] %in% agg_funs diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index fef84420aad..881215abc68 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -578,7 +578,8 @@ test_that(".groups argument", { group_by(some_grouping, int < 6) %>% summarize(count = n()) %>% collect(), - tbl + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" ) expect_dplyr_equal( input %>% @@ -592,7 +593,8 @@ test_that(".groups argument", { group_by(some_grouping, int < 6) %>% summarize(count = n(), .groups = "keep") %>% collect(), - tbl + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" ) expect_dplyr_equal( input %>% From c2347dbfb59c45dcbe053a5fa33f72849b99b582 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 16 Sep 2021 23:57:28 -0400 Subject: [PATCH 06/19] median(...) --> arrow_list_element(median(...), 0) --- r/R/dplyr-functions.R | 5 ----- r/R/dplyr-summarize.R | 16 ++++++++++++++++ r/R/util.R | 7 +++++++ 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 728d4fdc41e..168e855a978 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -891,12 +891,7 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { } agg_funcs$median <- function(x, na.rm = FALSE) { - - # TODO: after ARROW-12669 is merged, use the list_element function - # to unnest the ListArray returned by tdigest. - # TODO: Bind to the Arrow function that returns an exact median (ARROW-14021) - list( fun = "tdigest", data = x, diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 641603c05f4..6e0757aea06 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -164,6 +164,7 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) { # TODO: Remove this warning after median() returns an exact median # (ARROW-14021) if ("median" %in% funs_in_expr) { + expr <- wrap_median(expr) warn( "median() currently returns an approximate median in Arrow", .frequency = ifelse(is_interactive(), "once", "always"), @@ -261,3 +262,18 @@ extract_aggregations <- function(expr, ctx) { } expr } + +# This function recurses through expr and wraps each call to median() with a +# call to arrow_list_element() +# TODO: test that this works after ARROW-12669 is merged +wrap_median <- function(expr) { + if (length(expr) == 1) { + return(expr) + } else { + if (is.call(expr) && expr[[1]] == quote(median)) { + return(str2lang(paste0("arrow_list_element(", deparse1(expr),", 0)"))) + } else { + return(as.call(lapply(expr, wrap_median))) + } + } +} diff --git a/r/R/util.R b/r/R/util.R index 94d3a78782f..918dba07eae 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -22,6 +22,13 @@ if (!exists("deparse1")) { } } +# for compatibility with R versions earlier than 3.6.0 +if (!exists("str2lang")) { + str2lang <- function(s) { + parse(text = s, keep.source = FALSE)[[1]] + } +} + oxford_paste <- function(x, conjunction = "and", quote = TRUE) { if (quote && is.character(x)) { x <- paste0('"', x, '"') From 2f5efae1af1070a6881674a62599d220a0630a62 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 17 Sep 2021 00:00:23 -0400 Subject: [PATCH 07/19] Move comment --- r/R/dplyr-summarize.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 6e0757aea06..1ff377b771c 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -161,10 +161,10 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) { return() } - # TODO: Remove this warning after median() returns an exact median - # (ARROW-14021) if ("median" %in% funs_in_expr) { expr <- wrap_median(expr) + # TODO: Remove this warning after median() returns an exact median + # (ARROW-14021) warn( "median() currently returns an approximate median in Arrow", .frequency = ifelse(is_interactive(), "once", "always"), From cc63551a5e08043a6db7c4faad62b2237d7c2194 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 17 Sep 2021 10:39:08 -0400 Subject: [PATCH 08/19] Make the linter happy --- r/R/dplyr-summarize.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 1ff377b771c..27681322b44 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -271,7 +271,7 @@ wrap_median <- function(expr) { return(expr) } else { if (is.call(expr) && expr[[1]] == quote(median)) { - return(str2lang(paste0("arrow_list_element(", deparse1(expr),", 0)"))) + return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0)"))) } else { return(as.call(lapply(expr, wrap_median))) } From 4a75d8ba7607f2e5d8120214311d37e230800088 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 20 Sep 2021 22:20:00 -0400 Subject: [PATCH 09/19] Handle tdigest output differently for hash vs. no hash --- r/R/dplyr-collect.R | 3 +- r/R/dplyr-functions.R | 8 +++- r/R/dplyr-summarize.R | 23 +++++++---- r/tests/testthat/test-dplyr-summarize.R | 53 ++++++++++++++++++++++--- 4 files changed, 73 insertions(+), 14 deletions(-) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 8a5488bf599..06914abe072 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -96,9 +96,10 @@ implicit_schema <- function(.data) { # and they get projected to this order after aggregation) # * Infer the output types from the aggregations group_fields <- new_fields[.data$group_by_vars] + hash <- length(.data$group_by_vars) > 0 agg_fields <- imap( new_fields[setdiff(names(new_fields), .data$group_by_vars)], - ~ output_type(.data$aggregations[[.y]][["fun"]], .x) + ~ output_type(.data$aggregations[[.y]][["fun"]], .x, hash) ) new_fields <- c(group_fields, agg_fields) } diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 168e855a978..e4041c5abc5 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -936,7 +936,7 @@ agg_funcs$max <- function(..., na.rm = FALSE) { ) } -output_type <- function(fun, input_type) { +output_type <- function(fun, input_type, hash) { # These are quick and dirty heuristics. if (fun %in% c("any", "all")) { bool() @@ -945,6 +945,12 @@ output_type <- function(fun, input_type) { input_type } else if (fun %in% c("mean", "stddev", "variance")) { float64() + } else if (fun %in% "tdigest") { + if (hash) { + fixed_size_list_of(float64(), 1L) + } else { + float64() + } } else { # Just so things don't error, assume the resulting type is the same input_type diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 27681322b44..d145cc0efb4 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -65,7 +65,12 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated # (which overwrites the previous name) - summarize_eval(names(exprs)[i], exprs[[i]], ctx) + summarize_eval( + names(exprs)[i], + exprs[[i]], + ctx, + length(.data$group_by_vars) > 0 + ) } # Apply the results to the .data object. @@ -150,7 +155,7 @@ format_aggregation <- function(x) { # appropriate combination of (1) aggregations (possibly temporary) and # (2) post-aggregation transformations (mutate) # The function returns nothing: it assigns into the `ctx` environment -summarize_eval <- function(name, quosure, ctx, recurse = FALSE) { +summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) { expr <- quo_get_expr(quosure) ctx$quo_env <- quo_get_env(quosure) @@ -162,7 +167,7 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) { } if ("median" %in% funs_in_expr) { - expr <- wrap_median(expr) + expr <- wrap_median(expr, hash) # TODO: Remove this warning after median() returns an exact median # (ARROW-14021) warn( @@ -170,6 +175,7 @@ summarize_eval <- function(name, quosure, ctx, recurse = FALSE) { .frequency = ifelse(is_interactive(), "once", "always"), .frequency_id = "arrow.median.approximate" ) + funs_in_expr <- all_funs(expr) } # Start inspecting the expr to see what aggregations it involves @@ -265,15 +271,18 @@ extract_aggregations <- function(expr, ctx) { # This function recurses through expr and wraps each call to median() with a # call to arrow_list_element() -# TODO: test that this works after ARROW-12669 is merged -wrap_median <- function(expr) { +wrap_median <- function(expr, hash) { if (length(expr) == 1) { return(expr) } else { if (is.call(expr) && expr[[1]] == quote(median)) { - return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0)"))) + if (hash) { + return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)"))) + } else { + return(expr) + } } else { - return(as.call(lapply(expr, wrap_median))) + return(as.call(lapply(expr, wrap_median, hash))) } } } diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 881215abc68..ebec84c2a25 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -354,6 +354,52 @@ test_that("summarise() with !!sym()", { ) }) +test_that("median()", { + # with groups + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize( + med_dbl = median(dbl), + med_int = median(int), + med_dbl_narmf = median(dbl, FALSE), + med_int_narmf = median(int, na.rm = F), + med_dbl_narmt = median(dbl, na.rm = TRUE), + med_int_narmt = median(int, T) + ) %>% + arrange(some_grouping) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) + # without groups, with na.rm = TRUE + expect_dplyr_equal( + input %>% + summarize( + med_dbl_narmt = median(dbl, na.rm = TRUE), + med_int_narmt = median(int, T) + ) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) + + skip("Error on median(, na.rm = FALSE) with no groups (ARROW-14050)") + # without groups, with na.rm = FALSE (the default) + expect_dplyr_equal( + input %>% + summarize( + med_dbl = median(dbl), + med_int = median(int), + med_dbl_narmf = median(dbl, F), + med_int_narmf = median(int, na.rm = FALSE) + ) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) +}) + test_that("Filter and aggregate", { expect_dplyr_equal( input %>% @@ -578,8 +624,7 @@ test_that(".groups argument", { group_by(some_grouping, int < 6) %>% summarize(count = n()) %>% collect(), - tbl, - warning = "median\\(\\) currently returns an approximate median in Arrow" + tbl ) expect_dplyr_equal( input %>% @@ -593,8 +638,7 @@ test_that(".groups argument", { group_by(some_grouping, int < 6) %>% summarize(count = n(), .groups = "keep") %>% collect(), - tbl, - warning = "median\\(\\) currently returns an approximate median in Arrow" + tbl ) expect_dplyr_equal( input %>% @@ -623,5 +667,4 @@ test_that(".groups argument", { ), "NOTVALID" ) - }) From 64fb80bc712618308a68949c6e690beab75f8a70 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 21 Sep 2021 09:07:47 -0400 Subject: [PATCH 10/19] Make styler happy --- r/tests/testthat/test-dplyr-summarize.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index ebec84c2a25..45cf72d4e09 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -363,9 +363,13 @@ test_that("median()", { med_dbl = median(dbl), med_int = median(int), med_dbl_narmf = median(dbl, FALSE), + # styler: off med_int_narmf = median(int, na.rm = F), + # styler: on med_dbl_narmt = median(dbl, na.rm = TRUE), + # styler: off med_int_narmt = median(int, T) + # styler: on ) %>% arrange(some_grouping) %>% collect(), @@ -377,7 +381,9 @@ test_that("median()", { input %>% summarize( med_dbl_narmt = median(dbl, na.rm = TRUE), + # styler: off med_int_narmt = median(int, T) + # styler: on ) %>% collect(), tbl, @@ -391,7 +397,9 @@ test_that("median()", { summarize( med_dbl = median(dbl), med_int = median(int), + # styler: off med_dbl_narmf = median(dbl, F), + # styler: on med_int_narmf = median(int, na.rm = FALSE) ) %>% collect(), From d1be7c2c4fb4ca6c644d1077580a14353152f02b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 21 Sep 2021 09:27:58 -0400 Subject: [PATCH 11/19] Fix merge error --- r/tests/testthat/test-dplyr-summarize.R | 57 +++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 45cf72d4e09..28e322150cc 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -229,6 +229,63 @@ test_that("Group by n_distinct() on dataset", { ) }) +<<<<<<< HEAD +======= +test_that("median()", { + # with groups + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize( + med_dbl = median(dbl), + med_int = median(int), + med_dbl_narmf = median(dbl, FALSE), + # styler: off + med_int_narmf = median(int, na.rm = F), + # styler: on + med_dbl_narmt = median(dbl, na.rm = TRUE), + # styler: off + med_int_narmt = median(int, T) + # styler: on + ) %>% + arrange(some_grouping) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) + # without groups, with na.rm = TRUE + expect_dplyr_equal( + input %>% + summarize( + med_dbl_narmt = median(dbl, na.rm = TRUE), + # styler: off + med_int_narmt = median(int, T) + # styler: on + ) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) + + skip("Error on median(, na.rm = FALSE) with no groups (ARROW-14050)") + # without groups, with na.rm = FALSE (the default) + expect_dplyr_equal( + input %>% + summarize( + med_dbl = median(dbl), + med_int = median(int), + # styler: off + med_dbl_narmf = median(dbl, F), + # styler: on + med_int_narmf = median(int, na.rm = FALSE) + ) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) +}) + +>>>>>>> b5fffd2e2 (Fix merge error) test_that("summarize() with min() and max()", { expect_dplyr_equal( input %>% From b191be640cfc1739210b4d44542223b08aa40047 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 21 Sep 2021 16:51:57 -0400 Subject: [PATCH 12/19] Use approximate_median for median(); add quantile() binding --- r/R/dplyr-functions.R | 33 +++++++++++++++++++++++++++------ r/R/dplyr-summarize.R | 19 ++++++------------- r/src/compute.cpp | 9 +++++++++ 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index e4041c5abc5..e72842e1d2b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -889,16 +889,37 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) } - -agg_funcs$median <- function(x, na.rm = FALSE) { - # TODO: Bind to the Arrow function that returns an exact median (ARROW-14021) +agg_funcs$quantile <- function(x, probs, na.rm = FALSE) { + if (length(probs) != 1) { + arrow_not_supported("quantile() with length(probs) != 1") + } + # TODO: Bind to the Arrow function that returns an exact quantile and remove + # this warning (ARROW-14021) + warn( + "quantile() currently returns an approximate quantile in Arrow", + .frequency = ifelse(is_interactive(), "once", "always"), + .frequency_id = "arrow.quantile.approximate" + ) list( fun = "tdigest", data = x, - options = list(skip_nulls = na.rm, q = 0.5) + options = list(skip_nulls = na.rm, q = probs) + ) +} +agg_funcs$median <- function(x, na.rm = FALSE) { + # TODO: Bind to the Arrow function that returns an exact median and remove + # this warning (ARROW-14021) + warn( + "median() currently returns an approximate median in Arrow", + .frequency = ifelse(is_interactive(), "once", "always"), + .frequency_id = "arrow.median.approximate" + ) + list( + fun = "approximate_median", + data = x, + options = list(skip_nulls = na.rm) ) } - agg_funcs$n_distinct <- function(x, na.rm = FALSE) { list( fun = "count_distinct", @@ -943,7 +964,7 @@ output_type <- function(fun, input_type, hash) { } else if (fun %in% "sum") { # It may upcast to a bigger type but this is close enough input_type - } else if (fun %in% c("mean", "stddev", "variance")) { + } else if (fun %in% c("mean", "stddev", "variance", "approximate_median")) { float64() } else if (fun %in% "tdigest") { if (hash) { diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index d145cc0efb4..a1dc514bc97 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -166,15 +166,8 @@ summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) { return() } - if ("median" %in% funs_in_expr) { - expr <- wrap_median(expr, hash) - # TODO: Remove this warning after median() returns an exact median - # (ARROW-14021) - warn( - "median() currently returns an approximate median in Arrow", - .frequency = ifelse(is_interactive(), "once", "always"), - .frequency_id = "arrow.median.approximate" - ) + if ("quantile" %in% funs_in_expr) { + expr <- wrap_quantile(expr, hash) funs_in_expr <- all_funs(expr) } @@ -269,20 +262,20 @@ extract_aggregations <- function(expr, ctx) { expr } -# This function recurses through expr and wraps each call to median() with a +# This function recurses through expr and wraps each call to quantile() with a # call to arrow_list_element() -wrap_median <- function(expr, hash) { +wrap_quantile <- function(expr, hash) { if (length(expr) == 1) { return(expr) } else { - if (is.call(expr) && expr[[1]] == quote(median)) { + if (is.call(expr) && expr[[1]] == quote(quantile)) { if (hash) { return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)"))) } else { return(expr) } } else { - return(as.call(lapply(expr, wrap_median, hash))) + return(as.call(lapply(expr, wrap_quantile, hash))) } } } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 830d164376b..6b8a6f17b53 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -199,6 +199,15 @@ std::shared_ptr make_compute_options( return out; } + if (func_name == "approximate_median" || func_name == "hash_approximate_median") { + using Options = arrow::compute::TDigestOptions; + auto out = std::make_shared(Options::Defaults()); + if (!Rf_isNull(options["skip_nulls"])) { + out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); + } + return out; + } + if (func_name == "count") { using Options = arrow::compute::CountOptions; auto out = std::make_shared(Options::Defaults()); From f4bbfd21d5fc88eaff19a5931fd1f5aebb33c6d2 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 21 Sep 2021 18:03:09 -0400 Subject: [PATCH 13/19] Fix approx_median options --- r/src/compute.cpp | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 6b8a6f17b53..952a7cc3a9a 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -172,10 +172,11 @@ std::shared_ptr make_compute_options( } if (func_name == "all" || func_name == "hash_all" || func_name == "any" || - func_name == "hash_any" || func_name == "mean" || func_name == "hash_mean" || - func_name == "min_max" || func_name == "hash_min_max" || func_name == "min" || - func_name == "hash_min" || func_name == "max" || func_name == "hash_max" || - func_name == "sum" || func_name == "hash_sum") { + func_name == "hash_any" || func_name == "approximate_median" || + func_name == "hash_approximate_median" || func_name == "mean" || + func_name == "hash_mean" || func_name == "min_max" || func_name == "hash_min_max" || + func_name == "min" || func_name == "hash_min" || func_name == "max" || + func_name == "hash_max" || func_name == "sum" || func_name == "hash_sum") { using Options = arrow::compute::ScalarAggregateOptions; auto out = std::make_shared(Options::Defaults()); if (!Rf_isNull(options["min_count"])) { @@ -199,15 +200,6 @@ std::shared_ptr make_compute_options( return out; } - if (func_name == "approximate_median" || func_name == "hash_approximate_median") { - using Options = arrow::compute::TDigestOptions; - auto out = std::make_shared(Options::Defaults()); - if (!Rf_isNull(options["skip_nulls"])) { - out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); - } - return out; - } - if (func_name == "count") { using Options = arrow::compute::CountOptions; auto out = std::make_shared(Options::Defaults()); From 9e5858d234024084ae3dd705a260981c82be3383 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 21 Sep 2021 18:41:20 -0400 Subject: [PATCH 14/19] Fix tests --- r/tests/testthat/test-dplyr-summarize.R | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 28e322150cc..ef3019dcd58 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -238,15 +238,11 @@ test_that("median()", { group_by(some_grouping) %>% summarize( med_dbl = median(dbl), - med_int = median(int), + med_int = as.double(median(int)), med_dbl_narmf = median(dbl, FALSE), - # styler: off - med_int_narmf = median(int, na.rm = F), - # styler: on + med_int_narmf = as.double(median(int, na.rm = FALSE)), med_dbl_narmt = median(dbl, na.rm = TRUE), - # styler: off - med_int_narmt = median(int, T) - # styler: on + med_int_narmt = as.double(median(int, TRUE)) ) %>% arrange(some_grouping) %>% collect(), @@ -258,26 +254,20 @@ test_that("median()", { input %>% summarize( med_dbl_narmt = median(dbl, na.rm = TRUE), - # styler: off - med_int_narmt = median(int, T) - # styler: on + med_int_narmt = as.double(median(int, TRUE)) ) %>% collect(), tbl, warning = "median\\(\\) currently returns an approximate median in Arrow" ) - - skip("Error on median(, na.rm = FALSE) with no groups (ARROW-14050)") # without groups, with na.rm = FALSE (the default) expect_dplyr_equal( input %>% summarize( med_dbl = median(dbl), - med_int = median(int), - # styler: off - med_dbl_narmf = median(dbl, F), - # styler: on - med_int_narmf = median(int, na.rm = FALSE) + med_int = as.double(median(int)), + med_dbl_narmf = median(dbl, FALSE), + med_int_narmf = as.double(median(int, na.rm = FALSE)) ) %>% collect(), tbl, From ccfec557c2d01e60ced261149039c012254d2cf0 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 21 Sep 2021 18:41:32 -0400 Subject: [PATCH 15/19] Add quantile() tests --- r/tests/testthat/test-dplyr-summarize.R | 86 ++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index ef3019dcd58..97ea5196330 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -229,8 +229,6 @@ test_that("Group by n_distinct() on dataset", { ) }) -<<<<<<< HEAD -======= test_that("median()", { # with groups expect_dplyr_equal( @@ -275,7 +273,89 @@ test_that("median()", { ) }) ->>>>>>> b5fffd2e2 (Fix merge error) +test_that("quantile()", { + # The default S3 method for stats::quantile() method throws an error when + # numeric input contains NA or NaN and na.rm is not TRUE. So the tests below + # only compare R and Arrow output on data with NAs or NaNs with na.rm = TRUE. + + # Because the default S3 method defaults to names = TRUE, the Arrow binding + # does not accept a names argument, and the presence of the names causes + # expect_equal() to fail, we don't use expect_dplyr_equal() in the tests + # below. + + # The tests below all use probs = 0.5 because other values cause differences + # between the exact quantiles returned by R and the approximate quantiles + # returned by Arrow. + + # with groups + expect_warning( + expect_equal( + tbl %>% + group_by(some_grouping) %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE), + q_int = quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE) + ) %>% + arrange(some_grouping), + Table$create(tbl) %>% + group_by(some_grouping) %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE), + q_int = quantile(int, probs = 0.5, na.rm = TRUE) + ) %>% + arrange(some_grouping) %>% + collect() + ), + "quantile() currently returns an approximate quantile in Arrow", + fixed = TRUE + ) + + # without groups + expect_warning( + expect_equal( + tbl %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE), + q_int = quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE) + ), + Table$create(tbl) %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE), + q_int = quantile(int, probs = 0.5, na.rm = TRUE) + ) %>% + collect() + ), + "quantile() currently returns an approximate quantile in Arrow", + fixed = TRUE + ) + + # with missing values and na.rm = FALSE + expect_warning( + expect_equal( + tibble( + q_dbl = NA_real_, + q_int = NA_real_ + ), + Table$create(tbl) %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = FALSE), + q_int = quantile(int, probs = 0.5, na.rm = FALSE) + ) %>% + collect() + ), + "quantile() currently returns an approximate quantile in Arrow", + fixed = TRUE + ) + + # with a vector of 2+ probs + expect_warning( + Table$create(tbl) %>% + summarize(q = quantile(dbl, probs = c(0.2, 0.8), na.rm = FALSE)), + "quantile() with length(probs) != 1 not supported by Arrow", + fixed = TRUE + ) +}) + test_that("summarize() with min() and max()", { expect_dplyr_equal( input %>% From 10f0bddefd07480311fc6f7fbf49dd58137213cc Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 21 Sep 2021 19:01:30 -0400 Subject: [PATCH 16/19] Improve AST modifying function and add comment --- r/R/dplyr-summarize.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index a1dc514bc97..6158026603f 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -166,8 +166,12 @@ summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) { return() } - if ("quantile" %in% funs_in_expr) { - expr <- wrap_quantile(expr, hash) + # For the quantile() binding in the hash aggregation case, we need to mutate + # the list output from the Arrow hash_tdigest kernel to flatten it into a + # column of type float64. We do that by modifying the unevaluated expression + # to replace quantile(...) with arrow_list_element(quantile(...), 0L) + if (hash && "quantile" %in% funs_in_expr) { + expr <- wrap_hash_quantile(expr) funs_in_expr <- all_funs(expr) } @@ -264,18 +268,14 @@ extract_aggregations <- function(expr, ctx) { # This function recurses through expr and wraps each call to quantile() with a # call to arrow_list_element() -wrap_quantile <- function(expr, hash) { +wrap_hash_quantile <- function(expr) { if (length(expr) == 1) { return(expr) } else { if (is.call(expr) && expr[[1]] == quote(quantile)) { - if (hash) { - return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)"))) - } else { - return(expr) - } + return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)"))) } else { - return(as.call(lapply(expr, wrap_quantile, hash))) + return(as.call(lapply(expr, wrap_hash_quantile))) } } } From a837785f5a2d9e67935ffba884603c20b42accda Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Wed, 22 Sep 2021 11:10:56 -0400 Subject: [PATCH 17/19] Improve tests and comments in tests --- r/tests/testthat/test-dplyr-summarize.R | 44 +++++++++++++++++-------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 97ea5196330..39e60ddd3ca 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -230,6 +230,11 @@ test_that("Group by n_distinct() on dataset", { }) test_that("median()", { + # When medians are integer-valued, stats::median() sometimes returns output of + # type integer, whereas whereas the Arrow approx_median kernels always return + # output of type float64. The calls to median(int, ...) in the tests below + # are enclosed in as.double() to work around this known difference. + # with groups expect_dplyr_equal( input %>% @@ -274,19 +279,28 @@ test_that("median()", { }) test_that("quantile()", { - # The default S3 method for stats::quantile() method throws an error when - # numeric input contains NA or NaN and na.rm is not TRUE. So the tests below - # only compare R and Arrow output on data with NAs or NaNs with na.rm = TRUE. - - # Because the default S3 method defaults to names = TRUE, the Arrow binding - # does not accept a names argument, and the presence of the names causes - # expect_equal() to fail, we don't use expect_dplyr_equal() in the tests - # below. + # The default method for stats::quantile() throws an error when na.rm = FALSE + # and the input contains NA or NaN, whereas the Arrow tdigest kernels return + # null in this situation. To work around this known difference, the tests + # below always use na.rm = TRUE when the data contains NA or NaN. + + # The default method for stats::quantile() has an argument `names` that + # controls whether the result has a names attribute. It defaults to + # names = TRUE. With Arrow, it is not possible to give the result a names + # attribute, so the quantile() binding in Arrow does not accept a `names` + # argument. Differences in this names attribute cause expect_dplyr_equal() to + # report that the objects are not equal, so we do not use expect_dplyr_equal() + # in the tests below. # The tests below all use probs = 0.5 because other values cause differences # between the exact quantiles returned by R and the approximate quantiles # returned by Arrow. + # When quantiles are integer-valued, stats::quantile() sometimes returns + # output of type integer, whereas whereas the Arrow tdigest kernels always + # return output of type float64. The calls to quantile(int, ...) in the tests + # below are enclosed in as.double() to work around this known difference. + # with groups expect_warning( expect_equal( @@ -294,14 +308,16 @@ test_that("quantile()", { group_by(some_grouping) %>% summarize( q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE), - q_int = quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE) + q_int = as.double( + quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE) + ) ) %>% arrange(some_grouping), Table$create(tbl) %>% group_by(some_grouping) %>% summarize( q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE), - q_int = quantile(int, probs = 0.5, na.rm = TRUE) + q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE)) ) %>% arrange(some_grouping) %>% collect() @@ -316,12 +332,14 @@ test_that("quantile()", { tbl %>% summarize( q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE), - q_int = quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE) + q_int = as.double( + quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE) + ) ), Table$create(tbl) %>% summarize( q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE), - q_int = quantile(int, probs = 0.5, na.rm = TRUE) + q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE)) ) %>% collect() ), @@ -339,7 +357,7 @@ test_that("quantile()", { Table$create(tbl) %>% summarize( q_dbl = quantile(dbl, probs = 0.5, na.rm = FALSE), - q_int = quantile(int, probs = 0.5, na.rm = FALSE) + q_int = as.double(quantile(int, probs = 0.5, na.rm = FALSE)) ) %>% collect() ), From a3402a59d9e5d067e627df1fbfb5ddf6a01866d9 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 24 Sep 2021 11:04:49 -0400 Subject: [PATCH 18/19] Fix bad merge --- r/tests/testthat/test-dplyr-summarize.R | 54 ------------------------- 1 file changed, 54 deletions(-) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 39e60ddd3ca..8739c70bbda 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -499,60 +499,6 @@ test_that("summarise() with !!sym()", { ) }) -test_that("median()", { - # with groups - expect_dplyr_equal( - input %>% - group_by(some_grouping) %>% - summarize( - med_dbl = median(dbl), - med_int = median(int), - med_dbl_narmf = median(dbl, FALSE), - # styler: off - med_int_narmf = median(int, na.rm = F), - # styler: on - med_dbl_narmt = median(dbl, na.rm = TRUE), - # styler: off - med_int_narmt = median(int, T) - # styler: on - ) %>% - arrange(some_grouping) %>% - collect(), - tbl, - warning = "median\\(\\) currently returns an approximate median in Arrow" - ) - # without groups, with na.rm = TRUE - expect_dplyr_equal( - input %>% - summarize( - med_dbl_narmt = median(dbl, na.rm = TRUE), - # styler: off - med_int_narmt = median(int, T) - # styler: on - ) %>% - collect(), - tbl, - warning = "median\\(\\) currently returns an approximate median in Arrow" - ) - - skip("Error on median(, na.rm = FALSE) with no groups (ARROW-14050)") - # without groups, with na.rm = FALSE (the default) - expect_dplyr_equal( - input %>% - summarize( - med_dbl = median(dbl), - med_int = median(int), - # styler: off - med_dbl_narmf = median(dbl, F), - # styler: on - med_int_narmf = median(int, na.rm = FALSE) - ) %>% - collect(), - tbl, - warning = "median\\(\\) currently returns an approximate median in Arrow" - ) -}) - test_that("Filter and aggregate", { expect_dplyr_equal( input %>% From 950769da7cf6d7619ba3e07563e3c49734123505 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 24 Sep 2021 11:56:22 -0400 Subject: [PATCH 19/19] Update NEWS.md --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 381b6334909..bbefb33bbc0 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -23,7 +23,7 @@ There are now two ways to query Arrow data: ## 1. Grouped aggregation in Arrow -`dplyr::summarize()`, both grouped and ungrouped, is now implemented for Arrow Datasets, Tables, and RecordBatches. Because data is scanned in chunks, you can aggregate over larger-than-memory datasets backed by many files. Supported aggregation functions include `n()`, `n_distinct()`, `sum()`, `mean()`, `var()`, `sd()`, `any()`, and `all()`. +`dplyr::summarize()`, both grouped and ungrouped, is now implemented for Arrow Datasets, Tables, and RecordBatches. Because data is scanned in chunks, you can aggregate over larger-than-memory datasets backed by many files. Supported aggregation functions include `n()`, `n_distinct()`, `min(),` `max()`, `sum()`, `mean()`, `var()`, `sd()`, `any()`, and `all()`. `median()` and `quantile()` with one probability are also supported and currently return approximate results using the t-digest algorithm. This enhancement does change the behavior of `summarize()` and `collect()` in some cases: see "Breaking changes" below for details.