From 464ff2a4f509e06e86d425c61644e8d7b0affdb4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 18 Aug 2021 13:22:11 +0100 Subject: [PATCH 01/24] Add unit tests for mean, sd, and var --- r/tests/testthat/test-dplyr.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index ed03c58a884..d3a64da1361 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1492,3 +1492,23 @@ test_that("coalesce()", { fixed = TRUE ) }) + +test_that("mean, sd, and var", { + expect_dplyr_equal( + input %>% + mutate(agg = mean(int)), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate(agg = sd(int)), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate(agg = var(int)), + tbl + ) +}) From 5f945439fa597663b829338256669debca186502 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 18 Aug 2021 17:10:43 +0100 Subject: [PATCH 02/24] move tests --- r/tests/testthat/test-dplyr-aggregate.R | 49 +++++++++++++++++++++++++ r/tests/testthat/test-dplyr.R | 20 ---------- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index ef2929ee65b..a3bf21e64af 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -46,8 +46,57 @@ test_that("summarize", { tbl, warning = TRUE ) + + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(mean_int = mean(int)), + tbl, + warning = TRUE + ) + + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(sd_int = sd(int)), + tbl, + warning = TRUE + ) + + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(var_int = var(int)), + tbl, + warning = TRUE + ) +}) + + +test_that("mean, sd, and var", { + expect_dplyr_equal( + input %>% + mutate(agg = mean(dbl)), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate(agg = sd(dbl)), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate(agg = var(dbl)), + tbl + ) }) + test_that("Can aggregate in Arrow", { expect_dplyr_equal( input %>% diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index d3a64da1361..ed03c58a884 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1492,23 +1492,3 @@ test_that("coalesce()", { fixed = TRUE ) }) - -test_that("mean, sd, and var", { - expect_dplyr_equal( - input %>% - mutate(agg = mean(int)), - tbl - ) - - expect_dplyr_equal( - input %>% - mutate(agg = sd(int)), - tbl - ) - - expect_dplyr_equal( - input %>% - mutate(agg = var(int)), - tbl - ) -}) From 1419836b5af5130670653de83eaa4c94ee79390e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 18 Aug 2021 17:11:00 +0100 Subject: [PATCH 03/24] implement mean/sd/var --- r/R/dplyr-functions.R | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index efba9f287f9..df2d5f31d60 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -808,3 +808,32 @@ agg_funcs$all <- function(x, na.rm = FALSE) { options = list(na.rm = na.rm, na.min_count = 0L) ) } + +agg_funcs$mean <- function(x, na.rm = FALSE) { + list( + fun = "mean", + data = x, + options = arrow_na_rm(na.rm = na.rm) + ) +} +agg_funcs$sd <- function(x, na.rm = FALSE) { + list( + fun = "stddev", + data = x, + options = arrow_na_rm(na.rm = na.rm) + ) +} +agg_funcs$var <- function(x, na.rm = FALSE) { + list( + fun = "variance", + data = x, + options = arrow_na_rm(na.rm = na.rm) + ) +} +arrow_na_rm <- function(na.rm) { + if (!isTRUE(na.rm)) { + # TODO: ARROW-13497 + arrow_not_supported(paste("na.rm =", na.rm)) + } + list(na.rm = na.rm, na.min_count = 0L) +} From 37a2392bb8dd95c51896ba49580f1a6834059f90 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:05:14 +0100 Subject: [PATCH 04/24] Add tests for var --- r/tests/testthat/test-dplyr-aggregate.R | 92 +++++++++++++++---------- 1 file changed, 57 insertions(+), 35 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index a3bf21e64af..b8b86c83c00 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -47,100 +47,122 @@ test_that("summarize", { warning = TRUE ) +}) + +test_that("Can aggregate in Arrow", { expect_dplyr_equal( input %>% - select(int, chr) %>% - filter(int > 5) %>% - summarize(mean_int = mean(int)), - tbl, - warning = TRUE - ) - - expect_dplyr_equal( - input %>% - select(int, chr) %>% - filter(int > 5) %>% - summarize(sd_int = sd(int)), - tbl, - warning = TRUE + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl ) - expect_dplyr_equal( input %>% - select(int, chr) %>% - filter(int > 5) %>% - summarize(var_int = var(int)), + summarize(total = sum(int)) %>% + collect(), tbl, + # ARROW-13497: This is failing because the default is na.rm = FALSE warning = TRUE ) }) - -test_that("mean, sd, and var", { +test_that("Group by sum on dataset", { expect_dplyr_equal( input %>% - mutate(agg = mean(dbl)), + group_by(some_grouping) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), tbl ) - + expect_dplyr_equal( input %>% - mutate(agg = sd(dbl)), + group_by(some_grouping) %>% + summarize(total = sum(int * 4, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), tbl ) - + expect_dplyr_equal( input %>% - mutate(agg = var(dbl)), - tbl + group_by(some_grouping) %>% + summarize(total = sum(int)) %>% + arrange(some_grouping) %>% + collect(), + tbl, + # ARROW-13497: This is failing because the default is na.rm = FALSE + warning = TRUE ) }) - -test_that("Can aggregate in Arrow", { +test_that("Group by mean on dataset", { expect_dplyr_equal( input %>% - summarize(total = sum(int, na.rm = TRUE)) %>% + group_by(some_grouping) %>% + summarize(mean = mean(int, na.rm = TRUE)) %>% + arrange(some_grouping) %>% collect(), tbl ) + expect_dplyr_equal( input %>% - summarize(total = sum(int)) %>% + group_by(some_grouping) %>% + summarize(mean = mean(int, na.rm = FALSE)) %>% + arrange(some_grouping) %>% collect(), tbl ) + }) -test_that("Group by sum on dataset", { +test_that("Group by sd on dataset", { expect_dplyr_equal( input %>% group_by(some_grouping) %>% - summarize(total = sum(int, na.rm = TRUE)) %>% + summarize(sd = sd(int, na.rm = TRUE)) %>% arrange(some_grouping) %>% collect(), tbl ) + + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(sd = sd(int, na.rm = FALSE)) %>% + arrange(some_grouping) %>% + collect(), + tbl, + # ARROW-13497: This is failing because the default is na.rm = FALSE + warning = TRUE + ) + +}) +test_that("Group by var on dataset", { expect_dplyr_equal( input %>% group_by(some_grouping) %>% - summarize(total = sum(int * 4, na.rm = TRUE)) %>% + summarize(var = var(int, na.rm = TRUE)) %>% arrange(some_grouping) %>% collect(), tbl ) - + expect_dplyr_equal( input %>% group_by(some_grouping) %>% - summarize(total = sum(int)) %>% + summarize(var = var(int, na.rm = FALSE)) %>% arrange(some_grouping) %>% collect(), tbl ) + }) + test_that("Group by any/all", { withr::local_options(list(arrow.debug = TRUE)) From 32a560c0f85e861134b654936e75485848fe1ec2 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:06:03 +0100 Subject: [PATCH 05/24] Add ddof option to sd and var --- r/R/dplyr-functions.R | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index df2d5f31d60..ce2793ce0f4 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -551,6 +551,15 @@ nse_funcs$pmax <- function(..., na.rm = FALSE) { ) } +nse_funcs$mean <- function(..., na.rm = FALSE) { + build_expr( + "mean", + ..., + options = list(na.rm = na.rm) + ) +} + + nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") { assert_that(is_integerish(width)) side <- match.arg(side) @@ -816,18 +825,18 @@ agg_funcs$mean <- function(x, na.rm = FALSE) { options = arrow_na_rm(na.rm = na.rm) ) } -agg_funcs$sd <- function(x, na.rm = FALSE) { +agg_funcs$sd <- function(x, na.rm = FALSE, ddof = 1) { list( fun = "stddev", data = x, - options = arrow_na_rm(na.rm = na.rm) + options = list(ddof = ddof) ) } -agg_funcs$var <- function(x, na.rm = FALSE) { +agg_funcs$var <- function(x, y = NULL, na.rm = FALSE, ddof = 1) { list( fun = "variance", data = x, - options = arrow_na_rm(na.rm = na.rm) + options = list(ddof = ddof) ) } arrow_na_rm <- function(na.rm) { From a06a7c6d8db3d0b8b530d4dff6affdd1c5daf2f6 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:18:24 +0100 Subject: [PATCH 06/24] Remove accidental function --- r/R/dplyr-functions.R | 9 --------- 1 file changed, 9 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index ce2793ce0f4..6331bf46df1 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -551,15 +551,6 @@ nse_funcs$pmax <- function(..., na.rm = FALSE) { ) } -nse_funcs$mean <- function(..., na.rm = FALSE) { - build_expr( - "mean", - ..., - options = list(na.rm = na.rm) - ) -} - - nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") { assert_that(is_integerish(width)) side <- match.arg(side) From f1de569847143078c2eda51794f5986126f73676 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:25:24 +0100 Subject: [PATCH 07/24] Remove things diff to upstream --- r/tests/testthat/test-dplyr-aggregate.R | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index b8b86c83c00..34ce9165a9b 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -60,9 +60,7 @@ test_that("Can aggregate in Arrow", { input %>% summarize(total = sum(int)) %>% collect(), - tbl, - # ARROW-13497: This is failing because the default is na.rm = FALSE - warning = TRUE + tbl ) }) @@ -92,8 +90,6 @@ test_that("Group by sum on dataset", { arrange(some_grouping) %>% collect(), tbl, - # ARROW-13497: This is failing because the default is na.rm = FALSE - warning = TRUE ) }) @@ -134,11 +130,8 @@ test_that("Group by sd on dataset", { summarize(sd = sd(int, na.rm = FALSE)) %>% arrange(some_grouping) %>% collect(), - tbl, - # ARROW-13497: This is failing because the default is na.rm = FALSE - warning = TRUE + tbl ) - }) test_that("Group by var on dataset", { From 2904a454b1baded2bd75edf6c3a65bf261a50f3b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:26:06 +0100 Subject: [PATCH 08/24] Diff to master --- r/R/dplyr-functions.R | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 6331bf46df1..289e3395aa1 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -829,11 +829,4 @@ agg_funcs$var <- function(x, y = NULL, na.rm = FALSE, ddof = 1) { data = x, options = list(ddof = ddof) ) -} -arrow_na_rm <- function(na.rm) { - if (!isTRUE(na.rm)) { - # TODO: ARROW-13497 - arrow_not_supported(paste("na.rm =", na.rm)) - } - list(na.rm = na.rm, na.min_count = 0L) -} +} \ No newline at end of file From 3e47166f96955e319a6ad20b6d43ca3dc5f23809 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:27:17 +0100 Subject: [PATCH 09/24] Add newline at end of file --- r/R/dplyr-functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 289e3395aa1..5aadeef998b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -829,4 +829,4 @@ agg_funcs$var <- function(x, y = NULL, na.rm = FALSE, ddof = 1) { data = x, options = list(ddof = ddof) ) -} \ No newline at end of file +} From 5978ae4cf6495717f4358e6752aaa2394e430c4a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 18 Aug 2021 13:22:11 +0100 Subject: [PATCH 10/24] Add unit tests for mean, sd, and var --- r/tests/testthat/test-dplyr.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index ed03c58a884..d3a64da1361 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1492,3 +1492,23 @@ test_that("coalesce()", { fixed = TRUE ) }) + +test_that("mean, sd, and var", { + expect_dplyr_equal( + input %>% + mutate(agg = mean(int)), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate(agg = sd(int)), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate(agg = var(int)), + tbl + ) +}) From aca2c19cb7d7de2817c2436130b89b0cbf69348a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 18 Aug 2021 17:10:43 +0100 Subject: [PATCH 11/24] move tests --- r/tests/testthat/test-dplyr-aggregate.R | 49 +++++++++++++++++++++++++ r/tests/testthat/test-dplyr.R | 20 ---------- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 25cd0ccabfb..fc94a796e8b 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -46,8 +46,57 @@ test_that("summarize", { tbl, warning = TRUE ) + + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(mean_int = mean(int)), + tbl, + warning = TRUE + ) + + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(sd_int = sd(int)), + tbl, + warning = TRUE + ) + + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(var_int = var(int)), + tbl, + warning = TRUE + ) +}) + + +test_that("mean, sd, and var", { + expect_dplyr_equal( + input %>% + mutate(agg = mean(dbl)), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate(agg = sd(dbl)), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate(agg = var(dbl)), + tbl + ) }) + test_that("Can aggregate in Arrow", { expect_dplyr_equal( input %>% diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index d3a64da1361..ed03c58a884 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1492,23 +1492,3 @@ test_that("coalesce()", { fixed = TRUE ) }) - -test_that("mean, sd, and var", { - expect_dplyr_equal( - input %>% - mutate(agg = mean(int)), - tbl - ) - - expect_dplyr_equal( - input %>% - mutate(agg = sd(int)), - tbl - ) - - expect_dplyr_equal( - input %>% - mutate(agg = var(int)), - tbl - ) -}) From ad34a25534a50411315a3aadede339f99810c8e9 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 18 Aug 2021 17:11:00 +0100 Subject: [PATCH 12/24] implement mean/sd/var --- r/R/dplyr-functions.R | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index efba9f287f9..df2d5f31d60 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -808,3 +808,32 @@ agg_funcs$all <- function(x, na.rm = FALSE) { options = list(na.rm = na.rm, na.min_count = 0L) ) } + +agg_funcs$mean <- function(x, na.rm = FALSE) { + list( + fun = "mean", + data = x, + options = arrow_na_rm(na.rm = na.rm) + ) +} +agg_funcs$sd <- function(x, na.rm = FALSE) { + list( + fun = "stddev", + data = x, + options = arrow_na_rm(na.rm = na.rm) + ) +} +agg_funcs$var <- function(x, na.rm = FALSE) { + list( + fun = "variance", + data = x, + options = arrow_na_rm(na.rm = na.rm) + ) +} +arrow_na_rm <- function(na.rm) { + if (!isTRUE(na.rm)) { + # TODO: ARROW-13497 + arrow_not_supported(paste("na.rm =", na.rm)) + } + list(na.rm = na.rm, na.min_count = 0L) +} From 684b0ad22f5943723bf55e3d142dd39b299d9715 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:05:14 +0100 Subject: [PATCH 13/24] Add tests for var --- r/tests/testthat/test-dplyr-aggregate.R | 92 +++++++++++++++---------- 1 file changed, 57 insertions(+), 35 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index fc94a796e8b..873f6da81d7 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -47,100 +47,122 @@ test_that("summarize", { warning = TRUE ) +}) + +test_that("Can aggregate in Arrow", { expect_dplyr_equal( input %>% - select(int, chr) %>% - filter(int > 5) %>% - summarize(mean_int = mean(int)), - tbl, - warning = TRUE - ) - - expect_dplyr_equal( - input %>% - select(int, chr) %>% - filter(int > 5) %>% - summarize(sd_int = sd(int)), - tbl, - warning = TRUE + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl ) - expect_dplyr_equal( input %>% - select(int, chr) %>% - filter(int > 5) %>% - summarize(var_int = var(int)), + summarize(total = sum(int)) %>% + collect(), tbl, + # ARROW-13497: This is failing because the default is na.rm = FALSE warning = TRUE ) }) - -test_that("mean, sd, and var", { +test_that("Group by sum on dataset", { expect_dplyr_equal( input %>% - mutate(agg = mean(dbl)), + group_by(some_grouping) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), tbl ) - + expect_dplyr_equal( input %>% - mutate(agg = sd(dbl)), + group_by(some_grouping) %>% + summarize(total = sum(int * 4, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), tbl ) - + expect_dplyr_equal( input %>% - mutate(agg = var(dbl)), - tbl + group_by(some_grouping) %>% + summarize(total = sum(int)) %>% + arrange(some_grouping) %>% + collect(), + tbl, + # ARROW-13497: This is failing because the default is na.rm = FALSE + warning = TRUE ) }) - -test_that("Can aggregate in Arrow", { +test_that("Group by mean on dataset", { expect_dplyr_equal( input %>% - summarize(total = sum(int, na.rm = TRUE)) %>% + group_by(some_grouping) %>% + summarize(mean = mean(int, na.rm = TRUE)) %>% + arrange(some_grouping) %>% collect(), tbl ) + expect_dplyr_equal( input %>% - summarize(total = sum(int)) %>% + group_by(some_grouping) %>% + summarize(mean = mean(int, na.rm = FALSE)) %>% + arrange(some_grouping) %>% collect(), tbl ) + }) -test_that("Group by sum on dataset", { +test_that("Group by sd on dataset", { expect_dplyr_equal( input %>% group_by(some_grouping) %>% - summarize(total = sum(int, na.rm = TRUE)) %>% + summarize(sd = sd(int, na.rm = TRUE)) %>% arrange(some_grouping) %>% collect(), tbl ) + + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(sd = sd(int, na.rm = FALSE)) %>% + arrange(some_grouping) %>% + collect(), + tbl, + # ARROW-13497: This is failing because the default is na.rm = FALSE + warning = TRUE + ) + +}) +test_that("Group by var on dataset", { expect_dplyr_equal( input %>% group_by(some_grouping) %>% - summarize(total = sum(int * 4, na.rm = TRUE)) %>% + summarize(var = var(int, na.rm = TRUE)) %>% arrange(some_grouping) %>% collect(), tbl ) - + expect_dplyr_equal( input %>% group_by(some_grouping) %>% - summarize(total = sum(int)) %>% + summarize(var = var(int, na.rm = FALSE)) %>% arrange(some_grouping) %>% collect(), tbl ) + }) + test_that("Group by any/all", { withr::local_options(list(arrow.debug = TRUE)) From aacadfa3912440a961ca4dc6b507f27525cb77b9 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:06:03 +0100 Subject: [PATCH 14/24] Add ddof option to sd and var --- r/R/dplyr-functions.R | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index df2d5f31d60..ce2793ce0f4 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -551,6 +551,15 @@ nse_funcs$pmax <- function(..., na.rm = FALSE) { ) } +nse_funcs$mean <- function(..., na.rm = FALSE) { + build_expr( + "mean", + ..., + options = list(na.rm = na.rm) + ) +} + + nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") { assert_that(is_integerish(width)) side <- match.arg(side) @@ -816,18 +825,18 @@ agg_funcs$mean <- function(x, na.rm = FALSE) { options = arrow_na_rm(na.rm = na.rm) ) } -agg_funcs$sd <- function(x, na.rm = FALSE) { +agg_funcs$sd <- function(x, na.rm = FALSE, ddof = 1) { list( fun = "stddev", data = x, - options = arrow_na_rm(na.rm = na.rm) + options = list(ddof = ddof) ) } -agg_funcs$var <- function(x, na.rm = FALSE) { +agg_funcs$var <- function(x, y = NULL, na.rm = FALSE, ddof = 1) { list( fun = "variance", data = x, - options = arrow_na_rm(na.rm = na.rm) + options = list(ddof = ddof) ) } arrow_na_rm <- function(na.rm) { From 79cf5009a7b8277721818075ef6d1b47d3b6f986 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:18:24 +0100 Subject: [PATCH 15/24] Remove accidental function --- r/R/dplyr-functions.R | 9 --------- 1 file changed, 9 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index ce2793ce0f4..6331bf46df1 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -551,15 +551,6 @@ nse_funcs$pmax <- function(..., na.rm = FALSE) { ) } -nse_funcs$mean <- function(..., na.rm = FALSE) { - build_expr( - "mean", - ..., - options = list(na.rm = na.rm) - ) -} - - nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") { assert_that(is_integerish(width)) side <- match.arg(side) From a0dde5bd9ffa9df08fb1287ecaffe9eaadbe7df1 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:25:24 +0100 Subject: [PATCH 16/24] Remove things diff to upstream --- r/tests/testthat/test-dplyr-aggregate.R | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 873f6da81d7..fbae8a6b583 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -60,9 +60,7 @@ test_that("Can aggregate in Arrow", { input %>% summarize(total = sum(int)) %>% collect(), - tbl, - # ARROW-13497: This is failing because the default is na.rm = FALSE - warning = TRUE + tbl ) }) @@ -92,8 +90,6 @@ test_that("Group by sum on dataset", { arrange(some_grouping) %>% collect(), tbl, - # ARROW-13497: This is failing because the default is na.rm = FALSE - warning = TRUE ) }) @@ -134,11 +130,8 @@ test_that("Group by sd on dataset", { summarize(sd = sd(int, na.rm = FALSE)) %>% arrange(some_grouping) %>% collect(), - tbl, - # ARROW-13497: This is failing because the default is na.rm = FALSE - warning = TRUE + tbl ) - }) test_that("Group by var on dataset", { From 7e76e17f883d67758f3147f2e2f854ea67e1ae2c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:26:06 +0100 Subject: [PATCH 17/24] Diff to master --- r/R/dplyr-functions.R | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 6331bf46df1..289e3395aa1 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -829,11 +829,4 @@ agg_funcs$var <- function(x, y = NULL, na.rm = FALSE, ddof = 1) { data = x, options = list(ddof = ddof) ) -} -arrow_na_rm <- function(na.rm) { - if (!isTRUE(na.rm)) { - # TODO: ARROW-13497 - arrow_not_supported(paste("na.rm =", na.rm)) - } - list(na.rm = na.rm, na.min_count = 0L) -} +} \ No newline at end of file From c84ba429f732c3d1d94ad5b88aa51196a9fd39e3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Aug 2021 19:27:17 +0100 Subject: [PATCH 18/24] Add newline at end of file --- r/R/dplyr-functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 289e3395aa1..5aadeef998b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -829,4 +829,4 @@ agg_funcs$var <- function(x, y = NULL, na.rm = FALSE, ddof = 1) { data = x, options = list(ddof = ddof) ) -} \ No newline at end of file +} From dd6698658cddfd99375a19a6b91fb578ea462157 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 25 Aug 2021 10:11:57 +0100 Subject: [PATCH 19/24] Remove unnecessary whitespace --- r/tests/testthat/test-dplyr-aggregate.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index fbae8a6b583..fa3be4cf629 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -46,7 +46,6 @@ test_that("summarize", { tbl, warning = TRUE ) - }) test_that("Can aggregate in Arrow", { From f642f5e6eb60a1113c08011251afe33552964b0c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 25 Aug 2021 10:27:49 +0100 Subject: [PATCH 20/24] Enable VarianceOptions for hash_stddev and hash_variance --- r/src/compute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index b697ecd96a0..9ca40c2fbec 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -350,7 +350,7 @@ std::shared_ptr make_compute_options( step); } - if (func_name == "variance" || func_name == "stddev") { + if (func_name == "variance" || func_name == "stddev" || func_name == "hash_variance" || func_name == "hash_stddev") { using Options = arrow::compute::VarianceOptions; return std::make_shared(cpp11::as_cpp(options["ddof"])); } From 7225a1da9be160a891cf61af5efe1f55f08419d8 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 25 Aug 2021 10:31:24 +0100 Subject: [PATCH 21/24] Skip tests that can't be run yet --- r/tests/testthat/test-dplyr-aggregate.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index fa3be4cf629..b19b3891dfe 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -110,7 +110,6 @@ test_that("Group by mean on dataset", { collect(), tbl ) - }) test_that("Group by sd on dataset", { @@ -123,6 +122,7 @@ test_that("Group by sd on dataset", { tbl ) + skip("ARROW-13691 - na.rm not yet implemented for VarianceOptions") expect_dplyr_equal( input %>% group_by(some_grouping) %>% @@ -143,6 +143,7 @@ test_that("Group by var on dataset", { tbl ) + skip("ARROW-13691 - na.rm not yet implemented for VarianceOptions") expect_dplyr_equal( input %>% group_by(some_grouping) %>% @@ -151,7 +152,6 @@ test_that("Group by var on dataset", { collect(), tbl ) - }) From 5e069dca71e7cc5d0bb77758a3175efbba146bdc Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 25 Aug 2021 10:31:38 +0100 Subject: [PATCH 22/24] Set ddof to 1 --- r/R/dplyr-functions.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 5aadeef998b..88746beaf2d 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -813,20 +813,20 @@ agg_funcs$mean <- function(x, na.rm = FALSE) { list( fun = "mean", data = x, - options = arrow_na_rm(na.rm = na.rm) + options = list(na.rm = na.rm, na.min_count = 0L) ) } -agg_funcs$sd <- function(x, na.rm = FALSE, ddof = 1) { +agg_funcs$sd <- function(x, na.rm = FALSE) { list( fun = "stddev", data = x, - options = list(ddof = ddof) + options = list(ddof = 1) ) } -agg_funcs$var <- function(x, y = NULL, na.rm = FALSE, ddof = 1) { +agg_funcs$var <- function(x, y = NULL, na.rm = FALSE) { list( fun = "variance", data = x, - options = list(ddof = ddof) + options = list(ddof = 1) ) } From d62abeeb875fad3324de5cc24dd7e96849e6def2 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 25 Aug 2021 10:45:07 +0100 Subject: [PATCH 23/24] Restyle cpp --- r/src/compute.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 9ca40c2fbec..dec8b2adfaa 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -350,7 +350,8 @@ std::shared_ptr make_compute_options( step); } - if (func_name == "variance" || func_name == "stddev" || func_name == "hash_variance" || func_name == "hash_stddev") { + if (func_name == "variance" || func_name == "stddev" || func_name == "hash_variance" || + func_name == "hash_stddev") { using Options = arrow::compute::VarianceOptions; return std::make_shared(cpp11::as_cpp(options["ddof"])); } From eab93e605114f327e49cfe64f2274dc960105505 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 25 Aug 2021 17:02:00 +0100 Subject: [PATCH 24/24] Expose ddof to user and add comments about why not passing in na.rm --- r/R/dplyr-functions.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 88746beaf2d..843b71244ba 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -816,17 +816,19 @@ agg_funcs$mean <- function(x, na.rm = FALSE) { options = list(na.rm = na.rm, na.min_count = 0L) ) } -agg_funcs$sd <- function(x, na.rm = FALSE) { +# na.rm not currently passed in due to ARROW-13691 +agg_funcs$sd <- function(x, na.rm = FALSE, ddof = 1) { list( fun = "stddev", data = x, - options = list(ddof = 1) + options = list(ddof = ddof) ) } -agg_funcs$var <- function(x, y = NULL, na.rm = FALSE) { +# na.rm not currently passed in due to ARROW-13691 +agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { list( fun = "variance", data = x, - options = list(ddof = 1) + options = list(ddof = ddof) ) }