From 0f666cdd2e4dade042f2c863e967d100c1516090 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 26 Aug 2021 14:56:27 +0100 Subject: [PATCH 01/14] Add bindings for dplyr::n_distinct --- r/R/dplyr-functions.R | 10 +++++++++- r/src/compute.cpp | 2 +- r/tests/testthat/test-dplyr-aggregate.R | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 843b71244ba..999256908c6 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -808,7 +808,6 @@ agg_funcs$all <- function(x, na.rm = FALSE) { options = list(na.rm = na.rm, na.min_count = 0L) ) } - agg_funcs$mean <- function(x, na.rm = FALSE) { list( fun = "mean", @@ -832,3 +831,12 @@ agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { options = list(ddof = ddof) ) } +agg_funcs$n_distinct <- function(x, na.rm = FALSE) { + list( + fun = "count_distinct", + data = x, + # ARROW-13764 Passing in na.rm = TRUE doesn't actually work yet as + # ScalarAggregateOptions not implemented for count_distinct + options = list(na.rm = na.rm, na.min_count = 0L) + ) +} diff --git a/r/src/compute.cpp b/r/src/compute.cpp index dec8b2adfaa..d5cb3ff8648 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -174,7 +174,7 @@ std::shared_ptr make_compute_options( if (func_name == "min_max" || func_name == "sum" || func_name == "mean" || func_name == "any" || func_name == "all" || func_name == "hash_min_max" || func_name == "hash_sum" || func_name == "hash_mean" || func_name == "hash_any" || - func_name == "hash_all") { + func_name == "hash_all" || func_name == "hash_count_distinct") { using Options = arrow::compute::ScalarAggregateOptions; auto out = std::make_shared(Options::Defaults()); out->min_count = cpp11::as_cpp(options["na.min_count"]); diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index b19b3891dfe..d4b1b2d7664 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -219,6 +219,24 @@ test_that("Group by any/all", { ) }) +test_that("Group by n_distinct() on dataset", { + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(distinct = n_distinct(lgl, na.rm = FALSE)) %>% + collect(), + tbl + ) + skip("ARROW-13764 - na.rm not implemented for compute_distinct") + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(distinct = n_distinct(lgl, na.rm = TRUE)) %>% + collect(), + tbl + ) +}) + test_that("Filter and aggregate", { expect_dplyr_equal( input %>% From d1388a60bb5f9571a9bf0972b0368f55ed1f306b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 26 Aug 2021 17:12:31 +0100 Subject: [PATCH 02/14] Style files --- r/R/dplyr-functions.R | 2 +- r/tests/testthat/test-dplyr-aggregate.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 999256908c6..eb3d342539e 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -835,7 +835,7 @@ agg_funcs$n_distinct <- function(x, na.rm = FALSE) { list( fun = "count_distinct", data = x, - # ARROW-13764 Passing in na.rm = TRUE doesn't actually work yet as + # ARROW-13764 Passing in na.rm = TRUE doesn't actually work yet as # ScalarAggregateOptions not implemented for count_distinct options = list(na.rm = na.rm, na.min_count = 0L) ) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index d4b1b2d7664..ff163349b7a 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -101,7 +101,7 @@ test_that("Group by mean on dataset", { collect(), tbl ) - + expect_dplyr_equal( input %>% group_by(some_grouping) %>% @@ -121,7 +121,7 @@ test_that("Group by sd on dataset", { collect(), tbl ) - + skip("ARROW-13691 - na.rm not yet implemented for VarianceOptions") expect_dplyr_equal( input %>% @@ -142,7 +142,7 @@ test_that("Group by var on dataset", { collect(), tbl ) - + skip("ARROW-13691 - na.rm not yet implemented for VarianceOptions") expect_dplyr_equal( input %>% From 7d363e6685e3c571c065bded5315dd8a674b6eaa Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 26 Aug 2021 17:14:22 +0100 Subject: [PATCH 03/14] Rephrase comment --- r/R/dplyr-functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index eb3d342539e..2ec60c3cbba 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -836,7 +836,7 @@ agg_funcs$n_distinct <- function(x, na.rm = FALSE) { fun = "count_distinct", data = x, # ARROW-13764 Passing in na.rm = TRUE doesn't actually work yet as - # ScalarAggregateOptions not implemented for count_distinct + # CountOptions not yet implemented for count_distinct options = list(na.rm = na.rm, na.min_count = 0L) ) } From bce60d3d0f455d11c82cb9a8dde7896bc62a078b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 26 Aug 2021 17:53:27 +0100 Subject: [PATCH 04/14] Change options for hash_count_distinct and remove extra option --- r/R/dplyr-functions.R | 6 +++++- r/src/compute.cpp | 4 ++-- r/tests/testthat/test-dplyr-aggregate.R | 4 ++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 2ec60c3cbba..177a7b5c802 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -837,6 +837,10 @@ agg_funcs$n_distinct <- function(x, na.rm = FALSE) { data = x, # ARROW-13764 Passing in na.rm = TRUE doesn't actually work yet as # CountOptions not yet implemented for count_distinct - options = list(na.rm = na.rm, na.min_count = 0L) + options = list(na.rm = na.rm) ) } + + + + diff --git a/r/src/compute.cpp b/r/src/compute.cpp index d5cb3ff8648..2d4a96c0b7b 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -174,7 +174,7 @@ std::shared_ptr make_compute_options( if (func_name == "min_max" || func_name == "sum" || func_name == "mean" || func_name == "any" || func_name == "all" || func_name == "hash_min_max" || func_name == "hash_sum" || func_name == "hash_mean" || func_name == "hash_any" || - func_name == "hash_all" || func_name == "hash_count_distinct") { + func_name == "hash_all") { using Options = arrow::compute::ScalarAggregateOptions; auto out = std::make_shared(Options::Defaults()); out->min_count = cpp11::as_cpp(options["na.min_count"]); @@ -182,7 +182,7 @@ std::shared_ptr make_compute_options( return out; } - if (func_name == "count") { + if (func_name == "count" || func_name == "hash_count_distinct") { using Options = arrow::compute::CountOptions; auto out = std::make_shared(Options::Defaults()); out->mode = diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index ff163349b7a..aab2b6c5e23 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -274,3 +274,7 @@ test_that("Filter and aggregate", { tbl ) }) + + + + From 915bfd48b032dd97647ed0330621729c18e08065 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 27 Aug 2021 09:55:21 +0100 Subject: [PATCH 05/14] Remove unnecessary whitespace --- r/R/dplyr-functions.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 177a7b5c802..3a64b4d6775 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -840,7 +840,3 @@ agg_funcs$n_distinct <- function(x, na.rm = FALSE) { options = list(na.rm = na.rm) ) } - - - - From 4b449ff8e65550b74ebd39b9fea350e39122f0c1 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 27 Aug 2021 09:57:49 +0100 Subject: [PATCH 06/14] Improve comment clarity --- r/tests/testthat/test-dplyr-aggregate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index aab2b6c5e23..7a3111a2d31 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -227,7 +227,7 @@ test_that("Group by n_distinct() on dataset", { collect(), tbl ) - skip("ARROW-13764 - na.rm not implemented for compute_distinct") + skip("ARROW-13764 - CountOptions (na.rm) not yet implemented for compute_distinct") expect_dplyr_equal( input %>% group_by(some_grouping) %>% From 5a990b54200576496f7c366b8ea037e02f26a472 Mon Sep 17 00:00:00 2001 From: Nic Date: Fri, 27 Aug 2021 13:42:01 +0000 Subject: [PATCH 07/14] Update r/tests/testthat/test-dplyr-aggregate.R Co-authored-by: Neal Richardson --- r/tests/testthat/test-dplyr-aggregate.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 7a3111a2d31..df450281728 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -274,7 +274,3 @@ test_that("Filter and aggregate", { tbl ) }) - - - - From 6b21886ea20f238709e6feeb4755dfc75b0a07e1 Mon Sep 17 00:00:00 2001 From: Nic Date: Mon, 30 Aug 2021 22:50:59 +0100 Subject: [PATCH 08/14] Missing brace from merge --- r/R/dplyr-functions.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index bf147193aee..cc9ca41137b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -832,6 +832,7 @@ agg_funcs$n_distinct <- function(x, na.rm = FALSE) { # ARROW-13764 Passing in na.rm = TRUE doesn't actually work yet as # CountOptions not yet implemented for count_distinct options = list(na.rm = na.rm) +} agg_funcs$n <- function() { list( From c16cc9fb6dede805f9d740404619698bc3349ece Mon Sep 17 00:00:00 2001 From: Nic Date: Mon, 30 Aug 2021 22:58:57 +0100 Subject: [PATCH 09/14] Missing bracket from merge --- r/R/dplyr-functions.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index cc9ca41137b..9d163182fe9 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -832,6 +832,7 @@ agg_funcs$n_distinct <- function(x, na.rm = FALSE) { # ARROW-13764 Passing in na.rm = TRUE doesn't actually work yet as # CountOptions not yet implemented for count_distinct options = list(na.rm = na.rm) + ) } agg_funcs$n <- function() { From 86a4e2d4af01a5b497e78d9a4e70b936d18cc99b Mon Sep 17 00:00:00 2001 From: Nic Date: Tue, 31 Aug 2021 18:49:02 +0000 Subject: [PATCH 10/14] Update r/R/dplyr-functions.R Co-authored-by: Ian Cook --- r/R/dplyr-functions.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 9d163182fe9..e535546dd1b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -829,8 +829,6 @@ agg_funcs$n_distinct <- function(x, na.rm = FALSE) { list( fun = "count_distinct", data = x, - # ARROW-13764 Passing in na.rm = TRUE doesn't actually work yet as - # CountOptions not yet implemented for count_distinct options = list(na.rm = na.rm) ) } From f182d00d8404baa258ed24882283407495a4eda1 Mon Sep 17 00:00:00 2001 From: Nic Date: Tue, 31 Aug 2021 18:49:19 +0000 Subject: [PATCH 11/14] Update r/tests/testthat/test-dplyr-aggregate.R Co-authored-by: Ian Cook --- r/tests/testthat/test-dplyr-aggregate.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index c1b2ad1635e..3a04b6d2314 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -243,7 +243,6 @@ test_that("Group by n_distinct() on dataset", { collect(), tbl ) - skip("ARROW-13764 - CountOptions (na.rm) not yet implemented for compute_distinct") expect_dplyr_equal( input %>% group_by(some_grouping) %>% From 72ddf595dbf4170901373eb4126d112e889d88e2 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 1 Sep 2021 11:30:12 +0100 Subject: [PATCH 12/14] Use correct hash options --- r/src/compute.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 00f9bd360f5..b9746d19c42 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -186,13 +186,21 @@ std::shared_ptr make_compute_options( return out; } - if (func_name == "count" || func_name == "hash_count_distinct") { + if (func_name == "count") { using Options = arrow::compute::CountOptions; auto out = std::make_shared(Options::Defaults()); out->mode = cpp11::as_cpp(options["na.rm"]) ? Options::ONLY_VALID : Options::ONLY_NULL; return out; } + + if (func_name == "hash_count_distinct") { + using Options = arrow::compute::CountOptions; + auto out = std::make_shared(Options::Defaults()); + out->mode = + cpp11::as_cpp(options["na.rm"]) ? Options::ONLY_VALID : Options::ANY; + return out; + } if (func_name == "min_element_wise" || func_name == "max_element_wise") { using Options = arrow::compute::ElementWiseAggregateOptions; From bf9f4c65005071938db295d55aaa58f3f3e2b387 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 1 Sep 2021 11:30:50 +0100 Subject: [PATCH 13/14] Should be all not any --- r/src/compute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index b9746d19c42..a2a44f55280 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -198,7 +198,7 @@ std::shared_ptr make_compute_options( using Options = arrow::compute::CountOptions; auto out = std::make_shared(Options::Defaults()); out->mode = - cpp11::as_cpp(options["na.rm"]) ? Options::ONLY_VALID : Options::ANY; + cpp11::as_cpp(options["na.rm"]) ? Options::ONLY_VALID : Options::ALL; return out; } From f4d015a9158a4aa561036ecf49765d758cadb45c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 1 Sep 2021 12:01:10 +0100 Subject: [PATCH 14/14] lint --- r/src/compute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index a2a44f55280..788e75a03d2 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -193,7 +193,7 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["na.rm"]) ? Options::ONLY_VALID : Options::ONLY_NULL; return out; } - + if (func_name == "hash_count_distinct") { using Options = arrow::compute::CountOptions; auto out = std::make_shared(Options::Defaults());