From 0510ea0c70c1536d97af714127aed904279a9021 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 16:28:08 +0100 Subject: [PATCH 01/10] Add sd and var dplyr bindings --- r/R/arrow-package.R | 2 +- r/R/dplyr.R | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 51f4987484c..8fa8284dd1b 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail +#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail sd var #' @importFrom R6 R6Class #' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl keep #' @importFrom assertthat assert_that is.string diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 845cb3a1815..a0bf287b310 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -480,6 +480,18 @@ build_function_list <- function(FUN) { between = function(x, left, right) { x >= left & x <= right }, + sd = function(x, na.rm = FALSE){ + if (!na.rm && x$null_count > 0) { + return(Scalar$create(NA_real_)) + } + FUN("stddev", x, options = list(ddof = 0)) + }, + var = function(x, na.rm = FALSE){ + if (!na.rm && x$null_count > 0) { + return(Scalar$create(NA_real_)) + } + FUN("variance", x, options = list(ddof = 0)) + }, # Now also include all available Arrow Compute functions, # namespaced as arrow_fun set_names( From 57fe43ebfdd9b5529cad96443b8cc280f5ffb456 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 16:28:44 +0100 Subject: [PATCH 02/10] Add bindings and tests for stddev and variance --- r/R/compute.R | 19 ++++++++++++++++++ r/src/compute.cpp | 7 ++++++- r/tests/testthat/test-compute-aggregate.R | 24 +++++++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/r/R/compute.R b/r/R/compute.R index 0641bf1615c..06c8987c680 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -267,6 +267,25 @@ value_counts <- function(x) { call_function("value_counts", x) } + +#' `variance` and `stddev` for Arrow objects +#' +#' These functions calculate the variance and standard deviation of Arrow arrays +#' @param x `Array` or `ChunkedArray` +#' @param ddof The divisor used in calculations is N - ddof, where N is the number of elements. +#' By default, ddof is zero, and population variance or stddev is returned. +#' @return A `Scalar` containing the calculated value. +#' @export +stddev <- function(x, ddof = 0) { + call_function("stddev", x, options = list(ddof = ddof)) +} + +#' @rdname stddev +#' @export +variance <- function(x, ddof = 0) { + call_function("variance", x, options = list(ddof = ddof)) +} + #' Cast options #' #' @param safe logical: enforce safe conversion? Default `TRUE` diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 34bc3bea456..5294497ed3b 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -232,7 +232,12 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["replacement"]), max_replacements); } - + + if (func_name == "variance" || func_name == "stddev") { + using Options = arrow::compute::VarianceOptions; + return std::make_shared(cpp11::as_cpp(options["ddof"])); + } + return nullptr; } diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 0621b7779c7..c55db822847 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -381,3 +381,27 @@ test_that("all.Array and all.ChunkedArray", { expect_vector_equal(all(input, na.rm = TRUE), data_logical) }) + +test_that("variance", { + data <- c(1, 4, 3, 1, 1, 3, NA) + arr <- Array$create(data) + chunked_arr <- ChunkedArray$create(data) + + expect_equal(variance(arr), Scalar$create(1.4722222222222223)) + expect_equal(variance(arr, ddof = 5), Scalar$create(8.833333333333334)) + + expect_equal(variance(chunked_arr), Scalar$create(1.4722222222222223)) + expect_equal(variance(chunked_arr, ddof = 5), Scalar$create(8.833333333333334)) +}) + +test_that("stddev", { + data <- c(1, 4, 3, 1, 1, 3, NA) + arr <- Array$create(data) + chunked_arr <- ChunkedArray$create(data) + + expect_equal(stddev(arr), Scalar$create(1.2133516482134197)) + expect_equal(stddev(arr, ddof = 5), Scalar$create(2.972092416687835)) + + expect_equal(stddev(chunked_arr), Scalar$create(1.2133516482134197)) + expect_equal(stddev(chunked_arr, ddof = 5), Scalar$create(2.972092416687835)) +}) \ No newline at end of file From 5a3ad9aed310ff09eb604b789ce5127abcdba14b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 17:52:40 +0100 Subject: [PATCH 03/10] Remove bindings for stddev and variance --- r/R/compute.R | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 06c8987c680..0641bf1615c 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -267,25 +267,6 @@ value_counts <- function(x) { call_function("value_counts", x) } - -#' `variance` and `stddev` for Arrow objects -#' -#' These functions calculate the variance and standard deviation of Arrow arrays -#' @param x `Array` or `ChunkedArray` -#' @param ddof The divisor used in calculations is N - ddof, where N is the number of elements. -#' By default, ddof is zero, and population variance or stddev is returned. -#' @return A `Scalar` containing the calculated value. -#' @export -stddev <- function(x, ddof = 0) { - call_function("stddev", x, options = list(ddof = ddof)) -} - -#' @rdname stddev -#' @export -variance <- function(x, ddof = 0) { - call_function("variance", x, options = list(ddof = ddof)) -} - #' Cast options #' #' @param safe logical: enforce safe conversion? Default `TRUE` From 0b704aec595eb22a4605793097aa989beb0adbbb Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 17:52:59 +0100 Subject: [PATCH 04/10] Make sd fail if in dplyr call --- r/R/dplyr.R | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index a0bf287b310..c172e9ba065 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -480,18 +480,6 @@ build_function_list <- function(FUN) { between = function(x, left, right) { x >= left & x <= right }, - sd = function(x, na.rm = FALSE){ - if (!na.rm && x$null_count > 0) { - return(Scalar$create(NA_real_)) - } - FUN("stddev", x, options = list(ddof = 0)) - }, - var = function(x, na.rm = FALSE){ - if (!na.rm && x$null_count > 0) { - return(Scalar$create(NA_real_)) - } - FUN("variance", x, options = list(ddof = 0)) - }, # Now also include all available Arrow Compute functions, # namespaced as arrow_fun set_names( @@ -644,7 +632,7 @@ arrow_mask <- function(.data) { # Some R functions will still try to evaluate on an Expression # and return NA with a warning fail <- function(...) stop("Not implemented") - for (f in c("mean")) { + for (f in c("mean", "sd")) { f_env[[f]] <- fail } @@ -1018,7 +1006,6 @@ abandon_ship <- function(call, .data, msg = NULL) { stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE) } } - # else, collect and call dplyr method if (!is.null(msg)) { warning(msg, "; pulling data into R", immediate. = TRUE, call. = FALSE) From a5ebe892e83f605087b9e7905542d5a01eac13a6 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 17:53:14 +0100 Subject: [PATCH 05/10] Remove tests for removed bindings --- r/tests/testthat/test-compute-aggregate.R | 24 ----------------------- 1 file changed, 24 deletions(-) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index c55db822847..55855431898 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -380,28 +380,4 @@ test_that("all.Array and all.ChunkedArray", { expect_vector_equal(all(input), data_logical) expect_vector_equal(all(input, na.rm = TRUE), data_logical) -}) - -test_that("variance", { - data <- c(1, 4, 3, 1, 1, 3, NA) - arr <- Array$create(data) - chunked_arr <- ChunkedArray$create(data) - - expect_equal(variance(arr), Scalar$create(1.4722222222222223)) - expect_equal(variance(arr, ddof = 5), Scalar$create(8.833333333333334)) - - expect_equal(variance(chunked_arr), Scalar$create(1.4722222222222223)) - expect_equal(variance(chunked_arr, ddof = 5), Scalar$create(8.833333333333334)) -}) - -test_that("stddev", { - data <- c(1, 4, 3, 1, 1, 3, NA) - arr <- Array$create(data) - chunked_arr <- ChunkedArray$create(data) - - expect_equal(stddev(arr), Scalar$create(1.2133516482134197)) - expect_equal(stddev(arr, ddof = 5), Scalar$create(2.972092416687835)) - - expect_equal(stddev(chunked_arr), Scalar$create(1.2133516482134197)) - expect_equal(stddev(chunked_arr, ddof = 5), Scalar$create(2.972092416687835)) }) \ No newline at end of file From 6604ef0bb655d622be1e709720a168df58eab9e0 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 17:55:42 +0100 Subject: [PATCH 06/10] Remove import of var and sd --- r/R/arrow-package.R | 2 +- r/tests/testthat/test-compute-aggregate.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 8fa8284dd1b..51f4987484c 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail sd var +#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail #' @importFrom R6 R6Class #' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl keep #' @importFrom assertthat assert_that is.string diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 55855431898..0621b7779c7 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -380,4 +380,4 @@ test_that("all.Array and all.ChunkedArray", { expect_vector_equal(all(input), data_logical) expect_vector_equal(all(input, na.rm = TRUE), data_logical) -}) \ No newline at end of file +}) From 0c90152469eb2108e1cc81575e23e6062a153d39 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 30 Apr 2021 18:11:45 +0100 Subject: [PATCH 07/10] Fix linting --- r/src/compute.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 5294497ed3b..c215d661e3a 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -232,12 +232,12 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["replacement"]), max_replacements); } - + if (func_name == "variance" || func_name == "stddev") { using Options = arrow::compute::VarianceOptions; return std::make_shared(cpp11::as_cpp(options["ddof"])); } - + return nullptr; } From df4bc4d6b9ab6e1e0947bd9fc18a88dcb3d87d5e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 May 2021 17:15:40 +0100 Subject: [PATCH 08/10] Add in tests for variance and stddev --- r/tests/testthat/test-compute-aggregate.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 0621b7779c7..56d24a0bf33 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -381,3 +381,22 @@ test_that("all.Array and all.ChunkedArray", { expect_vector_equal(all(input, na.rm = TRUE), data_logical) }) + +test_that("variance", { + data <- c(1, 4, 3, 1, 1, 3, NA) + arr <- Array$create(data) + chunked_arr <- ChunkedArray$create(data) + + expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(8.833333333333334)) + expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(8.833333333333334)) + +}) + +test_that("stddev", { + data <- c(1, 4, 3, 1, 1, 3, NA) + arr <- Array$create(data) + chunked_arr <- ChunkedArray$create(data) + + expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(2.972092416687835)) + expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(2.972092416687835)) +}) From 50ffbe3d81def90a6486e55fc124fafc21e8d2ff Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 May 2021 17:54:13 +0100 Subject: [PATCH 09/10] Whitespace fix --- r/tests/testthat/test-compute-aggregate.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 56d24a0bf33..d1cb5a1835d 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -389,7 +389,6 @@ test_that("variance", { expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(8.833333333333334)) expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(8.833333333333334)) - }) test_that("stddev", { From ae46e555236831b23b98dae69a2969cb78260653 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 May 2021 19:04:40 +0100 Subject: [PATCH 10/10] Change values in case weird floating point stuff happening --- r/tests/testthat/test-compute-aggregate.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index d1cb5a1835d..398b39fb17f 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -383,19 +383,19 @@ test_that("all.Array and all.ChunkedArray", { }) test_that("variance", { - data <- c(1, 4, 3, 1, 1, 3, NA) + data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA) arr <- Array$create(data) chunked_arr <- ChunkedArray$create(data) - expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(8.833333333333334)) - expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(8.833333333333334)) + expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(34596)) + expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(34596)) }) test_that("stddev", { - data <- c(1, 4, 3, 1, 1, 3, NA) + data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA) arr <- Array$create(data) chunked_arr <- ChunkedArray$create(data) - expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(2.972092416687835)) - expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(2.972092416687835)) + expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(186)) + expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(186)) })