From 93276a0801543bcf2716966a903d3be7c03b1988 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 2 Apr 2021 00:12:28 -0400 Subject: [PATCH 01/13] Implement quantile --- r/NAMESPACE | 2 ++ r/R/compute.R | 21 +++++++++++++++++++++ r/R/enums.R | 6 ++++++ r/man/enums.Rd | 5 +++++ r/src/compute.cpp | 14 ++++++++++++++ 5 files changed, 48 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index 49d845805f40..add95d85f74a 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -73,6 +73,7 @@ S3method(print,array_expression) S3method(print,arrow_dplyr_query) S3method(print,arrow_info) S3method(print,arrow_r_metadata) +S3method(quantile,ArrowDatum) S3method(read_message,InputStream) S3method(read_message,MessageReader) S3method(read_message,default) @@ -152,6 +153,7 @@ export(ParquetFileWriter) export(ParquetVersionType) export(ParquetWriterProperties) export(Partitioning) +export(QuantileInterpolation) export(RandomAccessFile) export(ReadableFile) export(RecordBatchFileReader) diff --git a/r/R/compute.R b/r/R/compute.R index 09f2c653a8ae..0dcad2a824fe 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -80,6 +80,27 @@ collect_arrays_from_dots <- function(dots) { ChunkedArray$create(!!!arrays) } +#' @export +quantile.ArrowDatum <- function(x, + probs = seq(0, 1, 0.25), + na.rm = FALSE, + interpolation = c("linear", "lower", "higher", "nearest", "midpoint"), + ...) { + if (inherits(x, "Scalar")) x <- Array$create(x) + assert_is(probs, c("numeric", "integer")) + assert_that(length(probs) > 0) + assert_that(all(probs >= 0 & probs <= 1)) + if (!na.rm && TRUE %in% as.vector(unique(is.na(x)))) { + stop("Missing values not allowed if 'na.rm' is FALSE", call. = FALSE) + } + interpolation <- QuantileInterpolation[[toupper(match.arg(interpolation))]] + out <- call_function("quantile", x, options = list(q = probs, interpolation = interpolation)) + if (length(out) == 0) { + out <- Array$create(rep(NA_real_, length(probs))) + } + out +} + #' @export unique.ArrowDatum <- function(x, incomparables = FALSE, ...) { call_function("unique", x) diff --git a/r/R/enums.R b/r/R/enums.R index 14910bc92e03..170abf998657 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -128,3 +128,9 @@ ParquetVersionType <- enum("ParquetVersionType", MetadataVersion <- enum("MetadataVersion", V1 = 0L, V2 = 1L, V3 = 2L, V4 = 3L, V5 = 4L ) + +#' @export +#' @rdname enums +QuantileInterpolation <- enum("QuantileInterpolation", + LINEAR = 0L, LOWER = 1L, HIGHER = 2L, NEAREST = 3L, MIDPOINT = 4L +) diff --git a/r/man/enums.Rd b/r/man/enums.Rd index e4cb2d854697..fa3c64b8f955 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -13,6 +13,7 @@ \alias{FileType} \alias{ParquetVersionType} \alias{MetadataVersion} +\alias{QuantileInterpolation} \title{Arrow enums} \format{ An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. @@ -34,6 +35,8 @@ An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4 An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2. An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5. + +An object of class \code{QuantileInterpolation} (inherits from \code{arrow-enum}) of length 5. } \usage{ TimeUnit @@ -55,6 +58,8 @@ FileType ParquetVersionType MetadataVersion + +QuantileInterpolation } \description{ Arrow enums diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 5cf8c7c37d29..b4820f7abcf6 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -179,6 +179,20 @@ std::shared_ptr make_compute_options( return out; } + if (func_name == "quantile") { + using Options = arrow::compute::QuantileOptions; + auto out = std::make_shared(Options::Defaults()); + SEXP q = options["q"]; + if (!Rf_isNull(q) && TYPEOF(q) == REALSXP) { + out->q = cpp11::as_cpp>(q); + } + SEXP interpolation = options["interpolation"]; + if (!Rf_isNull(interpolation) && TYPEOF(interpolation) == INTSXP && XLENGTH(interpolation) == 1) { + out->interpolation = cpp11::as_cpp(interpolation); + } + return out; + } + if (func_name == "is_in" || func_name == "index_in") { using Options = arrow::compute::SetLookupOptions; return std::make_shared(cpp11::as_cpp(options["value_set"]), From db5250adf6a48fec1f1a24318bc0eee0573f7854 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 2 Apr 2021 00:12:46 -0400 Subject: [PATCH 02/13] Implement median --- r/NAMESPACE | 1 + r/R/compute.R | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index add95d85f74a..843b59d1f7a9 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -58,6 +58,7 @@ S3method(match_arrow,ArrowDatum) S3method(match_arrow,default) S3method(max,ArrowDatum) S3method(mean,ArrowDatum) +S3method(median,ArrowDatum) S3method(min,ArrowDatum) S3method(names,Dataset) S3method(names,FeatherReader) diff --git a/r/R/compute.R b/r/R/compute.R index 0dcad2a824fe..ae65698495f8 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -101,6 +101,15 @@ quantile.ArrowDatum <- function(x, out } +#' @export +median.ArrowDatum <- function(x, na.rm = FALSE, ...) { + if (!na.rm && TRUE %in% as.vector(unique(is.na(x)))) { + Scalar$create(NA_real_) + } else { + Scalar$create(quantile(x, probs = 0.5, na.rm = TRUE, ...)) + } +} + #' @export unique.ArrowDatum <- function(x, incomparables = FALSE, ...) { call_function("unique", x) From e5621f6a5a148fe808d42720bdacc7aef37dedc8 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 2 Apr 2021 00:33:42 -0400 Subject: [PATCH 03/13] Add tests --- r/tests/testthat/test-compute-aggregate.R | 104 ++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 5f4eaba49bd2..d22c0a98318b 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -199,6 +199,110 @@ test_that("Edge cases", { } }) +test_that("quantile.Array and quantile.ChunkedArray", { + a <- Array$create(c(0, 1, 2, 3)) + ca <- ChunkedArray$create(c(0, 1), c(2, 3)) + probs <- c(0.49, 0.51) + for(ad in list(a, ca)) { + for (type in c(int32(), uint64(), float64())) { + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "linear"), + Array$create(c(1.47, 1.53)) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "lower"), + Array$create(c(1, 1))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "higher"), + Array$create(c(2, 2))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "nearest"), + Array$create(c(1, 2))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "midpoint"), + Array$create(c(1.5, 1.5)) + ) + } + } +}) + +test_that("quantile and median NAs and edge cases", { + expect_equal( + quantile(Array$create(c(1, 2)), probs = c(0, 1)), + Array$create(c(1, 2)) + ) + expect_error( + quantile(Array$create(c(1, 2, NA))), + "Missing values not allowed if 'na.rm' is FALSE" + ) + expect_equal( + quantile(Array$create(numeric(0))), + Array$create(rep(NA_real_, 5)) + ) + expect_equal( + quantile(Array$create(rep(NA_integer_, 3)), na.rm = TRUE), + Array$create(rep(NA_real_, 5)) + ) + expect_error( + median(Array$create(c(1, 2)), probs = c(.25, .75)), + "formal argument \"probs\" matched by multiple actual arguments" + ) + expect_equal( + median(Array$create(c(1, 2)), interpolation = "higher"), + Scalar$create(2) + ) + expect_equal( + quantile(Scalar$create(0L)), + Array$create(rep(0, 5)) + ) + expect_equal( + median(Scalar$create(1L)), + Scalar$create(1) + ) +}) + +test_that("median.Array and median.ChunkedArray", { + expect_vector_equal( + median(input), + 1:4 + ) + expect_vector_equal( + median(input), + 1:5 + ) + expect_vector_equal( + median(input), + numeric(0) + ) + expect_vector_equal( + median(input, na.rm = FALSE), + c(1, 2, NA) + ) + expect_vector_equal( + median(input, na.rm = TRUE), + c(1, 2, NA) + ) + expect_vector_equal( + median(input, na.rm = TRUE), + NA_real_ + ) + expect_vector_equal( + median(input, na.rm = FALSE), + c(1, 2, NA) + ) + expect_vector_equal( + median(input, na.rm = TRUE), + c(1, 2, NA) + ) + expect_vector_equal( + median(input, na.rm = TRUE), + NA_real_ + ) +}) + test_that("unique.Array", { a <- Array$create(c(1, 4, 3, 1, 1, 3, 4)) expect_equal(unique(a), Array$create(c(1, 4, 3))) From d35902c1dc2d43c3493d6c1229540c3136215df8 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 2 Apr 2021 00:34:12 -0400 Subject: [PATCH 04/13] Import median and quantile generics from stats --- r/NAMESPACE | 2 ++ r/R/arrow-package.R | 1 + 2 files changed, 3 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index 843b59d1f7a9..fb3ea82c4af4 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -311,6 +311,8 @@ importFrom(rlang,seq2) importFrom(rlang,set_names) importFrom(rlang,syms) importFrom(rlang,warn) +importFrom(stats,median) +importFrom(stats,quantile) importFrom(tidyselect,contains) importFrom(tidyselect,ends_with) importFrom(tidyselect,eval_select) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 2515e7ac9208..10dae65fd3f7 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +#' @importFrom stats quantile median #' @importFrom R6 R6Class #' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl keep #' @importFrom assertthat assert_that is.string From d60361b39c1bde54f541841cc4c2c81f13a956e0 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 2 Apr 2021 00:49:12 -0400 Subject: [PATCH 05/13] Lint --- r/src/compute.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index b4820f7abcf6..b46072faf9b7 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -187,8 +187,11 @@ std::shared_ptr make_compute_options( out->q = cpp11::as_cpp>(q); } SEXP interpolation = options["interpolation"]; - if (!Rf_isNull(interpolation) && TYPEOF(interpolation) == INTSXP && XLENGTH(interpolation) == 1) { - out->interpolation = cpp11::as_cpp(interpolation); + if (!Rf_isNull(interpolation) && TYPEOF(interpolation) == INTSXP && + XLENGTH(interpolation) == 1) { + out->interpolation = + cpp11::as_cpp( + interpolation); } return out; } From 79fa50b21782c087d69279a7a10e53be02cc7a24 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 2 Apr 2021 14:41:44 -0400 Subject: [PATCH 06/13] Use x$null_count in median definition --- r/R/compute.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/compute.R b/r/R/compute.R index ae65698495f8..982aa8d1d6ea 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -103,7 +103,7 @@ quantile.ArrowDatum <- function(x, #' @export median.ArrowDatum <- function(x, na.rm = FALSE, ...) { - if (!na.rm && TRUE %in% as.vector(unique(is.na(x)))) { + if (!na.rm && x$null_count > 0) { Scalar$create(NA_real_) } else { Scalar$create(quantile(x, probs = 0.5, na.rm = TRUE, ...)) From f60cd0c9999e8f310b876125df135cd93f2cd31f Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 2 Apr 2021 14:42:06 -0400 Subject: [PATCH 07/13] Use x$null_count in quantile definition Co-authored-by: Neal Richardson --- r/R/compute.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/compute.R b/r/R/compute.R index 982aa8d1d6ea..2bea8b4867f3 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -90,7 +90,7 @@ quantile.ArrowDatum <- function(x, assert_is(probs, c("numeric", "integer")) assert_that(length(probs) > 0) assert_that(all(probs >= 0 & probs <= 1)) - if (!na.rm && TRUE %in% as.vector(unique(is.na(x)))) { + if (!na.rm && x$null_count > 0) { stop("Missing values not allowed if 'na.rm' is FALSE", call. = FALSE) } interpolation <- QuantileInterpolation[[toupper(match.arg(interpolation))]] From d697d7c9f947dcf856430d7b78accfbe81149790 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 2 Apr 2021 14:42:56 -0400 Subject: [PATCH 08/13] Add stats to Imports --- r/DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index f37e6a4e84f9..3de40f6f9a7e 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -30,6 +30,7 @@ Imports: purrr, R6, rlang, + stats, tidyselect, utils, vctrs From 188ceea3fbc037c3944ac936e9492f9f388f3c95 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 5 Apr 2021 17:36:11 -0400 Subject: [PATCH 09/13] Informative error when type != 7 --- r/R/compute.R | 10 ++++++++++ r/tests/testthat/test-compute-aggregate.R | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/r/R/compute.R b/r/R/compute.R index 2bea8b4867f3..20e41fb96261 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -84,6 +84,7 @@ collect_arrays_from_dots <- function(dots) { quantile.ArrowDatum <- function(x, probs = seq(0, 1, 0.25), na.rm = FALSE, + type = 7, interpolation = c("linear", "lower", "higher", "nearest", "midpoint"), ...) { if (inherits(x, "Scalar")) x <- Array$create(x) @@ -93,6 +94,15 @@ quantile.ArrowDatum <- function(x, if (!na.rm && x$null_count > 0) { stop("Missing values not allowed if 'na.rm' is FALSE", call. = FALSE) } + if (type != 7) { + stop( + "Argument `type` not supported in Arrow. To control the quantile ", + "interpolation algorithm, set argument `interpolation` to one of: ", + "\"linear\" (the default), \"lower\", \"higher\", \"nearest\", or ", + "\"midpoint\".", + call. = FALSE + ) + } interpolation <- QuantileInterpolation[[toupper(match.arg(interpolation))]] out <- call_function("quantile", x, options = list(q = probs, interpolation = interpolation)) if (length(out) == 0) { diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index d22c0a98318b..f6c3946acc5d 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -229,7 +229,7 @@ test_that("quantile.Array and quantile.ChunkedArray", { } }) -test_that("quantile and median NAs and edge cases", { +test_that("quantile and median NAs, edge cases, and exceptions", { expect_equal( quantile(Array$create(c(1, 2)), probs = c(0, 1)), Array$create(c(1, 2)) @@ -262,6 +262,10 @@ test_that("quantile and median NAs and edge cases", { median(Scalar$create(1L)), Scalar$create(1) ) + expect_error( + quantile(Array$create(1:3), type = 9), + "not supported" + ) }) test_that("median.Array and median.ChunkedArray", { From 5b3b43d4044c06305ee5e4baae38551b18890306 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 5 Apr 2021 17:41:54 -0400 Subject: [PATCH 10/13] Trigger CI From e47595528e3250099cef1e746129ad87e7f0b7d7 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 5 Apr 2021 21:12:39 -0400 Subject: [PATCH 11/13] Remove assertion of base R error msg --- r/tests/testthat/test-compute-aggregate.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index f6c3946acc5d..32acffb7bebe 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -247,8 +247,7 @@ test_that("quantile and median NAs, edge cases, and exceptions", { Array$create(rep(NA_real_, 5)) ) expect_error( - median(Array$create(c(1, 2)), probs = c(.25, .75)), - "formal argument \"probs\" matched by multiple actual arguments" + median(Array$create(c(1, 2)), probs = c(.25, .75)) ) expect_equal( median(Array$create(c(1, 2)), interpolation = "higher"), From 0b27da71fc7d2419420ce2ccc75b0081e3d9d6c9 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 5 Apr 2021 22:54:01 -0400 Subject: [PATCH 12/13] Add comment --- r/R/compute.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/R/compute.R b/r/R/compute.R index 20e41fb96261..567635b79bdb 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -106,6 +106,9 @@ quantile.ArrowDatum <- function(x, interpolation <- QuantileInterpolation[[toupper(match.arg(interpolation))]] out <- call_function("quantile", x, options = list(q = probs, interpolation = interpolation)) if (length(out) == 0) { + # When there are no non-null values in the data, the Arrow quantile function + # returns an empty Array, but for consistency with the R quantile function, + # we want vector of NA_real_ with the same length as probs out <- Array$create(rep(NA_real_, length(probs))) } out From 5dbe77e8a541568e1b1b46572a52c3d9a645ebaa Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 5 Apr 2021 22:56:01 -0400 Subject: [PATCH 13/13] Add comment --- r/R/compute.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 567635b79bdb..749da6d52773 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -106,9 +106,9 @@ quantile.ArrowDatum <- function(x, interpolation <- QuantileInterpolation[[toupper(match.arg(interpolation))]] out <- call_function("quantile", x, options = list(q = probs, interpolation = interpolation)) if (length(out) == 0) { - # When there are no non-null values in the data, the Arrow quantile function - # returns an empty Array, but for consistency with the R quantile function, - # we want vector of NA_real_ with the same length as probs + # When there are no non-missing values in the data, the Arrow quantile + # function returns an empty Array, but for consistency with the R quantile + # function, we want an Array of NA_real_ with the same length as probs out <- Array$create(rep(NA_real_, length(probs))) } out