diff --git a/r/DESCRIPTION b/r/DESCRIPTION index ffc473f365d..4678d0a236b 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -60,6 +60,7 @@ Collate: 'chunked-array.R' 'io.R' 'compression.R' + 'scalar.R' 'compute.R' 'config.R' 'csv.R' @@ -85,7 +86,6 @@ Collate: 'record-batch-writer.R' 'reexports-bit64.R' 'reexports-tidyselect.R' - 'scalar.R' 'schema.R' 'struct.R' 'util.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 6a8c0f9b3ea..01604ab0a9b 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -15,8 +15,17 @@ S3method(Ops,ChunkedArray) S3method(Ops,Expression) S3method(Ops,array_expression) S3method(all,equal.ArrowObject) +S3method(as.character,Array) +S3method(as.character,ChunkedArray) +S3method(as.character,Scalar) S3method(as.data.frame,RecordBatch) S3method(as.data.frame,Table) +S3method(as.double,Array) +S3method(as.double,ChunkedArray) +S3method(as.double,Scalar) +S3method(as.integer,Array) +S3method(as.integer,ChunkedArray) +S3method(as.integer,Scalar) S3method(as.list,RecordBatch) S3method(as.list,Table) S3method(as.raw,Buffer) @@ -44,6 +53,9 @@ S3method(length,Array) S3method(length,ChunkedArray) S3method(length,Scalar) S3method(length,Schema) +S3method(mean,Array) +S3method(mean,ChunkedArray) +S3method(mean,Scalar) S3method(names,Dataset) S3method(names,RecordBatch) S3method(names,ScannerBuilder) @@ -58,6 +70,9 @@ S3method(read_message,MessageReader) S3method(read_message,default) S3method(row.names,RecordBatch) S3method(row.names,Table) +S3method(sum,Array) +S3method(sum,ChunkedArray) +S3method(sum,Scalar) S3method(tail,Array) S3method(tail,ChunkedArray) S3method(tail,RecordBatch) diff --git a/r/R/array.R b/r/R/array.R index f17ea56cb01..194cdeb42f8 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -341,3 +341,12 @@ is.Array <- function(x, type = NULL) { } is_it } + +#' @export +as.double.Array <- function(x, ...) as.double(as.vector(x), ...) + +#' @export +as.integer.Array <- function(x, ...) as.integer(as.vector(x), ...) + +#' @export +as.character.Array <- function(x, ...) as.character(as.vector(x), ...) diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index f3a3eaec9fb..5a8df533c8a 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -75,8 +75,6 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowObject, if (is.integer(i)) { i <- Array$create(i) } - # Invalid: Tried executing function with non-value type: ChunkedArray - # so use old methods shared_ptr(ChunkedArray, call_function("take", self, i)) }, Filter = function(i, keep_na = TRUE) { @@ -151,3 +149,12 @@ head.ChunkedArray <- head.Array #' @export tail.ChunkedArray <- tail.Array + +#' @export +as.double.ChunkedArray <- as.double.Array + +#' @export +as.integer.ChunkedArray <- as.integer.Array + +#' @export +as.character.ChunkedArray <- as.character.Array diff --git a/r/R/compute.R b/r/R/compute.R index 000c5c86f07..a860b0ef91f 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -16,12 +16,73 @@ # under the License. #' @include array.R +#' @include chunked-array.R +#' @include scalar.R call_function <- function(function_name, ..., options = list()) { assert_that(is.string(function_name)) compute__CallFunction(function_name, list(...), options) } +#' @export +sum.Array <- function(..., na.rm = FALSE) scalar_aggregate("sum", ..., na.rm = na.rm) + +#' @export +sum.ChunkedArray <- sum.Array + +#' @export +sum.Scalar <- sum.Array + +#' @export +mean.Array <- function(..., na.rm = FALSE) scalar_aggregate("mean", ..., na.rm = na.rm) + +#' @export +mean.ChunkedArray <- mean.Array + +#' @export +mean.Scalar <- mean.Array + +min.Array <- function(..., na.rm = FALSE) { + extrema <- scalar_aggregate("minmax", ..., na.rm = na.rm) + # TODO: StructScalar needs field accessor methods in C++: ARROW-9070 + Scalar$create(as.vector(extrema)$min) +} + +scalar_aggregate <- function(FUN, ..., na.rm = FALSE) { + a <- collect_arrays_from_dots(list(...)) + if (!na.rm && a$null_count > 0) { + # Arrow sum/mean function always drops NAs so handle that here + # https://issues.apache.org/jira/browse/ARROW-9054 + Scalar$create(NA_integer_, type = a$type) + } else { + if (inherits(a$type, "Boolean")) { + # Bool sum/mean not implemented so cast to int + # https://issues.apache.org/jira/browse/ARROW-9055 + a <- a$cast(int8()) + } + shared_ptr(Scalar, call_function(FUN, a)) + } +} + +collect_arrays_from_dots <- function(dots) { + # Given a list that may contain both Arrays and ChunkedArrays, + # return a single ChunkedArray containing all of those chunks + # (may return a regular Array if there is only one element in dots) + assert_that(all(map_lgl(dots, is.Array))) + if (length(dots) == 1) { + return(dots[[1]]) + } + + arrays <- unlist(lapply(dots, function(x) { + if (inherits(x, "ChunkedArray")) { + x$chunks + } else { + x + } + })) + ChunkedArray$create(!!!arrays) +} + CastOptions <- R6Class("CastOptions", inherit = ArrowObject) #' Cast options @@ -32,12 +93,10 @@ CastOptions <- R6Class("CastOptions", inherit = ArrowObject) #' @param allow_float_truncate allow float truncate, `!safe` by default #' #' @export -cast_options <- function( - safe = TRUE, - allow_int_overflow = !safe, - allow_time_truncate = !safe, - allow_float_truncate = !safe -){ +cast_options <- function(safe = TRUE, + allow_int_overflow = !safe, + allow_time_truncate = !safe, + allow_float_truncate = !safe) { shared_ptr(CastOptions, compute___CastOptions__initialize(allow_int_overflow, allow_time_truncate, allow_float_truncate) ) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 78d28cf3cb4..2ecde6fb285 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -119,8 +119,6 @@ RecordBatch <- R6Class("RecordBatch", inherit = ArrowObject, i <- Array$create(i) } assert_is(i, "Array") - # Invalid: Tried executing function with non-value type: RecordBatch - # so use old methods shared_ptr(RecordBatch, call_function("take", self, i)) }, Filter = function(i, keep_na = TRUE) { diff --git a/r/R/scalar.R b/r/R/scalar.R index df06f7b3a38..f824c2ed8f6 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -38,6 +38,7 @@ Scalar <- R6Class("Scalar", inherit = ArrowObject, ), active = list( is_valid = function() Scalar__is_valid(self), + null_count = function() sum(!self$is_valid), type = function() DataType$create(Scalar__type(self)) ) ) @@ -62,3 +63,12 @@ is.na.Scalar <- function(x) !x$is_valid #' @export as.vector.Scalar <- function(x, mode) x$as_vector() + +#' @export +as.double.Scalar <- as.double.Array + +#' @export +as.integer.Scalar <- as.integer.Array + +#' @export +as.character.Scalar <- as.character.Array diff --git a/r/R/table.R b/r/R/table.R index 785a10979c6..73a25e76315 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -146,8 +146,6 @@ Table <- R6Class("Table", inherit = ArrowObject, if (is.integer(i)) { i <- Array$create(i) } - # Invalid: Tried executing function with non-value type: Table - # so use old methods shared_ptr(Table, call_function("take", self, i)) }, Filter = function(i, keep_na = TRUE) { diff --git a/r/src/compute.cpp b/r/src/compute.cpp index b83576d3d26..11ac2aeda69 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -160,6 +160,12 @@ std::shared_ptr make_compute_options( return out; } + if (func_name == "minmax") { + auto out = std::make_shared( + arrow::compute::MinMaxOptions::Defaults()); + return out; + } + return nullptr; } diff --git a/r/tests/testthat/test-compute.R b/r/tests/testthat/test-compute.R index 1d0d23a788d..4c2d0f26bd4 100644 --- a/r/tests/testthat/test-compute.R +++ b/r/tests/testthat/test-compute.R @@ -17,6 +17,102 @@ context("compute") +test_that("sum.Array", { + ints <- 1:5 + a <- Array$create(ints) + expect_is(sum(a), "Scalar") + expect_identical(as.integer(sum(a)), sum(ints)) + + floats <- c(1.3, 2.4, 3) + f <- Array$create(floats) + expect_identical(as.numeric(sum(f)), sum(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + expect_identical(as.numeric(sum(na)), sum(floats)) + expect_is(sum(na, na.rm = TRUE), "Scalar") + expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- Array$create(bools) + expect_identical(as.integer(sum(b)), sum(bools)) +}) + +test_that("sum.ChunkedArray", { + a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_is(sum(a), "Scalar") + expect_true(is.na(as.vector(sum(a)))) + expect_identical(as.numeric(sum(a, na.rm = TRUE)), 35) +}) + +test_that("sum dots", { + a1 <- Array$create(1:4) + a2 <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_identical(as.numeric(sum(a1, a2, na.rm = TRUE)), 45) +}) + +test_that("sum.Scalar", { + skip("No sum method in arrow for Scalar: ARROW-9056") + s <- Scalar$create(4) + expect_identical(as.numeric(s), as.numeric(sum(s))) +}) + +test_that("mean.Array", { + ints <- 1:4 + a <- Array$create(ints) + expect_is(mean(a), "Scalar") + expect_identical(as.vector(mean(a)), mean(ints)) + + floats <- c(1.3, 2.4, 3) + f <- Array$create(floats) + expect_identical(as.vector(mean(f)), mean(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + expect_identical(as.vector(mean(na)), mean(floats)) + expect_is(mean(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- Array$create(bools) + expect_identical(as.vector(mean(b)), mean(bools)) +}) + +test_that("mean.ChunkedArray", { + a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_is(mean(a), "Scalar") + expect_true(is.na(as.vector(mean(a)))) + expect_identical(as.vector(mean(a, na.rm = TRUE)), 35/13) +}) + +test_that("mean.Scalar", { + skip("No mean method in arrow for Scalar: ARROW-9056") + s <- Scalar$create(4) + expect_identical(as.vector(s), mean(s)) +}) + test_that("Bad input handling of call_function", { expect_error(call_function("sum", 2, 3), "to_datum: Not implemented for type double") }) + +test_that("min/max.Array", { + skip("arrow::MakeArrayFromScalar can't handle struct: ARROW-6604") + ints <- 1:4 + a <- Array$create(ints) + expect_is(min(a), "Scalar") + expect_identical(as.vector(min(a)), min(ints)) + + floats <- c(1.3, 3, 2.4) + f <- Array$create(floats) + expect_identical(as.vector(min(f)), min(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + expect_identical(as.vector(min(na)), min(floats)) + expect_is(min(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- Array$create(bools) + expect_identical(as.vector(min(b)), min(bools)) +})