Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

## Enhancements

* Arithmetic operations (`+`, `*`, etc.) are supported on Arrays and ChunkedArrays and can be used in filter expressions in Arrow `dplyr` pipelines
* Table columns can now be added, replaced, or removed by assigning (`<-`) with either `$` or `[[`
* Column names of Tables and RecordBatches can be renamed by assigning `names()`
* Large string types can now be written to Parquet files
Expand Down
4 changes: 0 additions & 4 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

87 changes: 81 additions & 6 deletions r/R/expression.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,53 @@ build_array_expression <- function(.Generic, e1, e2, ...) {
if (.Generic %in% names(.unary_function_map)) {
expr <- array_expression(.unary_function_map[[.Generic]], e1)
} else {
e1 <- .wrap_arrow(e1, .Generic, e2$type)
e2 <- .wrap_arrow(e2, .Generic, e1$type)
e1 <- .wrap_arrow(e1, .Generic)
e2 <- .wrap_arrow(e2, .Generic)

# In Arrow, "divide" is one function, which does integer division on
# integer inputs and floating-point division on floats
if (.Generic == "/") {
# TODO: omg so many ways it's wrong to assume these types
e1 <- cast_array_expression(e1, float64())
e2 <- cast_array_expression(e2, float64())
} else if (.Generic == "%/%") {
# In R, integer division works like floor(float division)
out <- build_array_expression("/", e1, e2)
return(cast_array_expression(out, int32(), allow_float_truncate = TRUE))
} else if (.Generic == "%%") {
# {e1 - e2 * ( e1 %/% e2 )}
# ^^^ form doesn't work because Ops.Array evaluates eagerly,
# but we can build that up
quotient <- build_array_expression("%/%", e1, e2)
# this cast is to ensure that the result of this and e1 are the same
# (autocasting only applies to scalars)
base <- cast_array_expression(quotient * e2, e1$type)
return(build_array_expression("-", e1, base))
}

expr <- array_expression(.binary_function_map[[.Generic]], e1, e2, ...)
}
expr
}

.wrap_arrow <- function(arg, fun, type) {
# Wrap `x` in a lazy "cast" array_expression targeting `to_type`.
#
# `safe = TRUE` (the default) leaves all three `allow_*` flags FALSE;
# `safe = FALSE` turns them all on. Named arguments passed via `...`
# override the corresponding entries (e.g. `allow_float_truncate = TRUE`).
cast_array_expression <- function(x, to_type, safe = TRUE, ...) {
  unsafe <- !safe
  cast_options <- modifyList(
    list(
      to_type = to_type,
      allow_int_overflow = unsafe,
      allow_time_truncate = unsafe,
      allow_float_truncate = unsafe
    ),
    list(...)
  )
  array_expression("cast", x, options = cast_options)
}

.wrap_arrow <- function(arg, fun) {
if (!inherits(arg, c("ArrowObject", "array_expression"))) {
# TODO: Array$create if lengths are equal?
# TODO: these kernels should autocast like the dataset ones do (e.g. int vs. float)
if (fun == "%in%") {
arg <- Array$create(arg, type = type)
arg <- Array$create(arg)
} else {
arg <- Scalar$create(arg, type = type)
arg <- Scalar$create(arg)
}
}
arg
Expand All @@ -91,6 +123,15 @@ build_array_expression <- function(.Generic, e1, e2, ...) {
"<=" = "less_equal",
"&" = "and_kleene",
"|" = "or_kleene",
"+" = "add_checked",
"-" = "subtract_checked",
"*" = "multiply_checked",
"/" = "divide_checked",
"%/%" = "divide_checked",
# `%%` never actually dispatches to divide_checked: the expression builders
# rewrite it in terms of `%/%` before this map is consulted.
"%%" = "divide_checked",
# TODO: "^" (ARROW-11070)
"%in%" = "is_in_meta_binary"
)

Expand All @@ -104,6 +145,16 @@ eval_array_expression <- function(x) {
a
}
})
if (length(x$args) == 2L) {
# Insert implicit casts: for a two-argument call, a Scalar argument is cast
# to the type of the other argument. The branches are mutually exclusive and
# the first argument is checked first, so at most one argument is cast.
if (inherits(x$args[[1]], "Scalar")) {
x$args[[1]] <- x$args[[1]]$cast(x$args[[2]]$type)
} else if (inherits(x$args[[2]], "Scalar")) {
x$args[[2]] <- x$args[[2]]$cast(x$args[[1]]$type)
} else if (x$fun == "is_in_meta_binary" && inherits(x$args[[2]], "Array")) {
# For "is_in_meta_binary", an Array second argument is cast to the type of
# the first argument.
x$args[[2]] <- x$args[[2]]$cast(x$args[[1]]$type)
}
}
call_function(x$fun, args = x$args, options = x$options %||% empty_named_list())
}

Expand Down Expand Up @@ -160,7 +211,16 @@ print.array_expression <- function(x, ...) {
#' @export
Expression <- R6Class("Expression", inherit = ArrowObject,
public = list(
ToString = function() dataset___expr__ToString(self)
ToString = function() dataset___expr__ToString(self),
cast = function(to_type, safe = TRUE, ...) {
opts <- list(
to_type = to_type,
allow_int_overflow = !safe,
allow_time_truncate = !safe,
allow_float_truncate = !safe
)
Expression$create("cast", self, options = modifyList(opts, list(...)))
}
)
)
Expression$create <- function(function_name,
Expand Down Expand Up @@ -196,6 +256,21 @@ build_dataset_expression <- function(.Generic, e1, e2, ...) {
if (!inherits(e2, "Expression")) {
e2 <- Expression$scalar(e2)
}

# In Arrow, "divide" is one function, which does integer division on
# integer inputs and floating-point division on floats
if (.Generic == "/") {
# TODO: omg so many ways it's wrong to assume these types
e1 <- e1$cast(float64())
e2 <- e2$cast(float64())
} else if (.Generic == "%/%") {
# In R, integer division works like floor(float division)
out <- build_dataset_expression("/", e1, e2)
return(out$cast(int32(), allow_float_truncate = TRUE))
} else if (.Generic == "%%") {
return(e1 - e2 * ( e1 %/% e2 ))
}

expr <- Expression$create(.binary_function_map[[.Generic]], e1, e2, ...)
}
expr
Expand Down
10 changes: 8 additions & 2 deletions r/R/scalar.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,14 @@ Scalar <- R6Class("Scalar",
# TODO: document the methods
public = list(
ToString = function() Scalar__ToString(self),
cast = function(target_type) {
Scalar__CastTo(self, as_type(target_type))
cast = function(target_type, safe = TRUE, ...) {
opts <- list(
to_type = as_type(target_type),
allow_int_overflow = !safe,
allow_time_truncate = !safe,
allow_float_truncate = !safe
)
call_function("cast", self, options = modifyList(opts, list(...)))
},
as_vector = function() Scalar__as_vector(self)
),
Expand Down
17 changes: 0 additions & 17 deletions r/src/arrowExports.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions r/src/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,33 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
cpp11::as_cpp<bool>(options["skip_nulls"]));
}

// Build CastOptions for the "cast" function from the R `options` list
// (hacky attempt to pass through to_type and other options).
if (func_name == "cast") {
using Options = arrow::compute::CastOptions;
// NOTE(review): the `true` argument presumably selects the "safe" defaults
// (all allow_* flags off) -- confirm against the CastOptions constructor.
auto out = std::make_shared<Options>(true);
SEXP to_type = options["to_type"];
// Only override to_type when the option is present and converts to a
// non-null DataType pointer.
if (!Rf_isNull(to_type) && cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(to_type)) {
out->to_type = cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(to_type);
}

// Each allow_* flag below is assigned only when the option is present and
// true; an absent or false option keeps the value set by the constructor.
SEXP allow_float_truncate = options["allow_float_truncate"];
if (!Rf_isNull(allow_float_truncate) && cpp11::as_cpp<bool>(allow_float_truncate)) {
out->allow_float_truncate = cpp11::as_cpp<bool>(allow_float_truncate);
}

SEXP allow_time_truncate = options["allow_time_truncate"];
if (!Rf_isNull(allow_time_truncate) && cpp11::as_cpp<bool>(allow_time_truncate)) {
out->allow_time_truncate = cpp11::as_cpp<bool>(allow_time_truncate);
}

SEXP allow_int_overflow = options["allow_int_overflow"];
if (!Rf_isNull(allow_int_overflow) && cpp11::as_cpp<bool>(allow_int_overflow)) {
out->allow_int_overflow = cpp11::as_cpp<bool>(allow_int_overflow);
}

return out;
}

return nullptr;
}

Expand Down
6 changes: 0 additions & 6 deletions r/src/scalar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,6 @@ std::string Scalar__ToString(const std::shared_ptr<arrow::Scalar>& s) {
return s->ToString();
}

// [[arrow::export]]
std::shared_ptr<arrow::Scalar> Scalar__CastTo(const std::shared_ptr<arrow::Scalar>& s,
const std::shared_ptr<arrow::DataType>& t) {
return ValueOrStop(s->CastTo(t));
}

// [[arrow::export]]
std::shared_ptr<arrow::Scalar> StructScalar__field(
const std::shared_ptr<arrow::StructScalar>& s, int i) {
Expand Down
78 changes: 78 additions & 0 deletions r/tests/testthat/test-compute-arith.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

test_that("Addition", {
  # Integer Array with a trailing NA, shared by every expectation below
  ints <- Array$create(c(1:4, NA_integer_))
  expect_type_equal(ints, int32())

  # Adding an R double or integer scalar keeps the int32 type and the NA
  expect_type_equal(ints + 4, int32())
  expect_equal(ints + 4, Array$create(c(5:8, NA_integer_)))
  expect_identical(as.vector(ints + 4), c(5:8, NA_integer_))
  expect_equal(ints + 4L, Array$create(c(5:8, NA_integer_)))
  expect_vector(ints + 4L, c(5:8, NA_integer_))
  expect_equal(ints + NA_integer_, Array$create(rep(NA_integer_, 5)))

  # Overflow is an error. R itself would coerce the overflowing result to NA,
  # but an error is preferable to silently returning a wrong value.
  small <- ints$cast(int8())
  expect_error(small + 127)
  expect_error(small + 200)

  skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-8919")
  expect_type_equal(ints + 4.1, float64())
  expect_equal(ints + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_)))
})

test_that("Subtraction", {
  # Subtracting an R scalar from an integer Array; the NA element stays NA
  ints <- Array$create(c(1:4, NA_integer_))
  expected <- Array$create(c(-2:1, NA_integer_))
  expect_equal(ints - 3, expected)
})

test_that("Multiplication", {
  # Multiplying an integer Array by an R scalar; the NA element stays NA
  ints <- Array$create(c(1:4, NA_integer_))
  expected <- Array$create(c(1:4 * 2L, NA_integer_))
  expect_equal(ints * 2, expected)
})

test_that("Division", {
  # Integer input: `/` yields doubles, `%/%` yields integers, both chainable
  ints <- Array$create(c(1:4, NA_integer_))
  expect_equal(ints / 2, Array$create(c(1:4 / 2, NA_real_)))
  expect_equal(ints %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
  expect_equal(ints / 2 / 2, Array$create(c(1:4 / 2 / 2, NA_real_)))
  expect_equal(ints %/% 2 %/% 2, Array$create(c(0L, 0L, 0L, 1L, NA_integer_)))

  # Same checks starting from a float64 Array
  dbls <- ints$cast(float64())
  expect_equal(dbls / 2, Array$create(c(1:4 / 2, NA_real_)))
  expect_equal(dbls %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))

  # `%/%` must match R: it is the integer part of the true quotient, not the
  # quotient of the operands after truncating them to integers.
  expect_equal(dbls / 2.2, Array$create(c(1:4 / 2.2, NA_real_)))
  # c(1:4) %/% 2.2 != c(1:4) %/% as.integer(2.2)
  # c(1:4) %/% 2.2 == c(0L, 0L, 1L, 1L)
  # c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L)
  expect_equal(dbls %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_)))

  # Modulo keeps the type of the left-hand Array
  expect_equal(ints %% 2, Array$create(c(1L, 0L, 1L, 0L, NA_integer_)))

  expect_equal(dbls %% 2, Array$create(c(1:4 %% 2, NA_real_)))
})

test_that("Dates casting", {
  # Evaluate Sys.Date() once so the input and the expected value share the
  # same base date even if the test happens to run across midnight.
  dates <- Sys.Date() + 1:4
  a <- Array$create(c(dates, NA_integer_))

  skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-8919")
  # The NA must be an element of the expected vector (inside `c()`); it was
  # previously passed as a second argument to `Array$create()` by mistake.
  expect_equal(a + 2, Array$create(c(dates + 2, NA_integer_)))
})
Loading