From 5ecbe8eb1f722ad0eea0f44b272536871de5b0f1 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 16 Dec 2020 16:09:33 -0800 Subject: [PATCH 1/9] Start adding support for arithmetic functions in R --- r/R/expression.R | 25 ++++++++++++ r/tests/testthat/test-compute-arith.R | 57 +++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 r/tests/testthat/test-compute-arith.R diff --git a/r/R/expression.R b/r/R/expression.R index f9e09c2fadd..046b4f11ee6 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -59,6 +59,20 @@ build_array_expression <- function(.Generic, e1, e2, ...) { } else { e1 <- .wrap_arrow(e1, .Generic, e2$type) e2 <- .wrap_arrow(e2, .Generic, e1$type) + + # In Arrow, "divide" is one function, which does integer division on + # integer inputs and floating-point division on floats + if (.Generic == "/") { + # TODO: cast needs to be an expression + # TODO: omg so many ways it's wrong to assume these types + e1 <- e1$cast(float64()) + e2 <- e2$cast(float64()) + } else if (.Generic == "%/%") { + e1 <- e1$cast(int32()) + e2 <- e2$cast(int32()) + } else if (.Generic == "%%") { + # e1 - e1 %/% e2 + } expr <- array_expression(.binary_function_map[[.Generic]], e1, e2, ...) } expr @@ -91,9 +105,20 @@ build_array_expression <- function(.Generic, e1, e2, ...) { "<=" = "less_equal", "&" = "and_kleene", "|" = "or_kleene", + "+" = "add", + "-" = "subtract", + "*" = "multiply", + "/" = "divide", + "%/%" = "divide", "%in%" = "is_in_meta_binary" ) + +# ‘"^"’, +# ‘"%%"’, +# ‘"%/%"’ + + .array_function_map <- c(.unary_function_map, .binary_function_map) eval_array_expression <- function(x) { diff --git a/r/tests/testthat/test-compute-arith.R b/r/tests/testthat/test-compute-arith.R new file mode 100644 index 00000000000..633cc57af1b --- /dev/null +++ b/r/tests/testthat/test-compute-arith.R @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# TODO: +# * Use _checked variations? See what R does +# * More tests for edge cases, esp. with division; add test helpers here? +# * Is there a better "autocasting" solution? See what rules C++ Datasets do +# * test-dplyr tests +# * then, dataset tests, special casing for division + +test_that("Addition", { + a <- Array$create(c(1:4, NA_integer_)) + expect_type_equal(a, int32()) + expect_type_equal(a + 4, int32()) + expect_equal(a + 4, Array$create(c(5:8, NA_integer_))) + expect_identical(as.vector(a + 4), c(5:8, NA_integer_)) + expect_equal(a + 4L, Array$create(c(5:8, NA_integer_))) + expect_vector(a + 4L, c(5:8, NA_integer_)) + expect_equal(a + NA_integer_, Array$create(rep(NA_integer_, 5))) + skip("autocasting should happen in compute kernels; R workaround fails on this") + expect_type_equal(a + 4.1, float64()) + expect_equal(a + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_))) +}) + +test_that("Subtraction", { + a <- Array$create(c(1:4, NA_integer_)) + expect_equal(a - 3, Array$create(c(-2:1, NA_integer_))) +}) + +test_that("Multiplication", { + a <- Array$create(c(1:4, NA_integer_)) + expect_equal(a * 2, Array$create(c(1:4 * 2L, NA_integer_))) +}) + +test_that("Division", { + a <- Array$create(c(1:4, NA_integer_)) + expect_equal(a / 2, Array$create(c(1:4 / 2, NA_real_))) + expect_equal(a %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_))) + + b <- a$cast(float64()) + expect_equal(b / 2, Array$create(c(1:4 / 2, NA_real_))) + expect_equal(b %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_))) +}) From 735a484077c4435b75a397d4c888fd0c99eaad27 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 23 Dec 2020 11:23:10 -0600 Subject: [PATCH 2/9] Adjust some of the casting, datasets not (yet) working --- r/R/expression.R | 36 +++++++++++++++------- r/src/compute.cpp | 12 ++++++++ r/tests/testthat/test-compute-arith.R | 20 +++++++++++-- r/tests/testthat/test-dataset.R | 43 +++++++++++++++++++++++++++ r/tests/testthat/test-dplyr.R | 42 ++++++++++++++++++++++++++ 5 files changed, 140 insertions(+), 13 deletions(-) diff --git a/r/R/expression.R b/r/R/expression.R index 046b4f11ee6..817941a1caa 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -63,15 +63,15 @@ build_array_expression <- function(.Generic, e1, e2, ...) { # In Arrow, "divide" is one function, which does integer division on # integer inputs and floating-point division on floats if (.Generic == "/") { - # TODO: cast needs to be an expression # TODO: omg so many ways it's wrong to assume these types - e1 <- e1$cast(float64()) - e2 <- e2$cast(float64()) + e1 <- array_expression("cast", e1, options = list(to_type = float64())) + e2 <- array_expression("cast", e2, options = list(to_type = float64())) } else if (.Generic == "%/%") { - e1 <- e1$cast(int32()) - e2 <- e2$cast(int32()) + e1 <- array_expression("cast", e1, options = list(to_type = float64())) + e2 <- array_expression("cast", e2, options = list(to_type = float64())) + return(array_expression("cast", array_expression(.binary_function_map[[.Generic]], e1, e2, ...), options = list(to_type = int32()))) } else if (.Generic == "%%") { - # e1 - e1 %/% e2 + # e1 - e2 * ( e1 %/% e2 ) } expr <- array_expression(.binary_function_map[[.Generic]], e1, e2, ...) } @@ -105,11 +105,11 @@ build_array_expression <- function(.Generic, e1, e2, ...) { "<=" = "less_equal", "&" = "and_kleene", "|" = "or_kleene", - "+" = "add", - "-" = "subtract", - "*" = "multiply", - "/" = "divide", - "%/%" = "divide", + "+" = "add_checked", + "-" = "subtract_checked", + "*" = "multiply_checked", + "/" = "divide_checked", + "%/%" = "divide_checked", "%in%" = "is_in_meta_binary" ) @@ -221,6 +221,20 @@ build_dataset_expression <- function(.Generic, e1, e2, ...) { if (!inherits(e2, "Expression")) { e2 <- Expression$scalar(e2) } + + # In Arrow, "divide" is one function, which does integer division on + # integer inputs and floating-point division on floats + if (.Generic == "/") { + # TODO: omg so many ways it's wrong to assume these types + e1 <- array_expression("cast", e1, options = list(to_type = float64())) + e2 <- array_expression("cast", e2, options = list(to_type = float64())) + } else if (.Generic == "%/%") { + e1 <- array_expression("cast", e1, options = list(to_type = int32())) + e2 <- array_expression("cast", e2, options = list(to_type = int32())) + } else if (.Generic == "%%") { + # e1 - e2 * ( e1 %/% e2 ) + } + expr <- Expression$create(.binary_function_map[[.Generic]], e1, e2, ...) } expr diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 8a75a251d8e..c44d1674153 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -185,6 +185,18 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["skip_nulls"])); } + // hacky attempt to pass through to_type + if (func_name == "cast") { + using Options = arrow::compute::CastOptions; + auto out = std::make_shared(false); + SEXP to_type = options["to_type"]; + if (!Rf_isNull(to_type) && cpp11::as_cpp>(to_type)) { + out->to_type = cpp11::as_cpp>(to_type); + } + return out; + + } + return nullptr; } diff --git a/r/tests/testthat/test-compute-arith.R b/r/tests/testthat/test-compute-arith.R index 633cc57af1b..27d78dd090d 100644 --- a/r/tests/testthat/test-compute-arith.R +++ b/r/tests/testthat/test-compute-arith.R @@ -16,10 +16,10 @@ # under the License. # TODO: -# * Use _checked variations? See what R does # * More tests for edge cases, esp. with division; add test helpers here? # * Is there a better "autocasting" solution? See what rules C++ Datasets do -# * test-dplyr tests +# * test-dplyr tests (Added one addition, and one summarize, but check to see if +# we can make summarize route through arrow need more?) # * then, dataset tests, special casing for division test_that("Addition", { @@ -31,6 +31,12 @@ test_that("Addition", { expect_equal(a + 4L, Array$create(c(5:8, NA_integer_))) expect_vector(a + 4L, c(5:8, NA_integer_)) expect_equal(a + NA_integer_, Array$create(rep(NA_integer_, 5))) + + # overflow errors — this is slightly different from R's `NA` coercion when + # overflowing, but better than the alternative of silently restarting + casted <- a$cast(int8()) + expect_error(casted + 257) + skip("autocasting should happen in compute kernels; R workaround fails on this") expect_type_equal(a + 4.1, float64()) expect_equal(a + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_))) @@ -50,8 +56,18 @@ test_that("Division", { a <- Array$create(c(1:4, NA_integer_)) expect_equal(a / 2, Array$create(c(1:4 / 2, NA_real_))) expect_equal(a %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_))) + expect_equal(a / 2 / 2, Array$create(c(1:4 / 2 / 2, NA_real_))) + expect_equal(a %/% 2 %/% 2, Array$create(c(0L, 0L, 0L, 1L, NA_integer_))) b <- a$cast(float64()) expect_equal(b / 2, Array$create(c(1:4 / 2, NA_real_))) expect_equal(b %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_))) + + # the behavior of %/% matches R's (i.e. the integer of the quotient, not + # simply dividing two integers) + expect_equal(b / 2.2, Array$create(c(1:4 / 2.2, NA_real_))) + # c(1:4) %/% 2.2 != c(1:4) %/% as.integer(2.2) + # c(1:4) %/% 2.2 == c(0L, 0L, 1L, 1L) + # c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L) + expect_equal(b %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_))) }) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 4c8db531411..e617a29cbd7 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -523,6 +523,49 @@ test_that("filter() on date32 columns", { ) }) +test_that("filter() with expressions", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + expect_is(ds$format, "ParquetFileFormat") + expect_is(ds$filesystem, "LocalFileSystem") + expect_is(ds, "Dataset") + expect_equivalent( + ds %>% + select(chr, dbl) %>% + filter(dbl * 2 > 14 & dbl - 50 < 3L) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) + + # check division's special casing. + expect_equivalent( + ds %>% + select(chr, dbl) %>% + filter(dbl / 2 > 3.5 & dbl < 53) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) + + expect_equivalent( + ds %>% + select(chr, dbl) %>% + filter(dbl / 2L > 3.5 & dbl < 53) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) +}) + test_that("filter scalar validation doesn't crash (ARROW-7772)", { expect_error( ds %>% diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index f9a01d8ceb6..d5c8cb9cd8d 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -133,6 +133,40 @@ test_that("filtering with expression", { ) }) +test_that("filtering with arithmetic", { + expect_dplyr_equal( + input %>% + filter(dbl + 1 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(dbl / 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(dbl / 2L > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(dbl %/% 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) +}) + test_that("More complex select/filter", { expect_dplyr_equal( input %>% @@ -276,6 +310,14 @@ test_that("summarize", { summarize(min_int = min(int)), tbl ) + + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(min_int = min(int) / 2), + tbl + ) }) test_that("mutate", { From 4b561bff85684df406ea1406ff1c9fdd47335f63 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 23 Dec 2020 15:43:03 -0600 Subject: [PATCH 3/9] expresion-based casting for division --- r/R/expression.R | 57 +++++++++++++++++++++------ r/src/compute.cpp | 23 +++++++++-- r/tests/testthat/test-compute-arith.R | 4 ++ r/tests/testthat/test-dataset.R | 12 ------ r/tests/testthat/test-dplyr.R | 4 +- r/tests/testthat/test-expression.R | 19 +++++++++ 6 files changed, 91 insertions(+), 28 deletions(-) diff --git a/r/R/expression.R b/r/R/expression.R index 817941a1caa..d5ed4c845e9 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -69,9 +69,26 @@ build_array_expression <- function(.Generic, e1, e2, ...) { } else if (.Generic == "%/%") { e1 <- array_expression("cast", e1, options = list(to_type = float64())) e2 <- array_expression("cast", e2, options = list(to_type = float64())) - return(array_expression("cast", array_expression(.binary_function_map[[.Generic]], e1, e2, ...), options = list(to_type = int32()))) + return(array_expression("cast", array_expression(.binary_function_map[[.Generic]], e1, e2, ...), options = list(to_type = int32(), allow_float_truncate = TRUE))) } else if (.Generic == "%%") { - # e1 - e2 * ( e1 %/% e2 ) + # {e1 - e2 * ( e1 %/% e2 )} + # TODO: there has to be a way to use the form ^^^ instead of this. + out <- array_expression( + "subtract_checked", e1, array_expression( + "multiply_checked", e2, array_expression( + # this outer cast is to ensure that the result of this and the + # result of multiply are the same + "cast", + array_expression( + "cast", + array_expression(.binary_function_map[[.Generic]], e1, e2, ...), + options = list(to_type = int32(), allow_float_truncate = TRUE) + ), + options = list(to_type = e2$type, allow_float_truncate = TRUE) + ) + ) + ) + return(out) } expr <- array_expression(.binary_function_map[[.Generic]], e1, e2, ...) } @@ -110,13 +127,13 @@ build_array_expression <- function(.Generic, e1, e2, ...) { "*" = "multiply_checked", "/" = "divide_checked", "%/%" = "divide_checked", - "%in%" = "is_in_meta_binary" + "%in%" = "is_in_meta_binary", + "%%" = "divide_checked" ) -# ‘"^"’, -# ‘"%%"’, -# ‘"%/%"’ +# ‘"^"’ + .array_function_map <- c(.unary_function_map, .binary_function_map) @@ -226,13 +243,31 @@ build_dataset_expression <- function(.Generic, e1, e2, ...) { # integer inputs and floating-point division on floats if (.Generic == "/") { # TODO: omg so many ways it's wrong to assume these types - e1 <- array_expression("cast", e1, options = list(to_type = float64())) - e2 <- array_expression("cast", e2, options = list(to_type = float64())) + e1 <- Expression$create("cast", e1, options = list(to_type = float64())) + e2 <- Expression$create("cast", e2, options = list(to_type = float64())) } else if (.Generic == "%/%") { - e1 <- array_expression("cast", e1, options = list(to_type = int32())) - e2 <- array_expression("cast", e2, options = list(to_type = int32())) + e1 <- Expression$create("cast", e1, options = list(to_type = float64())) + e2 <- Expression$create("cast", e2, options = list(to_type = float64())) + return(Expression$create("cast", Expression$create(.binary_function_map[[.Generic]], e1, e2, ...), options = list(to_type = int32(), allow_float_truncate = TRUE))) } else if (.Generic == "%%") { - # e1 - e2 * ( e1 %/% e2 ) + # {e1 - e2 * ( e1 %/% e2 )} + # TODO: there has to be a way to use the form ^^^ instead of this. + out <- Expression$create( + "subtract_checked", e1, Expression$create( + "multiply_checked", e2, Expression$create( + # this outer cast is to ensure that the result of this and the + # result of multiply are the same + "cast", + Expression$create( + "cast", + Expression$create(.binary_function_map[[.Generic]], e1, e2, ...), + options = list(to_type = int32(), allow_float_truncate = TRUE) + ), + options = list(to_type = e2$type, allow_float_truncate = TRUE) + ) + ) + ) + return(out) } expr <- Expression$create(.binary_function_map[[.Generic]], e1, e2, ...) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index c44d1674153..4497f5b59a3 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -185,16 +185,31 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["skip_nulls"])); } - // hacky attempt to pass through to_type + // hacky attempt to pass through to_type and other options if (func_name == "cast") { using Options = arrow::compute::CastOptions; - auto out = std::make_shared(false); + auto out = std::make_shared(true); SEXP to_type = options["to_type"]; if (!Rf_isNull(to_type) && cpp11::as_cpp>(to_type)) { out->to_type = cpp11::as_cpp>(to_type); - } - return out; + } + SEXP allow_float_truncate = options["allow_float_truncate"]; + if (!Rf_isNull(allow_float_truncate) && cpp11::as_cpp(allow_float_truncate)) { + out->allow_float_truncate = cpp11::as_cpp(allow_float_truncate); + } + + SEXP allow_time_truncate = options["allow_time_truncate"]; + if (!Rf_isNull(allow_time_truncate) && cpp11::as_cpp(allow_time_truncate)) { + out->allow_time_truncate = cpp11::as_cpp(allow_time_truncate); + } + + SEXP allow_int_overflow = options["allow_int_overflow"]; + if (!Rf_isNull(allow_int_overflow) && cpp11::as_cpp(allow_int_overflow)) { + out->allow_int_overflow = cpp11::as_cpp(allow_int_overflow); + } + + return out; } return nullptr; diff --git a/r/tests/testthat/test-compute-arith.R b/r/tests/testthat/test-compute-arith.R index 27d78dd090d..829eaae5796 100644 --- a/r/tests/testthat/test-compute-arith.R +++ b/r/tests/testthat/test-compute-arith.R @@ -70,4 +70,8 @@ test_that("Division", { # c(1:4) %/% 2.2 == c(0L, 0L, 1L, 1L) # c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L) expect_equal(b %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_))) + + expect_equal(a %% 2, Array$create(c(1L, 0L, 1L, 0L, NA_integer_))) + + expect_equal(b %% 2, Array$create(c(1:4 %% 2, NA_real_))) }) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index e617a29cbd7..1125e83cd8d 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -552,18 +552,6 @@ test_that("filter() with expressions", { df2[1:2, c("chr", "dbl")] ) ) - - expect_equivalent( - ds %>% - select(chr, dbl) %>% - filter(dbl / 2L > 3.5 & dbl < 53) %>% - collect() %>% - arrange(dbl), - rbind( - df1[8:10, c("chr", "dbl")], - df2[1:2, c("chr", "dbl")] - ) - ) }) test_that("filter scalar validation doesn't crash (ARROW-7772)", { diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index d5c8cb9cd8d..79317a9b201 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -163,7 +163,9 @@ test_that("filtering with arithmetic", { filter(dbl %/% 2 > 3) %>% select(string = chr, int, dbl) %>% collect(), - tbl + tbl, + # TODO: why are record batched versions problematic? + skip_record_batch = "record batches aren't (auto?) casting correctly" ) }) diff --git a/r/tests/testthat/test-expression.R b/r/tests/testthat/test-expression.R index 0c5ef4c12da..1251cf0e7c5 100644 --- a/r/tests/testthat/test-expression.R +++ b/r/tests/testthat/test-expression.R @@ -66,3 +66,22 @@ test_that("C++ expressions", { # Interprets that as a list type expect_is(f == c(1L, 2L), "Expression") }) + +test_that("Can create an expression", { + a <- Array$create(as.numeric(1:5)) + expr <- array_expression("cast", a, options = list(to_type = int32())) + expect_is(expr, "array_expression") + expect_equal(eval_array_expression(expr), Array$create(1:5)) + + b <- Array$create(0.5:4.5) + bad_expr <- array_expression("cast", b, options = list(to_type = int32())) + expect_is(bad_expr, "array_expression") + expect_error( + eval_array_expression(bad_expr), + "Invalid: Float value .* was truncated converting" + ) + expr <- array_expression("cast", b, options = list(to_type = int32(), allow_float_truncate = TRUE)) + expect_is(expr, "array_expression") + expect_equal(eval_array_expression(expr), Array$create(0:4)) +}) + From 3a6a545024f5c5a0547d770a2c845d7a599f9582 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 30 Dec 2020 12:27:18 -0600 Subject: [PATCH 4/9] PR comments --- r/R/expression.R | 63 ++++++++++++--------------- r/tests/testthat/test-compute-arith.R | 24 ++++++---- r/tests/testthat/test-dataset.R | 61 ++++++++++++++++++++++++++ r/tests/testthat/test-dplyr.R | 50 ++++++++++++++++++--- 4 files changed, 150 insertions(+), 48 deletions(-) diff --git a/r/R/expression.R b/r/R/expression.R index d5ed4c845e9..673225ea15a 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -64,15 +64,15 @@ build_array_expression <- function(.Generic, e1, e2, ...) { # integer inputs and floating-point division on floats if (.Generic == "/") { # TODO: omg so many ways it's wrong to assume these types - e1 <- array_expression("cast", e1, options = list(to_type = float64())) - e2 <- array_expression("cast", e2, options = list(to_type = float64())) + e1 <- e1$cast(float64()) + e2 <- e2$cast(float64()) } else if (.Generic == "%/%") { - e1 <- array_expression("cast", e1, options = list(to_type = float64())) - e2 <- array_expression("cast", e2, options = list(to_type = float64())) return(array_expression("cast", array_expression(.binary_function_map[[.Generic]], e1, e2, ...), options = list(to_type = int32(), allow_float_truncate = TRUE))) } else if (.Generic == "%%") { # {e1 - e2 * ( e1 %/% e2 )} # TODO: there has to be a way to use the form ^^^ instead of this. + # with return(e1 - e2 * (e1 %/% e2)) we get: + # "cannot add bindings to a locked environment" out <- array_expression( "subtract_checked", e1, array_expression( "multiply_checked", e2, array_expression( @@ -90,6 +90,13 @@ build_array_expression <- function(.Generic, e1, e2, ...) { ) return(out) } + + # hack to use subtract instead of subtract_checked for timestamps + if (inherits(e1$type, "Timestamp") && inherits(e2$type, "Timestamp") && .Generic == "-"){ + # don't use the checked variant for timestamp + return(array_expression("subtract", e1, e2, ...)) + } + expr <- array_expression(.binary_function_map[[.Generic]], e1, e2, ...) } expr @@ -127,15 +134,13 @@ build_array_expression <- function(.Generic, e1, e2, ...) { "*" = "multiply_checked", "/" = "divide_checked", "%/%" = "divide_checked", - "%in%" = "is_in_meta_binary", - "%%" = "divide_checked" + # we don't actually use divide_checked with `%%`, rather it is rewritten to + # use %/% above. + "%%" = "divide_checked", + # TODO: "^" (ARROW-11070) + "%in%" = "is_in_meta_binary" ) - -# ‘"^"’ - - - .array_function_map <- c(.unary_function_map, .binary_function_map) eval_array_expression <- function(x) { @@ -202,7 +207,10 @@ print.array_expression <- function(x, ...) { #' @export Expression <- R6Class("Expression", inherit = ArrowObject, public = list( - ToString = function() dataset___expr__ToString(self) + ToString = function() dataset___expr__ToString(self), + cast = function(to_type, ...) { + Expression$create("cast", self, options = list(to_type = to_type, ...)) + } ) ) Expression$create <- function(function_name, @@ -243,31 +251,16 @@ build_dataset_expression <- function(.Generic, e1, e2, ...) { # integer inputs and floating-point division on floats if (.Generic == "/") { # TODO: omg so many ways it's wrong to assume these types - e1 <- Expression$create("cast", e1, options = list(to_type = float64())) - e2 <- Expression$create("cast", e2, options = list(to_type = float64())) + e1 <- e1$cast(float64()) + e2 <- e2$cast(float64()) } else if (.Generic == "%/%") { - e1 <- Expression$create("cast", e1, options = list(to_type = float64())) - e2 <- Expression$create("cast", e2, options = list(to_type = float64())) - return(Expression$create("cast", Expression$create(.binary_function_map[[.Generic]], e1, e2, ...), options = list(to_type = int32(), allow_float_truncate = TRUE))) + # In R, integer division works like floor(float division) + out <- build_dataset_expression("/", e1, e2) + return(out$cast(int32(), allow_float_truncate = TRUE)) } else if (.Generic == "%%") { - # {e1 - e2 * ( e1 %/% e2 )} - # TODO: there has to be a way to use the form ^^^ instead of this. - out <- Expression$create( - "subtract_checked", e1, Expression$create( - "multiply_checked", e2, Expression$create( - # this outer cast is to ensure that the result of this and the - # result of multiply are the same - "cast", - Expression$create( - "cast", - Expression$create(.binary_function_map[[.Generic]], e1, e2, ...), - options = list(to_type = int32(), allow_float_truncate = TRUE) - ), - options = list(to_type = e2$type, allow_float_truncate = TRUE) - ) - ) - ) - return(out) + # TODO: need to do something with types to ensure that e2 is compatible + # with e1 %/% e2 and e1. + return(e1 - e2 * ( e1 %/% e2 )) } expr <- Expression$create(.binary_function_map[[.Generic]], e1, e2, ...) diff --git a/r/tests/testthat/test-compute-arith.R b/r/tests/testthat/test-compute-arith.R index 829eaae5796..7ac5bd9016c 100644 --- a/r/tests/testthat/test-compute-arith.R +++ b/r/tests/testthat/test-compute-arith.R @@ -15,13 +15,6 @@ # specific language governing permissions and limitations # under the License. -# TODO: -# * More tests for edge cases, esp. with division; add test helpers here? -# * Is there a better "autocasting" solution? See what rules C++ Datasets do -# * test-dplyr tests (Added one addition, and one summarize, but check to see if -# we can make summarize route through arrow need more?) -# * then, dataset tests, special casing for division - test_that("Addition", { a <- Array$create(c(1:4, NA_integer_)) expect_type_equal(a, int32()) @@ -37,7 +30,7 @@ test_that("Addition", { casted <- a$cast(int8()) expect_error(casted + 257) - skip("autocasting should happen in compute kernels; R workaround fails on this") + skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") expect_type_equal(a + 4.1, float64()) expect_equal(a + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_))) }) @@ -75,3 +68,18 @@ test_that("Division", { expect_equal(b %% 2, Array$create(c(1:4 %% 2, NA_real_))) }) + +test_that("Dates casting", { + a <- Array$create(c(Sys.Date() + 1:4, NA_integer_)) + + skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") + expect_equal(a + 2, Array$create(c((Sys.Date() + 1:4 ) + 2), NA_integer_)) +}) + +test_that("Datetimes", { + a <- Array$create(c(Sys.time() + 1:4, NA_integer_)) + b <- Scalar$create(Sys.time()) + result <- a - b + expect_is(result$type, "DataType") + expect_identical(result$type$ToString(), "duration[us]") +}) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 1125e83cd8d..7b127b6e515 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -552,6 +552,67 @@ test_that("filter() with expressions", { df2[1:2, c("chr", "dbl")] ) ) + + expect_equivalent( + ds %>% + select(chr, dbl, int) %>% + filter(int %/% 2L > 3 & dbl < 53) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl", "int")], + df2[1:2, c("chr", "dbl", "int")] + ) + ) + + expect_equivalent( + ds %>% + select(chr, dbl, int) %>% + filter(int %/% 2 > 3 & dbl < 53) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl", "int")], + df2[1:2, c("chr", "dbl", "int")] + ) + ) + + expect_equivalent( + ds %>% + select(chr, dbl, int) %>% + filter(int %% 2L > 0 & dbl < 53) %>% + collect() %>% + arrange(dbl), + rbind( + df1[c(1, 3, 5, 7, 9), c("chr", "dbl", "int")], + df2[1, c("chr", "dbl", "int")] + ) + ) + + skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") + expect_equivalent( + ds %>% + select(chr, dbl, int) %>% + filter(int %% 2 > 0 & dbl < 53) %>% + collect() %>% + arrange(dbl), + rbind( + df1[c(1, 3, 5, 7, 9), c("chr", "dbl", "int")], + df2[1, c("chr", "dbl", "int")] + ) + ) + + expect_equivalent( + ds %>% + select(chr, dbl, int) %>% + filter(dbl + int > 15 & dbl < 53L) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl", "int")], + df2[1:2, c("chr", "dbl", "int")] + ) + ) }) test_that("filter scalar validation doesn't crash (ARROW-7772)", { diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 79317a9b201..f98bda0e82a 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -27,6 +27,8 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star expr <- rlang::enquo(expr) expected <- rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(input = tbl))) + skip_msg <- NULL + if (is.null(skip_record_batch)) { via_batch <- rlang::eval_tidy( expr, @@ -34,7 +36,7 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star ) expect_equivalent(via_batch, expected, ...) } else { - skip(skip_record_batch) + skip_msg <- c(skip_msg, skip_record_batch) } if (is.null(skip_table)) { @@ -44,7 +46,11 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star ) expect_equivalent(via_table, expected, ...) } else { - skip(skip_table) + skip_msg <- c(skip_msg, skip_table) + } + + if (!is.null(skip_msg)) { + skip(paste(skip_msg, collpase = "\n")) } } @@ -158,14 +164,48 @@ test_that("filtering with arithmetic", { tbl ) + expect_dplyr_equal( + input %>% + filter(int / 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(int / 2L > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") expect_dplyr_equal( input %>% filter(dbl %/% 2 > 3) %>% select(string = chr, int, dbl) %>% collect(), - tbl, - # TODO: why are record batched versions problematic? - skip_record_batch = "record batches aren't (auto?) casting correctly" + tbl + ) +}) + +test_that("filtering with expression + autocasting", { + skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") + expect_dplyr_equal( + input %>% + filter(dbl + 1 > 3L) %>% # test autocasting with comparison to 3L + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(int + 1 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl ) }) From 7cf15d4e5e4175a133ad0da4803440d29032748a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 30 Dec 2020 13:47:00 -0800 Subject: [PATCH 5/9] Move array/scalar autocasting to eval time --- r/R/arrowExports.R | 4 -- r/R/expression.R | 76 ++++++++++++++++----------- r/R/scalar.R | 10 +++- r/src/arrowExports.cpp | 19 +------ r/src/scalar.cpp | 6 --- r/tests/testthat/test-compute-arith.R | 3 +- r/tests/testthat/test-dataset.R | 17 +++++- r/tests/testthat/test-dplyr.R | 4 +- r/tests/testthat/test-expression.R | 3 +- 9 files changed, 74 insertions(+), 68 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 11fd99b2321..c41ef33cb69 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1412,10 +1412,6 @@ Scalar__ToString <- function(s){ .Call(`_arrow_Scalar__ToString` , s) } -Scalar__CastTo <- function(s, t){ - .Call(`_arrow_Scalar__CastTo` , s, t) -} - StructScalar__field <- function(s, i){ .Call(`_arrow_StructScalar__field` , s, i) } diff --git a/r/R/expression.R b/r/R/expression.R index 673225ea15a..bd944a27a59 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -57,38 +57,28 @@ build_array_expression <- function(.Generic, e1, e2, ...) { if (.Generic %in% names(.unary_function_map)) { expr <- array_expression(.unary_function_map[[.Generic]], e1) } else { - e1 <- .wrap_arrow(e1, .Generic, e2$type) - e2 <- .wrap_arrow(e2, .Generic, e1$type) + e1 <- .wrap_arrow(e1, .Generic) + e2 <- .wrap_arrow(e2, .Generic) # In Arrow, "divide" is one function, which does integer division on # integer inputs and floating-point division on floats if (.Generic == "/") { # TODO: omg so many ways it's wrong to assume these types - e1 <- e1$cast(float64()) - e2 <- e2$cast(float64()) + e1 <- cast_array_expression(e1, float64()) + e2 <- cast_array_expression(e2, float64()) } else if (.Generic == "%/%") { - return(array_expression("cast", array_expression(.binary_function_map[[.Generic]], e1, e2, ...), options = list(to_type = int32(), allow_float_truncate = TRUE))) + # In R, integer division works like floor(float division) + out <- build_array_expression("/", e1, e2) + return(cast_array_expression(out, int32(), allow_float_truncate = TRUE)) } else if (.Generic == "%%") { # {e1 - e2 * ( e1 %/% e2 )} - # TODO: there has to be a way to use the form ^^^ instead of this. - # with return(e1 - e2 * (e1 %/% e2)) we get: - # "cannot add bindings to a locked environment" - out <- array_expression( - "subtract_checked", e1, array_expression( - "multiply_checked", e2, array_expression( - # this outer cast is to ensure that the result of this and the - # result of multiply are the same - "cast", - array_expression( - "cast", - array_expression(.binary_function_map[[.Generic]], e1, e2, ...), - options = list(to_type = int32(), allow_float_truncate = TRUE) - ), - options = list(to_type = e2$type, allow_float_truncate = TRUE) - ) - ) - ) - return(out) + # ^^^ form doesn't work because Ops.Array evaluates eagerly, + # but we can build that up + quotient <- build_array_expression("%/%", e1, e2) + # this cast is to ensure that the result of this and e1 are the same + # (autocasting only applies to scalars) + base <- cast_array_expression(quotient * e2, e1$type) + return(build_array_expression("-", e1, base)) } # hack to use subtract instead of subtract_checked for timestamps @@ -102,14 +92,24 @@ build_array_expression <- function(.Generic, e1, e2, ...) { expr } -.wrap_arrow <- function(arg, fun, type) { +cast_array_expression <- function(x, to_type, safe = TRUE, ...) { + opts <- list( + to_type = to_type, + allow_int_overflow = !safe, + allow_time_truncate = !safe, + allow_float_truncate = !safe + ) + array_expression("cast", x, options = modifyList(opts, list(...))) +} + +.wrap_arrow <- function(arg, fun) { if (!inherits(arg, c("ArrowObject", "array_expression"))) { # TODO: Array$create if lengths are equal? # TODO: these kernels should autocast like the dataset ones do (e.g. int vs. float) if (fun == "%in%") { - arg <- Array$create(arg, type = type) + arg <- Array$create(arg) } else { - arg <- Scalar$create(arg, type = type) + arg <- Scalar$create(arg) } } arg @@ -151,6 +151,16 @@ eval_array_expression <- function(x) { a } }) + if (length(x$args) == 2L) { + # Insert implicit casts + if (inherits(x$args[[1]], "Scalar")) { + x$args[[1]] <- x$args[[1]]$cast(x$args[[2]]$type) + } else if (inherits(x$args[[2]], "Scalar")) { + x$args[[2]] <- x$args[[2]]$cast(x$args[[1]]$type) + } else if (x$fun == "is_in_meta_binary" && inherits(x$args[[2]], "Array")) { + x$args[[2]] <- x$args[[2]]$cast(x$args[[1]]$type) + } + } call_function(x$fun, args = x$args, options = x$options %||% empty_named_list()) } @@ -208,8 +218,14 @@ print.array_expression <- function(x, ...) { Expression <- R6Class("Expression", inherit = ArrowObject, public = list( ToString = function() dataset___expr__ToString(self), - cast = function(to_type, ...) { - Expression$create("cast", self, options = list(to_type = to_type, ...)) + cast = function(to_type, safe = TRUE, ...) { + opts <- list( + to_type = to_type, + allow_int_overflow = !safe, + allow_time_truncate = !safe, + allow_float_truncate = !safe + ) + Expression$create("cast", self, options = modifyList(opts, list(...))) } ) ) @@ -258,8 +274,6 @@ build_dataset_expression <- function(.Generic, e1, e2, ...) { out <- build_dataset_expression("/", e1, e2) return(out$cast(int32(), allow_float_truncate = TRUE)) } else if (.Generic == "%%") { - # TODO: need to do something with types to ensure that e2 is compatible - # with e1 %/% e2 and e1. return(e1 - e2 * ( e1 %/% e2 )) } diff --git a/r/R/scalar.R b/r/R/scalar.R index 12f29990e0a..774fe571145 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -32,8 +32,14 @@ Scalar <- R6Class("Scalar", # TODO: document the methods public = list( ToString = function() Scalar__ToString(self), - cast = function(target_type) { - Scalar__CastTo(self, as_type(target_type)) + cast = function(target_type, safe = TRUE, ...) { + opts <- list( + to_type = as_type(target_type), + allow_int_overflow = !safe, + allow_time_truncate = !safe, + allow_float_truncate = !safe + ) + call_function("cast", self, options = modifyList(opts, list(...))) }, as_vector = function() Scalar__as_vector(self) ), diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 670bee20665..882c9469344 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -5539,23 +5539,7 @@ END_CPP11 } #else extern "C" SEXP _arrow_Scalar__ToString(SEXP s_sexp){ - Rf_error("Cannot call Scalar__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); -} -#endif - -// scalar.cpp -#if defined(ARROW_R_WITH_ARROW) -std::shared_ptr Scalar__CastTo(const std::shared_ptr& s, const std::shared_ptr& t); -extern "C" SEXP _arrow_Scalar__CastTo(SEXP s_sexp, SEXP t_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type s(s_sexp); - arrow::r::Input&>::type t(t_sexp); - return cpp11::as_sexp(Scalar__CastTo(s, t)); -END_CPP11 -} -#else -extern "C" SEXP _arrow_Scalar__CastTo(SEXP s_sexp, SEXP t_sexp){ - Rf_error("Cannot call Scalar__CastTo(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); + Rf_error("Cannot call Scalar__ToString(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif @@ -6617,7 +6601,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 4}, { "_arrow_Array__GetScalar", (DL_FUNC) &_arrow_Array__GetScalar, 2}, { "_arrow_Scalar__ToString", (DL_FUNC) &_arrow_Scalar__ToString, 1}, - { "_arrow_Scalar__CastTo", (DL_FUNC) &_arrow_Scalar__CastTo, 2}, { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2}, { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2}, { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1}, diff --git a/r/src/scalar.cpp b/r/src/scalar.cpp index 2c2d291b5bf..c0cc396b02d 100644 --- a/r/src/scalar.cpp +++ b/r/src/scalar.cpp @@ -47,12 +47,6 @@ std::string Scalar__ToString(const std::shared_ptr& s) { return s->ToString(); } -// [[arrow::export]] -std::shared_ptr Scalar__CastTo(const std::shared_ptr& s, - const std::shared_ptr& t) { - return ValueOrStop(s->CastTo(t)); -} - // [[arrow::export]] std::shared_ptr StructScalar__field( const std::shared_ptr& s, int i) { diff --git a/r/tests/testthat/test-compute-arith.R b/r/tests/testthat/test-compute-arith.R index 7ac5bd9016c..393fdf45455 100644 --- a/r/tests/testthat/test-compute-arith.R +++ b/r/tests/testthat/test-compute-arith.R @@ -28,7 +28,8 @@ test_that("Addition", { # overflow errors — this is slightly different from R's `NA` coercion when # overflowing, but better than the alternative of silently restarting casted <- a$cast(int8()) - expect_error(casted + 257) + expect_error(casted + 127) + expect_error(casted + 200) skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") expect_type_equal(a + 4.1, float64()) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 7b127b6e515..fbdf5307cea 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -589,7 +589,20 @@ test_that("filter() with expressions", { ) ) - skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") + expect_equivalent( + ds %>% + select(chr, dbl, int) %>% + filter(int %% 2L > 0 & dbl < 53) %>% + collect() %>% + arrange(dbl), + rbind( + df1[c(1, 3, 5, 7, 9), c("chr", "dbl", "int")], + df2[1, c("chr", "dbl", "int")] + ) + ) + + skip("Implicit casts aren't being inserted everywhere they need to be") + # Error: NotImplemented: Function multiply_checked has no kernel matching input types (scalar[double], array[int32]) expect_equivalent( ds %>% select(chr, dbl, int) %>% @@ -602,6 +615,8 @@ test_that("filter() with expressions", { ) ) + skip("Implicit casts are only inserted for scalars") + # Error: NotImplemented: Function add_checked has no kernel matching input types (array[double], array[int32]) expect_equivalent( ds %>% select(chr, dbl, int) %>% diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index f98bda0e82a..a80e17c6f3e 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -180,7 +180,6 @@ test_that("filtering with arithmetic", { tbl ) - skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") expect_dplyr_equal( input %>% filter(dbl %/% 2 > 3) %>% @@ -191,7 +190,6 @@ test_that("filtering with arithmetic", { }) test_that("filtering with expression + autocasting", { - skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") expect_dplyr_equal( input %>% filter(dbl + 1 > 3L) %>% # test autocasting with comparison to 3L @@ -243,7 +241,7 @@ test_that("Print method", { int: int32 chr: string -* Filter: and(and(greater(, 2), or(equal(, "d"), equal(, "f"))), less(, 5L)) +* Filter: and(and(greater(, 2), or(equal(, "d"), equal(, "f"))), less(, 5)) See $.data for the source Arrow object', fixed = TRUE ) diff --git a/r/tests/testthat/test-expression.R b/r/tests/testthat/test-expression.R index 1251cf0e7c5..3c100812ff1 100644 --- a/r/tests/testthat/test-expression.R +++ b/r/tests/testthat/test-expression.R @@ -29,7 +29,7 @@ test_that("array_expression print method", { expect_output( print(build_array_expression(">", Array$create(1:5), 4)), # Not ideal but it is informative - "greater(, 4L)", + "greater(, 4)", fixed = TRUE ) }) @@ -84,4 +84,3 @@ test_that("Can create an expression", { expect_is(expr, "array_expression") expect_equal(eval_array_expression(expr), Array$create(0:4)) }) - From 8e9cbf834fc935248aef637b20050f8623e018f1 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 30 Dec 2020 14:16:42 -0800 Subject: [PATCH 6/9] Add jira to skips --- r/tests/testthat/test-dataset.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index fbdf5307cea..2aae3bd31bf 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -601,7 +601,7 @@ test_that("filter() with expressions", { ) ) - skip("Implicit casts aren't being inserted everywhere they need to be") + skip("Implicit casts aren't being inserted everywhere they need to be (ARROW-11080)") # Error: NotImplemented: Function multiply_checked has no kernel matching input types (scalar[double], array[int32]) expect_equivalent( ds %>% @@ -615,7 +615,7 @@ test_that("filter() with expressions", { ) ) - skip("Implicit casts are only inserted for scalars") + skip("Implicit casts are only inserted for scalars (ARROW-11080)") # Error: NotImplemented: Function add_checked has no kernel matching input types (array[double], array[int32]) expect_equivalent( ds %>% From 7e3a2db05e73d5336584ff70ab3d4e78e405e525 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 31 Dec 2020 10:00:57 -0600 Subject: [PATCH 7/9] Remove datetime hackery --- r/R/expression.R | 6 ------ r/tests/testthat/test-compute-arith.R | 8 -------- 2 files changed, 14 deletions(-) diff --git a/r/R/expression.R b/r/R/expression.R index bd944a27a59..9a5e575183d 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -81,12 +81,6 @@ build_array_expression <- function(.Generic, e1, e2, ...) { return(build_array_expression("-", e1, base)) } - # hack to use subtract instead of subtract_checked for timestamps - if (inherits(e1$type, "Timestamp") && inherits(e2$type, "Timestamp") && .Generic == "-"){ - # don't use the checked variant for timestamp - return(array_expression("subtract", e1, e2, ...)) - } - expr <- array_expression(.binary_function_map[[.Generic]], e1, e2, ...) } expr diff --git a/r/tests/testthat/test-compute-arith.R b/r/tests/testthat/test-compute-arith.R index 393fdf45455..ffde12c4d9b 100644 --- a/r/tests/testthat/test-compute-arith.R +++ b/r/tests/testthat/test-compute-arith.R @@ -76,11 +76,3 @@ test_that("Dates casting", { skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") expect_equal(a + 2, Array$create(c((Sys.Date() + 1:4 ) + 2), NA_integer_)) }) - -test_that("Datetimes", { - a <- Array$create(c(Sys.time() + 1:4, NA_integer_)) - b <- Scalar$create(Sys.time()) - result <- a - b - expect_is(result$type, "DataType") - expect_identical(result$type$ToString(), "duration[us]") -}) From e4ec8321a7d1bd5e758b74c1830fb3941d519952 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 6 Jan 2021 14:44:29 -0800 Subject: [PATCH 8/9] Update JIRAs in skips --- r/src/arrowExports.cpp | 2 +- r/tests/testthat/test-compute-arith.R | 4 ++-- r/tests/testthat/test-dataset.R | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 882c9469344..a66b69d0bab 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -5539,7 +5539,7 @@ END_CPP11 } #else extern "C" SEXP _arrow_Scalar__ToString(SEXP s_sexp){ - Rf_error("Cannot call Scalar__ToString(). Please use arrow::install_arrow() to install required runtime libraries. "); + Rf_error("Cannot call Scalar__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif diff --git a/r/tests/testthat/test-compute-arith.R b/r/tests/testthat/test-compute-arith.R index ffde12c4d9b..d37367d47c8 100644 --- a/r/tests/testthat/test-compute-arith.R +++ b/r/tests/testthat/test-compute-arith.R @@ -31,7 +31,7 @@ test_that("Addition", { expect_error(casted + 127) expect_error(casted + 200) - skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") + skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-8919") expect_type_equal(a + 4.1, float64()) expect_equal(a + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_))) }) @@ -73,6 +73,6 @@ test_that("Division", { test_that("Dates casting", { a <- Array$create(c(Sys.Date() + 1:4, NA_integer_)) - skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-11078") + skip("autocasting should happen in compute kernels; R workaround fails on this ARROW-8919") expect_equal(a + 2, Array$create(c((Sys.Date() + 1:4 ) + 2), NA_integer_)) }) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 2aae3bd31bf..5bdbc42410e 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -601,7 +601,7 @@ test_that("filter() with expressions", { ) ) - skip("Implicit casts aren't being inserted everywhere they need to be (ARROW-11080)") + skip("Implicit casts aren't being inserted everywhere they need to be (ARROW-8919)") # Error: NotImplemented: Function multiply_checked has no kernel matching input types (scalar[double], array[int32]) expect_equivalent( ds %>% @@ -615,7 +615,7 @@ test_that("filter() with expressions", { ) ) - skip("Implicit casts are only inserted for scalars (ARROW-11080)") + skip("Implicit casts are only inserted for scalars (ARROW-8919)") # Error: NotImplemented: Function add_checked has no kernel matching input types (array[double], array[int32]) expect_equivalent( ds %>% From a6b2a6f94cc8605afdca26ac844dfbd1b74d4760 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 7 Jan 2021 08:29:30 -0800 Subject: [PATCH 9/9] News --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index ebf80ee1d81..a606c03b9cf 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -29,6 +29,7 @@ ## Enhancements +* Arithmetic operations (`+`, `*`, etc.) are supported on Arrays and ChunkedArrays and can be used in filter expressions in Arrow `dplyr` pipelines * Table columns can now be added, replaced, or removed by assigning (`<-`) with either `$` or `[[` * Column names of Tables and RecordBatches can be renamed by assigning `names()` * Large string types can now be written to Parquet files