From fe6ac2ecb11fd099ca6c318d731383866cc284e5 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 15 Jul 2021 15:21:14 +0100 Subject: [PATCH 01/24] Add ifelse and if_else --- r/R/dplyr-functions.R | 34 ++++++++++++++++------ r/tests/testthat/test-dplyr.R | 53 ++++++++++++++++++++++------------- 2 files changed, 58 insertions(+), 29 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 35db573550d..e13794279c5 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -294,8 +294,8 @@ nse_funcs$substr <- function(x, start, stop) { msg = "`stop` must be length 1 - other lengths are not supported in Arrow" ) - # substr treats values as if they're on a continous number line, so values - # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics + # substr treats values as if they're on a continous number line, so values + # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics # this behavior if (start <= 0) { start <- 1 @@ -310,7 +310,7 @@ nse_funcs$substr <- function(x, start, stop) { Expression$create( "utf8_slice_codeunits", x, - # we don't need to subtract 1 from `stop` as C++ counts exclusively + # we don't need to subtract 1 from `stop` as C++ counts exclusively # which effectively cancels out the difference in indexing between R & C++ options = list(start = start - 1L, stop = stop) ) @@ -336,14 +336,14 @@ nse_funcs$str_sub <- function(string, start = 1L, end = -1L) { end <- .Machine$integer.max } - # An end value lower than a start value returns an empty string in + # An end value lower than a start value returns an empty string in # stringr::str_sub so set end to 0 here to match this behavior if (end < start) { end <- 0 } # subtract 1 from `start` because C++ is 0-based and R is 1-based - # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0 + # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0 # when `start` < 0, both str_sub and utf8_slice_codeunits count backwards from the end if (start > 0) { start <- start - 1L @@ -634,20 +634,36 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption } nse_funcs$log <- function(x, base = exp(1)) { - + if (base == exp(1)) { return(Expression$create("ln_checked", x)) } - + if (base == 2) { return(Expression$create("log2_checked", x)) } - + if (base == 10) { return(Expression$create("log10_checked", x)) - } + } # ARROW-13345 stop("`base` values other than exp(1), 2 and 10 not supported in Arrow", call. = FALSE) } + nse_funcs$logb <- nse_funcs$log + +nse_funcs$ifelse <- function(test, yes, no){ + Expression$create("if_else", test, yes, no) +} + +nse_funcs$if_else <- function(condition, true, false, missing = NULL){ + + assert_is(false, class(true)) + + if (inherits(true, "character") || inherits(false, "character")) { + stop("`true` and `false` character values not yet supported in Arrow") + } + + Expression$create("if_else", condition, true, false) +} diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 63d0433fc23..a45d878c967 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -67,7 +67,7 @@ chr: string See $.data for the source Arrow object', fixed = TRUE ) - + }) test_that("summarize", { @@ -821,7 +821,7 @@ test_that("type checks on expressions", { collect(), tbl ) - + # the code in the expectation below depends on RE2 skip_if_not_available("re2") @@ -947,64 +947,64 @@ test_that("abs()", { }) test_that("log functions", { - + df <- tibble(x = c(1:10, NA, NA)) - + expect_dplyr_equal( input %>% mutate(y = log(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log(x, base = exp(1))) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log(x, base = 2)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log(x, base = 10)) %>% collect(), df ) - + expect_error( nse_funcs$log(Expression$scalar(x), base = 5), "`base` values other than exp(1), 2 and 10 not supported in Arrow", fixed = TRUE ) - + expect_dplyr_equal( input %>% mutate(y = logb(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log1p(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log2(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log10(x)) %>% @@ -1013,39 +1013,39 @@ test_that("log functions", { ) }) - + test_that("trig functions", { - + df <- tibble(x = c(seq(from = 0, to = 1, by = 0.1), NA)) - + expect_dplyr_equal( input %>% mutate(y = sin(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = cos(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = tan(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = asin(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = acos(x)) %>% @@ -1053,4 +1053,17 @@ test_that("trig functions", { df ) -}) \ No newline at end of file +}) + +test_that("if_else", { + df <- tibble(x = c(-127, -10, -1, -0 , 0, 1, 10, 127, NA)) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(x > 0, 1, 0) + ) %>% collect(), + df + ) + +}) From 6b3a7aada16a9e905153c82407a095fd321f6be3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 15 Jul 2021 15:44:06 +0100 Subject: [PATCH 02/24] Use build_expr not Expression$create --- r/R/dplyr-functions.R | 12 +++++++----- r/tests/testthat/test-dplyr.R | 10 +++++++++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index e13794279c5..5ed3c3ed867 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -650,7 +650,6 @@ nse_funcs$log <- function(x, base = exp(1)) { stop("`base` values other than exp(1), 2 and 10 not supported in Arrow", call. = FALSE) } - nse_funcs$logb <- nse_funcs$log nse_funcs$ifelse <- function(test, yes, no){ @@ -658,12 +657,15 @@ nse_funcs$ifelse <- function(test, yes, no){ } nse_funcs$if_else <- function(condition, true, false, missing = NULL){ - - assert_is(false, class(true)) - if (inherits(true, "character") || inherits(false, "character")) { stop("`true` and `false` character values not yet supported in Arrow") } - Expression$create("if_else", condition, true, false) + build_expr("if_else", condition, true, false) +} + +# Although base R ifelse allows `yes` and `no` to be different classes +# +nse_funcs$ifelse <- function(test, yes, no){ + nse_funcs$if_else(condition = test, true = yes, false = no) } diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index a45d878c967..717a18f5e6e 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1055,7 +1055,7 @@ test_that("trig functions", { }) -test_that("if_else", { +test_that("if_else and ifelse", { df <- tibble(x = c(-127, -10, -1, -0 , 0, 1, 10, 127, NA)) expect_dplyr_equal( @@ -1066,4 +1066,12 @@ test_that("if_else", { df ) + expect_dplyr_equal( + input %>% + mutate( + y = ifelse(x > 0, 1, 0) + ) %>% collect(), + df + ) + }) From 613a1de7aba69d4fbc698bd70066fea11bb07a1f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 15 Jul 2021 16:15:35 +0100 Subject: [PATCH 03/24] Add tests and warnings --- r/tests/testthat/test-dplyr.R | 41 +++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 717a18f5e6e..9a2a2e678fb 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1066,6 +1066,29 @@ test_that("if_else and ifelse", { df ) + expect_dplyr_equal( + input %>% + mutate( + y = if_else(x > 0, 1, 0) + ) %>% collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(x > 0, x, 0) + ) %>% collect(), + df, + # Do we need to open a JIRA to implement this?? + warn = TRUE + ) + + expect_error( + nse_funcs$if_else(x > 0, 1, FALSE), + 'false must be a "numeric"' + ) + expect_dplyr_equal( input %>% mutate( @@ -1074,4 +1097,22 @@ test_that("if_else and ifelse", { df ) + expect_dplyr_equal( + input %>% + mutate(y = ifelse(x > 0, 1, FALSE)) %>% + collect(), + df, + warn = TRUE + ) + + expect_dplyr_equal( + input %>% + mutate( + y = ifelse(x > 0, x, 0) + ) %>% collect(), + df, + # Do we need to open a JIRA to implement this?? + warn = TRUE + ) + }) From 1190697ba0cf6a5cb05abddc541a18b4c0920e71 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 15 Jul 2021 12:06:28 -0500 Subject: [PATCH 04/24] A few changes --- r/R/dplyr-functions.R | 6 ++++++ r/tests/testthat/test-dplyr.R | 38 +++++++++++++++++------------------ 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 5ed3c3ed867..25d6e116e4b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -657,6 +657,12 @@ nse_funcs$ifelse <- function(test, yes, no){ } nse_funcs$if_else <- function(condition, true, false, missing = NULL){ + # We ought to assert that the types of the true and false conditions will result + # in the same types. We can't compare the objects themselves directly because + # they might be expressions (that will result in a type) or R objects that will + # need to be compared to see if they are compatible with arrow types. + # ARROW-13186 might make this easier with a more robust way. + # TODO: do this ^^^ if (inherits(true, "character") || inherits(false, "character")) { stop("`true` and `false` character values not yet supported in Arrow") } diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 9a2a2e678fb..ba9207528db 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1069,40 +1069,33 @@ test_that("if_else and ifelse", { expect_dplyr_equal( input %>% mutate( - y = if_else(x > 0, 1, 0) + y = if_else(x > 0, x, 0) ) %>% collect(), df ) - expect_dplyr_equal( - input %>% + expect_error( + Table$create(df) %>% mutate( - y = if_else(x > 0, x, 0) + y = if_else(x > 0, 1, FALSE) ) %>% collect(), - df, - # Do we need to open a JIRA to implement this?? - warn = TRUE - ) - - expect_error( - nse_funcs$if_else(x > 0, 1, FALSE), - 'false must be a "numeric"' + 'NotImplemented: Function if_else has no kernel matching input types' ) expect_dplyr_equal( input %>% mutate( - y = ifelse(x > 0, 1, 0) + y = if_else(x > 0, 1, NA_real_) ) %>% collect(), df ) expect_dplyr_equal( input %>% - mutate(y = ifelse(x > 0, 1, FALSE)) %>% - collect(), - df, - warn = TRUE + mutate( + y = ifelse(x > 0, 1, 0) + ) %>% collect(), + df ) expect_dplyr_equal( @@ -1110,9 +1103,14 @@ test_that("if_else and ifelse", { mutate( y = ifelse(x > 0, x, 0) ) %>% collect(), - df, - # Do we need to open a JIRA to implement this?? - warn = TRUE + df ) + skip("TODO: could? should? we support the autocasting in ifelse") + expect_dplyr_equal( + input %>% + mutate(y = ifelse(x > 0, 1, FALSE)) %>% + collect(), + df + ) }) From cd7a2d9254b3f39f3da89ed85af22648f0816074 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 15 Jul 2021 15:11:34 -0500 Subject: [PATCH 05/24] A few more tests, slightly more guard rails for unimplemented types --- r/R/dplyr-functions.R | 17 +++++++-- r/tests/testthat/test-dplyr.R | 67 ++++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 25d6e116e4b..36676bce45e 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -663,8 +663,21 @@ nse_funcs$if_else <- function(condition, true, false, missing = NULL){ # need to be compared to see if they are compatible with arrow types. # ARROW-13186 might make this easier with a more robust way. # TODO: do this ^^^ - if (inherits(true, "character") || inherits(false, "character")) { - stop("`true` and `false` character values not yet supported in Arrow") + + # if_else only supports boolean, numeric, or temporal types right now + # TODO: remove when ARROW-12955 merges + # If true/false are R types, we can use `is.*` directly + invalid_r_types <- is.character(true) || is.character(false) || is.list(true) || + is.list(false) || is.factor(true) || is.factor(false) + # However, if they are expressions, we need to use the functions from nse_funcs + invalid_expression_types_true <- inherits(true, "Expression") && ( + nse_funcs$is.character(true) || nse_funcs$is.list(true) || nse_funcs$is.factor(true) + ) + invalid_expression_types_false <- inherits(false, "Expression") && ( + nse_funcs$is.character(false) || nse_funcs$is.list(false) || nse_funcs$is.factor(false) + ) + if (invalid_r_types | invalid_expression_types_true | invalid_expression_types_false) { + stop("`true` and `false` character values not yet supported in Arrow", call. = FALSE) } build_expr("if_else", condition, true, false) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index ba9207528db..6e5558b30d8 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1056,28 +1056,26 @@ test_that("trig functions", { }) test_that("if_else and ifelse", { - df <- tibble(x = c(-127, -10, -1, -0 , 0, 1, 10, 127, NA)) - expect_dplyr_equal( input %>% mutate( - y = if_else(x > 0, 1, 0) + y = if_else(int > 5, 1, 0) ) %>% collect(), - df + example_data ) expect_dplyr_equal( input %>% mutate( - y = if_else(x > 0, x, 0) + y = if_else(int > 5, int, 0L) ) %>% collect(), - df + example_data ) expect_error( - Table$create(df) %>% + Table$create(example_data) %>% mutate( - y = if_else(x > 0, 1, FALSE) + y = if_else(int > 5, 1, FALSE) ) %>% collect(), 'NotImplemented: Function if_else has no kernel matching input types' ) @@ -1085,32 +1083,69 @@ test_that("if_else and ifelse", { expect_dplyr_equal( input %>% mutate( - y = if_else(x > 0, 1, NA_real_) + y = if_else(int > 5, 1, NA_real_) ) %>% collect(), - df + example_data ) expect_dplyr_equal( input %>% mutate( - y = ifelse(x > 0, 1, 0) + y = ifelse(int > 5, 1, 0) ) %>% collect(), - df + example_data ) expect_dplyr_equal( input %>% mutate( - y = ifelse(x > 0, x, 0) + y = ifelse(dbl > 5, TRUE, FALSE) ) %>% collect(), - df + example_data + ) + + expect_dplyr_equal( + input %>% + mutate( + y = ifelse(chr %in% letters[1:3], 1L, 3L) + ) %>% collect(), + example_data + ) + + # TODO: this should not warn / pull into R, once ARROW-12955 merges + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, "one", "zero") + ) %>% collect(), + example_data, + warn = TRUE + ) + + # TODO: this should not warn / pull into R, once ARROW-12955 merges + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, chr, chr) + ) %>% collect(), + example_data, + warn = TRUE + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, fct, factor("a")) + ) %>% collect(), + example_data, + warn = TRUE ) skip("TODO: could? should? we support the autocasting in ifelse") expect_dplyr_equal( input %>% - mutate(y = ifelse(x > 0, 1, FALSE)) %>% + mutate(y = ifelse(int > 5, 1, FALSE)) %>% collect(), - df + example_data ) }) From 1ede692e6599a2594181eacb7fa24694b9d62c7d Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 15 Jul 2021 15:22:02 -0500 Subject: [PATCH 06/24] ifelse -> if_else for generic tests --- r/tests/testthat/test-dplyr.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 6e5558b30d8..3b3c0c07dd4 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1099,7 +1099,7 @@ test_that("if_else and ifelse", { expect_dplyr_equal( input %>% mutate( - y = ifelse(dbl > 5, TRUE, FALSE) + y = if_else(dbl > 5, TRUE, FALSE) ) %>% collect(), example_data ) @@ -1107,7 +1107,7 @@ test_that("if_else and ifelse", { expect_dplyr_equal( input %>% mutate( - y = ifelse(chr %in% letters[1:3], 1L, 3L) + y = if_else(chr %in% letters[1:3], 1L, 3L) ) %>% collect(), example_data ) From b6167fb8be0df9aa98ff7afdc92dec8b16947c70 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 15 Jul 2021 15:24:33 -0500 Subject: [PATCH 07/24] take out errange ifelse from rebase --- r/R/dplyr-functions.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 36676bce45e..b90d97da941 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -652,10 +652,6 @@ nse_funcs$log <- function(x, base = exp(1)) { nse_funcs$logb <- nse_funcs$log -nse_funcs$ifelse <- function(test, yes, no){ - Expression$create("if_else", test, yes, no) -} - nse_funcs$if_else <- function(condition, true, false, missing = NULL){ # We ought to assert that the types of the true and false conditions will result # in the same types. We can't compare the objects themselves directly because From 93f51a3a5f96ac67c7134beb0c2046ee37e876b0 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 16 Jul 2021 10:32:48 -0500 Subject: [PATCH 08/24] Clean up, rebase --- r/R/dplyr-functions.R | 32 ++++++++++----------------- r/tests/testthat/helper-expectation.R | 2 +- r/tests/testthat/test-dplyr.R | 23 +++++++++++++------ 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index b90d97da941..89a1c2774af 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -653,27 +653,19 @@ nse_funcs$log <- function(x, base = exp(1)) { nse_funcs$logb <- nse_funcs$log nse_funcs$if_else <- function(condition, true, false, missing = NULL){ - # We ought to assert that the types of the true and false conditions will result - # in the same types. We can't compare the objects themselves directly because - # they might be expressions (that will result in a type) or R objects that will - # need to be compared to see if they are compatible with arrow types. - # ARROW-13186 might make this easier with a more robust way. - # TODO: do this ^^^ - - # if_else only supports boolean, numeric, or temporal types right now - # TODO: remove when ARROW-12955 merges - # If true/false are R types, we can use `is.*` directly - invalid_r_types <- is.character(true) || is.character(false) || is.list(true) || - is.list(false) || is.factor(true) || is.factor(false) + if (!is.null(missing)) { + arrow_not_supported("missing argument") + } + + # TODO: if_else doesn't yet support factors/dictionaries this can be removed when + # ARROW-13358 merges + warn_r_types <- is.factor(true) || is.factor(false) # However, if they are expressions, we need to use the functions from nse_funcs - invalid_expression_types_true <- inherits(true, "Expression") && ( - nse_funcs$is.character(true) || nse_funcs$is.list(true) || nse_funcs$is.factor(true) - ) - invalid_expression_types_false <- inherits(false, "Expression") && ( - nse_funcs$is.character(false) || nse_funcs$is.list(false) || nse_funcs$is.factor(false) - ) - if (invalid_r_types | invalid_expression_types_true | invalid_expression_types_false) { - stop("`true` and `false` character values not yet supported in Arrow", call. = FALSE) + warn_expression_types_true <- inherits(true, "Expression") && nse_funcs$is.factor(true) + warn_expression_types_false <- inherits(false, "Expression") && nse_funcs$is.factor(false) + + if (warn_r_types | warn_expression_types_true | warn_expression_types_false) { + warning("Factors are currently converted to chracters in if_else and ifelse", call. = FALSE) } build_expr("if_else", condition, true, false) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 359e31ef57d..a39c6b4bbe0 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -91,7 +91,7 @@ expect_dplyr_equal <- function(expr, if (isTRUE(warning)) { # Special-case the simple warning: - warning <- "not supported in Arrow; pulling data into R" + warning <- "not supported (in|by) Arrow; pulling data into R" } skip_msg <- NULL diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 3b3c0c07dd4..573a143f238 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1112,33 +1112,42 @@ test_that("if_else and ifelse", { example_data ) - # TODO: this should not warn / pull into R, once ARROW-12955 merges expect_dplyr_equal( input %>% mutate( y = if_else(int > 5, "one", "zero") ) %>% collect(), - example_data, - warn = TRUE + example_data ) - # TODO: this should not warn / pull into R, once ARROW-12955 merges expect_dplyr_equal( input %>% mutate( y = if_else(int > 5, chr, chr) ) %>% collect(), + example_data + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, chr, chr, missing = "MISSING") + ) %>% collect(), example_data, - warn = TRUE + warning = TRUE ) + # TODO: remove the mutate + warning after ARROW-13358 is merged and Arrow + # supports factors in if(_)else expect_dplyr_equal( input %>% mutate( y = if_else(int > 5, fct, factor("a")) - ) %>% collect(), + ) %>% collect() %>% + # This is a no-op on the Arrow side, but necesary to make the results equal + mutate(y = as.character(y)), example_data, - warn = TRUE + warning = "Factors are currently converted to chracters in if_else and ifelse" ) skip("TODO: could? should? we support the autocasting in ifelse") From 6f67ecbbdf322f18f05660d9616fde1ca97f9253 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 16 Jul 2021 12:04:24 -0500 Subject: [PATCH 09/24] CR comments + add support for the missing arg (mostly) --- r/R/dplyr-functions.R | 8 ++++++-- r/tests/testthat/helper-expectation.R | 1 + r/tests/testthat/test-dplyr.R | 16 +++++++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 89a1c2774af..1a1a36a734b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -654,7 +654,11 @@ nse_funcs$logb <- nse_funcs$log nse_funcs$if_else <- function(condition, true, false, missing = NULL){ if (!is.null(missing)) { - arrow_not_supported("missing argument") + return(nse_funcs$if_else( + Expression$create("is_null", condition), + missing, + nse_funcs$if_else(condition, true, false) + )) } # TODO: if_else doesn't yet support factors/dictionaries this can be removed when @@ -665,7 +669,7 @@ nse_funcs$if_else <- function(condition, true, false, missing = NULL){ warn_expression_types_false <- inherits(false, "Expression") && nse_funcs$is.factor(false) if (warn_r_types | warn_expression_types_true | warn_expression_types_false) { - warning("Factors are currently converted to chracters in if_else and ifelse", call. = FALSE) + warning("Factors are currently converted to characters in if_else and ifelse", call. = FALSE) } build_expr("if_else", condition, true, false) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index a39c6b4bbe0..c4dab9ace45 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -91,6 +91,7 @@ expect_dplyr_equal <- function(expr, if (isTRUE(warning)) { # Special-case the simple warning: + # TODO: ARROW-13362 pick one of in or by and use it everywhere warning <- "not supported (in|by) Arrow; pulling data into R" } diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 573a143f238..53c74aacc3d 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1133,8 +1133,7 @@ test_that("if_else and ifelse", { mutate( y = if_else(int > 5, chr, chr, missing = "MISSING") ) %>% collect(), - example_data, - warning = TRUE + example_data ) # TODO: remove the mutate + warning after ARROW-13358 is merged and Arrow @@ -1147,7 +1146,18 @@ test_that("if_else and ifelse", { # This is a no-op on the Arrow side, but necesary to make the results equal mutate(y = as.character(y)), example_data, - warning = "Factors are currently converted to chracters in if_else and ifelse" + warning = "Factors are currently converted to characters in if_else and ifelse" + ) + + skip("ARROW-12055 for better NaN support") + # currently NaNs are not NAs and so the missing argument is not correctly + # applied + expect_dplyr_equal( + input %>% + mutate( + y = if_else(dbl > 5, chr, chr, missing = "MISSING") + ) %>% collect(), + example_data_for_sorting ) skip("TODO: could? should? we support the autocasting in ifelse") From afe21ea8fdbc9c5accae6b8773dc3f1a22338cde Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 13:09:57 -0400 Subject: [PATCH 10/24] Make nse_funcs$is.*() type check functions work on R literals --- r/R/dplyr-functions.R | 16 ++++++++-------- r/tests/testthat/test-dplyr.R | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 89a1c2774af..ee1f8984d08 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -147,35 +147,35 @@ nse_funcs$as.numeric <- function(x) { # is.* type functions nse_funcs$is.character <- function(x) { - x$type_id() %in% Type[c("STRING", "LARGE_STRING")] + is.character(x) || x$type_id() %in% Type[c("STRING", "LARGE_STRING")] } nse_funcs$is.numeric <- function(x) { - x$type_id() %in% Type[c( + is.numeric(x) || x$type_id() %in% Type[c( "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE", "DECIMAL", "DECIMAL256" )] } nse_funcs$is.double <- function(x) { - x$type_id() == Type["DOUBLE"] + is.double(x) || x$type_id() == Type["DOUBLE"] } nse_funcs$is.integer <- function(x) { - x$type_id() %in% Type[c( + is.integer(x) || x$type_id() %in% Type[c( "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64" )] } nse_funcs$is.integer64 <- function(x) { - x$type_id() == Type["INT64"] + is.integer64(x) || x$type_id() == Type["INT64"] } nse_funcs$is.logical <- function(x) { - x$type_id() == Type["BOOL"] + is.logical(x) || x$type_id() == Type["BOOL"] } nse_funcs$is.factor <- function(x) { - x$type_id() == Type["DICTIONARY"] + is.factor(x) || x$type_id() == Type["DICTIONARY"] } nse_funcs$is.list <- function(x) { - x$type_id() %in% Type[c("LIST", "FIXED_SIZE_LIST", "LARGE_LIST")] + is.list(x) || x$type_id() %in% Type[c("LIST", "FIXED_SIZE_LIST", "LARGE_LIST")] } # rlang::is_* type functions diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 573a143f238..fb669a5a96e 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -835,6 +835,31 @@ test_that("type checks on expressions", { ) }) +test_that("type checks on R scalar literals", { + expect_dplyr_equal( + input %>% + transmute( + chr_is_chr = is.character("foo"), + int_is_chr = is.character(42L), + int_is_int = is.integer(42L), + chr_is_int = is.integer("foo"), + dbl_is_num = is.numeric(3.14159), + int_is_num = is.numeric(42L), + chr_is_num = is.numeric("foo"), + dbl_is_dbl = is.double(3.14159), + chr_is_dbl = is.double("foo"), + lgl_is_lgl = is.logical(TRUE), + chr_is_lgl = is.logical("foo"), + fct_is_fct = is.factor(factor("foo", levels = c("foo", "bar", "baz"))), + chr_is_fct = is.factor("foo"), + lst_is_lst = is.list(list(c(a = "foo", b = "bar"))), + chr_is_lst = is.list("foo") + ) %>% + collect(), + tbl + ) +}) + test_that("as.factor()/dictionary_encode()", { skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression {x=dictionary_encode(x, {NON-REPRESENTABLE OPTIONS})}") df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")) From 8024723334394752e55bb684757fd2a64eec330a Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 13:10:10 -0400 Subject: [PATCH 11/24] Simplify type warning code in if_else() --- r/R/dplyr-functions.R | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index ee1f8984d08..0341e6e4ce6 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -659,12 +659,8 @@ nse_funcs$if_else <- function(condition, true, false, missing = NULL){ # TODO: if_else doesn't yet support factors/dictionaries this can be removed when # ARROW-13358 merges - warn_r_types <- is.factor(true) || is.factor(false) - # However, if they are expressions, we need to use the functions from nse_funcs - warn_expression_types_true <- inherits(true, "Expression") && nse_funcs$is.factor(true) - warn_expression_types_false <- inherits(false, "Expression") && nse_funcs$is.factor(false) - - if (warn_r_types | warn_expression_types_true | warn_expression_types_false) { + warn_types <- nse_funcs$is.factor(true) | nse_funcs$is.factor(false) + if (warn_types) { warning("Factors are currently converted to chracters in if_else and ifelse", call. = FALSE) } From e876926453c6509c9b73775dd49f1b428f010c73 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 13:16:08 -0400 Subject: [PATCH 12/24] Fix misspelling --- r/R/dplyr-functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 9c78cdb895f..d65724661b5 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -665,7 +665,7 @@ nse_funcs$if_else <- function(condition, true, false, missing = NULL){ # ARROW-13358 merges warn_types <- nse_funcs$is.factor(true) | nse_funcs$is.factor(false) if (warn_types) { - warning("Factors are currently converted to chracters in if_else and ifelse", call. = FALSE) + warning("Factors are currently converted to characters in if_else and ifelse", call. = FALSE) } build_expr("if_else", condition, true, false) From f450b339621cc1af73a7a96c925aa98342ecc808 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 16:13:01 -0400 Subject: [PATCH 13/24] Fix bug in is.() functions --- r/R/dplyr-functions.R | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index d65724661b5..5376021d5d8 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -147,35 +147,38 @@ nse_funcs$as.numeric <- function(x) { # is.* type functions nse_funcs$is.character <- function(x) { - is.character(x) || x$type_id() %in% Type[c("STRING", "LARGE_STRING")] + is.character(x) || (inherits(x, "Expression") && + x$type_id() %in% Type[c("STRING", "LARGE_STRING")]) } nse_funcs$is.numeric <- function(x) { - is.numeric(x) || x$type_id() %in% Type[c( + is.numeric(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE", "DECIMAL", "DECIMAL256" - )] + )]) } nse_funcs$is.double <- function(x) { - is.double(x) || x$type_id() == Type["DOUBLE"] + is.double(x) || (inherits(x, "Expression") && x$type_id() == Type["DOUBLE"]) } nse_funcs$is.integer <- function(x) { - is.integer(x) || x$type_id() %in% Type[c( + is.integer(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64" - )] + )]) } nse_funcs$is.integer64 <- function(x) { - is.integer64(x) || x$type_id() == Type["INT64"] + is.integer64(x) || (inherits(x, "Expression") && x$type_id() == Type["INT64"]) } nse_funcs$is.logical <- function(x) { - is.logical(x) || x$type_id() == Type["BOOL"] + is.logical(x) || (inherits(x, "Expression") && x$type_id() == Type["BOOL"]) } nse_funcs$is.factor <- function(x) { - is.factor(x) || x$type_id() == Type["DICTIONARY"] + is.factor(x) || (inherits(x, "Expression") && x$type_id() == Type["DICTIONARY"]) } nse_funcs$is.list <- function(x) { - is.list(x) || x$type_id() %in% Type[c("LIST", "FIXED_SIZE_LIST", "LARGE_LIST")] + is.list(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( + "LIST", "FIXED_SIZE_LIST", "LARGE_LIST" + )]) } # rlang::is_* type functions From b6e8b3b329deac84d2f86141f19623c81908d505 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 16:18:03 -0400 Subject: [PATCH 14/24] Make is.na() and is.nan() consistent with base R --- r/NEWS.md | 2 ++ r/R/arrow-datum.R | 19 +++++++++++++++++-- r/R/dplyr-functions.R | 18 ++++++++++++++++++ r/R/enums.R | 2 ++ r/R/expression.R | 12 +++++++++--- r/tests/testthat/test-Array.R | 17 +++++++++++++++++ r/tests/testthat/test-compute-sort.R | 1 - r/tests/testthat/test-dplyr.R | 13 ++++++++++++- 8 files changed, 77 insertions(+), 7 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 63be8b9df9b..55a6fedca25 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -26,6 +26,8 @@ * Added bindings for the remainder of C data interface: Type, Field, and RecordBatchReader (from the experimental C stream interface). These also have `reticulate::py_to_r()` and `r_to_py()` methods. Along with the addition of the `Scanner$ToRecordBatchReader()` method, you can now build up a Dataset query in R and pass the resulting stream of batches to another tool in process. * `match_arrow()` now converts `x` into an `Array` if it is not a `Scalar`, `Array` or `ChunkedArray` and no longer dispatches `base::match()`. * `transmute()` now errors if passed arguments `.keep`, `.before`, or `.after`, for consistency with the behavior of `dplyr` on `data.frame`s. +* `is.na()` now evaluates to `TRUE` on `NaN` values in floating point number fields, for consistency with base R. +* `is.nan()` now evaluates to `FALSE` on `NA` values in floating point number fields, for consistency with base R. # arrow 4.0.1 diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index 8becc37daf2..1208e4060da 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -47,10 +47,25 @@ is.infinite.ArrowDatum <- function(x) { } #' @export -is.na.ArrowDatum <- function(x) call_function("is_null", x) +is.na.ArrowDatum <- function(x) { + if (x$type_id() %in% TYPES_WITH_NAN) { + call_function("is_nan", x) | call_function("is_null", x) + } else { + call_function("is_null", x) + } +} #' @export -is.nan.ArrowDatum <- function(x) call_function("is_nan", x) +is.nan.ArrowDatum <- function(x) { + if (x$type_id() %in% TYPES_WITH_NAN) { + call_function("is_nan", x) & !call_function("is_null", x) + } else { + # This is just a hacky way to return an ArrowDatum identical to the input + # in shape but with a Boolean value of false in every position. + # TODO: implement this more efficiently and elegantly if possible + call_function("is_valid", x) & call_function("is_null", x) + } +} #' @export as.vector.ArrowDatum <- function(x, mode) { diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 5376021d5d8..360f58b854e 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -57,6 +57,24 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { Expression$create("cast", x, options = opts) } +nse_funcs$is.na <- function(x) { + if (is.double(x) || (inherits(x, "Expression") && + x$type_id() %in% TYPES_WITH_NAN)) { + build_expr("is_nan", x) | build_expr("is_null", x) + } else { + build_expr("is_null", x) + } +} + +nse_funcs$is.nan <- function(x) { + if (is.double(x) || (inherits(x, "Expression") && + x$type_id() %in% TYPES_WITH_NAN)) { + build_expr("is_nan", x) & !build_expr("is_null", x) + } else { + Expression$scalar(FALSE) + } +} + nse_funcs$is <- function(object, class2) { if (is.string(class2)) { switch(class2, diff --git a/r/R/enums.R b/r/R/enums.R index 8a5bf7366a9..019ebc7a337 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -81,6 +81,8 @@ Type <- enum("Type::type", LARGE_LIST = 36L ) +TYPES_WITH_NAN <- Type[c("HALF_FLOAT", "FLOAT", "DOUBLE")] + #' @rdname enums #' @export StatusCode <- enum("StatusCode", diff --git a/r/R/expression.R b/r/R/expression.R index b3fc9fe20c7..d4ed973a475 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -20,8 +20,8 @@ .unary_function_map <- list( "!" = "invert", "as.factor" = "dictionary_encode", - "is.na" = "is_null", - "is.nan" = "is_nan", + # is.na is defined in dplyr-functions.R + # is.nan is defined in dplyr-functions.R "abs" = "abs_checked", # nchar is defined in dplyr-functions.R "tolower" = "utf8_lower", @@ -206,4 +206,10 @@ Ops.Expression <- function(e1, e2) { } #' @export -is.na.Expression <- function(x) Expression$create("is_null", x) +is.na.Expression <- function(x) { + if (!is.null(x$schema) && x$type_id() %in% TYPES_WITH_NAN) { + Expression$create("is_nan", x) | build_expr("is_null", x) + } else { + Expression$create("is_null", x) + } +} diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 63ac64eee5f..305f5a34634 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -317,6 +317,23 @@ test_that("support for NaN (ARROW-3615)", { expect_equal(y$null_count, 1L) }) +test_that("is.nan() evalutes to FALSE on NA (for consistency with base R)", { + x <- c(1.0, NA, NaN, -1.0) + expect_vector_equal(is.nan(input), x) +}) + +test_that("is.nan() evalutes to FALSE on non-floats (for consistency with base R)", { + x <- c(1L, 2L, 3L) + y <- c("foo", "bar") + expect_vector_equal(is.nan(input), x) + expect_vector_equal(is.nan(input), y) +}) + +test_that("is.na() evalutes to TRUE on NaN (for consistency with base R)", { + x <- c(1, NA, NaN, -1) + expect_vector_equal(is.na(input), x) +}) + test_that("integer types casts (ARROW-3741)", { # Defining some type groups for use here and in the following tests int_types <- c(int8(), int16(), int32(), int64()) diff --git a/r/tests/testthat/test-compute-sort.R b/r/tests/testthat/test-compute-sort.R index 63977b55414..373237ff9a1 100644 --- a/r/tests/testthat/test-compute-sort.R +++ b/r/tests/testthat/test-compute-sort.R @@ -118,7 +118,6 @@ test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results sort(input, decreasing = FALSE, na.last = TRUE), tbl$dbl ) - skip("is.na() evaluates to FALSE on Arrow NaN values (ARROW-12055)") expect_vector_equal( sort(input, decreasing = TRUE, na.last = NA), tbl$dbl diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 7615d419709..761e2c6bd65 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -524,7 +524,7 @@ test_that("is.finite(), is.infinite(), is.nan()", { ) %>% collect(), df ) - skip("is.nan() evaluates to NA on NA values (ARROW-12850)") + # is.nan() evaluates to FALSE on NA_real_ (ARROW-12850) expect_dplyr_equal( input %>% transmute( @@ -534,6 +534,17 @@ test_that("is.finite(), is.infinite(), is.nan()", { ) }) +test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", { + df <- tibble(x = c(1.1, 2.2, NA_real_, 4.4, NaN, 6.6, 7.7)) + expect_dplyr_equal( + input %>% + transmute( + is_na = is.na(x) + ) %>% collect(), + df + ) +}) + test_that("type checks with is() giving Arrow types", { # with class2=DataType expect_equal( From 0b14faa6c3516ad14a20adf3261a9b2abd3f8a42 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 16:18:13 -0400 Subject: [PATCH 15/24] Improve comment --- r/R/dplyr-functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 360f58b854e..deb5b53e952 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -682,8 +682,8 @@ nse_funcs$if_else <- function(condition, true, false, missing = NULL){ )) } - # TODO: if_else doesn't yet support factors/dictionaries this can be removed when - # ARROW-13358 merges + # if_else doesn't yet support factors/dictionaries + # TODO: remove this after ARROW-13358 is merged warn_types <- nse_funcs$is.factor(true) | nse_funcs$is.factor(false) if (warn_types) { warning("Factors are currently converted to characters in if_else and ifelse", call. = FALSE) From 7384206219e46d50862bfa68c6089d4a47fd645c Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 16 Jul 2021 15:56:23 -0500 Subject: [PATCH 16/24] Use the new is.na() functionality + edit warning about factors/dicts --- r/R/dplyr-functions.R | 4 ++-- r/tests/testthat/test-dplyr.R | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index deb5b53e952..eb270e809ea 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -676,7 +676,7 @@ nse_funcs$logb <- nse_funcs$log nse_funcs$if_else <- function(condition, true, false, missing = NULL){ if (!is.null(missing)) { return(nse_funcs$if_else( - Expression$create("is_null", condition), + is.na(condition), missing, nse_funcs$if_else(condition, true, false) )) @@ -686,7 +686,7 @@ nse_funcs$if_else <- function(condition, true, false, missing = NULL){ # TODO: remove this after ARROW-13358 is merged warn_types <- nse_funcs$is.factor(true) | nse_funcs$is.factor(false) if (warn_types) { - warning("Factors are currently converted to characters in if_else and ifelse", call. = FALSE) + warning("Dictionaries (in R: factors) are currently converted to strings (characters) in if_else and ifelse", call. = FALSE) } build_expr("if_else", condition, true, false) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 761e2c6bd65..d9276c1393b 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1182,12 +1182,20 @@ test_that("if_else and ifelse", { # This is a no-op on the Arrow side, but necesary to make the results equal mutate(y = as.character(y)), example_data, - warning = "Factors are currently converted to characters in if_else and ifelse" + warning = "Dictionaries .* are currently converted to strings .* in if_else and ifelse" ) - skip("ARROW-12055 for better NaN support") - # currently NaNs are not NAs and so the missing argument is not correctly - # applied + # detecting NA and NaN works just fine + expect_dplyr_equal( + input %>% + mutate( + y = if_else(is.na(dbl), chr, chr, missing = "MISSING") + ) %>% collect(), + example_data_for_sorting + ) + + # However, currently comparisons with NaNs return false and not NaNs or NAs + skip("ARROW-13364") expect_dplyr_equal( input %>% mutate( From 2c1b5358f7d9c0a804fffe507824e05fd4c61b44 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 15:29:10 -0600 Subject: [PATCH 17/24] Call is_valid instead of !is_null Co-authored-by: Neal Richardson --- r/R/arrow-datum.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index 1208e4060da..8a3c81e7b0b 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -58,7 +58,7 @@ is.na.ArrowDatum <- function(x) { #' @export is.nan.ArrowDatum <- function(x) { if (x$type_id() %in% TYPES_WITH_NAN) { - call_function("is_nan", x) & !call_function("is_null", x) + call_function("is_nan", x) & call_function("is_valid", x) } else { # This is just a hacky way to return an ArrowDatum identical to the input # in shape but with a Boolean value of false in every position. From a6b508172919181dcf200f6786a927737cfcb00b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 15:30:35 -0600 Subject: [PATCH 18/24] Call is_valid instead of !is_null Co-authored-by: Neal Richardson --- r/R/dplyr-functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index eb270e809ea..8067c2c4243 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -69,7 +69,7 @@ nse_funcs$is.na <- function(x) { nse_funcs$is.nan <- function(x) { if (is.double(x) || (inherits(x, "Expression") && x$type_id() %in% TYPES_WITH_NAN)) { - build_expr("is_nan", x) & !build_expr("is_null", x) + build_expr("is_nan", x) & build_expr("is_valid", x) } else { Expression$scalar(FALSE) } From 6c2876335b5b9481f8ac9edd069e1e130e486bac Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 17:52:06 -0400 Subject: [PATCH 19/24] Add TODOs with Jira refs --- r/R/arrow-datum.R | 4 ++++ r/R/dplyr-functions.R | 4 ++++ r/R/expression.R | 2 ++ 3 files changed, 10 insertions(+) diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index 8a3c81e7b0b..fe5dd166f48 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -48,6 +48,8 @@ is.infinite.ArrowDatum <- function(x) { #' @export is.na.ArrowDatum <- function(x) { + # TODO: if an option is added to the is_null kernel to treat NaN as NA, + # use that to simplify the code here (ARROW-13367) if (x$type_id() %in% TYPES_WITH_NAN) { call_function("is_nan", x) | call_function("is_null", x) } else { @@ -58,6 +60,8 @@ is.na.ArrowDatum <- function(x) { #' @export is.nan.ArrowDatum <- function(x) { if (x$type_id() %in% TYPES_WITH_NAN) { + # TODO: if an option is added to the is_nan kernel to treat NA as NaN, + # use that to simplify the code here (ARROW-13366) call_function("is_nan", x) & call_function("is_valid", x) } else { # This is just a hacky way to return an ArrowDatum identical to the input diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 8067c2c4243..847209461e1 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -58,6 +58,8 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { } nse_funcs$is.na <- function(x) { + # TODO: if an option is added to the is_null kernel to treat NaN as NA, + # use that to simplify the code here (ARROW-13367) if (is.double(x) || (inherits(x, "Expression") && x$type_id() %in% TYPES_WITH_NAN)) { build_expr("is_nan", x) | build_expr("is_null", x) @@ -69,6 +71,8 @@ nse_funcs$is.na <- function(x) { nse_funcs$is.nan <- function(x) { if (is.double(x) || (inherits(x, "Expression") && x$type_id() %in% TYPES_WITH_NAN)) { + # TODO: if an option is added to the is_nan kernel to treat NA as NaN, + # use that to simplify the code here (ARROW-13366) build_expr("is_nan", x) & build_expr("is_valid", x) } else { Expression$scalar(FALSE) diff --git a/r/R/expression.R b/r/R/expression.R index d4ed973a475..064f88ec0c3 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -208,6 +208,8 @@ Ops.Expression <- function(e1, e2) { #' @export is.na.Expression <- function(x) { if (!is.null(x$schema) && x$type_id() %in% TYPES_WITH_NAN) { + # TODO: if an option is added to the is_null kernel to treat NaN as NA, + # use that to simplify the code here (ARROW-13367) Expression$create("is_nan", x) | build_expr("is_null", x) } else { Expression$create("is_null", x) From 0aaec220a28eaf833d9d1e1efe4ec7974b3f19da Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 15:53:17 -0600 Subject: [PATCH 20/24] Fix indentation Co-authored-by: Neal Richardson --- r/R/dplyr-functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 847209461e1..430860d02d9 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -699,5 +699,5 @@ nse_funcs$if_else <- function(condition, true, false, missing = NULL){ # Although base R ifelse allows `yes` and `no` to be different classes # nse_funcs$ifelse <- function(test, yes, no){ - nse_funcs$if_else(condition = test, true = yes, false = no) + nse_funcs$if_else(condition = test, true = yes, false = no) } From 52e6bc29c64fd7ac58da6481132023e0a7cada1e Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 16 Jul 2021 17:59:04 -0400 Subject: [PATCH 21/24] Improve NEWS comment --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 55a6fedca25..792b2f4081c 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -27,7 +27,7 @@ * `match_arrow()` now converts `x` into an `Array` if it is not a `Scalar`, `Array` or `ChunkedArray` and no longer dispatches `base::match()`. * `transmute()` now errors if passed arguments `.keep`, `.before`, or `.after`, for consistency with the behavior of `dplyr` on `data.frame`s. * `is.na()` now evaluates to `TRUE` on `NaN` values in floating point number fields, for consistency with base R. -* `is.nan()` now evaluates to `FALSE` on `NA` values in floating point number fields, for consistency with base R. +* `is.nan()` now evaluates to `FALSE` on `NA` values in floating point number fields and `FALSE` on all values in non-floating point fields, for consistency with base R. # arrow 4.0.1 From 3045697e2be66e1b924f20854d67b508f576d959 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 17 Jul 2021 09:46:09 -0500 Subject: [PATCH 22/24] better tests --- r/tests/testthat/test-dplyr.R | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index d9276c1393b..67baab3ede0 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1092,12 +1092,15 @@ test_that("trig functions", { }) test_that("if_else and ifelse", { + tbl <- example_data + tbl$another_chr <- tail(letters, 10) + expect_dplyr_equal( input %>% mutate( y = if_else(int > 5, 1, 0) ) %>% collect(), - example_data + tbl ) expect_dplyr_equal( @@ -1105,11 +1108,11 @@ test_that("if_else and ifelse", { mutate( y = if_else(int > 5, int, 0L) ) %>% collect(), - example_data + tbl ) expect_error( - Table$create(example_data) %>% + Table$create(tbl) %>% mutate( y = if_else(int > 5, 1, FALSE) ) %>% collect(), @@ -1121,7 +1124,7 @@ test_that("if_else and ifelse", { mutate( y = if_else(int > 5, 1, NA_real_) ) %>% collect(), - example_data + tbl ) expect_dplyr_equal( @@ -1129,7 +1132,7 @@ test_that("if_else and ifelse", { mutate( y = ifelse(int > 5, 1, 0) ) %>% collect(), - example_data + tbl ) expect_dplyr_equal( @@ -1137,7 +1140,7 @@ test_that("if_else and ifelse", { mutate( y = if_else(dbl > 5, TRUE, FALSE) ) %>% collect(), - example_data + tbl ) expect_dplyr_equal( @@ -1145,7 +1148,7 @@ test_that("if_else and ifelse", { mutate( y = if_else(chr %in% letters[1:3], 1L, 3L) ) %>% collect(), - example_data + tbl ) expect_dplyr_equal( @@ -1153,23 +1156,23 @@ test_that("if_else and ifelse", { mutate( y = if_else(int > 5, "one", "zero") ) %>% collect(), - example_data + tbl ) expect_dplyr_equal( input %>% mutate( - y = if_else(int > 5, chr, chr) + y = if_else(int > 5, chr, another_chr) ) %>% collect(), - example_data + tbl ) expect_dplyr_equal( input %>% mutate( - y = if_else(int > 5, chr, chr, missing = "MISSING") + y = if_else(int > 5, "true", chr, missing = "MISSING") ) %>% collect(), - example_data + tbl ) # TODO: remove the mutate + warning after ARROW-13358 is merged and Arrow @@ -1181,7 +1184,7 @@ test_that("if_else and ifelse", { ) %>% collect() %>% # This is a no-op on the Arrow side, but necesary to make the results equal mutate(y = as.character(y)), - example_data, + tbl, warning = "Dictionaries .* are currently converted to strings .* in if_else and ifelse" ) @@ -1189,7 +1192,7 @@ test_that("if_else and ifelse", { expect_dplyr_equal( input %>% mutate( - y = if_else(is.na(dbl), chr, chr, missing = "MISSING") + y = if_else(is.na(dbl), chr, "false", missing = "MISSING") ) %>% collect(), example_data_for_sorting ) @@ -1199,7 +1202,7 @@ test_that("if_else and ifelse", { expect_dplyr_equal( input %>% mutate( - y = if_else(dbl > 5, chr, chr, missing = "MISSING") + y = if_else(dbl > 5, chr, another_chr, missing = "MISSING") ) %>% collect(), example_data_for_sorting ) @@ -1209,6 +1212,6 @@ test_that("if_else and ifelse", { input %>% mutate(y = ifelse(int > 5, 1, FALSE)) %>% collect(), - example_data + tbl ) }) From 7bac42876f5d10afe292ce20d7f85b87e2454f40 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 17 Jul 2021 09:46:13 -0500 Subject: [PATCH 23/24] Update r/R/dplyr-functions.R Co-authored-by: Neal Richardson --- r/R/dplyr-functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 430860d02d9..d118eefaa85 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -680,7 +680,7 @@ nse_funcs$logb <- nse_funcs$log nse_funcs$if_else <- function(condition, true, false, missing = NULL){ if (!is.null(missing)) { return(nse_funcs$if_else( - is.na(condition), + nse_funcs$is.na(condition), missing, nse_funcs$if_else(condition, true, false) )) From 4470e72b6a40078f032435d2ad49e02e00eb23d3 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 17 Jul 2021 09:46:25 -0500 Subject: [PATCH 24/24] Update r/R/arrow-datum.R Co-authored-by: Neal Richardson --- r/R/arrow-datum.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index fe5dd166f48..4734d44c7ea 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -64,10 +64,7 @@ is.nan.ArrowDatum <- function(x) { # use that to simplify the code here (ARROW-13366) call_function("is_nan", x) & call_function("is_valid", x) } else { - # This is just a hacky way to return an ArrowDatum identical to the input - # in shape but with a Boolean value of false in every position. - # TODO: implement this more efficiently and elegantly if possible - call_function("is_valid", x) & call_function("is_null", x) + Scalar$create(FALSE)$as_array(length(x)) } }