Skip to content
2 changes: 1 addition & 1 deletion r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
* String operations: `strsplit()` and `str_split()`; `strptime()`; `paste()`, `paste0()`, and `str_c()`; `substr()` and `str_sub()`; `str_like()`; `str_pad()`; `stri_reverse()`
* Date/time operations: `lubridate` methods such as `year()`, `month()`, `wday()`, and so on
* Math: logarithms (`log()` et al.); trigonometry (`sin()`, `cos()`, et al.); `abs()`; `sign()`; `pmin()` and `pmax()`; `ceiling()`, `floor()`, and `trunc()`
* Conditional: `ifelse()` and `if_else()` (fixed-precision decimal numbers do not yet work and factors/dictionaries are converted to character strings); `case_when()` (currently works with numeric data types but not character strings, factors/dictionaries, or lists/structs)
* Conditional functions, with some limitations on input type in this release: `ifelse()` and `if_else()` for all but `Decimal` types; `case_when()` for logical, numeric, and temporal types only; `coalesce()` for all but lists/structs. Note also that in this release, factors/dictionaries are converted to strings in these functions.
* `is.*` functions are supported and can be used inside `relocate()`

* The print method for `arrow_dplyr_query` now includes the expression and the resulting type of columns derived by `mutate()`.
Expand Down
38 changes: 38 additions & 0 deletions r/R/dplyr-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,44 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) {
Expression$create("cast", x, options = opts)
}

nse_funcs$coalesce <- function(...) {
args <- list2(...)
if (length(args) < 1) {
abort("At least one argument must be supplied to coalesce()")
}

# Treat NaN like NA for consistency with dplyr::coalesce(), but if *all*
# the values are NaN, we should return NaN, not NA, so don't replace
# NaN with NA in the final (or only) argument
# TODO: if an option is added to the coalesce kernel to treat NaN as NA,
# use that to simplify the code here (ARROW-13389)
attr(args[[length(args)]], "last") <- TRUE
args <- lapply(args, function(arg) {
last_arg <- is.null(attr(arg, "last"))
attr(arg, "last") <- NULL

if (!inherits(arg, "Expression")) {
arg <- Expression$scalar(arg)
}

# coalesce doesn't yet support factors/dictionaries
# TODO: remove this after ARROW-13390 is merged
if (nse_funcs$is.factor(arg)) {
warning("Dictionaries (in R: factors) are currently converted to strings (characters) in coalesce", call. = FALSE)
}

if (last_arg && arg$type_id() %in% TYPES_WITH_NAN) {
# store the NA_real_ in the same type as arg to avoid avoid casting
# smaller float types to larger float types
NA_expr <- Expression$scalar(Scalar$create(NA_real_, type = arg$type()))
Expression$create("if_else", Expression$create("is_nan", arg), NA_expr, arg)
} else {
arg
}
})
Expression$create("coalesce", args = args)
}

nse_funcs$is.na <- function(x) {
# TODO: if an option is added to the is_null kernel to treat NaN as NA,
# use that to simplify the code here (ARROW-13367)
Expand Down
130 changes: 129 additions & 1 deletion r/tests/testthat/test-dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -1207,7 +1207,7 @@ test_that("if_else and ifelse", {
mutate(
y = if_else(int > 5, fct, factor("a"))
) %>% collect() %>%
# This is a no-op on the Arrow side, but necesary to make the results equal
# This is a no-op on the Arrow side, but necessary to make the results equal
mutate(y = as.character(y)),
tbl,
warning = "Dictionaries .* are currently converted to strings .* in if_else and ifelse"
Expand Down Expand Up @@ -1359,3 +1359,131 @@ test_that("case_when()", {
tbl
)
})

test_that("coalesce()", {
# character
df <- tibble(
w = c(NA_character_, NA_character_, NA_character_),
x = c(NA_character_, NA_character_, "c"),
y = c(NA_character_, "b", "c"),
z = c("a", "b", "c")
)
expect_dplyr_equal(
input %>%
mutate(
cw = coalesce(w),
cz = coalesce(z),
cwx = coalesce(w, x),
cwxy = coalesce(w, x, y),
cwxyz = coalesce(w, x, y, z)
) %>%
collect(),
df
)

# integer
df <- tibble(
w = c(NA_integer_, NA_integer_, NA_integer_),
x = c(NA_integer_, NA_integer_, 3L),
y = c(NA_integer_, 2L, 3L),
z = 1:3
)
expect_dplyr_equal(
input %>%
mutate(
cw = coalesce(w),
cz = coalesce(z),
cwx = coalesce(w, x),
cwxy = coalesce(w, x, y),
cwxyz = coalesce(w, x, y, z)
) %>%
collect(),
df
)

# double with NaNs
df <- tibble(
w = c(NA_real_, NaN, NA_real_),
x = c(NA_real_, NaN, 3.3),
y = c(NA_real_, 2.2, 3.3),
z = c(1.1, 2.2, 3.3)
)
expect_dplyr_equal(
input %>%
mutate(
cw = coalesce(w),
cz = coalesce(z),
cwx = coalesce(w, x),
cwxy = coalesce(w, x, y),
cwxyz = coalesce(w, x, y, z)
) %>%
collect(),
df
)
# NaNs stay NaN and are not converted to NA in the results
# (testing this requires expect_identical())
expect_identical(
df %>% Table$create() %>% mutate(cwx = coalesce(w, x)) %>% collect(),
df %>% mutate(cwx = coalesce(w, x))
)
expect_identical(
df %>% Table$create() %>% transmute(cw = coalesce(w)) %>% collect(),
df %>% transmute(cw = coalesce(w))
)
expect_identical(
df %>% Table$create() %>% transmute(cn = coalesce(NaN)) %>% collect(),
df %>% transmute(cn = coalesce(NaN))
)
# singles stay single
expect_equal(
(df %>%
Table$create(schema = schema(
w = float32(),
x = float32(),
y = float32(),
z = float32()
)) %>%
transmute(c = coalesce(w, x, y, z)) %>%
compute()
)$schema[[1]]$type,
float32()
)
# with R literal values
expect_dplyr_equal(
input %>%
mutate(
c1 = coalesce(4.4),
c2 = coalesce(NA_real_),
c3 = coalesce(NaN),
c4 = coalesce(w, x, y, 5.5),
c5 = coalesce(w, x, y, NA_real_),
c6 = coalesce(w, x, y, NaN)
) %>%
collect(),
df
)

# factors
# TODO: remove the mutate + warning after ARROW-13390 is merged and Arrow
# supports factors in coalesce
df <- tibble(
x = factor("a", levels = c("a", "z")),
y = factor("b", levels = c("a", "b", "c"))
)
expect_dplyr_equal(
input %>%
mutate(c = coalesce(x, y)) %>%
collect() %>%
# This is a no-op on the Arrow side, but necessary to make the results equal
mutate(c = as.character(c)),
df,
warning = "Dictionaries .* are currently converted to strings .* in coalesce"
)

# no arguments
expect_error(
nse_funcs$coalesce(),
"At least one argument must be supplied to coalesce()",
fixed = TRUE
)
})