diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R
index b815515a4fa..d173620398e 100644
--- a/r/tests/testthat/helper-expectation.R
+++ b/r/tests/testthat/helper-expectation.R
@@ -70,20 +70,39 @@ verify_output <- function(...) {
   testthat::verify_output(...)
 }
 
-expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its start
-                               tbl, # A tbl/df as reference, will make RB/Table with
-                               skip_record_batch = NULL, # Msg, if should skip RB test
-                               skip_table = NULL, # Msg, if should skip Table test
+#' @param expr A dplyr pipeline with `input` as its start
+#' @param tbl A tbl/df as reference; a RecordBatch and Table will be created from it
+#' @param skip_record_batch string skip message if the RecordBatch test should be skipped
+#' @param skip_table string skip message if the Table test should be skipped
+#' @param warning string expected warning from the RecordBatch and Table paths,
+#'   passed to `expect_warning()`. Special values:
+#'   * `NA` (the default) for ensuring no warning message
+#'   * `TRUE` is a special case meaning to check for the
+#'     "not supported in Arrow; pulling data into R" message.
+#' @param ... additional arguments, passed to `expect_equivalent()`
+expect_dplyr_equal <- function(expr,
+                               tbl,
+                               skip_record_batch = NULL,
+                               skip_table = NULL,
+                               warning = NA,
                                ...) {
   expr <- rlang::enquo(expr)
   expected <- rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(input = tbl)))
 
+  if (isTRUE(warning)) {
+    # Special-case the simple warning:
+    warning <- "not supported in Arrow; pulling data into R"
+  }
+
   skip_msg <- NULL
 
   if (is.null(skip_record_batch)) {
-    via_batch <- rlang::eval_tidy(
-      expr,
-      rlang::new_data_mask(rlang::env(input = record_batch(tbl)))
+    expect_warning(
+      via_batch <- rlang::eval_tidy(
+        expr,
+        rlang::new_data_mask(rlang::env(input = record_batch(tbl)))
+      ),
+      warning
     )
     expect_equivalent(via_batch, expected, ...)
   } else {
@@ -91,9 +110,12 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star
   }
 
   if (is.null(skip_table)) {
-    via_table <- rlang::eval_tidy(
-      expr,
-      rlang::new_data_mask(rlang::env(input = Table$create(tbl)))
+    expect_warning(
+      via_table <- rlang::eval_tidy(
+        expr,
+        rlang::new_data_mask(rlang::env(input = Table$create(tbl)))
+      ),
+      warning
     )
     expect_equivalent(via_table, expected, ...)
   } else {
@@ -110,7 +132,7 @@ expect_dplyr_error <- function(expr, # A dplyr pipeline with `input` as its star
                                ...) {
   # ensure we have supplied tbl
   force(tbl)
-  
+
   expr <- rlang::enquo(expr)
   msg <- tryCatch(
     rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(input = tbl))),
@@ -126,7 +148,7 @@ expect_dplyr_error <- function(expr, # A dplyr pipeline with `input` as its star
   # but what we really care about is the `x` block
   # so (temporarily) let's pull those blocks out when we find them
   pattern <- i18ize_error_messages()
-  
+
   if (grepl(pattern, msg)) {
     msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg)
   }
@@ -179,7 +201,7 @@ expect_vector_equal <- function(expr, # A vectorized R expression containing `in
   if (is.null(skip_chunked_array)) {
     # split input vector into two to exercise ChunkedArray with >1 chunk
     split_vector <- split_vector_as_list(vec)
-    
+
     via_chunked <- rlang::eval_tidy(
       expr,
       rlang::new_data_mask(rlang::env(input = ChunkedArray$create(split_vector[[1]], split_vector[[2]])))
@@ -199,29 +221,29 @@ expect_vector_error <- function(expr, # A vectorized R expression containing `in
                                 skip_array = NULL, # Msg, if should skip Array test
                                 skip_chunked_array = NULL, # Msg, if should skip ChunkedArray test
                                 ...) {
-  
+
   expr <- rlang::enquo(expr)
-  
+
   msg <- tryCatch(
     rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(input = vec))),
     error = function (e) {
       msg <- conditionMessage(e)
-      
+
       pattern <- i18ize_error_messages()
-      
+
       if (grepl(pattern, msg)) {
         msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg)
       }
       msg
     }
   )
-  
+
   expect_true(identical(typeof(msg), "character"), label = "vector errored")
-  
+
   skip_msg <- NULL
-  
+
   if (is.null(skip_array)) {
-    
+
     expect_error(
       rlang::eval_tidy(
         expr,
@@ -233,11 +255,11 @@ expect_vector_error <- function(expr, # A vectorized R expression containing `in
   } else {
     skip_msg <- c(skip_msg, skip_array)
   }
-  
+
   if (is.null(skip_chunked_array)) {
     # split input vector into two to exercise ChunkedArray with >1 chunk
     split_vector <- split_vector_as_list(vec)
-    
+
     expect_error(
       rlang::eval_tidy(
         expr,
@@ -249,7 +271,7 @@ expect_vector_error <- function(expr, # A vectorized R expression containing `in
   } else {
     skip_msg <- c(skip_msg, skip_chunked_array)
   }
-  
+
   if (!is.null(skip_msg)) {
     skip(paste(skip_msg, collpase = "\n"))
   }
diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R
index 25bdddef689..41418014bea 100644
--- a/r/tests/testthat/test-compute-aggregate.R
+++ b/r/tests/testthat/test-compute-aggregate.R
@@ -209,8 +209,15 @@ test_that("Edge cases", {
   for (type in c(int32(), float64(), bool())) {
     expect_equal(as.vector(sum(a$cast(type), na.rm = TRUE)), sum(NA, na.rm = TRUE))
     expect_equal(as.vector(mean(a$cast(type), na.rm = TRUE)), mean(NA, na.rm = TRUE))
-    expect_equal(as.vector(min(a$cast(type), na.rm = TRUE)), min(NA, na.rm = TRUE))
-    expect_equal(as.vector(max(a$cast(type), na.rm = TRUE)), max(NA, na.rm = TRUE))
+    expect_equal(
+      as.vector(min(a$cast(type), na.rm = TRUE)),
+      # Suppress the base R warning about no non-missing arguments
+      suppressWarnings(min(NA, na.rm = TRUE))
+    )
+    expect_equal(
+      as.vector(max(a$cast(type), na.rm = TRUE)),
+      suppressWarnings(max(NA, na.rm = TRUE))
+    )
   }
 })
 
@@ -342,29 +349,29 @@ test_that("match_arrow", {
 
   ca <- ChunkedArray$create(c(1, 4, 3, 1, 1, 3, 4))
   expect_equal(match_arrow(ca, tab), ChunkedArray$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L)))
-  
+
   sc <- Scalar$create(3)
   expect_equal(match_arrow(sc, tab), Scalar$create(1L))
-  
+
   vec <- c(1,2)
   expect_equal(match_arrow(vec, tab), Array$create(c(3L, 2L)))
-  
+
 })
 
 test_that("is_in", {
   a <- Array$create(c(9, 4, 3))
   tab <- c(4, 3, 2, 1)
   expect_equal(is_in(a, tab), Array$create(c(FALSE, TRUE, TRUE)))
-  
+
   ca <- ChunkedArray$create(c(9, 4, 3))
   expect_equal(is_in(ca, tab), ChunkedArray$create(c(FALSE, TRUE, TRUE)))
-  
+
   sc <- Scalar$create(3)
   expect_equal(is_in(sc, tab), Scalar$create(TRUE))
-  
+
   vec <- c(1,9)
   expect_equal(is_in(vec, tab), Array$create(c(TRUE, FALSE)))
-  
+
 })
 
 test_that("value_counts", {
@@ -383,40 +390,40 @@
 })
 
 test_that("any.Array and any.ChunkedArray", {
-  
+
   data <- c(1:10, NA, NA)
   expect_vector_equal(any(input > 5), data)
   expect_vector_equal(any(input < 1), data)
   expect_vector_equal(any(input < 1, na.rm = TRUE), data)
-  
+
   data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE)
-  
+
   expect_vector_equal(any(input), data_logical)
   expect_vector_equal(any(input, na.rm = TRUE), data_logical)
-  
+
 })
 
 test_that("all.Array and all.ChunkedArray", {
   data <- c(1:10, NA, NA)
-  
+
   expect_vector_equal(all(input > 5), data)
   expect_vector_equal(all(input < 11), data)
   expect_vector_equal(all(input < 11, na.rm = TRUE), data)
-  
+
   data_logical <- c(TRUE, TRUE, NA)
-  
+
   expect_vector_equal(all(input), data_logical)
   expect_vector_equal(all(input, na.rm = TRUE), data_logical)
-  
+
 })
 
 test_that("variance", {
   data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA)
   arr <- Array$create(data)
   chunked_arr <- ChunkedArray$create(data)
-  
+
   expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(34596))
   expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(34596))
 })
@@ -425,7 +432,7 @@ test_that("stddev", {
   data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA)
   arr <- Array$create(data)
   chunked_arr <- ChunkedArray$create(data)
-  
+
   expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(186))
   expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(186))
 })
diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R
index 6bba58a7e06..f070a0150e9 100644
--- a/r/tests/testthat/test-dplyr-filter.R
+++ b/r/tests/testthat/test-dplyr-filter.R
@@ -315,31 +315,25 @@ test_that("Filtering on a column that doesn't exist errors correctly", {
 })
 
 test_that("Filtering with unsupported functions", {
-  expect_warning(
-    expect_dplyr_equal(
-      input %>%
-        filter(int > 2, pnorm(dbl) > .99) %>%
-        collect(),
-      tbl
-    ),
-    'Expression pnorm(dbl) > 0.99 not supported in Arrow; pulling data into R',
-    fixed = TRUE
-  )
-  expect_warning(
-    expect_dplyr_equal(
-      input %>%
-        filter(
-          nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg
-          int > 2, # good
-          pnorm(dbl) > .99 # bad, opaque
-        ) %>%
-        collect(),
-      tbl
-    ),
-'* In nchar(chr, type = "bytes", allowNA = TRUE) == 1, allowNA = TRUE not supported by Arrow
-* Expression pnorm(dbl) > 0.99 not supported in Arrow
-pulling data into R',
-    fixed = TRUE
+  expect_dplyr_equal(
+    input %>%
+      filter(int > 2, pnorm(dbl) > .99) %>%
+      collect(),
+    tbl,
+    warning = 'Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow; pulling data into R'
+  )
+  expect_dplyr_equal(
+    input %>%
+      filter(
+        nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg
+        int > 2, # good
+        pnorm(dbl) > .99 # bad, opaque
+      ) %>%
+      collect(),
+    tbl,
+    warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported by Arrow
+\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow
+pulling data into R'
   )
 })
diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R
index 47bee2c28e5..d01afc86fef 100644
--- a/r/tests/testthat/test-dplyr-lubridate.R
+++ b/r/tests/testthat/test-dplyr-lubridate.R
@@ -20,15 +20,21 @@ skip_if_not_available("dataset")
 library(lubridate)
 library(dplyr)
 
+# base::strptime() defaults to local timezone
+# but arrow's strptime defaults to UTC.
+# So that tests are consistent, set the local timezone to UTC
+# TODO: consider reevaluating this workaround after ARROW-12980
+withr::local_timezone("UTC")
+
 test_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "")
 test_df <- tibble::tibble(date = test_date)
 
 # We can support this feature after ARROW-12980 is merged
-test_that("timezone aware timestamps are not supported",{
-  
-  tz_aware_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "BST")
+test_that("timezone aware timestamps are not supported", {
+
+  tz_aware_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "Asia/Pyongyang")
   tz_aware_df <- tibble::tibble(date = tz_aware_date)
-  
+
   expect_error(
     Table$create(tz_aware_df) %>%
       mutate(x = wday(date)) %>%
@@ -39,10 +45,10 @@ test_that("timezone aware timestamps are not supported",{
 
 # We can support this feature when ARROW-13138 is resolved
 test_that("date32 objects are not supported",{
-  
+
   date <- ymd("2017-01-01")
   df <- tibble::tibble(date = date)
-  
+
   expect_error(
     Table$create(df) %>%
       mutate(x = year(date)) %>%
@@ -70,7 +76,7 @@ test_that("extract isoyear from date", {
     test_df
   )
 })
-  
+
 test_that("extract quarter from date", {
   expect_dplyr_equal(
     input %>%
@@ -106,7 +112,6 @@ test_that("extract day from date", {
     test_df
   )
 })
-
 
 test_that("extract wday from date", {
   expect_dplyr_equal(
@@ -115,21 +120,21 @@ test_that("extract wday from date", {
       collect(),
     test_df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = wday(date, week_start = 3)) %>%
       collect(),
     test_df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = wday(date, week_start = 1)) %>%
       collect(),
     test_df
   )
-  
+
   # We should be able to support the label argument after this ticket is resolved:
   # https://issues.apache.org/jira/browse/ARROW-13133
   x <- Expression$field_ref("x")
@@ -137,9 +142,9 @@ test_that("extract wday from date", {
     nse_funcs$wday(x, label = TRUE),
     "Label argument not supported by Arrow"
   )
-  
+
 })
-  
+
 test_that("extract yday from date", {
   expect_dplyr_equal(
     input %>%
@@ -148,7 +153,7 @@ test_that("extract yday from date", {
     test_df
   )
 })
-  
+
 test_that("extract hour from date", {
   expect_dplyr_equal(
     input %>%
@@ -157,7 +162,7 @@ test_that("extract hour from date", {
     test_df
   )
 })
-  
+
 test_that("extract minute from date", {
   expect_dplyr_equal(
     input %>%
@@ -166,7 +171,7 @@ test_that("extract minute from date", {
     test_df
   )
 })
-  
+
 test_that("extract second from date", {
   expect_dplyr_equal(
     input %>%
@@ -177,4 +182,3 @@ test_that("extract second from date", {
     tolerance = 1e-6
   )
 })
-
diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R
index 66cb9ff305d..3e64891cec5 100644
--- a/r/tests/testthat/test-dplyr-mutate.R
+++ b/r/tests/testthat/test-dplyr-mutate.R
@@ -154,20 +154,17 @@ test_that("nchar() arguments", {
     tbl
   )
   # This tests the whole abandon_ship() machinery
-  expect_warning(
-    expect_dplyr_equal(
-      input %>%
-        select(int, verses) %>%
-        mutate(
-          line_lengths = nchar(verses, type = "bytes", allowNA = TRUE),
-          longer = line_lengths * 10
-        ) %>%
-        filter(line_lengths > 15) %>%
-        collect(),
-      tbl
-    ),
-    'In nchar(verses, type = "bytes", allowNA = TRUE), allowNA = TRUE not supported by Arrow; pulling data into R',
-    fixed = TRUE
+  expect_dplyr_equal(
+    input %>%
+      select(int, verses) %>%
+      mutate(
+        line_lengths = nchar(verses, type = "bytes", allowNA = TRUE),
+        longer = line_lengths * 10
+      ) %>%
+      filter(line_lengths > 15) %>%
+      collect(),
+    tbl,
+    warning = 'In nchar\\(verses, type = "bytes", allowNA = TRUE\\), allowNA = TRUE not supported by Arrow; pulling data into R'
   )
 })
 
@@ -253,28 +250,24 @@ test_that("dplyr::mutate's examples", {
 
   # but warn that they're pulling data into R to do so
   # across + autosplicing: ARROW-11699
-  expect_warning(
-    expect_dplyr_equal(
-      input %>%
-        select(name, homeworld, species) %>%
-        mutate(across(!name, as.factor)) %>%
-        collect(),
-      starwars
-    ),
-    "Expression across.*not supported in Arrow"
+  expect_dplyr_equal(
+    input %>%
+      select(name, homeworld, species) %>%
+      mutate(across(!name, as.factor)) %>%
+      collect(),
+    starwars,
+    warning = "Expression across.*not supported in Arrow"
   )
 
   # group_by then mutate
-  expect_warning(
-    expect_dplyr_equal(
-      input %>%
-        select(name, mass, homeworld) %>%
-        group_by(homeworld) %>%
-        mutate(rank = min_rank(desc(mass))) %>%
-        collect(),
-      starwars
-    ),
-    "not supported in Arrow"
+  expect_dplyr_equal(
+    input %>%
+      select(name, mass, homeworld) %>%
+      group_by(homeworld) %>%
+      mutate(rank = min_rank(desc(mass))) %>%
+      collect(),
+    starwars,
+    warning = TRUE
   )
 
   # `.before` and `.after` experimental args: ARROW-11701
@@ -345,15 +338,13 @@ test_that("dplyr::mutate's examples", {
   # tibbles because the expressions are computed within groups.
   # The following normalises `mass` by the global average:
   # TODO(ARROW-11702)
-  expect_warning(
-    expect_dplyr_equal(
-      input %>%
-        select(name, mass, species) %>%
-        mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) %>%
-        collect(),
-      starwars
-    ),
-    "not supported in Arrow"
+  expect_dplyr_equal(
+    input %>%
+      select(name, mass, species) %>%
+      mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) %>%
+      collect(),
+    starwars,
+    warning = TRUE
   )
 })
 
diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R
index 438f1038e57..e7b860eb99c 100644
--- a/r/tests/testthat/test-dplyr-string-functions.R
+++ b/r/tests/testthat/test-dplyr-string-functions.R
@@ -652,6 +652,11 @@ test_that("edge cases in string detection and replacement", {
 })
 
 test_that("strptime", {
+  # base::strptime() defaults to local timezone
+  # but arrow's strptime defaults to UTC.
+  # So that tests are consistent, set the local timezone to UTC
+  # TODO: consider reevaluating this workaround after ARROW-12980
+  withr::local_timezone("UTC")
   t_string <- tibble(x = c("2018-10-07 19:04:05", NA))
   t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA))
 
@@ -769,25 +774,28 @@ test_that("arrow_find_substring and arrow_find_substring_regex", {
 })
 
 test_that("stri_reverse and arrow_ascii_reverse functions", {
-  
+  # TODO: these actually aren't implemented (ARROW-12869)
+  # Fix them, then remove the `warning = TRUE` arguments
   df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
-  
+
   df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux"))
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = stri_reverse(x)) %>%
       collect(),
-    df_utf8
+    df_utf8,
+    warning = TRUE # Remove me
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = stri_reverse(x)) %>%
       collect(),
-    df_ascii
+    df_ascii,
+    warning = TRUE # Remove me
   )
-  
+
   expect_equivalent(
     df_ascii %>%
       Table$create() %>%
@@ -795,7 +803,7 @@ test_that("stri_reverse and arrow_ascii_reverse functions", {
       collect(),
     tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
   )
-  
+
   expect_error(
     df_utf8 %>%
       Table$create() %>%
@@ -806,12 +814,12 @@ test_that("stri_reverse and arrow_ascii_reverse functions", {
 })
 
 test_that("str_like", {
-  
+
   df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
-  
+
   # TODO: After new version of stringr with str_like has been released, update all
   # these tests to use expect_dplyr_equal
-  
+
   # No match - entire string
   expect_equivalent(
     df %>%
@@ -820,7 +828,7 @@ test_that("str_like", {
       collect(),
     tibble(x = c(FALSE, FALSE))
   )
-  
+
   # Match - entire string
   expect_equivalent(
     df %>%
@@ -829,7 +837,7 @@ test_that("str_like", {
       collect(),
     tibble(x = c(TRUE, FALSE))
   )
-  
+
   # Wildcard
   expect_equivalent(
     df %>%
@@ -838,7 +846,7 @@ test_that("str_like", {
      collect(),
    tibble(x = c(TRUE, FALSE))
   )
-  
+
   # Ignore case
   expect_equivalent(
     df %>%
@@ -847,7 +855,7 @@ test_that("str_like", {
      collect(),
    tibble(x = c(FALSE, FALSE))
   )
-  
+
   # Single character
   expect_equivalent(
     df %>%
@@ -856,9 +864,9 @@ test_that("str_like", {
      collect(),
    tibble(x = c(FALSE, TRUE))
   )
-  
+
   # This will give an error until a new version of stringr with str_like has been released
-  skip("Test will fail until stringr > 1.4.0 is release")
+  skip_if_not(packageVersion("stringr") > "1.4.0")
   expect_dplyr_equal(
     input %>%
       mutate(x = str_like(x, "%baz%")) %>%
@@ -868,42 +876,41 @@ test_that("str_like", {
 })
 
 test_that("str_pad", {
-  
   df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = str_pad(x, width = 31)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = str_pad(x, width = 30, side = "right")) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>%
      collect(),
    df
  )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(x = str_pad(x, width = 31, side = "both")) %>%
       collect(),
     df
   )
-  
+
 })
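Usage reference (an illustrative sketch, not part of the patch): the modes of the new `warning` argument to `expect_dplyr_equal()` as the updated filter and mutate tests above exercise them. `tbl` and `starwars` are the reference data frames those tests already define, and `input` is the placeholder name the helper's data mask substitutes for each Arrow object. `warning = NA` (the default) asserts that the Arrow paths raise no warning, since `expect_warning(expr, NA)` checks for the absence of a warning.

# A regex (note the escaped parentheses) checks for one specific warning:
expect_dplyr_equal(
  input %>%
    filter(int > 2, pnorm(dbl) > .99) %>%
    collect(),
  tbl,
  warning = 'Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow; pulling data into R'
)

# warning = TRUE is shorthand for the generic
# "not supported in Arrow; pulling data into R" message:
expect_dplyr_equal(
  input %>%
    select(name, mass, homeworld) %>%
    group_by(homeworld) %>%
    mutate(rank = min_rank(desc(mass))) %>%
    collect(),
  starwars,
  warning = TRUE
)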