From f2a17ab8798e61933e47721c0da7e72262be8ee3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 May 2023 11:20:38 +0200 Subject: [PATCH 1/5] Call docgen --- r/R/dplyr-funcs-doc.R | 24 ++++++++++++------------ r/man/acero.Rd | 24 ++++++++++++------------ r/man/enums.Rd | 2 +- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index b619cfe509b..7a1fc14b36e 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -83,7 +83,7 @@ #' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both #' `str_sub()` and `stringr::str_sub()` work. #' -#' In addition to these functions, you can call any of Arrow's 246 compute +#' In addition to these functions, you can call any of Arrow's 251 compute #' functions directly. Arrow has many functions that don't map to an existing R #' function. In other cases where there is an R function mapping, you can still #' call the Arrow function directly if you don't want the adaptations that the R @@ -99,30 +99,31 @@ #' #' ## base #' -#' * [`-`][-()] #' * [`!`][!()] #' * [`!=`][!=()] -#' * [`*`][*()] -#' * [`/`][/()] -#' * [`&`][&()] -#' * [`%/%`][%/%()] #' * [`%%`][%%()] +#' * [`%/%`][%/%()] #' * [`%in%`][%in%()] -#' * [`^`][^()] +#' * [`&`][&()] +#' * [`*`][*()] #' * [`+`][+()] +#' * [`-`][-()] +#' * [`/`][/()] #' * [`<`][<()] #' * [`<=`][<=()] #' * [`==`][==()] #' * [`>`][>()] #' * [`>=`][>=()] -#' * [`|`][|()] +#' * [`ISOdate()`][base::ISOdate()] +#' * [`ISOdatetime()`][base::ISOdatetime()] +#' * [`^`][^()] #' * [`abs()`][base::abs()] #' * [`acos()`][base::acos()] #' * [`all()`][base::all()] #' * [`any()`][base::any()] -#' * [`as.character()`][base::as.character()] #' * [`as.Date()`][base::as.Date()]: Multiple `tryFormats` not supported in Arrow. #' Consider using the lubridate specialised parsing functions `ymd()`, `ymd()`, etc. +#' * [`as.character()`][base::as.character()] #' * [`as.difftime()`][base::as.difftime()]: only supports `units = "secs"` (the default) #' * [`as.double()`][base::as.double()] #' * [`as.integer()`][base::as.integer()] @@ -153,8 +154,6 @@ #' * [`is.na()`][base::is.na()] #' * [`is.nan()`][base::is.nan()] #' * [`is.numeric()`][base::is.numeric()] -#' * [`ISOdate()`][base::ISOdate()] -#' * [`ISOdatetime()`][base::ISOdatetime()] #' * [`log()`][base::log()] #' * [`log10()`][base::log10()] #' * [`log1p()`][base::log1p()] @@ -186,6 +185,7 @@ #' * [`tolower()`][base::tolower()] #' * [`toupper()`][base::toupper()] #' * [`trunc()`][base::trunc()] +#' * [`|`][|()] #' #' ## bit64 #' @@ -242,8 +242,8 @@ #' * [`format_ISO8601()`][lubridate::format_ISO8601()] #' * [`hour()`][lubridate::hour()] #' * [`is.Date()`][lubridate::is.Date()] -#' * [`is.instant()`][lubridate::is.instant()] #' * [`is.POSIXct()`][lubridate::is.POSIXct()] +#' * [`is.instant()`][lubridate::is.instant()] #' * [`is.timepoint()`][lubridate::is.timepoint()] #' * [`isoweek()`][lubridate::isoweek()] #' * [`isoyear()`][lubridate::isoyear()] diff --git a/r/man/acero.Rd b/r/man/acero.Rd index 6d4476c44c2..9d390002ab0 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -68,7 +68,7 @@ can assume that the function works in Acero just as it does in R. Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both \code{str_sub()} and \code{stringr::str_sub()} work. -In addition to these functions, you can call any of Arrow's 246 compute +In addition to these functions, you can call any of Arrow's 251 compute functions directly. Arrow has many functions that don't map to an existing R function. In other cases where there is an R function mapping, you can still call the Arrow function directly if you don't want the adaptations that the R @@ -85,30 +85,31 @@ as \code{arrow_ascii_is_decimal}. \subsection{base}{ \itemize{ -\item \code{\link[=-]{-}} \item \code{\link[=!]{!}} \item \code{\link[=!=]{!=}} -\item \code{\link[=*]{*}} -\item \code{\link[=/]{/}} -\item \code{\link[=&]{&}} -\item \code{\link[=\%/\%]{\%/\%}} \item \code{\link[=\%\%]{\%\%}} +\item \code{\link[=\%/\%]{\%/\%}} \item \code{\link[=\%in\%]{\%in\%}} -\item \code{\link[=^]{^}} +\item \code{\link[=&]{&}} +\item \code{\link[=*]{*}} \item \code{\link[=+]{+}} +\item \code{\link[=-]{-}} +\item \code{\link[=/]{/}} \item \code{\link[=<]{<}} \item \code{\link[=<=]{<=}} \item \code{\link[===]{==}} \item \code{\link[=>]{>}} \item \code{\link[=>=]{>=}} -\item \code{\link[=|]{|}} +\item \code{\link[base:ISOdatetime]{ISOdate()}} +\item \code{\link[base:ISOdatetime]{ISOdatetime()}} +\item \code{\link[=^]{^}} \item \code{\link[base:MathFun]{abs()}} \item \code{\link[base:Trig]{acos()}} \item \code{\link[base:all]{all()}} \item \code{\link[base:any]{any()}} -\item \code{\link[base:character]{as.character()}} \item \code{\link[base:as.Date]{as.Date()}}: Multiple \code{tryFormats} not supported in Arrow. Consider using the lubridate specialised parsing functions \code{ymd()}, \code{ymd()}, etc. +\item \code{\link[base:character]{as.character()}} \item \code{\link[base:difftime]{as.difftime()}}: only supports \code{units = "secs"} (the default) \item \code{\link[base:double]{as.double()}} \item \code{\link[base:integer]{as.integer()}} @@ -139,8 +140,6 @@ Consider using the lubridate specialised parsing functions \code{ymd()}, \code{y \item \code{\link[base:NA]{is.na()}} \item \code{\link[base:is.finite]{is.nan()}} \item \code{\link[base:numeric]{is.numeric()}} -\item \code{\link[base:ISOdatetime]{ISOdate()}} -\item \code{\link[base:ISOdatetime]{ISOdatetime()}} \item \code{\link[base:Log]{log()}} \item \code{\link[base:Log]{log10()}} \item \code{\link[base:Log]{log1p()}} @@ -172,6 +171,7 @@ Valid values are "s", "ms" (default), "us", "ns". \item \code{\link[base:chartr]{tolower()}} \item \code{\link[base:chartr]{toupper()}} \item \code{\link[base:Round]{trunc()}} +\item \code{\link[=|]{|}} } } @@ -234,8 +234,8 @@ Valid values are "s", "ms" (default), "us", "ns". \item \code{\link[lubridate:format_ISO8601]{format_ISO8601()}} \item \code{\link[lubridate:hour]{hour()}} \item \code{\link[lubridate:date_utils]{is.Date()}} -\item \code{\link[lubridate:is.instant]{is.instant()}} \item \code{\link[lubridate:posix_utils]{is.POSIXct()}} +\item \code{\link[lubridate:is.instant]{is.instant()}} \item \code{\link[lubridate:is.instant]{is.timepoint()}} \item \code{\link[lubridate:week]{isoweek()}} \item \code{\link[lubridate:year]{isoyear()}} diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 614c196fdee..853fa07028f 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -26,7 +26,7 @@ An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2 An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 37. -An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 17. +An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 13. An object of class \code{FileMode} (inherits from \code{arrow-enum}) of length 3. From 5b305ba2978fa9d6b6da5e6d26d08baf22f387fa Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 May 2023 11:22:11 +0200 Subject: [PATCH 2/5] Import download.file from utils to suppress note --- r/NAMESPACE | 1 + r/R/io.R | 1 + 2 files changed, 2 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index 7ab8d5c9020..eec50167bd1 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -503,6 +503,7 @@ importFrom(tidyselect,one_of) importFrom(tidyselect,starts_with) importFrom(tidyselect,vars_pull) importFrom(utils,capture.output) +importFrom(utils,download.file) importFrom(utils,getFromNamespace) importFrom(utils,head) importFrom(utils,install.packages) diff --git a/r/R/io.R b/r/R/io.R index b2989de78a1..e952d656f8c 100644 --- a/r/R/io.R +++ b/r/R/io.R @@ -232,6 +232,7 @@ mmap_open <- function(path, mode = c("read", "write", "readwrite")) { #' @param random_access Logical: whether the result must be a RandomAccessFile #' @return An `InputStream` or a subclass of one. #' @keywords internal +#' @importFrom utils download.file make_readable_file <- function(file, mmap = TRUE, random_access = TRUE) { if (inherits(file, "SubTreeFileSystem")) { filesystem <- file$base_fs From 27f86a8e2f29da37f3e8d03c43207c1312562f09 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 26 Apr 2023 10:12:54 +0100 Subject: [PATCH 3/5] GH-35131: [R] Test failure with dev waldo (#35308) This PR fixes the tests failing due to the dev version of the waldo package being more strict comparing NaN and NA_real_ values. (n.b. our CI doesn't yet use the dev version of waldo, so this PR should be tested locally to verify tests pass). * Closes: #35131 Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/tests/testthat/test-compute-sort.R | 17 ++++++--- .../testthat/test-dplyr-funcs-conditional.R | 37 ++++++++++++++----- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/r/tests/testthat/test-compute-sort.R b/r/tests/testthat/test-compute-sort.R index f521efeddc5..ba3039c3313 100644 --- a/r/tests/testthat/test-compute-sort.R +++ b/r/tests/testthat/test-compute-sort.R @@ -108,29 +108,34 @@ test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results }) test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on floats", { + + test_vec <- tbl$dbl + # Arrow sorts NA and NaN differently, but it's not important, so eliminate here + test_vec[is.nan(test_vec)] <- NA_real_ + compare_expression( sort(.input, decreasing = TRUE, na.last = TRUE), - tbl$dbl + test_vec ) compare_expression( sort(.input, decreasing = FALSE, na.last = TRUE), - tbl$dbl + test_vec ) compare_expression( sort(.input, decreasing = TRUE, na.last = NA), - tbl$dbl + test_vec ) compare_expression( sort(.input, decreasing = TRUE, na.last = FALSE), - tbl$dbl, + test_vec, ) compare_expression( sort(.input, decreasing = FALSE, na.last = NA), - tbl$dbl + test_vec ) compare_expression( sort(.input, decreasing = FALSE, na.last = FALSE), - tbl$dbl, + test_vec, ) }) diff --git a/r/tests/testthat/test-dplyr-funcs-conditional.R b/r/tests/testthat/test-dplyr-funcs-conditional.R index 85d21b73226..b3d86da8b41 100644 --- a/r/tests/testthat/test-dplyr-funcs-conditional.R +++ b/r/tests/testthat/test-dplyr-funcs-conditional.R @@ -377,8 +377,11 @@ test_that("coalesce()", { y = c(NA_real_, 2.2, 3.3), z = c(1.1, 2.2, 3.3) ) - compare_dplyr_binding( - .input %>% + + # we can't use compare_dplyr_binding here as dplyr silently converts NaN to NA in coalesce() + # see https://github.com/tidyverse/dplyr/issues/6833 + expect_identical( + arrow_table(df) %>% mutate( cw = coalesce(w), cz = coalesce(z), @@ -387,21 +390,29 @@ test_that("coalesce()", { cwxyz = coalesce(w, x, y, z) ) %>% collect(), - df + mutate( + df, + cw = c(NA, NaN, NA), + cz = c(1.1, 2.2, 3.3), + cwx = c(NA, NaN, 3.3), + cwxy = c(NA, 2.2, 3.3), + cwxyz = c(1.1, 2.2, 3.3) + ) ) + # NaNs stay NaN and are not converted to NA in the results # (testing this requires expect_identical()) expect_identical( df %>% Table$create() %>% mutate(cwx = coalesce(w, x)) %>% collect(), - df %>% mutate(cwx = coalesce(w, x)) + df %>% mutate(cwx = c(NA, NaN, 3.3)) ) expect_identical( df %>% Table$create() %>% transmute(cw = coalesce(w)) %>% collect(), - df %>% transmute(cw = coalesce(w)) + df %>% transmute(cw = w) ) expect_identical( df %>% Table$create() %>% transmute(cn = coalesce(NaN)) %>% collect(), - df %>% transmute(cn = coalesce(NaN)) + df %>% transmute(cn = NaN) ) # singles stay single expect_equal( @@ -418,8 +429,8 @@ test_that("coalesce()", { float32() ) # with R literal values - compare_dplyr_binding( - .input %>% + expect_identical( + arrow_table(df) %>% mutate( c1 = coalesce(4.4), c2 = coalesce(NA_real_), @@ -429,7 +440,15 @@ test_that("coalesce()", { c6 = coalesce(w, x, y, NaN) ) %>% collect(), - df + mutate( + df, + c1 = 4.4, + c2 = NA_real_, + c3 = NaN, + c4 = c(5.5, 2.2, 3.3), + c5 = c(NA, 2.2, 3.3), + c6 = c(NaN, 2.2, 3.3) + ) ) # no arguments From c68226a4323ecbcd1d63da6ae1ce6a1090a9adf0 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 May 2023 12:01:49 +0200 Subject: [PATCH 4/5] Remove badges from README --- r/README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/r/README.md b/r/README.md index d343d6979c0..ee4036d48f3 100644 --- a/r/README.md +++ b/r/README.md @@ -1,9 +1,5 @@ # arrow -[![cran](https://www.r-pkg.org/badges/version-last-release/arrow)](https://cran.r-project.org/package=arrow) -[![CI](https://github.com/apache/arrow/workflows/R/badge.svg?event=push)](https://github.com/apache/arrow/actions?query=workflow%3AR+branch%3Amain+event%3Apush) -[![conda-forge](https://img.shields.io/conda/vn/conda-forge/r-arrow.svg)](https://anaconda.org/conda-forge/r-arrow) - [Apache Arrow](https://arrow.apache.org/) is a cross-language development platform for in-memory and larger-than-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical From 70dc03ea827017594b7def6f3e937c1a4cdc2c2f Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 3 May 2023 10:08:05 -0400 Subject: [PATCH 5/5] ARROW_ACERO should be ON by default --- r/inst/build_arrow_static.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index e5a9f127edb..1baf011a412 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -55,7 +55,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_BUILD_TESTS=OFF \ -DARROW_BUILD_SHARED=OFF \ -DARROW_BUILD_STATIC=ON \ - -DARROW_ACERO=${ARROW_ACERO:-$ARROW_DEFAULT_PARAM} \ + -DARROW_ACERO=${ARROW_ACERO:-ON} \ -DARROW_COMPUTE=ON \ -DARROW_CSV=ON \ -DARROW_DATASET=${ARROW_DATASET:-ON} \