From 8ebc317795929dfd0d348ae620c2e3cdaa38a651 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 31 Aug 2022 12:18:22 -0400 Subject: [PATCH 1/7] First pass at generating function docs --- r/DESCRIPTION | 1 + r/Makefile | 1 + r/R/dplyr-funcs-datetime.R | 53 ++++--- r/R/dplyr-funcs-doc.R | 244 +++++++++++++++++++++++++++++++++ r/R/dplyr-funcs-string.R | 66 +++++---- r/R/dplyr-funcs.R | 17 ++- r/data-raw/docgen.R | 91 ++++++++++++ r/man/arrow-dplyr-functions.Rd | 228 ++++++++++++++++++++++++++++++ r/man/register_binding.Rd | 11 +- 9 files changed, 652 insertions(+), 60 deletions(-) create mode 100644 r/R/dplyr-funcs-doc.R create mode 100644 r/data-raw/docgen.R create mode 100644 r/man/arrow-dplyr-functions.Rd diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 7ae6a8de29f..7b60f0c510a 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -103,6 +103,7 @@ Collate: 'dplyr-funcs-augmented.R' 'dplyr-funcs-conditional.R' 'dplyr-funcs-datetime.R' + 'dplyr-funcs-doc.R' 'dplyr-funcs-math.R' 'dplyr-funcs-string.R' 'dplyr-funcs-type.R' diff --git a/r/Makefile b/r/Makefile index 1ddbe595dd2..cb76b4c9775 100644 --- a/r/Makefile +++ b/r/Makefile @@ -26,6 +26,7 @@ style-all: R -s -e 'styler::style_file(setdiff(dir(pattern = "R$$", recursive = TRUE), source(".styler_excludes.R")$$value))' doc: style + R -s -f data-raw/docgen.R R -s -e 'roxygen2::roxygenize()' -git add --all man/*.Rd diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9a010452b84..6106adbc5e4 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -649,55 +649,54 @@ register_bindings_datetime_parsers <- function() { build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) }) - } register_bindings_datetime_rounding <- function() { register_binding( - "round_date", + "lubridate::round_date", function(x, unit = "second", week_start = getOption("lubridate.week.start", 7)) { + opts <- parse_period_unit(unit) + if (opts$unit == 7L) { # weeks (unit = 7L) need 
to accommodate week_start + return(shift_temporal_to_week("round_temporal", x, week_start, options = opts)) + } - opts <- parse_period_unit(unit) - if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start - return(shift_temporal_to_week("round_temporal", x, week_start, options = opts)) + Expression$create("round_temporal", x, options = opts) } - - Expression$create("round_temporal", x, options = opts) - }) + ) register_binding( - "floor_date", + "lubridate::floor_date", function(x, unit = "second", week_start = getOption("lubridate.week.start", 7)) { + opts <- parse_period_unit(unit) + if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start + return(shift_temporal_to_week("floor_temporal", x, week_start, options = opts)) + } - opts <- parse_period_unit(unit) - if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start - return(shift_temporal_to_week("floor_temporal", x, week_start, options = opts)) + Expression$create("floor_temporal", x, options = opts) } - - Expression$create("floor_temporal", x, options = opts) - }) + ) register_binding( - "ceiling_date", + "lubridate::ceiling_date", function(x, unit = "second", change_on_boundary = NULL, week_start = getOption("lubridate.week.start", 7)) { - opts <- parse_period_unit(unit) - if (is.null(change_on_boundary)) { - change_on_boundary <- ifelse(call_binding("is.Date", x), TRUE, FALSE) - } - opts$ceil_is_strictly_greater <- change_on_boundary - - if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start - return(shift_temporal_to_week("ceil_temporal", x, week_start, options = opts)) - } + opts <- parse_period_unit(unit) + if (is.null(change_on_boundary)) { + change_on_boundary <- ifelse(call_binding("is.Date", x), TRUE, FALSE) + } + opts$ceil_is_strictly_greater <- change_on_boundary - Expression$create("ceil_temporal", x, options = opts) - }) + if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start + 
return(shift_temporal_to_week("ceil_temporal", x, week_start, options = opts)) + } + Expression$create("ceil_temporal", x, options = opts) + } + ) } diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R new file mode 100644 index 00000000000..5104735cf1b --- /dev/null +++ b/r/R/dplyr-funcs-doc.R @@ -0,0 +1,244 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Generated by using data-raw/docgen.R -> do not edit by hand + +#' Functions available in Arrow dplyr queries +#' +#' The `arrow` package contains mappings of 196 R functions to the corresponding +#' functions in the Arrow compute library. This allows you to write code inside +#' of `dplyr` methods that call R functions, including many in packages like +#' `stringr` and `lubridate`, and they will get translated to Arrow and run +#' on the Arrow query engine (Acero). This document lists all of the mapped +#' functions. +#' +#' In the list below, any differences in behavior or support between Acero and +#' the R function are listed. If no notes follow the function name, then you +#' can assume that the function works in Acero just as it does in R. +#' +#' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. 
both +#' `str_sub()` and `stringr::str_sub()` work. +#' +#' In addition to these functions, you can call any of Arrow's 243 compute +#' functions directly. Arrow has many functions that don't map to an existing R +#' function. In other cases where there is an R function mapping, you can still +#' call the Arrow function directly if you don't want the adaptations that the R +#' mapping has that make Acero behave like R. These functions are listed in the +#' [C++ documentation](https://arrow.apache.org/docs/cpp/compute.html), and +#' in the function registry in R, they are named with an `arrow_` prefix, such +#' as `arrow_ascii_is_decimal`. +#' +#' * [-()] +#' * [!()] +#' * [!=()] +#' * [*()] +#' * [/()] +#' * [&()] +#' * [%/%()] +#' * [%%()] +#' * [%in%()] +#' * [^()] +#' * [+()] +#' * [<()] +#' * [<=()] +#' * [==()] +#' * [>()] +#' * [>=()] +#' * [|()] +#' * [add_filename()] +#' * [base::abs()] +#' * [base::acos()] +#' * [base::all()] +#' * [base::any()] +#' * [base::as.character()] +#' * [base::as.Date()] +#' * [base::as.difftime()] +#' * [base::as.double()] +#' * [base::as.factor()] +#' * [base::as.integer()] +#' * [base::as.logical()] +#' * [base::as.numeric()] +#' * [base::asin()] +#' * [base::ceiling()] +#' * [base::cos()] +#' * [base::data.frame()] +#' * [base::difftime()] +#' * [base::endsWith()] +#' * [base::exp()] +#' * [base::floor()] +#' * [base::format()] +#' * [base::grepl()] +#' * [base::gsub()] +#' * [base::ifelse()] +#' * [base::is.character()] +#' * [base::is.double()] +#' * [base::is.factor()] +#' * [base::is.finite()] +#' * [base::is.infinite()] +#' * [base::is.integer()] +#' * [base::is.list()] +#' * [base::is.logical()] +#' * [base::is.na()] +#' * [base::is.nan()] +#' * [base::is.numeric()] +#' * [base::ISOdate()] +#' * [base::ISOdatetime()] +#' * [base::log()] +#' * [base::log10()] +#' * [base::log1p()] +#' * [base::log2()] +#' * [base::logb()] +#' * [base::max()] +#' * [base::mean()] +#' * [base::min()] +#' * [base::nchar()] +#' * 
[base::paste()]: the `collapse` argument is not yet supported +#' * [base::paste0()]: the `collapse` argument is not yet supported +#' * [base::pmax()] +#' * [base::pmin()] +#' * [base::round()] +#' * [base::sign()] +#' * [base::sin()] +#' * [base::sqrt()] +#' * [base::startsWith()] +#' * [base::strftime()] +#' * [base::strptime()] +#' * [base::strrep()] +#' * [base::strsplit()] +#' * [base::sub()] +#' * [base::substr()] +#' * [base::substring()] +#' * [base::sum()] +#' * [base::tan()] +#' * [base::tolower()] +#' * [base::toupper()] +#' * [base::trunc()] +#' * [bit64::as.integer64()] +#' * [bit64::is.integer64()] +#' * [cast()] +#' * [dictionary_encode()] +#' * [dplyr::between()] +#' * [dplyr::case_when()] +#' * [dplyr::coalesce()] +#' * [dplyr::if_else()] +#' * [dplyr::n()] +#' * [dplyr::n_distinct()] +#' * [lubridate::am()] +#' * [lubridate::as_date()] +#' * [lubridate::as_datetime()] +#' * [lubridate::ceiling_date()] +#' * [lubridate::date()] +#' * [lubridate::date_decimal()] +#' * [lubridate::day()] +#' * [lubridate::ddays()] +#' * [lubridate::decimal_date()] +#' * [lubridate::dhours()] +#' * [lubridate::dmicroseconds()] +#' * [lubridate::dmilliseconds()] +#' * [lubridate::dminutes()] +#' * [lubridate::dmonths()] +#' * [lubridate::dmy()] +#' * [lubridate::dmy_h()] +#' * [lubridate::dmy_hm()] +#' * [lubridate::dmy_hms()] +#' * [lubridate::dnanoseconds()] +#' * [lubridate::dpicoseconds()] +#' * [lubridate::dseconds()] +#' * [lubridate::dst()] +#' * [lubridate::dweeks()] +#' * [lubridate::dyears()] +#' * [lubridate::dym()] +#' * [lubridate::epiweek()] +#' * [lubridate::epiyear()] +#' * [lubridate::fast_strptime()] +#' * [lubridate::floor_date()] +#' * [lubridate::format_ISO8601()] +#' * [lubridate::hour()] +#' * [lubridate::is.Date()] +#' * [lubridate::is.instant()] +#' * [lubridate::is.POSIXct()] +#' * [lubridate::is.timepoint()] +#' * [lubridate::isoweek()] +#' * [lubridate::isoyear()] +#' * [lubridate::leap_year()] +#' * [lubridate::make_date()] +#' * 
[lubridate::make_datetime()] +#' * [lubridate::make_difftime()] +#' * [lubridate::mday()] +#' * [lubridate::mdy()] +#' * [lubridate::mdy_h()] +#' * [lubridate::mdy_hm()] +#' * [lubridate::mdy_hms()] +#' * [lubridate::minute()] +#' * [lubridate::month()] +#' * [lubridate::my()] +#' * [lubridate::myd()] +#' * [lubridate::parse_date_time()] +#' * [lubridate::pm()] +#' * [lubridate::qday()] +#' * [lubridate::quarter()] +#' * [lubridate::round_date()] +#' * [lubridate::second()] +#' * [lubridate::semester()] +#' * [lubridate::tz()] +#' * [lubridate::wday()] +#' * [lubridate::week()] +#' * [lubridate::yday()] +#' * [lubridate::ydm()] +#' * [lubridate::ydm_h()] +#' * [lubridate::ydm_hm()] +#' * [lubridate::ydm_hms()] +#' * [lubridate::year()] +#' * [lubridate::ym()] +#' * [lubridate::ymd()] +#' * [lubridate::ymd_h()] +#' * [lubridate::ymd_hm()] +#' * [lubridate::ymd_hms()] +#' * [lubridate::yq()] +#' * [methods::is()] +#' * [rlang::is_character()] +#' * [rlang::is_double()] +#' * [rlang::is_integer()] +#' * [rlang::is_list()] +#' * [rlang::is_logical()] +#' * [stats::median()] +#' * [stats::quantile()] +#' * [stats::sd()] +#' * [stats::var()] +#' * [stringi::stri_reverse()] +#' * [stringr::str_c()]: the `collapse` argument is not yet supported +#' * [stringr::str_count()] +#' * [stringr::str_detect()] +#' * [stringr::str_dup()] +#' * [stringr::str_ends()] +#' * [stringr::str_length()] +#' * [stringr::str_like()] +#' * [stringr::str_pad()] +#' * [stringr::str_replace()] +#' * [stringr::str_replace_all()] +#' * [stringr::str_split()] +#' * [stringr::str_starts()] +#' * [stringr::str_sub()] +#' * [stringr::str_to_lower()] +#' * [stringr::str_to_title()] +#' * [stringr::str_to_upper()] +#' * [stringr::str_trim()] +#' * [tibble::tibble()] +#' +#' @name arrow-dplyr-functions +NULL + diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index b300d7c439e..71dcce94cc7 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -161,32 +161,44 @@ 
register_bindings_string_join <- function() { } } - register_binding("base::paste", function(..., sep = " ", collapse = NULL, recycle0 = FALSE) { - assert_that( - is.null(collapse), - msg = "paste() with the collapse argument is not yet supported in Arrow" - ) - if (!inherits(sep, "Expression")) { - assert_that(!is.na(sep), msg = "Invalid separator") - } - arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep) - }) - - register_binding("base::paste0", function(..., collapse = NULL, recycle0 = FALSE) { - assert_that( - is.null(collapse), - msg = "paste0() with the collapse argument is not yet supported in Arrow" - ) - arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "") - }) - - register_binding("stringr::str_c", function(..., sep = "", collapse = NULL) { - assert_that( - is.null(collapse), - msg = "str_c() with the collapse argument is not yet supported in Arrow" - ) - arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep) - }) + register_binding( + "base::paste", + function(..., sep = " ", collapse = NULL, recycle0 = FALSE) { + assert_that( + is.null(collapse), + msg = "paste() with the collapse argument is not yet supported in Arrow" + ) + if (!inherits(sep, "Expression")) { + assert_that(!is.na(sep), msg = "Invalid separator") + } + arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep) + }, + notes = "the `collapse` argument is not yet supported" + ) + + register_binding( + "base::paste0", + function(..., collapse = NULL, recycle0 = FALSE) { + assert_that( + is.null(collapse), + msg = "paste0() with the collapse argument is not yet supported in Arrow" + ) + arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "") + }, + notes = "the `collapse` argument is not yet supported" + ) + + register_binding( + "stringr::str_c", + function(..., sep = "", collapse = NULL) { + assert_that( + is.null(collapse), + msg = "str_c() with the collapse argument is not yet supported in Arrow" + ) + 
arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep) + }, + notes = "the `collapse` argument is not yet supported" + ) } register_bindings_string_regex <- function() { @@ -337,7 +349,7 @@ register_bindings_string_regex <- function() { register_binding("stringr::str_replace_all", arrow_stringr_string_replace_function(-1L)) register_binding("base::strsplit", function(x, split, fixed = FALSE, perl = FALSE, - useBytes = FALSE) { + useBytes = FALSE) { assert_that(is.string(split)) arrow_fun <- ifelse(fixed, "split_pattern", "split_pattern_regex") diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 4dadff54b48..a66db112d98 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -59,13 +59,17 @@ NULL #' summarise) because the data mask has to be a list. #' @param registry An environment in which the functions should be #' assigned. -#' +#' @param notes string for the docs: note any limitations or differences in +#' behavior between the Arrow version and the R function. #' @return The previously registered binding or `NULL` if no previously #' registered function existed. #' @keywords internal #' -register_binding <- function(fun_name, fun, registry = nse_funcs, - update_cache = FALSE) { +register_binding <- function(fun_name, + fun, + registry = nse_funcs, + update_cache = FALSE, + notes = character(0)) { unqualified_name <- sub("^.*?:{+}", "", fun_name) previous_fun <- registry[[unqualified_name]] @@ -76,7 +80,8 @@ register_binding <- function(fun_name, fun, registry = nse_funcs, paste0( "A \"", unqualified_name, - "\" binding already exists in the registry and will be overwritten.") + "\" binding already exists in the registry and will be overwritten." 
+ ) ) } @@ -85,6 +90,8 @@ register_binding <- function(fun_name, fun, registry = nse_funcs, registry[[unqualified_name]] <- fun registry[[fun_name]] <- fun + .cache$docs[[fun_name]] <- notes + if (update_cache) { fun_cache <- .cache$functions fun_cache[[unqualified_name]] <- fun @@ -131,7 +138,7 @@ call_binding_agg <- function(fun_name, ...) { # Called in .onLoad() create_binding_cache <- function() { - arrow_funcs <- list() + .cache$docs <- list() # Register all available Arrow Compute functions, namespaced as arrow_fun. all_arrow_funs <- list_compute_functions() diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R new file mode 100644 index 00000000000..a41aeb246c8 --- /dev/null +++ b/r/data-raw/docgen.R @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This code generates dplyr-funcs-doc.R. +# It requires that the package be installed. + +file_template <- "# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# \"License\"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Generated by using data-raw/docgen.R -> do not edit by hand + +#' Functions available in Arrow dplyr queries +#' +#' The `arrow` package contains mappings of %s R functions to the corresponding +#' functions in the Arrow compute library. This allows you to write code inside +#' of `dplyr` methods that call R functions, including many in packages like +#' `stringr` and `lubridate`, and they will get translated to Arrow and run +#' on the Arrow query engine (Acero). This document lists all of the mapped +#' functions. +#' +#' In the list below, any differences in behavior or support between Acero and +#' the R function are listed. If no notes follow the function name, then you +#' can assume that the function works in Acero just as it does in R. +#' +#' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both +#' `str_sub()` and `stringr::str_sub()` work. +#' +#' In addition to these functions, you can call any of Arrow's %s compute +#' functions directly. Arrow has many functions that don't map to an existing R +#' function. In other cases where there is an R function mapping, you can still +#' call the Arrow function directly if you don't want the adaptations that the R +#' mapping has that make Acero behave like R. 
These functions are listed in the +#' [C++ documentation](https://arrow.apache.org/docs/cpp/compute.html), and +#' in the function registry in R, they are named with an `arrow_` prefix, such +#' as `arrow_ascii_is_decimal`. +#' +%s +#' +#' @name arrow-dplyr-functions +NULL +" + +docs <- arrow:::.cache$docs +docs <- docs[order(names(docs))] +# TODO: group by package name, create subheadings + +doclets <- purrr::imap_chr(docs, function(x, n) { + out <- paste0("#' * [", n, "()]") + if (length(x)) { + out <- paste0(out, ": ", paste(x, collapse = " ")) + } + out +}) + +writeLines( + sprintf( + file_template, + length(docs), + length(arrow::list_compute_functions()), + paste(doclets, collapse = "\n") + ), + "R/dplyr-funcs-doc.R" +) diff --git a/r/man/arrow-dplyr-functions.Rd b/r/man/arrow-dplyr-functions.Rd new file mode 100644 index 00000000000..8cf74c5c78e --- /dev/null +++ b/r/man/arrow-dplyr-functions.Rd @@ -0,0 +1,228 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-funcs-doc.R +\name{arrow-dplyr-functions} +\alias{arrow-dplyr-functions} +\title{Functions available in Arrow dplyr queries} +\description{ +The \code{arrow} package contains mappings of 196 R functions to the corresponding +functions in the Arrow compute library. This allows you to write code inside +of \code{dplyr} methods that call R functions, including many in packages like +\code{stringr} and \code{lubridate}, and they will get translated to Arrow and run +on the Arrow query engine (Acero). This document lists all of the mapped +functions. +} +\details{ +In the list below, any differences in behavior or support between Acero and +the R function are listed. If no notes follow the function name, then you +can assume that the function works in Acero just as it does in R. + +Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both +\code{str_sub()} and \code{stringr::str_sub()} work. 
+ +In addition to these functions, you can call any of Arrow's 243 compute +functions directly. Arrow has many functions that don't map to an existing R +function. In other cases where there is an R function mapping, you can still +call the Arrow function directly if you don't want the adaptations that the R +mapping has that make Acero behave like R. These functions are listed in the +\href{https://arrow.apache.org/docs/cpp/compute.html}{C++ documentation}, and +in the function registry in R, they are named with an \code{arrow_} prefix, such +as \code{arrow_ascii_is_decimal}. +\itemize{ +\item \code{\link[=-]{-()}} +\item \code{\link[=!]{!()}} +\item \code{\link[=!=]{!=()}} +\item \code{\link[=*]{*()}} +\item \code{\link[=/]{/()}} +\item \code{\link[=&]{&()}} +\item \code{\link[=\%/\%]{\%/\%()}} +\item \code{\link[=\%\%]{\%\%()}} +\item \code{\link[=\%in\%]{\%in\%()}} +\item \code{\link[=^]{^()}} +\item \code{\link[=+]{+()}} +\item \code{\link[=<]{<()}} +\item \code{\link[=<=]{<=()}} +\item \code{\link[===]{==()}} +\item \code{\link[=>]{>()}} +\item \code{\link[=>=]{>=()}} +\item \code{\link[=|]{|()}} +\item \code{\link[=add_filename]{add_filename()}} +\item \code{\link[base:MathFun]{base::abs()}} +\item \code{\link[base:Trig]{base::acos()}} +\item \code{\link[base:all]{base::all()}} +\item \code{\link[base:any]{base::any()}} +\item \code{\link[base:character]{base::as.character()}} +\item \code{\link[base:as.Date]{base::as.Date()}} +\item \code{\link[base:difftime]{base::as.difftime()}} +\item \code{\link[base:double]{base::as.double()}} +\item \code{\link[base:factor]{base::as.factor()}} +\item \code{\link[base:integer]{base::as.integer()}} +\item \code{\link[base:logical]{base::as.logical()}} +\item \code{\link[base:numeric]{base::as.numeric()}} +\item \code{\link[base:Trig]{base::asin()}} +\item \code{\link[base:Round]{base::ceiling()}} +\item \code{\link[base:Trig]{base::cos()}} +\item \code{\link[base:data.frame]{base::data.frame()}} +\item 
\code{\link[base:difftime]{base::difftime()}} +\item \code{\link[base:startsWith]{base::endsWith()}} +\item \code{\link[base:Log]{base::exp()}} +\item \code{\link[base:Round]{base::floor()}} +\item \code{\link[base:format]{base::format()}} +\item \code{\link[base:grep]{base::grepl()}} +\item \code{\link[base:grep]{base::gsub()}} +\item \code{\link[base:ifelse]{base::ifelse()}} +\item \code{\link[base:character]{base::is.character()}} +\item \code{\link[base:double]{base::is.double()}} +\item \code{\link[base:factor]{base::is.factor()}} +\item \code{\link[base:is.finite]{base::is.finite()}} +\item \code{\link[base:is.finite]{base::is.infinite()}} +\item \code{\link[base:integer]{base::is.integer()}} +\item \code{\link[base:list]{base::is.list()}} +\item \code{\link[base:logical]{base::is.logical()}} +\item \code{\link[base:NA]{base::is.na()}} +\item \code{\link[base:is.finite]{base::is.nan()}} +\item \code{\link[base:numeric]{base::is.numeric()}} +\item \code{\link[base:ISOdatetime]{base::ISOdate()}} +\item \code{\link[base:ISOdatetime]{base::ISOdatetime()}} +\item \code{\link[base:Log]{base::log()}} +\item \code{\link[base:Log]{base::log10()}} +\item \code{\link[base:Log]{base::log1p()}} +\item \code{\link[base:Log]{base::log2()}} +\item \code{\link[base:Log]{base::logb()}} +\item \code{\link[base:Extremes]{base::max()}} +\item \code{\link[base:mean]{base::mean()}} +\item \code{\link[base:Extremes]{base::min()}} +\item \code{\link[base:nchar]{base::nchar()}} +\item \code{\link[base:paste]{base::paste()}}: the \code{collapse} argument is not yet supported +\item \code{\link[base:paste]{base::paste0()}}: the \code{collapse} argument is not yet supported +\item \code{\link[base:Extremes]{base::pmax()}} +\item \code{\link[base:Extremes]{base::pmin()}} +\item \code{\link[base:Round]{base::round()}} +\item \code{\link[base:sign]{base::sign()}} +\item \code{\link[base:Trig]{base::sin()}} +\item \code{\link[base:MathFun]{base::sqrt()}} +\item 
\code{\link[base:startsWith]{base::startsWith()}} +\item \code{\link[base:strptime]{base::strftime()}} +\item \code{\link[base:strptime]{base::strptime()}} +\item \code{\link[base:strrep]{base::strrep()}} +\item \code{\link[base:strsplit]{base::strsplit()}} +\item \code{\link[base:grep]{base::sub()}} +\item \code{\link[base:substr]{base::substr()}} +\item \code{\link[base:substr]{base::substring()}} +\item \code{\link[base:sum]{base::sum()}} +\item \code{\link[base:Trig]{base::tan()}} +\item \code{\link[base:chartr]{base::tolower()}} +\item \code{\link[base:chartr]{base::toupper()}} +\item \code{\link[base:Round]{base::trunc()}} +\item \code{\link[bit64:as.integer64.character]{bit64::as.integer64()}} +\item \code{\link[bit64:bit64-package]{bit64::is.integer64()}} +\item \code{\link[=cast]{cast()}} +\item \code{\link[=dictionary_encode]{dictionary_encode()}} +\item \code{\link[dplyr:between]{dplyr::between()}} +\item \code{\link[dplyr:case_when]{dplyr::case_when()}} +\item \code{\link[dplyr:coalesce]{dplyr::coalesce()}} +\item \code{\link[dplyr:if_else]{dplyr::if_else()}} +\item \code{\link[dplyr:context]{dplyr::n()}} +\item \code{\link[dplyr:n_distinct]{dplyr::n_distinct()}} +\item \code{\link[lubridate:am]{lubridate::am()}} +\item \code{\link[lubridate:as_date]{lubridate::as_date()}} +\item \code{\link[lubridate:as_date]{lubridate::as_datetime()}} +\item \code{\link[lubridate:round_date]{lubridate::ceiling_date()}} +\item \code{\link[lubridate:date]{lubridate::date()}} +\item \code{\link[lubridate:date_decimal]{lubridate::date_decimal()}} +\item \code{\link[lubridate:day]{lubridate::day()}} +\item \code{\link[lubridate:duration]{lubridate::ddays()}} +\item \code{\link[lubridate:decimal_date]{lubridate::decimal_date()}} +\item \code{\link[lubridate:duration]{lubridate::dhours()}} +\item \code{\link[lubridate:duration]{lubridate::dmicroseconds()}} +\item \code{\link[lubridate:duration]{lubridate::dmilliseconds()}} +\item 
\code{\link[lubridate:duration]{lubridate::dminutes()}} +\item \code{\link[lubridate:duration]{lubridate::dmonths()}} +\item \code{\link[lubridate:ymd]{lubridate::dmy()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::dmy_h()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::dmy_hm()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::dmy_hms()}} +\item \code{\link[lubridate:duration]{lubridate::dnanoseconds()}} +\item \code{\link[lubridate:duration]{lubridate::dpicoseconds()}} +\item \code{\link[lubridate:duration]{lubridate::dseconds()}} +\item \code{\link[lubridate:dst]{lubridate::dst()}} +\item \code{\link[lubridate:duration]{lubridate::dweeks()}} +\item \code{\link[lubridate:duration]{lubridate::dyears()}} +\item \code{\link[lubridate:ymd]{lubridate::dym()}} +\item \code{\link[lubridate:week]{lubridate::epiweek()}} +\item \code{\link[lubridate:year]{lubridate::epiyear()}} +\item \code{\link[lubridate:parse_date_time]{lubridate::fast_strptime()}} +\item \code{\link[lubridate:round_date]{lubridate::floor_date()}} +\item \code{\link[lubridate:format_ISO8601]{lubridate::format_ISO8601()}} +\item \code{\link[lubridate:hour]{lubridate::hour()}} +\item \code{\link[lubridate:date_utils]{lubridate::is.Date()}} +\item \code{\link[lubridate:is.instant]{lubridate::is.instant()}} +\item \code{\link[lubridate:posix_utils]{lubridate::is.POSIXct()}} +\item \code{\link[lubridate:is.instant]{lubridate::is.timepoint()}} +\item \code{\link[lubridate:week]{lubridate::isoweek()}} +\item \code{\link[lubridate:year]{lubridate::isoyear()}} +\item \code{\link[lubridate:leap_year]{lubridate::leap_year()}} +\item \code{\link[lubridate:make_datetime]{lubridate::make_date()}} +\item \code{\link[lubridate:make_datetime]{lubridate::make_datetime()}} +\item \code{\link[lubridate:make_difftime]{lubridate::make_difftime()}} +\item \code{\link[lubridate:day]{lubridate::mday()}} +\item \code{\link[lubridate:ymd]{lubridate::mdy()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::mdy_h()}} 
+\item \code{\link[lubridate:ymd_hms]{lubridate::mdy_hm()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::mdy_hms()}} +\item \code{\link[lubridate:minute]{lubridate::minute()}} +\item \code{\link[lubridate:month]{lubridate::month()}} +\item \code{\link[lubridate:ymd]{lubridate::my()}} +\item \code{\link[lubridate:ymd]{lubridate::myd()}} +\item \code{\link[lubridate:parse_date_time]{lubridate::parse_date_time()}} +\item \code{\link[lubridate:am]{lubridate::pm()}} +\item \code{\link[lubridate:day]{lubridate::qday()}} +\item \code{\link[lubridate:quarter]{lubridate::quarter()}} +\item \code{\link[lubridate:round_date]{lubridate::round_date()}} +\item \code{\link[lubridate:second]{lubridate::second()}} +\item \code{\link[lubridate:quarter]{lubridate::semester()}} +\item \code{\link[lubridate:tz]{lubridate::tz()}} +\item \code{\link[lubridate:day]{lubridate::wday()}} +\item \code{\link[lubridate:week]{lubridate::week()}} +\item \code{\link[lubridate:day]{lubridate::yday()}} +\item \code{\link[lubridate:ymd]{lubridate::ydm()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::ydm_h()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::ydm_hm()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::ydm_hms()}} +\item \code{\link[lubridate:year]{lubridate::year()}} +\item \code{\link[lubridate:ymd]{lubridate::ym()}} +\item \code{\link[lubridate:ymd]{lubridate::ymd()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::ymd_h()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::ymd_hm()}} +\item \code{\link[lubridate:ymd_hms]{lubridate::ymd_hms()}} +\item \code{\link[lubridate:ymd]{lubridate::yq()}} +\item \code{\link[methods:is]{methods::is()}} +\item \code{\link[rlang:type-predicates]{rlang::is_character()}} +\item \code{\link[rlang:type-predicates]{rlang::is_double()}} +\item \code{\link[rlang:type-predicates]{rlang::is_integer()}} +\item \code{\link[rlang:type-predicates]{rlang::is_list()}} +\item \code{\link[rlang:type-predicates]{rlang::is_logical()}} +\item 
\code{\link[stats:median]{stats::median()}} +\item \code{\link[stats:quantile]{stats::quantile()}} +\item \code{\link[stats:sd]{stats::sd()}} +\item \code{\link[stats:cor]{stats::var()}} +\item \code{\link[stringi:stri_reverse]{stringi::stri_reverse()}} +\item \code{\link[stringr:str_c]{stringr::str_c()}}: the \code{collapse} argument is not yet supported +\item \code{\link[stringr:str_count]{stringr::str_count()}} +\item \code{\link[stringr:str_detect]{stringr::str_detect()}} +\item \code{\link[stringr:str_dup]{stringr::str_dup()}} +\item \code{\link[stringr:str_starts]{stringr::str_ends()}} +\item \code{\link[stringr:str_length]{stringr::str_length()}} +\item \code{\link[stringr:str_like]{stringr::str_like()}} +\item \code{\link[stringr:str_pad]{stringr::str_pad()}} +\item \code{\link[stringr:str_replace]{stringr::str_replace()}} +\item \code{\link[stringr:str_replace]{stringr::str_replace_all()}} +\item \code{\link[stringr:str_split]{stringr::str_split()}} +\item \code{\link[stringr:str_starts]{stringr::str_starts()}} +\item \code{\link[stringr:str_sub]{stringr::str_sub()}} +\item \code{\link[stringr:case]{stringr::str_to_lower()}} +\item \code{\link[stringr:case]{stringr::str_to_title()}} +\item \code{\link[stringr:case]{stringr::str_to_upper()}} +\item \code{\link[stringr:str_trim]{stringr::str_trim()}} +\item \code{\link[tibble:tibble]{tibble::tibble()}} +} +} diff --git a/r/man/register_binding.Rd b/r/man/register_binding.Rd index c53df707516..d2a4a380543 100644 --- a/r/man/register_binding.Rd +++ b/r/man/register_binding.Rd @@ -4,7 +4,13 @@ \alias{register_binding} \title{Register compute bindings} \usage{ -register_binding(fun_name, fun, registry = nse_funcs, update_cache = FALSE) +register_binding( + fun_name, + fun, + registry = nse_funcs, + update_cache = FALSE, + notes = character(0) +) } \arguments{ \item{fun_name}{A string containing a function name in the form \code{"function"} or @@ -26,6 +32,9 @@ non-aggregate functions could be revisited...it is 
currently used as the data mask in mutate, filter, and aggregate (but not summarise) because the data mask has to be a list.} +\item{notes}{string for the docs: note any limitations or differences in +behavior between the Arrow version and the R function.} + \item{agg_fun}{An aggregate function or \code{NULL} to un-register a previous aggregate function. This function must accept \code{Expression} objects as arguments and return a \code{list()} with components: From 8d036c7d961c233154c79cad3b2775a3f2ccf252 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 1 Sep 2022 08:52:18 -0400 Subject: [PATCH 2/7] Group by package name and improve links --- r/R/dplyr-funcs-augmented.R | 2 +- r/R/dplyr-funcs-doc.R | 426 ++++++++++++++++--------------- r/R/dplyr-funcs-type.R | 13 +- r/data-raw/docgen.R | 54 +++- r/man/arrow-dplyr-functions.Rd | 445 ++++++++++++++++++--------------- 5 files changed, 521 insertions(+), 419 deletions(-) diff --git a/r/R/dplyr-funcs-augmented.R b/r/R/dplyr-funcs-augmented.R index 6e751d49f61..828e3df12be 100644 --- a/r/R/dplyr-funcs-augmented.R +++ b/r/R/dplyr-funcs-augmented.R @@ -16,7 +16,7 @@ # under the License. register_bindings_augmented <- function() { - register_binding("add_filename", function() { + register_binding("arrow::add_filename", function() { Expression$field_ref("__filename") }) } diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index 5104735cf1b..078cf4a9fa4 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -19,7 +19,7 @@ #' Functions available in Arrow dplyr queries #' -#' The `arrow` package contains mappings of 196 R functions to the corresponding +#' The `arrow` package contains mappings of 195 R functions to the corresponding #' functions in the Arrow compute library. 
This allows you to write code inside #' of `dplyr` methods that call R functions, including many in packages like #' `stringr` and `lubridate`, and they will get translated to Arrow and run @@ -42,203 +42,233 @@ #' in the function registry in R, they are named with an `arrow_` prefix, such #' as `arrow_ascii_is_decimal`. #' -#' * [-()] -#' * [!()] -#' * [!=()] -#' * [*()] -#' * [/()] -#' * [&()] -#' * [%/%()] -#' * [%%()] -#' * [%in%()] -#' * [^()] -#' * [+()] -#' * [<()] -#' * [<=()] -#' * [==()] -#' * [>()] -#' * [>=()] -#' * [|()] -#' * [add_filename()] -#' * [base::abs()] -#' * [base::acos()] -#' * [base::all()] -#' * [base::any()] -#' * [base::as.character()] -#' * [base::as.Date()] -#' * [base::as.difftime()] -#' * [base::as.double()] -#' * [base::as.factor()] -#' * [base::as.integer()] -#' * [base::as.logical()] -#' * [base::as.numeric()] -#' * [base::asin()] -#' * [base::ceiling()] -#' * [base::cos()] -#' * [base::data.frame()] -#' * [base::difftime()] -#' * [base::endsWith()] -#' * [base::exp()] -#' * [base::floor()] -#' * [base::format()] -#' * [base::grepl()] -#' * [base::gsub()] -#' * [base::ifelse()] -#' * [base::is.character()] -#' * [base::is.double()] -#' * [base::is.factor()] -#' * [base::is.finite()] -#' * [base::is.infinite()] -#' * [base::is.integer()] -#' * [base::is.list()] -#' * [base::is.logical()] -#' * [base::is.na()] -#' * [base::is.nan()] -#' * [base::is.numeric()] -#' * [base::ISOdate()] -#' * [base::ISOdatetime()] -#' * [base::log()] -#' * [base::log10()] -#' * [base::log1p()] -#' * [base::log2()] -#' * [base::logb()] -#' * [base::max()] -#' * [base::mean()] -#' * [base::min()] -#' * [base::nchar()] -#' * [base::paste()]: the `collapse` argument is not yet supported -#' * [base::paste0()]: the `collapse` argument is not yet supported -#' * [base::pmax()] -#' * [base::pmin()] -#' * [base::round()] -#' * [base::sign()] -#' * [base::sin()] -#' * [base::sqrt()] -#' * [base::startsWith()] -#' * [base::strftime()] -#' * [base::strptime()] 
-#' * [base::strrep()] -#' * [base::strsplit()] -#' * [base::sub()] -#' * [base::substr()] -#' * [base::substring()] -#' * [base::sum()] -#' * [base::tan()] -#' * [base::tolower()] -#' * [base::toupper()] -#' * [base::trunc()] -#' * [bit64::as.integer64()] -#' * [bit64::is.integer64()] -#' * [cast()] -#' * [dictionary_encode()] -#' * [dplyr::between()] -#' * [dplyr::case_when()] -#' * [dplyr::coalesce()] -#' * [dplyr::if_else()] -#' * [dplyr::n()] -#' * [dplyr::n_distinct()] -#' * [lubridate::am()] -#' * [lubridate::as_date()] -#' * [lubridate::as_datetime()] -#' * [lubridate::ceiling_date()] -#' * [lubridate::date()] -#' * [lubridate::date_decimal()] -#' * [lubridate::day()] -#' * [lubridate::ddays()] -#' * [lubridate::decimal_date()] -#' * [lubridate::dhours()] -#' * [lubridate::dmicroseconds()] -#' * [lubridate::dmilliseconds()] -#' * [lubridate::dminutes()] -#' * [lubridate::dmonths()] -#' * [lubridate::dmy()] -#' * [lubridate::dmy_h()] -#' * [lubridate::dmy_hm()] -#' * [lubridate::dmy_hms()] -#' * [lubridate::dnanoseconds()] -#' * [lubridate::dpicoseconds()] -#' * [lubridate::dseconds()] -#' * [lubridate::dst()] -#' * [lubridate::dweeks()] -#' * [lubridate::dyears()] -#' * [lubridate::dym()] -#' * [lubridate::epiweek()] -#' * [lubridate::epiyear()] -#' * [lubridate::fast_strptime()] -#' * [lubridate::floor_date()] -#' * [lubridate::format_ISO8601()] -#' * [lubridate::hour()] -#' * [lubridate::is.Date()] -#' * [lubridate::is.instant()] -#' * [lubridate::is.POSIXct()] -#' * [lubridate::is.timepoint()] -#' * [lubridate::isoweek()] -#' * [lubridate::isoyear()] -#' * [lubridate::leap_year()] -#' * [lubridate::make_date()] -#' * [lubridate::make_datetime()] -#' * [lubridate::make_difftime()] -#' * [lubridate::mday()] -#' * [lubridate::mdy()] -#' * [lubridate::mdy_h()] -#' * [lubridate::mdy_hm()] -#' * [lubridate::mdy_hms()] -#' * [lubridate::minute()] -#' * [lubridate::month()] -#' * [lubridate::my()] -#' * [lubridate::myd()] -#' * [lubridate::parse_date_time()] -#' 
* [lubridate::pm()] -#' * [lubridate::qday()] -#' * [lubridate::quarter()] -#' * [lubridate::round_date()] -#' * [lubridate::second()] -#' * [lubridate::semester()] -#' * [lubridate::tz()] -#' * [lubridate::wday()] -#' * [lubridate::week()] -#' * [lubridate::yday()] -#' * [lubridate::ydm()] -#' * [lubridate::ydm_h()] -#' * [lubridate::ydm_hm()] -#' * [lubridate::ydm_hms()] -#' * [lubridate::year()] -#' * [lubridate::ym()] -#' * [lubridate::ymd()] -#' * [lubridate::ymd_h()] -#' * [lubridate::ymd_hm()] -#' * [lubridate::ymd_hms()] -#' * [lubridate::yq()] -#' * [methods::is()] -#' * [rlang::is_character()] -#' * [rlang::is_double()] -#' * [rlang::is_integer()] -#' * [rlang::is_list()] -#' * [rlang::is_logical()] -#' * [stats::median()] -#' * [stats::quantile()] -#' * [stats::sd()] -#' * [stats::var()] -#' * [stringi::stri_reverse()] -#' * [stringr::str_c()]: the `collapse` argument is not yet supported -#' * [stringr::str_count()] -#' * [stringr::str_detect()] -#' * [stringr::str_dup()] -#' * [stringr::str_ends()] -#' * [stringr::str_length()] -#' * [stringr::str_like()] -#' * [stringr::str_pad()] -#' * [stringr::str_replace()] -#' * [stringr::str_replace_all()] -#' * [stringr::str_split()] -#' * [stringr::str_starts()] -#' * [stringr::str_sub()] -#' * [stringr::str_to_lower()] -#' * [stringr::str_to_title()] -#' * [stringr::str_to_upper()] -#' * [stringr::str_trim()] -#' * [tibble::tibble()] +#' ## arrow +#' +#' * [add_filename][arrow::add_filename()] +#' * [cast][arrow::cast()] +#' +#' ## base +#' +#' * [-][-()] +#' * [!][!()] +#' * [!=][!=()] +#' * [*][*()] +#' * [/][/()] +#' * [&][&()] +#' * [%/%][%/%()] +#' * [%%][%%()] +#' * [%in%][%in%()] +#' * [^][^()] +#' * [+][+()] +#' * [<][<()] +#' * [<=][<=()] +#' * [==][==()] +#' * [>][>()] +#' * [>=][>=()] +#' * [|][|()] +#' * [abs][base::abs()] +#' * [acos][base::acos()] +#' * [all][base::all()] +#' * [any][base::any()] +#' * [as.character][base::as.character()] +#' * [as.Date][base::as.Date()] +#' * 
[as.difftime][base::as.difftime()] +#' * [as.double][base::as.double()] +#' * [as.factor][base::as.factor()] +#' * [as.integer][base::as.integer()] +#' * [as.logical][base::as.logical()] +#' * [as.numeric][base::as.numeric()] +#' * [asin][base::asin()] +#' * [ceiling][base::ceiling()] +#' * [cos][base::cos()] +#' * [data.frame][base::data.frame()] +#' * [difftime][base::difftime()] +#' * [endsWith][base::endsWith()] +#' * [exp][base::exp()] +#' * [floor][base::floor()] +#' * [format][base::format()] +#' * [grepl][base::grepl()] +#' * [gsub][base::gsub()] +#' * [ifelse][base::ifelse()] +#' * [is.character][base::is.character()] +#' * [is.double][base::is.double()] +#' * [is.factor][base::is.factor()] +#' * [is.finite][base::is.finite()] +#' * [is.infinite][base::is.infinite()] +#' * [is.integer][base::is.integer()] +#' * [is.list][base::is.list()] +#' * [is.logical][base::is.logical()] +#' * [is.na][base::is.na()] +#' * [is.nan][base::is.nan()] +#' * [is.numeric][base::is.numeric()] +#' * [ISOdate][base::ISOdate()] +#' * [ISOdatetime][base::ISOdatetime()] +#' * [log][base::log()] +#' * [log10][base::log10()] +#' * [log1p][base::log1p()] +#' * [log2][base::log2()] +#' * [logb][base::logb()] +#' * [max][base::max()] +#' * [mean][base::mean()] +#' * [min][base::min()] +#' * [nchar][base::nchar()] +#' * [paste][base::paste()]: the `collapse` argument is not yet supported +#' * [paste0][base::paste0()]: the `collapse` argument is not yet supported +#' * [pmax][base::pmax()] +#' * [pmin][base::pmin()] +#' * [round][base::round()] +#' * [sign][base::sign()] +#' * [sin][base::sin()] +#' * [sqrt][base::sqrt()] +#' * [startsWith][base::startsWith()] +#' * [strftime][base::strftime()] +#' * [strptime][base::strptime()] +#' * [strrep][base::strrep()] +#' * [strsplit][base::strsplit()] +#' * [sub][base::sub()] +#' * [substr][base::substr()] +#' * [substring][base::substring()] +#' * [sum][base::sum()] +#' * [tan][base::tan()] +#' * [tolower][base::tolower()] +#' * 
[toupper][base::toupper()] +#' * [trunc][base::trunc()] +#' +#' ## bit64 +#' +#' * [as.integer64][bit64::as.integer64()] +#' * [is.integer64][bit64::is.integer64()] +#' +#' ## dplyr +#' +#' * [between][dplyr::between()] +#' * [case_when][dplyr::case_when()] +#' * [coalesce][dplyr::coalesce()] +#' * [if_else][dplyr::if_else()] +#' * [n][dplyr::n()] +#' * [n_distinct][dplyr::n_distinct()] +#' +#' ## lubridate +#' +#' * [am][lubridate::am()] +#' * [as_date][lubridate::as_date()] +#' * [as_datetime][lubridate::as_datetime()] +#' * [ceiling_date][lubridate::ceiling_date()] +#' * [date][lubridate::date()] +#' * [date_decimal][lubridate::date_decimal()] +#' * [day][lubridate::day()] +#' * [ddays][lubridate::ddays()] +#' * [decimal_date][lubridate::decimal_date()] +#' * [dhours][lubridate::dhours()] +#' * [dmicroseconds][lubridate::dmicroseconds()] +#' * [dmilliseconds][lubridate::dmilliseconds()] +#' * [dminutes][lubridate::dminutes()] +#' * [dmonths][lubridate::dmonths()] +#' * [dmy][lubridate::dmy()] +#' * [dmy_h][lubridate::dmy_h()] +#' * [dmy_hm][lubridate::dmy_hm()] +#' * [dmy_hms][lubridate::dmy_hms()] +#' * [dnanoseconds][lubridate::dnanoseconds()] +#' * [dpicoseconds][lubridate::dpicoseconds()] +#' * [dseconds][lubridate::dseconds()] +#' * [dst][lubridate::dst()] +#' * [dweeks][lubridate::dweeks()] +#' * [dyears][lubridate::dyears()] +#' * [dym][lubridate::dym()] +#' * [epiweek][lubridate::epiweek()] +#' * [epiyear][lubridate::epiyear()] +#' * [fast_strptime][lubridate::fast_strptime()] +#' * [floor_date][lubridate::floor_date()] +#' * [format_ISO8601][lubridate::format_ISO8601()] +#' * [hour][lubridate::hour()] +#' * [is.Date][lubridate::is.Date()] +#' * [is.instant][lubridate::is.instant()] +#' * [is.POSIXct][lubridate::is.POSIXct()] +#' * [is.timepoint][lubridate::is.timepoint()] +#' * [isoweek][lubridate::isoweek()] +#' * [isoyear][lubridate::isoyear()] +#' * [leap_year][lubridate::leap_year()] +#' * [make_date][lubridate::make_date()] +#' * 
[make_datetime][lubridate::make_datetime()] +#' * [make_difftime][lubridate::make_difftime()] +#' * [mday][lubridate::mday()] +#' * [mdy][lubridate::mdy()] +#' * [mdy_h][lubridate::mdy_h()] +#' * [mdy_hm][lubridate::mdy_hm()] +#' * [mdy_hms][lubridate::mdy_hms()] +#' * [minute][lubridate::minute()] +#' * [month][lubridate::month()] +#' * [my][lubridate::my()] +#' * [myd][lubridate::myd()] +#' * [parse_date_time][lubridate::parse_date_time()] +#' * [pm][lubridate::pm()] +#' * [qday][lubridate::qday()] +#' * [quarter][lubridate::quarter()] +#' * [round_date][lubridate::round_date()] +#' * [second][lubridate::second()] +#' * [semester][lubridate::semester()] +#' * [tz][lubridate::tz()] +#' * [wday][lubridate::wday()] +#' * [week][lubridate::week()] +#' * [yday][lubridate::yday()] +#' * [ydm][lubridate::ydm()] +#' * [ydm_h][lubridate::ydm_h()] +#' * [ydm_hm][lubridate::ydm_hm()] +#' * [ydm_hms][lubridate::ydm_hms()] +#' * [year][lubridate::year()] +#' * [ym][lubridate::ym()] +#' * [ymd][lubridate::ymd()] +#' * [ymd_h][lubridate::ymd_h()] +#' * [ymd_hm][lubridate::ymd_hm()] +#' * [ymd_hms][lubridate::ymd_hms()] +#' * [yq][lubridate::yq()] +#' +#' ## methods +#' +#' * [is][methods::is()] +#' +#' ## rlang +#' +#' * [is_character][rlang::is_character()] +#' * [is_double][rlang::is_double()] +#' * [is_integer][rlang::is_integer()] +#' * [is_list][rlang::is_list()] +#' * [is_logical][rlang::is_logical()] +#' +#' ## stats +#' +#' * [median][stats::median()] +#' * [quantile][stats::quantile()] +#' * [sd][stats::sd()] +#' * [var][stats::var()] +#' +#' ## stringi +#' +#' * [stri_reverse][stringi::stri_reverse()] +#' +#' ## stringr +#' +#' * [str_c][stringr::str_c()]: the `collapse` argument is not yet supported +#' * [str_count][stringr::str_count()] +#' * [str_detect][stringr::str_detect()] +#' * [str_dup][stringr::str_dup()] +#' * [str_ends][stringr::str_ends()] +#' * [str_length][stringr::str_length()] +#' * [str_like][stringr::str_like()] +#' * [str_pad][stringr::str_pad()] 
+#' * [str_replace][stringr::str_replace()] +#' * [str_replace_all][stringr::str_replace_all()] +#' * [str_split][stringr::str_split()] +#' * [str_starts][stringr::str_starts()] +#' * [str_sub][stringr::str_sub()] +#' * [str_to_lower][stringr::str_to_lower()] +#' * [str_to_title][stringr::str_to_title()] +#' * [str_to_upper][stringr::str_to_upper()] +#' * [str_trim][stringr::str_trim()] +#' +#' ## tibble +#' +#' * [tibble][tibble::tibble()] #' #' @name arrow-dplyr-functions NULL - diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 9925d0347f7..2c5112d7f73 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -24,23 +24,12 @@ register_bindings_type <- function() { } register_bindings_type_cast <- function() { - register_binding("cast", function(x, target_type, safe = TRUE, ...) { + register_binding("arrow::cast", function(x, target_type, safe = TRUE, ...) { opts <- cast_options(safe, ...) opts$to_type <- as_type(target_type) Expression$create("cast", x, options = opts) }) - register_binding("dictionary_encode", function(x, - null_encoding_behavior = c("mask", "encode")) { - behavior <- toupper(match.arg(null_encoding_behavior)) - null_encoding_behavior <- NullEncodingBehavior[[behavior]] - Expression$create( - "dictionary_encode", - x, - options = list(null_encoding_behavior = null_encoding_behavior) - ) - }) - # as.* type casting functions # as.factor() is mapped in expression.R register_binding("base::as.character", function(x) { diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R index a41aeb246c8..f78c1d47db2 100644 --- a/r/data-raw/docgen.R +++ b/r/data-raw/docgen.R @@ -65,27 +65,59 @@ file_template <- "# Licensed to the Apache Software Foundation (ASF) under one %s #' #' @name arrow-dplyr-functions -NULL -" +NULL" + +library(dplyr) +library(purrr) docs <- arrow:::.cache$docs -docs <- docs[order(names(docs))] -# TODO: group by package name, create subheadings -doclets <- purrr::imap_chr(docs, function(x, n) { - out <- paste0("#' 
* [", n, "()]") - if (length(x)) { - out <- paste0(out, ": ", paste(x, collapse = " ")) - } +docs_df <- tibble::tibble( + pkg_fun = names(docs), + notes = docs +) %>% + mutate( + has_pkg = grepl("::", pkg_fun), + fun = sub("^.*?:{+}", "", pkg_fun), + pkg = sub(":{+}.*$", "", pkg_fun), + # We will list operators under "base" (everything else must be pkg::fun) + pkg = if_else(has_pkg, pkg, "base"), + # Flatten notes to a single string + notes = map_chr(notes, ~ paste(., collapse = " ")) + ) %>% + arrange(pkg, fun) + +# Vectorized function to make entries for each function +render_fun <- function(fun, pkg_fun, notes) { + out <- paste0("* [", fun, "][", pkg_fun, "()]") + has_notes <- nzchar(notes) + out[has_notes] <- paste0(out[has_notes], ": ", notes[has_notes]) out -}) +} + +# This renders a bulleted list under a package heading +render_pkg <- function(df, pkg) { + bullets <- df %>% + transmute(render_fun(fun, pkg_fun, notes)) %>% + pull() + # Add header + bullets <- c( + paste("##", pkg), + "", + bullets + ) + paste("#'", bullets, collapse = "\n") +} + +# Group by package name and render the lists +doclets <- imap_chr(split(docs_df, docs_df$pkg), render_pkg) writeLines( sprintf( file_template, length(docs), length(arrow::list_compute_functions()), - paste(doclets, collapse = "\n") + paste(doclets, collapse = "\n#'\n") ), "R/dplyr-funcs-doc.R" ) diff --git a/r/man/arrow-dplyr-functions.Rd b/r/man/arrow-dplyr-functions.Rd index 8cf74c5c78e..9fcfeb23b07 100644 --- a/r/man/arrow-dplyr-functions.Rd +++ b/r/man/arrow-dplyr-functions.Rd @@ -4,7 +4,7 @@ \alias{arrow-dplyr-functions} \title{Functions available in Arrow dplyr queries} \description{ -The \code{arrow} package contains mappings of 196 R functions to the corresponding +The \code{arrow} package contains mappings of 195 R functions to the corresponding functions in the Arrow compute library. 
This allows you to write code inside of \code{dplyr} methods that call R functions, including many in packages like \code{stringr} and \code{lubridate}, and they will get translated to Arrow and run @@ -27,202 +27,253 @@ mapping has that make Acero behave like R. These functions are listed in the \href{https://arrow.apache.org/docs/cpp/compute.html}{C++ documentation}, and in the function registry in R, they are named with an \code{arrow_} prefix, such as \code{arrow_ascii_is_decimal}. +\subsection{arrow}{ \itemize{ -\item \code{\link[=-]{-()}} -\item \code{\link[=!]{!()}} -\item \code{\link[=!=]{!=()}} -\item \code{\link[=*]{*()}} -\item \code{\link[=/]{/()}} -\item \code{\link[=&]{&()}} -\item \code{\link[=\%/\%]{\%/\%()}} -\item \code{\link[=\%\%]{\%\%()}} -\item \code{\link[=\%in\%]{\%in\%()}} -\item \code{\link[=^]{^()}} -\item \code{\link[=+]{+()}} -\item \code{\link[=<]{<()}} -\item \code{\link[=<=]{<=()}} -\item \code{\link[===]{==()}} -\item \code{\link[=>]{>()}} -\item \code{\link[=>=]{>=()}} -\item \code{\link[=|]{|()}} -\item \code{\link[=add_filename]{add_filename()}} -\item \code{\link[base:MathFun]{base::abs()}} -\item \code{\link[base:Trig]{base::acos()}} -\item \code{\link[base:all]{base::all()}} -\item \code{\link[base:any]{base::any()}} -\item \code{\link[base:character]{base::as.character()}} -\item \code{\link[base:as.Date]{base::as.Date()}} -\item \code{\link[base:difftime]{base::as.difftime()}} -\item \code{\link[base:double]{base::as.double()}} -\item \code{\link[base:factor]{base::as.factor()}} -\item \code{\link[base:integer]{base::as.integer()}} -\item \code{\link[base:logical]{base::as.logical()}} -\item \code{\link[base:numeric]{base::as.numeric()}} -\item \code{\link[base:Trig]{base::asin()}} -\item \code{\link[base:Round]{base::ceiling()}} -\item \code{\link[base:Trig]{base::cos()}} -\item \code{\link[base:data.frame]{base::data.frame()}} -\item \code{\link[base:difftime]{base::difftime()}} -\item 
\code{\link[base:startsWith]{base::endsWith()}} -\item \code{\link[base:Log]{base::exp()}} -\item \code{\link[base:Round]{base::floor()}} -\item \code{\link[base:format]{base::format()}} -\item \code{\link[base:grep]{base::grepl()}} -\item \code{\link[base:grep]{base::gsub()}} -\item \code{\link[base:ifelse]{base::ifelse()}} -\item \code{\link[base:character]{base::is.character()}} -\item \code{\link[base:double]{base::is.double()}} -\item \code{\link[base:factor]{base::is.factor()}} -\item \code{\link[base:is.finite]{base::is.finite()}} -\item \code{\link[base:is.finite]{base::is.infinite()}} -\item \code{\link[base:integer]{base::is.integer()}} -\item \code{\link[base:list]{base::is.list()}} -\item \code{\link[base:logical]{base::is.logical()}} -\item \code{\link[base:NA]{base::is.na()}} -\item \code{\link[base:is.finite]{base::is.nan()}} -\item \code{\link[base:numeric]{base::is.numeric()}} -\item \code{\link[base:ISOdatetime]{base::ISOdate()}} -\item \code{\link[base:ISOdatetime]{base::ISOdatetime()}} -\item \code{\link[base:Log]{base::log()}} -\item \code{\link[base:Log]{base::log10()}} -\item \code{\link[base:Log]{base::log1p()}} -\item \code{\link[base:Log]{base::log2()}} -\item \code{\link[base:Log]{base::logb()}} -\item \code{\link[base:Extremes]{base::max()}} -\item \code{\link[base:mean]{base::mean()}} -\item \code{\link[base:Extremes]{base::min()}} -\item \code{\link[base:nchar]{base::nchar()}} -\item \code{\link[base:paste]{base::paste()}}: the \code{collapse} argument is not yet supported -\item \code{\link[base:paste]{base::paste0()}}: the \code{collapse} argument is not yet supported -\item \code{\link[base:Extremes]{base::pmax()}} -\item \code{\link[base:Extremes]{base::pmin()}} -\item \code{\link[base:Round]{base::round()}} -\item \code{\link[base:sign]{base::sign()}} -\item \code{\link[base:Trig]{base::sin()}} -\item \code{\link[base:MathFun]{base::sqrt()}} -\item \code{\link[base:startsWith]{base::startsWith()}} -\item 
\code{\link[base:strptime]{base::strftime()}} -\item \code{\link[base:strptime]{base::strptime()}} -\item \code{\link[base:strrep]{base::strrep()}} -\item \code{\link[base:strsplit]{base::strsplit()}} -\item \code{\link[base:grep]{base::sub()}} -\item \code{\link[base:substr]{base::substr()}} -\item \code{\link[base:substr]{base::substring()}} -\item \code{\link[base:sum]{base::sum()}} -\item \code{\link[base:Trig]{base::tan()}} -\item \code{\link[base:chartr]{base::tolower()}} -\item \code{\link[base:chartr]{base::toupper()}} -\item \code{\link[base:Round]{base::trunc()}} -\item \code{\link[bit64:as.integer64.character]{bit64::as.integer64()}} -\item \code{\link[bit64:bit64-package]{bit64::is.integer64()}} -\item \code{\link[=cast]{cast()}} -\item \code{\link[=dictionary_encode]{dictionary_encode()}} -\item \code{\link[dplyr:between]{dplyr::between()}} -\item \code{\link[dplyr:case_when]{dplyr::case_when()}} -\item \code{\link[dplyr:coalesce]{dplyr::coalesce()}} -\item \code{\link[dplyr:if_else]{dplyr::if_else()}} -\item \code{\link[dplyr:context]{dplyr::n()}} -\item \code{\link[dplyr:n_distinct]{dplyr::n_distinct()}} -\item \code{\link[lubridate:am]{lubridate::am()}} -\item \code{\link[lubridate:as_date]{lubridate::as_date()}} -\item \code{\link[lubridate:as_date]{lubridate::as_datetime()}} -\item \code{\link[lubridate:round_date]{lubridate::ceiling_date()}} -\item \code{\link[lubridate:date]{lubridate::date()}} -\item \code{\link[lubridate:date_decimal]{lubridate::date_decimal()}} -\item \code{\link[lubridate:day]{lubridate::day()}} -\item \code{\link[lubridate:duration]{lubridate::ddays()}} -\item \code{\link[lubridate:decimal_date]{lubridate::decimal_date()}} -\item \code{\link[lubridate:duration]{lubridate::dhours()}} -\item \code{\link[lubridate:duration]{lubridate::dmicroseconds()}} -\item \code{\link[lubridate:duration]{lubridate::dmilliseconds()}} -\item \code{\link[lubridate:duration]{lubridate::dminutes()}} -\item 
\code{\link[lubridate:duration]{lubridate::dmonths()}} -\item \code{\link[lubridate:ymd]{lubridate::dmy()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::dmy_h()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::dmy_hm()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::dmy_hms()}} -\item \code{\link[lubridate:duration]{lubridate::dnanoseconds()}} -\item \code{\link[lubridate:duration]{lubridate::dpicoseconds()}} -\item \code{\link[lubridate:duration]{lubridate::dseconds()}} -\item \code{\link[lubridate:dst]{lubridate::dst()}} -\item \code{\link[lubridate:duration]{lubridate::dweeks()}} -\item \code{\link[lubridate:duration]{lubridate::dyears()}} -\item \code{\link[lubridate:ymd]{lubridate::dym()}} -\item \code{\link[lubridate:week]{lubridate::epiweek()}} -\item \code{\link[lubridate:year]{lubridate::epiyear()}} -\item \code{\link[lubridate:parse_date_time]{lubridate::fast_strptime()}} -\item \code{\link[lubridate:round_date]{lubridate::floor_date()}} -\item \code{\link[lubridate:format_ISO8601]{lubridate::format_ISO8601()}} -\item \code{\link[lubridate:hour]{lubridate::hour()}} -\item \code{\link[lubridate:date_utils]{lubridate::is.Date()}} -\item \code{\link[lubridate:is.instant]{lubridate::is.instant()}} -\item \code{\link[lubridate:posix_utils]{lubridate::is.POSIXct()}} -\item \code{\link[lubridate:is.instant]{lubridate::is.timepoint()}} -\item \code{\link[lubridate:week]{lubridate::isoweek()}} -\item \code{\link[lubridate:year]{lubridate::isoyear()}} -\item \code{\link[lubridate:leap_year]{lubridate::leap_year()}} -\item \code{\link[lubridate:make_datetime]{lubridate::make_date()}} -\item \code{\link[lubridate:make_datetime]{lubridate::make_datetime()}} -\item \code{\link[lubridate:make_difftime]{lubridate::make_difftime()}} -\item \code{\link[lubridate:day]{lubridate::mday()}} -\item \code{\link[lubridate:ymd]{lubridate::mdy()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::mdy_h()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::mdy_hm()}} -\item 
\code{\link[lubridate:ymd_hms]{lubridate::mdy_hms()}} -\item \code{\link[lubridate:minute]{lubridate::minute()}} -\item \code{\link[lubridate:month]{lubridate::month()}} -\item \code{\link[lubridate:ymd]{lubridate::my()}} -\item \code{\link[lubridate:ymd]{lubridate::myd()}} -\item \code{\link[lubridate:parse_date_time]{lubridate::parse_date_time()}} -\item \code{\link[lubridate:am]{lubridate::pm()}} -\item \code{\link[lubridate:day]{lubridate::qday()}} -\item \code{\link[lubridate:quarter]{lubridate::quarter()}} -\item \code{\link[lubridate:round_date]{lubridate::round_date()}} -\item \code{\link[lubridate:second]{lubridate::second()}} -\item \code{\link[lubridate:quarter]{lubridate::semester()}} -\item \code{\link[lubridate:tz]{lubridate::tz()}} -\item \code{\link[lubridate:day]{lubridate::wday()}} -\item \code{\link[lubridate:week]{lubridate::week()}} -\item \code{\link[lubridate:day]{lubridate::yday()}} -\item \code{\link[lubridate:ymd]{lubridate::ydm()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::ydm_h()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::ydm_hm()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::ydm_hms()}} -\item \code{\link[lubridate:year]{lubridate::year()}} -\item \code{\link[lubridate:ymd]{lubridate::ym()}} -\item \code{\link[lubridate:ymd]{lubridate::ymd()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::ymd_h()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::ymd_hm()}} -\item \code{\link[lubridate:ymd_hms]{lubridate::ymd_hms()}} -\item \code{\link[lubridate:ymd]{lubridate::yq()}} -\item \code{\link[methods:is]{methods::is()}} -\item \code{\link[rlang:type-predicates]{rlang::is_character()}} -\item \code{\link[rlang:type-predicates]{rlang::is_double()}} -\item \code{\link[rlang:type-predicates]{rlang::is_integer()}} -\item \code{\link[rlang:type-predicates]{rlang::is_list()}} -\item \code{\link[rlang:type-predicates]{rlang::is_logical()}} -\item \code{\link[stats:median]{stats::median()}} -\item 
\code{\link[stats:quantile]{stats::quantile()}} -\item \code{\link[stats:sd]{stats::sd()}} -\item \code{\link[stats:cor]{stats::var()}} -\item \code{\link[stringi:stri_reverse]{stringi::stri_reverse()}} -\item \code{\link[stringr:str_c]{stringr::str_c()}}: the \code{collapse} argument is not yet supported -\item \code{\link[stringr:str_count]{stringr::str_count()}} -\item \code{\link[stringr:str_detect]{stringr::str_detect()}} -\item \code{\link[stringr:str_dup]{stringr::str_dup()}} -\item \code{\link[stringr:str_starts]{stringr::str_ends()}} -\item \code{\link[stringr:str_length]{stringr::str_length()}} -\item \code{\link[stringr:str_like]{stringr::str_like()}} -\item \code{\link[stringr:str_pad]{stringr::str_pad()}} -\item \code{\link[stringr:str_replace]{stringr::str_replace()}} -\item \code{\link[stringr:str_replace]{stringr::str_replace_all()}} -\item \code{\link[stringr:str_split]{stringr::str_split()}} -\item \code{\link[stringr:str_starts]{stringr::str_starts()}} -\item \code{\link[stringr:str_sub]{stringr::str_sub()}} -\item \code{\link[stringr:case]{stringr::str_to_lower()}} -\item \code{\link[stringr:case]{stringr::str_to_title()}} -\item \code{\link[stringr:case]{stringr::str_to_upper()}} -\item \code{\link[stringr:str_trim]{stringr::str_trim()}} -\item \code{\link[tibble:tibble]{tibble::tibble()}} +\item \link[=add_filename]{add_filename} +\item \link[=cast]{cast} +} +} + +\subsection{base}{ +\itemize{ +\item \link[=-]{-} +\item \link[=!]{!} +\item \link[=!=]{!=} +\item \link[=*]{*} +\item \link[=/]{/} +\item \link[=&]{&} +\item \link[=\%/\%]{\%/\%} +\item \link[=\%\%]{\%\%} +\item \link[=\%in\%]{\%in\%} +\item \link[=^]{^} +\item \link[=+]{+} +\item \link[=<]{<} +\item \link[=<=]{<=} +\item \link[===]{==} +\item \link[=>]{>} +\item \link[=>=]{>=} +\item \link[=|]{|} +\item \link[base:MathFun]{abs} +\item \link[base:Trig]{acos} +\item \link[base:all]{all} +\item \link[base:any]{any} +\item \link[base:character]{as.character} +\item 
\link[base:as.Date]{as.Date} +\item \link[base:difftime]{as.difftime} +\item \link[base:double]{as.double} +\item \link[base:factor]{as.factor} +\item \link[base:integer]{as.integer} +\item \link[base:logical]{as.logical} +\item \link[base:numeric]{as.numeric} +\item \link[base:Trig]{asin} +\item \link[base:Round]{ceiling} +\item \link[base:Trig]{cos} +\item \link[base:data.frame]{data.frame} +\item \link[base:difftime]{difftime} +\item \link[base:startsWith]{endsWith} +\item \link[base:Log]{exp} +\item \link[base:Round]{floor} +\item \link[base:format]{format} +\item \link[base:grep]{grepl} +\item \link[base:grep]{gsub} +\item \link[base:ifelse]{ifelse} +\item \link[base:character]{is.character} +\item \link[base:double]{is.double} +\item \link[base:factor]{is.factor} +\item \link[base:is.finite]{is.finite} +\item \link[base:is.finite]{is.infinite} +\item \link[base:integer]{is.integer} +\item \link[base:list]{is.list} +\item \link[base:logical]{is.logical} +\item \link[base:NA]{is.na} +\item \link[base:is.finite]{is.nan} +\item \link[base:numeric]{is.numeric} +\item \link[base:ISOdatetime]{ISOdate} +\item \link[base:ISOdatetime]{ISOdatetime} +\item \link[base:Log]{log} +\item \link[base:Log]{log10} +\item \link[base:Log]{log1p} +\item \link[base:Log]{log2} +\item \link[base:Log]{logb} +\item \link[base:Extremes]{max} +\item \link[base:mean]{mean} +\item \link[base:Extremes]{min} +\item \link[base:nchar]{nchar} +\item \link[base:paste]{paste}: the \code{collapse} argument is not yet supported +\item \link[base:paste]{paste0}: the \code{collapse} argument is not yet supported +\item \link[base:Extremes]{pmax} +\item \link[base:Extremes]{pmin} +\item \link[base:Round]{round} +\item \link[base:sign]{sign} +\item \link[base:Trig]{sin} +\item \link[base:MathFun]{sqrt} +\item \link[base:startsWith]{startsWith} +\item \link[base:strptime]{strftime} +\item \link[base:strptime]{strptime} +\item \link[base:strrep]{strrep} +\item \link[base:strsplit]{strsplit} +\item 
\link[base:grep]{sub} +\item \link[base:substr]{substr} +\item \link[base:substr]{substring} +\item \link[base:sum]{sum} +\item \link[base:Trig]{tan} +\item \link[base:chartr]{tolower} +\item \link[base:chartr]{toupper} +\item \link[base:Round]{trunc} +} +} + +\subsection{bit64}{ +\itemize{ +\item \link[bit64:as.integer64.character]{as.integer64} +\item \link[bit64:bit64-package]{is.integer64} +} +} + +\subsection{dplyr}{ +\itemize{ +\item \link[dplyr:between]{between} +\item \link[dplyr:case_when]{case_when} +\item \link[dplyr:coalesce]{coalesce} +\item \link[dplyr:if_else]{if_else} +\item \link[dplyr:context]{n} +\item \link[dplyr:n_distinct]{n_distinct} +} +} + +\subsection{lubridate}{ +\itemize{ +\item \link[lubridate:am]{am} +\item \link[lubridate:as_date]{as_date} +\item \link[lubridate:as_date]{as_datetime} +\item \link[lubridate:round_date]{ceiling_date} +\item \link[lubridate:date]{date} +\item \link[lubridate:date_decimal]{date_decimal} +\item \link[lubridate:day]{day} +\item \link[lubridate:duration]{ddays} +\item \link[lubridate:decimal_date]{decimal_date} +\item \link[lubridate:duration]{dhours} +\item \link[lubridate:duration]{dmicroseconds} +\item \link[lubridate:duration]{dmilliseconds} +\item \link[lubridate:duration]{dminutes} +\item \link[lubridate:duration]{dmonths} +\item \link[lubridate:ymd]{dmy} +\item \link[lubridate:ymd_hms]{dmy_h} +\item \link[lubridate:ymd_hms]{dmy_hm} +\item \link[lubridate:ymd_hms]{dmy_hms} +\item \link[lubridate:duration]{dnanoseconds} +\item \link[lubridate:duration]{dpicoseconds} +\item \link[lubridate:duration]{dseconds} +\item \link[lubridate:dst]{dst} +\item \link[lubridate:duration]{dweeks} +\item \link[lubridate:duration]{dyears} +\item \link[lubridate:ymd]{dym} +\item \link[lubridate:week]{epiweek} +\item \link[lubridate:year]{epiyear} +\item \link[lubridate:parse_date_time]{fast_strptime} +\item \link[lubridate:round_date]{floor_date} +\item \link[lubridate:format_ISO8601]{format_ISO8601} +\item 
\link[lubridate:hour]{hour} +\item \link[lubridate:date_utils]{is.Date} +\item \link[lubridate:is.instant]{is.instant} +\item \link[lubridate:posix_utils]{is.POSIXct} +\item \link[lubridate:is.instant]{is.timepoint} +\item \link[lubridate:week]{isoweek} +\item \link[lubridate:year]{isoyear} +\item \link[lubridate:leap_year]{leap_year} +\item \link[lubridate:make_datetime]{make_date} +\item \link[lubridate:make_datetime]{make_datetime} +\item \link[lubridate:make_difftime]{make_difftime} +\item \link[lubridate:day]{mday} +\item \link[lubridate:ymd]{mdy} +\item \link[lubridate:ymd_hms]{mdy_h} +\item \link[lubridate:ymd_hms]{mdy_hm} +\item \link[lubridate:ymd_hms]{mdy_hms} +\item \link[lubridate:minute]{minute} +\item \link[lubridate:month]{month} +\item \link[lubridate:ymd]{my} +\item \link[lubridate:ymd]{myd} +\item \link[lubridate:parse_date_time]{parse_date_time} +\item \link[lubridate:am]{pm} +\item \link[lubridate:day]{qday} +\item \link[lubridate:quarter]{quarter} +\item \link[lubridate:round_date]{round_date} +\item \link[lubridate:second]{second} +\item \link[lubridate:quarter]{semester} +\item \link[lubridate:tz]{tz} +\item \link[lubridate:day]{wday} +\item \link[lubridate:week]{week} +\item \link[lubridate:day]{yday} +\item \link[lubridate:ymd]{ydm} +\item \link[lubridate:ymd_hms]{ydm_h} +\item \link[lubridate:ymd_hms]{ydm_hm} +\item \link[lubridate:ymd_hms]{ydm_hms} +\item \link[lubridate:year]{year} +\item \link[lubridate:ymd]{ym} +\item \link[lubridate:ymd]{ymd} +\item \link[lubridate:ymd_hms]{ymd_h} +\item \link[lubridate:ymd_hms]{ymd_hm} +\item \link[lubridate:ymd_hms]{ymd_hms} +\item \link[lubridate:ymd]{yq} +} +} + +\subsection{methods}{ +\itemize{ +\item \link[methods:is]{is} +} +} + +\subsection{rlang}{ +\itemize{ +\item \link[rlang:type-predicates]{is_character} +\item \link[rlang:type-predicates]{is_double} +\item \link[rlang:type-predicates]{is_integer} +\item \link[rlang:type-predicates]{is_list} +\item \link[rlang:type-predicates]{is_logical} 
+} +} + +\subsection{stats}{ +\itemize{ +\item \link[stats:median]{median} +\item \link[stats:quantile]{quantile} +\item \link[stats:sd]{sd} +\item \link[stats:cor]{var} +} +} + +\subsection{stringi}{ +\itemize{ +\item \link[stringi:stri_reverse]{stri_reverse} +} +} + +\subsection{stringr}{ +\itemize{ +\item \link[stringr:str_c]{str_c}: the \code{collapse} argument is not yet supported +\item \link[stringr:str_count]{str_count} +\item \link[stringr:str_detect]{str_detect} +\item \link[stringr:str_dup]{str_dup} +\item \link[stringr:str_starts]{str_ends} +\item \link[stringr:str_length]{str_length} +\item \link[stringr:str_like]{str_like} +\item \link[stringr:str_pad]{str_pad} +\item \link[stringr:str_replace]{str_replace} +\item \link[stringr:str_replace]{str_replace_all} +\item \link[stringr:str_split]{str_split} +\item \link[stringr:str_starts]{str_starts} +\item \link[stringr:str_sub]{str_sub} +\item \link[stringr:case]{str_to_lower} +\item \link[stringr:case]{str_to_title} +\item \link[stringr:case]{str_to_upper} +\item \link[stringr:str_trim]{str_trim} +} +} + +\subsection{tibble}{ +\itemize{ +\item \link[tibble:tibble]{tibble} +} } } From 20233904cf1d54af56a93d5e7b88382d634dbe51 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 1 Sep 2022 09:58:22 -0400 Subject: [PATCH 3/7] Add to pkgdown --- r/_pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index dfb0998ddff..6c23305a8f3 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -216,6 +216,7 @@ reference: - codec_is_available - title: Computation contents: + - arrow-dplyr-functions - call_function - match_arrow - value_counts From 42fb685ab716ad2e9de20851481f970bad2d0f89 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 9 Sep 2022 10:09:21 -0400 Subject: [PATCH 4/7] Add dplyr verbs and tidyselect to the page; add docs for cast and add_filename --- r/R/arrow-package.R | 51 ++++++++++++----- r/R/dplyr-funcs-augmented.R | 16 ++++++ r/R/dplyr-funcs-doc.R | 
65 +++++++++++++++++++-- r/R/dplyr-funcs-string.R | 20 ++++--- r/R/dplyr-funcs-type.R | 26 +++++++++ r/data-raw/docgen.R | 100 ++++++++++++++++++++++++++------- r/man/add_filename.Rd | 20 +++++++ r/man/arrow-dplyr-functions.Rd | 69 +++++++++++++++++++++-- r/man/cast.Rd | 36 ++++++++++++ 9 files changed, 351 insertions(+), 52 deletions(-) create mode 100644 r/man/add_filename.Rd create mode 100644 r/man/cast.Rd diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 53fb0280a50..46a517c1e22 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -31,25 +31,50 @@ #' @keywords internal "_PACKAGE" +# Include notes about features not supported here. +supported_dplyr_methods <- list( + select = NULL, + filter = NULL, + collect = NULL, + summarise = NULL, + group_by = NULL, + groups = NULL, + group_vars = NULL, + group_by_drop_default = NULL, + ungroup = NULL, + mutate = NULL, + transmute = NULL, + arrange = NULL, + rename = NULL, + pull = NULL, + relocate = NULL, + compute = NULL, + collapse = NULL, + distinct = NULL, + left_join = NULL, + right_join = NULL, + inner_join = NULL, + full_join = NULL, + semi_join = NULL, + anti_join = NULL, + count = NULL, + tally = NULL, + rename_with = NULL, + union = NULL, + union_all = NULL, + glimpse = NULL, + show_query = NULL, + explain = NULL +) + #' @importFrom vctrs s3_register vec_size vec_cast vec_unique .onLoad <- function(...) 
{ # Make sure C++ knows on which thread it is safe to call the R API InitializeMainRThread() - dplyr_methods <- paste0( - "dplyr::", - c( - "select", "filter", "collect", "summarise", "group_by", "groups", - "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute", - "arrange", "rename", "pull", "relocate", "compute", "collapse", - "distinct", "left_join", "right_join", "inner_join", "full_join", - "semi_join", "anti_join", "count", "tally", "rename_with", "union", - "union_all", "glimpse", "show_query", "explain" - ) - ) for (cl in c("Dataset", "ArrowTabular", "RecordBatchReader", "arrow_dplyr_query")) { - for (m in dplyr_methods) { - s3_register(m, cl) + for (m in names(supported_dplyr_methods)) { + s3_register(paste0("dplyr::", m), cl) } } s3_register("dplyr::tbl_vars", "arrow_dplyr_query") diff --git a/r/R/dplyr-funcs-augmented.R b/r/R/dplyr-funcs-augmented.R index 828e3df12be..efb62139368 100644 --- a/r/R/dplyr-funcs-augmented.R +++ b/r/R/dplyr-funcs-augmented.R @@ -15,6 +15,22 @@ # specific language governing permissions and limitations # under the License. +#' Add the data filename as a column +#' +#' This function only exists inside `arrow` `dplyr` queries, and it is only +#' valid when querying on a `FileSystemDataset`. +#' +#' @return A `FieldRef` `Expression` that refers to the filename augmented +#' column. +#' @examples +#' \dontrun{ +#' open_dataset("nyc-taxi") %>% +#' mutate(file = add_filename()) +#' } +#' @keywords internal +#' @name add_filename +NULL + register_bindings_augmented <- function() { register_binding("arrow::add_filename", function() { Expression$field_ref("__filename") diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index 078cf4a9fa4..d0a6d3dfcc6 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -19,13 +19,58 @@ #' Functions available in Arrow dplyr queries #' -#' The `arrow` package contains mappings of 195 R functions to the corresponding -#' functions in the Arrow compute library. 
This allows you to write code inside +#' The `arrow` package contains methods for 32 `dplyr` table functions, many of +#' which are "verbs" that do transformations to one or more tables. +#' The package also has mappings of 204 R functions to the corresponding +#' functions in the Arrow compute library. These allow you to write code inside #' of `dplyr` methods that call R functions, including many in packages like #' `stringr` and `lubridate`, and they will get translated to Arrow and run #' on the Arrow query engine (Acero). This document lists all of the mapped #' functions. #' +#' # `dplyr` verbs +#' +#' Most verb functions return an `arrow_dplyr_query` object, similar in spirit +#' to a `dbplyr::tbl_lazy`. This means that the verbs do not eagerly evaluate +#' the query on the data. To run the query, call either `compute()`, +#' which returns an `arrow` [Table], or `collect()`, which pulls the resulting +#' Table into an R `data.frame`. +#' +#' * [anti_join][dplyr::anti_join()] +#' * [arrange][dplyr::arrange()] +#' * [collapse][dplyr::collapse()] +#' * [collect][dplyr::collect()] +#' * [compute][dplyr::compute()] +#' * [count][dplyr::count()] +#' * [distinct][dplyr::distinct()] +#' * [explain][dplyr::explain()] +#' * [filter][dplyr::filter()] +#' * [full_join][dplyr::full_join()] +#' * [glimpse][dplyr::glimpse()] +#' * [group_by][dplyr::group_by()] +#' * [group_by_drop_default][dplyr::group_by_drop_default()] +#' * [group_vars][dplyr::group_vars()] +#' * [groups][dplyr::groups()] +#' * [inner_join][dplyr::inner_join()] +#' * [left_join][dplyr::left_join()] +#' * [mutate][dplyr::mutate()] +#' * [pull][dplyr::pull()] +#' * [relocate][dplyr::relocate()] +#' * [rename][dplyr::rename()] +#' * [rename_with][dplyr::rename_with()] +#' * [right_join][dplyr::right_join()] +#' * [select][dplyr::select()] +#' * [semi_join][dplyr::semi_join()] +#' * [show_query][dplyr::show_query()] +#' * [summarise][dplyr::summarise()] +#' * [tally][dplyr::tally()] +#' * 
[transmute][dplyr::transmute()] +#' * [ungroup][dplyr::ungroup()] +#' * [union][dplyr::union()] +#' * [union_all][dplyr::union_all()] +#' +#' # Function mappings +#' #' In the list below, any differences in behavior or support between Acero and #' the R function are listed. If no notes follow the function name, then you #' can assume that the function works in Acero just as it does in R. @@ -74,7 +119,6 @@ #' * [as.Date][base::as.Date()] #' * [as.difftime][base::as.difftime()] #' * [as.double][base::as.double()] -#' * [as.factor][base::as.factor()] #' * [as.integer][base::as.integer()] #' * [as.logical][base::as.logical()] #' * [as.numeric][base::as.numeric()] @@ -141,6 +185,7 @@ #' #' ## dplyr #' +#' * [across][dplyr::across()]: only supported inside `mutate()`; purrr-style lambda functions not yet supported #' * [between][dplyr::between()] #' * [case_when][dplyr::case_when()] #' * [coalesce][dplyr::coalesce()] @@ -254,7 +299,7 @@ #' * [str_dup][stringr::str_dup()] #' * [str_ends][stringr::str_ends()] #' * [str_length][stringr::str_length()] -#' * [str_like][stringr::str_like()] +#' * `str_like`: not yet in a released version of `stringr`, but it is supported in `arrow` #' * [str_pad][stringr::str_pad()] #' * [str_replace][stringr::str_replace()] #' * [str_replace_all][stringr::str_replace_all()] @@ -270,5 +315,17 @@ #' #' * [tibble][tibble::tibble()] #' +#' ## tidyselect +#' +#' * [all_of][tidyselect::all_of()] +#' * [contains][tidyselect::contains()] +#' * [ends_with][tidyselect::ends_with()] +#' * [everything][tidyselect::everything()] +#' * [last_col][tidyselect::last_col()] +#' * [matches][tidyselect::matches()] +#' * [num_range][tidyselect::num_range()] +#' * [one_of][tidyselect::one_of()] +#' * [starts_with][tidyselect::starts_with()] +#' #' @name arrow-dplyr-functions NULL diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index 71dcce94cc7..eb2326ed056 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -239,15 +239,17 @@ 
register_bindings_string_regex <- function() { out }) - register_binding("stringr::str_like", function(string, - pattern, - ignore_case = TRUE) { - Expression$create( - "match_like", - string, - options = list(pattern = pattern, ignore_case = ignore_case) - ) - }) + register_binding( + "stringr::str_like", + function(string, pattern, ignore_case = TRUE) { + Expression$create( + "match_like", + string, + options = list(pattern = pattern, ignore_case = ignore_case) + ) + }, + notes = "not yet in a released version of `stringr`, but it is supported in `arrow`" + ) register_binding("stringr::str_count", function(string, pattern) { opts <- get_stringr_pattern_options(enexpr(pattern)) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 2c5112d7f73..3d03476a881 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -23,6 +23,32 @@ register_bindings_type <- function() { register_bindings_type_format() } +#' Change the type of an array or column +#' +#' The `cast()` function only exists inside of `arrow` `dplyr` queries. Use it +#' as a more convenient way of changing the type of a value or field inside of +#' a `mutate()` call. To cast an `Array` or `ChunkedArray` outside of a query, +#' call the `$cast()` method on the object, which has the same semantics. +#' +#' @param x an `Expression` +#' @param target_type [DataType] to cast to +#' @param safe logical: only allow the type conversion if no data is lost +#' (truncation, overflow, etc.). Default is `TRUE` +#' @param ... specific `CastOptions` to set +#' @return an `Expression` +#' +#' @examples +#' \dontrun{ +#' mtcars %>% +#' arrow_table() %>% +#' mutate(cyl = cast(cyl, string())) +#' } +#' @keywords internal +#' @name cast +#' @seealso https://arrow.apache.org/docs/cpp/api/compute.html for the list of +#' supported CastOptions. +NULL + register_bindings_type_cast <- function() { register_binding("arrow::cast", function(x, target_type, safe = TRUE, ...) { opts <- cast_options(safe, ...) 
diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R index f78c1d47db2..5d767f27041 100644 --- a/r/data-raw/docgen.R +++ b/r/data-raw/docgen.R @@ -39,13 +39,27 @@ file_template <- "# Licensed to the Apache Software Foundation (ASF) under one #' Functions available in Arrow dplyr queries #' -#' The `arrow` package contains mappings of %s R functions to the corresponding -#' functions in the Arrow compute library. This allows you to write code inside +#' The `arrow` package contains methods for %s `dplyr` table functions, many of +#' which are \"verbs\" that do transformations to one or more tables. +#' The package also has mappings of %s R functions to the corresponding +#' functions in the Arrow compute library. These allow you to write code inside #' of `dplyr` methods that call R functions, including many in packages like #' `stringr` and `lubridate`, and they will get translated to Arrow and run #' on the Arrow query engine (Acero). This document lists all of the mapped #' functions. #' +#' # `dplyr` verbs +#' +#' Most verb functions return an `arrow_dplyr_query` object, similar in spirit +#' to a `dbplyr::tbl_lazy`. This means that the verbs do not eagerly evaluate +#' the query on the data. To run the query, call either `compute()`, +#' which returns an `arrow` [Table], or `collect()`, which pulls the resulting +#' Table into an R `data.frame`. +#' +%s +#' +#' # Function mappings +#' #' In the list below, any differences in behavior or support between Acero and #' the R function are listed. If no notes follow the function name, then you #' can assume that the function works in Acero just as it does in R. 
@@ -70,26 +84,18 @@ NULL" library(dplyr) library(purrr) -docs <- arrow:::.cache$docs - -docs_df <- tibble::tibble( - pkg_fun = names(docs), - notes = docs -) %>% - mutate( - has_pkg = grepl("::", pkg_fun), - fun = sub("^.*?:{+}", "", pkg_fun), - pkg = sub(":{+}.*$", "", pkg_fun), - # We will list operators under "base" (everything else must be pkg::fun) - pkg = if_else(has_pkg, pkg, "base"), - # Flatten notes to a single string - notes = map_chr(notes, ~ paste(., collapse = " ")) - ) %>% - arrange(pkg, fun) +# Functions that for whatever reason cause xref problems, so don't hyperlink +do_not_link <- c( + "stringr::str_like" # Still only in the unreleased version +) # Vectorized function to make entries for each function render_fun <- function(fun, pkg_fun, notes) { - out <- paste0("* [", fun, "][", pkg_fun, "()]") + out <- ifelse( + pkg_fun %in% do_not_link, + paste0("* `", fun, "`"), + paste0("* [", fun, "][", pkg_fun, "()]") + ) has_notes <- nzchar(notes) out[has_notes] <- paste0(out[has_notes], ": ", notes[has_notes]) out @@ -109,15 +115,67 @@ render_pkg <- function(df, pkg) { paste("#'", bullets, collapse = "\n") } +docs <- arrow:::.cache$docs + +# Add some functions + +# across() is handled by manipulating the quosures, not by nse_funcs +docs[["dplyr::across"]] <- c( + "only supported inside `mutate()`;", # TODO(ARROW-17362, ARROW-17387) + "purrr-style lambda functions not yet supported" # TODO(ARROW-17366) +) + +# add tidyselect helpers by parsing the reexports file +tidyselect <- grep("^tidyselect::", readLines("R/reexports-tidyselect.R"), value = TRUE) + +docs <- c(docs, setNames(rep(list(NULL), length(tidyselect)), tidyselect)) + +# TODO: add doc pages for add_filename() and cast() + +fun_df <- tibble::tibble( + pkg_fun = names(docs), + notes = docs +) %>% + mutate( + has_pkg = grepl("::", pkg_fun), + fun = sub("^.*?:{+}", "", pkg_fun), + pkg = sub(":{+}.*$", "", pkg_fun), + # We will list operators under "base" (everything else must be pkg::fun) + pkg = 
if_else(has_pkg, pkg, "base"), + # Flatten notes to a single string + notes = map_chr(notes, ~ paste(., collapse = " ")) + ) %>% + arrange(pkg, fun) + # Group by package name and render the lists -doclets <- imap_chr(split(docs_df, docs_df$pkg), render_pkg) +fun_doclets <- imap_chr(split(fun_df, fun_df$pkg), render_pkg) + +dplyr_verbs <- c( + arrow:::supported_dplyr_methods, + # Because this only has a method for arrow_dplyr_query, it's not in the main list + tbl_vars = NULL +) + +verb_bullets <- tibble::tibble( + fun = names(dplyr_verbs), + notes = dplyr_verbs +) %>% + mutate( + pkg_fun = paste0("dplyr::", fun), + notes = map_chr(notes, ~ paste(., collapse = " ")) + ) %>% + arrange(fun) %>% + transmute(render_fun(fun, pkg_fun, notes)) %>% + pull() writeLines( sprintf( file_template, + length(dplyr_verbs), length(docs), + paste("#'", verb_bullets, collapse = "\n"), length(arrow::list_compute_functions()), - paste(doclets, collapse = "\n#'\n") + paste(fun_doclets, collapse = "\n#'\n") ), "R/dplyr-funcs-doc.R" ) diff --git a/r/man/add_filename.Rd b/r/man/add_filename.Rd new file mode 100644 index 00000000000..56731281ab9 --- /dev/null +++ b/r/man/add_filename.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-funcs-augmented.R +\name{add_filename} +\alias{add_filename} +\title{Add the data filename as a column} +\value{ +A \code{FieldRef} \code{Expression} that refers to the filename augmented +column. +} +\description{ +This function only exists inside \code{arrow} \code{dplyr} queries, and it is only +valid when querying on a \code{FileSystemDataset}. 
+} +\examples{ +\dontrun{ +open_dataset("nyc-taxi") \%>\% + mutate(file = add_filename()) +} +} +\keyword{internal} diff --git a/r/man/arrow-dplyr-functions.Rd b/r/man/arrow-dplyr-functions.Rd index 9fcfeb23b07..438d8b64025 100644 --- a/r/man/arrow-dplyr-functions.Rd +++ b/r/man/arrow-dplyr-functions.Rd @@ -4,14 +4,58 @@ \alias{arrow-dplyr-functions} \title{Functions available in Arrow dplyr queries} \description{ -The \code{arrow} package contains mappings of 195 R functions to the corresponding -functions in the Arrow compute library. This allows you to write code inside +The \code{arrow} package contains methods for 32 \code{dplyr} table functions, many of +which are "verbs" that do transformations to one or more tables. +The package also has mappings of 204 R functions to the corresponding +functions in the Arrow compute library. These allow you to write code inside of \code{dplyr} methods that call R functions, including many in packages like \code{stringr} and \code{lubridate}, and they will get translated to Arrow and run on the Arrow query engine (Acero). This document lists all of the mapped functions. } -\details{ +\section{\code{dplyr} verbs}{ +Most verb functions return an \code{arrow_dplyr_query} object, similar in spirit +to a \code{dbplyr::tbl_lazy}. This means that the verbs do not eagerly evaluate +the query on the data. To run the query, call either \code{compute()}, +which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting +Table into an R \code{data.frame}. 
+\itemize{ +\item \link[dplyr:filter-joins]{anti_join} +\item \link[dplyr:arrange]{arrange} +\item \link[dplyr:compute]{collapse} +\item \link[dplyr:compute]{collect} +\item \link[dplyr:compute]{compute} +\item \link[dplyr:count]{count} +\item \link[dplyr:distinct]{distinct} +\item \link[dplyr:explain]{explain} +\item \link[dplyr:filter]{filter} +\item \link[dplyr:mutate-joins]{full_join} +\item \link[dplyr:glimpse]{glimpse} +\item \link[dplyr:group_by]{group_by} +\item \link[dplyr:group_by_drop_default]{group_by_drop_default} +\item \link[dplyr:group_data]{group_vars} +\item \link[dplyr:group_data]{groups} +\item \link[dplyr:mutate-joins]{inner_join} +\item \link[dplyr:mutate-joins]{left_join} +\item \link[dplyr:mutate]{mutate} +\item \link[dplyr:pull]{pull} +\item \link[dplyr:relocate]{relocate} +\item \link[dplyr:rename]{rename} +\item \link[dplyr:rename]{rename_with} +\item \link[dplyr:mutate-joins]{right_join} +\item \link[dplyr:select]{select} +\item \link[dplyr:filter-joins]{semi_join} +\item \link[dplyr:explain]{show_query} +\item \link[dplyr:summarise]{summarise} +\item \link[dplyr:count]{tally} +\item \link[dplyr:mutate]{transmute} +\item \link[dplyr:group_by]{ungroup} +\item \link[dplyr:reexports]{union} +\item \link[dplyr:setops]{union_all} +} +} + +\section{Function mappings}{ In the list below, any differences in behavior or support between Acero and the R function are listed. If no notes follow the function name, then you can assume that the function works in Acero just as it does in R. @@ -61,7 +105,6 @@ as \code{arrow_ascii_is_decimal}. \item \link[base:as.Date]{as.Date} \item \link[base:difftime]{as.difftime} \item \link[base:double]{as.double} -\item \link[base:factor]{as.factor} \item \link[base:integer]{as.integer} \item \link[base:logical]{as.logical} \item \link[base:numeric]{as.numeric} @@ -132,6 +175,7 @@ as \code{arrow_ascii_is_decimal}. 
\subsection{dplyr}{ \itemize{ +\item \link[dplyr:across]{across}: only supported inside \code{mutate()}; purrr-style lambda functions not yet supported \item \link[dplyr:between]{between} \item \link[dplyr:case_when]{case_when} \item \link[dplyr:coalesce]{coalesce} @@ -257,7 +301,7 @@ as \code{arrow_ascii_is_decimal}. \item \link[stringr:str_dup]{str_dup} \item \link[stringr:str_starts]{str_ends} \item \link[stringr:str_length]{str_length} -\item \link[stringr:str_like]{str_like} +\item \code{str_like}: not yet in a released version of \code{stringr}, but it is supported in \code{arrow} \item \link[stringr:str_pad]{str_pad} \item \link[stringr:str_replace]{str_replace} \item \link[stringr:str_replace]{str_replace_all} @@ -276,4 +320,19 @@ as \code{arrow_ascii_is_decimal}. \item \link[tibble:tibble]{tibble} } } + +\subsection{tidyselect}{ +\itemize{ +\item \link[tidyselect:all_of]{all_of} +\item \link[tidyselect:starts_with]{contains} +\item \link[tidyselect:starts_with]{ends_with} +\item \link[tidyselect:everything]{everything} +\item \link[tidyselect:everything]{last_col} +\item \link[tidyselect:starts_with]{matches} +\item \link[tidyselect:starts_with]{num_range} +\item \link[tidyselect:one_of]{one_of} +\item \link[tidyselect:starts_with]{starts_with} +} } +} + diff --git a/r/man/cast.Rd b/r/man/cast.Rd new file mode 100644 index 00000000000..90f0230d210 --- /dev/null +++ b/r/man/cast.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-funcs-type.R +\name{cast} +\alias{cast} +\title{Change the type of an array or column} +\arguments{ +\item{x}{an \code{Expression}} + +\item{target_type}{\link{DataType} to cast to} + +\item{safe}{logical: only allow the type conversion if no data is lost +(truncation, overflow, etc.). 
Default is \code{TRUE}} + +\item{...}{specific \code{CastOptions} to set} +} +\value{ +an \code{Expression} +} +\description{ +The \code{cast()} function only exists inside of \code{arrow} \code{dplyr} queries. Use it +as a more convenient way of changing the type of a value or field inside of +a \code{mutate()} call. To cast an \code{Array} or \code{ChunkedArray} outside of a query, +call the \verb{$cast()} method on the object, which has the same semantics. +} +\examples{ +\dontrun{ +mtcars \%>\% + arrow_table() \%>\% + mutate(cyl = cast(cyl, string())) +} +} +\seealso{ +https://arrow.apache.org/docs/cpp/api/compute.html for the list of +supported CastOptions. +} +\keyword{internal} From c48ff881924df02da7b8595de9573f6e1b0d84e8 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 9 Sep 2022 14:15:50 -0400 Subject: [PATCH 5/7] Add todo jira and fill in usage for new docs --- r/R/arrow-package.R | 2 +- r/R/dplyr-funcs-augmented.R | 2 ++ r/R/dplyr-funcs-type.R | 1 + r/man/add_filename.Rd | 3 +++ r/man/cast.Rd | 3 +++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 46a517c1e22..e6b3f481e21 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -31,7 +31,7 @@ #' @keywords internal "_PACKAGE" -# Include notes about features not supported here. +# TODO(ARROW-17666): Include notes about features not supported here. supported_dplyr_methods <- list( select = NULL, filter = NULL, diff --git a/r/R/dplyr-funcs-augmented.R b/r/R/dplyr-funcs-augmented.R index efb62139368..d1359005e37 100644 --- a/r/R/dplyr-funcs-augmented.R +++ b/r/R/dplyr-funcs-augmented.R @@ -20,6 +20,8 @@ #' This function only exists inside `arrow` `dplyr` queries, and it is only #' valid when querying on a `FileSystemDataset`. #' +#' @usage add_filename() +#' #' @return A `FieldRef` `Expression` that refers to the filename augmented #' column. 
#' @examples diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 3d03476a881..35734ced05d 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -30,6 +30,7 @@ register_bindings_type <- function() { #' a `mutate()` call. To cast an `Array` or `ChunkedArray` outside of a query, #' call the `$cast()` method on the object, which has the same semantics. #' +#' @usage cast(x, target_type, safe = TRUE, ...) #' @param x an `Expression` #' @param target_type [DataType] to cast to #' @param safe logical: only allow the type conversion if no data is lost diff --git a/r/man/add_filename.Rd b/r/man/add_filename.Rd index 56731281ab9..ca7ed0e4b17 100644 --- a/r/man/add_filename.Rd +++ b/r/man/add_filename.Rd @@ -3,6 +3,9 @@ \name{add_filename} \alias{add_filename} \title{Add the data filename as a column} +\usage{ +add_filename() +} \value{ A \code{FieldRef} \code{Expression} that refers to the filename augmented column. diff --git a/r/man/cast.Rd b/r/man/cast.Rd index 90f0230d210..81abfa6567d 100644 --- a/r/man/cast.Rd +++ b/r/man/cast.Rd @@ -3,6 +3,9 @@ \name{cast} \alias{cast} \title{Change the type of an array or column} +\usage{ +cast(x, target_type, safe = TRUE, ...) 
+} \arguments{ \item{x}{an \code{Expression}} From 2ced7b6c3e541bd6b3da34c8e7a58093f170425a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 15 Sep 2022 12:47:17 -0400 Subject: [PATCH 6/7] Rename to acero.Rd and update stuff --- r/R/dplyr-funcs-doc.R | 477 +++++++++++++++++---------------- r/_pkgdown.yml | 2 +- r/data-raw/docgen.R | 27 +- r/man/acero.Rd | 339 +++++++++++++++++++++++ r/man/arrow-dplyr-functions.Rd | 338 ----------------------- 5 files changed, 598 insertions(+), 585 deletions(-) create mode 100644 r/man/acero.Rd delete mode 100644 r/man/arrow-dplyr-functions.Rd diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index d0a6d3dfcc6..cac0310f49b 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -21,7 +21,7 @@ #' #' The `arrow` package contains methods for 32 `dplyr` table functions, many of #' which are "verbs" that do transformations to one or more tables. -#' The package also has mappings of 204 R functions to the corresponding +#' The package also has mappings of 205 R functions to the corresponding #' functions in the Arrow compute library. These allow you to write code inside #' of `dplyr` methods that call R functions, including many in packages like #' `stringr` and `lubridate`, and they will get translated to Arrow and run @@ -36,38 +36,38 @@ #' which returns an `arrow` [Table], or `collect()`, which pulls the resulting #' Table into an R `data.frame`. 
#' -#' * [anti_join][dplyr::anti_join()] -#' * [arrange][dplyr::arrange()] -#' * [collapse][dplyr::collapse()] -#' * [collect][dplyr::collect()] -#' * [compute][dplyr::compute()] -#' * [count][dplyr::count()] -#' * [distinct][dplyr::distinct()] -#' * [explain][dplyr::explain()] -#' * [filter][dplyr::filter()] -#' * [full_join][dplyr::full_join()] -#' * [glimpse][dplyr::glimpse()] -#' * [group_by][dplyr::group_by()] -#' * [group_by_drop_default][dplyr::group_by_drop_default()] -#' * [group_vars][dplyr::group_vars()] -#' * [groups][dplyr::groups()] -#' * [inner_join][dplyr::inner_join()] -#' * [left_join][dplyr::left_join()] -#' * [mutate][dplyr::mutate()] -#' * [pull][dplyr::pull()] -#' * [relocate][dplyr::relocate()] -#' * [rename][dplyr::rename()] -#' * [rename_with][dplyr::rename_with()] -#' * [right_join][dplyr::right_join()] -#' * [select][dplyr::select()] -#' * [semi_join][dplyr::semi_join()] -#' * [show_query][dplyr::show_query()] -#' * [summarise][dplyr::summarise()] -#' * [tally][dplyr::tally()] -#' * [transmute][dplyr::transmute()] -#' * [ungroup][dplyr::ungroup()] -#' * [union][dplyr::union()] -#' * [union_all][dplyr::union_all()] +#' * [`anti_join()`][dplyr::anti_join()] +#' * [`arrange()`][dplyr::arrange()] +#' * [`collapse()`][dplyr::collapse()] +#' * [`collect()`][dplyr::collect()] +#' * [`compute()`][dplyr::compute()] +#' * [`count()`][dplyr::count()] +#' * [`distinct()`][dplyr::distinct()] +#' * [`explain()`][dplyr::explain()] +#' * [`filter()`][dplyr::filter()] +#' * [`full_join()`][dplyr::full_join()] +#' * [`glimpse()`][dplyr::glimpse()] +#' * [`group_by()`][dplyr::group_by()] +#' * [`group_by_drop_default()`][dplyr::group_by_drop_default()] +#' * [`group_vars()`][dplyr::group_vars()] +#' * [`groups()`][dplyr::groups()] +#' * [`inner_join()`][dplyr::inner_join()] +#' * [`left_join()`][dplyr::left_join()] +#' * [`mutate()`][dplyr::mutate()] +#' * [`pull()`][dplyr::pull()] +#' * [`relocate()`][dplyr::relocate()] +#' * [`rename()`][dplyr::rename()] 
+#' * [`rename_with()`][dplyr::rename_with()] +#' * [`right_join()`][dplyr::right_join()] +#' * [`select()`][dplyr::select()] +#' * [`semi_join()`][dplyr::semi_join()] +#' * [`show_query()`][dplyr::show_query()] +#' * [`summarise()`][dplyr::summarise()] +#' * [`tally()`][dplyr::tally()] +#' * [`transmute()`][dplyr::transmute()] +#' * [`ungroup()`][dplyr::ungroup()] +#' * [`union()`][dplyr::union()] +#' * [`union_all()`][dplyr::union_all()] #' #' # Function mappings #' @@ -89,243 +89,244 @@ #' #' ## arrow #' -#' * [add_filename][arrow::add_filename()] -#' * [cast][arrow::cast()] +#' * [`add_filename()`][arrow::add_filename()] +#' * [`cast()`][arrow::cast()] #' #' ## base #' -#' * [-][-()] -#' * [!][!()] -#' * [!=][!=()] -#' * [*][*()] -#' * [/][/()] -#' * [&][&()] -#' * [%/%][%/%()] -#' * [%%][%%()] -#' * [%in%][%in%()] -#' * [^][^()] -#' * [+][+()] -#' * [<][<()] -#' * [<=][<=()] -#' * [==][==()] -#' * [>][>()] -#' * [>=][>=()] -#' * [|][|()] -#' * [abs][base::abs()] -#' * [acos][base::acos()] -#' * [all][base::all()] -#' * [any][base::any()] -#' * [as.character][base::as.character()] -#' * [as.Date][base::as.Date()] -#' * [as.difftime][base::as.difftime()] -#' * [as.double][base::as.double()] -#' * [as.integer][base::as.integer()] -#' * [as.logical][base::as.logical()] -#' * [as.numeric][base::as.numeric()] -#' * [asin][base::asin()] -#' * [ceiling][base::ceiling()] -#' * [cos][base::cos()] -#' * [data.frame][base::data.frame()] -#' * [difftime][base::difftime()] -#' * [endsWith][base::endsWith()] -#' * [exp][base::exp()] -#' * [floor][base::floor()] -#' * [format][base::format()] -#' * [grepl][base::grepl()] -#' * [gsub][base::gsub()] -#' * [ifelse][base::ifelse()] -#' * [is.character][base::is.character()] -#' * [is.double][base::is.double()] -#' * [is.factor][base::is.factor()] -#' * [is.finite][base::is.finite()] -#' * [is.infinite][base::is.infinite()] -#' * [is.integer][base::is.integer()] -#' * [is.list][base::is.list()] -#' * 
[is.logical][base::is.logical()] -#' * [is.na][base::is.na()] -#' * [is.nan][base::is.nan()] -#' * [is.numeric][base::is.numeric()] -#' * [ISOdate][base::ISOdate()] -#' * [ISOdatetime][base::ISOdatetime()] -#' * [log][base::log()] -#' * [log10][base::log10()] -#' * [log1p][base::log1p()] -#' * [log2][base::log2()] -#' * [logb][base::logb()] -#' * [max][base::max()] -#' * [mean][base::mean()] -#' * [min][base::min()] -#' * [nchar][base::nchar()] -#' * [paste][base::paste()]: the `collapse` argument is not yet supported -#' * [paste0][base::paste0()]: the `collapse` argument is not yet supported -#' * [pmax][base::pmax()] -#' * [pmin][base::pmin()] -#' * [round][base::round()] -#' * [sign][base::sign()] -#' * [sin][base::sin()] -#' * [sqrt][base::sqrt()] -#' * [startsWith][base::startsWith()] -#' * [strftime][base::strftime()] -#' * [strptime][base::strptime()] -#' * [strrep][base::strrep()] -#' * [strsplit][base::strsplit()] -#' * [sub][base::sub()] -#' * [substr][base::substr()] -#' * [substring][base::substring()] -#' * [sum][base::sum()] -#' * [tan][base::tan()] -#' * [tolower][base::tolower()] -#' * [toupper][base::toupper()] -#' * [trunc][base::trunc()] +#' * [`-`][-()] +#' * [`!`][!()] +#' * [`!=`][!=()] +#' * [`*`][*()] +#' * [`/`][/()] +#' * [`&`][&()] +#' * [`%/%`][%/%()] +#' * [`%%`][%%()] +#' * [`%in%`][%in%()] +#' * [`^`][^()] +#' * [`+`][+()] +#' * [`<`][<()] +#' * [`<=`][<=()] +#' * [`==`][==()] +#' * [`>`][>()] +#' * [`>=`][>=()] +#' * [`|`][|()] +#' * [`abs()`][base::abs()] +#' * [`acos()`][base::acos()] +#' * [`all()`][base::all()] +#' * [`any()`][base::any()] +#' * [`as.character()`][base::as.character()] +#' * [`as.Date()`][base::as.Date()] +#' * [`as.difftime()`][base::as.difftime()] +#' * [`as.double()`][base::as.double()] +#' * [`as.integer()`][base::as.integer()] +#' * [`as.logical()`][base::as.logical()] +#' * [`as.numeric()`][base::as.numeric()] +#' * [`asin()`][base::asin()] +#' * [`ceiling()`][base::ceiling()] +#' * [`cos()`][base::cos()] 
+#' * [`data.frame()`][base::data.frame()] +#' * [`difftime()`][base::difftime()] +#' * [`endsWith()`][base::endsWith()] +#' * [`exp()`][base::exp()] +#' * [`floor()`][base::floor()] +#' * [`format()`][base::format()] +#' * [`grepl()`][base::grepl()] +#' * [`gsub()`][base::gsub()] +#' * [`ifelse()`][base::ifelse()] +#' * [`is.character()`][base::is.character()] +#' * [`is.double()`][base::is.double()] +#' * [`is.factor()`][base::is.factor()] +#' * [`is.finite()`][base::is.finite()] +#' * [`is.infinite()`][base::is.infinite()] +#' * [`is.integer()`][base::is.integer()] +#' * [`is.list()`][base::is.list()] +#' * [`is.logical()`][base::is.logical()] +#' * [`is.na()`][base::is.na()] +#' * [`is.nan()`][base::is.nan()] +#' * [`is.numeric()`][base::is.numeric()] +#' * [`ISOdate()`][base::ISOdate()] +#' * [`ISOdatetime()`][base::ISOdatetime()] +#' * [`log()`][base::log()] +#' * [`log10()`][base::log10()] +#' * [`log1p()`][base::log1p()] +#' * [`log2()`][base::log2()] +#' * [`logb()`][base::logb()] +#' * [`max()`][base::max()] +#' * [`mean()`][base::mean()] +#' * [`min()`][base::min()] +#' * [`nchar()`][base::nchar()] +#' * [`paste()`][base::paste()]: the `collapse` argument is not yet supported +#' * [`paste0()`][base::paste0()]: the `collapse` argument is not yet supported +#' * [`pmax()`][base::pmax()] +#' * [`pmin()`][base::pmin()] +#' * [`round()`][base::round()] +#' * [`sign()`][base::sign()] +#' * [`sin()`][base::sin()] +#' * [`sqrt()`][base::sqrt()] +#' * [`startsWith()`][base::startsWith()] +#' * [`strftime()`][base::strftime()] +#' * [`strptime()`][base::strptime()] +#' * [`strrep()`][base::strrep()] +#' * [`strsplit()`][base::strsplit()] +#' * [`sub()`][base::sub()] +#' * [`substr()`][base::substr()] +#' * [`substring()`][base::substring()] +#' * [`sum()`][base::sum()] +#' * [`tan()`][base::tan()] +#' * [`tolower()`][base::tolower()] +#' * [`toupper()`][base::toupper()] +#' * [`trunc()`][base::trunc()] #' #' ## bit64 #' -#' * [as.integer64][bit64::as.integer64()] 
-#' * [is.integer64][bit64::is.integer64()] +#' * [`as.integer64()`][bit64::as.integer64()] +#' * [`is.integer64()`][bit64::is.integer64()] #' #' ## dplyr #' -#' * [across][dplyr::across()]: only supported inside `mutate()`; purrr-style lambda functions not yet supported -#' * [between][dplyr::between()] -#' * [case_when][dplyr::case_when()] -#' * [coalesce][dplyr::coalesce()] -#' * [if_else][dplyr::if_else()] -#' * [n][dplyr::n()] -#' * [n_distinct][dplyr::n_distinct()] +#' * [`across()`][dplyr::across()]: only supported inside `mutate()`, `summarize()`, and `arrange()`; purrr-style lambda functions and use of `where()` selection helper not yet supported +#' * [`between()`][dplyr::between()] +#' * [`case_when()`][dplyr::case_when()] +#' * [`coalesce()`][dplyr::coalesce()] +#' * [`desc()`][dplyr::desc()] +#' * [`if_else()`][dplyr::if_else()] +#' * [`n()`][dplyr::n()] +#' * [`n_distinct()`][dplyr::n_distinct()] #' #' ## lubridate #' -#' * [am][lubridate::am()] -#' * [as_date][lubridate::as_date()] -#' * [as_datetime][lubridate::as_datetime()] -#' * [ceiling_date][lubridate::ceiling_date()] -#' * [date][lubridate::date()] -#' * [date_decimal][lubridate::date_decimal()] -#' * [day][lubridate::day()] -#' * [ddays][lubridate::ddays()] -#' * [decimal_date][lubridate::decimal_date()] -#' * [dhours][lubridate::dhours()] -#' * [dmicroseconds][lubridate::dmicroseconds()] -#' * [dmilliseconds][lubridate::dmilliseconds()] -#' * [dminutes][lubridate::dminutes()] -#' * [dmonths][lubridate::dmonths()] -#' * [dmy][lubridate::dmy()] -#' * [dmy_h][lubridate::dmy_h()] -#' * [dmy_hm][lubridate::dmy_hm()] -#' * [dmy_hms][lubridate::dmy_hms()] -#' * [dnanoseconds][lubridate::dnanoseconds()] -#' * [dpicoseconds][lubridate::dpicoseconds()] -#' * [dseconds][lubridate::dseconds()] -#' * [dst][lubridate::dst()] -#' * [dweeks][lubridate::dweeks()] -#' * [dyears][lubridate::dyears()] -#' * [dym][lubridate::dym()] -#' * [epiweek][lubridate::epiweek()] -#' * [epiyear][lubridate::epiyear()] -#' * 
[fast_strptime][lubridate::fast_strptime()] -#' * [floor_date][lubridate::floor_date()] -#' * [format_ISO8601][lubridate::format_ISO8601()] -#' * [hour][lubridate::hour()] -#' * [is.Date][lubridate::is.Date()] -#' * [is.instant][lubridate::is.instant()] -#' * [is.POSIXct][lubridate::is.POSIXct()] -#' * [is.timepoint][lubridate::is.timepoint()] -#' * [isoweek][lubridate::isoweek()] -#' * [isoyear][lubridate::isoyear()] -#' * [leap_year][lubridate::leap_year()] -#' * [make_date][lubridate::make_date()] -#' * [make_datetime][lubridate::make_datetime()] -#' * [make_difftime][lubridate::make_difftime()] -#' * [mday][lubridate::mday()] -#' * [mdy][lubridate::mdy()] -#' * [mdy_h][lubridate::mdy_h()] -#' * [mdy_hm][lubridate::mdy_hm()] -#' * [mdy_hms][lubridate::mdy_hms()] -#' * [minute][lubridate::minute()] -#' * [month][lubridate::month()] -#' * [my][lubridate::my()] -#' * [myd][lubridate::myd()] -#' * [parse_date_time][lubridate::parse_date_time()] -#' * [pm][lubridate::pm()] -#' * [qday][lubridate::qday()] -#' * [quarter][lubridate::quarter()] -#' * [round_date][lubridate::round_date()] -#' * [second][lubridate::second()] -#' * [semester][lubridate::semester()] -#' * [tz][lubridate::tz()] -#' * [wday][lubridate::wday()] -#' * [week][lubridate::week()] -#' * [yday][lubridate::yday()] -#' * [ydm][lubridate::ydm()] -#' * [ydm_h][lubridate::ydm_h()] -#' * [ydm_hm][lubridate::ydm_hm()] -#' * [ydm_hms][lubridate::ydm_hms()] -#' * [year][lubridate::year()] -#' * [ym][lubridate::ym()] -#' * [ymd][lubridate::ymd()] -#' * [ymd_h][lubridate::ymd_h()] -#' * [ymd_hm][lubridate::ymd_hm()] -#' * [ymd_hms][lubridate::ymd_hms()] -#' * [yq][lubridate::yq()] +#' * [`am()`][lubridate::am()] +#' * [`as_date()`][lubridate::as_date()] +#' * [`as_datetime()`][lubridate::as_datetime()] +#' * [`ceiling_date()`][lubridate::ceiling_date()] +#' * [`date()`][lubridate::date()] +#' * [`date_decimal()`][lubridate::date_decimal()] +#' * [`day()`][lubridate::day()] +#' * [`ddays()`][lubridate::ddays()] 
+#' * [`decimal_date()`][lubridate::decimal_date()] +#' * [`dhours()`][lubridate::dhours()] +#' * [`dmicroseconds()`][lubridate::dmicroseconds()] +#' * [`dmilliseconds()`][lubridate::dmilliseconds()] +#' * [`dminutes()`][lubridate::dminutes()] +#' * [`dmonths()`][lubridate::dmonths()] +#' * [`dmy()`][lubridate::dmy()] +#' * [`dmy_h()`][lubridate::dmy_h()] +#' * [`dmy_hm()`][lubridate::dmy_hm()] +#' * [`dmy_hms()`][lubridate::dmy_hms()] +#' * [`dnanoseconds()`][lubridate::dnanoseconds()] +#' * [`dpicoseconds()`][lubridate::dpicoseconds()] +#' * [`dseconds()`][lubridate::dseconds()] +#' * [`dst()`][lubridate::dst()] +#' * [`dweeks()`][lubridate::dweeks()] +#' * [`dyears()`][lubridate::dyears()] +#' * [`dym()`][lubridate::dym()] +#' * [`epiweek()`][lubridate::epiweek()] +#' * [`epiyear()`][lubridate::epiyear()] +#' * [`fast_strptime()`][lubridate::fast_strptime()] +#' * [`floor_date()`][lubridate::floor_date()] +#' * [`format_ISO8601()`][lubridate::format_ISO8601()] +#' * [`hour()`][lubridate::hour()] +#' * [`is.Date()`][lubridate::is.Date()] +#' * [`is.instant()`][lubridate::is.instant()] +#' * [`is.POSIXct()`][lubridate::is.POSIXct()] +#' * [`is.timepoint()`][lubridate::is.timepoint()] +#' * [`isoweek()`][lubridate::isoweek()] +#' * [`isoyear()`][lubridate::isoyear()] +#' * [`leap_year()`][lubridate::leap_year()] +#' * [`make_date()`][lubridate::make_date()] +#' * [`make_datetime()`][lubridate::make_datetime()] +#' * [`make_difftime()`][lubridate::make_difftime()] +#' * [`mday()`][lubridate::mday()] +#' * [`mdy()`][lubridate::mdy()] +#' * [`mdy_h()`][lubridate::mdy_h()] +#' * [`mdy_hm()`][lubridate::mdy_hm()] +#' * [`mdy_hms()`][lubridate::mdy_hms()] +#' * [`minute()`][lubridate::minute()] +#' * [`month()`][lubridate::month()] +#' * [`my()`][lubridate::my()] +#' * [`myd()`][lubridate::myd()] +#' * [`parse_date_time()`][lubridate::parse_date_time()] +#' * [`pm()`][lubridate::pm()] +#' * [`qday()`][lubridate::qday()] +#' * [`quarter()`][lubridate::quarter()] +#' * 
[`round_date()`][lubridate::round_date()] +#' * [`second()`][lubridate::second()] +#' * [`semester()`][lubridate::semester()] +#' * [`tz()`][lubridate::tz()] +#' * [`wday()`][lubridate::wday()] +#' * [`week()`][lubridate::week()] +#' * [`yday()`][lubridate::yday()] +#' * [`ydm()`][lubridate::ydm()] +#' * [`ydm_h()`][lubridate::ydm_h()] +#' * [`ydm_hm()`][lubridate::ydm_hm()] +#' * [`ydm_hms()`][lubridate::ydm_hms()] +#' * [`year()`][lubridate::year()] +#' * [`ym()`][lubridate::ym()] +#' * [`ymd()`][lubridate::ymd()] +#' * [`ymd_h()`][lubridate::ymd_h()] +#' * [`ymd_hm()`][lubridate::ymd_hm()] +#' * [`ymd_hms()`][lubridate::ymd_hms()] +#' * [`yq()`][lubridate::yq()] #' #' ## methods #' -#' * [is][methods::is()] +#' * [`is()`][methods::is()] #' #' ## rlang #' -#' * [is_character][rlang::is_character()] -#' * [is_double][rlang::is_double()] -#' * [is_integer][rlang::is_integer()] -#' * [is_list][rlang::is_list()] -#' * [is_logical][rlang::is_logical()] +#' * [`is_character()`][rlang::is_character()] +#' * [`is_double()`][rlang::is_double()] +#' * [`is_integer()`][rlang::is_integer()] +#' * [`is_list()`][rlang::is_list()] +#' * [`is_logical()`][rlang::is_logical()] #' #' ## stats #' -#' * [median][stats::median()] -#' * [quantile][stats::quantile()] -#' * [sd][stats::sd()] -#' * [var][stats::var()] +#' * [`median()`][stats::median()] +#' * [`quantile()`][stats::quantile()] +#' * [`sd()`][stats::sd()] +#' * [`var()`][stats::var()] #' #' ## stringi #' -#' * [stri_reverse][stringi::stri_reverse()] +#' * [`stri_reverse()`][stringi::stri_reverse()] #' #' ## stringr #' -#' * [str_c][stringr::str_c()]: the `collapse` argument is not yet supported -#' * [str_count][stringr::str_count()] -#' * [str_detect][stringr::str_detect()] -#' * [str_dup][stringr::str_dup()] -#' * [str_ends][stringr::str_ends()] -#' * [str_length][stringr::str_length()] -#' * `str_like`: not yet in a released version of `stringr`, but it is supported in `arrow` -#' * [str_pad][stringr::str_pad()] -#' * 
[str_replace][stringr::str_replace()] -#' * [str_replace_all][stringr::str_replace_all()] -#' * [str_split][stringr::str_split()] -#' * [str_starts][stringr::str_starts()] -#' * [str_sub][stringr::str_sub()] -#' * [str_to_lower][stringr::str_to_lower()] -#' * [str_to_title][stringr::str_to_title()] -#' * [str_to_upper][stringr::str_to_upper()] -#' * [str_trim][stringr::str_trim()] +#' * [`str_c()`][stringr::str_c()]: the `collapse` argument is not yet supported +#' * [`str_count()`][stringr::str_count()] +#' * [`str_detect()`][stringr::str_detect()] +#' * [`str_dup()`][stringr::str_dup()] +#' * [`str_ends()`][stringr::str_ends()] +#' * [`str_length()`][stringr::str_length()] +#' * `str_like()`: not yet in a released version of `stringr`, but it is supported in `arrow` +#' * [`str_pad()`][stringr::str_pad()] +#' * [`str_replace()`][stringr::str_replace()] +#' * [`str_replace_all()`][stringr::str_replace_all()] +#' * [`str_split()`][stringr::str_split()] +#' * [`str_starts()`][stringr::str_starts()] +#' * [`str_sub()`][stringr::str_sub()] +#' * [`str_to_lower()`][stringr::str_to_lower()] +#' * [`str_to_title()`][stringr::str_to_title()] +#' * [`str_to_upper()`][stringr::str_to_upper()] +#' * [`str_trim()`][stringr::str_trim()] #' #' ## tibble #' -#' * [tibble][tibble::tibble()] +#' * [`tibble()`][tibble::tibble()] #' #' ## tidyselect #' -#' * [all_of][tidyselect::all_of()] -#' * [contains][tidyselect::contains()] -#' * [ends_with][tidyselect::ends_with()] -#' * [everything][tidyselect::everything()] -#' * [last_col][tidyselect::last_col()] -#' * [matches][tidyselect::matches()] -#' * [num_range][tidyselect::num_range()] -#' * [one_of][tidyselect::one_of()] -#' * [starts_with][tidyselect::starts_with()] +#' * [`all_of()`][tidyselect::all_of()] +#' * [`contains()`][tidyselect::contains()] +#' * [`ends_with()`][tidyselect::ends_with()] +#' * [`everything()`][tidyselect::everything()] +#' * [`last_col()`][tidyselect::last_col()] +#' * [`matches()`][tidyselect::matches()] 
+#' * [`num_range()`][tidyselect::num_range()] +#' * [`one_of()`][tidyselect::one_of()] +#' * [`starts_with()`][tidyselect::starts_with()] #' -#' @name arrow-dplyr-functions +#' @name acero NULL diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 6c23305a8f3..70bd7ac518c 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -216,7 +216,7 @@ reference: - codec_is_available - title: Computation contents: - - arrow-dplyr-functions + - acero - call_function - match_arrow - value_counts diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R index 5d767f27041..ef39bec272f 100644 --- a/r/data-raw/docgen.R +++ b/r/data-raw/docgen.R @@ -78,7 +78,7 @@ file_template <- "# Licensed to the Apache Software Foundation (ASF) under one #' %s #' -#' @name arrow-dplyr-functions +#' @name acero NULL" library(dplyr) @@ -91,14 +91,22 @@ do_not_link <- c( # Vectorized function to make entries for each function render_fun <- function(fun, pkg_fun, notes) { + # Append () to names that start with a letter; operators are left bare + not_operators <- grepl("^[[:alpha:]]", fun) + fun[not_operators] <- paste0(fun[not_operators], "()") + # Backtick-quote the name so it renders as \code{} + fun <- paste0("`", fun, "`") + # Link the name to pkg_fun unless pkg_fun is listed in do_not_link out <- ifelse( pkg_fun %in% do_not_link, - paste0("* `", fun, "`"), - paste0("* [", fun, "][", pkg_fun, "()]") + fun, + paste0("[", fun, "][", pkg_fun, "()]") ) + # Append the notes after ": " for entries whose notes are non-empty has_notes <- nzchar(notes) out[has_notes] <- paste0(out[has_notes], ": ", notes[has_notes]) - out + # Prefix every entry with a markdown bullet + paste("*", out) } # This renders a bulleted list under a package heading @@ -121,17 +129,20 @@ docs <- arrow:::.cache$docs # across() is handled by manipulating the quosures, not by nse_funcs docs[["dplyr::across"]] <- c( - "only supported inside `mutate()`;", # TODO(ARROW-17362, ARROW-17387) - "purrr-style lambda functions not yet supported" # TODO(ARROW-17366) + # TODO(ARROW-17387, ARROW-17389, ARROW-17390) + "only supported inside `mutate()`, `summarize()`, and `arrange()`;", + # TODO(ARROW-17366) +
"purrr-style lambda functions", + "and use of `where()` selection helper not yet supported" ) +# desc() is a special helper handled inside of arrange() +docs[["dplyr::desc"]] <- character(0) # add tidyselect helpers by parsing the reexports file tidyselect <- grep("^tidyselect::", readLines("R/reexports-tidyselect.R"), value = TRUE) docs <- c(docs, setNames(rep(list(NULL), length(tidyselect)), tidyselect)) -# TODO: add doc pages for add_filename() and cast() - fun_df <- tibble::tibble( pkg_fun = names(docs), notes = docs diff --git a/r/man/acero.Rd b/r/man/acero.Rd new file mode 100644 index 00000000000..5b5920f386e --- /dev/null +++ b/r/man/acero.Rd @@ -0,0 +1,339 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-funcs-doc.R +\name{acero} +\alias{acero} +\title{Functions available in Arrow dplyr queries} +\description{ +The \code{arrow} package contains methods for 32 \code{dplyr} table functions, many of +which are "verbs" that do transformations to one or more tables. +The package also has mappings of 205 R functions to the corresponding +functions in the Arrow compute library. These allow you to write code inside +of \code{dplyr} methods that call R functions, including many in packages like +\code{stringr} and \code{lubridate}, and they will get translated to Arrow and run +on the Arrow query engine (Acero). This document lists all of the mapped +functions. +} +\section{\code{dplyr} verbs}{ +Most verb functions return an \code{arrow_dplyr_query} object, similar in spirit +to a \code{dbplyr::tbl_lazy}. This means that the verbs do not eagerly evaluate +the query on the data. To run the query, call either \code{compute()}, +which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting +Table into an R \code{data.frame}. 
+\itemize{ +\item \code{\link[dplyr:filter-joins]{anti_join()}} +\item \code{\link[dplyr:arrange]{arrange()}} +\item \code{\link[dplyr:compute]{collapse()}} +\item \code{\link[dplyr:compute]{collect()}} +\item \code{\link[dplyr:compute]{compute()}} +\item \code{\link[dplyr:count]{count()}} +\item \code{\link[dplyr:distinct]{distinct()}} +\item \code{\link[dplyr:explain]{explain()}} +\item \code{\link[dplyr:filter]{filter()}} +\item \code{\link[dplyr:mutate-joins]{full_join()}} +\item \code{\link[dplyr:glimpse]{glimpse()}} +\item \code{\link[dplyr:group_by]{group_by()}} +\item \code{\link[dplyr:group_by_drop_default]{group_by_drop_default()}} +\item \code{\link[dplyr:group_data]{group_vars()}} +\item \code{\link[dplyr:group_data]{groups()}} +\item \code{\link[dplyr:mutate-joins]{inner_join()}} +\item \code{\link[dplyr:mutate-joins]{left_join()}} +\item \code{\link[dplyr:mutate]{mutate()}} +\item \code{\link[dplyr:pull]{pull()}} +\item \code{\link[dplyr:relocate]{relocate()}} +\item \code{\link[dplyr:rename]{rename()}} +\item \code{\link[dplyr:rename]{rename_with()}} +\item \code{\link[dplyr:mutate-joins]{right_join()}} +\item \code{\link[dplyr:select]{select()}} +\item \code{\link[dplyr:filter-joins]{semi_join()}} +\item \code{\link[dplyr:explain]{show_query()}} +\item \code{\link[dplyr:summarise]{summarise()}} +\item \code{\link[dplyr:count]{tally()}} +\item \code{\link[dplyr:mutate]{transmute()}} +\item \code{\link[dplyr:group_by]{ungroup()}} +\item \code{\link[dplyr:reexports]{union()}} +\item \code{\link[dplyr:setops]{union_all()}} +} +} + +\section{Function mappings}{ +In the list below, any differences in behavior or support between Acero and +the R function are listed. If no notes follow the function name, then you +can assume that the function works in Acero just as it does in R. + +Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both +\code{str_sub()} and \code{stringr::str_sub()} work. 
+ +In addition to these functions, you can call any of Arrow's 243 compute +functions directly. Arrow has many functions that don't map to an existing R +function. In other cases where there is an R function mapping, you can still +call the Arrow function directly if you don't want the adaptations that the R +mapping has that make Acero behave like R. These functions are listed in the +\href{https://arrow.apache.org/docs/cpp/compute.html}{C++ documentation}, and +in the function registry in R, they are named with an \code{arrow_} prefix, such +as \code{arrow_ascii_is_decimal}. +\subsection{arrow}{ +\itemize{ +\item \code{\link[=add_filename]{add_filename()}} +\item \code{\link[=cast]{cast()}} +} +} + +\subsection{base}{ +\itemize{ +\item \code{\link[=-]{-}} +\item \code{\link[=!]{!}} +\item \code{\link[=!=]{!=}} +\item \code{\link[=*]{*}} +\item \code{\link[=/]{/}} +\item \code{\link[=&]{&}} +\item \code{\link[=\%/\%]{\%/\%}} +\item \code{\link[=\%\%]{\%\%}} +\item \code{\link[=\%in\%]{\%in\%}} +\item \code{\link[=^]{^}} +\item \code{\link[=+]{+}} +\item \code{\link[=<]{<}} +\item \code{\link[=<=]{<=}} +\item \code{\link[===]{==}} +\item \code{\link[=>]{>}} +\item \code{\link[=>=]{>=}} +\item \code{\link[=|]{|}} +\item \code{\link[base:MathFun]{abs()}} +\item \code{\link[base:Trig]{acos()}} +\item \code{\link[base:all]{all()}} +\item \code{\link[base:any]{any()}} +\item \code{\link[base:character]{as.character()}} +\item \code{\link[base:as.Date]{as.Date()}} +\item \code{\link[base:difftime]{as.difftime()}} +\item \code{\link[base:double]{as.double()}} +\item \code{\link[base:integer]{as.integer()}} +\item \code{\link[base:logical]{as.logical()}} +\item \code{\link[base:numeric]{as.numeric()}} +\item \code{\link[base:Trig]{asin()}} +\item \code{\link[base:Round]{ceiling()}} +\item \code{\link[base:Trig]{cos()}} +\item \code{\link[base:data.frame]{data.frame()}} +\item \code{\link[base:difftime]{difftime()}} +\item \code{\link[base:startsWith]{endsWith()}} +\item 
\code{\link[base:Log]{exp()}} +\item \code{\link[base:Round]{floor()}} +\item \code{\link[base:format]{format()}} +\item \code{\link[base:grep]{grepl()}} +\item \code{\link[base:grep]{gsub()}} +\item \code{\link[base:ifelse]{ifelse()}} +\item \code{\link[base:character]{is.character()}} +\item \code{\link[base:double]{is.double()}} +\item \code{\link[base:factor]{is.factor()}} +\item \code{\link[base:is.finite]{is.finite()}} +\item \code{\link[base:is.finite]{is.infinite()}} +\item \code{\link[base:integer]{is.integer()}} +\item \code{\link[base:list]{is.list()}} +\item \code{\link[base:logical]{is.logical()}} +\item \code{\link[base:NA]{is.na()}} +\item \code{\link[base:is.finite]{is.nan()}} +\item \code{\link[base:numeric]{is.numeric()}} +\item \code{\link[base:ISOdatetime]{ISOdate()}} +\item \code{\link[base:ISOdatetime]{ISOdatetime()}} +\item \code{\link[base:Log]{log()}} +\item \code{\link[base:Log]{log10()}} +\item \code{\link[base:Log]{log1p()}} +\item \code{\link[base:Log]{log2()}} +\item \code{\link[base:Log]{logb()}} +\item \code{\link[base:Extremes]{max()}} +\item \code{\link[base:mean]{mean()}} +\item \code{\link[base:Extremes]{min()}} +\item \code{\link[base:nchar]{nchar()}} +\item \code{\link[base:paste]{paste()}}: the \code{collapse} argument is not yet supported +\item \code{\link[base:paste]{paste0()}}: the \code{collapse} argument is not yet supported +\item \code{\link[base:Extremes]{pmax()}} +\item \code{\link[base:Extremes]{pmin()}} +\item \code{\link[base:Round]{round()}} +\item \code{\link[base:sign]{sign()}} +\item \code{\link[base:Trig]{sin()}} +\item \code{\link[base:MathFun]{sqrt()}} +\item \code{\link[base:startsWith]{startsWith()}} +\item \code{\link[base:strptime]{strftime()}} +\item \code{\link[base:strptime]{strptime()}} +\item \code{\link[base:strrep]{strrep()}} +\item \code{\link[base:strsplit]{strsplit()}} +\item \code{\link[base:grep]{sub()}} +\item \code{\link[base:substr]{substr()}} +\item \code{\link[base:substr]{substring()}} 
+\item \code{\link[base:sum]{sum()}} +\item \code{\link[base:Trig]{tan()}} +\item \code{\link[base:chartr]{tolower()}} +\item \code{\link[base:chartr]{toupper()}} +\item \code{\link[base:Round]{trunc()}} +} +} + +\subsection{bit64}{ +\itemize{ +\item \code{\link[bit64:as.integer64.character]{as.integer64()}} +\item \code{\link[bit64:bit64-package]{is.integer64()}} +} +} + +\subsection{dplyr}{ +\itemize{ +\item \code{\link[dplyr:across]{across()}}: only supported inside \code{mutate()}, \code{summarize()}, and \code{arrange()}; purrr-style lambda functions and use of \code{where()} selection helper not yet supported +\item \code{\link[dplyr:between]{between()}} +\item \code{\link[dplyr:case_when]{case_when()}} +\item \code{\link[dplyr:coalesce]{coalesce()}} +\item \code{\link[dplyr:desc]{desc()}} +\item \code{\link[dplyr:if_else]{if_else()}} +\item \code{\link[dplyr:context]{n()}} +\item \code{\link[dplyr:n_distinct]{n_distinct()}} +} +} + +\subsection{lubridate}{ +\itemize{ +\item \code{\link[lubridate:am]{am()}} +\item \code{\link[lubridate:as_date]{as_date()}} +\item \code{\link[lubridate:as_date]{as_datetime()}} +\item \code{\link[lubridate:round_date]{ceiling_date()}} +\item \code{\link[lubridate:date]{date()}} +\item \code{\link[lubridate:date_decimal]{date_decimal()}} +\item \code{\link[lubridate:day]{day()}} +\item \code{\link[lubridate:duration]{ddays()}} +\item \code{\link[lubridate:decimal_date]{decimal_date()}} +\item \code{\link[lubridate:duration]{dhours()}} +\item \code{\link[lubridate:duration]{dmicroseconds()}} +\item \code{\link[lubridate:duration]{dmilliseconds()}} +\item \code{\link[lubridate:duration]{dminutes()}} +\item \code{\link[lubridate:duration]{dmonths()}} +\item \code{\link[lubridate:ymd]{dmy()}} +\item \code{\link[lubridate:ymd_hms]{dmy_h()}} +\item \code{\link[lubridate:ymd_hms]{dmy_hm()}} +\item \code{\link[lubridate:ymd_hms]{dmy_hms()}} +\item \code{\link[lubridate:duration]{dnanoseconds()}} +\item 
\code{\link[lubridate:duration]{dpicoseconds()}} +\item \code{\link[lubridate:duration]{dseconds()}} +\item \code{\link[lubridate:dst]{dst()}} +\item \code{\link[lubridate:duration]{dweeks()}} +\item \code{\link[lubridate:duration]{dyears()}} +\item \code{\link[lubridate:ymd]{dym()}} +\item \code{\link[lubridate:week]{epiweek()}} +\item \code{\link[lubridate:year]{epiyear()}} +\item \code{\link[lubridate:parse_date_time]{fast_strptime()}} +\item \code{\link[lubridate:round_date]{floor_date()}} +\item \code{\link[lubridate:format_ISO8601]{format_ISO8601()}} +\item \code{\link[lubridate:hour]{hour()}} +\item \code{\link[lubridate:date_utils]{is.Date()}} +\item \code{\link[lubridate:is.instant]{is.instant()}} +\item \code{\link[lubridate:posix_utils]{is.POSIXct()}} +\item \code{\link[lubridate:is.instant]{is.timepoint()}} +\item \code{\link[lubridate:week]{isoweek()}} +\item \code{\link[lubridate:year]{isoyear()}} +\item \code{\link[lubridate:leap_year]{leap_year()}} +\item \code{\link[lubridate:make_datetime]{make_date()}} +\item \code{\link[lubridate:make_datetime]{make_datetime()}} +\item \code{\link[lubridate:make_difftime]{make_difftime()}} +\item \code{\link[lubridate:day]{mday()}} +\item \code{\link[lubridate:ymd]{mdy()}} +\item \code{\link[lubridate:ymd_hms]{mdy_h()}} +\item \code{\link[lubridate:ymd_hms]{mdy_hm()}} +\item \code{\link[lubridate:ymd_hms]{mdy_hms()}} +\item \code{\link[lubridate:minute]{minute()}} +\item \code{\link[lubridate:month]{month()}} +\item \code{\link[lubridate:ymd]{my()}} +\item \code{\link[lubridate:ymd]{myd()}} +\item \code{\link[lubridate:parse_date_time]{parse_date_time()}} +\item \code{\link[lubridate:am]{pm()}} +\item \code{\link[lubridate:day]{qday()}} +\item \code{\link[lubridate:quarter]{quarter()}} +\item \code{\link[lubridate:round_date]{round_date()}} +\item \code{\link[lubridate:second]{second()}} +\item \code{\link[lubridate:quarter]{semester()}} +\item \code{\link[lubridate:tz]{tz()}} +\item 
\code{\link[lubridate:day]{wday()}} +\item \code{\link[lubridate:week]{week()}} +\item \code{\link[lubridate:day]{yday()}} +\item \code{\link[lubridate:ymd]{ydm()}} +\item \code{\link[lubridate:ymd_hms]{ydm_h()}} +\item \code{\link[lubridate:ymd_hms]{ydm_hm()}} +\item \code{\link[lubridate:ymd_hms]{ydm_hms()}} +\item \code{\link[lubridate:year]{year()}} +\item \code{\link[lubridate:ymd]{ym()}} +\item \code{\link[lubridate:ymd]{ymd()}} +\item \code{\link[lubridate:ymd_hms]{ymd_h()}} +\item \code{\link[lubridate:ymd_hms]{ymd_hm()}} +\item \code{\link[lubridate:ymd_hms]{ymd_hms()}} +\item \code{\link[lubridate:ymd]{yq()}} +} +} + +\subsection{methods}{ +\itemize{ +\item \code{\link[methods:is]{is()}} +} +} + +\subsection{rlang}{ +\itemize{ +\item \code{\link[rlang:type-predicates]{is_character()}} +\item \code{\link[rlang:type-predicates]{is_double()}} +\item \code{\link[rlang:type-predicates]{is_integer()}} +\item \code{\link[rlang:type-predicates]{is_list()}} +\item \code{\link[rlang:type-predicates]{is_logical()}} +} +} + +\subsection{stats}{ +\itemize{ +\item \code{\link[stats:median]{median()}} +\item \code{\link[stats:quantile]{quantile()}} +\item \code{\link[stats:sd]{sd()}} +\item \code{\link[stats:cor]{var()}} +} +} + +\subsection{stringi}{ +\itemize{ +\item \code{\link[stringi:stri_reverse]{stri_reverse()}} +} +} + +\subsection{stringr}{ +\itemize{ +\item \code{\link[stringr:str_c]{str_c()}}: the \code{collapse} argument is not yet supported +\item \code{\link[stringr:str_count]{str_count()}} +\item \code{\link[stringr:str_detect]{str_detect()}} +\item \code{\link[stringr:str_dup]{str_dup()}} +\item \code{\link[stringr:str_starts]{str_ends()}} +\item \code{\link[stringr:str_length]{str_length()}} +\item \code{str_like()}: not yet in a released version of \code{stringr}, but it is supported in \code{arrow} +\item \code{\link[stringr:str_pad]{str_pad()}} +\item \code{\link[stringr:str_replace]{str_replace()}} +\item 
\code{\link[stringr:str_replace]{str_replace_all()}} +\item \code{\link[stringr:str_split]{str_split()}} +\item \code{\link[stringr:str_starts]{str_starts()}} +\item \code{\link[stringr:str_sub]{str_sub()}} +\item \code{\link[stringr:case]{str_to_lower()}} +\item \code{\link[stringr:case]{str_to_title()}} +\item \code{\link[stringr:case]{str_to_upper()}} +\item \code{\link[stringr:str_trim]{str_trim()}} +} +} + +\subsection{tibble}{ +\itemize{ +\item \code{\link[tibble:tibble]{tibble()}} +} +} + +\subsection{tidyselect}{ +\itemize{ +\item \code{\link[tidyselect:all_of]{all_of()}} +\item \code{\link[tidyselect:starts_with]{contains()}} +\item \code{\link[tidyselect:starts_with]{ends_with()}} +\item \code{\link[tidyselect:everything]{everything()}} +\item \code{\link[tidyselect:everything]{last_col()}} +\item \code{\link[tidyselect:starts_with]{matches()}} +\item \code{\link[tidyselect:starts_with]{num_range()}} +\item \code{\link[tidyselect:one_of]{one_of()}} +\item \code{\link[tidyselect:starts_with]{starts_with()}} +} +} +} + diff --git a/r/man/arrow-dplyr-functions.Rd b/r/man/arrow-dplyr-functions.Rd deleted file mode 100644 index 438d8b64025..00000000000 --- a/r/man/arrow-dplyr-functions.Rd +++ /dev/null @@ -1,338 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr-funcs-doc.R -\name{arrow-dplyr-functions} -\alias{arrow-dplyr-functions} -\title{Functions available in Arrow dplyr queries} -\description{ -The \code{arrow} package contains methods for 32 \code{dplyr} table functions, many of -which are "verbs" that do transformations to one or more tables. -The package also has mappings of 204 R functions to the corresponding -functions in the Arrow compute library. These allow you to write code inside -of \code{dplyr} methods that call R functions, including many in packages like -\code{stringr} and \code{lubridate}, and they will get translated to Arrow and run -on the Arrow query engine (Acero). 
This document lists all of the mapped -functions. -} -\section{\code{dplyr} verbs}{ -Most verb functions return an \code{arrow_dplyr_query} object, similar in spirit -to a \code{dbplyr::tbl_lazy}. This means that the verbs do not eagerly evaluate -the query on the data. To run the query, call either \code{compute()}, -which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting -Table into an R \code{data.frame}. -\itemize{ -\item \link[dplyr:filter-joins]{anti_join} -\item \link[dplyr:arrange]{arrange} -\item \link[dplyr:compute]{collapse} -\item \link[dplyr:compute]{collect} -\item \link[dplyr:compute]{compute} -\item \link[dplyr:count]{count} -\item \link[dplyr:distinct]{distinct} -\item \link[dplyr:explain]{explain} -\item \link[dplyr:filter]{filter} -\item \link[dplyr:mutate-joins]{full_join} -\item \link[dplyr:glimpse]{glimpse} -\item \link[dplyr:group_by]{group_by} -\item \link[dplyr:group_by_drop_default]{group_by_drop_default} -\item \link[dplyr:group_data]{group_vars} -\item \link[dplyr:group_data]{groups} -\item \link[dplyr:mutate-joins]{inner_join} -\item \link[dplyr:mutate-joins]{left_join} -\item \link[dplyr:mutate]{mutate} -\item \link[dplyr:pull]{pull} -\item \link[dplyr:relocate]{relocate} -\item \link[dplyr:rename]{rename} -\item \link[dplyr:rename]{rename_with} -\item \link[dplyr:mutate-joins]{right_join} -\item \link[dplyr:select]{select} -\item \link[dplyr:filter-joins]{semi_join} -\item \link[dplyr:explain]{show_query} -\item \link[dplyr:summarise]{summarise} -\item \link[dplyr:count]{tally} -\item \link[dplyr:mutate]{transmute} -\item \link[dplyr:group_by]{ungroup} -\item \link[dplyr:reexports]{union} -\item \link[dplyr:setops]{union_all} -} -} - -\section{Function mappings}{ -In the list below, any differences in behavior or support between Acero and -the R function are listed. If no notes follow the function name, then you -can assume that the function works in Acero just as it does in R. 
- -Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both -\code{str_sub()} and \code{stringr::str_sub()} work. - -In addition to these functions, you can call any of Arrow's 243 compute -functions directly. Arrow has many functions that don't map to an existing R -function. In other cases where there is an R function mapping, you can still -call the Arrow function directly if you don't want the adaptations that the R -mapping has that make Acero behave like R. These functions are listed in the -\href{https://arrow.apache.org/docs/cpp/compute.html}{C++ documentation}, and -in the function registry in R, they are named with an \code{arrow_} prefix, such -as \code{arrow_ascii_is_decimal}. -\subsection{arrow}{ -\itemize{ -\item \link[=add_filename]{add_filename} -\item \link[=cast]{cast} -} -} - -\subsection{base}{ -\itemize{ -\item \link[=-]{-} -\item \link[=!]{!} -\item \link[=!=]{!=} -\item \link[=*]{*} -\item \link[=/]{/} -\item \link[=&]{&} -\item \link[=\%/\%]{\%/\%} -\item \link[=\%\%]{\%\%} -\item \link[=\%in\%]{\%in\%} -\item \link[=^]{^} -\item \link[=+]{+} -\item \link[=<]{<} -\item \link[=<=]{<=} -\item \link[===]{==} -\item \link[=>]{>} -\item \link[=>=]{>=} -\item \link[=|]{|} -\item \link[base:MathFun]{abs} -\item \link[base:Trig]{acos} -\item \link[base:all]{all} -\item \link[base:any]{any} -\item \link[base:character]{as.character} -\item \link[base:as.Date]{as.Date} -\item \link[base:difftime]{as.difftime} -\item \link[base:double]{as.double} -\item \link[base:integer]{as.integer} -\item \link[base:logical]{as.logical} -\item \link[base:numeric]{as.numeric} -\item \link[base:Trig]{asin} -\item \link[base:Round]{ceiling} -\item \link[base:Trig]{cos} -\item \link[base:data.frame]{data.frame} -\item \link[base:difftime]{difftime} -\item \link[base:startsWith]{endsWith} -\item \link[base:Log]{exp} -\item \link[base:Round]{floor} -\item \link[base:format]{format} -\item \link[base:grep]{grepl} -\item \link[base:grep]{gsub} 
-\item \link[base:ifelse]{ifelse} -\item \link[base:character]{is.character} -\item \link[base:double]{is.double} -\item \link[base:factor]{is.factor} -\item \link[base:is.finite]{is.finite} -\item \link[base:is.finite]{is.infinite} -\item \link[base:integer]{is.integer} -\item \link[base:list]{is.list} -\item \link[base:logical]{is.logical} -\item \link[base:NA]{is.na} -\item \link[base:is.finite]{is.nan} -\item \link[base:numeric]{is.numeric} -\item \link[base:ISOdatetime]{ISOdate} -\item \link[base:ISOdatetime]{ISOdatetime} -\item \link[base:Log]{log} -\item \link[base:Log]{log10} -\item \link[base:Log]{log1p} -\item \link[base:Log]{log2} -\item \link[base:Log]{logb} -\item \link[base:Extremes]{max} -\item \link[base:mean]{mean} -\item \link[base:Extremes]{min} -\item \link[base:nchar]{nchar} -\item \link[base:paste]{paste}: the \code{collapse} argument is not yet supported -\item \link[base:paste]{paste0}: the \code{collapse} argument is not yet supported -\item \link[base:Extremes]{pmax} -\item \link[base:Extremes]{pmin} -\item \link[base:Round]{round} -\item \link[base:sign]{sign} -\item \link[base:Trig]{sin} -\item \link[base:MathFun]{sqrt} -\item \link[base:startsWith]{startsWith} -\item \link[base:strptime]{strftime} -\item \link[base:strptime]{strptime} -\item \link[base:strrep]{strrep} -\item \link[base:strsplit]{strsplit} -\item \link[base:grep]{sub} -\item \link[base:substr]{substr} -\item \link[base:substr]{substring} -\item \link[base:sum]{sum} -\item \link[base:Trig]{tan} -\item \link[base:chartr]{tolower} -\item \link[base:chartr]{toupper} -\item \link[base:Round]{trunc} -} -} - -\subsection{bit64}{ -\itemize{ -\item \link[bit64:as.integer64.character]{as.integer64} -\item \link[bit64:bit64-package]{is.integer64} -} -} - -\subsection{dplyr}{ -\itemize{ -\item \link[dplyr:across]{across}: only supported inside \code{mutate()}; purrr-style lambda functions not yet supported -\item \link[dplyr:between]{between} -\item \link[dplyr:case_when]{case_when} 
-\item \link[dplyr:coalesce]{coalesce} -\item \link[dplyr:if_else]{if_else} -\item \link[dplyr:context]{n} -\item \link[dplyr:n_distinct]{n_distinct} -} -} - -\subsection{lubridate}{ -\itemize{ -\item \link[lubridate:am]{am} -\item \link[lubridate:as_date]{as_date} -\item \link[lubridate:as_date]{as_datetime} -\item \link[lubridate:round_date]{ceiling_date} -\item \link[lubridate:date]{date} -\item \link[lubridate:date_decimal]{date_decimal} -\item \link[lubridate:day]{day} -\item \link[lubridate:duration]{ddays} -\item \link[lubridate:decimal_date]{decimal_date} -\item \link[lubridate:duration]{dhours} -\item \link[lubridate:duration]{dmicroseconds} -\item \link[lubridate:duration]{dmilliseconds} -\item \link[lubridate:duration]{dminutes} -\item \link[lubridate:duration]{dmonths} -\item \link[lubridate:ymd]{dmy} -\item \link[lubridate:ymd_hms]{dmy_h} -\item \link[lubridate:ymd_hms]{dmy_hm} -\item \link[lubridate:ymd_hms]{dmy_hms} -\item \link[lubridate:duration]{dnanoseconds} -\item \link[lubridate:duration]{dpicoseconds} -\item \link[lubridate:duration]{dseconds} -\item \link[lubridate:dst]{dst} -\item \link[lubridate:duration]{dweeks} -\item \link[lubridate:duration]{dyears} -\item \link[lubridate:ymd]{dym} -\item \link[lubridate:week]{epiweek} -\item \link[lubridate:year]{epiyear} -\item \link[lubridate:parse_date_time]{fast_strptime} -\item \link[lubridate:round_date]{floor_date} -\item \link[lubridate:format_ISO8601]{format_ISO8601} -\item \link[lubridate:hour]{hour} -\item \link[lubridate:date_utils]{is.Date} -\item \link[lubridate:is.instant]{is.instant} -\item \link[lubridate:posix_utils]{is.POSIXct} -\item \link[lubridate:is.instant]{is.timepoint} -\item \link[lubridate:week]{isoweek} -\item \link[lubridate:year]{isoyear} -\item \link[lubridate:leap_year]{leap_year} -\item \link[lubridate:make_datetime]{make_date} -\item \link[lubridate:make_datetime]{make_datetime} -\item \link[lubridate:make_difftime]{make_difftime} -\item \link[lubridate:day]{mday} 
-\item \link[lubridate:ymd]{mdy} -\item \link[lubridate:ymd_hms]{mdy_h} -\item \link[lubridate:ymd_hms]{mdy_hm} -\item \link[lubridate:ymd_hms]{mdy_hms} -\item \link[lubridate:minute]{minute} -\item \link[lubridate:month]{month} -\item \link[lubridate:ymd]{my} -\item \link[lubridate:ymd]{myd} -\item \link[lubridate:parse_date_time]{parse_date_time} -\item \link[lubridate:am]{pm} -\item \link[lubridate:day]{qday} -\item \link[lubridate:quarter]{quarter} -\item \link[lubridate:round_date]{round_date} -\item \link[lubridate:second]{second} -\item \link[lubridate:quarter]{semester} -\item \link[lubridate:tz]{tz} -\item \link[lubridate:day]{wday} -\item \link[lubridate:week]{week} -\item \link[lubridate:day]{yday} -\item \link[lubridate:ymd]{ydm} -\item \link[lubridate:ymd_hms]{ydm_h} -\item \link[lubridate:ymd_hms]{ydm_hm} -\item \link[lubridate:ymd_hms]{ydm_hms} -\item \link[lubridate:year]{year} -\item \link[lubridate:ymd]{ym} -\item \link[lubridate:ymd]{ymd} -\item \link[lubridate:ymd_hms]{ymd_h} -\item \link[lubridate:ymd_hms]{ymd_hm} -\item \link[lubridate:ymd_hms]{ymd_hms} -\item \link[lubridate:ymd]{yq} -} -} - -\subsection{methods}{ -\itemize{ -\item \link[methods:is]{is} -} -} - -\subsection{rlang}{ -\itemize{ -\item \link[rlang:type-predicates]{is_character} -\item \link[rlang:type-predicates]{is_double} -\item \link[rlang:type-predicates]{is_integer} -\item \link[rlang:type-predicates]{is_list} -\item \link[rlang:type-predicates]{is_logical} -} -} - -\subsection{stats}{ -\itemize{ -\item \link[stats:median]{median} -\item \link[stats:quantile]{quantile} -\item \link[stats:sd]{sd} -\item \link[stats:cor]{var} -} -} - -\subsection{stringi}{ -\itemize{ -\item \link[stringi:stri_reverse]{stri_reverse} -} -} - -\subsection{stringr}{ -\itemize{ -\item \link[stringr:str_c]{str_c}: the \code{collapse} argument is not yet supported -\item \link[stringr:str_count]{str_count} -\item \link[stringr:str_detect]{str_detect} -\item \link[stringr:str_dup]{str_dup} -\item 
\link[stringr:str_starts]{str_ends} -\item \link[stringr:str_length]{str_length} -\item \code{str_like}: not yet in a released version of \code{stringr}, but it is supported in \code{arrow} -\item \link[stringr:str_pad]{str_pad} -\item \link[stringr:str_replace]{str_replace} -\item \link[stringr:str_replace]{str_replace_all} -\item \link[stringr:str_split]{str_split} -\item \link[stringr:str_starts]{str_starts} -\item \link[stringr:str_sub]{str_sub} -\item \link[stringr:case]{str_to_lower} -\item \link[stringr:case]{str_to_title} -\item \link[stringr:case]{str_to_upper} -\item \link[stringr:str_trim]{str_trim} -} -} - -\subsection{tibble}{ -\itemize{ -\item \link[tibble:tibble]{tibble} -} -} - -\subsection{tidyselect}{ -\itemize{ -\item \link[tidyselect:all_of]{all_of} -\item \link[tidyselect:starts_with]{contains} -\item \link[tidyselect:starts_with]{ends_with} -\item \link[tidyselect:everything]{everything} -\item \link[tidyselect:everything]{last_col} -\item \link[tidyselect:starts_with]{matches} -\item \link[tidyselect:starts_with]{num_range} -\item \link[tidyselect:one_of]{one_of} -\item \link[tidyselect:starts_with]{starts_with} -} -} -} - From 8f50f440efe63de96a0c46f625d984f982103013 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 15 Sep 2022 17:48:51 -0400 Subject: [PATCH 7/7] Fix check warning --- r/R/dplyr-funcs-augmented.R | 9 ++------- r/R/dplyr-funcs-type.R | 23 +++++++++-------------- r/R/expression.R | 11 +++-------- r/man/cast.Rd | 13 ++++++------- 4 files changed, 20 insertions(+), 36 deletions(-) diff --git a/r/R/dplyr-funcs-augmented.R b/r/R/dplyr-funcs-augmented.R index d1359005e37..1067f15573b 100644 --- a/r/R/dplyr-funcs-augmented.R +++ b/r/R/dplyr-funcs-augmented.R @@ -20,8 +20,6 @@ #' This function only exists inside `arrow` `dplyr` queries, and it only is #' valid when quering on a `FileSystemDataset`. #' -#' @usage add_filename() -#' #' @return A `FieldRef` `Expression` that refers to the filename augmented #' column. 
#' @examples @@ -30,11 +28,8 @@ #' mutate(file = add_filename()) #' } #' @keywords internal -#' @name add_filename -NULL +add_filename <- function() Expression$field_ref("__filename") register_bindings_augmented <- function() { - register_binding("arrow::add_filename", function() { - Expression$field_ref("__filename") - }) + register_binding("arrow::add_filename", add_filename) } diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 35734ced05d..aa50cdebc5d 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -25,14 +25,12 @@ register_bindings_type <- function() { #' Change the type of an array or column #' -#' The `cast()` function only exists inside of `arrow` `dplyr` queries. Use it -#' as a more convenient way of changing the type of a value or field inside of -#' a `mutate()` call. To cast an `Array` or `ChunkedArray` outside of a query, -#' call the `$cast()` method on the object, which has the same semantics. +#' This is a wrapper around the `$cast()` method that many Arrow objects have. +#' It is more convenient to call inside `dplyr` pipelines than the method. #' -#' @usage cast(x, target_type, safe = TRUE, ...) -#' @param x an `Expression` -#' @param target_type [DataType] to cast to +#' @param x an `Array`, `Table`, `Expression`, or similar Arrow data object. +#' @param to [DataType] to cast to; for [Table] and [RecordBatch], +#' it should be a [Schema]. #' @param safe logical: only allow the type conversion if no data is lost #' (truncation, overflow, etc.). Default is `TRUE` #' @param ... specific `CastOptions` to set @@ -45,17 +43,14 @@ register_bindings_type <- function() { #' mutate(cyl = cast(cyl, string())) #' } #' @keywords internal -#' @name cast #' @seealso https://arrow.apache.org/docs/cpp/api/compute.html for the list of #' supported CastOptions. -NULL +cast <- function(x, to, safe = TRUE, ...) { + x$cast(to, safe = safe, ...) 
+} register_bindings_type_cast <- function() { - register_binding("arrow::cast", function(x, target_type, safe = TRUE, ...) { - opts <- cast_options(safe, ...) - opts$to_type <- as_type(target_type) - Expression$create("cast", x, options = opts) - }) + register_binding("arrow::cast", cast) # as.* type casting functions # as.factor() is mapped in expression.R diff --git a/r/R/expression.R b/r/R/expression.R index 09a8ea24608..7a5a600d956 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -76,7 +76,6 @@ "lubridate::yday" = "day_of_year", "lubridate::year" = "year", "lubridate::leap_year" = "is_leap_year" - ) .binary_function_map <- list( @@ -158,13 +157,9 @@ Expression <- R6Class("Expression", compute___expr__type_id(self, schema) }, cast = function(to_type, safe = TRUE, ...) { - opts <- list( - to_type = to_type, - allow_int_overflow = !safe, - allow_time_truncate = !safe, - allow_float_truncate = !safe - ) - Expression$create("cast", self, options = modifyList(opts, list(...))) + opts <- cast_options(safe, ...) + opts$to_type <- as_type(to_type) + Expression$create("cast", self, options = opts) } ), active = list( diff --git a/r/man/cast.Rd b/r/man/cast.Rd index 81abfa6567d..88134f2e022 100644 --- a/r/man/cast.Rd +++ b/r/man/cast.Rd @@ -4,12 +4,13 @@ \alias{cast} \title{Change the type of an array or column} \usage{ -cast(x, target_type, safe = TRUE, ...) +cast(x, to, safe = TRUE, ...) } \arguments{ -\item{x}{an \code{Expression}} +\item{x}{an \code{Array}, \code{Table}, \code{Expression}, or similar Arrow data object.} -\item{target_type}{\link{DataType} to cast to} +\item{to}{\link{DataType} to cast to; for \link{Table} and \link{RecordBatch}, +it should be a \link{Schema}.} \item{safe}{logical: only allow the type conversion if no data is lost (truncation, overflow, etc.). Default is \code{TRUE}} @@ -20,10 +21,8 @@ cast(x, target_type, safe = TRUE, ...) 
an \code{Expression} } \description{ -The \code{cast()} function only exists inside of \code{arrow} \code{dplyr} queries. Use it -as a more convenient way of changing the type of a value or field inside of -a \code{mutate()} call. To cast an \code{Array} or \code{ChunkedArray} outside of a query, -call the \verb{$cast()} method on the object, which has the same semantics. +This is a wrapper around the \verb{$cast()} method that many Arrow objects have. +It is more convenient to call inside \code{dplyr} pipelines than the method. } \examples{ \dontrun{