diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 7ae6a8de29f..7b60f0c510a 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -103,6 +103,7 @@ Collate: 'dplyr-funcs-augmented.R' 'dplyr-funcs-conditional.R' 'dplyr-funcs-datetime.R' + 'dplyr-funcs-doc.R' 'dplyr-funcs-math.R' 'dplyr-funcs-string.R' 'dplyr-funcs-type.R' diff --git a/r/Makefile b/r/Makefile index 1ddbe595dd2..cb76b4c9775 100644 --- a/r/Makefile +++ b/r/Makefile @@ -26,6 +26,7 @@ style-all: R -s -e 'styler::style_file(setdiff(dir(pattern = "R$$", recursive = TRUE), source(".styler_excludes.R")$$value))' doc: style + R -s -f data-raw/docgen.R R -s -e 'roxygen2::roxygenize()' -git add --all man/*.Rd diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 53fb0280a50..e6b3f481e21 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -31,25 +31,50 @@ #' @keywords internal "_PACKAGE" +# TODO(ARROW-17666): Include notes about features not supported here. +supported_dplyr_methods <- list( + select = NULL, + filter = NULL, + collect = NULL, + summarise = NULL, + group_by = NULL, + groups = NULL, + group_vars = NULL, + group_by_drop_default = NULL, + ungroup = NULL, + mutate = NULL, + transmute = NULL, + arrange = NULL, + rename = NULL, + pull = NULL, + relocate = NULL, + compute = NULL, + collapse = NULL, + distinct = NULL, + left_join = NULL, + right_join = NULL, + inner_join = NULL, + full_join = NULL, + semi_join = NULL, + anti_join = NULL, + count = NULL, + tally = NULL, + rename_with = NULL, + union = NULL, + union_all = NULL, + glimpse = NULL, + show_query = NULL, + explain = NULL +) + #' @importFrom vctrs s3_register vec_size vec_cast vec_unique .onLoad <- function(...) 
{ # Make sure C++ knows on which thread it is safe to call the R API InitializeMainRThread() - dplyr_methods <- paste0( - "dplyr::", - c( - "select", "filter", "collect", "summarise", "group_by", "groups", - "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute", - "arrange", "rename", "pull", "relocate", "compute", "collapse", - "distinct", "left_join", "right_join", "inner_join", "full_join", - "semi_join", "anti_join", "count", "tally", "rename_with", "union", - "union_all", "glimpse", "show_query", "explain" - ) - ) for (cl in c("Dataset", "ArrowTabular", "RecordBatchReader", "arrow_dplyr_query")) { - for (m in dplyr_methods) { - s3_register(m, cl) + for (m in names(supported_dplyr_methods)) { + s3_register(paste0("dplyr::", m), cl) } } s3_register("dplyr::tbl_vars", "arrow_dplyr_query") diff --git a/r/R/dplyr-funcs-augmented.R b/r/R/dplyr-funcs-augmented.R index 6e751d49f61..1067f15573b 100644 --- a/r/R/dplyr-funcs-augmented.R +++ b/r/R/dplyr-funcs-augmented.R @@ -15,8 +15,21 @@ # specific language governing permissions and limitations # under the License. +#' Add the data filename as a column +#' +#' This function only exists inside `arrow` `dplyr` queries, and it is only +#' valid when querying on a `FileSystemDataset`. +#' +#' @return A `FieldRef` `Expression` that refers to the filename augmented +#' column.
+#' @examples +#' \dontrun{ +#' open_dataset("nyc-taxi") %>% +#' mutate(file = add_filename()) +#' } +#' @keywords internal +add_filename <- function() Expression$field_ref("__filename") + register_bindings_augmented <- function() { - register_binding("add_filename", function() { - Expression$field_ref("__filename") - }) + register_binding("arrow::add_filename", add_filename) } diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9a010452b84..6106adbc5e4 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -649,55 +649,54 @@ register_bindings_datetime_parsers <- function() { build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) }) - } register_bindings_datetime_rounding <- function() { register_binding( - "round_date", + "lubridate::round_date", function(x, unit = "second", week_start = getOption("lubridate.week.start", 7)) { + opts <- parse_period_unit(unit) + if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start + return(shift_temporal_to_week("round_temporal", x, week_start, options = opts)) + } - opts <- parse_period_unit(unit) - if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start - return(shift_temporal_to_week("round_temporal", x, week_start, options = opts)) + Expression$create("round_temporal", x, options = opts) } - - Expression$create("round_temporal", x, options = opts) - }) + ) register_binding( - "floor_date", + "lubridate::floor_date", function(x, unit = "second", week_start = getOption("lubridate.week.start", 7)) { + opts <- parse_period_unit(unit) + if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start + return(shift_temporal_to_week("floor_temporal", x, week_start, options = opts)) + } - opts <- parse_period_unit(unit) - if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start - return(shift_temporal_to_week("floor_temporal", x, week_start, options = opts)) + Expression$create("floor_temporal", x, 
options = opts) } - - Expression$create("floor_temporal", x, options = opts) - }) + ) register_binding( - "ceiling_date", + "lubridate::ceiling_date", function(x, unit = "second", change_on_boundary = NULL, week_start = getOption("lubridate.week.start", 7)) { - opts <- parse_period_unit(unit) - if (is.null(change_on_boundary)) { - change_on_boundary <- ifelse(call_binding("is.Date", x), TRUE, FALSE) - } - opts$ceil_is_strictly_greater <- change_on_boundary - - if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start - return(shift_temporal_to_week("ceil_temporal", x, week_start, options = opts)) - } + opts <- parse_period_unit(unit) + if (is.null(change_on_boundary)) { + change_on_boundary <- ifelse(call_binding("is.Date", x), TRUE, FALSE) + } + opts$ceil_is_strictly_greater <- change_on_boundary - Expression$create("ceil_temporal", x, options = opts) - }) + if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start + return(shift_temporal_to_week("ceil_temporal", x, week_start, options = opts)) + } + Expression$create("ceil_temporal", x, options = opts) + } + ) } diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R new file mode 100644 index 00000000000..cac0310f49b --- /dev/null +++ b/r/R/dplyr-funcs-doc.R @@ -0,0 +1,332 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# Generated by using data-raw/docgen.R -> do not edit by hand + +#' Functions available in Arrow dplyr queries +#' +#' The `arrow` package contains methods for 32 `dplyr` table functions, many of +#' which are "verbs" that do transformations to one or more tables. +#' The package also has mappings of 205 R functions to the corresponding +#' functions in the Arrow compute library. These allow you to write code inside +#' of `dplyr` methods that call R functions, including many in packages like +#' `stringr` and `lubridate`, and they will get translated to Arrow and run +#' on the Arrow query engine (Acero). This document lists all of the mapped +#' functions. +#' +#' # `dplyr` verbs +#' +#' Most verb functions return an `arrow_dplyr_query` object, similar in spirit +#' to a `dbplyr::tbl_lazy`. This means that the verbs do not eagerly evaluate +#' the query on the data. To run the query, call either `compute()`, +#' which returns an `arrow` [Table], or `collect()`, which pulls the resulting +#' Table into an R `data.frame`. 
+#' +#' * [`anti_join()`][dplyr::anti_join()] +#' * [`arrange()`][dplyr::arrange()] +#' * [`collapse()`][dplyr::collapse()] +#' * [`collect()`][dplyr::collect()] +#' * [`compute()`][dplyr::compute()] +#' * [`count()`][dplyr::count()] +#' * [`distinct()`][dplyr::distinct()] +#' * [`explain()`][dplyr::explain()] +#' * [`filter()`][dplyr::filter()] +#' * [`full_join()`][dplyr::full_join()] +#' * [`glimpse()`][dplyr::glimpse()] +#' * [`group_by()`][dplyr::group_by()] +#' * [`group_by_drop_default()`][dplyr::group_by_drop_default()] +#' * [`group_vars()`][dplyr::group_vars()] +#' * [`groups()`][dplyr::groups()] +#' * [`inner_join()`][dplyr::inner_join()] +#' * [`left_join()`][dplyr::left_join()] +#' * [`mutate()`][dplyr::mutate()] +#' * [`pull()`][dplyr::pull()] +#' * [`relocate()`][dplyr::relocate()] +#' * [`rename()`][dplyr::rename()] +#' * [`rename_with()`][dplyr::rename_with()] +#' * [`right_join()`][dplyr::right_join()] +#' * [`select()`][dplyr::select()] +#' * [`semi_join()`][dplyr::semi_join()] +#' * [`show_query()`][dplyr::show_query()] +#' * [`summarise()`][dplyr::summarise()] +#' * [`tally()`][dplyr::tally()] +#' * [`transmute()`][dplyr::transmute()] +#' * [`ungroup()`][dplyr::ungroup()] +#' * [`union()`][dplyr::union()] +#' * [`union_all()`][dplyr::union_all()] +#' +#' # Function mappings +#' +#' In the list below, any differences in behavior or support between Acero and +#' the R function are listed. If no notes follow the function name, then you +#' can assume that the function works in Acero just as it does in R. +#' +#' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both +#' `str_sub()` and `stringr::str_sub()` work. +#' +#' In addition to these functions, you can call any of Arrow's 243 compute +#' functions directly. Arrow has many functions that don't map to an existing R +#' function. 
In other cases where there is an R function mapping, you can still +#' call the Arrow function directly if you don't want the adaptations that the R +#' mapping has that make Acero behave like R. These functions are listed in the +#' [C++ documentation](https://arrow.apache.org/docs/cpp/compute.html), and +#' in the function registry in R, they are named with an `arrow_` prefix, such +#' as `arrow_ascii_is_decimal`. +#' +#' ## arrow +#' +#' * [`add_filename()`][arrow::add_filename()] +#' * [`cast()`][arrow::cast()] +#' +#' ## base +#' +#' * [`-`][-()] +#' * [`!`][!()] +#' * [`!=`][!=()] +#' * [`*`][*()] +#' * [`/`][/()] +#' * [`&`][&()] +#' * [`%/%`][%/%()] +#' * [`%%`][%%()] +#' * [`%in%`][%in%()] +#' * [`^`][^()] +#' * [`+`][+()] +#' * [`<`][<()] +#' * [`<=`][<=()] +#' * [`==`][==()] +#' * [`>`][>()] +#' * [`>=`][>=()] +#' * [`|`][|()] +#' * [`abs()`][base::abs()] +#' * [`acos()`][base::acos()] +#' * [`all()`][base::all()] +#' * [`any()`][base::any()] +#' * [`as.character()`][base::as.character()] +#' * [`as.Date()`][base::as.Date()] +#' * [`as.difftime()`][base::as.difftime()] +#' * [`as.double()`][base::as.double()] +#' * [`as.integer()`][base::as.integer()] +#' * [`as.logical()`][base::as.logical()] +#' * [`as.numeric()`][base::as.numeric()] +#' * [`asin()`][base::asin()] +#' * [`ceiling()`][base::ceiling()] +#' * [`cos()`][base::cos()] +#' * [`data.frame()`][base::data.frame()] +#' * [`difftime()`][base::difftime()] +#' * [`endsWith()`][base::endsWith()] +#' * [`exp()`][base::exp()] +#' * [`floor()`][base::floor()] +#' * [`format()`][base::format()] +#' * [`grepl()`][base::grepl()] +#' * [`gsub()`][base::gsub()] +#' * [`ifelse()`][base::ifelse()] +#' * [`is.character()`][base::is.character()] +#' * [`is.double()`][base::is.double()] +#' * [`is.factor()`][base::is.factor()] +#' * [`is.finite()`][base::is.finite()] +#' * [`is.infinite()`][base::is.infinite()] +#' * [`is.integer()`][base::is.integer()] +#' * [`is.list()`][base::is.list()] +#' * 
[`is.logical()`][base::is.logical()] +#' * [`is.na()`][base::is.na()] +#' * [`is.nan()`][base::is.nan()] +#' * [`is.numeric()`][base::is.numeric()] +#' * [`ISOdate()`][base::ISOdate()] +#' * [`ISOdatetime()`][base::ISOdatetime()] +#' * [`log()`][base::log()] +#' * [`log10()`][base::log10()] +#' * [`log1p()`][base::log1p()] +#' * [`log2()`][base::log2()] +#' * [`logb()`][base::logb()] +#' * [`max()`][base::max()] +#' * [`mean()`][base::mean()] +#' * [`min()`][base::min()] +#' * [`nchar()`][base::nchar()] +#' * [`paste()`][base::paste()]: the `collapse` argument is not yet supported +#' * [`paste0()`][base::paste0()]: the `collapse` argument is not yet supported +#' * [`pmax()`][base::pmax()] +#' * [`pmin()`][base::pmin()] +#' * [`round()`][base::round()] +#' * [`sign()`][base::sign()] +#' * [`sin()`][base::sin()] +#' * [`sqrt()`][base::sqrt()] +#' * [`startsWith()`][base::startsWith()] +#' * [`strftime()`][base::strftime()] +#' * [`strptime()`][base::strptime()] +#' * [`strrep()`][base::strrep()] +#' * [`strsplit()`][base::strsplit()] +#' * [`sub()`][base::sub()] +#' * [`substr()`][base::substr()] +#' * [`substring()`][base::substring()] +#' * [`sum()`][base::sum()] +#' * [`tan()`][base::tan()] +#' * [`tolower()`][base::tolower()] +#' * [`toupper()`][base::toupper()] +#' * [`trunc()`][base::trunc()] +#' +#' ## bit64 +#' +#' * [`as.integer64()`][bit64::as.integer64()] +#' * [`is.integer64()`][bit64::is.integer64()] +#' +#' ## dplyr +#' +#' * [`across()`][dplyr::across()]: only supported inside `mutate()`, `summarize()`, and `arrange()`; purrr-style lambda functions and use of `where()` selection helper not yet supported +#' * [`between()`][dplyr::between()] +#' * [`case_when()`][dplyr::case_when()] +#' * [`coalesce()`][dplyr::coalesce()] +#' * [`desc()`][dplyr::desc()] +#' * [`if_else()`][dplyr::if_else()] +#' * [`n()`][dplyr::n()] +#' * [`n_distinct()`][dplyr::n_distinct()] +#' +#' ## lubridate +#' +#' * [`am()`][lubridate::am()] +#' * 
[`as_date()`][lubridate::as_date()] +#' * [`as_datetime()`][lubridate::as_datetime()] +#' * [`ceiling_date()`][lubridate::ceiling_date()] +#' * [`date()`][lubridate::date()] +#' * [`date_decimal()`][lubridate::date_decimal()] +#' * [`day()`][lubridate::day()] +#' * [`ddays()`][lubridate::ddays()] +#' * [`decimal_date()`][lubridate::decimal_date()] +#' * [`dhours()`][lubridate::dhours()] +#' * [`dmicroseconds()`][lubridate::dmicroseconds()] +#' * [`dmilliseconds()`][lubridate::dmilliseconds()] +#' * [`dminutes()`][lubridate::dminutes()] +#' * [`dmonths()`][lubridate::dmonths()] +#' * [`dmy()`][lubridate::dmy()] +#' * [`dmy_h()`][lubridate::dmy_h()] +#' * [`dmy_hm()`][lubridate::dmy_hm()] +#' * [`dmy_hms()`][lubridate::dmy_hms()] +#' * [`dnanoseconds()`][lubridate::dnanoseconds()] +#' * [`dpicoseconds()`][lubridate::dpicoseconds()] +#' * [`dseconds()`][lubridate::dseconds()] +#' * [`dst()`][lubridate::dst()] +#' * [`dweeks()`][lubridate::dweeks()] +#' * [`dyears()`][lubridate::dyears()] +#' * [`dym()`][lubridate::dym()] +#' * [`epiweek()`][lubridate::epiweek()] +#' * [`epiyear()`][lubridate::epiyear()] +#' * [`fast_strptime()`][lubridate::fast_strptime()] +#' * [`floor_date()`][lubridate::floor_date()] +#' * [`format_ISO8601()`][lubridate::format_ISO8601()] +#' * [`hour()`][lubridate::hour()] +#' * [`is.Date()`][lubridate::is.Date()] +#' * [`is.instant()`][lubridate::is.instant()] +#' * [`is.POSIXct()`][lubridate::is.POSIXct()] +#' * [`is.timepoint()`][lubridate::is.timepoint()] +#' * [`isoweek()`][lubridate::isoweek()] +#' * [`isoyear()`][lubridate::isoyear()] +#' * [`leap_year()`][lubridate::leap_year()] +#' * [`make_date()`][lubridate::make_date()] +#' * [`make_datetime()`][lubridate::make_datetime()] +#' * [`make_difftime()`][lubridate::make_difftime()] +#' * [`mday()`][lubridate::mday()] +#' * [`mdy()`][lubridate::mdy()] +#' * [`mdy_h()`][lubridate::mdy_h()] +#' * [`mdy_hm()`][lubridate::mdy_hm()] +#' * [`mdy_hms()`][lubridate::mdy_hms()] +#' * 
[`minute()`][lubridate::minute()] +#' * [`month()`][lubridate::month()] +#' * [`my()`][lubridate::my()] +#' * [`myd()`][lubridate::myd()] +#' * [`parse_date_time()`][lubridate::parse_date_time()] +#' * [`pm()`][lubridate::pm()] +#' * [`qday()`][lubridate::qday()] +#' * [`quarter()`][lubridate::quarter()] +#' * [`round_date()`][lubridate::round_date()] +#' * [`second()`][lubridate::second()] +#' * [`semester()`][lubridate::semester()] +#' * [`tz()`][lubridate::tz()] +#' * [`wday()`][lubridate::wday()] +#' * [`week()`][lubridate::week()] +#' * [`yday()`][lubridate::yday()] +#' * [`ydm()`][lubridate::ydm()] +#' * [`ydm_h()`][lubridate::ydm_h()] +#' * [`ydm_hm()`][lubridate::ydm_hm()] +#' * [`ydm_hms()`][lubridate::ydm_hms()] +#' * [`year()`][lubridate::year()] +#' * [`ym()`][lubridate::ym()] +#' * [`ymd()`][lubridate::ymd()] +#' * [`ymd_h()`][lubridate::ymd_h()] +#' * [`ymd_hm()`][lubridate::ymd_hm()] +#' * [`ymd_hms()`][lubridate::ymd_hms()] +#' * [`yq()`][lubridate::yq()] +#' +#' ## methods +#' +#' * [`is()`][methods::is()] +#' +#' ## rlang +#' +#' * [`is_character()`][rlang::is_character()] +#' * [`is_double()`][rlang::is_double()] +#' * [`is_integer()`][rlang::is_integer()] +#' * [`is_list()`][rlang::is_list()] +#' * [`is_logical()`][rlang::is_logical()] +#' +#' ## stats +#' +#' * [`median()`][stats::median()] +#' * [`quantile()`][stats::quantile()] +#' * [`sd()`][stats::sd()] +#' * [`var()`][stats::var()] +#' +#' ## stringi +#' +#' * [`stri_reverse()`][stringi::stri_reverse()] +#' +#' ## stringr +#' +#' * [`str_c()`][stringr::str_c()]: the `collapse` argument is not yet supported +#' * [`str_count()`][stringr::str_count()] +#' * [`str_detect()`][stringr::str_detect()] +#' * [`str_dup()`][stringr::str_dup()] +#' * [`str_ends()`][stringr::str_ends()] +#' * [`str_length()`][stringr::str_length()] +#' * `str_like()`: not yet in a released version of `stringr`, but it is supported in `arrow` +#' * [`str_pad()`][stringr::str_pad()] +#' * 
[`str_replace()`][stringr::str_replace()] +#' * [`str_replace_all()`][stringr::str_replace_all()] +#' * [`str_split()`][stringr::str_split()] +#' * [`str_starts()`][stringr::str_starts()] +#' * [`str_sub()`][stringr::str_sub()] +#' * [`str_to_lower()`][stringr::str_to_lower()] +#' * [`str_to_title()`][stringr::str_to_title()] +#' * [`str_to_upper()`][stringr::str_to_upper()] +#' * [`str_trim()`][stringr::str_trim()] +#' +#' ## tibble +#' +#' * [`tibble()`][tibble::tibble()] +#' +#' ## tidyselect +#' +#' * [`all_of()`][tidyselect::all_of()] +#' * [`contains()`][tidyselect::contains()] +#' * [`ends_with()`][tidyselect::ends_with()] +#' * [`everything()`][tidyselect::everything()] +#' * [`last_col()`][tidyselect::last_col()] +#' * [`matches()`][tidyselect::matches()] +#' * [`num_range()`][tidyselect::num_range()] +#' * [`one_of()`][tidyselect::one_of()] +#' * [`starts_with()`][tidyselect::starts_with()] +#' +#' @name acero +NULL diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index b300d7c439e..eb2326ed056 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -161,32 +161,44 @@ register_bindings_string_join <- function() { } } - register_binding("base::paste", function(..., sep = " ", collapse = NULL, recycle0 = FALSE) { - assert_that( - is.null(collapse), - msg = "paste() with the collapse argument is not yet supported in Arrow" - ) - if (!inherits(sep, "Expression")) { - assert_that(!is.na(sep), msg = "Invalid separator") - } - arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep) - }) - - register_binding("base::paste0", function(..., collapse = NULL, recycle0 = FALSE) { - assert_that( - is.null(collapse), - msg = "paste0() with the collapse argument is not yet supported in Arrow" - ) - arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "") - }) - - register_binding("stringr::str_c", function(..., sep = "", collapse = NULL) { - assert_that( - is.null(collapse), - msg = "str_c() with the collapse 
argument is not yet supported in Arrow" - ) - arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep) - }) + register_binding( + "base::paste", + function(..., sep = " ", collapse = NULL, recycle0 = FALSE) { + assert_that( + is.null(collapse), + msg = "paste() with the collapse argument is not yet supported in Arrow" + ) + if (!inherits(sep, "Expression")) { + assert_that(!is.na(sep), msg = "Invalid separator") + } + arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep) + }, + notes = "the `collapse` argument is not yet supported" + ) + + register_binding( + "base::paste0", + function(..., collapse = NULL, recycle0 = FALSE) { + assert_that( + is.null(collapse), + msg = "paste0() with the collapse argument is not yet supported in Arrow" + ) + arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "") + }, + notes = "the `collapse` argument is not yet supported" + ) + + register_binding( + "stringr::str_c", + function(..., sep = "", collapse = NULL) { + assert_that( + is.null(collapse), + msg = "str_c() with the collapse argument is not yet supported in Arrow" + ) + arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep) + }, + notes = "the `collapse` argument is not yet supported" + ) } register_bindings_string_regex <- function() { @@ -227,15 +239,17 @@ register_bindings_string_regex <- function() { out }) - register_binding("stringr::str_like", function(string, - pattern, - ignore_case = TRUE) { - Expression$create( - "match_like", - string, - options = list(pattern = pattern, ignore_case = ignore_case) - ) - }) + register_binding( + "stringr::str_like", + function(string, pattern, ignore_case = TRUE) { + Expression$create( + "match_like", + string, + options = list(pattern = pattern, ignore_case = ignore_case) + ) + }, + notes = "not yet in a released version of `stringr`, but it is supported in `arrow`" + ) register_binding("stringr::str_count", function(string, pattern) { opts <- 
get_stringr_pattern_options(enexpr(pattern)) @@ -337,7 +351,7 @@ register_bindings_string_regex <- function() { register_binding("stringr::str_replace_all", arrow_stringr_string_replace_function(-1L)) register_binding("base::strsplit", function(x, split, fixed = FALSE, perl = FALSE, - useBytes = FALSE) { + useBytes = FALSE) { assert_that(is.string(split)) arrow_fun <- ifelse(fixed, "split_pattern", "split_pattern_regex") diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 9925d0347f7..aa50cdebc5d 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -23,23 +23,34 @@ register_bindings_type <- function() { register_bindings_type_format() } -register_bindings_type_cast <- function() { - register_binding("cast", function(x, target_type, safe = TRUE, ...) { - opts <- cast_options(safe, ...) - opts$to_type <- as_type(target_type) - Expression$create("cast", x, options = opts) - }) +#' Change the type of an array or column +#' +#' This is a wrapper around the `$cast()` method that many Arrow objects have. +#' It is more convenient to call inside `dplyr` pipelines than the method. +#' +#' @param x an `Array`, `Table`, `Expression`, or similar Arrow data object. +#' @param to [DataType] to cast to; for [Table] and [RecordBatch], +#' it should be a [Schema]. +#' @param safe logical: only allow the type conversion if no data is lost +#' (truncation, overflow, etc.). Default is `TRUE` +#' @param ... specific `CastOptions` to set +#' @return an `Expression` +#' +#' @examples +#' \dontrun{ +#' mtcars %>% +#' arrow_table() %>% +#' mutate(cyl = cast(cyl, string())) +#' } +#' @keywords internal +#' @seealso https://arrow.apache.org/docs/cpp/api/compute.html for the list of +#' supported CastOptions. +cast <- function(x, to, safe = TRUE, ...) { + x$cast(to, safe = safe, ...) 
+} - register_binding("dictionary_encode", function(x, - null_encoding_behavior = c("mask", "encode")) { - behavior <- toupper(match.arg(null_encoding_behavior)) - null_encoding_behavior <- NullEncodingBehavior[[behavior]] - Expression$create( - "dictionary_encode", - x, - options = list(null_encoding_behavior = null_encoding_behavior) - ) - }) +register_bindings_type_cast <- function() { + register_binding("arrow::cast", cast) # as.* type casting functions # as.factor() is mapped in expression.R diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 4dadff54b48..a66db112d98 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -59,13 +59,17 @@ NULL #' summarise) because the data mask has to be a list. #' @param registry An environment in which the functions should be #' assigned. -#' +#' @param notes string for the docs: note any limitations or differences in +#' behavior between the Arrow version and the R function. #' @return The previously registered binding or `NULL` if no previously #' registered function existed. #' @keywords internal #' -register_binding <- function(fun_name, fun, registry = nse_funcs, - update_cache = FALSE) { +register_binding <- function(fun_name, + fun, + registry = nse_funcs, + update_cache = FALSE, + notes = character(0)) { unqualified_name <- sub("^.*?:{+}", "", fun_name) previous_fun <- registry[[unqualified_name]] @@ -76,7 +80,8 @@ register_binding <- function(fun_name, fun, registry = nse_funcs, paste0( "A \"", unqualified_name, - "\" binding already exists in the registry and will be overwritten.") + "\" binding already exists in the registry and will be overwritten." + ) ) } @@ -85,6 +90,8 @@ register_binding <- function(fun_name, fun, registry = nse_funcs, registry[[unqualified_name]] <- fun registry[[fun_name]] <- fun + .cache$docs[[fun_name]] <- notes + if (update_cache) { fun_cache <- .cache$functions fun_cache[[unqualified_name]] <- fun @@ -131,7 +138,7 @@ call_binding_agg <- function(fun_name, ...) 
{ # Called in .onLoad() create_binding_cache <- function() { - arrow_funcs <- list() + .cache$docs <- list() # Register all available Arrow Compute functions, namespaced as arrow_fun. all_arrow_funs <- list_compute_functions() diff --git a/r/R/expression.R b/r/R/expression.R index 09a8ea24608..7a5a600d956 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -76,7 +76,6 @@ "lubridate::yday" = "day_of_year", "lubridate::year" = "year", "lubridate::leap_year" = "is_leap_year" - ) .binary_function_map <- list( @@ -158,13 +157,9 @@ Expression <- R6Class("Expression", compute___expr__type_id(self, schema) }, cast = function(to_type, safe = TRUE, ...) { - opts <- list( - to_type = to_type, - allow_int_overflow = !safe, - allow_time_truncate = !safe, - allow_float_truncate = !safe - ) - Expression$create("cast", self, options = modifyList(opts, list(...))) + opts <- cast_options(safe, ...) + opts$to_type <- as_type(to_type) + Expression$create("cast", self, options = opts) } ), active = list( diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index dfb0998ddff..70bd7ac518c 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -216,6 +216,7 @@ reference: - codec_is_available - title: Computation contents: + - acero - call_function - match_arrow - value_counts diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R new file mode 100644 index 00000000000..ef39bec272f --- /dev/null +++ b/r/data-raw/docgen.R @@ -0,0 +1,192 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This code generates dplyr-funcs-doc.R. +# It requires that the package be installed. + +file_template <- "# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# \"License\"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Generated by using data-raw/docgen.R -> do not edit by hand + +#' Functions available in Arrow dplyr queries +#' +#' The `arrow` package contains methods for %s `dplyr` table functions, many of +#' which are \"verbs\" that do transformations to one or more tables. +#' The package also has mappings of %s R functions to the corresponding +#' functions in the Arrow compute library. These allow you to write code inside +#' of `dplyr` methods that call R functions, including many in packages like +#' `stringr` and `lubridate`, and they will get translated to Arrow and run +#' on the Arrow query engine (Acero). This document lists all of the mapped +#' functions. 
+#' +#' # `dplyr` verbs +#' +#' Most verb functions return an `arrow_dplyr_query` object, similar in spirit +#' to a `dbplyr::tbl_lazy`. This means that the verbs do not eagerly evaluate +#' the query on the data. To run the query, call either `compute()`, +#' which returns an `arrow` [Table], or `collect()`, which pulls the resulting +#' Table into an R `data.frame`. +#' +%s +#' +#' # Function mappings +#' +#' In the list below, any differences in behavior or support between Acero and +#' the R function are listed. If no notes follow the function name, then you +#' can assume that the function works in Acero just as it does in R. +#' +#' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both +#' `str_sub()` and `stringr::str_sub()` work. +#' +#' In addition to these functions, you can call any of Arrow's %s compute +#' functions directly. Arrow has many functions that don't map to an existing R +#' function. In other cases where there is an R function mapping, you can still +#' call the Arrow function directly if you don't want the adaptations that the R +#' mapping has that make Acero behave like R. These functions are listed in the +#' [C++ documentation](https://arrow.apache.org/docs/cpp/compute.html), and +#' in the function registry in R, they are named with an `arrow_` prefix, such +#' as `arrow_ascii_is_decimal`. 
+#'
+%s
+#'
+#' @name acero
+NULL"
+
+library(dplyr)
+library(purrr)
+
+# Functions that for whatever reason cause xref problems, so don't hyperlink
+do_not_link <- c(
+  "stringr::str_like" # Still only in the unreleased version
+)
+
+# Render one markdown bullet per function (vectorized over all arguments).
+#
+# fun:     bare function or operator names, e.g. "str_sub" or "!="
+# pkg_fun: the matching link targets, e.g. "stringr::str_sub"
+# notes:   one string per function; "" means there is nothing to note
+#
+# Returns a character vector of "* ..." bullet lines.
+render_fun <- function(fun, pkg_fun, notes) {
+  # Add () to fun if it's not an operator (operators don't start with a letter)
+  not_operators <- grepl("^[[:alpha:]]", fun)
+  fun[not_operators] <- paste0(fun[not_operators], "()")
+  # Backticks, so that the name is rendered as code
+  fun <- paste0("`", fun, "`")
+  # Turn it into a markdown link, except for the entries in do_not_link
+  out <- ifelse(
+    pkg_fun %in% do_not_link,
+    fun,
+    paste0("[", fun, "][", pkg_fun, "()]")
+  )
+  # Add notes after :, if exist
+  has_notes <- nzchar(notes)
+  out[has_notes] <- paste0(out[has_notes], ": ", notes[has_notes])
+  # Make bullets
+  paste("*", out)
+}
+
+# Render the bulleted list for one package under a "## pkg" heading.
+#
+# df:  rows of fun_df for a single package (columns fun, pkg_fun, notes)
+# pkg: the package name used for the heading
+#
+# Returns a single string in which every line carries the roxygen "#'" prefix.
+render_pkg <- function(df, pkg) {
+  bullets <- df %>%
+    transmute(render_fun(fun, pkg_fun, notes)) %>%
+    pull()
+  # Add header
+  bullets <- c(
+    paste("##", pkg),
+    "",
+    bullets
+  )
+  paste("#'", bullets, collapse = "\n")
+}
+
+# Notes for each mapped function, keyed by function name.
+# NOTE(review): presumably filled in via register_binding(notes = ) -- confirm.
+docs <- arrow:::.cache$docs
+
+# Add some functions
+
+# across() is handled by manipulating the quosures, not by nse_funcs
+docs[["dplyr::across"]] <- c(
+  # TODO(ARROW-17387, ARROW-17389, ARROW-17390)
+  "only supported inside `mutate()`, `summarize()`, and `arrange()`;",
+  # TODO(ARROW-17366)
+  "purrr-style lambda functions",
+  "and use of `where()` selection helper not yet supported"
+)
+# desc() is a special helper handled inside of arrange()
+docs[["dplyr::desc"]] <- character(0)
+
+# add tidyselect helpers by parsing the reexports file
+tidyselect <- grep(
+  "^tidyselect::",
+  readLines("R/reexports-tidyselect.R"),
+  value = TRUE
+)
+
+# The NULLs survive here because they are elements of a list, not bare
+# arguments to c()
+docs <- c(docs, setNames(rep(list(NULL), length(tidyselect)), tidyselect))
+
+fun_df <- tibble::tibble(
+  pkg_fun = names(docs),
+  notes = docs
+) %>%
+  mutate(
+    has_pkg = grepl("::", pkg_fun),
+    # Split "pkg::fun" at the colons. Names without any colon (the operators)
+    # are returned unchanged by sub().
+    fun = sub("^[^:]*:+", "", pkg_fun),
+    pkg = sub(":+.*$", "", pkg_fun),
+    # We will list operators under "base" (everything else must be pkg::fun)
+    pkg = if_else(has_pkg, pkg, "base"),
+    # Flatten notes to a single string
+    notes = map_chr(notes, ~ paste(., collapse = " "))
+  ) %>%
+  arrange(pkg, fun)
+
+# Group by package name and render the lists
+fun_doclets <- imap_chr(split(fun_df, fun_df$pkg), render_pkg)
+
+dplyr_verbs <- c(
+  arrow:::supported_dplyr_methods,
+  # Because this only has a method for arrow_dplyr_query, it's not in the main
+  # list. It must be wrapped in list(): c() silently drops a bare NULL
+  # argument, which would leave tbl_vars out of the docs and the count.
+  list(tbl_vars = NULL)
+)
+
+verb_bullets <- tibble::tibble(
+  fun = names(dplyr_verbs),
+  notes = dplyr_verbs
+) %>%
+  mutate(
+    pkg_fun = paste0("dplyr::", fun),
+    notes = map_chr(notes, ~ paste(., collapse = " "))
+  ) %>%
+  arrange(fun) %>%
+  transmute(render_fun(fun, pkg_fun, notes)) %>%
+  pull()
+
+# Fill the five %s slots of the template, in order: number of dplyr verbs,
+# number of mapped functions, the verb bullets, number of Arrow compute
+# functions, and the per-package function lists.
+writeLines(
+  sprintf(
+    file_template,
+    length(dplyr_verbs),
+    length(docs),
+    paste("#'", verb_bullets, collapse = "\n"),
+    length(arrow::list_compute_functions()),
+    paste(fun_doclets, collapse = "\n#'\n")
+  ),
+  "R/dplyr-funcs-doc.R"
+)
diff --git a/r/man/acero.Rd b/r/man/acero.Rd
new file mode 100644
index 00000000000..5b5920f386e
--- /dev/null
+++ b/r/man/acero.Rd
@@ -0,0 +1,339 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr-funcs-doc.R
+\name{acero}
+\alias{acero}
+\title{Functions available in Arrow dplyr queries}
+\description{
+The \code{arrow} package contains methods for 32 \code{dplyr} table functions, many of
+which are "verbs" that do transformations to one or more tables.
+The package also has mappings of 205 R functions to the corresponding
+functions in the Arrow compute library. These allow you to write code inside
+of \code{dplyr} methods that call R functions, including many in packages like
+\code{stringr} and \code{lubridate}, and they will get translated to Arrow and run
+on the Arrow query engine (Acero). This document lists all of the mapped
+functions.
+} +\section{\code{dplyr} verbs}{ +Most verb functions return an \code{arrow_dplyr_query} object, similar in spirit +to a \code{dbplyr::tbl_lazy}. This means that the verbs do not eagerly evaluate +the query on the data. To run the query, call either \code{compute()}, +which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting +Table into an R \code{data.frame}. +\itemize{ +\item \code{\link[dplyr:filter-joins]{anti_join()}} +\item \code{\link[dplyr:arrange]{arrange()}} +\item \code{\link[dplyr:compute]{collapse()}} +\item \code{\link[dplyr:compute]{collect()}} +\item \code{\link[dplyr:compute]{compute()}} +\item \code{\link[dplyr:count]{count()}} +\item \code{\link[dplyr:distinct]{distinct()}} +\item \code{\link[dplyr:explain]{explain()}} +\item \code{\link[dplyr:filter]{filter()}} +\item \code{\link[dplyr:mutate-joins]{full_join()}} +\item \code{\link[dplyr:glimpse]{glimpse()}} +\item \code{\link[dplyr:group_by]{group_by()}} +\item \code{\link[dplyr:group_by_drop_default]{group_by_drop_default()}} +\item \code{\link[dplyr:group_data]{group_vars()}} +\item \code{\link[dplyr:group_data]{groups()}} +\item \code{\link[dplyr:mutate-joins]{inner_join()}} +\item \code{\link[dplyr:mutate-joins]{left_join()}} +\item \code{\link[dplyr:mutate]{mutate()}} +\item \code{\link[dplyr:pull]{pull()}} +\item \code{\link[dplyr:relocate]{relocate()}} +\item \code{\link[dplyr:rename]{rename()}} +\item \code{\link[dplyr:rename]{rename_with()}} +\item \code{\link[dplyr:mutate-joins]{right_join()}} +\item \code{\link[dplyr:select]{select()}} +\item \code{\link[dplyr:filter-joins]{semi_join()}} +\item \code{\link[dplyr:explain]{show_query()}} +\item \code{\link[dplyr:summarise]{summarise()}} +\item \code{\link[dplyr:count]{tally()}} +\item \code{\link[dplyr:mutate]{transmute()}} +\item \code{\link[dplyr:group_by]{ungroup()}} +\item \code{\link[dplyr:reexports]{union()}} +\item \code{\link[dplyr:setops]{union_all()}} +} +} + +\section{Function mappings}{ +In 
the list below, any differences in behavior or support between Acero and +the R function are listed. If no notes follow the function name, then you +can assume that the function works in Acero just as it does in R. + +Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both +\code{str_sub()} and \code{stringr::str_sub()} work. + +In addition to these functions, you can call any of Arrow's 243 compute +functions directly. Arrow has many functions that don't map to an existing R +function. In other cases where there is an R function mapping, you can still +call the Arrow function directly if you don't want the adaptations that the R +mapping has that make Acero behave like R. These functions are listed in the +\href{https://arrow.apache.org/docs/cpp/compute.html}{C++ documentation}, and +in the function registry in R, they are named with an \code{arrow_} prefix, such +as \code{arrow_ascii_is_decimal}. +\subsection{arrow}{ +\itemize{ +\item \code{\link[=add_filename]{add_filename()}} +\item \code{\link[=cast]{cast()}} +} +} + +\subsection{base}{ +\itemize{ +\item \code{\link[=-]{-}} +\item \code{\link[=!]{!}} +\item \code{\link[=!=]{!=}} +\item \code{\link[=*]{*}} +\item \code{\link[=/]{/}} +\item \code{\link[=&]{&}} +\item \code{\link[=\%/\%]{\%/\%}} +\item \code{\link[=\%\%]{\%\%}} +\item \code{\link[=\%in\%]{\%in\%}} +\item \code{\link[=^]{^}} +\item \code{\link[=+]{+}} +\item \code{\link[=<]{<}} +\item \code{\link[=<=]{<=}} +\item \code{\link[===]{==}} +\item \code{\link[=>]{>}} +\item \code{\link[=>=]{>=}} +\item \code{\link[=|]{|}} +\item \code{\link[base:MathFun]{abs()}} +\item \code{\link[base:Trig]{acos()}} +\item \code{\link[base:all]{all()}} +\item \code{\link[base:any]{any()}} +\item \code{\link[base:character]{as.character()}} +\item \code{\link[base:as.Date]{as.Date()}} +\item \code{\link[base:difftime]{as.difftime()}} +\item \code{\link[base:double]{as.double()}} +\item \code{\link[base:integer]{as.integer()}} +\item 
\code{\link[base:logical]{as.logical()}} +\item \code{\link[base:numeric]{as.numeric()}} +\item \code{\link[base:Trig]{asin()}} +\item \code{\link[base:Round]{ceiling()}} +\item \code{\link[base:Trig]{cos()}} +\item \code{\link[base:data.frame]{data.frame()}} +\item \code{\link[base:difftime]{difftime()}} +\item \code{\link[base:startsWith]{endsWith()}} +\item \code{\link[base:Log]{exp()}} +\item \code{\link[base:Round]{floor()}} +\item \code{\link[base:format]{format()}} +\item \code{\link[base:grep]{grepl()}} +\item \code{\link[base:grep]{gsub()}} +\item \code{\link[base:ifelse]{ifelse()}} +\item \code{\link[base:character]{is.character()}} +\item \code{\link[base:double]{is.double()}} +\item \code{\link[base:factor]{is.factor()}} +\item \code{\link[base:is.finite]{is.finite()}} +\item \code{\link[base:is.finite]{is.infinite()}} +\item \code{\link[base:integer]{is.integer()}} +\item \code{\link[base:list]{is.list()}} +\item \code{\link[base:logical]{is.logical()}} +\item \code{\link[base:NA]{is.na()}} +\item \code{\link[base:is.finite]{is.nan()}} +\item \code{\link[base:numeric]{is.numeric()}} +\item \code{\link[base:ISOdatetime]{ISOdate()}} +\item \code{\link[base:ISOdatetime]{ISOdatetime()}} +\item \code{\link[base:Log]{log()}} +\item \code{\link[base:Log]{log10()}} +\item \code{\link[base:Log]{log1p()}} +\item \code{\link[base:Log]{log2()}} +\item \code{\link[base:Log]{logb()}} +\item \code{\link[base:Extremes]{max()}} +\item \code{\link[base:mean]{mean()}} +\item \code{\link[base:Extremes]{min()}} +\item \code{\link[base:nchar]{nchar()}} +\item \code{\link[base:paste]{paste()}}: the \code{collapse} argument is not yet supported +\item \code{\link[base:paste]{paste0()}}: the \code{collapse} argument is not yet supported +\item \code{\link[base:Extremes]{pmax()}} +\item \code{\link[base:Extremes]{pmin()}} +\item \code{\link[base:Round]{round()}} +\item \code{\link[base:sign]{sign()}} +\item \code{\link[base:Trig]{sin()}} +\item 
\code{\link[base:MathFun]{sqrt()}} +\item \code{\link[base:startsWith]{startsWith()}} +\item \code{\link[base:strptime]{strftime()}} +\item \code{\link[base:strptime]{strptime()}} +\item \code{\link[base:strrep]{strrep()}} +\item \code{\link[base:strsplit]{strsplit()}} +\item \code{\link[base:grep]{sub()}} +\item \code{\link[base:substr]{substr()}} +\item \code{\link[base:substr]{substring()}} +\item \code{\link[base:sum]{sum()}} +\item \code{\link[base:Trig]{tan()}} +\item \code{\link[base:chartr]{tolower()}} +\item \code{\link[base:chartr]{toupper()}} +\item \code{\link[base:Round]{trunc()}} +} +} + +\subsection{bit64}{ +\itemize{ +\item \code{\link[bit64:as.integer64.character]{as.integer64()}} +\item \code{\link[bit64:bit64-package]{is.integer64()}} +} +} + +\subsection{dplyr}{ +\itemize{ +\item \code{\link[dplyr:across]{across()}}: only supported inside \code{mutate()}, \code{summarize()}, and \code{arrange()}; purrr-style lambda functions and use of \code{where()} selection helper not yet supported +\item \code{\link[dplyr:between]{between()}} +\item \code{\link[dplyr:case_when]{case_when()}} +\item \code{\link[dplyr:coalesce]{coalesce()}} +\item \code{\link[dplyr:desc]{desc()}} +\item \code{\link[dplyr:if_else]{if_else()}} +\item \code{\link[dplyr:context]{n()}} +\item \code{\link[dplyr:n_distinct]{n_distinct()}} +} +} + +\subsection{lubridate}{ +\itemize{ +\item \code{\link[lubridate:am]{am()}} +\item \code{\link[lubridate:as_date]{as_date()}} +\item \code{\link[lubridate:as_date]{as_datetime()}} +\item \code{\link[lubridate:round_date]{ceiling_date()}} +\item \code{\link[lubridate:date]{date()}} +\item \code{\link[lubridate:date_decimal]{date_decimal()}} +\item \code{\link[lubridate:day]{day()}} +\item \code{\link[lubridate:duration]{ddays()}} +\item \code{\link[lubridate:decimal_date]{decimal_date()}} +\item \code{\link[lubridate:duration]{dhours()}} +\item \code{\link[lubridate:duration]{dmicroseconds()}} +\item 
\code{\link[lubridate:duration]{dmilliseconds()}} +\item \code{\link[lubridate:duration]{dminutes()}} +\item \code{\link[lubridate:duration]{dmonths()}} +\item \code{\link[lubridate:ymd]{dmy()}} +\item \code{\link[lubridate:ymd_hms]{dmy_h()}} +\item \code{\link[lubridate:ymd_hms]{dmy_hm()}} +\item \code{\link[lubridate:ymd_hms]{dmy_hms()}} +\item \code{\link[lubridate:duration]{dnanoseconds()}} +\item \code{\link[lubridate:duration]{dpicoseconds()}} +\item \code{\link[lubridate:duration]{dseconds()}} +\item \code{\link[lubridate:dst]{dst()}} +\item \code{\link[lubridate:duration]{dweeks()}} +\item \code{\link[lubridate:duration]{dyears()}} +\item \code{\link[lubridate:ymd]{dym()}} +\item \code{\link[lubridate:week]{epiweek()}} +\item \code{\link[lubridate:year]{epiyear()}} +\item \code{\link[lubridate:parse_date_time]{fast_strptime()}} +\item \code{\link[lubridate:round_date]{floor_date()}} +\item \code{\link[lubridate:format_ISO8601]{format_ISO8601()}} +\item \code{\link[lubridate:hour]{hour()}} +\item \code{\link[lubridate:date_utils]{is.Date()}} +\item \code{\link[lubridate:is.instant]{is.instant()}} +\item \code{\link[lubridate:posix_utils]{is.POSIXct()}} +\item \code{\link[lubridate:is.instant]{is.timepoint()}} +\item \code{\link[lubridate:week]{isoweek()}} +\item \code{\link[lubridate:year]{isoyear()}} +\item \code{\link[lubridate:leap_year]{leap_year()}} +\item \code{\link[lubridate:make_datetime]{make_date()}} +\item \code{\link[lubridate:make_datetime]{make_datetime()}} +\item \code{\link[lubridate:make_difftime]{make_difftime()}} +\item \code{\link[lubridate:day]{mday()}} +\item \code{\link[lubridate:ymd]{mdy()}} +\item \code{\link[lubridate:ymd_hms]{mdy_h()}} +\item \code{\link[lubridate:ymd_hms]{mdy_hm()}} +\item \code{\link[lubridate:ymd_hms]{mdy_hms()}} +\item \code{\link[lubridate:minute]{minute()}} +\item \code{\link[lubridate:month]{month()}} +\item \code{\link[lubridate:ymd]{my()}} +\item \code{\link[lubridate:ymd]{myd()}} +\item 
\code{\link[lubridate:parse_date_time]{parse_date_time()}} +\item \code{\link[lubridate:am]{pm()}} +\item \code{\link[lubridate:day]{qday()}} +\item \code{\link[lubridate:quarter]{quarter()}} +\item \code{\link[lubridate:round_date]{round_date()}} +\item \code{\link[lubridate:second]{second()}} +\item \code{\link[lubridate:quarter]{semester()}} +\item \code{\link[lubridate:tz]{tz()}} +\item \code{\link[lubridate:day]{wday()}} +\item \code{\link[lubridate:week]{week()}} +\item \code{\link[lubridate:day]{yday()}} +\item \code{\link[lubridate:ymd]{ydm()}} +\item \code{\link[lubridate:ymd_hms]{ydm_h()}} +\item \code{\link[lubridate:ymd_hms]{ydm_hm()}} +\item \code{\link[lubridate:ymd_hms]{ydm_hms()}} +\item \code{\link[lubridate:year]{year()}} +\item \code{\link[lubridate:ymd]{ym()}} +\item \code{\link[lubridate:ymd]{ymd()}} +\item \code{\link[lubridate:ymd_hms]{ymd_h()}} +\item \code{\link[lubridate:ymd_hms]{ymd_hm()}} +\item \code{\link[lubridate:ymd_hms]{ymd_hms()}} +\item \code{\link[lubridate:ymd]{yq()}} +} +} + +\subsection{methods}{ +\itemize{ +\item \code{\link[methods:is]{is()}} +} +} + +\subsection{rlang}{ +\itemize{ +\item \code{\link[rlang:type-predicates]{is_character()}} +\item \code{\link[rlang:type-predicates]{is_double()}} +\item \code{\link[rlang:type-predicates]{is_integer()}} +\item \code{\link[rlang:type-predicates]{is_list()}} +\item \code{\link[rlang:type-predicates]{is_logical()}} +} +} + +\subsection{stats}{ +\itemize{ +\item \code{\link[stats:median]{median()}} +\item \code{\link[stats:quantile]{quantile()}} +\item \code{\link[stats:sd]{sd()}} +\item \code{\link[stats:cor]{var()}} +} +} + +\subsection{stringi}{ +\itemize{ +\item \code{\link[stringi:stri_reverse]{stri_reverse()}} +} +} + +\subsection{stringr}{ +\itemize{ +\item \code{\link[stringr:str_c]{str_c()}}: the \code{collapse} argument is not yet supported +\item \code{\link[stringr:str_count]{str_count()}} +\item \code{\link[stringr:str_detect]{str_detect()}} +\item 
\code{\link[stringr:str_dup]{str_dup()}} +\item \code{\link[stringr:str_starts]{str_ends()}} +\item \code{\link[stringr:str_length]{str_length()}} +\item \code{str_like()}: not yet in a released version of \code{stringr}, but it is supported in \code{arrow} +\item \code{\link[stringr:str_pad]{str_pad()}} +\item \code{\link[stringr:str_replace]{str_replace()}} +\item \code{\link[stringr:str_replace]{str_replace_all()}} +\item \code{\link[stringr:str_split]{str_split()}} +\item \code{\link[stringr:str_starts]{str_starts()}} +\item \code{\link[stringr:str_sub]{str_sub()}} +\item \code{\link[stringr:case]{str_to_lower()}} +\item \code{\link[stringr:case]{str_to_title()}} +\item \code{\link[stringr:case]{str_to_upper()}} +\item \code{\link[stringr:str_trim]{str_trim()}} +} +} + +\subsection{tibble}{ +\itemize{ +\item \code{\link[tibble:tibble]{tibble()}} +} +} + +\subsection{tidyselect}{ +\itemize{ +\item \code{\link[tidyselect:all_of]{all_of()}} +\item \code{\link[tidyselect:starts_with]{contains()}} +\item \code{\link[tidyselect:starts_with]{ends_with()}} +\item \code{\link[tidyselect:everything]{everything()}} +\item \code{\link[tidyselect:everything]{last_col()}} +\item \code{\link[tidyselect:starts_with]{matches()}} +\item \code{\link[tidyselect:starts_with]{num_range()}} +\item \code{\link[tidyselect:one_of]{one_of()}} +\item \code{\link[tidyselect:starts_with]{starts_with()}} +} +} +} + diff --git a/r/man/add_filename.Rd b/r/man/add_filename.Rd new file mode 100644 index 00000000000..ca7ed0e4b17 --- /dev/null +++ b/r/man/add_filename.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-funcs-augmented.R +\name{add_filename} +\alias{add_filename} +\title{Add the data filename as a column} +\usage{ +add_filename() +} +\value{ +A \code{FieldRef} \code{Expression} that refers to the filename augmented +column. 
+} +\description{ +This function only exists inside \code{arrow} \code{dplyr} queries, and it only is +valid when querying on a \code{FileSystemDataset}. +} +\examples{ +\dontrun{ +open_dataset("nyc-taxi") \%>\% + mutate(file = add_filename()) +} +} +\keyword{internal} diff --git a/r/man/cast.Rd b/r/man/cast.Rd new file mode 100644 index 00000000000..88134f2e022 --- /dev/null +++ b/r/man/cast.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-funcs-type.R +\name{cast} +\alias{cast} +\title{Change the type of an array or column} +\usage{ +cast(x, to, safe = TRUE, ...) +} +\arguments{ +\item{x}{an \code{Array}, \code{Table}, \code{Expression}, or similar Arrow data object.} + +\item{to}{\link{DataType} to cast to; for \link{Table} and \link{RecordBatch}, +it should be a \link{Schema}.} + +\item{safe}{logical: only allow the type conversion if no data is lost +(truncation, overflow, etc.). Default is \code{TRUE}} + +\item{...}{specific \code{CastOptions} to set} +} +\value{ +an \code{Expression} +} +\description{ +This is a wrapper around the \verb{$cast()} method that many Arrow objects have. +It is more convenient to call inside \code{dplyr} pipelines than the method. +} +\examples{ +\dontrun{ +mtcars \%>\% + arrow_table() \%>\% + mutate(cyl = cast(cyl, string())) +} +} +\seealso{ +https://arrow.apache.org/docs/cpp/api/compute.html for the list of +supported CastOptions. 
+} +\keyword{internal} diff --git a/r/man/register_binding.Rd b/r/man/register_binding.Rd index c53df707516..d2a4a380543 100644 --- a/r/man/register_binding.Rd +++ b/r/man/register_binding.Rd @@ -4,7 +4,13 @@ \alias{register_binding} \title{Register compute bindings} \usage{ -register_binding(fun_name, fun, registry = nse_funcs, update_cache = FALSE) +register_binding( + fun_name, + fun, + registry = nse_funcs, + update_cache = FALSE, + notes = character(0) +) } \arguments{ \item{fun_name}{A string containing a function name in the form \code{"function"} or @@ -26,6 +32,9 @@ non-aggregate functions could be revisited...it is currently used as the data mask in mutate, filter, and aggregate (but not summarise) because the data mask has to be a list.} +\item{notes}{string for the docs: note any limitations or differences in +behavior between the Arrow version and the R function.} + \item{agg_fun}{An aggregate function or \code{NULL} to un-register a previous aggregate function. This function must accept \code{Expression} objects as arguments and return a \code{list()} with components: