From ce5b9671070a17e4aafe16c55a310da872e4d6d4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 08:54:25 +0100 Subject: [PATCH 01/20] Add simulate_data_frame helper function --- r/R/util.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/r/R/util.R b/r/R/util.R index d8ecb2732f2..e2f8f51ad7c 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -251,3 +251,17 @@ augment_io_error_msg <- function(e, call, schema = NULL, format = NULL) { handle_augmented_field_misuse(msg, call) abort(msg, call = call) } + +simulate_data_frame <- function(schema) { + arrays <- lapply(schema$fields, function(field) concat_arrays(type = field$type)) + vectors <- lapply( + arrays, + function(array) tryCatch( + as.vector(array), + error = function(...) vctrs::unspecified() + ) + ) + + names(vectors) <- names(schema) + tibble::new_tibble(vectors, nrow = 0) +} From 4b3572624d52bc1bb3efbcf6663cc28b00170dae Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 08:54:44 +0100 Subject: [PATCH 02/20] Use eval_select instead of vars_select --- r/R/dplyr-select.R | 49 ++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index 6e7dc7a1aa2..0d9ebd3fec4 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -22,34 +22,30 @@ tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns) select.arrow_dplyr_query <- function(.data, ...) { check_select_helpers(enexprs(...)) - column_select(as_adq(.data), !!!enquos(...)) + .data <- as_adq(.data) + + sim_df <- simulate_data_frame(.data$.data$schema) + out <- eval_select(expr(c(...)), sim_df) + + .data$selected_columns <- set_names(.data$selected_columns[out], names(out)) + + renamed <- out[names(out) != out] + if (length(renamed)) { + # Massage group_by + gbv <- .data$group_by_vars + renamed_groups <- gbv %in% renamed + gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] + .data$group_by_vars <- gbv + } + .data } select.Dataset <- select.ArrowTabular <- select.RecordBatchReader <- select.arrow_dplyr_query rename.arrow_dplyr_query <- function(.data, ...) { check_select_helpers(enexprs(...)) - column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename) -} -rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query - -rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...) { - .fn <- as_function(.fn) - old_names <- names(dplyr::select(.data, {{ .cols }})) - dplyr::rename(.data, !!set_names(old_names, .fn(old_names))) -} -rename_with.Dataset <- rename_with.ArrowTabular <- rename_with.RecordBatchReader <- rename_with.arrow_dplyr_query - -column_select <- function(.data, ..., .FUN = vars_select) { - # .FUN is either tidyselect::vars_select or tidyselect::vars_rename - # It operates on the names() of selected_columns, i.e. the column names - # factoring in any renaming that may already have happened - out <- .FUN(names(.data), !!!enquos(...)) - # Make sure that the resulting selected columns map back to the original data, - # as in when there are multiple renaming steps + .data <- as_adq(.data) + out <- vars_rename(names(.data), !!!enquos(...)) .data$selected_columns <- set_names(.data$selected_columns[out], names(out)) - - # If we've renamed columns, we need to project that renaming into other - # query parameters we've collected renamed <- out[names(out) != out] if (length(renamed)) { # Massage group_by @@ -57,10 +53,17 @@ column_select <- function(.data, ..., .FUN = vars_select) { renamed_groups <- gbv %in% renamed gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] .data$group_by_vars <- gbv - # No need to massage filters because those contain references to Arrow objects } .data } +rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query + +rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...) { + .fn <- as_function(.fn) + old_names <- names(dplyr::select(.data, {{ .cols }})) + dplyr::rename(.data, !!set_names(old_names, .fn(old_names))) +} +rename_with.Dataset <- rename_with.ArrowTabular <- rename_with.RecordBatchReader <- rename_with.arrow_dplyr_query relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) { # The code in this function is adapted from the code in dplyr::relocate.data.frame From 2fcab492e3d8bdc5acec18e1b1aa2b9072381af4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 08:56:10 +0100 Subject: [PATCH 03/20] Enable test for where() --- r/tests/testthat/test-dplyr-select.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-dplyr-select.R b/r/tests/testthat/test-dplyr-select.R index 98dcd6396d9..fd20b26e8cb 100644 --- a/r/tests/testthat/test-dplyr-select.R +++ b/r/tests/testthat/test-dplyr-select.R @@ -87,15 +87,14 @@ test_that("select/rename/rename_with using selection helpers", { collect(), tbl ) - expect_error( - compare_dplyr_binding( - .input %>% - select(where(is.numeric)) %>% - collect(), - tbl - ), - "Unsupported selection helper" + + compare_dplyr_binding( + .input %>% + select(where(is.numeric)) %>% + collect(), + tbl ) + compare_dplyr_binding( .input %>% rename_with(toupper) %>% From aa2ed7ee106b523fe00373533660eee1b26cc9e3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 08:56:59 +0100 Subject: [PATCH 04/20] Don't check for usage of where() --- r/R/dplyr-select.R | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index 0d9ebd3fec4..e7df106cea4 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -21,7 +21,7 @@ tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns) select.arrow_dplyr_query <- function(.data, ...) { - check_select_helpers(enexprs(...)) + .data <- as_adq(.data) sim_df <- simulate_data_frame(.data$.data$schema) @@ -42,7 +42,6 @@ select.arrow_dplyr_query <- function(.data, ...) { select.Dataset <- select.ArrowTabular <- select.RecordBatchReader <- select.arrow_dplyr_query rename.arrow_dplyr_query <- function(.data, ...) { - check_select_helpers(enexprs(...)) .data <- as_adq(.data) out <- vars_rename(names(.data), !!!enquos(...)) .data$selected_columns <- set_names(.data$selected_columns[out], names(out)) @@ -117,19 +116,3 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL .data } relocate.Dataset <- relocate.ArrowTabular <- relocate.RecordBatchReader <- relocate.arrow_dplyr_query - -check_select_helpers <- function(exprs) { - # Throw an error if unsupported tidyselect selection helpers in `exprs` - exprs <- lapply(exprs, function(x) if (is_quosure(x)) quo_get_expr(x) else x) - unsup_select_helpers <- "where" - funs_in_exprs <- unlist(lapply(exprs, all_funs)) - unsup_funs <- funs_in_exprs[funs_in_exprs %in% unsup_select_helpers] - if (length(unsup_funs)) { - stop( - "Unsupported selection ", - ngettext(length(unsup_funs), "helper: ", "helpers: "), - oxford_paste(paste0(unsup_funs, "()"), quote = FALSE), - call. = FALSE - ) - } -} From b3663da4642c91d57180ae2eb3df817d3840ebd5 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 09:01:10 +0100 Subject: [PATCH 05/20] Import eval_rename instead of vars_rename --- r/NAMESPACE | 2 +- r/R/arrow-package.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index e20e61c0e32..6d35a6607b2 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -475,6 +475,7 @@ importFrom(stats,quantile) importFrom(tidyselect,all_of) importFrom(tidyselect,contains) importFrom(tidyselect,ends_with) +importFrom(tidyselect,eval_rename) importFrom(tidyselect,eval_select) importFrom(tidyselect,everything) importFrom(tidyselect,last_col) @@ -483,7 +484,6 @@ importFrom(tidyselect,num_range) importFrom(tidyselect,one_of) importFrom(tidyselect,starts_with) importFrom(tidyselect,vars_pull) -importFrom(tidyselect,vars_rename) importFrom(tidyselect,vars_select) importFrom(utils,capture.output) importFrom(utils,getFromNamespace) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 143f4c191bd..ec8ce2a4361 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -27,7 +27,7 @@ #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args #' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure #' @importFrom rlang new_quosures expr_text -#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select +#' @importFrom tidyselect vars_pull vars_select eval_select eval_rename #' @importFrom glue glue #' @useDynLib arrow, .registration = TRUE #' @keywords internal From 7827b50d574ddf4ebcbb5f803186db0b554bedef Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 10:55:48 +0100 Subject: [PATCH 06/20] Reimplement column_select --- r/R/dplyr-select.R | 67 +++++++++++++++++----------- r/tests/testthat/test-dplyr-select.R | 29 ++++++++++++ 2 files changed, 71 insertions(+), 25 deletions(-) diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index e7df106cea4..a680f8fdc27 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -21,41 +21,49 @@ tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns) select.arrow_dplyr_query <- function(.data, ...) { + column_select(.data, enquos(...), op = "select") +} +select.Dataset <- select.ArrowTabular <- select.RecordBatchReader <- select.arrow_dplyr_query - .data <- as_adq(.data) +rename.arrow_dplyr_query <- function(.data, ...) { + column_select(.data, enquos(...), op = "rename") +} +rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query - sim_df <- simulate_data_frame(.data$.data$schema) - out <- eval_select(expr(c(...)), sim_df) +column_select <- function(.data, select_expression, op = c("select", "rename")) { - .data$selected_columns <- set_names(.data$selected_columns[out], names(out)) + op <- match.arg(op) - renamed <- out[names(out) != out] - if (length(renamed)) { - # Massage group_by - gbv <- .data$group_by_vars - renamed_groups <- gbv %in% renamed - gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] - .data$group_by_vars <- gbv + .data <- as_adq(.data) + sim_df <- simulate_data_frame(implicit_schema(.data)) + old_names <- names(sim_df) + + if (op == "select") { + out <- eval_select(expr(c(!!!select_expression)), sim_df) + # select only columns from `out` + subset <- out + } else if (op == "rename") { + out <- eval_rename(expr(c(!!!select_expression)), sim_df) + # select all columns as only renaming + subset <- set_names(seq_along(old_names), old_names) + names(subset)[out] <- names(out) } - .data -} -select.Dataset <- select.ArrowTabular <- select.RecordBatchReader <- select.arrow_dplyr_query -rename.arrow_dplyr_query <- function(.data, ...) { - .data <- as_adq(.data) - out <- vars_rename(names(.data), !!!enquos(...)) - .data$selected_columns <- set_names(.data$selected_columns[out], names(out)) - renamed <- out[names(out) != out] + .data$selected_columns <- set_names(.data$selected_columns[subset], names(subset)) + + # check if names have updated + new_names <- old_names + new_names[out] <- names(out) + names_compared <- set_names(old_names, new_names) + renamed <- names_compared[names(names_compared) != names_compared] + if (length(renamed)) { - # Massage group_by - gbv <- .data$group_by_vars - renamed_groups <- gbv %in% renamed - gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] - .data$group_by_vars <- gbv + .data <- update_group_names(.data, renamed) } .data + + } -rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...) { .fn <- as_function(.fn) @@ -116,3 +124,12 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL .data } relocate.Dataset <- relocate.ArrowTabular <- relocate.RecordBatchReader <- relocate.arrow_dplyr_query + +# Update names in group_by if changed in select() or rename() +update_group_names <- function(.data, renamed){ + gbv <- .data$group_by_vars + renamed_groups <- gbv %in% renamed + gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] + .data$group_by_vars <- gbv + .data +} diff --git a/r/tests/testthat/test-dplyr-select.R b/r/tests/testthat/test-dplyr-select.R index fd20b26e8cb..f71c4000442 100644 --- a/r/tests/testthat/test-dplyr-select.R +++ b/r/tests/testthat/test-dplyr-select.R @@ -186,3 +186,32 @@ test_that("relocate with selection helpers", { df ) }) + +test_that("multiple select/rename and group_by", { + compare_dplyr_binding( + .input %>% + group_by(chr) %>% + rename(string = chr, dub = dbl2) %>% + rename(chr_actually = string) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(chr) %>% + select(string = chr, dub = dbl2) %>% + rename(chr_actually = string) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(chr) %>% + rename(string = chr, dub = dbl2) %>% + select(chr_actually = string) %>% + collect(), + tbl + ) +}) From 86867e003f2cd71ace0ef2ce5673c5f56ebcff7e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 10:57:52 +0100 Subject: [PATCH 07/20] Refactor function back into next level --- r/R/dplyr-select.R | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index a680f8fdc27..d4b647e8c59 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -31,7 +31,6 @@ rename.arrow_dplyr_query <- function(.data, ...) { rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query column_select <- function(.data, select_expression, op = c("select", "rename")) { - op <- match.arg(op) .data <- as_adq(.data) @@ -57,12 +56,15 @@ column_select <- function(.data, select_expression, op = c("select", "rename")) names_compared <- set_names(old_names, new_names) renamed <- names_compared[names(names_compared) != names_compared] + # Update names in group_by if changed in select() or rename() if (length(renamed)) { - .data <- update_group_names(.data, renamed) + gbv <- .data$group_by_vars + renamed_groups <- gbv %in% renamed + gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] + .data$group_by_vars <- gbv } - .data - + .data } rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...) { @@ -124,12 +126,3 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL .data } relocate.Dataset <- relocate.ArrowTabular <- relocate.RecordBatchReader <- relocate.arrow_dplyr_query - -# Update names in group_by if changed in select() or rename() -update_group_names <- function(.data, renamed){ - gbv <- .data$group_by_vars - renamed_groups <- gbv %in% renamed - gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] - .data$group_by_vars <- gbv - .data -} From 3396ad6b3828525690856dca315305e3cc0be229 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 10:59:42 +0100 Subject: [PATCH 08/20] Move helper function to bottom of file --- r/R/dplyr-select.R | 74 +++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index d4b647e8c59..005345e8a25 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -30,43 +30,6 @@ rename.arrow_dplyr_query <- function(.data, ...) { } rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query -column_select <- function(.data, select_expression, op = c("select", "rename")) { - op <- match.arg(op) - - .data <- as_adq(.data) - sim_df <- simulate_data_frame(implicit_schema(.data)) - old_names <- names(sim_df) - - if (op == "select") { - out <- eval_select(expr(c(!!!select_expression)), sim_df) - # select only columns from `out` - subset <- out - } else if (op == "rename") { - out <- eval_rename(expr(c(!!!select_expression)), sim_df) - # select all columns as only renaming - subset <- set_names(seq_along(old_names), old_names) - names(subset)[out] <- names(out) - } - - .data$selected_columns <- set_names(.data$selected_columns[subset], names(subset)) - - # check if names have updated - new_names <- old_names - new_names[out] <- names(out) - names_compared <- set_names(old_names, new_names) - renamed <- names_compared[names(names_compared) != names_compared] - - # Update names in group_by if changed in select() or rename() - if (length(renamed)) { - gbv <- .data$group_by_vars - renamed_groups <- gbv %in% renamed - gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] - .data$group_by_vars <- gbv - } - - .data -} - rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...) { .fn <- as_function(.fn) old_names <- names(dplyr::select(.data, {{ .cols }})) @@ -126,3 +89,40 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL .data } relocate.Dataset <- relocate.ArrowTabular <- relocate.RecordBatchReader <- relocate.arrow_dplyr_query + +column_select <- function(.data, select_expression, op = c("select", "rename")) { + op <- match.arg(op) + + .data <- as_adq(.data) + sim_df <- simulate_data_frame(implicit_schema(.data)) + old_names <- names(sim_df) + + if (op == "select") { + out <- eval_select(expr(c(!!!select_expression)), sim_df) + # select only columns from `out` + subset <- out + } else if (op == "rename") { + out <- eval_rename(expr(c(!!!select_expression)), sim_df) + # select all columns as only renaming + subset <- set_names(seq_along(old_names), old_names) + names(subset)[out] <- names(out) + } + + .data$selected_columns <- set_names(.data$selected_columns[subset], names(subset)) + + # check if names have updated + new_names <- old_names + new_names[out] <- names(out) + names_compared <- set_names(old_names, new_names) + renamed <- names_compared[names(names_compared) != names_compared] + + # Update names in group_by if changed in select() or rename() + if (length(renamed)) { + gbv <- .data$group_by_vars + renamed_groups <- gbv %in% renamed + gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] + .data$group_by_vars <- gbv + } + + .data +} From 0bcf4b39dc94515d7508d4f5563a05c32363630e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 11:04:40 +0100 Subject: [PATCH 09/20] Update tests where `where()` now works and update corresponding docs --- r/R/dplyr-funcs-doc.R | 8 ++++---- r/data-raw/docgen.R | 5 +---- r/man/acero.Rd | 8 ++++---- r/tests/testthat/test-dplyr-group-by.R | 14 +++++--------- r/tests/testthat/test-dplyr-mutate.R | 16 +++++----------- 5 files changed, 19 insertions(+), 32 deletions(-) diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index 5360c7fad66..866b716cc1b 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -21,7 +21,7 @@ #' #' The `arrow` package contains methods for 32 `dplyr` table functions, many of #' which are "verbs" that do transformations to one or more tables. -#' The package also has mappings of 205 R functions to the corresponding +#' The package also has mappings of 207 R functions to the corresponding #' functions in the Arrow compute library. These allow you to write code inside #' of `dplyr` methods that call R functions, including many in packages like #' `stringr` and `lubridate`, and they will get translated to Arrow and run @@ -185,13 +185,13 @@ #' #' ## dplyr #' -#' * [`across()`][dplyr::across()]: not yet supported inside `filter()`; -#' purrr-style lambda functions -#' and use of `where()` selection helper not yet supported +#' * [`across()`][dplyr::across()] #' * [`between()`][dplyr::between()] #' * [`case_when()`][dplyr::case_when()] #' * [`coalesce()`][dplyr::coalesce()] #' * [`desc()`][dplyr::desc()] +#' * [`if_all()`][dplyr::if_all()] +#' * [`if_any()`][dplyr::if_any()] #' * [`if_else()`][dplyr::if_else()] #' * [`n()`][dplyr::n()] #' * [`n_distinct()`][dplyr::n_distinct()] diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R index e2c7f94eafc..0fb312e45a4 100644 --- a/r/data-raw/docgen.R +++ b/r/data-raw/docgen.R @@ -127,10 +127,7 @@ docs <- arrow:::.cache$docs # Add some functions # across() is handled by manipulating the quosures, not by nse_funcs -docs[["dplyr::across"]] <- c( - # TODO(ARROW-17384): implement where - "Use of `where()` selection helper not yet supported" -) +docs[["dplyr::across"]] <- character(0) # if_any() and if_all() are used instead of across() in filter() # they are both handled by manipulating the quosures, not by nse_funcs diff --git a/r/man/acero.Rd b/r/man/acero.Rd index 76f1b13fe3a..586dea48851 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -6,7 +6,7 @@ \description{ The \code{arrow} package contains methods for 32 \code{dplyr} table functions, many of which are "verbs" that do transformations to one or more tables. -The package also has mappings of 205 R functions to the corresponding +The package also has mappings of 207 R functions to the corresponding functions in the Arrow compute library. These allow you to write code inside of \code{dplyr} methods that call R functions, including many in packages like \code{stringr} and \code{lubridate}, and they will get translated to Arrow and run @@ -175,13 +175,13 @@ as \code{arrow_ascii_is_decimal}. \subsection{dplyr}{ \itemize{ -\item \code{\link[dplyr:across]{across()}}: not yet supported inside \code{filter()}; -purrr-style lambda functions -and use of \code{where()} selection helper not yet supported +\item \code{\link[dplyr:across]{across()}} \item \code{\link[dplyr:between]{between()}} \item \code{\link[dplyr:case_when]{case_when()}} \item \code{\link[dplyr:coalesce]{coalesce()}} \item \code{\link[dplyr:desc]{desc()}} +\item \code{\link[dplyr:across]{if_all()}} +\item \code{\link[dplyr:across]{if_any()}} \item \code{\link[dplyr:if_else]{if_else()}} \item \code{\link[dplyr:context]{n()}} \item \code{\link[dplyr:n_distinct]{n_distinct()}} diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index 9bb6aa9600d..0c93f530faa 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -265,14 +265,10 @@ test_that("Can use across() within group_by()", { tbl ) - # ARROW-12778 - `where()` is not yet supported - expect_error( - compare_dplyr_binding( - .input %>% - group_by(across(where(is.numeric))) %>% - collect(), - tbl - ), - "Unsupported selection helper" + compare_dplyr_binding( + .input %>% + group_by(across(where(is.numeric))) %>% + collect(), + tbl ) }) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 86c243e5490..ee13c8be2e3 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -616,15 +616,11 @@ test_that("Can use across() within mutate()", { ) ) - # ARROW-12778 - `where()` is not yet supported - expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(where(is.double))) %>% - collect(), - example_data - ), - "Unsupported selection helper" + compare_dplyr_binding( + .input %>% + mutate(across(where(is.double))) %>% + collect(), + example_data ) # gives the right error with window functions @@ -642,7 +638,6 @@ test_that("Can use across() within mutate()", { }) test_that("Can use across() within transmute()", { - compare_dplyr_binding( .input %>% transmute( @@ -654,5 +649,4 @@ test_that("Can use across() within transmute()", { collect(), example_data ) - }) From 3d381b7161d3e443f9a7801860461d366a28ab2b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 Oct 2022 11:55:09 +0100 Subject: [PATCH 10/20] If can't find type, just use NULL --- r/R/util.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/r/R/util.R b/r/R/util.R index e2f8f51ad7c..9e3d40606e8 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -253,7 +253,15 @@ augment_io_error_msg <- function(e, call, schema = NULL, format = NULL) { } simulate_data_frame <- function(schema) { - arrays <- lapply(schema$fields, function(field) concat_arrays(type = field$type)) + + arrays <- lapply( + schema$fields, + function(field) tryCatch( + concat_arrays(type = field$type), + error = function(...) concat_arrays(type = NULL) + ) + ) + vectors <- lapply( arrays, function(array) tryCatch( From 3b092c662658a5f42b3ce57bf3c3ff1d80b87e7f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 13 Oct 2022 13:07:57 +0100 Subject: [PATCH 11/20] Add schema to 0-row Table C++ function --- r/R/arrowExports.R | 5 +++++ r/src/arrowExports.cpp | 9 +++++++++ r/src/table.cpp | 22 ++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index b73bef71023..dc49278236c 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -2024,6 +2024,10 @@ Table__from_record_batches <- function(batches, schema_sxp) { .Call(`_arrow_Table__from_record_batches`, batches, schema_sxp) } +Table__from_schema <- function(schema_sxp) { + .Call(`_arrow_Table__from_schema`, schema_sxp) +} + Table__ReferencedBufferSize <- function(table) { .Call(`_arrow_Table__ReferencedBufferSize`, table) } @@ -2051,3 +2055,4 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } + diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index aa4fd01af49..336b12cb482 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -5101,6 +5101,14 @@ BEGIN_CPP11 END_CPP11 } // table.cpp +std::shared_ptr Table__from_schema(SEXP schema_sxp); +extern "C" SEXP _arrow_Table__from_schema(SEXP schema_sxp_sexp){ +BEGIN_CPP11 + arrow::r::Input::type schema_sxp(schema_sxp_sexp); + return cpp11::as_sexp(Table__from_schema(schema_sxp)); +END_CPP11 +} +// table.cpp r_vec_size Table__ReferencedBufferSize(const std::shared_ptr& table); extern "C" SEXP _arrow_Table__ReferencedBufferSize(SEXP table_sexp){ BEGIN_CPP11 @@ -5724,6 +5732,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2}, { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1}, { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2}, + { "_arrow_Table__from_schema", (DL_FUNC) &_arrow_Table__from_schema, 1}, { "_arrow_Table__ReferencedBufferSize", (DL_FUNC) &_arrow_Table__ReferencedBufferSize, 1}, { "_arrow_Table__ConcatenateTables", (DL_FUNC) &_arrow_Table__ConcatenateTables, 2}, { "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0}, diff --git a/r/src/table.cpp b/r/src/table.cpp index f31aac33eff..db6b76ab3dc 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -18,6 +18,7 @@ #include "./arrow_types.h" #include +#include #include #include #include @@ -302,6 +303,27 @@ std::shared_ptr Table__from_record_batches( return tab; } +// [[arrow::export]] +std::shared_ptr Table__from_schema(SEXP schema_sxp) { + auto schema = cpp11::as_cpp>(schema_sxp); + + int num_fields = schema->num_fields(); + + std::vector> columns; + + for (int i = 0; i < num_fields; i++) { + std::shared_ptr type = schema->field(i)->type(); + + std::shared_ptr array; + std::unique_ptr type_builder; + StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), type, &type_builder)); + StopIfNotOk(type_builder->Finish(&array)); + columns.push_back(array); + } + + return (arrow::Table::Make(schema, std::move(columns))); +} + // [[arrow::export]] r_vec_size Table__ReferencedBufferSize(const std::shared_ptr& table) { return r_vec_size(ValueOrStop(arrow::util::ReferencedBufferSize(*table))); From 6d01a13a298a2457647744581ad4602db2d8b5eb Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 13 Oct 2022 13:12:38 +0100 Subject: [PATCH 12/20] Update simulate_data_frame to use C++ function instead --- r/R/util.R | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index 9e3d40606e8..9517f684d07 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -253,23 +253,5 @@ augment_io_error_msg <- function(e, call, schema = NULL, format = NULL) { } simulate_data_frame <- function(schema) { - - arrays <- lapply( - schema$fields, - function(field) tryCatch( - concat_arrays(type = field$type), - error = function(...) concat_arrays(type = NULL) - ) - ) - - vectors <- lapply( - arrays, - function(array) tryCatch( - as.vector(array), - error = function(...) vctrs::unspecified() - ) - ) - - names(vectors) <- names(schema) - tibble::new_tibble(vectors, nrow = 0) + as.data.frame(Table__from_schema(schema)) } From 991feb21aa6de965846a7c4a0b4c5d6b80587bd0 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 13 Oct 2022 15:04:21 +0100 Subject: [PATCH 13/20] Clearer var names Co-authored-by: Neal Richardson --- r/R/dplyr-select.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index 005345e8a25..c30dacf3d2e 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -114,7 +114,7 @@ column_select <- function(.data, select_expression, op = c("select", "rename")) new_names <- old_names new_names[out] <- names(out) names_compared <- set_names(old_names, new_names) - renamed <- names_compared[names(names_compared) != names_compared] + renamed <- names_compared[old_names != new_names] # Update names in group_by if changed in select() or rename() if (length(renamed)) { From c0b890aba066b1904b1abd1818af53b3c6d08d68 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 13 Oct 2022 18:51:42 +0100 Subject: [PATCH 14/20] Handle extension types --- r/src/table.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/r/src/table.cpp b/r/src/table.cpp index db6b76ab3dc..062f85b719c 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -312,7 +312,17 @@ std::shared_ptr Table__from_schema(SEXP schema_sxp) { std::vector> columns; for (int i = 0; i < num_fields; i++) { - std::shared_ptr type = schema->field(i)->type(); + bool is_extension_type = schema->field(i)->type()->name() == "extension"; + std::shared_ptr type; + + // need to handle extension types a bit differently + if (is_extension_type) { + // TODO: ARROW-18043 - update this to properly construct extension types instead of + // converting to null + type = arrow::null(); + } else { + type = schema->field(i)->type(); + } std::shared_ptr array; std::unique_ptr type_builder; From 8196763de63ea7c590a99b0ff0d22cba1e15e708 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 13 Oct 2022 19:08:52 +0100 Subject: [PATCH 15/20] Use as S3 method instead of own function --- r/R/dplyr-select.R | 2 +- r/R/schema.R | 5 +++++ r/R/table.R | 11 +++++++++++ r/R/util.R | 5 ----- r/tests/testthat/test-Table.R | 8 ++++++++ 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index 005345e8a25..dd4c4b6e1ca 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -94,7 +94,7 @@ column_select <- function(.data, select_expression, op = c("select", "rename")) op <- match.arg(op) .data <- as_adq(.data) - sim_df <- simulate_data_frame(implicit_schema(.data)) + sim_df <- as.data.frame(implicit_schema(.data)) old_names <- names(sim_df) if (op == "select") { diff --git a/r/R/schema.R b/r/R/schema.R index 86a968b5003..c7e26652c90 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -383,3 +383,8 @@ as_schema.Schema <- function(x, ...) { as_schema.StructType <- function(x, ...) { schema(!!!x$fields()) } + +#' @export +as.data.frame.Schema <- function(x, row.names = NULL, optional = FALSE, ...) { + as.data.frame(Table__from_schema(x)) +} diff --git a/r/R/table.R b/r/R/table.R index c5291257792..2007a3887bc 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -134,6 +134,11 @@ Table$create <- function(..., schema = NULL) { if (is.null(names(dots))) { names(dots) <- rep_len("", length(dots)) } + + if (length(dots) == 0 && inherits(schema, "Schema")) { + return(Table__from_schema(schema)) + } + stopifnot(length(dots) > 0) if (all_record_batches(dots)) { @@ -330,3 +335,9 @@ as_arrow_table.RecordBatchReader <- function(x, ...) { as_arrow_table.arrow_dplyr_query <- function(x, ...) { as_arrow_table(as_record_batch_reader(x)) } + +#' @rdname as_arrow_table +#' @export +as_arrow_table.Schema <- function(x, ...) { + Table__from_schema(x) +} diff --git a/r/R/util.R b/r/R/util.R index 9517f684d07..0f4a4129bf8 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -238,7 +238,6 @@ is_compressed <- function(compression) { # handler function which checks for a number of different read errors augment_io_error_msg <- function(e, call, schema = NULL, format = NULL) { - msg <- conditionMessage(e) if (!is.null(schema)) { @@ -251,7 +250,3 @@ augment_io_error_msg <- function(e, call, schema = NULL, format = NULL) { handle_augmented_field_misuse(msg, call) abort(msg, call = call) } - -simulate_data_frame <- function(schema) { - as.data.frame(Table__from_schema(schema)) -} diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index d2818943823..409df85f06d 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -693,3 +693,11 @@ test_that("num_rows method not susceptible to integer overflow", { expect_identical(big_string_array$data()$buffers[[3]]$size, 2148007936) }) + +test_that("can create empty table from schema", { + schema <- schema(col1 = float64(), col2 = string()) + out <- Table$create(schema = schema) + expect_r6_class(out, "Table") + expect_equal(nrow(out), 0) + expect_equal(out$schema, schema) +}) From a317245a1689c25c68a7d07ad5b420468d2c7ee7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 14 Oct 2022 12:25:06 +0100 Subject: [PATCH 16/20] Update feather reader to use eval_select not vars_select --- r/R/feather.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/r/R/feather.R b/r/R/feather.R index 4e2e9947cb9..7791b9e8aa8 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -178,8 +178,11 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T reader <- FeatherReader$create(file) col_select <- enquo(col_select) + columns <- if (!quo_is_null(col_select)) { - vars_select(names(reader), !!col_select) + sim_df <- as.data.frame(reader$schema) + indices <- eval_select(col_select, sim_df) + names(reader)[indices] } out <- tryCatch( From c5c7642bbceb0b7818b86a36f0d3203253e97942 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 14 Oct 2022 12:33:12 +0100 Subject: [PATCH 17/20] Update parquet reader to use eval_select --- r/R/parquet.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/parquet.R b/r/R/parquet.R index 0b3f93b20e1..ac3ca616741 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -55,9 +55,8 @@ read_parquet <- function(file, col_select <- enquo(col_select) if (!quo_is_null(col_select)) { # infer which columns to keep from schema - schema <- reader$GetSchema() - names <- names(schema) - indices <- match(vars_select(names, !!col_select), names) - 1L + sim_df <- as.data.frame(reader$GetSchema()) + indices <- eval_select(col_select, sim_df) - 1L tab <- tryCatch( reader$ReadTable(indices), error = read_compressed_error From 3711de83b9d44fe8d3b8ce43a8ddd1a0d0a9d60f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 14 Oct 2022 12:41:50 +0100 Subject: [PATCH 18/20] Update JSON reader to use eval_select --- r/R/json.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/json.R b/r/R/json.R index 2b1f4916cb4..a6ad22f642f 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -56,7 +56,8 @@ read_json_arrow <- function(file, col_select <- enquo(col_select) if (!quo_is_null(col_select)) { - tab <- tab[vars_select(names(tab), !!col_select)] + sim_df <- as.data.frame(tab$schema) + tab <- tab[eval_select(col_select, sim_df)] } if (isTRUE(as_data_frame)) { From 8b68607249eac2cb07e5270726257fa63d2b9ec4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 14 Oct 2022 12:43:44 +0100 Subject: [PATCH 19/20] Update CSV reader to use eval_select --- r/R/csv.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/csv.R b/r/R/csv.R index 45232984162..61ee47b7ed4 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -215,7 +215,8 @@ read_delim_arrow <- function(file, # TODO: move this into convert_options using include_columns col_select <- enquo(col_select) if (!quo_is_null(col_select)) { - tab <- tab[vars_select(names(tab), !!col_select)] + sim_df <- as.data.frame(tab$schema) + tab <- tab[eval_select(col_select, sim_df)] } if (isTRUE(as_data_frame)) { From 6a9122a6ee3f4fa1b6fda039b89c5412a8e30972 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 14 Oct 2022 12:45:15 +0100 Subject: [PATCH 20/20] Remove import of vars_select, and run devtools::document() --- r/NAMESPACE | 3 ++- r/R/arrow-package.R | 2 +- r/R/csv.R | 2 +- r/man/as_arrow_table.Rd | 3 +++ r/man/read_delim_arrow.Rd | 2 +- r/man/read_feather.Rd | 2 +- r/man/read_json_arrow.Rd | 2 +- r/man/read_parquet.Rd | 2 +- 8 files changed, 11 insertions(+), 7 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 6d35a6607b2..a670b773b90 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -30,6 +30,7 @@ S3method(as.character,FileFormat) S3method(as.character,FragmentScanOptions) S3method(as.data.frame,ArrowTabular) S3method(as.data.frame,RecordBatchReader) +S3method(as.data.frame,Schema) S3method(as.data.frame,StructArray) S3method(as.data.frame,arrow_dplyr_query) S3method(as.double,ArrowDatum) @@ -48,6 +49,7 @@ S3method(as_arrow_array,pyarrow.lib.Array) S3method(as_arrow_array,vctrs_list_of) S3method(as_arrow_table,RecordBatch) S3method(as_arrow_table,RecordBatchReader) +S3method(as_arrow_table,Schema) S3method(as_arrow_table,Table) S3method(as_arrow_table,arrow_dplyr_query) S3method(as_arrow_table,data.frame) @@ -484,7 +486,6 @@ importFrom(tidyselect,num_range) importFrom(tidyselect,one_of) importFrom(tidyselect,starts_with) importFrom(tidyselect,vars_pull) -importFrom(tidyselect,vars_select) importFrom(utils,capture.output) importFrom(utils,getFromNamespace) importFrom(utils,head) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index ec8ce2a4361..253922232cd 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -27,7 +27,7 @@ #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args #' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure #' @importFrom rlang new_quosures expr_text -#' @importFrom tidyselect vars_pull vars_select eval_select eval_rename +#' @importFrom tidyselect vars_pull eval_select eval_rename #' @importFrom glue glue #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/csv.R b/r/R/csv.R index 61ee47b7ed4..76bd688cc38 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -102,7 +102,7 @@ #' `NULL` (the default) to infer types from the data. #' @param col_select A character vector of column names to keep, as in the #' "select" argument to `data.table::fread()`, or a -#' [tidy selection specification][tidyselect::vars_select()] +#' [tidy selection specification][tidyselect::eval_select()] #' of columns, as used in `dplyr::select()`. #' @param na A character vector of strings to interpret as missing values. #' @param quoted_na Should missing values inside quotes be treated as missing diff --git a/r/man/as_arrow_table.Rd b/r/man/as_arrow_table.Rd index aac4495e7c6..22d4ea1c191 100644 --- a/r/man/as_arrow_table.Rd +++ b/r/man/as_arrow_table.Rd @@ -8,6 +8,7 @@ \alias{as_arrow_table.data.frame} \alias{as_arrow_table.RecordBatchReader} \alias{as_arrow_table.arrow_dplyr_query} +\alias{as_arrow_table.Schema} \title{Convert an object to an Arrow Table} \usage{ as_arrow_table(x, ..., schema = NULL) @@ -23,6 +24,8 @@ as_arrow_table(x, ..., schema = NULL) \method{as_arrow_table}{RecordBatchReader}(x, ...) \method{as_arrow_table}{arrow_dplyr_query}(x, ...) + +\method{as_arrow_table}{Schema}(x, ...) } \arguments{ \item{x}{An object to convert to an Arrow Table} diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index 997a7f4101a..369bbd1c999 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -101,7 +101,7 @@ Alternatively, you can specify a character vector of column names.} \item{col_select}{A character vector of column names to keep, as in the "select" argument to \code{data.table::fread()}, or a -\link[tidyselect:vars_select]{tidy selection specification} +\link[tidyselect:eval_select]{tidy selection specification} of columns, as used in \code{dplyr::select()}.} \item{na}{A character vector of strings to interpret as missing values.} diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index 218a163b990..000aa541aac 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -18,7 +18,7 @@ open.} \item{col_select}{A character vector of column names to keep, as in the "select" argument to \code{data.table::fread()}, or a -\link[tidyselect:vars_select]{tidy selection specification} +\link[tidyselect:eval_select]{tidy selection specification} of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index 2ad600725fa..9399e885059 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -22,7 +22,7 @@ open.} \item{col_select}{A character vector of column names to keep, as in the "select" argument to \code{data.table::fread()}, or a -\link[tidyselect:vars_select]{tidy selection specification} +\link[tidyselect:eval_select]{tidy selection specification} of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index d509f8068e7..68e56903d14 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -21,7 +21,7 @@ open.} \item{col_select}{A character vector of column names to keep, as in the "select" argument to \code{data.table::fread()}, or a -\link[tidyselect:vars_select]{tidy selection specification} +\link[tidyselect:eval_select]{tidy selection specification} of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or