apache · thisisnic · Oct 14, 2022 · Oct 11, 2022 · Oct 11, 2022 · Oct 11, 2022
diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -30,6 +30,7 @@ S3method(as.character,FileFormat)
 S3method(as.character,FragmentScanOptions)
 S3method(as.data.frame,ArrowTabular)
 S3method(as.data.frame,RecordBatchReader)
+S3method(as.data.frame,Schema)
 S3method(as.data.frame,StructArray)
 S3method(as.data.frame,arrow_dplyr_query)
 S3method(as.double,ArrowDatum)
@@ -48,6 +49,7 @@ S3method(as_arrow_array,pyarrow.lib.Array)
 S3method(as_arrow_array,vctrs_list_of)
 S3method(as_arrow_table,RecordBatch)
 S3method(as_arrow_table,RecordBatchReader)
+S3method(as_arrow_table,Schema)
 S3method(as_arrow_table,Table)
 S3method(as_arrow_table,arrow_dplyr_query)
 S3method(as_arrow_table,data.frame)
@@ -478,6 +480,7 @@ importFrom(stats,runif)
 importFrom(tidyselect,all_of)
 importFrom(tidyselect,contains)
 importFrom(tidyselect,ends_with)
+importFrom(tidyselect,eval_rename)
 importFrom(tidyselect,eval_select)
 importFrom(tidyselect,everything)
 importFrom(tidyselect,last_col)
@@ -486,8 +489,6 @@ importFrom(tidyselect,num_range)
 importFrom(tidyselect,one_of)
 importFrom(tidyselect,starts_with)
 importFrom(tidyselect,vars_pull)
-importFrom(tidyselect,vars_rename)
-importFrom(tidyselect,vars_select)
 importFrom(utils,capture.output)
 importFrom(utils,getFromNamespace)
 importFrom(utils,head)

diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
@@ -27,7 +27,7 @@
 #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args
 #' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure
 #' @importFrom rlang new_quosures expr_text caller_env check_dots_empty dots_list
-#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
+#' @importFrom tidyselect vars_pull eval_select eval_rename
 #' @importFrom glue glue
 #' @useDynLib arrow, .registration = TRUE
 #' @keywords internal

diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
diff --git a/r/R/csv.R b/r/R/csv.R
@@ -102,7 +102,7 @@
 #' an Arrow [Schema], or `NULL` (the default) to infer types from the data.
 #' @param col_select A character vector of column names to keep, as in the
 #' "select" argument to `data.table::fread()`, or a
-#' [tidy selection specification][tidyselect::vars_select()]
+#' [tidy selection specification][tidyselect::eval_select()]
 #' of columns, as used in `dplyr::select()`.
 #' @param na A character vector of strings to interpret as missing values.
 #' @param quoted_na Should missing values inside quotes be treated as missing
@@ -226,7 +226,8 @@ read_delim_arrow <- function(file,
   # TODO: move this into convert_options using include_columns
   col_select <- enquo(col_select)
   if (!quo_is_null(col_select)) {
-    tab <- tab[vars_select(names(tab), !!col_select)]
+    sim_df <- as.data.frame(tab$schema)
+    tab <- tab[eval_select(col_select, sim_df)]
   }
 
   if (isTRUE(as_data_frame)) {

diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R
@@ -190,7 +190,7 @@
 #'
 #' ## dplyr
 #'
-#' * [`across()`][dplyr::across()]: Use of `where()` selection helper not yet supported
+#' * [`across()`][dplyr::across()]
 #' * [`between()`][dplyr::between()]
 #' * [`case_when()`][dplyr::case_when()]
 #' * [`coalesce()`][dplyr::coalesce()]

diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R
@@ -21,14 +21,12 @@
 tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns)
 
 select.arrow_dplyr_query <- function(.data, ...) {
-  check_select_helpers(enexprs(...))
-  column_select(as_adq(.data), !!!enquos(...))
+  column_select(.data, enquos(...), op = "select")
 }
 select.Dataset <- select.ArrowTabular <- select.RecordBatchReader <- select.arrow_dplyr_query
 
 rename.arrow_dplyr_query <- function(.data, ...) {
-  check_select_helpers(enexprs(...))
-  column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename)
+  column_select(.data, enquos(...), op = "rename")
 }
 rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query
 
@@ -39,29 +37,6 @@ rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...)
 }
 rename_with.Dataset <- rename_with.ArrowTabular <- rename_with.RecordBatchReader <- rename_with.arrow_dplyr_query
 
-column_select <- function(.data, ..., .FUN = vars_select) {
-  # .FUN is either tidyselect::vars_select or tidyselect::vars_rename
-  # It operates on the names() of selected_columns, i.e. the column names
-  # factoring in any renaming that may already have happened
-  out <- .FUN(names(.data), !!!enquos(...))
-  # Make sure that the resulting selected columns map back to the original data,
-  # as in when there are multiple renaming steps
-  .data$selected_columns <- set_names(.data$selected_columns[out], names(out))
-
-  # If we've renamed columns, we need to project that renaming into other
-  # query parameters we've collected
-  renamed <- out[names(out) != out]
-  if (length(renamed)) {
-    # Massage group_by
-    gbv <- .data$group_by_vars
-    renamed_groups <- gbv %in% renamed
-    gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
-    .data$group_by_vars <- gbv
-    # No need to massage filters because those contain references to Arrow objects
-  }
-  .data
-}
-
 relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) {
   # The code in this function is adapted from the code in dplyr::relocate.data.frame
   # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R
@@ -115,18 +90,39 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL
 }
 relocate.Dataset <- relocate.ArrowTabular <- relocate.RecordBatchReader <- relocate.arrow_dplyr_query
 
-check_select_helpers <- function(exprs) {
-  # Throw an error if unsupported tidyselect selection helpers in `exprs`
-  exprs <- lapply(exprs, function(x) if (is_quosure(x)) quo_get_expr(x) else x)
-  unsup_select_helpers <- "where"
-  funs_in_exprs <- unlist(lapply(exprs, all_funs))
-  unsup_funs <- funs_in_exprs[funs_in_exprs %in% unsup_select_helpers]
-  if (length(unsup_funs)) {
-    stop(
-      "Unsupported selection ",
-      ngettext(length(unsup_funs), "helper: ", "helpers: "),
-      oxford_paste(paste0(unsup_funs, "()"), quote = FALSE),
-      call. = FALSE
-    )
+column_select <- function(.data, select_expression, op = c("select", "rename")) {
+  op <- match.arg(op)
+
+  .data <- as_adq(.data)
+  sim_df <- as.data.frame(implicit_schema(.data))
+  old_names <- names(sim_df)
+
+  if (op == "select") {
+    out <- eval_select(expr(c(!!!select_expression)), sim_df)
+    # select only columns from `out`
+    subset <- out
+  } else if (op == "rename") {
+    out <- eval_rename(expr(c(!!!select_expression)), sim_df)
+    # select all columns as only renaming
+    subset <- set_names(seq_along(old_names), old_names)
+    names(subset)[out] <- names(out)
+  }
+
+  .data$selected_columns <- set_names(.data$selected_columns[subset], names(subset))
+
+  # check if names have updated
+  new_names <- old_names
+  new_names[out] <- names(out)
+  names_compared <- set_names(old_names, new_names)
+  renamed <- names_compared[old_names != new_names]
+
+  # Update names in group_by if changed in select() or rename()
+  if (length(renamed)) {
+    gbv <- .data$group_by_vars
+    renamed_groups <- gbv %in% renamed
+    gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
+    .data$group_by_vars <- gbv
   }
+
+  .data
 }
diff --git a/r/R/feather.R b/r/R/feather.R
@@ -178,8 +178,11 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T
   reader <- FeatherReader$create(file)
 
   col_select <- enquo(col_select)
+
   columns <- if (!quo_is_null(col_select)) {
-    vars_select(names(reader), !!col_select)
+    sim_df <- as.data.frame(reader$schema)
+    indices <- eval_select(col_select, sim_df)
+    names(reader)[indices]
   }
 
   out <- tryCatch(

diff --git a/r/R/json.R b/r/R/json.R
@@ -68,7 +68,8 @@ read_json_arrow <- function(file,
 
   col_select <- enquo(col_select)
   if (!quo_is_null(col_select)) {
-    tab <- tab[vars_select(names(tab), !!col_select)]
+    sim_df <- as.data.frame(tab$schema)
+    tab <- tab[eval_select(col_select, sim_df)]
   }
 
   if (isTRUE(as_data_frame)) {

diff --git a/r/R/parquet.R b/r/R/parquet.R
@@ -55,9 +55,8 @@ read_parquet <- function(file,
   col_select <- enquo(col_select)
   if (!quo_is_null(col_select)) {
     # infer which columns to keep from schema
-    schema <- reader$GetSchema()
-    names <- names(schema)
-    indices <- match(vars_select(names, !!col_select), names) - 1L
+    sim_df <- as.data.frame(reader$GetSchema())
+    indices <- eval_select(col_select, sim_df) - 1L
     tab <- tryCatch(
       reader$ReadTable(indices),
       error = read_compressed_error

diff --git a/r/R/schema.R b/r/R/schema.R
@@ -383,3 +383,8 @@ as_schema.Schema <- function(x, ...) {
 as_schema.StructType <- function(x, ...) {
   schema(!!!x$fields())
 }
+
+#' @export
+as.data.frame.Schema <- function(x, row.names = NULL, optional = FALSE, ...) {
+  as.data.frame(Table__from_schema(x))
+}
diff --git a/r/R/table.R b/r/R/table.R
@@ -134,6 +134,11 @@ Table$create <- function(..., schema = NULL) {
   if (is.null(names(dots))) {
     names(dots) <- rep_len("", length(dots))
   }
+
+  if (length(dots) == 0 && inherits(schema, "Schema")) {
+    return(Table__from_schema(schema))
+  }
+
   stopifnot(length(dots) > 0)
 
   if (all_record_batches(dots)) {
@@ -330,3 +335,9 @@ as_arrow_table.RecordBatchReader <- function(x, ...) {
 as_arrow_table.arrow_dplyr_query <- function(x, ...) {
   as_arrow_table(as_record_batch_reader(x))
 }
+
+#' @rdname as_arrow_table
+#' @export
+as_arrow_table.Schema <- function(x, ...) {
+  Table__from_schema(x)
+}
diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R
@@ -127,10 +127,7 @@ docs <- arrow:::.cache$docs
 # Add some functions
 
 # across() is handled by manipulating the quosures, not by nse_funcs
-docs[["dplyr::across"]] <- c(
-  # TODO(ARROW-17384): implement where
-  "Use of `where()` selection helper not yet supported"
-)
+docs[["dplyr::across"]] <- character(0)
 
 # if_any() and if_all() are used instead of across() in filter()
 # they are both handled by manipulating the quosures, not by nse_funcs

diff --git a/r/man/acero.Rd b/r/man/acero.Rd
diff --git a/r/man/as_arrow_table.Rd b/r/man/as_arrow_table.Rd
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd
diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
diff --git a/r/src/table.cpp b/r/src/table.cpp
@@ -18,6 +18,7 @@
 #include "./arrow_types.h"
 
 #include <arrow/array/array_base.h>
+#include <arrow/builder.h>
 #include <arrow/table.h>
 #include <arrow/util/byte_size.h>
 #include <arrow/util/key_value_metadata.h>
@@ -302,6 +303,37 @@ std::shared_ptr<arrow::Table> Table__from_record_batches(
   return tab;
 }
 
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__from_schema(SEXP schema_sxp) {
+  auto schema = cpp11::as_cpp<std::shared_ptr<arrow::Schema>>(schema_sxp);
+
+  int num_fields = schema->num_fields();
+
+  std::vector<std::shared_ptr<arrow::Array>> columns;
+
+  for (int i = 0; i < num_fields; i++) {
+    bool is_extension_type = schema->field(i)->type()->name() == "extension";
+    std::shared_ptr<arrow::DataType> type;
+
+    // need to handle extension types a bit differently
+    if (is_extension_type) {
+      // TODO: ARROW-18043 - update this to properly construct extension types instead of
+      // converting to null
+      type = arrow::null();
+    } else {
+      type = schema->field(i)->type();
+    }
+
+    std::shared_ptr<arrow::Array> array;
+    std::unique_ptr<arrow::ArrayBuilder> type_builder;
+    StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), type, &type_builder));
+    StopIfNotOk(type_builder->Finish(&array));
+    columns.push_back(array);
+  }
+
+  return (arrow::Table::Make(schema, std::move(columns)));
+}
+
 // [[arrow::export]]
 r_vec_size Table__ReferencedBufferSize(const std::shared_ptr<arrow::Table>& table) {
   return r_vec_size(ValueOrStop(arrow::util::ReferencedBufferSize(*table)));