Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
ce5b967
Add simulate_data_frame helper function
thisisnic Oct 11, 2022
4b35726
Use eval_select instead of vars_select
thisisnic Oct 11, 2022
2fcab49
Enable test for where()
thisisnic Oct 11, 2022
aa2ed7e
Don't check for usage of where()
thisisnic Oct 11, 2022
b3663da
Import eval_rename instead of vars_rename
thisisnic Oct 11, 2022
7827b50
Reimplement column_select
thisisnic Oct 11, 2022
86867e0
Refactor function back into next level
thisisnic Oct 11, 2022
3396ad6
Move helper function to bottom of file
thisisnic Oct 11, 2022
0bcf4b3
Update tests where `where()` now works and update corresponding docs
thisisnic Oct 11, 2022
3d381b7
If can't find type, just use NULL
thisisnic Oct 11, 2022
3b092c6
Add schema to 0-row Table C++ function
thisisnic Oct 13, 2022
6d01a13
Update simulate_data_frame to use C++ function instead
thisisnic Oct 13, 2022
991feb2
Clearer var names
thisisnic Oct 13, 2022
c0b890a
Handle extension types
thisisnic Oct 13, 2022
8196763
Use as S3 method instead of own function
thisisnic Oct 13, 2022
0e24c99
Merge branch 'ARROW-12105_eval_select' of github.com:thisisnic/arrow …
thisisnic Oct 13, 2022
a317245
Update feather reader to use eval_select not vars_select
thisisnic Oct 14, 2022
c5c7642
Update parquet reader to use eval_select
thisisnic Oct 14, 2022
3711de8
Update JSON reader to use eval_select
thisisnic Oct 14, 2022
8b68607
Update CSV reader to use eval_select
thisisnic Oct 14, 2022
6a9122a
Remove import of vars_select, and run devtools::document()
thisisnic Oct 14, 2022
5d37f1f
Merge branch 'master' into ARROW-12105_eval_select
thisisnic Oct 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ S3method(as.character,FileFormat)
S3method(as.character,FragmentScanOptions)
S3method(as.data.frame,ArrowTabular)
S3method(as.data.frame,RecordBatchReader)
S3method(as.data.frame,Schema)
S3method(as.data.frame,StructArray)
S3method(as.data.frame,arrow_dplyr_query)
S3method(as.double,ArrowDatum)
Expand All @@ -48,6 +49,7 @@ S3method(as_arrow_array,pyarrow.lib.Array)
S3method(as_arrow_array,vctrs_list_of)
S3method(as_arrow_table,RecordBatch)
S3method(as_arrow_table,RecordBatchReader)
S3method(as_arrow_table,Schema)
S3method(as_arrow_table,Table)
S3method(as_arrow_table,arrow_dplyr_query)
S3method(as_arrow_table,data.frame)
Expand Down Expand Up @@ -478,6 +480,7 @@ importFrom(stats,runif)
importFrom(tidyselect,all_of)
importFrom(tidyselect,contains)
importFrom(tidyselect,ends_with)
importFrom(tidyselect,eval_rename)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,everything)
importFrom(tidyselect,last_col)
Expand All @@ -486,8 +489,6 @@ importFrom(tidyselect,num_range)
importFrom(tidyselect,one_of)
importFrom(tidyselect,starts_with)
importFrom(tidyselect,vars_pull)
importFrom(tidyselect,vars_rename)
importFrom(tidyselect,vars_select)
importFrom(utils,capture.output)
importFrom(utils,getFromNamespace)
importFrom(utils,head)
Expand Down
2 changes: 1 addition & 1 deletion r/R/arrow-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args
#' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure
#' @importFrom rlang new_quosures expr_text caller_env check_dots_empty dots_list
#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
#' @importFrom tidyselect vars_pull eval_select eval_rename
#' @importFrom glue glue
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
Expand Down
5 changes: 5 additions & 0 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions r/R/csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@
#' an Arrow [Schema], or `NULL` (the default) to infer types from the data.
#' @param col_select A character vector of column names to keep, as in the
#' "select" argument to `data.table::fread()`, or a
#' [tidy selection specification][tidyselect::vars_select()]
#' [tidy selection specification][tidyselect::eval_select()]
#' of columns, as used in `dplyr::select()`.
#' @param na A character vector of strings to interpret as missing values.
#' @param quoted_na Should missing values inside quotes be treated as missing
Expand Down Expand Up @@ -226,7 +226,8 @@ read_delim_arrow <- function(file,
# TODO: move this into convert_options using include_columns
col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
tab <- tab[vars_select(names(tab), !!col_select)]
sim_df <- as.data.frame(tab$schema)
tab <- tab[eval_select(col_select, sim_df)]
}

if (isTRUE(as_data_frame)) {
Expand Down
2 changes: 1 addition & 1 deletion r/R/dplyr-funcs-doc.R
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@
#'
#' ## dplyr
#'
#' * [`across()`][dplyr::across()]: Use of `where()` selection helper not yet supported
#' * [`across()`][dplyr::across()]
#' * [`between()`][dplyr::between()]
#' * [`case_when()`][dplyr::case_when()]
#' * [`coalesce()`][dplyr::coalesce()]
Expand Down
76 changes: 36 additions & 40 deletions r/R/dplyr-select.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,12 @@
tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns)

select.arrow_dplyr_query <- function(.data, ...) {
check_select_helpers(enexprs(...))
column_select(as_adq(.data), !!!enquos(...))
column_select(.data, enquos(...), op = "select")
}
select.Dataset <- select.ArrowTabular <- select.RecordBatchReader <- select.arrow_dplyr_query

rename.arrow_dplyr_query <- function(.data, ...) {
check_select_helpers(enexprs(...))
column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename)
column_select(.data, enquos(...), op = "rename")
}
rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query

Expand All @@ -39,29 +37,6 @@ rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...)
}
rename_with.Dataset <- rename_with.ArrowTabular <- rename_with.RecordBatchReader <- rename_with.arrow_dplyr_query

column_select <- function(.data, ..., .FUN = vars_select) {
# .FUN is either tidyselect::vars_select or tidyselect::vars_rename
# It operates on the names() of selected_columns, i.e. the column names
# factoring in any renaming that may already have happened
out <- .FUN(names(.data), !!!enquos(...))
# Make sure that the resulting selected columns map back to the original data,
# as in when there are multiple renaming steps
.data$selected_columns <- set_names(.data$selected_columns[out], names(out))

# If we've renamed columns, we need to project that renaming into other
# query parameters we've collected
renamed <- out[names(out) != out]
if (length(renamed)) {
# Massage group_by
gbv <- .data$group_by_vars
renamed_groups <- gbv %in% renamed
gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
.data$group_by_vars <- gbv
# No need to massage filters because those contain references to Arrow objects
}
.data
}

relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) {
# The code in this function is adapted from the code in dplyr::relocate.data.frame
# at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R
Expand Down Expand Up @@ -115,18 +90,39 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL
}
relocate.Dataset <- relocate.ArrowTabular <- relocate.RecordBatchReader <- relocate.arrow_dplyr_query

check_select_helpers <- function(exprs) {
# Throw an error if unsupported tidyselect selection helpers in `exprs`
exprs <- lapply(exprs, function(x) if (is_quosure(x)) quo_get_expr(x) else x)
unsup_select_helpers <- "where"
funs_in_exprs <- unlist(lapply(exprs, all_funs))
unsup_funs <- funs_in_exprs[funs_in_exprs %in% unsup_select_helpers]
if (length(unsup_funs)) {
stop(
"Unsupported selection ",
ngettext(length(unsup_funs), "helper: ", "helpers: "),
oxford_paste(paste0(unsup_funs, "()"), quote = FALSE),
call. = FALSE
)
column_select <- function(.data, select_expression, op = c("select", "rename")) {
op <- match.arg(op)

.data <- as_adq(.data)
sim_df <- as.data.frame(implicit_schema(.data))
old_names <- names(sim_df)

if (op == "select") {
out <- eval_select(expr(c(!!!select_expression)), sim_df)
# select only columns from `out`
subset <- out
} else if (op == "rename") {
out <- eval_rename(expr(c(!!!select_expression)), sim_df)
# select all columns as only renaming
subset <- set_names(seq_along(old_names), old_names)
names(subset)[out] <- names(out)
}

.data$selected_columns <- set_names(.data$selected_columns[subset], names(subset))

# check if names have updated
new_names <- old_names
new_names[out] <- names(out)
names_compared <- set_names(old_names, new_names)
renamed <- names_compared[old_names != new_names]

# Update names in group_by if changed in select() or rename()
if (length(renamed)) {
gbv <- .data$group_by_vars
renamed_groups <- gbv %in% renamed
gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
.data$group_by_vars <- gbv
}

.data
}
5 changes: 4 additions & 1 deletion r/R/feather.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,11 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T
reader <- FeatherReader$create(file)

col_select <- enquo(col_select)

columns <- if (!quo_is_null(col_select)) {
vars_select(names(reader), !!col_select)
sim_df <- as.data.frame(reader$schema)
indices <- eval_select(col_select, sim_df)
names(reader)[indices]
}

out <- tryCatch(
Expand Down
3 changes: 2 additions & 1 deletion r/R/json.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ read_json_arrow <- function(file,

col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
tab <- tab[vars_select(names(tab), !!col_select)]
sim_df <- as.data.frame(tab$schema)
tab <- tab[eval_select(col_select, sim_df)]
}

if (isTRUE(as_data_frame)) {
Expand Down
5 changes: 2 additions & 3 deletions r/R/parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,8 @@ read_parquet <- function(file,
col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
# infer which columns to keep from schema
schema <- reader$GetSchema()
names <- names(schema)
indices <- match(vars_select(names, !!col_select), names) - 1L
sim_df <- as.data.frame(reader$GetSchema())
indices <- eval_select(col_select, sim_df) - 1L
tab <- tryCatch(
reader$ReadTable(indices),
error = read_compressed_error
Expand Down
5 changes: 5 additions & 0 deletions r/R/schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -383,3 +383,8 @@ as_schema.Schema <- function(x, ...) {
as_schema.StructType <- function(x, ...) {
schema(!!!x$fields())
}

#' @export
as.data.frame.Schema <- function(x, row.names = NULL, optional = FALSE, ...) {
as.data.frame(Table__from_schema(x))
}
11 changes: 11 additions & 0 deletions r/R/table.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,11 @@ Table$create <- function(..., schema = NULL) {
if (is.null(names(dots))) {
names(dots) <- rep_len("", length(dots))
}

if (length(dots) == 0 && inherits(schema, "Schema")) {
return(Table__from_schema(schema))
}

stopifnot(length(dots) > 0)

if (all_record_batches(dots)) {
Expand Down Expand Up @@ -330,3 +335,9 @@ as_arrow_table.RecordBatchReader <- function(x, ...) {
as_arrow_table.arrow_dplyr_query <- function(x, ...) {
as_arrow_table(as_record_batch_reader(x))
}

#' @rdname as_arrow_table
#' @export
as_arrow_table.Schema <- function(x, ...) {
Table__from_schema(x)
}
5 changes: 1 addition & 4 deletions r/data-raw/docgen.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,7 @@ docs <- arrow:::.cache$docs
# Add some functions

# across() is handled by manipulating the quosures, not by nse_funcs
docs[["dplyr::across"]] <- c(
# TODO(ARROW-17384): implement where
"Use of `where()` selection helper not yet supported"
)
docs[["dplyr::across"]] <- character(0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎉


# if_any() and if_all() are used instead of across() in filter()
# they are both handled by manipulating the quosures, not by nse_funcs
Expand Down
2 changes: 1 addition & 1 deletion r/man/acero.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions r/man/as_arrow_table.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/read_delim_arrow.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/read_feather.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/read_json_arrow.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/read_parquet.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions r/src/arrowExports.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 32 additions & 0 deletions r/src/table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "./arrow_types.h"

#include <arrow/array/array_base.h>
#include <arrow/builder.h>
#include <arrow/table.h>
#include <arrow/util/byte_size.h>
#include <arrow/util/key_value_metadata.h>
Expand Down Expand Up @@ -302,6 +303,37 @@ std::shared_ptr<arrow::Table> Table__from_record_batches(
return tab;
}

// [[arrow::export]]
std::shared_ptr<arrow::Table> Table__from_schema(SEXP schema_sxp) {
auto schema = cpp11::as_cpp<std::shared_ptr<arrow::Schema>>(schema_sxp);

int num_fields = schema->num_fields();

std::vector<std::shared_ptr<arrow::Array>> columns;

for (int i = 0; i < num_fields; i++) {
bool is_extension_type = schema->field(i)->type()->name() == "extension";
std::shared_ptr<arrow::DataType> type;

// need to handle extension types a bit differently
if (is_extension_type) {
// TODO: ARROW-18043 - update this to properly construct extension types instead of
// converting to null
type = arrow::null();
} else {
type = schema->field(i)->type();
}

std::shared_ptr<arrow::Array> array;
std::unique_ptr<arrow::ArrayBuilder> type_builder;
StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), type, &type_builder));
StopIfNotOk(type_builder->Finish(&array));
columns.push_back(array);
}

return (arrow::Table::Make(schema, std::move(columns)));
}

// [[arrow::export]]
r_vec_size Table__ReferencedBufferSize(const std::shared_ptr<arrow::Table>& table) {
return r_vec_size(ValueOrStop(arrow::util::ReferencedBufferSize(*table)));
Expand Down
Loading