From e62c77c9e2b4ef9154972f58376cb2a94ff3ff61 Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Wed, 14 Jul 2021 14:17:46 -0400
Subject: [PATCH 01/24] Progress commit

---
 r/R/query-engine.R                      | 29 ++++++++++
 r/src/compute-exec.cpp                  | 74 +++++++++++++++++++++++++
 r/tests/testthat/test-dplyr-aggregate.R | 36 ++++++++++++
 3 files changed, 139 insertions(+)
 create mode 100644 r/R/query-engine.R
 create mode 100644 r/src/compute-exec.cpp
 create mode 100644 r/tests/testthat/test-dplyr-aggregate.R

diff --git a/r/R/query-engine.R b/r/R/query-engine.R
new file mode 100644
index 00000000000..32670192241
--- /dev/null
+++ b/r/R/query-engine.R
@@ -0,0 +1,29 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

ExecNode <- R6Class("ExecNode", inherit = ArrowObject,
  public = list(
    Project = function(cols) {
      assert_is_list_of(cols, "Expression")
      ExecNode_Project(self, cols, names(cols))
    },
    Filter = function(expr) {
      assert_is(expr, "Expression")
      ExecNode_Filter(self, expr)
    }
  )
)
\ No newline at end of file
diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp
new file mode 100644
index 00000000000..eeecb6e2db7
--- /dev/null
+++ b/r/src/compute-exec.cpp
@@ -0,0 +1,74 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "./arrow_types.h"

#if defined(ARROW_R_WITH_ARROW)

#include <arrow/compute/api.h>
#include <arrow/compute/exec/exec_plan.h>

namespace compute = ::arrow::compute;

#if defined(ARROW_R_WITH_DATASET)

#include <arrow/dataset/scanner.h>

std::shared_ptr<compute::ExecNode> StartExecPlan(
    std::shared_ptr<arrow::dataset::Dataset> dataset) {
  auto plan = ValueOrStop(compute::ExecPlan::Make());
  // TODO: pass in ScanOptions by file type
  auto options = std::make_shared<arrow::dataset::ScanOptions>();
  return std::shared_ptr<compute::ExecNode>(
      ValueOrStop(arrow::dataset::MakeScanNode(plan.get(), dataset, options)));
}

#endif

std::shared_ptr<compute::ExecNode> ExecNode_Filter(
    std::shared_ptr<compute::ExecNode> input,
    std::shared_ptr<compute::Expression> filter) {
  return std::shared_ptr<compute::ExecNode>(
      ValueOrStop(compute::MakeFilterNode(input.get(), /*label=*/"filter", *filter)),
      /* empty destructor: ExecNode lifetime is managed by an ExecPlan */
      [](...) {});
}

std::shared_ptr<compute::ExecNode> ExecNode_Project(
    std::shared_ptr<compute::ExecNode> input,
    std::vector<std::shared_ptr<compute::Expression>> exprs,
    std::vector<std::string> names = {}) {
  // We have shared_ptrs of expressions but need the Expressions
  std::vector<compute::Expression> expressions;
  for (auto expr : exprs) {
    expressions.push_back(*expr);
  }
  return std::shared_ptr<compute::ExecNode>(
      ValueOrStop(
          compute::MakeProjectNode(input.get(), /*label=*/"project", expressions, names)),
      [](...) {});
}

std::shared_ptr<compute::ExecNode> ExecNode_ScalarAggregate(
    std::shared_ptr<compute::ExecNode> input,
    std::vector<compute::internal::Aggregate> aggregates);

// ARROW_EXPORT
// Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
//                                           std::vector<internal::Aggregate> aggregates);

#endif
diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R
new file mode 100644
index 00000000000..7186acb9aed
--- /dev/null
+++ b/r/tests/testthat/test-dplyr-aggregate.R
@@ -0,0 +1,36 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

skip_if_not_available("dataset")

library(dplyr)
library(stringr)

tbl <- example_data
# Add some better string data
tbl$verses <- verses[[1]]
# c(" a ", " b ", " c ", ...) increasing padding
# nchar = 3 5 7 9 11 13 15 17 19 21
tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = "both")

test_that("Can aggregate", {
  expect_dplyr_equal(
    input %>%
      summarize(total = sum(int)),
    tbl
  )
})
\ No newline at end of file
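A note on the test helper: expect_dplyr_equal() is an existing helper in the
arrow test suite that evaluates the same dplyr pipeline twice, once with
`input` bound to the plain data.frame and once with it bound to an Arrow
Table or RecordBatch, then checks that both evaluations agree. Roughly, as a
simplified sketch rather than the helper's actual implementation:

  via_df    <- tbl %>% summarize(total = sum(int))
  via_arrow <- Table$create(tbl) %>% summarize(total = sum(int)) %>% collect()
  expect_equal(via_arrow, via_df)
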
From 051074e68f7baf4d499979872bd1b0f43f4fc2d8 Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Wed, 14 Jul 2021 19:46:23 -0400
Subject: [PATCH 02/24] Apply Ben's patch and sketch R side

---
 r/R/query-engine.R     |  24 +++++++++-
 r/src/arrow_types.h    |   1 +
 r/src/compute-exec.cpp | 104 ++++++++++++++++++++++++++++++++-------
 r/src/compute.cpp      |   4 +-
 4 files changed, 113 insertions(+), 20 deletions(-)

diff --git a/r/R/query-engine.R b/r/R/query-engine.R
index 32670192241..5d14264b90c 100644
--- a/r/R/query-engine.R
+++ b/r/R/query-engine.R
@@ -15,6 +15,21 @@
 # specific language governing permissions and limitations
 # under the License.

ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject,
  public = list(
    Scan = function(dataset) {
      # Handle arrow_dplyr_query
      # TODO: why do I need to filter/project here?
      ExecNode_Scan(self, dataset, filter, colnames)
    },
    Run = function(node) {
      assert_is(node, "ExecNode")
      ExecPlan_run(self, node)
    }
  )
)
ExecPlan$create <- ExecPlan_create

ExecNode <- R6Class("ExecNode", inherit = ArrowObject,
  public = list(
    Project = function(cols) {
      assert_is_list_of(cols, "Expression")
      ExecNode_Project(self, cols, names(cols))
    },
    Filter = function(expr) {
      assert_is(expr, "Expression")
      ExecNode_Filter(self, expr)
    },
    ScalarAggregate = function(options, targets, out_field_names) {
      ExecNode_ScalarAggregate(self, options, targets, out_field_names)
    }
  )
)

# plan <- ExecPlan$create()
# final_node <- plan$Scan(dataset)$Filter(expr)$Project(exprs)$ScalarAggregate(something)
# plan$Run(final_node)
\ No newline at end of file
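Spelled out with concrete arguments, the pipeline sketched in those trailing
comments would look something like this (a sketch only: `ds` stands for an
opened Dataset, and the options/targets/out_field_names shapes follow what
the C++ below expects):

  plan <- ExecPlan$create()
  scanned <- plan$Scan(ds)
  aggregated <- scanned$ScalarAggregate(
    options = list(list("sum", NULL)),  # one list(function_name, options) per aggregate
    targets = "int",                    # field(s) to aggregate
    out_field_names = "total"           # column name(s) in the output schema
  )
  plan$Run(aggregated)                  # drains the plan into a Table
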
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index b5a8914d432..49bdefb6f44 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -60,6 +60,7 @@ namespace fs = ::arrow::fs;
 std::shared_ptr<arrow::RecordBatch> RecordBatch__from_arrays(SEXP, SEXP);
 arrow::MemoryPool* gc_memory_pool();
+arrow::compute::ExecContext* gc_context();

diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp
index eeecb6e2db7..e9e9cc04a4c 100644
--- a/r/src/compute-exec.cpp
+++ b/r/src/compute-exec.cpp
@@ -21,33 +21,86 @@

#include <arrow/compute/api.h>
#include <arrow/compute/exec/exec_plan.h>
#include <arrow/table.h>
#include <arrow/util/async_generator.h>

namespace compute = ::arrow::compute;

// [[arrow::export]]
std::shared_ptr<compute::ExecPlan> ExecPlan_create() {
  return ValueOrStop(compute::ExecPlan::Make(gc_context()));
}

// [[arrow::export]]
std::shared_ptr<arrow::Table> ExecPlan_run(
    std::shared_ptr<compute::ExecPlan> plan,
    std::shared_ptr<compute::ExecNode> final_node) {
  // For now, don't require R to construct SinkNodes.
  // Instead, just pass the node we should collect as an argument.
  auto sink_gen = compute::MakeSinkNode(final_node.get(), "sink");

  StopIfNotOk(plan->Validate());
  StopIfNotOk(plan->StartProducing());

  std::shared_ptr<arrow::RecordBatchReader> sink_reader = compute::MakeGeneratorReader(
      final_node->output_schema(), std::move(sink_gen), gc_memory_pool());

  plan->finished().Wait();
  return ValueOrStop(arrow::Table::FromRecordBatchReader(sink_reader.get()));
}

std::shared_ptr<compute::ExecNode> ExecNodeOrStop(
    arrow::Result<compute::ExecNode*> maybe_node) {
  return std::shared_ptr<compute::ExecNode>(ValueOrStop(maybe_node), [](...) {
    // empty destructor: ExecNode lifetime is managed by an ExecPlan
  });
}

#if defined(ARROW_R_WITH_DATASET)

#include <arrow/dataset/scanner.h>

// [[arrow::export]]
std::shared_ptr<compute::ExecNode> ExecNode_Scan(
    std::shared_ptr<compute::ExecPlan> plan,
    std::shared_ptr<arrow::dataset::Dataset> dataset,
    std::shared_ptr<compute::Expression> filter,
    std::vector<std::string> materialized_field_names) {
  // TODO: pass in ScanOptions by file type
  auto options = std::make_shared<arrow::dataset::ScanOptions>();

  options->use_async = true;

  options->dataset_schema = dataset->schema();

  // ScanNode needs the filter to do predicate pushdown and skip partitions
  options->filter = ValueOrStop(filter->Bind(*dataset->schema()));

  // ScanNode needs to know which fields to materialize (and which are unnecessary)
  std::vector<compute::Expression> exprs;
  for (const auto& name : materialized_field_names) {
    exprs.push_back(compute::field_ref(name));
  }

  options->projection =
      ValueOrStop(compute::call("project", std::move(exprs),
                                compute::ProjectOptions{std::move(materialized_field_names)})
                      .Bind(*dataset->schema()));

  return ExecNodeOrStop(arrow::dataset::MakeScanNode(plan.get(), dataset, options));
}

#endif

// [[arrow::export]]
std::shared_ptr<compute::ExecNode> ExecNode_Filter(
    std::shared_ptr<compute::ExecNode> input,
    std::shared_ptr<compute::Expression> filter) {
  return ExecNodeOrStop(
      compute::MakeFilterNode(input.get(), /*label=*/"filter", *filter));
}

// [[arrow::export]]
std::shared_ptr<compute::ExecNode> ExecNode_Project(
    std::shared_ptr<compute::ExecNode> input,
    std::vector<std::shared_ptr<compute::Expression>> exprs,
    std::vector<std::string> names = {}) {
  // We have shared_ptrs of expressions but need the Expressions
  std::vector<compute::Expression> expressions;
  for (auto expr : exprs) {
    expressions.push_back(*expr);
  }
  return ExecNodeOrStop(compute::MakeProjectNode(
      input.get(), /*label=*/"project", std::move(expressions), std::move(names)));
}

std::shared_ptr<compute::ExecNode> ExecNode_ScalarAggregate(
    std::shared_ptr<compute::ExecNode> input, cpp11::list options,
    std::vector<std::string> targets, std::vector<std::string> out_field_names) {
  // PROBLEM: need to keep these alive as long as the plan somehow.
  std::vector<std::shared_ptr<arrow::compute::FunctionOptions>> keep_alives;
  std::vector<arrow::compute::internal::Aggregate> aggregates;

  for (cpp11::list name_opts : options) {
    auto name = cpp11::as_cpp<std::string>(name_opts[0]);
    auto opts = make_compute_options(name, name_opts[1]);

    aggregates.push_back(
        arrow::compute::internal::Aggregate{std::move(name), opts.get()});
    keep_alives.push_back(std::move(opts));
  }

  auto scalar_agg = ValueOrStop(MakeScalarAggregateNode(
      input.get(), /*label=*/"scalar_agg", aggregates, targets, out_field_names));

  return std::shared_ptr<compute::ExecNode>(scalar_agg, [keep_alives](...) {
    // empty destructor: ExecNode lifetime is managed by an ExecPlan
    // also carries the function options
  });
}

#endif
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 30821137383..142a460d2eb 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -22,11 +22,13 @@
 #include
 #include
 #include
+#include <arrow/util/thread_pool.h>

 std::shared_ptr<arrow::compute::CastOptions> make_cast_options(cpp11::list options);

 arrow::compute::ExecContext* gc_context() {
-  static arrow::compute::ExecContext context(gc_memory_pool());
+  static arrow::compute::ExecContext context(gc_memory_pool(),
+                                             arrow::internal::GetCpuThreadPool());
   return &context;
 }
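Note the ownership dance in ExecNode_ScalarAggregate() above: each Aggregate
struct holds only the raw opts.get() pointer, so the shared_ptr FunctionOptions
are parked in keep_alives, and capturing that vector in the returned
shared_ptr's deleter keeps them alive for as long as R holds the node, though,
as the PROBLEM comment says, nothing yet ties them to the plan itself. The
`options` argument it parses is expected to arrive from R as one
list(function_name, function_options) pair per aggregate, for example
(hypothetical values):

  agg_options <- list(
    list("sum", NULL),   # name_opts[0] = function name; name_opts[1] -> make_compute_options()
    list("mean", NULL)
  )
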
From 41c0826ac5442ab2956677991fa715207171fede Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Thu, 15 Jul 2021 07:16:00 -0400
Subject: [PATCH 03/24] More R; try to get C++ to compile

---
 r/R/arrowExports.R                      | 1771 ++++++++++++-----------
 r/R/dplyr-summarize.R                   |   78 +-
 r/R/query-engine.R                      |   23 +-
 r/src/arrowExports.cpp                  |   86 ++
 r/src/arrow_types.h                     |    9 +
 r/src/compute-exec.cpp                  |    9 +-
 r/tests/testthat/test-dplyr-aggregate.R |    1 +
 7 files changed, 1090 insertions(+), 887 deletions(-)

diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index e54f88e9d4e..250fd53f1a0 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -1,1749 +1,1772 @@
# Generated by using data-raw/codegen.R -> do not edit by hand

is_altrep_int_nonull <- function(x){
  .Call(`_arrow_is_altrep_int_nonull`, x)
}

is_altrep_dbl_nonull <- function(x){
  .Call(`_arrow_is_altrep_dbl_nonull`, x)
}

Array__Slice1 <- function(array, offset){
  .Call(`_arrow_Array__Slice1`, array, offset)
}

Array__Slice2 <- function(array, offset, length){
  .Call(`_arrow_Array__Slice2`, array, offset, length)
}

Array__IsNull <- function(x, i){
  .Call(`_arrow_Array__IsNull`, x, i)
}

Array__IsValid <- function(x, i){
  .Call(`_arrow_Array__IsValid`, x, i)
}

Array__length <- function(x){
  .Call(`_arrow_Array__length`, x)
}

Array__offset <- function(x){
  .Call(`_arrow_Array__offset`, x)
}

Array__null_count <- function(x){
  .Call(`_arrow_Array__null_count`, x)
}

Array__type <- function(x){
  .Call(`_arrow_Array__type`, x)
}

Array__ToString <- function(x){
  .Call(`_arrow_Array__ToString`, x)
}

Array__type_id <- function(x){
  .Call(`_arrow_Array__type_id`, x)
}

Array__Equals <- function(lhs, rhs){
  .Call(`_arrow_Array__Equals`, lhs, rhs)
}

Array__ApproxEquals <- function(lhs, rhs){
  .Call(`_arrow_Array__ApproxEquals`, lhs, rhs)
}

Array__Diff <- function(lhs, rhs){
- .Call(`_arrow_Array__Diff`, lhs, rhs) +Array__Diff <- function(lhs, rhs){ + .Call(`_arrow_Array__Diff`, lhs, rhs) } -Array__data <- function(array) { - .Call(`_arrow_Array__data`, array) +Array__data <- function(array){ + .Call(`_arrow_Array__data`, array) } -Array__RangeEquals <- function(self, other, start_idx, end_idx, other_start_idx) { - .Call(`_arrow_Array__RangeEquals`, self, other, start_idx, end_idx, other_start_idx) +Array__RangeEquals <- function(self, other, start_idx, end_idx, other_start_idx){ + .Call(`_arrow_Array__RangeEquals`, self, other, start_idx, end_idx, other_start_idx) } -Array__View <- function(array, type) { - .Call(`_arrow_Array__View`, array, type) +Array__View <- function(array, type){ + .Call(`_arrow_Array__View`, array, type) } -Array__Validate <- function(array) { - invisible(.Call(`_arrow_Array__Validate`, array)) +Array__Validate <- function(array){ + invisible(.Call(`_arrow_Array__Validate`, array)) } -DictionaryArray__indices <- function(array) { - .Call(`_arrow_DictionaryArray__indices`, array) +DictionaryArray__indices <- function(array){ + .Call(`_arrow_DictionaryArray__indices`, array) } -DictionaryArray__dictionary <- function(array) { - .Call(`_arrow_DictionaryArray__dictionary`, array) +DictionaryArray__dictionary <- function(array){ + .Call(`_arrow_DictionaryArray__dictionary`, array) } -StructArray__field <- function(array, i) { - .Call(`_arrow_StructArray__field`, array, i) +StructArray__field <- function(array, i){ + .Call(`_arrow_StructArray__field`, array, i) } -StructArray__GetFieldByName <- function(array, name) { - .Call(`_arrow_StructArray__GetFieldByName`, array, name) +StructArray__GetFieldByName <- function(array, name){ + .Call(`_arrow_StructArray__GetFieldByName`, array, name) } -StructArray__Flatten <- function(array) { - .Call(`_arrow_StructArray__Flatten`, array) +StructArray__Flatten <- function(array){ + .Call(`_arrow_StructArray__Flatten`, array) } -ListArray__value_type <- function(array) { - .Call(`_arrow_ListArray__value_type`, array) +ListArray__value_type <- function(array){ + .Call(`_arrow_ListArray__value_type`, array) } -LargeListArray__value_type <- function(array) { - .Call(`_arrow_LargeListArray__value_type`, array) +LargeListArray__value_type <- function(array){ + .Call(`_arrow_LargeListArray__value_type`, array) } -ListArray__values <- function(array) { - .Call(`_arrow_ListArray__values`, array) +ListArray__values <- function(array){ + .Call(`_arrow_ListArray__values`, array) } -LargeListArray__values <- function(array) { - .Call(`_arrow_LargeListArray__values`, array) +LargeListArray__values <- function(array){ + .Call(`_arrow_LargeListArray__values`, array) } -ListArray__value_length <- function(array, i) { - .Call(`_arrow_ListArray__value_length`, array, i) +ListArray__value_length <- function(array, i){ + .Call(`_arrow_ListArray__value_length`, array, i) } -LargeListArray__value_length <- function(array, i) { - .Call(`_arrow_LargeListArray__value_length`, array, i) +LargeListArray__value_length <- function(array, i){ + .Call(`_arrow_LargeListArray__value_length`, array, i) } -FixedSizeListArray__value_length <- function(array, i) { - .Call(`_arrow_FixedSizeListArray__value_length`, array, i) +FixedSizeListArray__value_length <- function(array, i){ + .Call(`_arrow_FixedSizeListArray__value_length`, array, i) } -ListArray__value_offset <- function(array, i) { - .Call(`_arrow_ListArray__value_offset`, array, i) +ListArray__value_offset <- function(array, i){ + .Call(`_arrow_ListArray__value_offset`, array, i) } 
-LargeListArray__value_offset <- function(array, i) { - .Call(`_arrow_LargeListArray__value_offset`, array, i) +LargeListArray__value_offset <- function(array, i){ + .Call(`_arrow_LargeListArray__value_offset`, array, i) } -FixedSizeListArray__value_offset <- function(array, i) { - .Call(`_arrow_FixedSizeListArray__value_offset`, array, i) +FixedSizeListArray__value_offset <- function(array, i){ + .Call(`_arrow_FixedSizeListArray__value_offset`, array, i) } -ListArray__raw_value_offsets <- function(array) { - .Call(`_arrow_ListArray__raw_value_offsets`, array) +ListArray__raw_value_offsets <- function(array){ + .Call(`_arrow_ListArray__raw_value_offsets`, array) } -LargeListArray__raw_value_offsets <- function(array) { - .Call(`_arrow_LargeListArray__raw_value_offsets`, array) +LargeListArray__raw_value_offsets <- function(array){ + .Call(`_arrow_LargeListArray__raw_value_offsets`, array) } -Array__as_vector <- function(array) { - .Call(`_arrow_Array__as_vector`, array) +Array__as_vector <- function(array){ + .Call(`_arrow_Array__as_vector`, array) } -ChunkedArray__as_vector <- function(chunked_array, use_threads) { - .Call(`_arrow_ChunkedArray__as_vector`, chunked_array, use_threads) +ChunkedArray__as_vector <- function(chunked_array, use_threads){ + .Call(`_arrow_ChunkedArray__as_vector`, chunked_array, use_threads) } -RecordBatch__to_dataframe <- function(batch, use_threads) { - .Call(`_arrow_RecordBatch__to_dataframe`, batch, use_threads) +RecordBatch__to_dataframe <- function(batch, use_threads){ + .Call(`_arrow_RecordBatch__to_dataframe`, batch, use_threads) } -Table__to_dataframe <- function(table, use_threads) { - .Call(`_arrow_Table__to_dataframe`, table, use_threads) +Table__to_dataframe <- function(table, use_threads){ + .Call(`_arrow_Table__to_dataframe`, table, use_threads) } -ArrayData__get_type <- function(x) { - .Call(`_arrow_ArrayData__get_type`, x) +ArrayData__get_type <- function(x){ + .Call(`_arrow_ArrayData__get_type`, x) } -ArrayData__get_length <- function(x) { - .Call(`_arrow_ArrayData__get_length`, x) +ArrayData__get_length <- function(x){ + .Call(`_arrow_ArrayData__get_length`, x) } -ArrayData__get_null_count <- function(x) { - .Call(`_arrow_ArrayData__get_null_count`, x) +ArrayData__get_null_count <- function(x){ + .Call(`_arrow_ArrayData__get_null_count`, x) } -ArrayData__get_offset <- function(x) { - .Call(`_arrow_ArrayData__get_offset`, x) +ArrayData__get_offset <- function(x){ + .Call(`_arrow_ArrayData__get_offset`, x) } -ArrayData__buffers <- function(x) { - .Call(`_arrow_ArrayData__buffers`, x) +ArrayData__buffers <- function(x){ + .Call(`_arrow_ArrayData__buffers`, x) } -Buffer__is_mutable <- function(buffer) { - .Call(`_arrow_Buffer__is_mutable`, buffer) +Buffer__is_mutable <- function(buffer){ + .Call(`_arrow_Buffer__is_mutable`, buffer) } -Buffer__ZeroPadding <- function(buffer) { - invisible(.Call(`_arrow_Buffer__ZeroPadding`, buffer)) +Buffer__ZeroPadding <- function(buffer){ + invisible(.Call(`_arrow_Buffer__ZeroPadding`, buffer)) } -Buffer__capacity <- function(buffer) { - .Call(`_arrow_Buffer__capacity`, buffer) +Buffer__capacity <- function(buffer){ + .Call(`_arrow_Buffer__capacity`, buffer) } -Buffer__size <- function(buffer) { - .Call(`_arrow_Buffer__size`, buffer) +Buffer__size <- function(buffer){ + .Call(`_arrow_Buffer__size`, buffer) } -r___RBuffer__initialize <- function(x) { - .Call(`_arrow_r___RBuffer__initialize`, x) +r___RBuffer__initialize <- function(x){ + .Call(`_arrow_r___RBuffer__initialize`, x) } -Buffer__data <- function(buffer) 
{ - .Call(`_arrow_Buffer__data`, buffer) +Buffer__data <- function(buffer){ + .Call(`_arrow_Buffer__data`, buffer) } -Buffer__Equals <- function(x, y) { - .Call(`_arrow_Buffer__Equals`, x, y) +Buffer__Equals <- function(x, y){ + .Call(`_arrow_Buffer__Equals`, x, y) } -ChunkedArray__length <- function(chunked_array) { - .Call(`_arrow_ChunkedArray__length`, chunked_array) +ChunkedArray__length <- function(chunked_array){ + .Call(`_arrow_ChunkedArray__length`, chunked_array) } -ChunkedArray__null_count <- function(chunked_array) { - .Call(`_arrow_ChunkedArray__null_count`, chunked_array) +ChunkedArray__null_count <- function(chunked_array){ + .Call(`_arrow_ChunkedArray__null_count`, chunked_array) } -ChunkedArray__num_chunks <- function(chunked_array) { - .Call(`_arrow_ChunkedArray__num_chunks`, chunked_array) +ChunkedArray__num_chunks <- function(chunked_array){ + .Call(`_arrow_ChunkedArray__num_chunks`, chunked_array) } -ChunkedArray__chunk <- function(chunked_array, i) { - .Call(`_arrow_ChunkedArray__chunk`, chunked_array, i) +ChunkedArray__chunk <- function(chunked_array, i){ + .Call(`_arrow_ChunkedArray__chunk`, chunked_array, i) } -ChunkedArray__chunks <- function(chunked_array) { - .Call(`_arrow_ChunkedArray__chunks`, chunked_array) +ChunkedArray__chunks <- function(chunked_array){ + .Call(`_arrow_ChunkedArray__chunks`, chunked_array) } -ChunkedArray__type <- function(chunked_array) { - .Call(`_arrow_ChunkedArray__type`, chunked_array) +ChunkedArray__type <- function(chunked_array){ + .Call(`_arrow_ChunkedArray__type`, chunked_array) } -ChunkedArray__Slice1 <- function(chunked_array, offset) { - .Call(`_arrow_ChunkedArray__Slice1`, chunked_array, offset) +ChunkedArray__Slice1 <- function(chunked_array, offset){ + .Call(`_arrow_ChunkedArray__Slice1`, chunked_array, offset) } -ChunkedArray__Slice2 <- function(chunked_array, offset, length) { - .Call(`_arrow_ChunkedArray__Slice2`, chunked_array, offset, length) +ChunkedArray__Slice2 <- function(chunked_array, offset, length){ + .Call(`_arrow_ChunkedArray__Slice2`, chunked_array, offset, length) } -ChunkedArray__View <- function(array, type) { - .Call(`_arrow_ChunkedArray__View`, array, type) +ChunkedArray__View <- function(array, type){ + .Call(`_arrow_ChunkedArray__View`, array, type) } -ChunkedArray__Validate <- function(chunked_array) { - invisible(.Call(`_arrow_ChunkedArray__Validate`, chunked_array)) +ChunkedArray__Validate <- function(chunked_array){ + invisible(.Call(`_arrow_ChunkedArray__Validate`, chunked_array)) } -ChunkedArray__Equals <- function(x, y) { - .Call(`_arrow_ChunkedArray__Equals`, x, y) +ChunkedArray__Equals <- function(x, y){ + .Call(`_arrow_ChunkedArray__Equals`, x, y) } -ChunkedArray__ToString <- function(x) { - .Call(`_arrow_ChunkedArray__ToString`, x) +ChunkedArray__ToString <- function(x){ + .Call(`_arrow_ChunkedArray__ToString`, x) } -ChunkedArray__from_list <- function(chunks, s_type) { - .Call(`_arrow_ChunkedArray__from_list`, chunks, s_type) +ChunkedArray__from_list <- function(chunks, s_type){ + .Call(`_arrow_ChunkedArray__from_list`, chunks, s_type) } -util___Codec__Create <- function(codec, compression_level) { - .Call(`_arrow_util___Codec__Create`, codec, compression_level) +util___Codec__Create <- function(codec, compression_level){ + .Call(`_arrow_util___Codec__Create`, codec, compression_level) } -util___Codec__name <- function(codec) { - .Call(`_arrow_util___Codec__name`, codec) +util___Codec__name <- function(codec){ + .Call(`_arrow_util___Codec__name`, codec) } -util___Codec__IsAvailable <- 
function(codec) { - .Call(`_arrow_util___Codec__IsAvailable`, codec) +util___Codec__IsAvailable <- function(codec){ + .Call(`_arrow_util___Codec__IsAvailable`, codec) } -io___CompressedOutputStream__Make <- function(codec, raw) { - .Call(`_arrow_io___CompressedOutputStream__Make`, codec, raw) +io___CompressedOutputStream__Make <- function(codec, raw){ + .Call(`_arrow_io___CompressedOutputStream__Make`, codec, raw) } -io___CompressedInputStream__Make <- function(codec, raw) { - .Call(`_arrow_io___CompressedInputStream__Make`, codec, raw) +io___CompressedInputStream__Make <- function(codec, raw){ + .Call(`_arrow_io___CompressedInputStream__Make`, codec, raw) } -RecordBatch__cast <- function(batch, schema, options) { - .Call(`_arrow_RecordBatch__cast`, batch, schema, options) +ExecPlan_create <- function(){ + .Call(`_arrow_ExecPlan_create`) } -Table__cast <- function(table, schema, options) { - .Call(`_arrow_Table__cast`, table, schema, options) +ExecPlan_run <- function(plan, final_node){ + .Call(`_arrow_ExecPlan_run`, plan, final_node) } -compute__CallFunction <- function(func_name, args, options) { - .Call(`_arrow_compute__CallFunction`, func_name, args, options) +ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names){ + .Call(`_arrow_ExecNode_Scan`, plan, dataset, filter, materialized_field_names) } -compute__GroupBy <- function(arguments, keys, options) { - .Call(`_arrow_compute__GroupBy`, arguments, keys, options) +ExecNode_Filter <- function(input, filter){ + .Call(`_arrow_ExecNode_Filter`, input, filter) } -compute__GetFunctionNames <- function() { - .Call(`_arrow_compute__GetFunctionNames`) +ExecNode_Project <- function(input, exprs, names){ + .Call(`_arrow_ExecNode_Project`, input, exprs, names) } -build_info <- function() { - .Call(`_arrow_build_info`) +RecordBatch__cast <- function(batch, schema, options){ + .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } -runtime_info <- function() { - .Call(`_arrow_runtime_info`) +Table__cast <- function(table, schema, options){ + .Call(`_arrow_Table__cast`, table, schema, options) } -csv___WriteOptions__initialize <- function(options) { - .Call(`_arrow_csv___WriteOptions__initialize`, options) +compute__CallFunction <- function(func_name, args, options){ + .Call(`_arrow_compute__CallFunction`, func_name, args, options) } -csv___ReadOptions__initialize <- function(options) { - .Call(`_arrow_csv___ReadOptions__initialize`, options) +compute__GroupBy <- function(arguments, keys, options){ + .Call(`_arrow_compute__GroupBy`, arguments, keys, options) } -csv___ParseOptions__initialize <- function(options) { - .Call(`_arrow_csv___ParseOptions__initialize`, options) +compute__GetFunctionNames <- function(){ + .Call(`_arrow_compute__GetFunctionNames`) } -csv___ReadOptions__column_names <- function(options) { - .Call(`_arrow_csv___ReadOptions__column_names`, options) +build_info <- function(){ + .Call(`_arrow_build_info`) } -csv___ConvertOptions__initialize <- function(options) { - .Call(`_arrow_csv___ConvertOptions__initialize`, options) +runtime_info <- function(){ + .Call(`_arrow_runtime_info`) } -csv___TableReader__Make <- function(input, read_options, parse_options, convert_options) { - .Call(`_arrow_csv___TableReader__Make`, input, read_options, parse_options, convert_options) +csv___WriteOptions__initialize <- function(options){ + .Call(`_arrow_csv___WriteOptions__initialize`, options) } -csv___TableReader__Read <- function(table_reader) { - .Call(`_arrow_csv___TableReader__Read`, table_reader) 
+csv___ReadOptions__initialize <- function(options){ + .Call(`_arrow_csv___ReadOptions__initialize`, options) } -TimestampParser__kind <- function(parser) { - .Call(`_arrow_TimestampParser__kind`, parser) +csv___ParseOptions__initialize <- function(options){ + .Call(`_arrow_csv___ParseOptions__initialize`, options) } -TimestampParser__format <- function(parser) { - .Call(`_arrow_TimestampParser__format`, parser) +csv___ReadOptions__column_names <- function(options){ + .Call(`_arrow_csv___ReadOptions__column_names`, options) } -TimestampParser__MakeStrptime <- function(format) { - .Call(`_arrow_TimestampParser__MakeStrptime`, format) +csv___ConvertOptions__initialize <- function(options){ + .Call(`_arrow_csv___ConvertOptions__initialize`, options) } -TimestampParser__MakeISO8601 <- function() { - .Call(`_arrow_TimestampParser__MakeISO8601`) +csv___TableReader__Make <- function(input, read_options, parse_options, convert_options){ + .Call(`_arrow_csv___TableReader__Make`, input, read_options, parse_options, convert_options) } -csv___WriteCSV__Table <- function(table, write_options, stream) { - invisible(.Call(`_arrow_csv___WriteCSV__Table`, table, write_options, stream)) +csv___TableReader__Read <- function(table_reader){ + .Call(`_arrow_csv___TableReader__Read`, table_reader) } -csv___WriteCSV__RecordBatch <- function(record_batch, write_options, stream) { - invisible(.Call(`_arrow_csv___WriteCSV__RecordBatch`, record_batch, write_options, stream)) +TimestampParser__kind <- function(parser){ + .Call(`_arrow_TimestampParser__kind`, parser) } -dataset___Dataset__NewScan <- function(ds) { - .Call(`_arrow_dataset___Dataset__NewScan`, ds) +TimestampParser__format <- function(parser){ + .Call(`_arrow_TimestampParser__format`, parser) } -dataset___Dataset__schema <- function(dataset) { - .Call(`_arrow_dataset___Dataset__schema`, dataset) +TimestampParser__MakeStrptime <- function(format){ + .Call(`_arrow_TimestampParser__MakeStrptime`, format) } -dataset___Dataset__type_name <- function(dataset) { - .Call(`_arrow_dataset___Dataset__type_name`, dataset) +TimestampParser__MakeISO8601 <- function(){ + .Call(`_arrow_TimestampParser__MakeISO8601`) } -dataset___Dataset__ReplaceSchema <- function(dataset, schm) { - .Call(`_arrow_dataset___Dataset__ReplaceSchema`, dataset, schm) +csv___WriteCSV__Table <- function(table, write_options, stream){ + invisible(.Call(`_arrow_csv___WriteCSV__Table`, table, write_options, stream)) } -dataset___UnionDataset__create <- function(datasets, schm) { - .Call(`_arrow_dataset___UnionDataset__create`, datasets, schm) +csv___WriteCSV__RecordBatch <- function(record_batch, write_options, stream){ + invisible(.Call(`_arrow_csv___WriteCSV__RecordBatch`, record_batch, write_options, stream)) } -dataset___InMemoryDataset__create <- function(table) { - .Call(`_arrow_dataset___InMemoryDataset__create`, table) +dataset___Dataset__NewScan <- function(ds){ + .Call(`_arrow_dataset___Dataset__NewScan`, ds) } -dataset___UnionDataset__children <- function(ds) { - .Call(`_arrow_dataset___UnionDataset__children`, ds) +dataset___Dataset__schema <- function(dataset){ + .Call(`_arrow_dataset___Dataset__schema`, dataset) } -dataset___FileSystemDataset__format <- function(dataset) { - .Call(`_arrow_dataset___FileSystemDataset__format`, dataset) +dataset___Dataset__type_name <- function(dataset){ + .Call(`_arrow_dataset___Dataset__type_name`, dataset) } -dataset___FileSystemDataset__filesystem <- function(dataset) { - .Call(`_arrow_dataset___FileSystemDataset__filesystem`, dataset) 
+dataset___Dataset__ReplaceSchema <- function(dataset, schm){ + .Call(`_arrow_dataset___Dataset__ReplaceSchema`, dataset, schm) } -dataset___FileSystemDataset__files <- function(dataset) { - .Call(`_arrow_dataset___FileSystemDataset__files`, dataset) +dataset___UnionDataset__create <- function(datasets, schm){ + .Call(`_arrow_dataset___UnionDataset__create`, datasets, schm) } -dataset___DatasetFactory__Finish1 <- function(factory, unify_schemas) { - .Call(`_arrow_dataset___DatasetFactory__Finish1`, factory, unify_schemas) +dataset___InMemoryDataset__create <- function(table){ + .Call(`_arrow_dataset___InMemoryDataset__create`, table) } -dataset___DatasetFactory__Finish2 <- function(factory, schema) { - .Call(`_arrow_dataset___DatasetFactory__Finish2`, factory, schema) +dataset___UnionDataset__children <- function(ds){ + .Call(`_arrow_dataset___UnionDataset__children`, ds) } -dataset___DatasetFactory__Inspect <- function(factory, unify_schemas) { - .Call(`_arrow_dataset___DatasetFactory__Inspect`, factory, unify_schemas) +dataset___FileSystemDataset__format <- function(dataset){ + .Call(`_arrow_dataset___FileSystemDataset__format`, dataset) } -dataset___UnionDatasetFactory__Make <- function(children) { - .Call(`_arrow_dataset___UnionDatasetFactory__Make`, children) +dataset___FileSystemDataset__filesystem <- function(dataset){ + .Call(`_arrow_dataset___FileSystemDataset__filesystem`, dataset) } -dataset___FileSystemDatasetFactory__Make0 <- function(fs, paths, format) { - .Call(`_arrow_dataset___FileSystemDatasetFactory__Make0`, fs, paths, format) +dataset___FileSystemDataset__files <- function(dataset){ + .Call(`_arrow_dataset___FileSystemDataset__files`, dataset) } -dataset___FileSystemDatasetFactory__Make2 <- function(fs, selector, format, partitioning) { - .Call(`_arrow_dataset___FileSystemDatasetFactory__Make2`, fs, selector, format, partitioning) +dataset___DatasetFactory__Finish1 <- function(factory, unify_schemas){ + .Call(`_arrow_dataset___DatasetFactory__Finish1`, factory, unify_schemas) } -dataset___FileSystemDatasetFactory__Make1 <- function(fs, selector, format) { - .Call(`_arrow_dataset___FileSystemDatasetFactory__Make1`, fs, selector, format) +dataset___DatasetFactory__Finish2 <- function(factory, schema){ + .Call(`_arrow_dataset___DatasetFactory__Finish2`, factory, schema) } -dataset___FileSystemDatasetFactory__Make3 <- function(fs, selector, format, factory) { - .Call(`_arrow_dataset___FileSystemDatasetFactory__Make3`, fs, selector, format, factory) +dataset___DatasetFactory__Inspect <- function(factory, unify_schemas){ + .Call(`_arrow_dataset___DatasetFactory__Inspect`, factory, unify_schemas) } -dataset___FileFormat__type_name <- function(format) { - .Call(`_arrow_dataset___FileFormat__type_name`, format) +dataset___UnionDatasetFactory__Make <- function(children){ + .Call(`_arrow_dataset___UnionDatasetFactory__Make`, children) } -dataset___FileFormat__DefaultWriteOptions <- function(fmt) { - .Call(`_arrow_dataset___FileFormat__DefaultWriteOptions`, fmt) +dataset___FileSystemDatasetFactory__Make0 <- function(fs, paths, format){ + .Call(`_arrow_dataset___FileSystemDatasetFactory__Make0`, fs, paths, format) } -dataset___ParquetFileFormat__Make <- function(options, dict_columns) { - .Call(`_arrow_dataset___ParquetFileFormat__Make`, options, dict_columns) +dataset___FileSystemDatasetFactory__Make2 <- function(fs, selector, format, partitioning){ + .Call(`_arrow_dataset___FileSystemDatasetFactory__Make2`, fs, selector, format, partitioning) } 
-dataset___FileWriteOptions__type_name <- function(options) { - .Call(`_arrow_dataset___FileWriteOptions__type_name`, options) +dataset___FileSystemDatasetFactory__Make1 <- function(fs, selector, format){ + .Call(`_arrow_dataset___FileSystemDatasetFactory__Make1`, fs, selector, format) } -dataset___ParquetFileWriteOptions__update <- function(options, writer_props, arrow_writer_props) { - invisible(.Call(`_arrow_dataset___ParquetFileWriteOptions__update`, options, writer_props, arrow_writer_props)) +dataset___FileSystemDatasetFactory__Make3 <- function(fs, selector, format, factory){ + .Call(`_arrow_dataset___FileSystemDatasetFactory__Make3`, fs, selector, format, factory) } -dataset___IpcFileWriteOptions__update2 <- function(ipc_options, use_legacy_format, codec, metadata_version) { - invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update2`, ipc_options, use_legacy_format, codec, metadata_version)) +dataset___FileFormat__type_name <- function(format){ + .Call(`_arrow_dataset___FileFormat__type_name`, format) } -dataset___IpcFileWriteOptions__update1 <- function(ipc_options, use_legacy_format, metadata_version) { - invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update1`, ipc_options, use_legacy_format, metadata_version)) +dataset___FileFormat__DefaultWriteOptions <- function(fmt){ + .Call(`_arrow_dataset___FileFormat__DefaultWriteOptions`, fmt) } -dataset___CsvFileWriteOptions__update <- function(csv_options, write_options) { - invisible(.Call(`_arrow_dataset___CsvFileWriteOptions__update`, csv_options, write_options)) +dataset___ParquetFileFormat__Make <- function(options, dict_columns){ + .Call(`_arrow_dataset___ParquetFileFormat__Make`, options, dict_columns) } -dataset___IpcFileFormat__Make <- function() { - .Call(`_arrow_dataset___IpcFileFormat__Make`) +dataset___FileWriteOptions__type_name <- function(options){ + .Call(`_arrow_dataset___FileWriteOptions__type_name`, options) } -dataset___CsvFileFormat__Make <- function(parse_options, convert_options, read_options) { - .Call(`_arrow_dataset___CsvFileFormat__Make`, parse_options, convert_options, read_options) +dataset___ParquetFileWriteOptions__update <- function(options, writer_props, arrow_writer_props){ + invisible(.Call(`_arrow_dataset___ParquetFileWriteOptions__update`, options, writer_props, arrow_writer_props)) } -dataset___FragmentScanOptions__type_name <- function(fragment_scan_options) { - .Call(`_arrow_dataset___FragmentScanOptions__type_name`, fragment_scan_options) +dataset___IpcFileWriteOptions__update2 <- function(ipc_options, use_legacy_format, codec, metadata_version){ + invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update2`, ipc_options, use_legacy_format, codec, metadata_version)) } -dataset___CsvFragmentScanOptions__Make <- function(convert_options, read_options) { - .Call(`_arrow_dataset___CsvFragmentScanOptions__Make`, convert_options, read_options) +dataset___IpcFileWriteOptions__update1 <- function(ipc_options, use_legacy_format, metadata_version){ + invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update1`, ipc_options, use_legacy_format, metadata_version)) } -dataset___ParquetFragmentScanOptions__Make <- function(use_buffered_stream, buffer_size, pre_buffer) { - .Call(`_arrow_dataset___ParquetFragmentScanOptions__Make`, use_buffered_stream, buffer_size, pre_buffer) +dataset___CsvFileWriteOptions__update <- function(csv_options, write_options){ + invisible(.Call(`_arrow_dataset___CsvFileWriteOptions__update`, csv_options, write_options)) } -dataset___DirectoryPartitioning <- 
function(schm, segment_encoding) { - .Call(`_arrow_dataset___DirectoryPartitioning`, schm, segment_encoding) +dataset___IpcFileFormat__Make <- function(){ + .Call(`_arrow_dataset___IpcFileFormat__Make`) } -dataset___DirectoryPartitioning__MakeFactory <- function(field_names, segment_encoding) { - .Call(`_arrow_dataset___DirectoryPartitioning__MakeFactory`, field_names, segment_encoding) +dataset___CsvFileFormat__Make <- function(parse_options, convert_options, read_options){ + .Call(`_arrow_dataset___CsvFileFormat__Make`, parse_options, convert_options, read_options) } -dataset___HivePartitioning <- function(schm, null_fallback, segment_encoding) { - .Call(`_arrow_dataset___HivePartitioning`, schm, null_fallback, segment_encoding) +dataset___FragmentScanOptions__type_name <- function(fragment_scan_options){ + .Call(`_arrow_dataset___FragmentScanOptions__type_name`, fragment_scan_options) } -dataset___HivePartitioning__MakeFactory <- function(null_fallback, segment_encoding) { - .Call(`_arrow_dataset___HivePartitioning__MakeFactory`, null_fallback, segment_encoding) +dataset___CsvFragmentScanOptions__Make <- function(convert_options, read_options){ + .Call(`_arrow_dataset___CsvFragmentScanOptions__Make`, convert_options, read_options) } -dataset___ScannerBuilder__ProjectNames <- function(sb, cols) { - invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectNames`, sb, cols)) +dataset___ParquetFragmentScanOptions__Make <- function(use_buffered_stream, buffer_size, pre_buffer){ + .Call(`_arrow_dataset___ParquetFragmentScanOptions__Make`, use_buffered_stream, buffer_size, pre_buffer) } -dataset___ScannerBuilder__ProjectExprs <- function(sb, exprs, names) { - invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectExprs`, sb, exprs, names)) +dataset___DirectoryPartitioning <- function(schm, segment_encoding){ + .Call(`_arrow_dataset___DirectoryPartitioning`, schm, segment_encoding) } -dataset___ScannerBuilder__Filter <- function(sb, expr) { - invisible(.Call(`_arrow_dataset___ScannerBuilder__Filter`, sb, expr)) +dataset___DirectoryPartitioning__MakeFactory <- function(field_names, segment_encoding){ + .Call(`_arrow_dataset___DirectoryPartitioning__MakeFactory`, field_names, segment_encoding) } -dataset___ScannerBuilder__UseThreads <- function(sb, threads) { - invisible(.Call(`_arrow_dataset___ScannerBuilder__UseThreads`, sb, threads)) +dataset___HivePartitioning <- function(schm, null_fallback, segment_encoding){ + .Call(`_arrow_dataset___HivePartitioning`, schm, null_fallback, segment_encoding) } -dataset___ScannerBuilder__UseAsync <- function(sb, use_async) { - invisible(.Call(`_arrow_dataset___ScannerBuilder__UseAsync`, sb, use_async)) +dataset___HivePartitioning__MakeFactory <- function(null_fallback, segment_encoding){ + .Call(`_arrow_dataset___HivePartitioning__MakeFactory`, null_fallback, segment_encoding) } -dataset___ScannerBuilder__BatchSize <- function(sb, batch_size) { - invisible(.Call(`_arrow_dataset___ScannerBuilder__BatchSize`, sb, batch_size)) +dataset___ScannerBuilder__ProjectNames <- function(sb, cols){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectNames`, sb, cols)) } -dataset___ScannerBuilder__FragmentScanOptions <- function(sb, options) { - invisible(.Call(`_arrow_dataset___ScannerBuilder__FragmentScanOptions`, sb, options)) +dataset___ScannerBuilder__ProjectExprs <- function(sb, exprs, names){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectExprs`, sb, exprs, names)) } -dataset___ScannerBuilder__schema <- function(sb) { - 
.Call(`_arrow_dataset___ScannerBuilder__schema`, sb) +dataset___ScannerBuilder__Filter <- function(sb, expr){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__Filter`, sb, expr)) } -dataset___ScannerBuilder__Finish <- function(sb) { - .Call(`_arrow_dataset___ScannerBuilder__Finish`, sb) +dataset___ScannerBuilder__UseThreads <- function(sb, threads){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__UseThreads`, sb, threads)) } -dataset___Scanner__ToTable <- function(scanner) { - .Call(`_arrow_dataset___Scanner__ToTable`, scanner) +dataset___ScannerBuilder__UseAsync <- function(sb, use_async){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__UseAsync`, sb, use_async)) } -dataset___Scanner__ScanBatches <- function(scanner) { - .Call(`_arrow_dataset___Scanner__ScanBatches`, scanner) +dataset___ScannerBuilder__BatchSize <- function(sb, batch_size){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__BatchSize`, sb, batch_size)) } -dataset___Scanner__ToRecordBatchReader <- function(scanner) { - .Call(`_arrow_dataset___Scanner__ToRecordBatchReader`, scanner) +dataset___ScannerBuilder__FragmentScanOptions <- function(sb, options){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__FragmentScanOptions`, sb, options)) } -dataset___Scanner__head <- function(scanner, n) { - .Call(`_arrow_dataset___Scanner__head`, scanner, n) +dataset___ScannerBuilder__schema <- function(sb){ + .Call(`_arrow_dataset___ScannerBuilder__schema`, sb) } -dataset___Scanner__schema <- function(sc) { - .Call(`_arrow_dataset___Scanner__schema`, sc) +dataset___ScannerBuilder__Finish <- function(sb){ + .Call(`_arrow_dataset___ScannerBuilder__Finish`, sb) } -dataset___ScanTask__get_batches <- function(scan_task) { - .Call(`_arrow_dataset___ScanTask__get_batches`, scan_task) +dataset___Scanner__ToTable <- function(scanner){ + .Call(`_arrow_dataset___Scanner__ToTable`, scanner) } -dataset___Dataset__Write <- function(file_write_options, filesystem, base_dir, partitioning, basename_template, scanner) { - invisible(.Call(`_arrow_dataset___Dataset__Write`, file_write_options, filesystem, base_dir, partitioning, basename_template, scanner)) +dataset___Scanner__ScanBatches <- function(scanner){ + .Call(`_arrow_dataset___Scanner__ScanBatches`, scanner) } -dataset___Scanner__TakeRows <- function(scanner, indices) { - .Call(`_arrow_dataset___Scanner__TakeRows`, scanner, indices) +dataset___Scanner__ToRecordBatchReader <- function(scanner){ + .Call(`_arrow_dataset___Scanner__ToRecordBatchReader`, scanner) } -dataset___Scanner__CountRows <- function(scanner) { - .Call(`_arrow_dataset___Scanner__CountRows`, scanner) +dataset___Scanner__head <- function(scanner, n){ + .Call(`_arrow_dataset___Scanner__head`, scanner, n) } -Int8__initialize <- function() { - .Call(`_arrow_Int8__initialize`) +dataset___Scanner__schema <- function(sc){ + .Call(`_arrow_dataset___Scanner__schema`, sc) } -Int16__initialize <- function() { - .Call(`_arrow_Int16__initialize`) +dataset___ScanTask__get_batches <- function(scan_task){ + .Call(`_arrow_dataset___ScanTask__get_batches`, scan_task) } -Int32__initialize <- function() { - .Call(`_arrow_Int32__initialize`) +dataset___Dataset__Write <- function(file_write_options, filesystem, base_dir, partitioning, basename_template, scanner){ + invisible(.Call(`_arrow_dataset___Dataset__Write`, file_write_options, filesystem, base_dir, partitioning, basename_template, scanner)) } -Int64__initialize <- function() { - .Call(`_arrow_Int64__initialize`) +dataset___Scanner__TakeRows <- function(scanner, indices){ + 
.Call(`_arrow_dataset___Scanner__TakeRows`, scanner, indices) } -UInt8__initialize <- function() { - .Call(`_arrow_UInt8__initialize`) +dataset___Scanner__CountRows <- function(scanner){ + .Call(`_arrow_dataset___Scanner__CountRows`, scanner) } -UInt16__initialize <- function() { - .Call(`_arrow_UInt16__initialize`) +Int8__initialize <- function(){ + .Call(`_arrow_Int8__initialize`) } -UInt32__initialize <- function() { - .Call(`_arrow_UInt32__initialize`) +Int16__initialize <- function(){ + .Call(`_arrow_Int16__initialize`) } -UInt64__initialize <- function() { - .Call(`_arrow_UInt64__initialize`) +Int32__initialize <- function(){ + .Call(`_arrow_Int32__initialize`) } -Float16__initialize <- function() { - .Call(`_arrow_Float16__initialize`) +Int64__initialize <- function(){ + .Call(`_arrow_Int64__initialize`) } -Float32__initialize <- function() { - .Call(`_arrow_Float32__initialize`) +UInt8__initialize <- function(){ + .Call(`_arrow_UInt8__initialize`) } -Float64__initialize <- function() { - .Call(`_arrow_Float64__initialize`) +UInt16__initialize <- function(){ + .Call(`_arrow_UInt16__initialize`) } -Boolean__initialize <- function() { - .Call(`_arrow_Boolean__initialize`) +UInt32__initialize <- function(){ + .Call(`_arrow_UInt32__initialize`) } -Utf8__initialize <- function() { - .Call(`_arrow_Utf8__initialize`) +UInt64__initialize <- function(){ + .Call(`_arrow_UInt64__initialize`) } -LargeUtf8__initialize <- function() { - .Call(`_arrow_LargeUtf8__initialize`) +Float16__initialize <- function(){ + .Call(`_arrow_Float16__initialize`) } -Binary__initialize <- function() { - .Call(`_arrow_Binary__initialize`) +Float32__initialize <- function(){ + .Call(`_arrow_Float32__initialize`) } -LargeBinary__initialize <- function() { - .Call(`_arrow_LargeBinary__initialize`) +Float64__initialize <- function(){ + .Call(`_arrow_Float64__initialize`) } -Date32__initialize <- function() { - .Call(`_arrow_Date32__initialize`) +Boolean__initialize <- function(){ + .Call(`_arrow_Boolean__initialize`) } -Date64__initialize <- function() { - .Call(`_arrow_Date64__initialize`) +Utf8__initialize <- function(){ + .Call(`_arrow_Utf8__initialize`) } -Null__initialize <- function() { - .Call(`_arrow_Null__initialize`) +LargeUtf8__initialize <- function(){ + .Call(`_arrow_LargeUtf8__initialize`) } -Decimal128Type__initialize <- function(precision, scale) { - .Call(`_arrow_Decimal128Type__initialize`, precision, scale) +Binary__initialize <- function(){ + .Call(`_arrow_Binary__initialize`) } -FixedSizeBinary__initialize <- function(byte_width) { - .Call(`_arrow_FixedSizeBinary__initialize`, byte_width) +LargeBinary__initialize <- function(){ + .Call(`_arrow_LargeBinary__initialize`) } -Timestamp__initialize <- function(unit, timezone) { - .Call(`_arrow_Timestamp__initialize`, unit, timezone) +Date32__initialize <- function(){ + .Call(`_arrow_Date32__initialize`) } -Time32__initialize <- function(unit) { - .Call(`_arrow_Time32__initialize`, unit) +Date64__initialize <- function(){ + .Call(`_arrow_Date64__initialize`) } -Time64__initialize <- function(unit) { - .Call(`_arrow_Time64__initialize`, unit) +Null__initialize <- function(){ + .Call(`_arrow_Null__initialize`) } -list__ <- function(x) { - .Call(`_arrow_list__`, x) +Decimal128Type__initialize <- function(precision, scale){ + .Call(`_arrow_Decimal128Type__initialize`, precision, scale) } -large_list__ <- function(x) { - .Call(`_arrow_large_list__`, x) +FixedSizeBinary__initialize <- function(byte_width){ + .Call(`_arrow_FixedSizeBinary__initialize`, 
byte_width) } -fixed_size_list__ <- function(x, list_size) { - .Call(`_arrow_fixed_size_list__`, x, list_size) +Timestamp__initialize <- function(unit, timezone){ + .Call(`_arrow_Timestamp__initialize`, unit, timezone) } -struct__ <- function(fields) { - .Call(`_arrow_struct__`, fields) +Time32__initialize <- function(unit){ + .Call(`_arrow_Time32__initialize`, unit) } -DataType__ToString <- function(type) { - .Call(`_arrow_DataType__ToString`, type) +Time64__initialize <- function(unit){ + .Call(`_arrow_Time64__initialize`, unit) } -DataType__name <- function(type) { - .Call(`_arrow_DataType__name`, type) +list__ <- function(x){ + .Call(`_arrow_list__`, x) } -DataType__Equals <- function(lhs, rhs) { - .Call(`_arrow_DataType__Equals`, lhs, rhs) +large_list__ <- function(x){ + .Call(`_arrow_large_list__`, x) } -DataType__num_fields <- function(type) { - .Call(`_arrow_DataType__num_fields`, type) +fixed_size_list__ <- function(x, list_size){ + .Call(`_arrow_fixed_size_list__`, x, list_size) } -DataType__fields <- function(type) { - .Call(`_arrow_DataType__fields`, type) +struct__ <- function(fields){ + .Call(`_arrow_struct__`, fields) } -DataType__id <- function(type) { - .Call(`_arrow_DataType__id`, type) +DataType__ToString <- function(type){ + .Call(`_arrow_DataType__ToString`, type) } -ListType__ToString <- function(type) { - .Call(`_arrow_ListType__ToString`, type) +DataType__name <- function(type){ + .Call(`_arrow_DataType__name`, type) } -FixedWidthType__bit_width <- function(type) { - .Call(`_arrow_FixedWidthType__bit_width`, type) +DataType__Equals <- function(lhs, rhs){ + .Call(`_arrow_DataType__Equals`, lhs, rhs) } -DateType__unit <- function(type) { - .Call(`_arrow_DateType__unit`, type) +DataType__num_fields <- function(type){ + .Call(`_arrow_DataType__num_fields`, type) } -TimeType__unit <- function(type) { - .Call(`_arrow_TimeType__unit`, type) +DataType__fields <- function(type){ + .Call(`_arrow_DataType__fields`, type) } -DecimalType__precision <- function(type) { - .Call(`_arrow_DecimalType__precision`, type) +DataType__id <- function(type){ + .Call(`_arrow_DataType__id`, type) } -DecimalType__scale <- function(type) { - .Call(`_arrow_DecimalType__scale`, type) +ListType__ToString <- function(type){ + .Call(`_arrow_ListType__ToString`, type) } -TimestampType__timezone <- function(type) { - .Call(`_arrow_TimestampType__timezone`, type) +FixedWidthType__bit_width <- function(type){ + .Call(`_arrow_FixedWidthType__bit_width`, type) } -TimestampType__unit <- function(type) { - .Call(`_arrow_TimestampType__unit`, type) +DateType__unit <- function(type){ + .Call(`_arrow_DateType__unit`, type) } -DictionaryType__initialize <- function(index_type, value_type, ordered) { - .Call(`_arrow_DictionaryType__initialize`, index_type, value_type, ordered) +TimeType__unit <- function(type){ + .Call(`_arrow_TimeType__unit`, type) } -DictionaryType__index_type <- function(type) { - .Call(`_arrow_DictionaryType__index_type`, type) +DecimalType__precision <- function(type){ + .Call(`_arrow_DecimalType__precision`, type) } -DictionaryType__value_type <- function(type) { - .Call(`_arrow_DictionaryType__value_type`, type) +DecimalType__scale <- function(type){ + .Call(`_arrow_DecimalType__scale`, type) } -DictionaryType__name <- function(type) { - .Call(`_arrow_DictionaryType__name`, type) +TimestampType__timezone <- function(type){ + .Call(`_arrow_TimestampType__timezone`, type) } -DictionaryType__ordered <- function(type) { - .Call(`_arrow_DictionaryType__ordered`, type) +TimestampType__unit <- 
function(type){ + .Call(`_arrow_TimestampType__unit`, type) } -StructType__GetFieldByName <- function(type, name) { - .Call(`_arrow_StructType__GetFieldByName`, type, name) +DictionaryType__initialize <- function(index_type, value_type, ordered){ + .Call(`_arrow_DictionaryType__initialize`, index_type, value_type, ordered) } -StructType__GetFieldIndex <- function(type, name) { - .Call(`_arrow_StructType__GetFieldIndex`, type, name) +DictionaryType__index_type <- function(type){ + .Call(`_arrow_DictionaryType__index_type`, type) } -StructType__field_names <- function(type) { - .Call(`_arrow_StructType__field_names`, type) +DictionaryType__value_type <- function(type){ + .Call(`_arrow_DictionaryType__value_type`, type) } -ListType__value_field <- function(type) { - .Call(`_arrow_ListType__value_field`, type) +DictionaryType__name <- function(type){ + .Call(`_arrow_DictionaryType__name`, type) } -ListType__value_type <- function(type) { - .Call(`_arrow_ListType__value_type`, type) +DictionaryType__ordered <- function(type){ + .Call(`_arrow_DictionaryType__ordered`, type) } -LargeListType__value_field <- function(type) { - .Call(`_arrow_LargeListType__value_field`, type) +StructType__GetFieldByName <- function(type, name){ + .Call(`_arrow_StructType__GetFieldByName`, type, name) } -LargeListType__value_type <- function(type) { - .Call(`_arrow_LargeListType__value_type`, type) +StructType__GetFieldIndex <- function(type, name){ + .Call(`_arrow_StructType__GetFieldIndex`, type, name) } -FixedSizeListType__value_field <- function(type) { - .Call(`_arrow_FixedSizeListType__value_field`, type) +StructType__field_names <- function(type){ + .Call(`_arrow_StructType__field_names`, type) } -FixedSizeListType__value_type <- function(type) { - .Call(`_arrow_FixedSizeListType__value_type`, type) +ListType__value_field <- function(type){ + .Call(`_arrow_ListType__value_field`, type) } -FixedSizeListType__list_size <- function(type) { - .Call(`_arrow_FixedSizeListType__list_size`, type) +ListType__value_type <- function(type){ + .Call(`_arrow_ListType__value_type`, type) } -compute___expr__call <- function(func_name, argument_list, options) { - .Call(`_arrow_compute___expr__call`, func_name, argument_list, options) +LargeListType__value_field <- function(type){ + .Call(`_arrow_LargeListType__value_field`, type) } -compute___expr__field_ref <- function(name) { - .Call(`_arrow_compute___expr__field_ref`, name) +LargeListType__value_type <- function(type){ + .Call(`_arrow_LargeListType__value_type`, type) } -compute___expr__get_field_ref_name <- function(x) { - .Call(`_arrow_compute___expr__get_field_ref_name`, x) +FixedSizeListType__value_field <- function(type){ + .Call(`_arrow_FixedSizeListType__value_field`, type) } -compute___expr__scalar <- function(x) { - .Call(`_arrow_compute___expr__scalar`, x) +FixedSizeListType__value_type <- function(type){ + .Call(`_arrow_FixedSizeListType__value_type`, type) } -compute___expr__ToString <- function(x) { - .Call(`_arrow_compute___expr__ToString`, x) +FixedSizeListType__list_size <- function(type){ + .Call(`_arrow_FixedSizeListType__list_size`, type) } -compute___expr__type <- function(x, schema) { - .Call(`_arrow_compute___expr__type`, x, schema) +compute___expr__call <- function(func_name, argument_list, options){ + .Call(`_arrow_compute___expr__call`, func_name, argument_list, options) } -compute___expr__type_id <- function(x, schema) { - .Call(`_arrow_compute___expr__type_id`, x, schema) +compute___expr__field_ref <- function(name){ + 
.Call(`_arrow_compute___expr__field_ref`, name) } -ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level) { - invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level)) +compute___expr__get_field_ref_name <- function(x){ + .Call(`_arrow_compute___expr__get_field_ref_name`, x) } -ipc___feather___Reader__version <- function(reader) { - .Call(`_arrow_ipc___feather___Reader__version`, reader) +compute___expr__scalar <- function(x){ + .Call(`_arrow_compute___expr__scalar`, x) } -ipc___feather___Reader__Read <- function(reader, columns) { - .Call(`_arrow_ipc___feather___Reader__Read`, reader, columns) +compute___expr__ToString <- function(x){ + .Call(`_arrow_compute___expr__ToString`, x) } -ipc___feather___Reader__Open <- function(stream) { - .Call(`_arrow_ipc___feather___Reader__Open`, stream) +compute___expr__type <- function(x, schema){ + .Call(`_arrow_compute___expr__type`, x, schema) } -ipc___feather___Reader__schema <- function(reader) { - .Call(`_arrow_ipc___feather___Reader__schema`, reader) +compute___expr__type_id <- function(x, schema){ + .Call(`_arrow_compute___expr__type_id`, x, schema) } -Field__initialize <- function(name, field, nullable) { - .Call(`_arrow_Field__initialize`, name, field, nullable) +ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level){ + invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level)) } -Field__ToString <- function(field) { - .Call(`_arrow_Field__ToString`, field) +ipc___feather___Reader__version <- function(reader){ + .Call(`_arrow_ipc___feather___Reader__version`, reader) } -Field__name <- function(field) { - .Call(`_arrow_Field__name`, field) +ipc___feather___Reader__Read <- function(reader, columns){ + .Call(`_arrow_ipc___feather___Reader__Read`, reader, columns) } -Field__Equals <- function(field, other) { - .Call(`_arrow_Field__Equals`, field, other) +ipc___feather___Reader__Open <- function(stream){ + .Call(`_arrow_ipc___feather___Reader__Open`, stream) } -Field__nullable <- function(field) { - .Call(`_arrow_Field__nullable`, field) +ipc___feather___Reader__schema <- function(reader){ + .Call(`_arrow_ipc___feather___Reader__schema`, reader) } -Field__type <- function(field) { - .Call(`_arrow_Field__type`, field) +Field__initialize <- function(name, field, nullable){ + .Call(`_arrow_Field__initialize`, name, field, nullable) } -fs___FileInfo__type <- function(x) { - .Call(`_arrow_fs___FileInfo__type`, x) +Field__ToString <- function(field){ + .Call(`_arrow_Field__ToString`, field) } -fs___FileInfo__set_type <- function(x, type) { - invisible(.Call(`_arrow_fs___FileInfo__set_type`, x, type)) +Field__name <- function(field){ + .Call(`_arrow_Field__name`, field) } -fs___FileInfo__path <- function(x) { - .Call(`_arrow_fs___FileInfo__path`, x) +Field__Equals <- function(field, other){ + .Call(`_arrow_Field__Equals`, field, other) } -fs___FileInfo__set_path <- function(x, path) { - invisible(.Call(`_arrow_fs___FileInfo__set_path`, x, path)) +Field__nullable <- function(field){ + .Call(`_arrow_Field__nullable`, field) } -fs___FileInfo__size <- function(x) { - .Call(`_arrow_fs___FileInfo__size`, x) +Field__type <- function(field){ + .Call(`_arrow_Field__type`, field) } -fs___FileInfo__set_size <- function(x, size) { - invisible(.Call(`_arrow_fs___FileInfo__set_size`, x, size)) +fs___FileInfo__type <- function(x){ + 
.Call(`_arrow_fs___FileInfo__type`, x) } -fs___FileInfo__base_name <- function(x) { - .Call(`_arrow_fs___FileInfo__base_name`, x) +fs___FileInfo__set_type <- function(x, type){ + invisible(.Call(`_arrow_fs___FileInfo__set_type`, x, type)) } -fs___FileInfo__extension <- function(x) { - .Call(`_arrow_fs___FileInfo__extension`, x) +fs___FileInfo__path <- function(x){ + .Call(`_arrow_fs___FileInfo__path`, x) } -fs___FileInfo__mtime <- function(x) { - .Call(`_arrow_fs___FileInfo__mtime`, x) +fs___FileInfo__set_path <- function(x, path){ + invisible(.Call(`_arrow_fs___FileInfo__set_path`, x, path)) } -fs___FileInfo__set_mtime <- function(x, time) { - invisible(.Call(`_arrow_fs___FileInfo__set_mtime`, x, time)) +fs___FileInfo__size <- function(x){ + .Call(`_arrow_fs___FileInfo__size`, x) } -fs___FileSelector__base_dir <- function(selector) { - .Call(`_arrow_fs___FileSelector__base_dir`, selector) +fs___FileInfo__set_size <- function(x, size){ + invisible(.Call(`_arrow_fs___FileInfo__set_size`, x, size)) } -fs___FileSelector__allow_not_found <- function(selector) { - .Call(`_arrow_fs___FileSelector__allow_not_found`, selector) +fs___FileInfo__base_name <- function(x){ + .Call(`_arrow_fs___FileInfo__base_name`, x) } -fs___FileSelector__recursive <- function(selector) { - .Call(`_arrow_fs___FileSelector__recursive`, selector) +fs___FileInfo__extension <- function(x){ + .Call(`_arrow_fs___FileInfo__extension`, x) } -fs___FileSelector__create <- function(base_dir, allow_not_found, recursive) { - .Call(`_arrow_fs___FileSelector__create`, base_dir, allow_not_found, recursive) +fs___FileInfo__mtime <- function(x){ + .Call(`_arrow_fs___FileInfo__mtime`, x) } -fs___FileSystem__GetTargetInfos_Paths <- function(file_system, paths) { - .Call(`_arrow_fs___FileSystem__GetTargetInfos_Paths`, file_system, paths) +fs___FileInfo__set_mtime <- function(x, time){ + invisible(.Call(`_arrow_fs___FileInfo__set_mtime`, x, time)) } -fs___FileSystem__GetTargetInfos_FileSelector <- function(file_system, selector) { - .Call(`_arrow_fs___FileSystem__GetTargetInfos_FileSelector`, file_system, selector) +fs___FileSelector__base_dir <- function(selector){ + .Call(`_arrow_fs___FileSelector__base_dir`, selector) } -fs___FileSystem__CreateDir <- function(file_system, path, recursive) { - invisible(.Call(`_arrow_fs___FileSystem__CreateDir`, file_system, path, recursive)) +fs___FileSelector__allow_not_found <- function(selector){ + .Call(`_arrow_fs___FileSelector__allow_not_found`, selector) } -fs___FileSystem__DeleteDir <- function(file_system, path) { - invisible(.Call(`_arrow_fs___FileSystem__DeleteDir`, file_system, path)) +fs___FileSelector__recursive <- function(selector){ + .Call(`_arrow_fs___FileSelector__recursive`, selector) } -fs___FileSystem__DeleteDirContents <- function(file_system, path) { - invisible(.Call(`_arrow_fs___FileSystem__DeleteDirContents`, file_system, path)) +fs___FileSelector__create <- function(base_dir, allow_not_found, recursive){ + .Call(`_arrow_fs___FileSelector__create`, base_dir, allow_not_found, recursive) } -fs___FileSystem__DeleteFile <- function(file_system, path) { - invisible(.Call(`_arrow_fs___FileSystem__DeleteFile`, file_system, path)) +fs___FileSystem__GetTargetInfos_Paths <- function(file_system, paths){ + .Call(`_arrow_fs___FileSystem__GetTargetInfos_Paths`, file_system, paths) } -fs___FileSystem__DeleteFiles <- function(file_system, paths) { - invisible(.Call(`_arrow_fs___FileSystem__DeleteFiles`, file_system, paths)) +fs___FileSystem__GetTargetInfos_FileSelector <- 
function(file_system, selector){ + .Call(`_arrow_fs___FileSystem__GetTargetInfos_FileSelector`, file_system, selector) } -fs___FileSystem__Move <- function(file_system, src, dest) { - invisible(.Call(`_arrow_fs___FileSystem__Move`, file_system, src, dest)) +fs___FileSystem__CreateDir <- function(file_system, path, recursive){ + invisible(.Call(`_arrow_fs___FileSystem__CreateDir`, file_system, path, recursive)) } -fs___FileSystem__CopyFile <- function(file_system, src, dest) { - invisible(.Call(`_arrow_fs___FileSystem__CopyFile`, file_system, src, dest)) +fs___FileSystem__DeleteDir <- function(file_system, path){ + invisible(.Call(`_arrow_fs___FileSystem__DeleteDir`, file_system, path)) } -fs___FileSystem__OpenInputStream <- function(file_system, path) { - .Call(`_arrow_fs___FileSystem__OpenInputStream`, file_system, path) +fs___FileSystem__DeleteDirContents <- function(file_system, path){ + invisible(.Call(`_arrow_fs___FileSystem__DeleteDirContents`, file_system, path)) } -fs___FileSystem__OpenInputFile <- function(file_system, path) { - .Call(`_arrow_fs___FileSystem__OpenInputFile`, file_system, path) +fs___FileSystem__DeleteFile <- function(file_system, path){ + invisible(.Call(`_arrow_fs___FileSystem__DeleteFile`, file_system, path)) } -fs___FileSystem__OpenOutputStream <- function(file_system, path) { - .Call(`_arrow_fs___FileSystem__OpenOutputStream`, file_system, path) +fs___FileSystem__DeleteFiles <- function(file_system, paths){ + invisible(.Call(`_arrow_fs___FileSystem__DeleteFiles`, file_system, paths)) } -fs___FileSystem__OpenAppendStream <- function(file_system, path) { - .Call(`_arrow_fs___FileSystem__OpenAppendStream`, file_system, path) +fs___FileSystem__Move <- function(file_system, src, dest){ + invisible(.Call(`_arrow_fs___FileSystem__Move`, file_system, src, dest)) } -fs___FileSystem__type_name <- function(file_system) { - .Call(`_arrow_fs___FileSystem__type_name`, file_system) +fs___FileSystem__CopyFile <- function(file_system, src, dest){ + invisible(.Call(`_arrow_fs___FileSystem__CopyFile`, file_system, src, dest)) } -fs___LocalFileSystem__create <- function() { - .Call(`_arrow_fs___LocalFileSystem__create`) +fs___FileSystem__OpenInputStream <- function(file_system, path){ + .Call(`_arrow_fs___FileSystem__OpenInputStream`, file_system, path) } -fs___SubTreeFileSystem__create <- function(base_path, base_fs) { - .Call(`_arrow_fs___SubTreeFileSystem__create`, base_path, base_fs) +fs___FileSystem__OpenInputFile <- function(file_system, path){ + .Call(`_arrow_fs___FileSystem__OpenInputFile`, file_system, path) } -fs___SubTreeFileSystem__base_fs <- function(file_system) { - .Call(`_arrow_fs___SubTreeFileSystem__base_fs`, file_system) +fs___FileSystem__OpenOutputStream <- function(file_system, path){ + .Call(`_arrow_fs___FileSystem__OpenOutputStream`, file_system, path) } -fs___SubTreeFileSystem__base_path <- function(file_system) { - .Call(`_arrow_fs___SubTreeFileSystem__base_path`, file_system) +fs___FileSystem__OpenAppendStream <- function(file_system, path){ + .Call(`_arrow_fs___FileSystem__OpenAppendStream`, file_system, path) } -fs___FileSystemFromUri <- function(path) { - .Call(`_arrow_fs___FileSystemFromUri`, path) +fs___FileSystem__type_name <- function(file_system){ + .Call(`_arrow_fs___FileSystem__type_name`, file_system) } -fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads) { - invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, 
use_threads)) +fs___LocalFileSystem__create <- function(){ + .Call(`_arrow_fs___LocalFileSystem__create`) } -fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes) { - .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes) +fs___SubTreeFileSystem__create <- function(base_path, base_fs){ + .Call(`_arrow_fs___SubTreeFileSystem__create`, base_path, base_fs) } -fs___S3FileSystem__region <- function(fs) { - .Call(`_arrow_fs___S3FileSystem__region`, fs) +fs___SubTreeFileSystem__base_fs <- function(file_system){ + .Call(`_arrow_fs___SubTreeFileSystem__base_fs`, file_system) } -io___Readable__Read <- function(x, nbytes) { - .Call(`_arrow_io___Readable__Read`, x, nbytes) +fs___SubTreeFileSystem__base_path <- function(file_system){ + .Call(`_arrow_fs___SubTreeFileSystem__base_path`, file_system) } -io___InputStream__Close <- function(x) { - invisible(.Call(`_arrow_io___InputStream__Close`, x)) +fs___FileSystemFromUri <- function(path){ + .Call(`_arrow_fs___FileSystemFromUri`, path) } -io___OutputStream__Close <- function(x) { - invisible(.Call(`_arrow_io___OutputStream__Close`, x)) +fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads){ + invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads)) } -io___RandomAccessFile__GetSize <- function(x) { - .Call(`_arrow_io___RandomAccessFile__GetSize`, x) +fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes){ + .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes) } -io___RandomAccessFile__supports_zero_copy <- function(x) { - .Call(`_arrow_io___RandomAccessFile__supports_zero_copy`, x) +fs___S3FileSystem__region <- function(fs){ + .Call(`_arrow_fs___S3FileSystem__region`, fs) } -io___RandomAccessFile__Seek <- function(x, position) { - invisible(.Call(`_arrow_io___RandomAccessFile__Seek`, x, position)) +io___Readable__Read <- function(x, nbytes){ + .Call(`_arrow_io___Readable__Read`, x, nbytes) } -io___RandomAccessFile__Tell <- function(x) { - .Call(`_arrow_io___RandomAccessFile__Tell`, x) +io___InputStream__Close <- function(x){ + invisible(.Call(`_arrow_io___InputStream__Close`, x)) } -io___RandomAccessFile__Read0 <- function(x) { - .Call(`_arrow_io___RandomAccessFile__Read0`, x) +io___OutputStream__Close <- function(x){ + invisible(.Call(`_arrow_io___OutputStream__Close`, x)) } -io___RandomAccessFile__ReadAt <- function(x, position, nbytes) { - .Call(`_arrow_io___RandomAccessFile__ReadAt`, x, position, nbytes) +io___RandomAccessFile__GetSize <- function(x){ + .Call(`_arrow_io___RandomAccessFile__GetSize`, x) } -io___MemoryMappedFile__Create <- function(path, size) { - .Call(`_arrow_io___MemoryMappedFile__Create`, path, size) +io___RandomAccessFile__supports_zero_copy <- function(x){ + .Call(`_arrow_io___RandomAccessFile__supports_zero_copy`, x) } -io___MemoryMappedFile__Open <- function(path, mode) { - .Call(`_arrow_io___MemoryMappedFile__Open`, path, mode) 
+io___RandomAccessFile__Seek <- function(x, position){ + invisible(.Call(`_arrow_io___RandomAccessFile__Seek`, x, position)) } -io___MemoryMappedFile__Resize <- function(x, size) { - invisible(.Call(`_arrow_io___MemoryMappedFile__Resize`, x, size)) +io___RandomAccessFile__Tell <- function(x){ + .Call(`_arrow_io___RandomAccessFile__Tell`, x) } -io___ReadableFile__Open <- function(path) { - .Call(`_arrow_io___ReadableFile__Open`, path) +io___RandomAccessFile__Read0 <- function(x){ + .Call(`_arrow_io___RandomAccessFile__Read0`, x) } -io___BufferReader__initialize <- function(buffer) { - .Call(`_arrow_io___BufferReader__initialize`, buffer) +io___RandomAccessFile__ReadAt <- function(x, position, nbytes){ + .Call(`_arrow_io___RandomAccessFile__ReadAt`, x, position, nbytes) } -io___Writable__write <- function(stream, buf) { - invisible(.Call(`_arrow_io___Writable__write`, stream, buf)) +io___MemoryMappedFile__Create <- function(path, size){ + .Call(`_arrow_io___MemoryMappedFile__Create`, path, size) } -io___OutputStream__Tell <- function(stream) { - .Call(`_arrow_io___OutputStream__Tell`, stream) +io___MemoryMappedFile__Open <- function(path, mode){ + .Call(`_arrow_io___MemoryMappedFile__Open`, path, mode) } -io___FileOutputStream__Open <- function(path) { - .Call(`_arrow_io___FileOutputStream__Open`, path) +io___MemoryMappedFile__Resize <- function(x, size){ + invisible(.Call(`_arrow_io___MemoryMappedFile__Resize`, x, size)) } -io___BufferOutputStream__Create <- function(initial_capacity) { - .Call(`_arrow_io___BufferOutputStream__Create`, initial_capacity) +io___ReadableFile__Open <- function(path){ + .Call(`_arrow_io___ReadableFile__Open`, path) } -io___BufferOutputStream__capacity <- function(stream) { - .Call(`_arrow_io___BufferOutputStream__capacity`, stream) +io___BufferReader__initialize <- function(buffer){ + .Call(`_arrow_io___BufferReader__initialize`, buffer) } -io___BufferOutputStream__Finish <- function(stream) { - .Call(`_arrow_io___BufferOutputStream__Finish`, stream) +io___Writable__write <- function(stream, buf){ + invisible(.Call(`_arrow_io___Writable__write`, stream, buf)) } -io___BufferOutputStream__Tell <- function(stream) { - .Call(`_arrow_io___BufferOutputStream__Tell`, stream) +io___OutputStream__Tell <- function(stream){ + .Call(`_arrow_io___OutputStream__Tell`, stream) } -io___BufferOutputStream__Write <- function(stream, bytes) { - invisible(.Call(`_arrow_io___BufferOutputStream__Write`, stream, bytes)) +io___FileOutputStream__Open <- function(path){ + .Call(`_arrow_io___FileOutputStream__Open`, path) } -json___ReadOptions__initialize <- function(use_threads, block_size) { - .Call(`_arrow_json___ReadOptions__initialize`, use_threads, block_size) +io___BufferOutputStream__Create <- function(initial_capacity){ + .Call(`_arrow_io___BufferOutputStream__Create`, initial_capacity) } -json___ParseOptions__initialize1 <- function(newlines_in_values) { - .Call(`_arrow_json___ParseOptions__initialize1`, newlines_in_values) +io___BufferOutputStream__capacity <- function(stream){ + .Call(`_arrow_io___BufferOutputStream__capacity`, stream) } -json___ParseOptions__initialize2 <- function(newlines_in_values, explicit_schema) { - .Call(`_arrow_json___ParseOptions__initialize2`, newlines_in_values, explicit_schema) +io___BufferOutputStream__Finish <- function(stream){ + .Call(`_arrow_io___BufferOutputStream__Finish`, stream) } -json___TableReader__Make <- function(input, read_options, parse_options) { - .Call(`_arrow_json___TableReader__Make`, input, read_options, parse_options) 
+io___BufferOutputStream__Tell <- function(stream){ + .Call(`_arrow_io___BufferOutputStream__Tell`, stream) } -json___TableReader__Read <- function(table_reader) { - .Call(`_arrow_json___TableReader__Read`, table_reader) +io___BufferOutputStream__Write <- function(stream, bytes){ + invisible(.Call(`_arrow_io___BufferOutputStream__Write`, stream, bytes)) } -MemoryPool__default <- function() { - .Call(`_arrow_MemoryPool__default`) +json___ReadOptions__initialize <- function(use_threads, block_size){ + .Call(`_arrow_json___ReadOptions__initialize`, use_threads, block_size) } -MemoryPool__bytes_allocated <- function(pool) { - .Call(`_arrow_MemoryPool__bytes_allocated`, pool) +json___ParseOptions__initialize1 <- function(newlines_in_values){ + .Call(`_arrow_json___ParseOptions__initialize1`, newlines_in_values) } -MemoryPool__max_memory <- function(pool) { - .Call(`_arrow_MemoryPool__max_memory`, pool) +json___ParseOptions__initialize2 <- function(newlines_in_values, explicit_schema){ + .Call(`_arrow_json___ParseOptions__initialize2`, newlines_in_values, explicit_schema) } -MemoryPool__backend_name <- function(pool) { - .Call(`_arrow_MemoryPool__backend_name`, pool) +json___TableReader__Make <- function(input, read_options, parse_options){ + .Call(`_arrow_json___TableReader__Make`, input, read_options, parse_options) } -supported_memory_backends <- function() { - .Call(`_arrow_supported_memory_backends`) +json___TableReader__Read <- function(table_reader){ + .Call(`_arrow_json___TableReader__Read`, table_reader) } -ipc___Message__body_length <- function(message) { - .Call(`_arrow_ipc___Message__body_length`, message) +MemoryPool__default <- function(){ + .Call(`_arrow_MemoryPool__default`) } -ipc___Message__metadata <- function(message) { - .Call(`_arrow_ipc___Message__metadata`, message) +MemoryPool__bytes_allocated <- function(pool){ + .Call(`_arrow_MemoryPool__bytes_allocated`, pool) } -ipc___Message__body <- function(message) { - .Call(`_arrow_ipc___Message__body`, message) +MemoryPool__max_memory <- function(pool){ + .Call(`_arrow_MemoryPool__max_memory`, pool) } -ipc___Message__Verify <- function(message) { - .Call(`_arrow_ipc___Message__Verify`, message) +MemoryPool__backend_name <- function(pool){ + .Call(`_arrow_MemoryPool__backend_name`, pool) } -ipc___Message__type <- function(message) { - .Call(`_arrow_ipc___Message__type`, message) +supported_memory_backends <- function(){ + .Call(`_arrow_supported_memory_backends`) } -ipc___Message__Equals <- function(x, y) { - .Call(`_arrow_ipc___Message__Equals`, x, y) +ipc___Message__body_length <- function(message){ + .Call(`_arrow_ipc___Message__body_length`, message) } -ipc___ReadRecordBatch__Message__Schema <- function(message, schema) { - .Call(`_arrow_ipc___ReadRecordBatch__Message__Schema`, message, schema) +ipc___Message__metadata <- function(message){ + .Call(`_arrow_ipc___Message__metadata`, message) } -ipc___ReadSchema_InputStream <- function(stream) { - .Call(`_arrow_ipc___ReadSchema_InputStream`, stream) +ipc___Message__body <- function(message){ + .Call(`_arrow_ipc___Message__body`, message) } -ipc___ReadSchema_Message <- function(message) { - .Call(`_arrow_ipc___ReadSchema_Message`, message) +ipc___Message__Verify <- function(message){ + .Call(`_arrow_ipc___Message__Verify`, message) } -ipc___MessageReader__Open <- function(stream) { - .Call(`_arrow_ipc___MessageReader__Open`, stream) +ipc___Message__type <- function(message){ + .Call(`_arrow_ipc___Message__type`, message) } -ipc___MessageReader__ReadNextMessage <- 
function(reader) { - .Call(`_arrow_ipc___MessageReader__ReadNextMessage`, reader) +ipc___Message__Equals <- function(x, y){ + .Call(`_arrow_ipc___Message__Equals`, x, y) } -ipc___ReadMessage <- function(stream) { - .Call(`_arrow_ipc___ReadMessage`, stream) +ipc___ReadRecordBatch__Message__Schema <- function(message, schema){ + .Call(`_arrow_ipc___ReadRecordBatch__Message__Schema`, message, schema) } -parquet___arrow___ArrowReaderProperties__Make <- function(use_threads) { - .Call(`_arrow_parquet___arrow___ArrowReaderProperties__Make`, use_threads) +ipc___ReadSchema_InputStream <- function(stream){ + .Call(`_arrow_ipc___ReadSchema_InputStream`, stream) } -parquet___arrow___ArrowReaderProperties__set_use_threads <- function(properties, use_threads) { - invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads`, properties, use_threads)) +ipc___ReadSchema_Message <- function(message){ + .Call(`_arrow_ipc___ReadSchema_Message`, message) } -parquet___arrow___ArrowReaderProperties__get_use_threads <- function(properties, use_threads) { - .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads`, properties, use_threads) +ipc___MessageReader__Open <- function(stream){ + .Call(`_arrow_ipc___MessageReader__Open`, stream) } -parquet___arrow___ArrowReaderProperties__get_read_dictionary <- function(properties, column_index) { - .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary`, properties, column_index) +ipc___MessageReader__ReadNextMessage <- function(reader){ + .Call(`_arrow_ipc___MessageReader__ReadNextMessage`, reader) } -parquet___arrow___ArrowReaderProperties__set_read_dictionary <- function(properties, column_index, read_dict) { - invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary`, properties, column_index, read_dict)) +ipc___ReadMessage <- function(stream){ + .Call(`_arrow_ipc___ReadMessage`, stream) } -parquet___arrow___FileReader__OpenFile <- function(file, props) { - .Call(`_arrow_parquet___arrow___FileReader__OpenFile`, file, props) +parquet___arrow___ArrowReaderProperties__Make <- function(use_threads){ + .Call(`_arrow_parquet___arrow___ArrowReaderProperties__Make`, use_threads) } -parquet___arrow___FileReader__ReadTable1 <- function(reader) { - .Call(`_arrow_parquet___arrow___FileReader__ReadTable1`, reader) +parquet___arrow___ArrowReaderProperties__set_use_threads <- function(properties, use_threads){ + invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads`, properties, use_threads)) } -parquet___arrow___FileReader__ReadTable2 <- function(reader, column_indices) { - .Call(`_arrow_parquet___arrow___FileReader__ReadTable2`, reader, column_indices) +parquet___arrow___ArrowReaderProperties__get_use_threads <- function(properties, use_threads){ + .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads`, properties, use_threads) } -parquet___arrow___FileReader__ReadRowGroup1 <- function(reader, i) { - .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup1`, reader, i) +parquet___arrow___ArrowReaderProperties__get_read_dictionary <- function(properties, column_index){ + .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary`, properties, column_index) } -parquet___arrow___FileReader__ReadRowGroup2 <- function(reader, i, column_indices) { - .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup2`, reader, i, column_indices) +parquet___arrow___ArrowReaderProperties__set_read_dictionary <- function(properties, column_index, 
read_dict){ + invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary`, properties, column_index, read_dict)) } -parquet___arrow___FileReader__ReadRowGroups1 <- function(reader, row_groups) { - .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups1`, reader, row_groups) +parquet___arrow___FileReader__OpenFile <- function(file, props){ + .Call(`_arrow_parquet___arrow___FileReader__OpenFile`, file, props) } -parquet___arrow___FileReader__ReadRowGroups2 <- function(reader, row_groups, column_indices) { - .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups2`, reader, row_groups, column_indices) +parquet___arrow___FileReader__ReadTable1 <- function(reader){ + .Call(`_arrow_parquet___arrow___FileReader__ReadTable1`, reader) } -parquet___arrow___FileReader__num_rows <- function(reader) { - .Call(`_arrow_parquet___arrow___FileReader__num_rows`, reader) +parquet___arrow___FileReader__ReadTable2 <- function(reader, column_indices){ + .Call(`_arrow_parquet___arrow___FileReader__ReadTable2`, reader, column_indices) } -parquet___arrow___FileReader__num_columns <- function(reader) { - .Call(`_arrow_parquet___arrow___FileReader__num_columns`, reader) +parquet___arrow___FileReader__ReadRowGroup1 <- function(reader, i){ + .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup1`, reader, i) } -parquet___arrow___FileReader__num_row_groups <- function(reader) { - .Call(`_arrow_parquet___arrow___FileReader__num_row_groups`, reader) +parquet___arrow___FileReader__ReadRowGroup2 <- function(reader, i, column_indices){ + .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup2`, reader, i, column_indices) } -parquet___arrow___FileReader__ReadColumn <- function(reader, i) { - .Call(`_arrow_parquet___arrow___FileReader__ReadColumn`, reader, i) +parquet___arrow___FileReader__ReadRowGroups1 <- function(reader, row_groups){ + .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups1`, reader, row_groups) } -parquet___ArrowWriterProperties___create <- function(allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit) { - .Call(`_arrow_parquet___ArrowWriterProperties___create`, allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit) +parquet___arrow___FileReader__ReadRowGroups2 <- function(reader, row_groups, column_indices){ + .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups2`, reader, row_groups, column_indices) } -parquet___WriterProperties___Builder__create <- function() { - .Call(`_arrow_parquet___WriterProperties___Builder__create`) +parquet___arrow___FileReader__num_rows <- function(reader){ + .Call(`_arrow_parquet___arrow___FileReader__num_rows`, reader) } -parquet___WriterProperties___Builder__version <- function(builder, version) { - invisible(.Call(`_arrow_parquet___WriterProperties___Builder__version`, builder, version)) +parquet___arrow___FileReader__num_columns <- function(reader){ + .Call(`_arrow_parquet___arrow___FileReader__num_columns`, reader) } -parquet___ArrowWriterProperties___Builder__set_compressions <- function(builder, paths, types) { - invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compressions`, builder, paths, types)) +parquet___arrow___FileReader__num_row_groups <- function(reader){ + .Call(`_arrow_parquet___arrow___FileReader__num_row_groups`, reader) } -parquet___ArrowWriterProperties___Builder__set_compression_levels <- function(builder, paths, levels) { - invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels`, builder, paths, levels)) 
+parquet___arrow___FileReader__ReadColumn <- function(reader, i){ + .Call(`_arrow_parquet___arrow___FileReader__ReadColumn`, reader, i) } -parquet___ArrowWriterProperties___Builder__set_use_dictionary <- function(builder, paths, use_dictionary) { - invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary`, builder, paths, use_dictionary)) +parquet___ArrowWriterProperties___create <- function(allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit){ + .Call(`_arrow_parquet___ArrowWriterProperties___create`, allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit) } -parquet___ArrowWriterProperties___Builder__set_write_statistics <- function(builder, paths, write_statistics) { - invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics`, builder, paths, write_statistics)) +parquet___WriterProperties___Builder__create <- function(){ + .Call(`_arrow_parquet___WriterProperties___Builder__create`) } -parquet___ArrowWriterProperties___Builder__data_page_size <- function(builder, data_page_size) { - invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__data_page_size`, builder, data_page_size)) +parquet___WriterProperties___Builder__version <- function(builder, version){ + invisible(.Call(`_arrow_parquet___WriterProperties___Builder__version`, builder, version)) } -parquet___WriterProperties___Builder__build <- function(builder) { - .Call(`_arrow_parquet___WriterProperties___Builder__build`, builder) +parquet___ArrowWriterProperties___Builder__set_compressions <- function(builder, paths, types){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compressions`, builder, paths, types)) } -parquet___arrow___ParquetFileWriter__Open <- function(schema, sink, properties, arrow_properties) { - .Call(`_arrow_parquet___arrow___ParquetFileWriter__Open`, schema, sink, properties, arrow_properties) +parquet___ArrowWriterProperties___Builder__set_compression_levels <- function(builder, paths, levels){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels`, builder, paths, levels)) } -parquet___arrow___FileWriter__WriteTable <- function(writer, table, chunk_size) { - invisible(.Call(`_arrow_parquet___arrow___FileWriter__WriteTable`, writer, table, chunk_size)) +parquet___ArrowWriterProperties___Builder__set_use_dictionary <- function(builder, paths, use_dictionary){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary`, builder, paths, use_dictionary)) } -parquet___arrow___FileWriter__Close <- function(writer) { - invisible(.Call(`_arrow_parquet___arrow___FileWriter__Close`, writer)) +parquet___ArrowWriterProperties___Builder__set_write_statistics <- function(builder, paths, write_statistics){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics`, builder, paths, write_statistics)) } -parquet___arrow___WriteTable <- function(table, sink, properties, arrow_properties) { - invisible(.Call(`_arrow_parquet___arrow___WriteTable`, table, sink, properties, arrow_properties)) +parquet___ArrowWriterProperties___Builder__data_page_size <- function(builder, data_page_size){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__data_page_size`, builder, data_page_size)) } -parquet___arrow___FileReader__GetSchema <- function(reader) { - .Call(`_arrow_parquet___arrow___FileReader__GetSchema`, reader) +parquet___WriterProperties___Builder__build <- function(builder){ + 
.Call(`_arrow_parquet___WriterProperties___Builder__build`, builder) } -allocate_arrow_schema <- function() { - .Call(`_arrow_allocate_arrow_schema`) +parquet___arrow___ParquetFileWriter__Open <- function(schema, sink, properties, arrow_properties){ + .Call(`_arrow_parquet___arrow___ParquetFileWriter__Open`, schema, sink, properties, arrow_properties) } -delete_arrow_schema <- function(ptr) { - invisible(.Call(`_arrow_delete_arrow_schema`, ptr)) +parquet___arrow___FileWriter__WriteTable <- function(writer, table, chunk_size){ + invisible(.Call(`_arrow_parquet___arrow___FileWriter__WriteTable`, writer, table, chunk_size)) } -allocate_arrow_array <- function() { - .Call(`_arrow_allocate_arrow_array`) +parquet___arrow___FileWriter__Close <- function(writer){ + invisible(.Call(`_arrow_parquet___arrow___FileWriter__Close`, writer)) } -delete_arrow_array <- function(ptr) { - invisible(.Call(`_arrow_delete_arrow_array`, ptr)) +parquet___arrow___WriteTable <- function(table, sink, properties, arrow_properties){ + invisible(.Call(`_arrow_parquet___arrow___WriteTable`, table, sink, properties, arrow_properties)) } -allocate_arrow_array_stream <- function() { - .Call(`_arrow_allocate_arrow_array_stream`) +parquet___arrow___FileReader__GetSchema <- function(reader){ + .Call(`_arrow_parquet___arrow___FileReader__GetSchema`, reader) } -delete_arrow_array_stream <- function(ptr) { - invisible(.Call(`_arrow_delete_arrow_array_stream`, ptr)) +allocate_arrow_schema <- function(){ + .Call(`_arrow_allocate_arrow_schema`) } -ImportArray <- function(array, schema) { - .Call(`_arrow_ImportArray`, array, schema) +delete_arrow_schema <- function(ptr){ + invisible(.Call(`_arrow_delete_arrow_schema`, ptr)) } -ImportRecordBatch <- function(array, schema) { - .Call(`_arrow_ImportRecordBatch`, array, schema) +allocate_arrow_array <- function(){ + .Call(`_arrow_allocate_arrow_array`) } -ImportSchema <- function(schema) { - .Call(`_arrow_ImportSchema`, schema) +delete_arrow_array <- function(ptr){ + invisible(.Call(`_arrow_delete_arrow_array`, ptr)) } -ImportField <- function(field) { - .Call(`_arrow_ImportField`, field) +allocate_arrow_array_stream <- function(){ + .Call(`_arrow_allocate_arrow_array_stream`) } -ImportType <- function(type) { - .Call(`_arrow_ImportType`, type) +delete_arrow_array_stream <- function(ptr){ + invisible(.Call(`_arrow_delete_arrow_array_stream`, ptr)) } -ImportRecordBatchReader <- function(stream) { - .Call(`_arrow_ImportRecordBatchReader`, stream) +ImportArray <- function(array, schema){ + .Call(`_arrow_ImportArray`, array, schema) } -ExportType <- function(type, ptr) { - invisible(.Call(`_arrow_ExportType`, type, ptr)) +ImportRecordBatch <- function(array, schema){ + .Call(`_arrow_ImportRecordBatch`, array, schema) } -ExportField <- function(field, ptr) { - invisible(.Call(`_arrow_ExportField`, field, ptr)) +ImportSchema <- function(schema){ + .Call(`_arrow_ImportSchema`, schema) } -ExportSchema <- function(schema, ptr) { - invisible(.Call(`_arrow_ExportSchema`, schema, ptr)) +ImportField <- function(field){ + .Call(`_arrow_ImportField`, field) } -ExportArray <- function(array, array_ptr, schema_ptr) { - invisible(.Call(`_arrow_ExportArray`, array, array_ptr, schema_ptr)) +ImportType <- function(type){ + .Call(`_arrow_ImportType`, type) } -ExportRecordBatch <- function(batch, array_ptr, schema_ptr) { - invisible(.Call(`_arrow_ExportRecordBatch`, batch, array_ptr, schema_ptr)) +ImportRecordBatchReader <- function(stream){ + .Call(`_arrow_ImportRecordBatchReader`, stream) } 
-ExportRecordBatchReader <- function(reader, stream_ptr) { - invisible(.Call(`_arrow_ExportRecordBatchReader`, reader, stream_ptr)) +ExportType <- function(type, ptr){ + invisible(.Call(`_arrow_ExportType`, type, ptr)) } -Table__from_dots <- function(lst, schema_sxp, use_threads) { - .Call(`_arrow_Table__from_dots`, lst, schema_sxp, use_threads) +ExportField <- function(field, ptr){ + invisible(.Call(`_arrow_ExportField`, field, ptr)) } -vec_to_arrow <- function(x, s_type) { - .Call(`_arrow_vec_to_arrow`, x, s_type) +ExportSchema <- function(schema, ptr){ + invisible(.Call(`_arrow_ExportSchema`, schema, ptr)) } -DictionaryArray__FromArrays <- function(type, indices, dict) { - .Call(`_arrow_DictionaryArray__FromArrays`, type, indices, dict) +ExportArray <- function(array, array_ptr, schema_ptr){ + invisible(.Call(`_arrow_ExportArray`, array, array_ptr, schema_ptr)) } -RecordBatch__num_columns <- function(x) { - .Call(`_arrow_RecordBatch__num_columns`, x) +ExportRecordBatch <- function(batch, array_ptr, schema_ptr){ + invisible(.Call(`_arrow_ExportRecordBatch`, batch, array_ptr, schema_ptr)) } -RecordBatch__num_rows <- function(x) { - .Call(`_arrow_RecordBatch__num_rows`, x) +ExportRecordBatchReader <- function(reader, stream_ptr){ + invisible(.Call(`_arrow_ExportRecordBatchReader`, reader, stream_ptr)) } -RecordBatch__schema <- function(x) { - .Call(`_arrow_RecordBatch__schema`, x) +Table__from_dots <- function(lst, schema_sxp, use_threads){ + .Call(`_arrow_Table__from_dots`, lst, schema_sxp, use_threads) } -RecordBatch__RenameColumns <- function(batch, names) { - .Call(`_arrow_RecordBatch__RenameColumns`, batch, names) +vec_to_arrow <- function(x, s_type){ + .Call(`_arrow_vec_to_arrow`, x, s_type) } -RecordBatch__ReplaceSchemaMetadata <- function(x, metadata) { - .Call(`_arrow_RecordBatch__ReplaceSchemaMetadata`, x, metadata) +DictionaryArray__FromArrays <- function(type, indices, dict){ + .Call(`_arrow_DictionaryArray__FromArrays`, type, indices, dict) } -RecordBatch__columns <- function(batch) { - .Call(`_arrow_RecordBatch__columns`, batch) +RecordBatch__num_columns <- function(x){ + .Call(`_arrow_RecordBatch__num_columns`, x) } -RecordBatch__column <- function(batch, i) { - .Call(`_arrow_RecordBatch__column`, batch, i) +RecordBatch__num_rows <- function(x){ + .Call(`_arrow_RecordBatch__num_rows`, x) } -RecordBatch__GetColumnByName <- function(batch, name) { - .Call(`_arrow_RecordBatch__GetColumnByName`, batch, name) +RecordBatch__schema <- function(x){ + .Call(`_arrow_RecordBatch__schema`, x) } -RecordBatch__SelectColumns <- function(batch, indices) { - .Call(`_arrow_RecordBatch__SelectColumns`, batch, indices) +RecordBatch__RenameColumns <- function(batch, names){ + .Call(`_arrow_RecordBatch__RenameColumns`, batch, names) } -RecordBatch__Equals <- function(self, other, check_metadata) { - .Call(`_arrow_RecordBatch__Equals`, self, other, check_metadata) +RecordBatch__ReplaceSchemaMetadata <- function(x, metadata){ + .Call(`_arrow_RecordBatch__ReplaceSchemaMetadata`, x, metadata) } -RecordBatch__AddColumn <- function(batch, i, field, column) { - .Call(`_arrow_RecordBatch__AddColumn`, batch, i, field, column) +RecordBatch__columns <- function(batch){ + .Call(`_arrow_RecordBatch__columns`, batch) } -RecordBatch__SetColumn <- function(batch, i, field, column) { - .Call(`_arrow_RecordBatch__SetColumn`, batch, i, field, column) +RecordBatch__column <- function(batch, i){ + .Call(`_arrow_RecordBatch__column`, batch, i) } -RecordBatch__RemoveColumn <- function(batch, i) { - 
.Call(`_arrow_RecordBatch__RemoveColumn`, batch, i) +RecordBatch__GetColumnByName <- function(batch, name){ + .Call(`_arrow_RecordBatch__GetColumnByName`, batch, name) } -RecordBatch__column_name <- function(batch, i) { - .Call(`_arrow_RecordBatch__column_name`, batch, i) +RecordBatch__SelectColumns <- function(batch, indices){ + .Call(`_arrow_RecordBatch__SelectColumns`, batch, indices) } -RecordBatch__names <- function(batch) { - .Call(`_arrow_RecordBatch__names`, batch) +RecordBatch__Equals <- function(self, other, check_metadata){ + .Call(`_arrow_RecordBatch__Equals`, self, other, check_metadata) } -RecordBatch__Slice1 <- function(self, offset) { - .Call(`_arrow_RecordBatch__Slice1`, self, offset) +RecordBatch__AddColumn <- function(batch, i, field, column){ + .Call(`_arrow_RecordBatch__AddColumn`, batch, i, field, column) } -RecordBatch__Slice2 <- function(self, offset, length) { - .Call(`_arrow_RecordBatch__Slice2`, self, offset, length) +RecordBatch__SetColumn <- function(batch, i, field, column){ + .Call(`_arrow_RecordBatch__SetColumn`, batch, i, field, column) } -ipc___SerializeRecordBatch__Raw <- function(batch) { - .Call(`_arrow_ipc___SerializeRecordBatch__Raw`, batch) +RecordBatch__RemoveColumn <- function(batch, i){ + .Call(`_arrow_RecordBatch__RemoveColumn`, batch, i) } -ipc___ReadRecordBatch__InputStream__Schema <- function(stream, schema) { - .Call(`_arrow_ipc___ReadRecordBatch__InputStream__Schema`, stream, schema) +RecordBatch__column_name <- function(batch, i){ + .Call(`_arrow_RecordBatch__column_name`, batch, i) } -RecordBatch__from_arrays <- function(schema_sxp, lst) { - .Call(`_arrow_RecordBatch__from_arrays`, schema_sxp, lst) +RecordBatch__names <- function(batch){ + .Call(`_arrow_RecordBatch__names`, batch) } -RecordBatchReader__schema <- function(reader) { - .Call(`_arrow_RecordBatchReader__schema`, reader) +RecordBatch__Slice1 <- function(self, offset){ + .Call(`_arrow_RecordBatch__Slice1`, self, offset) } -RecordBatchReader__ReadNext <- function(reader) { - .Call(`_arrow_RecordBatchReader__ReadNext`, reader) +RecordBatch__Slice2 <- function(self, offset, length){ + .Call(`_arrow_RecordBatch__Slice2`, self, offset, length) } -RecordBatchReader__batches <- function(reader) { - .Call(`_arrow_RecordBatchReader__batches`, reader) +ipc___SerializeRecordBatch__Raw <- function(batch){ + .Call(`_arrow_ipc___SerializeRecordBatch__Raw`, batch) } -Table__from_RecordBatchReader <- function(reader) { - .Call(`_arrow_Table__from_RecordBatchReader`, reader) +ipc___ReadRecordBatch__InputStream__Schema <- function(stream, schema){ + .Call(`_arrow_ipc___ReadRecordBatch__InputStream__Schema`, stream, schema) } -ipc___RecordBatchStreamReader__Open <- function(stream) { - .Call(`_arrow_ipc___RecordBatchStreamReader__Open`, stream) +RecordBatch__from_arrays <- function(schema_sxp, lst){ + .Call(`_arrow_RecordBatch__from_arrays`, schema_sxp, lst) } -ipc___RecordBatchFileReader__schema <- function(reader) { - .Call(`_arrow_ipc___RecordBatchFileReader__schema`, reader) +RecordBatchReader__schema <- function(reader){ + .Call(`_arrow_RecordBatchReader__schema`, reader) } -ipc___RecordBatchFileReader__num_record_batches <- function(reader) { - .Call(`_arrow_ipc___RecordBatchFileReader__num_record_batches`, reader) +RecordBatchReader__ReadNext <- function(reader){ + .Call(`_arrow_RecordBatchReader__ReadNext`, reader) } -ipc___RecordBatchFileReader__ReadRecordBatch <- function(reader, i) { - .Call(`_arrow_ipc___RecordBatchFileReader__ReadRecordBatch`, reader, i) +RecordBatchReader__batches <- 
function(reader){ + .Call(`_arrow_RecordBatchReader__batches`, reader) } -ipc___RecordBatchFileReader__Open <- function(file) { - .Call(`_arrow_ipc___RecordBatchFileReader__Open`, file) +Table__from_RecordBatchReader <- function(reader){ + .Call(`_arrow_Table__from_RecordBatchReader`, reader) } -Table__from_RecordBatchFileReader <- function(reader) { - .Call(`_arrow_Table__from_RecordBatchFileReader`, reader) +ipc___RecordBatchStreamReader__Open <- function(stream){ + .Call(`_arrow_ipc___RecordBatchStreamReader__Open`, stream) } -ipc___RecordBatchFileReader__batches <- function(reader) { - .Call(`_arrow_ipc___RecordBatchFileReader__batches`, reader) +ipc___RecordBatchFileReader__schema <- function(reader){ + .Call(`_arrow_ipc___RecordBatchFileReader__schema`, reader) } -ipc___RecordBatchWriter__WriteRecordBatch <- function(batch_writer, batch) { - invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteRecordBatch`, batch_writer, batch)) +ipc___RecordBatchFileReader__num_record_batches <- function(reader){ + .Call(`_arrow_ipc___RecordBatchFileReader__num_record_batches`, reader) } -ipc___RecordBatchWriter__WriteTable <- function(batch_writer, table) { - invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteTable`, batch_writer, table)) +ipc___RecordBatchFileReader__ReadRecordBatch <- function(reader, i){ + .Call(`_arrow_ipc___RecordBatchFileReader__ReadRecordBatch`, reader, i) } -ipc___RecordBatchWriter__Close <- function(batch_writer) { - invisible(.Call(`_arrow_ipc___RecordBatchWriter__Close`, batch_writer)) +ipc___RecordBatchFileReader__Open <- function(file){ + .Call(`_arrow_ipc___RecordBatchFileReader__Open`, file) } -ipc___RecordBatchFileWriter__Open <- function(stream, schema, use_legacy_format, metadata_version) { - .Call(`_arrow_ipc___RecordBatchFileWriter__Open`, stream, schema, use_legacy_format, metadata_version) +Table__from_RecordBatchFileReader <- function(reader){ + .Call(`_arrow_Table__from_RecordBatchFileReader`, reader) } -ipc___RecordBatchStreamWriter__Open <- function(stream, schema, use_legacy_format, metadata_version) { - .Call(`_arrow_ipc___RecordBatchStreamWriter__Open`, stream, schema, use_legacy_format, metadata_version) +ipc___RecordBatchFileReader__batches <- function(reader){ + .Call(`_arrow_ipc___RecordBatchFileReader__batches`, reader) } -Array__GetScalar <- function(x, i) { - .Call(`_arrow_Array__GetScalar`, x, i) +ipc___RecordBatchWriter__WriteRecordBatch <- function(batch_writer, batch){ + invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteRecordBatch`, batch_writer, batch)) } -Scalar__ToString <- function(s) { - .Call(`_arrow_Scalar__ToString`, s) +ipc___RecordBatchWriter__WriteTable <- function(batch_writer, table){ + invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteTable`, batch_writer, table)) } -StructScalar__field <- function(s, i) { - .Call(`_arrow_StructScalar__field`, s, i) +ipc___RecordBatchWriter__Close <- function(batch_writer){ + invisible(.Call(`_arrow_ipc___RecordBatchWriter__Close`, batch_writer)) } -StructScalar__GetFieldByName <- function(s, name) { - .Call(`_arrow_StructScalar__GetFieldByName`, s, name) +ipc___RecordBatchFileWriter__Open <- function(stream, schema, use_legacy_format, metadata_version){ + .Call(`_arrow_ipc___RecordBatchFileWriter__Open`, stream, schema, use_legacy_format, metadata_version) } -Scalar__as_vector <- function(scalar) { - .Call(`_arrow_Scalar__as_vector`, scalar) +ipc___RecordBatchStreamWriter__Open <- function(stream, schema, use_legacy_format, metadata_version){ + 
.Call(`_arrow_ipc___RecordBatchStreamWriter__Open`, stream, schema, use_legacy_format, metadata_version) } -MakeArrayFromScalar <- function(scalar, n) { - .Call(`_arrow_MakeArrayFromScalar`, scalar, n) +Array__GetScalar <- function(x, i){ + .Call(`_arrow_Array__GetScalar`, x, i) } -Scalar__is_valid <- function(s) { - .Call(`_arrow_Scalar__is_valid`, s) +Scalar__ToString <- function(s){ + .Call(`_arrow_Scalar__ToString`, s) } -Scalar__type <- function(s) { - .Call(`_arrow_Scalar__type`, s) +StructScalar__field <- function(s, i){ + .Call(`_arrow_StructScalar__field`, s, i) } -Scalar__Equals <- function(lhs, rhs) { - .Call(`_arrow_Scalar__Equals`, lhs, rhs) +StructScalar__GetFieldByName <- function(s, name){ + .Call(`_arrow_StructScalar__GetFieldByName`, s, name) } -Scalar__ApproxEquals <- function(lhs, rhs) { - .Call(`_arrow_Scalar__ApproxEquals`, lhs, rhs) +Scalar__as_vector <- function(scalar){ + .Call(`_arrow_Scalar__as_vector`, scalar) } -schema_ <- function(fields) { - .Call(`_arrow_schema_`, fields) +MakeArrayFromScalar <- function(scalar, n){ + .Call(`_arrow_MakeArrayFromScalar`, scalar, n) } -Schema__ToString <- function(s) { - .Call(`_arrow_Schema__ToString`, s) +Scalar__is_valid <- function(s){ + .Call(`_arrow_Scalar__is_valid`, s) } -Schema__num_fields <- function(s) { - .Call(`_arrow_Schema__num_fields`, s) +Scalar__type <- function(s){ + .Call(`_arrow_Scalar__type`, s) } -Schema__field <- function(s, i) { - .Call(`_arrow_Schema__field`, s, i) +Scalar__Equals <- function(lhs, rhs){ + .Call(`_arrow_Scalar__Equals`, lhs, rhs) } -Schema__AddField <- function(s, i, field) { - .Call(`_arrow_Schema__AddField`, s, i, field) +Scalar__ApproxEquals <- function(lhs, rhs){ + .Call(`_arrow_Scalar__ApproxEquals`, lhs, rhs) } -Schema__SetField <- function(s, i, field) { - .Call(`_arrow_Schema__SetField`, s, i, field) +schema_ <- function(fields){ + .Call(`_arrow_schema_`, fields) } -Schema__RemoveField <- function(s, i) { - .Call(`_arrow_Schema__RemoveField`, s, i) +Schema__ToString <- function(s){ + .Call(`_arrow_Schema__ToString`, s) } -Schema__GetFieldByName <- function(s, x) { - .Call(`_arrow_Schema__GetFieldByName`, s, x) +Schema__num_fields <- function(s){ + .Call(`_arrow_Schema__num_fields`, s) } -Schema__fields <- function(schema) { - .Call(`_arrow_Schema__fields`, schema) +Schema__field <- function(s, i){ + .Call(`_arrow_Schema__field`, s, i) } -Schema__field_names <- function(schema) { - .Call(`_arrow_Schema__field_names`, schema) +Schema__AddField <- function(s, i, field){ + .Call(`_arrow_Schema__AddField`, s, i, field) } -Schema__HasMetadata <- function(schema) { - .Call(`_arrow_Schema__HasMetadata`, schema) +Schema__SetField <- function(s, i, field){ + .Call(`_arrow_Schema__SetField`, s, i, field) } -Schema__metadata <- function(schema) { - .Call(`_arrow_Schema__metadata`, schema) +Schema__RemoveField <- function(s, i){ + .Call(`_arrow_Schema__RemoveField`, s, i) } -Schema__WithMetadata <- function(schema, metadata) { - .Call(`_arrow_Schema__WithMetadata`, schema, metadata) +Schema__GetFieldByName <- function(s, x){ + .Call(`_arrow_Schema__GetFieldByName`, s, x) } -Schema__serialize <- function(schema) { - .Call(`_arrow_Schema__serialize`, schema) +Schema__fields <- function(schema){ + .Call(`_arrow_Schema__fields`, schema) } -Schema__Equals <- function(schema, other, check_metadata) { - .Call(`_arrow_Schema__Equals`, schema, other, check_metadata) +Schema__field_names <- function(schema){ + .Call(`_arrow_Schema__field_names`, schema) } -arrow__UnifySchemas <- function(schemas) { - 
.Call(`_arrow_arrow__UnifySchemas`, schemas) +Schema__HasMetadata <- function(schema){ + .Call(`_arrow_Schema__HasMetadata`, schema) } -Table__num_columns <- function(x) { - .Call(`_arrow_Table__num_columns`, x) +Schema__metadata <- function(schema){ + .Call(`_arrow_Schema__metadata`, schema) } -Table__num_rows <- function(x) { - .Call(`_arrow_Table__num_rows`, x) +Schema__WithMetadata <- function(schema, metadata){ + .Call(`_arrow_Schema__WithMetadata`, schema, metadata) } -Table__schema <- function(x) { - .Call(`_arrow_Table__schema`, x) +Schema__serialize <- function(schema){ + .Call(`_arrow_Schema__serialize`, schema) } -Table__ReplaceSchemaMetadata <- function(x, metadata) { - .Call(`_arrow_Table__ReplaceSchemaMetadata`, x, metadata) +Schema__Equals <- function(schema, other, check_metadata){ + .Call(`_arrow_Schema__Equals`, schema, other, check_metadata) } -Table__column <- function(table, i) { - .Call(`_arrow_Table__column`, table, i) +arrow__UnifySchemas <- function(schemas){ + .Call(`_arrow_arrow__UnifySchemas`, schemas) } -Table__field <- function(table, i) { - .Call(`_arrow_Table__field`, table, i) +Table__num_columns <- function(x){ + .Call(`_arrow_Table__num_columns`, x) } -Table__columns <- function(table) { - .Call(`_arrow_Table__columns`, table) +Table__num_rows <- function(x){ + .Call(`_arrow_Table__num_rows`, x) } -Table__ColumnNames <- function(table) { - .Call(`_arrow_Table__ColumnNames`, table) +Table__schema <- function(x){ + .Call(`_arrow_Table__schema`, x) } -Table__RenameColumns <- function(table, names) { - .Call(`_arrow_Table__RenameColumns`, table, names) +Table__ReplaceSchemaMetadata <- function(x, metadata){ + .Call(`_arrow_Table__ReplaceSchemaMetadata`, x, metadata) } -Table__Slice1 <- function(table, offset) { - .Call(`_arrow_Table__Slice1`, table, offset) +Table__column <- function(table, i){ + .Call(`_arrow_Table__column`, table, i) } -Table__Slice2 <- function(table, offset, length) { - .Call(`_arrow_Table__Slice2`, table, offset, length) +Table__field <- function(table, i){ + .Call(`_arrow_Table__field`, table, i) } -Table__Equals <- function(lhs, rhs, check_metadata) { - .Call(`_arrow_Table__Equals`, lhs, rhs, check_metadata) +Table__columns <- function(table){ + .Call(`_arrow_Table__columns`, table) } -Table__Validate <- function(table) { - .Call(`_arrow_Table__Validate`, table) +Table__ColumnNames <- function(table){ + .Call(`_arrow_Table__ColumnNames`, table) } -Table__ValidateFull <- function(table) { - .Call(`_arrow_Table__ValidateFull`, table) +Table__RenameColumns <- function(table, names){ + .Call(`_arrow_Table__RenameColumns`, table, names) } -Table__GetColumnByName <- function(table, name) { - .Call(`_arrow_Table__GetColumnByName`, table, name) +Table__Slice1 <- function(table, offset){ + .Call(`_arrow_Table__Slice1`, table, offset) } -Table__RemoveColumn <- function(table, i) { - .Call(`_arrow_Table__RemoveColumn`, table, i) +Table__Slice2 <- function(table, offset, length){ + .Call(`_arrow_Table__Slice2`, table, offset, length) } -Table__AddColumn <- function(table, i, field, column) { - .Call(`_arrow_Table__AddColumn`, table, i, field, column) +Table__Equals <- function(lhs, rhs, check_metadata){ + .Call(`_arrow_Table__Equals`, lhs, rhs, check_metadata) } -Table__SetColumn <- function(table, i, field, column) { - .Call(`_arrow_Table__SetColumn`, table, i, field, column) +Table__Validate <- function(table){ + .Call(`_arrow_Table__Validate`, table) } -Table__SelectColumns <- function(table, indices) { - .Call(`_arrow_Table__SelectColumns`, 
table, indices) +Table__ValidateFull <- function(table){ + .Call(`_arrow_Table__ValidateFull`, table) } -all_record_batches <- function(lst) { - .Call(`_arrow_all_record_batches`, lst) +Table__GetColumnByName <- function(table, name){ + .Call(`_arrow_Table__GetColumnByName`, table, name) } -Table__from_record_batches <- function(batches, schema_sxp) { - .Call(`_arrow_Table__from_record_batches`, batches, schema_sxp) +Table__RemoveColumn <- function(table, i){ + .Call(`_arrow_Table__RemoveColumn`, table, i) } -GetCpuThreadPoolCapacity <- function() { - .Call(`_arrow_GetCpuThreadPoolCapacity`) +Table__AddColumn <- function(table, i, field, column){ + .Call(`_arrow_Table__AddColumn`, table, i, field, column) } -SetCpuThreadPoolCapacity <- function(threads) { - invisible(.Call(`_arrow_SetCpuThreadPoolCapacity`, threads)) +Table__SetColumn <- function(table, i, field, column){ + .Call(`_arrow_Table__SetColumn`, table, i, field, column) } -GetIOThreadPoolCapacity <- function() { - .Call(`_arrow_GetIOThreadPoolCapacity`) +Table__SelectColumns <- function(table, indices){ + .Call(`_arrow_Table__SelectColumns`, table, indices) } -SetIOThreadPoolCapacity <- function(threads) { - invisible(.Call(`_arrow_SetIOThreadPoolCapacity`, threads)) +all_record_batches <- function(lst){ + .Call(`_arrow_all_record_batches`, lst) } -Array__infer_type <- function(x) { - .Call(`_arrow_Array__infer_type`, x) +Table__from_record_batches <- function(batches, schema_sxp){ + .Call(`_arrow_Table__from_record_batches`, batches, schema_sxp) } + +GetCpuThreadPoolCapacity <- function(){ + .Call(`_arrow_GetCpuThreadPoolCapacity`) +} + +SetCpuThreadPoolCapacity <- function(threads){ + invisible(.Call(`_arrow_SetCpuThreadPoolCapacity`, threads)) +} + +GetIOThreadPoolCapacity <- function(){ + .Call(`_arrow_GetIOThreadPoolCapacity`) +} + +SetIOThreadPoolCapacity <- function(threads){ + invisible(.Call(`_arrow_SetIOThreadPoolCapacity`, threads)) +} + +Array__infer_type <- function(x){ + .Call(`_arrow_Array__infer_type`, x) +} + + + diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 26db190099f..2db8a954918 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -28,14 +28,82 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb dplyr::group_vars(.data) # vars needed for grouping )) .data <- dplyr::select(.data, vars_to_keep) - if (match.arg(.engine) == "duckdb") { - dplyr::summarise(to_duckdb(.data), ...) - } else { - if (query_on_dataset(.data)) { - not_implemented_for_dataset("summarize()") + dplyr::summarise(to_duckdb(.data), ...) + } else if (isTRUE(getOption("arrow.summarize", FALSE))) { + # Try summarizing in Arrow; if that succeeds, return the result + out <- try(do_arrow_summarize(.data, ...), silent = TRUE) + if (inherits(out, "try-error")) { + return(abandon_ship(call, .data, format(out))) + } else { + return(out) } + } else { + # If unsuccessful or if option not set, do the work in R dplyr::summarise(dplyr::collect(.data), ...) } } summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query + +do_arrow_summarize <- function(.data, ...) { + if (length(dplyr::group_vars(.data))) { + stop("Grouped aggregation not supported in Arrow", call. = FALSE) + } + + exprs <- quos(...)
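+  # e.g. quos(total = sum(int), sum(dbl)) comes in with names c("total", ""), + # so the fix-up below should label the unnamed entry "sum(dbl)" via as_label()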
+ # Check for unnamed expressions and fix if any + unnamed <- !nzchar(names(exprs)) + # Deparse and take the first element in case they're long expressions + names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) + + mask <- arrow_mask(.data) + # Add aggregation wrappers to arrow_mask somehow + # (this is not ideal, would overwrite same-named objects) + mask$sum <- function(x, na.rm = FALSE) { + list( + fun = "sum", + data = x, + options = list(na.rm = na.rm) + ) + } + results <- list() + for (i in seq_along(exprs)) { + # Iterate over the indices and not the names because names may be repeated + # (which overwrites the previous name) + new_var <- names(exprs)[i] + results[[new_var]] <- arrow_eval(exprs[[i]], mask) + if (inherits(results[[new_var]], "try-error")) { + msg <- paste('Expression', as_label(exprs[[i]]), 'not supported in Arrow') + stop(msg, call. = FALSE) + } + # Put it in the data mask too? + #mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] + } + + # Now, from that, split out the data (expressions) and options + .data$aggregations <- lapply(results, function(x) x[c("fun", "options")]) + + inputs <- lapply(results, function(x) x$data) + # This is essentially a projection, and the column names don't matter + # (but must exist) + names(inputs) <- as.character(seq_along(inputs)) + .data$selected_columns <- inputs + + # Eventually, we will return .data here if (dataset) but do it eagerly now + do_exec_plan(.data) +} + +do_exec_plan <- function(.data) { + plan <- ExecPlan$create() + # Scan also will filter and select columns, so we don't need to Filter + start_node <- plan$Scan(.data) + # If any columns are derived we need to Project (otherwise this may be no-op) + project_node <- start_node$Project(.data$selected_columns) + + final_node <- project_node$ScalarAggregate( + options = .data$aggregations, + targets = names(.data), + out_field_names = names(.data$aggregations) + ) + plan$Run(final_node) +} \ No newline at end of file diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 5d14264b90c..fa12396740e 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -19,7 +19,24 @@ ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, public = list( Scan = function(dataset) { # Handle arrow_dplyr_query - # TODO: why do I need to filter/project here? + if (inherits(dataset, "arrow_dplyr_query")) { + filter <- dataset$filtered_rows + if (isTRUE(filter)) { + filter <- Expression$scalar(TRUE) + } + # TODO: use FieldsInExpression to find all from dataset$selected_columns + colnames <- names(dataset$.data) + dataset <- dataset$.data + } else { + if (inherits(dataset, "ArrowTabular")) { + dataset <- InMemoryDataset$create(dataset) + } + assert_is(dataset, "Dataset") + # Set some defaults + filter <- Expression$scalar(TRUE) + colnames <- names(dataset) + } + # TODO: why do I _need_ to filter/project here?
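+      # (Presumably because the scan node pushes the filter down to the dataset + # fragments and uses the column list to decide which fields to materialize, + # so both must be supplied even when nothing is filtered or projected away.)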
ExecNode_Scan(self, dataset, filter, colnames) }, Run = function(node) { @@ -45,7 +62,3 @@ ExecNode <- R6Class("ExecNode", inherit = ArrowObject, } ) ) - -# plan <- ExecPlan$create() -# final_node <- plan$Scan(dataset)$Filter(expr)$Project(exprs)$ScalarAggregate(something) -# plan$Run(final_node) \ No newline at end of file diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 19095a4cbde..86418634226 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1092,6 +1092,87 @@ extern "C" SEXP _arrow_io___CompressedInputStream__Make(SEXP codec_sexp, SEXP ra } #endif +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExecPlan_create(); +extern "C" SEXP _arrow_ExecPlan_create(){ +BEGIN_CPP11 + return cpp11::as_sexp(ExecPlan_create()); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExecPlan_create(){ + Rf_error("Cannot call ExecPlan_create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExecPlan_run(std::shared_ptr plan, std::shared_ptr final_node); +extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ +BEGIN_CPP11 + arrow::r::Input>::type plan(plan_sexp); + arrow::r::Input>::type final_node(final_node_sexp); + return cpp11::as_sexp(ExecPlan_run(plan, final_node)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ + Rf_error("Cannot call ExecPlan_run(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExecNode_Scan(std::shared_ptr plan, std::shared_ptr dataset, std::shared_ptr filter, std::vector materialized_field_names); +extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP filter_sexp, SEXP materialized_field_names_sexp){ +BEGIN_CPP11 + arrow::r::Input>::type plan(plan_sexp); + arrow::r::Input>::type dataset(dataset_sexp); + arrow::r::Input>::type filter(filter_sexp); + arrow::r::Input>::type materialized_field_names(materialized_field_names_sexp); + return cpp11::as_sexp(ExecNode_Scan(plan, dataset, filter, materialized_field_names)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP filter_sexp, SEXP materialized_field_names_sexp){ + Rf_error("Cannot call ExecNode_Scan(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExecNode_Filter(std::shared_ptr input, std::shared_ptr filter); +extern "C" SEXP _arrow_ExecNode_Filter(SEXP input_sexp, SEXP filter_sexp){ +BEGIN_CPP11 + arrow::r::Input>::type input(input_sexp); + arrow::r::Input>::type filter(filter_sexp); + return cpp11::as_sexp(ExecNode_Filter(input, filter)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExecNode_Filter(SEXP input_sexp, SEXP filter_sexp){ + Rf_error("Cannot call ExecNode_Filter(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); +} +#endif + +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExecNode_Project(std::shared_ptr input, std::vector> exprs, std::vector names); +extern "C" SEXP _arrow_ExecNode_Project(SEXP input_sexp, SEXP exprs_sexp, SEXP names_sexp){ +BEGIN_CPP11 + arrow::r::Input>::type input(input_sexp); + arrow::r::Input>>::type exprs(exprs_sexp); + arrow::r::Input>::type names(names_sexp); + return cpp11::as_sexp(ExecNode_Project(input, exprs, names)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExecNode_Project(SEXP input_sexp, SEXP exprs_sexp, SEXP names_sexp){ + Rf_error("Cannot call ExecNode_Project(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // compute.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, cpp11::list options); @@ -7011,6 +7092,11 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, + { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 0}, + { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 2}, + { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, + { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, + { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 49bdefb6f44..4ecb99174b5 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -47,6 +47,15 @@ #include #include +namespace arrow { +namespace compute { + +class ExecPlan; +class ExecNode; + +} // namespace compute +} // namespace arrow + #if defined(ARROW_R_WITH_PARQUET) #include #endif diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index e9e9cc04a4c..f9b1c6e2818 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -26,6 +26,9 @@ namespace compute = ::arrow::compute; +std::shared_ptr make_compute_options(std::string func_name, + cpp11::list options); + // [[arrow::export]] std::shared_ptr ExecPlan_create() { return ValueOrStop(compute::ExecPlan::Make(gc_context())); @@ -104,7 +107,7 @@ std::shared_ptr ExecNode_Filter( std::shared_ptr ExecNode_Project( std::shared_ptr input, std::vector> exprs, - std::vector names = {}) { + std::vector names) { // We have shared_ptrs of expressions but need the Expressions std::vector expressions; for (auto expr : exprs) { @@ -130,8 +133,8 @@ std::shared_ptr ExecNode_ScalarAggregate( keep_alives.push_back(std::move(opts)); } - auto scalar_agg = ValueOrStop(MakeScalarAggregateNode( - source, /*label=*/"scalar_agg", aggregates, targets, out_field_names)); + auto scalar_agg = ValueOrStop(compute::MakeScalarAggregateNode( + input, /*label=*/"scalar_agg", aggregates, targets, out_field_names)); return std::shared_ptr(scalar_agg, [keep_alives](...) 
{ // empty destructor: ExecNode lifetime is managed by an ExecPlan diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 7186acb9aed..279e5448753 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -28,6 +28,7 @@ tbl$verses <- verses[[1]] tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = "both") test_that("Can aggregate", { + withr::local_options(list(arrow.summarize = TRUE)) expect_dplyr_equal( input %>% summarize(total = sum(int)), From 1947e156a014b0c2bcaede1ace7547addc5a6586 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 15 Jul 2021 10:15:41 -0400 Subject: [PATCH 04/24] const --- r/src/arrowExports.cpp | 26 +++++++++++++------------- r/src/compute-exec.cpp | 20 ++++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 86418634226..371aae96703 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1108,11 +1108,11 @@ extern "C" SEXP _arrow_ExecPlan_create(){ // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ExecPlan_run(std::shared_ptr plan, std::shared_ptr final_node); +std::shared_ptr ExecPlan_run(const std::shared_ptr& plan, const std::shared_ptr& final_node); extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ BEGIN_CPP11 - arrow::r::Input>::type plan(plan_sexp); - arrow::r::Input>::type final_node(final_node_sexp); + arrow::r::Input&>::type plan(plan_sexp); + arrow::r::Input&>::type final_node(final_node_sexp); return cpp11::as_sexp(ExecPlan_run(plan, final_node)); END_CPP11 } @@ -1124,12 +1124,12 @@ extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ExecNode_Scan(std::shared_ptr plan, std::shared_ptr dataset, std::shared_ptr filter, std::vector materialized_field_names); +std::shared_ptr ExecNode_Scan(const std::shared_ptr& plan, const std::shared_ptr& dataset, const std::shared_ptr& filter, std::vector materialized_field_names); extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP filter_sexp, SEXP materialized_field_names_sexp){ BEGIN_CPP11 - arrow::r::Input>::type plan(plan_sexp); - arrow::r::Input>::type dataset(dataset_sexp); - arrow::r::Input>::type filter(filter_sexp); + arrow::r::Input&>::type plan(plan_sexp); + arrow::r::Input&>::type dataset(dataset_sexp); + arrow::r::Input&>::type filter(filter_sexp); arrow::r::Input>::type materialized_field_names(materialized_field_names_sexp); return cpp11::as_sexp(ExecNode_Scan(plan, dataset, filter, materialized_field_names)); END_CPP11 @@ -1142,11 +1142,11 @@ extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP fil // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ExecNode_Filter(std::shared_ptr input, std::shared_ptr filter); +std::shared_ptr ExecNode_Filter(const std::shared_ptr& input, const std::shared_ptr& filter); extern "C" SEXP _arrow_ExecNode_Filter(SEXP input_sexp, SEXP filter_sexp){ BEGIN_CPP11 - arrow::r::Input>::type input(input_sexp); - arrow::r::Input>::type filter(filter_sexp); + arrow::r::Input&>::type input(input_sexp); + arrow::r::Input&>::type filter(filter_sexp); return cpp11::as_sexp(ExecNode_Filter(input, filter)); END_CPP11 } @@ -1158,11 +1158,11 @@ extern "C" SEXP _arrow_ExecNode_Filter(SEXP input_sexp, SEXP filter_sexp){ // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) 
-std::shared_ptr ExecNode_Project(std::shared_ptr input, std::vector> exprs, std::vector names); +std::shared_ptr ExecNode_Project(const std::shared_ptr& input, const std::vector>& exprs, std::vector names); extern "C" SEXP _arrow_ExecNode_Project(SEXP input_sexp, SEXP exprs_sexp, SEXP names_sexp){ BEGIN_CPP11 - arrow::r::Input>::type input(input_sexp); - arrow::r::Input>>::type exprs(exprs_sexp); + arrow::r::Input&>::type input(input_sexp); + arrow::r::Input>&>::type exprs(exprs_sexp); arrow::r::Input>::type names(names_sexp); return cpp11::as_sexp(ExecNode_Project(input, exprs, names)); END_CPP11 diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index f9b1c6e2818..dd341784899 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -36,8 +36,8 @@ std::shared_ptr ExecPlan_create() { // [[arrow::export]] std::shared_ptr ExecPlan_run( - std::shared_ptr plan, - std::shared_ptr final_node) { + const std::shared_ptr& plan, + const std::shared_ptr& final_node) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. auto sink_gen = compute::MakeSinkNode(final_node.get(), "sink"); @@ -65,9 +65,9 @@ std::shared_ptr ExecNodeOrStop( // [[arrow::export]] std::shared_ptr ExecNode_Scan( - std::shared_ptr plan, - std::shared_ptr dataset, - std::shared_ptr filter, + const std::shared_ptr& plan, + const std::shared_ptr& dataset, + const std::shared_ptr& filter, std::vector materialized_field_names) { // TODO: pass in ScanOptions by file type auto options = std::make_shared(); @@ -97,16 +97,16 @@ std::shared_ptr ExecNode_Scan( // [[arrow::export]] std::shared_ptr ExecNode_Filter( - std::shared_ptr input, - std::shared_ptr filter) { + const std::shared_ptr& input, + const std::shared_ptr& filter) { return ExecNodeOrStop( compute::MakeFilterNode(input.get(), /*label=*/"filter", *filter)); } // [[arrow::export]] std::shared_ptr ExecNode_Project( - std::shared_ptr input, - std::vector> exprs, + const std::shared_ptr& input, + const std::vector>& exprs, std::vector names) { // We have shared_ptrs of expressions but need the Expressions std::vector expressions; @@ -118,7 +118,7 @@ std::shared_ptr ExecNode_Project( } std::shared_ptr ExecNode_ScalarAggregate( - std::shared_ptr input, cpp11::list options, + const std::shared_ptr& input, cpp11::list options, std::vector targets, std::vector out_field_names) { // PROBLEM: need to keep these alive as long as the plan somehow. 
std::vector> keep_alives; From 1bc0789f6af3434a7ab61e49df2a3900724bda5b Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 15 Jul 2021 11:09:56 -0400 Subject: [PATCH 05/24] improve keepalive pattern --- cpp/src/arrow/compute/exec/exec_plan.cc | 35 +++++++++++++++------ cpp/src/arrow/compute/exec/exec_plan.h | 4 ++- cpp/src/arrow/compute/exec/plan_test.cc | 8 +++-- r/DESCRIPTION | 1 + r/R/arrowExports.R | 4 +-- r/R/query-engine.R | 7 +++-- r/src/arrowExports.cpp | 11 ++++--- r/src/compute-exec.cpp | 42 ++++++++++++++++--------- r/src/compute.cpp | 4 +-- 9 files changed, 77 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/compute/exec/exec_plan.cc b/cpp/src/arrow/compute/exec/exec_plan.cc index 20c8c347cc1..4a4758c8471 100644 --- a/cpp/src/arrow/compute/exec/exec_plan.cc +++ b/cpp/src/arrow/compute/exec/exec_plan.cc @@ -719,11 +719,13 @@ struct ScalarAggregateNode : ExecNode { ScalarAggregateNode(ExecNode* input, std::string label, std::shared_ptr output_schema, std::vector kernels, + std::vector argument_indices, std::vector>> states) : ExecNode(input->plan(), std::move(label), {input}, {"target"}, /*output_schema=*/std::move(output_schema), /*num_outputs=*/1), kernels_(std::move(kernels)), + argument_indices_(std::move(argument_indices)), states_(std::move(states)) {} const char* kind_name() override { return "ScalarAggregateNode"; } @@ -733,7 +735,7 @@ struct ScalarAggregateNode : ExecNode { KernelContext batch_ctx{plan()->exec_context()}; batch_ctx.SetState(states_[i][thread_index].get()); - ExecBatch single_column_batch{{batch.values[i]}, batch.length}; + ExecBatch single_column_batch{{batch[argument_indices_[i]]}, batch.length}; RETURN_NOT_OK(kernels_[i]->consume(&batch_ctx, single_column_batch)); } return Status::OK(); @@ -807,7 +809,8 @@ struct ScalarAggregateNode : ExecNode { } Future<> finished_ = Future<>::MakeFinished(); - std::vector kernels_; + const std::vector kernels_; + const std::vector argument_indices_; std::vector>> states_; @@ -816,11 +819,17 @@ struct ScalarAggregateNode : ExecNode { }; Result MakeScalarAggregateNode(ExecNode* input, std::string label, - std::vector aggregates) { - if (input->output_schema()->num_fields() != static_cast(aggregates.size())) { - return Status::Invalid("Provided ", aggregates.size(), - " aggregates, expected one for each field of ", - input->output_schema()->ToString()); + std::vector aggregates, + std::vector arguments, + std::vector out_field_names) { + if (aggregates.size() != arguments.size()) { + return Status::Invalid("Provided ", aggregates.size(), " aggregates but ", + arguments.size(), " arguments."); + } + + if (aggregates.size() != out_field_names.size()) { + return Status::Invalid("Provided ", aggregates.size(), " aggregates but ", + out_field_names.size(), " field names for the output."); } auto exec_ctx = input->plan()->exec_context(); @@ -828,8 +837,16 @@ Result MakeScalarAggregateNode(ExecNode* input, std::string label, std::vector kernels(aggregates.size()); std::vector>> states(kernels.size()); FieldVector fields(kernels.size()); + std::vector argument_indices(kernels.size()); for (size_t i = 0; i < kernels.size(); ++i) { + if (!arguments[i].IsName()) { + return Status::NotImplemented("Non name field refs"); + } + ARROW_ASSIGN_OR_RAISE(auto match, + arguments[i].FindOneOrNone(*input->output_schema())); + argument_indices[i] = match[0]; + ARROW_ASSIGN_OR_RAISE(auto function, exec_ctx->func_registry()->GetFunction(aggregates[i].function)); @@ -862,12 +879,12 @@ Result MakeScalarAggregateNode(ExecNode* 
input, std::string label, ARROW_ASSIGN_OR_RAISE( auto descr, kernels[i]->signature->out_type().Resolve(&kernel_ctx, {in_type})); - fields[i] = field(aggregates[i].function, std::move(descr.type)); + fields[i] = field(std::move(out_field_names[i]), std::move(descr.type)); } return input->plan()->EmplaceNode( input, std::move(label), schema(std::move(fields)), std::move(kernels), - std::move(states)); + std::move(argument_indices), std::move(states)); } namespace internal { diff --git a/cpp/src/arrow/compute/exec/exec_plan.h b/cpp/src/arrow/compute/exec/exec_plan.h index 07bb365bbc7..fc3af92af4a 100644 --- a/cpp/src/arrow/compute/exec/exec_plan.h +++ b/cpp/src/arrow/compute/exec/exec_plan.h @@ -285,7 +285,9 @@ Result MakeProjectNode(ExecNode* input, std::string label, ARROW_EXPORT Result MakeScalarAggregateNode(ExecNode* input, std::string label, - std::vector aggregates); + std::vector aggregates, + std::vector arguments, + std::vector out_field_names); /// \brief Make a node which groups input rows based on key fields and computes /// aggregates for each group diff --git a/cpp/src/arrow/compute/exec/plan_test.cc b/cpp/src/arrow/compute/exec/plan_test.cc index aa807468bcb..f7fce4dddef 100644 --- a/cpp/src/arrow/compute/exec/plan_test.cc +++ b/cpp/src/arrow/compute/exec/plan_test.cc @@ -531,9 +531,11 @@ TEST(ExecPlanExecution, SourceScalarAggSink) { MakeTestSourceNode(plan.get(), "source", basic_data, /*parallel=*/false, /*slow=*/false)); - ASSERT_OK_AND_ASSIGN(auto scalar_agg, - MakeScalarAggregateNode(source, "scalar_agg", - {{"sum", nullptr}, {"any", nullptr}})); + ASSERT_OK_AND_ASSIGN( + auto scalar_agg, + MakeScalarAggregateNode(source, "scalar_agg", {{"sum", nullptr}, {"any", nullptr}}, + /*targets=*/{"i32", "bool"}, + /*out_field_names=*/{"sum(i32)", "any(bool)"})); auto sink_gen = MakeSinkNode(scalar_agg, "sink"); diff --git a/r/DESCRIPTION b/r/DESCRIPTION index a0c4b61b7a0..3d10aa4745e 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -109,6 +109,7 @@ Collate: 'metadata.R' 'parquet.R' 'python.R' + 'query-engine.R' 'record-batch-reader.R' 'record-batch-writer.R' 'reexports-bit64.R' diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 250fd53f1a0..91553754672 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -280,8 +280,8 @@ io___CompressedInputStream__Make <- function(codec, raw){ .Call(`_arrow_io___CompressedInputStream__Make`, codec, raw) } -ExecPlan_create <- function(){ - .Call(`_arrow_ExecPlan_create`) +ExecPlan_create <- function(use_threads){ + .Call(`_arrow_ExecPlan_create`, use_threads) } ExecPlan_run <- function(plan, final_node){ diff --git a/r/R/query-engine.R b/r/R/query-engine.R index fa12396740e..614811e26a4 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -36,7 +36,8 @@ ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, filter <- Expression$scalar(TRUE) colnames <- names(dataset) } - # TODO: why do I _need_ to filter/project here? 
+ # ScanNode needs the filter to do predicate pushdown and skip partitions, + # and it needs to know which fields to materialize (and which are unnecessary) ExecNode_Scan(self, dataset, filter, colnames) }, Run = function(node) { @@ -45,7 +46,9 @@ ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, } ) ) -ExecPlan$create <- ExecPlan_create +ExecPlan$create <- function(use_threads = option_use_threads()) { + ExecPlan_create(use_threads) +} ExecNode <- R6Class("ExecNode", inherit = ArrowObject, public = list( diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 371aae96703..ff0acd50953 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1094,14 +1094,15 @@ extern "C" SEXP _arrow_io___CompressedInputStream__Make(SEXP codec_sexp, SEXP ra // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ExecPlan_create(); -extern "C" SEXP _arrow_ExecPlan_create(){ +std::shared_ptr ExecPlan_create(bool use_threads); +extern "C" SEXP _arrow_ExecPlan_create(SEXP use_threads_sexp){ BEGIN_CPP11 - return cpp11::as_sexp(ExecPlan_create()); + arrow::r::Input::type use_threads(use_threads_sexp); + return cpp11::as_sexp(ExecPlan_create(use_threads)); END_CPP11 } #else -extern "C" SEXP _arrow_ExecPlan_create(){ +extern "C" SEXP _arrow_ExecPlan_create(SEXP use_threads_sexp){ Rf_error("Cannot call ExecPlan_create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -7092,7 +7093,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, - { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 0}, + { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 1}, { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 2}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index dd341784899..e3fb08cfb0f 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -21,17 +21,32 @@ #include #include +#include #include #include +#include namespace compute = ::arrow::compute; std::shared_ptr make_compute_options(std::string func_name, cpp11::list options); +template +void AddKeepalive(compute::ExecPlan* plan, T keepalive) { + struct Callback { + void operator()(const arrow::Status&) && {} + T keepalive; + }; + plan->finished().AddCallback(Callback{std::move(keepalive)}); +} + // [[arrow::export]] -std::shared_ptr ExecPlan_create() { - return ValueOrStop(compute::ExecPlan::Make(gc_context())); +std::shared_ptr ExecPlan_create(bool use_threads) { + auto executor = use_threads ? 
arrow::internal::GetCpuThreadPool() : nullptr; + auto context = std::make_shared(gc_memory_pool(), executor); + auto plan = ValueOrStop(compute::ExecPlan::Make(context.get())); + AddKeepalive(plan.get(), std::move(context)); + return plan; } // [[arrow::export]] @@ -69,7 +84,7 @@ std::shared_ptr ExecNode_Scan( const std::shared_ptr& dataset, const std::shared_ptr& filter, std::vector materialized_field_names) { - // TODO: pass in ScanOptions by file type + // TODO: pass in FragmentScanOptions auto options = std::make_shared(); options->use_async = true; @@ -119,9 +134,7 @@ std::shared_ptr ExecNode_Project( std::shared_ptr ExecNode_ScalarAggregate( const std::shared_ptr& input, cpp11::list options, - std::vector targets, std::vector out_field_names) { - // PROBLEM: need to keep these alive as long as the plan somehow. - std::vector> keep_alives; + std::vector target_names, std::vector out_field_names) { std::vector aggregates; for (cpp11::list name_opts : options) { @@ -130,16 +143,17 @@ std::shared_ptr ExecNode_ScalarAggregate( aggregates.push_back( arrow::compute::internal::Aggregate{std::move(name), opts.get()}); - keep_alives.push_back(std::move(opts)); - } - auto scalar_agg = ValueOrStop(compute::MakeScalarAggregateNode( - input, /*label=*/"scalar_agg", aggregates, targets, out_field_names)); + AddKeepalive(input->plan(), std::move(opts)); + } - return std::shared_ptr(scalar_agg, [keep_alives](...) { - // empty destructor: ExecNode lifetime is managed by an ExecPlan - // also carries the function options - }); + std::vector targets; + for (auto&& name : target_names) { + targets.emplace_back(std::move(name)); + } + return ExecNodeOrStop(compute::MakeScalarAggregateNode( + input.get(), /*label=*/"scalar_agg", std::move(aggregates), std::move(targets), + std::move(out_field_names))); } #endif diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 142a460d2eb..30821137383 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -22,13 +22,11 @@ #include #include #include -#include std::shared_ptr make_cast_options(cpp11::list options); arrow::compute::ExecContext* gc_context() { - static arrow::compute::ExecContext context(gc_memory_pool(), - arrow::internal::GetCpuThreadPool()); + static arrow::compute::ExecContext context(gc_memory_pool()); return &context; } From b5b41a3ad357a1c4f7d643d4f9d55dd1e7f84f99 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 15 Jul 2021 13:58:16 -0400 Subject: [PATCH 06/24] Compiles but segfaults --- r/R/arrowExports.R | 4 ++++ r/R/dplyr-summarize.R | 10 +++++----- r/R/query-engine.R | 4 ++-- r/src/arrowExports.cpp | 21 ++++++++++++++++++++- r/src/compute-exec.cpp | 3 ++- 5 files changed, 33 insertions(+), 9 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 91553754672..a870e7fb372 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -300,6 +300,10 @@ ExecNode_Project <- function(input, exprs, names){ .Call(`_arrow_ExecNode_Project`, input, exprs, names) } +ExecNode_ScalarAggregate <- function(input, options, target_names, out_field_names){ + .Call(`_arrow_ExecNode_ScalarAggregate`, input, options, target_names, out_field_names) +} + RecordBatch__cast <- function(batch, schema, options){ .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 2db8a954918..05933a62b22 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -32,7 +32,7 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb 
dplyr::summarise(to_duckdb(.data), ...) } else if (isTRUE(getOption("arrow.summarize", FALSE))) { # Try stuff, if successful return() - out <- try(do_arrow_summarize(.data, ...), silent = TRUE) + out <- do_arrow_summarize(.data, ...) if (inherits(out, "try-error")) { return(abandon_ship(call, .data, format(out))) } else { @@ -63,7 +63,7 @@ do_arrow_summarize <- function(.data, ...) { list( fun = "sum", data = x, - options = list(na.rm = na.rm) + options = list(na.rm = na.rm, na.min_count = 0L) ) } results <- list() @@ -101,9 +101,9 @@ do_exec_plan <- function(.data) { project_node <- start_node$Project(.data$selected_columns) final_node <- project_node$ScalarAggregate( - options = .data$aggregates, - targets = names(.data), - out_field_names = names(.data$aggregates) + options = .data$aggregations, + target_names = names(.data), + out_field_names = names(.data$aggregations) ) plan$Run(final_node) } \ No newline at end of file diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 614811e26a4..c358b1de396 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -60,8 +60,8 @@ ExecNode <- R6Class("ExecNode", inherit = ArrowObject, assert_is(expr, "Expression") ExecNode_Filter(self, expr) }, - ScalarAggregate = function(options, targets, out_field_names) { - ExecNode_ScalarAggregate(self, options, targets, out_field_names) + ScalarAggregate = function(options, target_names, out_field_names) { + ExecNode_ScalarAggregate(self, options, target_names, out_field_names) } ) ) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index ff0acd50953..874361a2d8a 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1124,7 +1124,7 @@ extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ #endif // compute-exec.cpp -#if defined(ARROW_R_WITH_ARROW) +#if defined(ARROW_R_WITH_DATASET) std::shared_ptr ExecNode_Scan(const std::shared_ptr& plan, const std::shared_ptr& dataset, const std::shared_ptr& filter, std::vector materialized_field_names); extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP filter_sexp, SEXP materialized_field_names_sexp){ BEGIN_CPP11 @@ -1174,6 +1174,24 @@ extern "C" SEXP _arrow_ExecNode_Project(SEXP input_sexp, SEXP exprs_sexp, SEXP n } #endif +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExecNode_ScalarAggregate(const std::shared_ptr& input, cpp11::list options, std::vector target_names, std::vector out_field_names); +extern "C" SEXP _arrow_ExecNode_ScalarAggregate(SEXP input_sexp, SEXP options_sexp, SEXP target_names_sexp, SEXP out_field_names_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type input(input_sexp); + arrow::r::Input::type options(options_sexp); + arrow::r::Input>::type target_names(target_names_sexp); + arrow::r::Input>::type out_field_names(out_field_names_sexp); + return cpp11::as_sexp(ExecNode_ScalarAggregate(input, options, target_names, out_field_names)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExecNode_ScalarAggregate(SEXP input_sexp, SEXP options_sexp, SEXP target_names_sexp, SEXP out_field_names_sexp){ + Rf_error("Cannot call ExecNode_ScalarAggregate(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); +} +#endif + // compute.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, cpp11::list options); @@ -7098,6 +7116,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, + { "_arrow_ExecNode_ScalarAggregate", (DL_FUNC) &_arrow_ExecNode_ScalarAggregate, 4}, { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index e3fb08cfb0f..932566fc696 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -78,7 +78,7 @@ std::shared_ptr ExecNodeOrStop( #include -// [[arrow::export]] +// [[dataset::export]] std::shared_ptr ExecNode_Scan( const std::shared_ptr& plan, const std::shared_ptr& dataset, @@ -132,6 +132,7 @@ std::shared_ptr ExecNode_Project( input.get(), /*label=*/"project", std::move(expressions), std::move(names))); } +// [[arrow::export]] std::shared_ptr ExecNode_ScalarAggregate( const std::shared_ptr& input, cpp11::list options, std::vector target_names, std::vector out_field_names) { From f34c932be0be67dd3ec848c4c1bc08de4d92d056 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 15 Jul 2021 14:39:08 -0400 Subject: [PATCH 07/24] revert keepalives --- r/src/compute-exec.cpp | 23 ++++++++--------------- r/tests/testthat/test-dplyr-aggregate.R | 3 +-- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 932566fc696..f5a734db510 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -19,6 +19,8 @@ #if defined(ARROW_R_WITH_ARROW) +#include + #include #include #include @@ -31,21 +33,12 @@ namespace compute = ::arrow::compute; std::shared_ptr make_compute_options(std::string func_name, cpp11::list options); -template -void AddKeepalive(compute::ExecPlan* plan, T keepalive) { - struct Callback { - void operator()(const arrow::Status&) && {} - T keepalive; - }; - plan->finished().AddCallback(Callback{std::move(keepalive)}); -} - // [[arrow::export]] std::shared_ptr ExecPlan_create(bool use_threads) { - auto executor = use_threads ? arrow::internal::GetCpuThreadPool() : nullptr; - auto context = std::make_shared(gc_memory_pool(), executor); - auto plan = ValueOrStop(compute::ExecPlan::Make(context.get())); - AddKeepalive(plan.get(), std::move(context)); + static compute::ExecContext threaded_context{gc_memory_pool(), + arrow::internal::GetCpuThreadPool()}; + auto plan = ValueOrStop( + compute::ExecPlan::Make(use_threads ? 
&threaded_context : gc_context())); return plan; } @@ -137,6 +130,7 @@ std::shared_ptr ExecNode_ScalarAggregate( const std::shared_ptr& input, cpp11::list options, std::vector target_names, std::vector out_field_names) { std::vector aggregates; + std::vector> keep_alives; for (cpp11::list name_opts : options) { auto name = cpp11::as_cpp(name_opts[0]); @@ -144,8 +138,7 @@ std::shared_ptr ExecNode_ScalarAggregate( aggregates.push_back( arrow::compute::internal::Aggregate{std::move(name), opts.get()}); - - AddKeepalive(input->plan(), std::move(opts)); + keep_alives.push_back(std::move(opts)); } std::vector targets; diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 279e5448753..eec5357fe47 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -28,10 +28,9 @@ tbl$verses <- verses[[1]] tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = "both") test_that("Can aggregate", { - withr::local_options(list(arrow.summarize = TRUE)) expect_dplyr_equal( input %>% summarize(total = sum(int)), tbl ) -}) \ No newline at end of file +}) From 683dbcc3535f231f52855d7fcbda2d6f8087f325 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 15 Jul 2021 14:59:02 -0400 Subject: [PATCH 08/24] Actually run the tests --- r/tests/testthat/test-dplyr-aggregate.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index eec5357fe47..b615384ebea 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -28,9 +28,19 @@ tbl$verses <- verses[[1]] tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = "both") test_that("Can aggregate", { + withr::local_options(list(arrow.summarize = TRUE)) expect_dplyr_equal( input %>% - summarize(total = sum(int)), + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), tbl ) + # This is failing because the default is na.rm = FALSE + expect_dplyr_equal( + input %>% + summarize(total = sum(int)) %>% + collect(), + tbl + ) + }) From a1f676d40ead2fbbc3665f6f3ceb04c720f62390 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 15 Jul 2021 15:01:38 -0400 Subject: [PATCH 09/24] Restore docs --- r/man/ChunkedArray.Rd | 22 +++++++++++++++ r/man/Field.Rd | 5 ++++ r/man/FileFormat.Rd | 15 ++++++++++ r/man/ParquetFileReader.Rd | 12 ++++++++ r/man/RecordBatch.Rd | 11 ++++++++ r/man/RecordBatchReader.Rd | 37 +++++++++++++++++++++++++ r/man/RecordBatchWriter.Rd | 37 +++++++++++++++++++++++++ r/man/Scalar.Rd | 17 ++++++++++++ r/man/Schema.Rd | 9 ++++++ r/man/Table.Rd | 11 ++++++++ r/man/array.Rd | 23 ++++++++++++++++ r/man/buffer.Rd | 9 ++++++ r/man/call_function.Rd | 10 +++++++ r/man/codec_is_available.Rd | 5 ++++ r/man/copy_files.Rd | 10 +++++++ r/man/data-type.Rd | 8 ++++++ r/man/hive_partition.Rd | 5 ++++ r/man/list_compute_functions.Rd | 7 +++++ r/man/load_flight_server.Rd | 5 ++++ r/man/match_arrow.Rd | 25 +++++++++++++++++ r/man/open_dataset.Rd | 49 +++++++++++++++++++++++++++++++++ r/man/read_delim_arrow.Rd | 11 ++++++++ r/man/read_feather.Rd | 11 ++++++++ r/man/read_json_arrow.Rd | 12 ++++++++ r/man/read_parquet.Rd | 9 ++++++ r/man/s3_bucket.Rd | 5 ++++ r/man/type.Rd | 10 +++++++ r/man/unify_schemas.Rd | 7 +++++ r/man/value_counts.Rd | 6 ++++ r/man/write_csv_arrow.Rd | 7 +++++ r/man/write_feather.Rd | 7 +++++ r/man/write_ipc_stream.Rd | 7 +++++ r/man/write_parquet.Rd | 12 ++++++++ 
r/man/write_to_raw.Rd | 7 +++++ 34 files changed, 443 insertions(+) diff --git a/r/man/ChunkedArray.Rd b/r/man/ChunkedArray.Rd index 486b6222af7..3a504f01466 100644 --- a/r/man/ChunkedArray.Rd +++ b/r/man/ChunkedArray.Rd @@ -53,6 +53,28 @@ within the array's internal data. This can be an expensive check, potentially \c } } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Pass items into chunked_array as separate objects to create chunks +class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73)) +class_scores$num_chunks + +# When taking a Slice from a chunked_array, chunks are preserved +class_scores$Slice(2, length = 5) + +# You can combine Take and SortIndices to return a ChunkedArray with 1 chunk +# containing all values, ordered. +class_scores$Take(class_scores$SortIndices(descending = TRUE)) + +# If you pass a list into chunked_array, you get a list of length 1 +list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8))) +list_scores$num_chunks + +# When constructing a ChunkedArray, the first chunk is used to infer type. +doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L)) +doubles$type +\dontshow{\}) # examplesIf} +} \seealso{ \link{Array} } diff --git a/r/man/Field.Rd b/r/man/Field.Rd index 03dffd11ca9..77d31fa637a 100644 --- a/r/man/Field.Rd +++ b/r/man/Field.Rd @@ -28,3 +28,8 @@ field(name, type, metadata) } } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +field("x", int32()) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd index b8d4dc01bad..5bc9475b408 100644 --- a/r/man/FileFormat.Rd +++ b/r/man/FileFormat.Rd @@ -51,3 +51,18 @@ From \link{CsvFragmentScanOptions} (these values can be overridden at scan time) It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat}) } +\examples{ +\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +## Semi-colon delimited files +# Set up directory for examples +tf <- tempfile() +dir.create(tf) +on.exit(unlink(tf)) +write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE) + +# Create FileFormat object +format <- FileFormat$create(format = "text", delimiter = ";") + +open_dataset(tf, format = format) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/ParquetFileReader.Rd b/r/man/ParquetFileReader.Rd index 0b49df79d6b..31de9ead104 100644 --- a/r/man/ParquetFileReader.Rd +++ b/r/man/ParquetFileReader.Rd @@ -45,3 +45,15 @@ The optional \verb{column_indices=} argument is a 0-based integer vector indicat } } +\examples{ +\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +f <- system.file("v0.7.1.parquet", package="arrow") +pq <- ParquetFileReader$create(f) +pq$GetSchema() +if (codec_is_available("snappy")) { + # This file has compressed data columns + tab <- pq$ReadTable() + tab$schema +} +\dontshow{\}) # examplesIf} +} diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd index e3024b91b7a..ff08c215853 100644 --- a/r/man/RecordBatch.Rd +++ b/r/man/RecordBatch.Rd @@ -79,3 +79,14 @@ All list elements are coerced to string. 
See \code{schema()} for more information. } } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +batch <- record_batch(name = rownames(mtcars), mtcars) +dim(batch) +dim(head(batch)) +names(batch) +batch$mpg +batch[["cyl"]] +as.data.frame(batch[4:8, c("gear", "hp", "wt")]) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/RecordBatchReader.Rd b/r/man/RecordBatchReader.Rd index a206c30c8fb..90c796a6693 100644 --- a/r/man/RecordBatchReader.Rd +++ b/r/man/RecordBatchReader.Rd @@ -43,6 +43,43 @@ are in the file. } } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) + +batch <- record_batch(chickwts) + +# This opens a connection to the file in Arrow +file_obj <- FileOutputStream$create(tf) +# Pass that to a RecordBatchWriter to write data conforming to a schema +writer <- RecordBatchFileWriter$create(file_obj, batch$schema) +writer$write(batch) +# You may write additional batches to the stream, provided that they have +# the same schema. +# Call "close" on the writer to indicate end-of-file/stream +writer$close() +# Then, close the connection--closing the IPC message does not close the file +file_obj$close() + +# Now, we have a file we can read from. Same pattern: open file connection, +# then pass it to a RecordBatchReader +read_file_obj <- ReadableFile$create(tf) +reader <- RecordBatchFileReader$create(read_file_obj) +# RecordBatchFileReader knows how many batches it has (StreamReader does not) +reader$num_record_batches +# We could consume the Reader by calling $read_next_batch() until all are +# consumed, or we can call $read_table() to pull them all into a Table +tab <- reader$read_table() +# Call as.data.frame to turn that Table into an R data.frame +df <- as.data.frame(tab) +# This should be the same data we sent +all.equal(df, chickwts, check.attributes = FALSE) +# Unlike the Writers, we don't have to close RecordBatchReaders, +# but we do still need to close the file connection +read_file_obj$close() +\dontshow{\}) # examplesIf} +} \seealso{ \code{\link[=read_ipc_stream]{read_ipc_stream()}} and \code{\link[=read_feather]{read_feather()}} provide a much simpler interface for reading data from these formats and are sufficient for many use cases. diff --git a/r/man/RecordBatchWriter.Rd b/r/man/RecordBatchWriter.Rd index cc6d2feb3ac..219c150e6a4 100644 --- a/r/man/RecordBatchWriter.Rd +++ b/r/man/RecordBatchWriter.Rd @@ -45,6 +45,43 @@ to be closed separately. } } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) + +batch <- record_batch(chickwts) + +# This opens a connection to the file in Arrow +file_obj <- FileOutputStream$create(tf) +# Pass that to a RecordBatchWriter to write data conforming to a schema +writer <- RecordBatchFileWriter$create(file_obj, batch$schema) +writer$write(batch) +# You may write additional batches to the stream, provided that they have +# the same schema. +# Call "close" on the writer to indicate end-of-file/stream +writer$close() +# Then, close the connection--closing the IPC message does not close the file +file_obj$close() + +# Now, we have a file we can read from.
Same pattern: open file connection, +# then pass it to a RecordBatchReader +read_file_obj <- ReadableFile$create(tf) +reader <- RecordBatchFileReader$create(read_file_obj) +# RecordBatchFileReader knows how many batches it has (StreamReader does not) +reader$num_record_batches +# We could consume the Reader by calling $read_next_batch() until all are +# consumed, or we can call $read_table() to pull them all into a Table +tab <- reader$read_table() +# Call as.data.frame to turn that Table into an R data.frame +df <- as.data.frame(tab) +# This should be the same data we sent +all.equal(df, chickwts, check.attributes = FALSE) +# Unlike the Writers, we don't have to close RecordBatchReaders, +# but we do still need to close the file connection +read_file_obj$close() +\dontshow{\}) # examplesIf} +} \seealso{ \code{\link[=write_ipc_stream]{write_ipc_stream()}} and \code{\link[=write_feather]{write_feather()}} provide a much simpler interface for writing data to these formats and are sufficient for many use diff --git a/r/man/Scalar.Rd b/r/man/Scalar.Rd index 9128988d11c..21e04c12e08 100644 --- a/r/man/Scalar.Rd +++ b/r/man/Scalar.Rd @@ -19,3 +19,20 @@ A \code{Scalar} holds a single value of an Arrow type. \verb{$type}: Scalar type } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +Scalar$create(pi) +Scalar$create(404) +# If you pass a vector into Scalar$create, you get a list containing your items +Scalar$create(c(1, 2, 3)) + +# Comparisons +my_scalar <- Scalar$create(99) +my_scalar$ApproxEquals(Scalar$create(99.00001)) # FALSE +my_scalar$ApproxEquals(Scalar$create(99.000009)) # TRUE +my_scalar$Equals(Scalar$create(99.000009)) # FALSE +my_scalar$Equals(Scalar$create(99L)) # FALSE (types don't match) + +my_scalar$ToString() +\dontshow{\}) # examplesIf} +} diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd index 0c66e5c2a42..6e385bb804e 100644 --- a/r/man/Schema.Rd +++ b/r/man/Schema.Rd @@ -74,3 +74,12 @@ Files with compressed metadata are readable by older versions of arrow, but the metadata is dropped. } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5)) +tab1 <- Table$create(df) +tab1$schema +tab2 <- Table$create(df, schema = schema(col1 = int8(), col2 = float32())) +tab2$schema +\dontshow{\}) # examplesIf} +} diff --git a/r/man/Table.Rd b/r/man/Table.Rd index d955b0f5a29..2675943e572 100644 --- a/r/man/Table.Rd +++ b/r/man/Table.Rd @@ -79,3 +79,14 @@ All list elements are coerced to string. See \code{schema()} for more information. } } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tab <- Table$create(name = rownames(mtcars), mtcars) +dim(tab) +dim(head(tab)) +names(tab) +tab$mpg +tab[["cyl"]] +as.data.frame(tab[4:8, c("gear", "hp", "wt")]) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/array.Rd b/r/man/array.Rd index ed25a2b0a34..71957aff90c 100644 --- a/r/man/array.Rd +++ b/r/man/array.Rd @@ -82,3 +82,26 @@ within the array's internal data.
This can be an expensive check, potentially \c } } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +my_array <- Array$create(1:10) +my_array$type +my_array$cast(int8()) + +# Check if value is null; zero-indexed +na_array <- Array$create(c(1:5, NA)) +na_array$IsNull(0) +na_array$IsNull(5) +na_array$IsValid(5) +na_array$null_count + +# zero-copy slicing; the offset of the new Array will be the same as the index passed to $Slice +new_array <- na_array$Slice(5) +new_array$offset + +# Compare 2 arrays +na_array2 = na_array +na_array2 == na_array # element-wise comparison +na_array2$Equals(na_array) # overall comparison +\dontshow{\}) # examplesIf} +} diff --git a/r/man/buffer.Rd b/r/man/buffer.Rd index 99b636da3c7..a3ca1fc2fcb 100644 --- a/r/man/buffer.Rd +++ b/r/man/buffer.Rd @@ -33,3 +33,12 @@ contiguous memory with a particular size. } } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +my_buffer <- buffer(c(1, 2, 3, 4)) +my_buffer$is_mutable +my_buffer$ZeroPadding() +my_buffer$size +my_buffer$capacity +\dontshow{\}) # examplesIf} +} diff --git a/r/man/call_function.Rd b/r/man/call_function.Rd index 7e9b7e50ea0..bef89f10b18 100644 --- a/r/man/call_function.Rd +++ b/r/man/call_function.Rd @@ -35,6 +35,16 @@ are callable with an \code{arrow_} prefix. When passing indices in \code{...}, \code{args}, or \code{options}, express them as 0-based integers (consistent with C++). } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +a <- Array$create(c(1L, 2L, 3L, NA, 5L)) +s <- Scalar$create(4L) +call_function("fill_null", a, s) + +a <- Array$create(rnorm(10000)) +call_function("quantile", a, options = list(q = seq(0, 1, 0.25))) +\dontshow{\}) # examplesIf} +} \seealso{ \href{https://arrow.apache.org/docs/cpp/compute.html}{Arrow C++ documentation} for the functions and their respective options. diff --git a/r/man/codec_is_available.Rd b/r/man/codec_is_available.Rd index 1b5e8278fa9..b3238ff1dca 100644 --- a/r/man/codec_is_available.Rd +++ b/r/man/codec_is_available.Rd @@ -18,3 +18,8 @@ Support for compression libraries depends on the build-time settings of the Arrow C++ library. This function lets you know which are available for use. 
} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +codec_is_available("gzip") +\dontshow{\}) # examplesIf} +} diff --git a/r/man/copy_files.Rd b/r/man/copy_files.Rd index 75cc4405d8a..1b83703f19f 100644 --- a/r/man/copy_files.Rd +++ b/r/man/copy_files.Rd @@ -23,3 +23,13 @@ Nothing: called for side effects in the file system \description{ Copy files between FileSystems } +\examples{ +\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Copy an S3 bucket's files to a local directory: +copy_files("s3://your-bucket-name", "local-directory") +# Using a FileSystem object +copy_files(s3_bucket("your-bucket-name"), "local-directory") +# Or go the other way, from local to S3 +copy_files("local-directory", s3_bucket("your-bucket-name")) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index 101702a2fb2..a0631897573 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -150,6 +150,14 @@ are translated to R objects, \code{uint32} and \code{uint64} are converted to \c types, this conversion can be disabled (so that \code{int64} always yields a \code{bit64::integer64} object) by setting \code{options(arrow.int64_downcast = FALSE)}. } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +bool() +struct(a = int32(), b = double()) +timestamp("ms", timezone = "CEST") +time64("ns") +\dontshow{\}) # examplesIf} +} \seealso{ \code{\link[=dictionary]{dictionary()}} for creating a dictionary (factor-like) type. } diff --git a/r/man/hive_partition.Rd b/r/man/hive_partition.Rd index 39d5d8d0ae2..eef9f9157ea 100644 --- a/r/man/hive_partition.Rd +++ b/r/man/hive_partition.Rd @@ -28,3 +28,8 @@ Hive partitioning embeds field names and values in path segments, such as Because fields are named in the path segments, order of fields passed to \code{hive_partition()} does not matter. } +\examples{ +\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +hive_partition(year = int16(), month = int8()) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/list_compute_functions.Rd b/r/man/list_compute_functions.Rd index ba17688d833..668e090c0ca 100644 --- a/r/man/list_compute_functions.Rd +++ b/r/man/list_compute_functions.Rd @@ -37,3 +37,10 @@ The package includes Arrow methods for many base R functions that can be called directly on Arrow objects, as well as some tidyverse-flavored versions available inside \code{dplyr} verbs. 
} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +list_compute_functions() +list_compute_functions(pattern = "^UTF8", ignore.case = TRUE) +list_compute_functions(pattern = "^is", invert = TRUE) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/load_flight_server.Rd b/r/man/load_flight_server.Rd index 7e2000a9ca2..66d30f39147 100644 --- a/r/man/load_flight_server.Rd +++ b/r/man/load_flight_server.Rd @@ -15,3 +15,8 @@ to look in the \verb{inst/} directory for included modules.} \description{ Load a Python Flight server } +\examples{ +\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +load_flight_server("demo_flight_server") +\dontshow{\}) # examplesIf} +} diff --git a/r/man/match_arrow.Rd b/r/man/match_arrow.Rd index 21481af4c6b..d63ef3eed87 100644 --- a/r/man/match_arrow.Rd +++ b/r/man/match_arrow.Rd @@ -26,3 +26,28 @@ per element of \code{x} if it is present in \code{table}. \code{base::match()} is not a generic, so we can't just define Arrow methods for it. This function exposes the analogous functions in the Arrow C++ library. } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# note that the returned value is 0-indexed +cars_tbl <- Table$create(name = rownames(mtcars), mtcars) +match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name) + +is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name) + +# Although there are multiple matches, you are returned the index of the first +# match, as with the base R equivalent +match(4, mtcars$cyl) # 1-indexed +match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed + +# If `x` contains multiple values, you are returned the indices of the first +# match for each value. +match(c(4, 6, 8), mtcars$cyl) +match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl) + +# Return type matches type of `x` +is_in(c(4, 6, 8), mtcars$cyl) # returns vector +is_in(Scalar$create(4), mtcars$cyl) # returns Scalar +is_in(Array$create(c(4, 6, 8)), cars_tbl$cyl) # returns Array +is_in(ChunkedArray$create(c(4, 6), 8), cars_tbl$cyl) # returns ChunkedArray +\dontshow{\}) # examplesIf} +} diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd index 974d4286f59..1ca3d661880 100644 --- a/r/man/open_dataset.Rd +++ b/r/man/open_dataset.Rd @@ -90,6 +90,55 @@ can accelerate queries that only touch some partitions (files). Call \code{open_dataset()} to point to a directory of data files and return a \code{Dataset}, then use \code{dplyr} methods to query it. } +\examples{ +\dontshow{if (arrow_with_dataset() & arrow_with_parquet() ) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Set up directory for examples +tf <- tempfile() +dir.create(tf) +on.exit(unlink(tf)) + +data <- dplyr::group_by(mtcars, cyl) +write_dataset(data, tf) + +# You can specify a directory containing the files for your dataset and +# open_dataset will scan all files in your directory. +open_dataset(tf) + +# You can also supply a vector of paths +open_dataset(c(file.path(tf, "cyl=4/part-1.parquet"), file.path(tf,"cyl=8/part-2.parquet"))) + +## You must specify the file format if using a format other than parquet.
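# A variation on the IPC example that follows: the format can also be passed
# as a FileFormat object instead of a string (tf_ipc is a scratch directory
# introduced only for this sketch):
tf_ipc <- tempfile()
dir.create(tf_ipc)
on.exit(unlink(tf_ipc))
write_dataset(data, tf_ipc, format = "ipc")
open_dataset(tf_ipc, format = FileFormat$create("ipc"))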
+tf2 <- tempfile() +dir.create(tf2) +on.exit(unlink(tf2)) +write_dataset(data, tf2, format = "ipc") +# This line will result in errors when you try to work with the data +\dontrun{open_dataset(tf2)} +# This line will work +open_dataset(tf2, format = "ipc") + +## You can specify file partitioning to include it as a field in your dataset +# Create a temporary directory and write example dataset +tf3 <- tempfile() +dir.create(tf3) +on.exit(unlink(tf3)) +write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style = FALSE) + +# View files - you can see the partitioning means that files have been written +# to folders based on Month/Day values +list.files(tf3, recursive = TRUE) + +# With no partitioning specified, dataset contains all files but doesn't include +# directory names as field names +open_dataset(tf3) + +# Now that partitioning has been specified, your dataset contains columns for Month and Day +open_dataset(tf3, partitioning = c("Month", "Day")) + +# If you want to specify the data types for your fields, you can pass in a Schema +open_dataset(tf3, partitioning = schema(Month = int8(), Day = int8())) +\dontshow{\}) # examplesIf} +} \seealso{ \code{vignette("dataset", package = "arrow")} } diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index d9c80306931..71394e547c9 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -205,3 +205,14 @@ Note that if you are specifying column names, whether by \code{schema} or to identify column names, you'll need to add \code{skip = 1} to skip that row. } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(mtcars, file = tf) + df <- read_csv_arrow(tf) + dim(df) + # Can select columns + df <- read_csv_arrow(tf, col_select = starts_with("d")) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index fa18e3f7844..95f4d1d12c6 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -34,6 +34,17 @@ and to make sharing data across data analysis languages easy. This function reads both the original, limited specification of the format and the version 2 specification, which is the Apache Arrow IPC file format. } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_feather(mtcars, tf) +df <- read_feather(tf) +dim(df) +# Can select columns +df <- read_feather(tf, col_select = starts_with("d")) +\dontshow{\}) # examplesIf} +} \seealso{ \link{FeatherReader} and \link{RecordBatchReader} for lower-level access to reading Arrow IPC data. } diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index 476c99fe4de..4806b4ad1f0 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -38,3 +38,15 @@ A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
\description{ Using \link{JsonTableReader} } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} + tf <- tempfile() + on.exit(unlink(tf)) + writeLines(' + { "hello": 3.5, "world": false, "yo": "thing" } + { "hello": 3.25, "world": null } + { "hello": 0.0, "world": true, "yo": null } + ', tf, useBytes=TRUE) + df <- read_json_arrow(tf) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index ffb2cf7109f..056e8644747 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -39,3 +39,12 @@ A \link[=Table]{arrow::Table}, or a \code{data.frame} if \code{as_data_frame} is '\href{https://parquet.apache.org/}{Parquet}' is a columnar storage file format. This function enables you to read Parquet files into R. } +\examples{ +\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_parquet(mtcars, tf) +df <- read_parquet(tf, col_select = starts_with("d")) +head(df) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/s3_bucket.Rd b/r/man/s3_bucket.Rd index 78d527a56c4..95a086deae5 100644 --- a/r/man/s3_bucket.Rd +++ b/r/man/s3_bucket.Rd @@ -21,3 +21,8 @@ are authorized to access the bucket's contents. that automatically detects the bucket's AWS region and holds onto its relative path. } +\examples{ +\dontshow{if (arrow_with_s3()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +bucket <- s3_bucket("ursa-labs-taxi-data") +\dontshow{\}) # examplesIf} +} diff --git a/r/man/type.Rd b/r/man/type.Rd index 2f85e4a6ac6..d55bbe24bd5 100644 --- a/r/man/type.Rd +++ b/r/man/type.Rd @@ -15,3 +15,13 @@ an arrow logical type \description{ infer the arrow Array type from an R vector } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +type(1:10) +type(1L:10L) +type(c(1, 1.5, 2)) +type(c("A", "B", "C")) +type(mtcars) +type(Sys.Date()) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/unify_schemas.Rd b/r/man/unify_schemas.Rd index 709e33a5e74..50c80c2dda9 100644 --- a/r/man/unify_schemas.Rd +++ b/r/man/unify_schemas.Rd @@ -18,3 +18,10 @@ A \code{Schema} with the union of fields contained in the inputs, or \description{ Combine and harmonize schemas } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +a <- schema(b = double(), c = bool()) +z <- schema(b = double(), k = utf8()) +unify_schemas(a, z) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/value_counts.Rd b/r/man/value_counts.Rd index 139af8edc63..6ef77cd4727 100644 --- a/r/man/value_counts.Rd +++ b/r/man/value_counts.Rd @@ -16,3 +16,9 @@ A \code{StructArray} containing "values" (same type as \code{x}) and "counts" \description{ This function tabulates the values in the array and returns a table of counts. } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +cyl_vals <- Array$create(mtcars$cyl) +value_counts(cyl_vals) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd index d6df2bcd08e..55a239ca998 100644 --- a/r/man/write_csv_arrow.Rd +++ b/r/man/write_csv_arrow.Rd @@ -23,3 +23,10 @@ the stream will be left open.
\description{ Write CSV file to disk } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_csv_arrow(mtcars, tf) +\dontshow{\}) # examplesIf} +} diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 0cc8c591369..c6273b61be8 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -47,6 +47,13 @@ and to make sharing data across data analysis languages easy. This function writes both the original, limited specification of the format and the version 2 specification, which is the Apache Arrow IPC file format. } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_feather(mtcars, tf) +\dontshow{\}) # examplesIf} +} \seealso{ \link{RecordBatchWriter} for lower-level access to writing Arrow IPC data. diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd index 4f742ce9178..888d947eb99 100644 --- a/r/man/write_ipc_stream.Rd +++ b/r/man/write_ipc_stream.Rd @@ -31,6 +31,13 @@ with some nonstandard behavior, is deprecated. You should explicitly choose the function that will write the desired IPC format (stream or file) since either can be written to a file or \code{OutputStream}. } +\examples{ +\dontshow{if (arrow_available() ) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_ipc_stream(mtcars, tf) +\dontshow{\}) # examplesIf} +} \seealso{ \code{\link[=write_feather]{write_feather()}} for writing IPC files. \code{\link[=write_to_raw]{write_to_raw()}} to serialize data to a buffer. diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index 823a6038e84..d7147f7e8e6 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -94,3 +94,15 @@ The default "snappy" is used if available, otherwise "uncompressed". To disable compression, set \code{compression = "uncompressed"}. Note that "uncompressed" columns may still have dictionary encoding. } +\examples{ +\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf1 <- tempfile(fileext = ".parquet") +write_parquet(data.frame(x = 1:5), tf1) + +# using compression +if (codec_is_available("gzip")) { + tf2 <- tempfile(fileext = ".gz.parquet") + write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5) +} +\dontshow{\}) # examplesIf} +} diff --git a/r/man/write_to_raw.Rd b/r/man/write_to_raw.Rd index 46af09a96e8..1f507e384c3 100644 --- a/r/man/write_to_raw.Rd +++ b/r/man/write_to_raw.Rd @@ -20,3 +20,10 @@ the data (\code{data.frame}, \code{RecordBatch}, or \code{Table}) they were give This function wraps those so that you can serialize data to a buffer and access that buffer as a \code{raw} vector in R. 
} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# The default format is "stream" +write_to_raw(mtcars) +write_to_raw(mtcars, format = "file") +\dontshow{\}) # examplesIf} +} From 100a178f1626636ada6637ab1584f5f97333eacd Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 15 Jul 2021 15:02:27 -0400 Subject: [PATCH 10/24] Restore try() --- r/R/dplyr-summarize.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 05933a62b22..6f6a5f31f31 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -32,7 +32,7 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb dplyr::summarise(to_duckdb(.data), ...) } else if (isTRUE(getOption("arrow.summarize", FALSE))) { # Try stuff, if successful return() - out <- do_arrow_summarize(.data, ...) + out <- try(do_arrow_summarize(.data, ...), silent = TRUE) if (inherits(out, "try-error")) { return(abandon_ship(call, .data, format(out))) } else { From d3190a23203c5865a00ee0f36a451b6364d65ac6 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 15 Jul 2021 15:34:34 -0400 Subject: [PATCH 11/24] Use FieldsInExpression to project in Scan --- r/R/arrowExports.R | 8 ++++++-- r/R/query-engine.R | 5 +++-- r/src/arrowExports.cpp | 30 +++++++++++++++++++++++------- r/src/expression.cpp | 15 +++++++++++++-- 4 files changed, 45 insertions(+), 13 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index a870e7fb372..e30a4d35d72 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -828,14 +828,18 @@ compute___expr__call <- function(func_name, argument_list, options){ .Call(`_arrow_compute___expr__call`, func_name, argument_list, options) } -compute___expr__field_ref <- function(name){ - .Call(`_arrow_compute___expr__field_ref`, name) +field_names_in_expression <- function(x){ + .Call(`_arrow_field_names_in_expression`, x) } compute___expr__get_field_ref_name <- function(x){ .Call(`_arrow_compute___expr__get_field_ref_name`, x) } +compute___expr__field_ref <- function(name){ + .Call(`_arrow_compute___expr__field_ref`, name) +} + compute___expr__scalar <- function(x){ .Call(`_arrow_compute___expr__scalar`, x) } diff --git a/r/R/query-engine.R b/r/R/query-engine.R index c358b1de396..6822bc0be9b 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -24,8 +24,9 @@ ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, if (isTRUE(filter)) { filter <- Expression$scalar(TRUE) } - # TODO: use FieldsInExpression to find all from dataset$selected_columns - colnames <- names(dataset$.data) + # Use FieldsInExpression to find all from dataset$selected_columns + colnames <- unique(unlist(map(dataset$selected_columns, + field_names_in_expression))) dataset <- dataset$.data } else { if (inherits(dataset, "ArrowTabular")) { diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 874361a2d8a..336d3f3824a 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3223,16 +3223,16 @@ extern "C" SEXP _arrow_compute___expr__call(SEXP func_name_sexp, SEXP argument_l // expression.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr compute___expr__field_ref(std::string name); -extern "C" SEXP _arrow_compute___expr__field_ref(SEXP name_sexp){ +std::vector field_names_in_expression(const std::shared_ptr& x); +extern "C" SEXP _arrow_field_names_in_expression(SEXP x_sexp){ BEGIN_CPP11 - arrow::r::Input::type name(name_sexp); - return 
cpp11::as_sexp(compute___expr__field_ref(name)); + arrow::r::Input&>::type x(x_sexp); + return cpp11::as_sexp(field_names_in_expression(x)); END_CPP11 } #else -extern "C" SEXP _arrow_compute___expr__field_ref(SEXP name_sexp){ - Rf_error("Cannot call compute___expr__field_ref(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +extern "C" SEXP _arrow_field_names_in_expression(SEXP x_sexp){ + Rf_error("Cannot call field_names_in_expression(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -3251,6 +3251,21 @@ extern "C" SEXP _arrow_compute___expr__get_field_ref_name(SEXP x_sexp){ } #endif +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr compute___expr__field_ref(std::string name); +extern "C" SEXP _arrow_compute___expr__field_ref(SEXP name_sexp){ +BEGIN_CPP11 + arrow::r::Input::type name(name_sexp); + return cpp11::as_sexp(compute___expr__field_ref(name)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_compute___expr__field_ref(SEXP name_sexp){ + Rf_error("Cannot call compute___expr__field_ref(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // expression.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr compute___expr__scalar(const std::shared_ptr& x); @@ -7248,8 +7263,9 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1}, { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1}, { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, - { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, + { "_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1}, { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, + { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, diff --git a/r/src/expression.cpp b/r/src/expression.cpp index 4b671cb99dd..3fcba46e911 100644 --- a/r/src/expression.cpp +++ b/r/src/expression.cpp @@ -44,8 +44,14 @@ std::shared_ptr compute___expr__call(std::string func_name, } // [[arrow::export]] -std::shared_ptr compute___expr__field_ref(std::string name) { - return std::make_shared(compute::field_ref(std::move(name))); +std::vector field_names_in_expression( + const std::shared_ptr& x) { + std::vector out; + auto field_refs = FieldsInExpression(*x); + for (auto f : field_refs) { + out.push_back(*f.name()); + } + return out; } // [[arrow::export]] @@ -57,6 +63,11 @@ std::string compute___expr__get_field_ref_name( return ""; } +// [[arrow::export]] +std::shared_ptr compute___expr__field_ref(std::string name) { + return std::make_shared(compute::field_ref(std::move(name))); +} + // [[arrow::export]] std::shared_ptr compute___expr__scalar( const std::shared_ptr& x) { From 1107cd25bcccad735d6904d0da00889721304043 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 26 Jul 2021 13:50:59 -0400 Subject: [PATCH 12/24] repair merge error --- cpp/src/arrow/compute/exec/plan_test.cc | 3 ++- cpp/src/arrow/dataset/scanner.cc | 9 +++++---- 
cpp/src/arrow/dataset/scanner_test.cc | 14 ++++++++------ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/exec/plan_test.cc b/cpp/src/arrow/compute/exec/plan_test.cc index f7fce4dddef..101257f5de8 100644 --- a/cpp/src/arrow/compute/exec/plan_test.cc +++ b/cpp/src/arrow/compute/exec/plan_test.cc @@ -567,7 +567,8 @@ TEST(ExecPlanExecution, ScalarSourceScalarAggSink) { ASSERT_OK_AND_ASSIGN( auto scalar_agg, MakeScalarAggregateNode(source, "scalar_agg", - {{"count", nullptr}, {"sum", nullptr}, {"mean", nullptr}})); + {{"count", nullptr}, {"sum", nullptr}, {"mean", nullptr}}, + {"a", "b", "c"}, {"sum a", "sum b", "sum c"})); auto sink_gen = MakeSinkNode(scalar_agg, "sink"); diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 192f84f46df..d81b9cd1c5c 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -816,14 +816,15 @@ Result AsyncScanner::CountRows() { ARROW_ASSIGN_OR_RAISE(auto scan, MakeScanNode(plan.get(), std::move(fragment_gen), options)); - ARROW_ASSIGN_OR_RAISE( - auto get_selection, - compute::MakeProjectNode(scan, "get_selection", {options->filter})); + ARROW_ASSIGN_OR_RAISE(auto get_selection, + compute::MakeProjectNode(scan, "get_selection", {options->filter}, + {"selection_mask"})); ARROW_ASSIGN_OR_RAISE( auto sum_selection, compute::MakeScalarAggregateNode(get_selection, "sum_selection", - {compute::internal::Aggregate{"sum", nullptr}})); + {compute::internal::Aggregate{"sum", nullptr}}, + {"selection_mask"}, {"sum"})); AsyncGenerator> sink_gen = compute::MakeSinkNode(sum_selection, "sink"); diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index de7f780183a..34fa1486ef2 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1471,14 +1471,16 @@ TEST(ScanNode, MinimalScalarAggEndToEnd) { ASSERT_OK_AND_ASSIGN( compute::ExecNode * sum, compute::MakeScalarAggregateNode(project, "scalar_agg", - {compute::internal::Aggregate{"sum", nullptr}})); + {compute::internal::Aggregate{"sum", nullptr}}, + {a_times_2.ToString()}, {"a*2 sum"})); // finally, pipe the project node into a sink node auto sink_gen = compute::MakeSinkNode(sum, "sink"); // translate sink_gen (async) to sink_reader (sync) - std::shared_ptr sink_reader = compute::MakeGeneratorReader( - schema({field("sum", int64())}), std::move(sink_gen), exec_context.memory_pool()); + std::shared_ptr sink_reader = + compute::MakeGeneratorReader(schema({field("a*2 sum", int64())}), + std::move(sink_gen), exec_context.memory_pool()); // start the ExecPlan ASSERT_OK(plan->StartProducing()); @@ -1489,9 +1491,9 @@ TEST(ScanNode, MinimalScalarAggEndToEnd) { // wait 1s for completion ASSERT_TRUE(plan->finished().Wait(/*seconds=*/1)) << "ExecPlan didn't finish within 1s"; - auto expected = TableFromJSON(schema({field("sum", int64())}), { - R"([ - {"sum": 4} + auto expected = TableFromJSON(schema({field("a*2 sum", int64())}), { + R"([ + {"a*2 sum": 4} ])"}); AssertTablesEqual(*expected, *collected, /*same_chunk_layout=*/false); } From 2576f59e8f4d73189733b4c5220d7f0097872de1 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 26 Jul 2021 15:00:25 -0400 Subject: [PATCH 13/24] Basic exercise of GroupByNode --- r/R/arrowExports.R | 4 +++ r/R/dplyr-summarize.R | 47 +++++++++++++++++-------- r/R/query-engine.R | 3 ++ r/src/arrowExports.cpp | 19 ++++++++++ r/src/compute-exec.cpp | 32 ++++++++++++++--- r/tests/testthat/test-dplyr-aggregate.R | 15 ++++++-- 6 
files changed, 99 insertions(+), 21 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index e30a4d35d72..268a17ef4f4 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -304,6 +304,10 @@ ExecNode_ScalarAggregate <- function(input, options, target_names, out_field_nam .Call(`_arrow_ExecNode_ScalarAggregate`, input, options, target_names, out_field_names) } +ExecNode_GroupByAggregate <- function(input, group_vars, agg_srcs, aggregations){ + .Call(`_arrow_ExecNode_GroupByAggregate`, input, group_vars, agg_srcs, aggregations) +} + RecordBatch__cast <- function(batch, schema, options){ .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 6f6a5f31f31..217230d9ab2 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -46,10 +46,6 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query do_arrow_summarize <- function(.data, ...) { - if (length(dplyr::group_vars(.data))) { - stop("Grouped aggregation not supprted in Arrow", call. = FALSE) - } - exprs <- quos(...) # Check for unnamed expressions and fix if any unnamed <- !nzchar(names(exprs)) @@ -90,20 +86,41 @@ do_arrow_summarize <- function(.data, ...) { .data$selected_columns <- inputs # Eventually, we will return .data here if (dataset) but do it eagerly now - do_exec_plan(.data) + do_exec_plan(.data, group_vars = dplyr::group_vars(.data)) } -do_exec_plan <- function(.data) { +do_exec_plan <- function(.data, group_vars = NULL) { plan <- ExecPlan$create() - # Scan also will filter and select columns, so we don't need to Filter - start_node <- plan$Scan(.data) - # If any columns are derived we need to Project (otherwise this may be no-op) - project_node <- start_node$Project(.data$selected_columns) - final_node <- project_node$ScalarAggregate( - options = .data$aggregations, - target_names = names(.data), - out_field_names = names(.data$aggregations) - ) + if (length(group_vars) == 0) { + # Scan also will filter and select columns, so we don't need to Filter + start_node <- plan$Scan(.data) + # If any columns are derived we need to Project (otherwise this may be no-op) + project_node <- start_node$Project(.data$selected_columns) + final_node <- project_node$ScalarAggregate( + options = .data$aggregations, + target_names = names(.data), + out_field_names = names(.data$aggregations) + ) + } else { + # Collect the target names first because we have to add back the group vars + target_names <- names(.data) + .data <- ensure_group_vars(.data) + + # We also need to prefix all of the aggregation function names with "hash_" + .data$aggregations <- lapply(.data$aggregations, function(x) { + x[["fun"]] <- paste0("hash_", x[["fun"]]) + x + }) + # Scan also will filter and select columns, so we don't need to Filter + start_node <- plan$Scan(.data) + # If any columns are derived we need to Project (otherwise this may be no-op) + project_node <- start_node$Project(.data$selected_columns) + final_node <- project_node$GroupByAggregate( + group_vars, + target_names = target_names, + aggregations = .data$aggregations + ) + } plan$Run(final_node) } \ No newline at end of file diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 6822bc0be9b..1d1125628e1 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -63,6 +63,9 @@ ExecNode <- R6Class("ExecNode", inherit = ArrowObject, }, ScalarAggregate = function(options, target_names, out_field_names) { 
ExecNode_ScalarAggregate(self, options, target_names, out_field_names) + }, + GroupByAggregate = function(group_vars, target_names, aggregations) { + ExecNode_GroupByAggregate(self, group_vars, target_names, aggregations) } ) ) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 336d3f3824a..92ddbae23fd 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1192,6 +1192,24 @@ extern "C" SEXP _arrow_ExecNode_ScalarAggregate(SEXP input_sexp, SEXP options_se } #endif +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExecNode_GroupByAggregate(const std::shared_ptr& input, std::vector group_vars, std::vector agg_srcs, cpp11::list aggregations); +extern "C" SEXP _arrow_ExecNode_GroupByAggregate(SEXP input_sexp, SEXP group_vars_sexp, SEXP agg_srcs_sexp, SEXP aggregations_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type input(input_sexp); + arrow::r::Input>::type group_vars(group_vars_sexp); + arrow::r::Input>::type agg_srcs(agg_srcs_sexp); + arrow::r::Input::type aggregations(aggregations_sexp); + return cpp11::as_sexp(ExecNode_GroupByAggregate(input, group_vars, agg_srcs, aggregations)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExecNode_GroupByAggregate(SEXP input_sexp, SEXP group_vars_sexp, SEXP agg_srcs_sexp, SEXP aggregations_sexp){ + Rf_error("Cannot call ExecNode_GroupByAggregate(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // compute.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, cpp11::list options); @@ -7132,6 +7150,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, { "_arrow_ExecNode_ScalarAggregate", (DL_FUNC) &_arrow_ExecNode_ScalarAggregate, 4}, + { "_arrow_ExecNode_GroupByAggregate", (DL_FUNC) &_arrow_ExecNode_GroupByAggregate, 4}, { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index f5a734db510..61a79bf462e 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -19,8 +19,6 @@ #if defined(ARROW_R_WITH_ARROW) -#include - #include #include #include @@ -28,6 +26,8 @@ #include #include +#include + namespace compute = ::arrow::compute; std::shared_ptr make_compute_options(std::string func_name, @@ -94,8 +94,8 @@ std::shared_ptr ExecNode_Scan( } options->projection = - ValueOrStop(call("project", std::move(exprs), - compute::ProjectOptions{std::move(materialized_field_names)}) + ValueOrStop(call("make_struct", std::move(exprs), + compute::MakeStructOptions{std::move(materialized_field_names)}) .Bind(*dataset->schema())); return ExecNodeOrStop(arrow::dataset::MakeScanNode(plan.get(), dataset, options)); @@ -150,4 +150,28 @@ std::shared_ptr ExecNode_ScalarAggregate( std::move(out_field_names))); } +// [[arrow::export]] +std::shared_ptr ExecNode_GroupByAggregate( + const std::shared_ptr& input, std::vector group_vars, + std::vector agg_srcs, cpp11::list aggregations) { + std::vector aggs; + std::vector> keep_alives; + + for (cpp11::list name_opts : aggregations) { + auto name = cpp11::as_cpp(name_opts[0]); + auto opts = make_compute_options(name, name_opts[1]); + + 
aggs.push_back(arrow::compute::internal::Aggregate{std::move(name), opts.get()}); + keep_alives.push_back(std::move(opts)); + } + + return ExecNodeOrStop(compute::MakeGroupByNode(input.get(), /*label=*/"group_agg", + /*keys=*/std::move(group_vars), + std::move(agg_srcs), std::move(aggs))); +} + +// Result MakeGroupByNode(ExecNode* input, std::string label, +// std::vector keys, +// std::vector agg_srcs, +// std::vector aggs); #endif diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index b615384ebea..1f258b2f736 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -26,6 +26,7 @@ tbl$verses <- verses[[1]] # c(" a ", " b ", " c ", ...) increasing padding # nchar = 3 5 7 9 11 13 15 17 19 21 tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = "both") +tbl$some_grouping <- rep(c(1, 2), 5) test_that("Can aggregate", { withr::local_options(list(arrow.summarize = TRUE)) @@ -35,12 +36,22 @@ test_that("Can aggregate", { collect(), tbl ) - # This is failing because the default is na.rm = FALSE + skip("This is failing because the default is na.rm = FALSE") expect_dplyr_equal( input %>% summarize(total = sum(int)) %>% collect(), tbl ) - }) + +test_that("Group by aggregate on dataset", { + withr::local_options(list(arrow.summarize = TRUE)) + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) +}) \ No newline at end of file From 1b423a0a4620bd76ab34a0bc21bd9897b6b87060 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 27 Jul 2021 07:38:27 -0400 Subject: [PATCH 14/24] fix ExecBatch slicing --- cpp/src/arrow/compute/exec.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 2a32c96ed3b..7d6db9f58db 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -115,7 +115,7 @@ ExecBatch ExecBatch::Slice(int64_t offset, int64_t length) const { if (value.is_scalar()) continue; value = value.array()->Slice(offset, length); } - out.length = length; + out.length = std::min(length, this->length - offset); return out; } From 1816f2cfe11e2388165fd15eb810cee94415765c Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 27 Jul 2021 14:38:56 -0400 Subject: [PATCH 15/24] Adapt result to meet dplyr expectation --- r/R/dplyr-summarize.R | 48 +++++++++++++++---------- r/tests/testthat/test-dplyr-aggregate.R | 11 ++++++ 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 217230d9ab2..ba9fe8be046 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -92,35 +92,45 @@ do_arrow_summarize <- function(.data, ...) 
{ do_exec_plan <- function(.data, group_vars = NULL) { plan <- ExecPlan$create() - if (length(group_vars) == 0) { - # Scan also will filter and select columns, so we don't need to Filter - start_node <- plan$Scan(.data) - # If any columns are derived we need to Project (otherwise this may be no-op) - project_node <- start_node$Project(.data$selected_columns) - final_node <- project_node$ScalarAggregate( - options = .data$aggregations, - target_names = names(.data), - out_field_names = names(.data$aggregations) - ) - } else { - # Collect the target names first because we have to add back the group vars - target_names <- names(.data) - .data <- ensure_group_vars(.data) + grouped <- length(group_vars) > 0 + + # Collect the target names first because we have to add back the group vars + target_names <- names(.data) + if (grouped) { + .data <- ensure_group_vars(.data) # We also need to prefix all of the aggregation function names with "hash_" .data$aggregations <- lapply(.data$aggregations, function(x) { x[["fun"]] <- paste0("hash_", x[["fun"]]) x }) - # Scan also will filter and select columns, so we don't need to Filter - start_node <- plan$Scan(.data) - # If any columns are derived we need to Project (otherwise this may be no-op) - project_node <- start_node$Project(.data$selected_columns) + } + + # Scan also will filter and select columns, so we don't need to Filter + start_node <- plan$Scan(.data) + # If any columns are derived we need to Project (otherwise this may be no-op) + project_node <- start_node$Project(.data$selected_columns) + + if (grouped) { final_node <- project_node$GroupByAggregate( group_vars, target_names = target_names, aggregations = .data$aggregations ) + out <- plan$Run(final_node) + # The result will have result columns first (named by their function) + # then the grouping cols. dplyr orders group cols first, and it accepts + # names for the result cols. Adapt the result to meet that expectation. 
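+    # (e.g. a grouped sum comes back as (total, some_grouping) and is
+    # reshaped below to (some_grouping, total), matching the tests)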
+ n_results <- length(.data$aggregations) + names(out)[seq_along(.data$aggregations)] <- names(.data$aggregations) + out <- out[c((n_results + 1):ncol(out), seq_along(.data$aggregations))] + } else { + final_node <- project_node$ScalarAggregate( + options = .data$aggregations, + target_names = target_names, + out_field_names = names(.data$aggregations) + ) + out <- plan$Run(final_node) } - plan$Run(final_node) + out } \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 1f258b2f736..d444f1bf391 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -51,6 +51,17 @@ test_that("Group by aggregate on dataset", { input %>% group_by(some_grouping) %>% summarize(total = sum(int, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) + + skip("This is failing because the default is na.rm = FALSE") + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(total = sum(int)) %>% + arrange(some_grouping) %>% collect(), tbl ) From 776e1f52236810b594cbad25a9f5cf95a8f43002 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 29 Jul 2021 10:45:38 -0400 Subject: [PATCH 16/24] Remove some tests for features not implemented for datasets since that's no longer a thing :tada: --- r/R/dplyr.R | 21 +++------------------ r/tests/testthat/test-dataset.R | 11 ----------- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 88accac24e9..00443c7834d 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -216,31 +216,16 @@ restore_dplyr_features <- function(df, query) { # Helper to handle unsupported dplyr features # * For Table/RecordBatch, we collect() and then call the dplyr method in R # * For Dataset, we just error -abandon_ship <- function(call, .data, msg = NULL) { +abandon_ship <- function(call, .data, msg) { dplyr_fun_name <- sub("^(.*?)\\..*", "\\1", as.character(call[[1]])) if (query_on_dataset(.data)) { - if (is.null(msg)) { - # Default message: function not implemented - not_implemented_for_dataset(paste0(dplyr_fun_name, "()")) - } else { - stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE) - } + stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE) } # else, collect and call dplyr method - if (!is.null(msg)) { - warning(msg, "; pulling data into R", immediate. = TRUE, call. = FALSE) - } + warning(msg, "; pulling data into R", immediate. = TRUE, call. = FALSE) call$.data <- dplyr::collect(.data) call[[1]] <- get(dplyr_fun_name, envir = asNamespace("dplyr")) eval.parent(call, 2) } query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") - -not_implemented_for_dataset <- function(method) { - stop( - method, " is not currently implemented for Arrow Datasets. ", - "Call collect() first to pull data into R.", - call. = FALSE - ) -} diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 4711cacfcd0..793ba06c4a8 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -986,17 +986,6 @@ test_that("dplyr method not implemented messages", { "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.", fixed = TRUE ) - # One explicit test of the full message - expect_error( - ds %>% summarize(mean(int)), - "summarize() is not currently implemented for Arrow Datasets. 
Call collect() first to pull data into R.", - fixed = TRUE - ) - # Helper for everything else - expect_not_implemented <- function(x) { - expect_error(x, "is not currently implemented for Arrow Datasets") - } - expect_not_implemented(ds %>% filter(int == 1) %>% summarize(n())) }) test_that("Dataset and query print methods", { From 58f4930b6a3d770546c306e7f4e62103588fd5d5 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 29 Jul 2021 11:55:42 -0400 Subject: [PATCH 17/24] Refactor agg function definition and registry and add any/all --- r/R/dplyr-eval.R | 8 ++- r/R/dplyr-functions.R | 31 ++++++++++++ r/R/dplyr-summarize.R | 17 +++---- r/tests/testthat/test-dplyr-aggregate.R | 65 +++++++++++++++++++++++-- 4 files changed, 105 insertions(+), 16 deletions(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 57497e41cd2..3a1261602a3 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -39,7 +39,7 @@ arrow_eval <- function(expr, mask) { } out <- structure(msg, class = "try-error", condition = e) - if (grepl("not supported.*Arrow", msg)) { + if (grepl("not supported.*Arrow", msg) || getOption("arrow.debug", FALSE)) { # One of ours. Mark it so that consumers can handle it differently class(out) <- c("arrow-try-error", class(out)) } @@ -75,7 +75,7 @@ arrow_not_supported <- function(msg) { } # Create a data mask for evaluating a dplyr expression -arrow_mask <- function(.data) { +arrow_mask <- function(.data, aggregation = FALSE) { f_env <- new_environment(.cache$functions) # Add functions that need to error hard and clear. @@ -86,6 +86,10 @@ arrow_mask <- function(.data) { f_env[[f]] <- fail } + if (aggregation) { + f_env <- new_environment(agg_funcs, parent = f_env) + } + # Assign the schema to the expressions map(.data$selected_columns, ~ (.$schema <- .data$.data$schema)) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 40e4cd4776b..83cf2d8ab88 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -777,3 +777,34 @@ nse_funcs$case_when <- function(...) { ) ) } + +# Aggregation functions +# These all return a list of: +#' @param fun string function name +#' @param data Expression (these are all currently a single field) +#' @parma options list of function options, as passed to call_function +#' For group-by aggregation, `hash_` gets prepended to the function name. +#' So to see a list of available hash aggregation functions, do +#' list_compute_functions("^hash_") +agg_funcs <- list() +agg_funcs$sum <- function(x, na.rm = FALSE) { + list( + fun = "sum", + data = x, + options = list(na.rm = na.rm, na.min_count = 0L) + ) +} +agg_funcs$any <- function(x, na.rm = FALSE) { + list( + fun = "any", + data = x, + options = list(na.rm = na.rm, na.min_count = 0L) + ) +} +agg_funcs$all <- function(x, na.rm = FALSE) { + list( + fun = "all", + data = x, + options = list(na.rm = na.rm, na.min_count = 0L) + ) +} \ No newline at end of file diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index ba9fe8be046..7c8788f786f 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -52,16 +52,8 @@ do_arrow_summarize <- function(.data, ...) 
{ # Deparse and take the first element in case they're long expressions names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) - mask <- arrow_mask(.data) - # Add aggregation wrappers to arrow_mask somehow - # (this is not ideal, would overwrite same-named objects) - mask$sum <- function(x, na.rm = FALSE) { - list( - fun = "sum", - data = x, - options = list(na.rm = na.rm, na.min_count = 0L) - ) - } + mask <- arrow_mask(.data, aggregation = TRUE) + results <- list() for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated @@ -69,7 +61,10 @@ do_arrow_summarize <- function(.data, ...) { new_var <- names(exprs)[i] results[[new_var]] <- arrow_eval(exprs[[i]], mask) if (inherits(results[[new_var]], "try-error")) { - msg <- paste('Expression', as_label(exprs[[i]]), 'not supported in Arrow') + msg <- handle_arrow_not_supported( + results[[new_var]], + as_label(exprs[[i]]) + ) stop(msg, call. = FALSE) } # Put it in the data mask too? diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index d444f1bf391..bf3870cbc47 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -17,6 +17,8 @@ skip_if_not_available("dataset") +withr::local_options(list(arrow.summarize = TRUE)) + library(dplyr) library(stringr) @@ -29,7 +31,6 @@ tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = tbl$some_grouping <- rep(c(1, 2), 5) test_that("Can aggregate", { - withr::local_options(list(arrow.summarize = TRUE)) expect_dplyr_equal( input %>% summarize(total = sum(int, na.rm = TRUE)) %>% @@ -45,8 +46,7 @@ test_that("Can aggregate", { ) }) -test_that("Group by aggregate on dataset", { - withr::local_options(list(arrow.summarize = TRUE)) +test_that("Group by sum on dataset", { expect_dplyr_equal( input %>% group_by(some_grouping) %>% @@ -56,6 +56,15 @@ test_that("Group by aggregate on dataset", { tbl ) + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(total = sum(int * 4, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) + skip("This is failing because the default is na.rm = FALSE") expect_dplyr_equal( input %>% @@ -65,4 +74,54 @@ test_that("Group by aggregate on dataset", { collect(), tbl ) +}) + +test_that("Group by any/all", { + withr::local_options(list(arrow.debug = TRUE)) + + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(any(lgl, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(all(lgl, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) + # na.rm option also is not being passed/received to any/all + + expect_dplyr_equal( + input %>% + mutate(has_words = nchar(verses) < 0) %>% + group_by(some_grouping) %>% + summarize(any(has_words)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) + expect_dplyr_equal( + input %>% + mutate(has_words = nchar(verses) < 0) %>% + group_by(some_grouping) %>% + summarize(all(has_words)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) + skip("This seems to be calling base::nchar") + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(has_words = all(nchar(verses) < 0)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) }) \ No newline at end of file From aeb0bf88907be202aeaba86374a772cca0867e93 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 30 Jul 2021 12:08:06 -0400 Subject: [PATCH 
18/24] Add jira references --- r/tests/testthat/test-dplyr-aggregate.R | 45 +++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index bf3870cbc47..76088536982 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -37,7 +37,7 @@ test_that("Can aggregate", { collect(), tbl ) - skip("This is failing because the default is na.rm = FALSE") + skip("ARROW-13497: This is failing because the default is na.rm = FALSE") expect_dplyr_equal( input %>% summarize(total = sum(int)) %>% @@ -65,7 +65,7 @@ test_that("Group by sum on dataset", { tbl ) - skip("This is failing because the default is na.rm = FALSE") + skip("ARROW-13497: This is failing because the default is na.rm = FALSE") expect_dplyr_equal( input %>% group_by(some_grouping) %>% @@ -95,7 +95,7 @@ test_that("Group by any/all", { collect(), tbl ) - # na.rm option also is not being passed/received to any/all + # ARROW-13497: na.rm option also is not being passed/received to any/all expect_dplyr_equal( input %>% @@ -124,4 +124,43 @@ test_that("Group by any/all", { collect(), tbl ) +}) + +test_that("Filter and aggregate", { + skip("ARROW-13498") + expect_dplyr_equal( + input %>% + filter(some_grouping == 2) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(int > 5) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(some_grouping == 2) %>% + group_by(some_grouping) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(int > 5) %>% + group_by(some_grouping) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) }) \ No newline at end of file From a7f5cde6b2bdecf1beb3b1baa16377b92d13e1d5 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 30 Jul 2021 12:54:52 -0400 Subject: [PATCH 19/24] Use filter node to actually filter --- r/R/dplyr-summarize.R | 5 ++++- r/tests/testthat/test-dplyr-aggregate.R | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 7c8788f786f..366fb5d0f24 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -101,8 +101,11 @@ do_exec_plan <- function(.data, group_vars = NULL) { }) } - # Scan also will filter and select columns, so we don't need to Filter start_node <- plan$Scan(.data) + # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again + if (inherits(.data$filtered_rows, "Expression")) { + start_node <- start_node$Filter(.data$filtered_rows) + } # If any columns are derived we need to Project (otherwise this may be no-op) project_node <- start_node$Project(.data$selected_columns) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 76088536982..2774c3a4db4 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -127,7 +127,6 @@ test_that("Group by any/all", { }) test_that("Filter and aggregate", { - skip("ARROW-13498") expect_dplyr_equal( input %>% filter(some_grouping == 2) %>% From eab89e808aae815eba9fd1e8fcef7f92459313b9 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 3 Aug 2021 11:23:42 -0400 Subject: [PATCH 20/24] Format and re-doc --- r/R/dplyr-eval.R | 2 +- 
r/R/dplyr-functions.R | 12 ++++++------ r/R/dplyr-summarize.R | 4 ++-- r/R/dplyr.R | 2 +- r/R/duckdb.R | 7 +++++-- r/R/query-engine.R | 14 +++++++++----- r/man/FileFormat.Rd | 2 +- r/man/ParquetFileReader.Rd | 2 +- r/man/array.Rd | 2 +- r/man/list_compute_functions.Rd | 2 +- r/man/match_arrow.Rd | 4 ++-- r/man/open_dataset.Rd | 12 +++++++----- r/man/read_delim_arrow.Rd | 14 +++++++------- r/man/read_json_arrow.Rd | 10 +++++----- r/man/to_duckdb.Rd | 19 +++++++++++++++++++ r/man/write_ipc_stream.Rd | 2 +- r/tests/testthat/test-dplyr-aggregate.R | 2 +- 17 files changed, 70 insertions(+), 42 deletions(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 3a1261602a3..a60d97657bc 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -103,4 +103,4 @@ arrow_mask <- function(.data, aggregation = FALSE) { # (because if we do we get `Error: Can't modify the data pronoun` in mutate()) out$.data <- .data$selected_columns out -} +} \ No newline at end of file diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 83cf2d8ab88..c65a286d2cc 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -780,12 +780,12 @@ nse_funcs$case_when <- function(...) { # Aggregation functions # These all return a list of: -#' @param fun string function name -#' @param data Expression (these are all currently a single field) -#' @parma options list of function options, as passed to call_function -#' For group-by aggregation, `hash_` gets prepended to the function name. -#' So to see a list of available hash aggregation functions, do -#' list_compute_functions("^hash_") +# @param fun string function name +# @param data Expression (these are all currently a single field) +# @param options list of function options, as passed to call_function +# For group-by aggregation, `hash_` gets prepended to the function name. +# So to see a list of available hash aggregation functions, do +# list_compute_functions("^hash_") agg_funcs <- list() agg_funcs$sum <- function(x, na.rm = FALSE) { list( diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 366fb5d0f24..126b949164d 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -29,7 +29,7 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb )) .data <- dplyr::select(.data, vars_to_keep) if (match.arg(.engine) == "duckdb") { - dplyr::summarise(to_duckdb(.data), ...) + dplyr::summarise(to_duckdb(.data), ...) } else if (isTRUE(getOption("arrow.summarize", FALSE))) { # Try stuff, if successful return() out <- try(do_arrow_summarize(.data, ...), silent = TRUE) @@ -68,7 +68,7 @@ do_arrow_summarize <- function(.data, ...) { stop(msg, call. = FALSE) } # Put it in the data mask too? 
- #mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] + # mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] } # Now, from that, split out the data (expressions) and options diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 00443c7834d..aa1d10439b4 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -228,4 +228,4 @@ abandon_ship <- function(call, .data, msg) { eval.parent(call, 2) } -query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") +query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") \ No newline at end of file diff --git a/r/R/duckdb.R b/r/R/duckdb.R index 6ed1df3d826..ba9c4469fea 100644 --- a/r/R/duckdb.R +++ b/r/R/duckdb.R @@ -40,8 +40,7 @@ #' #' @name to_duckdb #' @export -#' @examplesIf { arrow_with_dataset() && requireNamespace("duckdb", quietly = TRUE) && -#' packageVersion("duckdb") > "0.2.7" && requireNamespace("dplyr", quietly = TRUE) } +#' @examplesIf getFromNamespace("run_duckdb_examples", "arrow")() #' library(dplyr) #' #' ds <- InMemoryDataset$create(mtcars) @@ -113,3 +112,7 @@ duckdb_disconnector <- function(con, tbl_name) { }) environment() } + +run_duckdb_examples <- function() { + arrow_with_dataset() && requireNamespace("duckdb", quietly = TRUE) && packageVersion("duckdb") > "0.2.7" && requireNamespace("dplyr", quietly = TRUE) +} \ No newline at end of file diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 1d1125628e1..cb6dc292707 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -15,7 +15,8 @@ # specific language governing permissions and limitations # under the License. -ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, +ExecPlan <- R6Class("ExecPlan", + inherit = ArrowObject, public = list( Scan = function(dataset) { # Handle arrow_dplyr_query @@ -25,8 +26,10 @@ ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, filter <- Expression$scalar(TRUE) } # Use FieldsInExpression to find all from dataset$selected_columns - colnames <- unique(unlist(map(dataset$selected_columns, - field_names_in_expression))) + colnames <- unique(unlist(map( + dataset$selected_columns, + field_names_in_expression + ))) dataset <- dataset$.data } else { if (inherits(dataset, "ArrowTabular")) { @@ -51,7 +54,8 @@ ExecPlan$create <- function(use_threads = option_use_threads()) { ExecPlan_create(use_threads) } -ExecNode <- R6Class("ExecNode", inherit = ArrowObject, +ExecNode <- R6Class("ExecNode", + inherit = ArrowObject, public = list( Project = function(cols) { assert_is_list_of(cols, "Expression") @@ -68,4 +72,4 @@ ExecNode <- R6Class("ExecNode", inherit = ArrowObject, ExecNode_GroupByAggregate(self, group_vars, target_names, aggregations) } ) -) +) \ No newline at end of file diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd index 5bc9475b408..cabacc93755 100644 --- a/r/man/FileFormat.Rd +++ b/r/man/FileFormat.Rd @@ -52,7 +52,7 @@ It returns the appropriate subclass of \code{FileFormat} (e.g. 
\code{ParquetFile } \examples{ -\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} ## Semi-colon delimited files # Set up directory for examples tf <- tempfile() diff --git a/r/man/ParquetFileReader.Rd b/r/man/ParquetFileReader.Rd index 31de9ead104..30d0725a498 100644 --- a/r/man/ParquetFileReader.Rd +++ b/r/man/ParquetFileReader.Rd @@ -47,7 +47,7 @@ The optional \verb{column_indices=} argument is a 0-based integer vector indicat \examples{ \dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -f <- system.file("v0.7.1.parquet", package="arrow") +f <- system.file("v0.7.1.parquet", package = "arrow") pq <- ParquetFileReader$create(f) pq$GetSchema() if (codec_is_available("snappy")) { diff --git a/r/man/array.Rd b/r/man/array.Rd index 71957aff90c..78d3eaff6ea 100644 --- a/r/man/array.Rd +++ b/r/man/array.Rd @@ -100,7 +100,7 @@ new_array <- na_array$Slice(5) new_array$offset # Compare 2 arrays -na_array2 = na_array +na_array2 <- na_array na_array2 == na_array # element-wise comparison na_array2$Equals(na_array) # overall comparison \dontshow{\}) # examplesIf} diff --git a/r/man/list_compute_functions.Rd b/r/man/list_compute_functions.Rd index 668e090c0ca..4ca0e518f13 100644 --- a/r/man/list_compute_functions.Rd +++ b/r/man/list_compute_functions.Rd @@ -39,7 +39,7 @@ available inside \code{dplyr} verbs. } \examples{ \dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -list_compute_functions() +list_compute_functions() list_compute_functions(pattern = "^UTF8", ignore.case = TRUE) list_compute_functions(pattern = "^is", invert = TRUE) \dontshow{\}) # examplesIf} diff --git a/r/man/match_arrow.Rd b/r/man/match_arrow.Rd index d63ef3eed87..c2343361c6e 100644 --- a/r/man/match_arrow.Rd +++ b/r/man/match_arrow.Rd @@ -34,12 +34,12 @@ match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name) is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name) -# Although there are multiple matches, you are returned the index of the first +# Although there are multiple matches, you are returned the index of the first # match, as with the base R equivalent match(4, mtcars$cyl) # 1-indexed match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed -# If `x` contains multiple values, you are returned the indices of the first +# If `x` contains multiple values, you are returned the indices of the first # match for each value. match(c(4, 6, 8), mtcars$cyl) match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl) diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd index 1ca3d661880..53eade595be 100644 --- a/r/man/open_dataset.Rd +++ b/r/man/open_dataset.Rd @@ -91,7 +91,7 @@ can accelerate queries that only touch some partitions (files). Call \code{Dataset}, then use \code{dplyr} methods to query it. 
 }
 \examples{
-\dontshow{if (arrow_with_dataset() & arrow_with_parquet() ) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+\dontshow{if (arrow_with_dataset() & arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 # Set up directory for examples
 tf <- tempfile()
 dir.create(tf)
@@ -105,7 +105,7 @@ write_dataset(data, tf)
 open_dataset(tf)
 
 # You can also supply a vector of paths
-open_dataset(c(file.path(tf, "cyl=4/part-1.parquet"), file.path(tf,"cyl=8/part-2.parquet")))
+open_dataset(c(file.path(tf, "cyl=4/part-1.parquet"), file.path(tf, "cyl=8/part-2.parquet")))
 
 ## You must specify the file format if using a format other than parquet.
 tf2 <- tempfile()
@@ -113,9 +113,11 @@ dir.create(tf2)
 on.exit(unlink(tf2))
 write_dataset(data, tf2, format = "ipc")
 # This line will result in errors when you try to work with the data
-\dontrun{open_dataset(tf2)}
+\dontrun{
+open_dataset(tf2)
+}
 # This line will work
-open_dataset(tf2, format = "ipc") 
+open_dataset(tf2, format = "ipc")
 
 ## You can specify file partitioning to include it as a field in your dataset
 # Create a temporary directory and write example dataset
@@ -124,7 +126,7 @@ dir.create(tf3)
 on.exit(unlink(tf3))
 write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style = FALSE)
 
-# View files - you can see the partitioning means that files have been written 
+# View files - you can see the partitioning means that files have been written
 # to folders based on Month/Day values
 list.files(tf3, recursive = TRUE)
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index d9c80306931..30b146a4fee 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -207,12 +207,12 @@ to identify column names, you'll need to add \code{skip = 1} to skip that row.
 }
 \examples{
 \dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-  tf <- tempfile()
-  on.exit(unlink(tf))
-  write.csv(mtcars, file = tf)
-  df <- read_csv_arrow(tf)
-  dim(df)
-  # Can select columns
-  df <- read_csv_arrow(tf, col_select = starts_with("d"))
+tf <- tempfile()
+on.exit(unlink(tf))
+write.csv(mtcars, file = tf)
+df <- read_csv_arrow(tf)
+dim(df)
+# Can select columns
+df <- read_csv_arrow(tf, col_select = starts_with("d"))
 \dontshow{\}) # examplesIf}
 }
diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd
index 4806b4ad1f0..53d7107ae81 100644
--- a/r/man/read_json_arrow.Rd
+++ b/r/man/read_json_arrow.Rd
@@ -40,13 +40,13 @@ Using \link{JsonTableReader}
 }
 \examples{
 \dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-  tf <- tempfile()
-  on.exit(unlink(tf))
-  writeLines('
+tf <- tempfile()
+on.exit(unlink(tf))
+writeLines('
     { "hello": 3.5, "world": false, "yo": "thing" }
     { "hello": 3.25, "world": null }
     { "hello": 0.0, "world": true, "yo": null }
-  ', tf, useBytes=TRUE)
-  df <- read_json_arrow(tf)
+  ', tf, useBytes = TRUE)
+df <- read_json_arrow(tf)
 \dontshow{\}) # examplesIf}
 }
diff --git a/r/man/to_duckdb.Rd b/r/man/to_duckdb.Rd
index c273a7520d5..ffde91f14f2 100644
--- a/r/man/to_duckdb.Rd
+++ b/r/man/to_duckdb.Rd
@@ -39,3 +39,22 @@ that starts with an Arrow object to use DuckDB to calculate the
 summarization step. Internally, this calls \code{to_duckdb()} with all of the
 default argument values.
} +\examples{ +\dontshow{if (getFromNamespace("run_duckdb_examples", "arrow")()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +library(dplyr) + +ds <- InMemoryDataset$create(mtcars) + +ds \%>\% + filter(mpg < 30) \%>\% + to_duckdb() \%>\% + group_by(cyl) \%>\% + summarize(mean_mpg = mean(mpg, na.rm = TRUE)) + +# the same query can be simplified using .engine = "duckdb" +ds \%>\% + filter(mpg < 30) \%>\% + group_by(cyl) \%>\% + summarize(mean_mpg = mean(mpg, na.rm = TRUE), .engine = "duckdb") +\dontshow{\}) # examplesIf} +} diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd index 888d947eb99..2f215f25fd7 100644 --- a/r/man/write_ipc_stream.Rd +++ b/r/man/write_ipc_stream.Rd @@ -32,7 +32,7 @@ the function that will write the desired IPC format (stream or file) since either can be written to a file or \code{OutputStream}. } \examples{ -\dontshow{if (arrow_available() ) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} tf <- tempfile() on.exit(unlink(tf)) write_ipc_stream(mtcars, tf) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 2774c3a4db4..413c6afdf71 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -27,7 +27,7 @@ tbl <- example_data tbl$verses <- verses[[1]] # c(" a ", " b ", " c ", ...) increasing padding # nchar = 3 5 7 9 11 13 15 17 19 21 -tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = "both") +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") tbl$some_grouping <- rep(c(1, 2), 5) test_that("Can aggregate", { From da43f5ce10f3f15a6fccf08a717998122849bc2e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 4 Aug 2021 09:04:54 -0400 Subject: [PATCH 21/24] Remove feature flag --- r/R/dplyr-summarize.R | 5 +---- r/R/dplyr.R | 1 + r/tests/testthat/test-dplyr-aggregate.R | 24 +++++++++++++++++++++--- r/tests/testthat/test-dplyr-group-by.R | 6 ++++-- r/tests/testthat/test-dplyr.R | 18 ------------------ 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 126b949164d..1320f67c66c 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -30,7 +30,7 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb .data <- dplyr::select(.data, vars_to_keep) if (match.arg(.engine) == "duckdb") { dplyr::summarise(to_duckdb(.data), ...) - } else if (isTRUE(getOption("arrow.summarize", FALSE))) { + } else { # Try stuff, if successful return() out <- try(do_arrow_summarize(.data, ...), silent = TRUE) if (inherits(out, "try-error")) { @@ -38,9 +38,6 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb } else { return(out) } - } else { - # If unsuccessful or if option not set, do the work in R - dplyr::summarise(dplyr::collect(.data), ...) } } summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query diff --git a/r/R/dplyr.R b/r/R/dplyr.R index aa1d10439b4..ff5e30e66c5 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -222,6 +222,7 @@ abandon_ship <- function(call, .data, msg) { stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE) } # else, collect and call dplyr method + msg <- sub("\\n$", "", msg) warning(msg, "; pulling data into R", immediate. = TRUE, call. 
= FALSE) call$.data <- dplyr::collect(.data) call[[1]] <- get(dplyr_fun_name, envir = asNamespace("dplyr")) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 413c6afdf71..1a53a1b23b5 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -17,8 +17,6 @@ skip_if_not_available("dataset") -withr::local_options(list(arrow.summarize = TRUE)) - library(dplyr) library(stringr) @@ -30,7 +28,27 @@ tbl$verses <- verses[[1]] tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") tbl$some_grouping <- rep(c(1, 2), 5) -test_that("Can aggregate", { +test_that("summarize", { + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(min_int = min(int)), + tbl, + warning = TRUE + ) + + expect_dplyr_equal( + input %>% + select(int, chr) %>% + filter(int > 5) %>% + summarize(min_int = min(int) / 2), + tbl, + warning = TRUE + ) +}) + +test_that("Can aggregate in Arrow", { expect_dplyr_equal( input %>% summarize(total = sum(int, na.rm = TRUE)) %>% diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index fe0394bc636..18be2a9304a 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -29,7 +29,8 @@ test_that("group_by groupings are recorded", { select(int, chr) %>% filter(int > 5) %>% summarize(min_int = min(int)), - tbl + tbl, + warning = TRUE ) }) @@ -62,7 +63,8 @@ test_that("ungroup", { ungroup() %>% filter(int > 5) %>% summarize(min_int = min(int)), - tbl + tbl, + warning = TRUE ) }) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index da21ccd9ed1..9297df09490 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -69,24 +69,6 @@ See $.data for the source Arrow object', ) }) -test_that("summarize", { - expect_dplyr_equal( - input %>% - select(int, chr) %>% - filter(int > 5) %>% - summarize(min_int = min(int)), - tbl - ) - - expect_dplyr_equal( - input %>% - select(int, chr) %>% - filter(int > 5) %>% - summarize(min_int = min(int) / 2), - tbl - ) -}) - test_that("Empty select returns no columns", { expect_dplyr_equal( input %>% select() %>% collect(), From 56df2d3c875ea577b29f780adde2fef6b30c743d Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 4 Aug 2021 09:07:36 -0400 Subject: [PATCH 22/24] handle .groups argument Co-authored-by: Ian Cook --- r/R/dplyr-summarize.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 1320f67c66c..5677afb904a 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -42,7 +42,11 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb } summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query -do_arrow_summarize <- function(.data, ...) { +do_arrow_summarize <- function(.data, ..., .groups = NULL) { + if (!is.null(.groups)) { + # ARROW-13550 + abort("`summarize()` with `.groups` argument not supported in Arrow") + } exprs <- quos(...) 
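   # (each captured quosure is evaluated against the Arrow data mask via arrow_eval() below)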
# Check for unnamed expressions and fix if any unnamed <- !nzchar(names(exprs)) @@ -128,4 +132,4 @@ do_exec_plan <- function(.data, group_vars = NULL) { out <- plan$Run(final_node) } out -} \ No newline at end of file +} From f5d5d30b1c1361829b2a2def9b33ea0c7fbe9499 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 4 Aug 2021 09:50:21 -0400 Subject: [PATCH 23/24] Prevent na.rm = FALSE aggregation because it's wrong --- r/R/dplyr-functions.R | 18 +++++++++++++----- r/tests/testthat/test-dplyr-aggregate.R | 14 ++++++++------ r/tests/testthat/test-dplyr.R | 2 +- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index c65a286d2cc..607be82c36b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -713,7 +713,7 @@ nse_funcs$log <- nse_funcs$logb <- function(x, base = exp(1)) { return(Expression$create("log10_checked", x)) } # ARROW-13345 - stop("`base` values other than exp(1), 2 and 10 not supported in Arrow", call. = FALSE) + arrow_not_supported("`base` values other than exp(1), 2 and 10") } nse_funcs$if_else <- function(condition, true, false, missing = NULL) { @@ -791,20 +791,28 @@ agg_funcs$sum <- function(x, na.rm = FALSE) { list( fun = "sum", data = x, - options = list(na.rm = na.rm, na.min_count = 0L) + options = arrow_na_rm(na.rm = na.rm) ) } agg_funcs$any <- function(x, na.rm = FALSE) { list( fun = "any", data = x, - options = list(na.rm = na.rm, na.min_count = 0L) + options = arrow_na_rm(na.rm) ) } agg_funcs$all <- function(x, na.rm = FALSE) { list( fun = "all", data = x, - options = list(na.rm = na.rm, na.min_count = 0L) + options = arrow_na_rm(na.rm) ) -} \ No newline at end of file +} + +arrow_na_rm <- function(na.rm) { + if (!isTRUE(na.rm)) { + # TODO: ARROW-13497 + arrow_not_supported(paste("na.rm =", na.rm)) + } + list(na.rm = na.rm, na.min_count = 0L) +} diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 1a53a1b23b5..2807f0053fa 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -55,12 +55,13 @@ test_that("Can aggregate in Arrow", { collect(), tbl ) - skip("ARROW-13497: This is failing because the default is na.rm = FALSE") expect_dplyr_equal( input %>% summarize(total = sum(int)) %>% collect(), - tbl + tbl, + # ARROW-13497: This is failing because the default is na.rm = FALSE + warning = TRUE ) }) @@ -83,14 +84,15 @@ test_that("Group by sum on dataset", { tbl ) - skip("ARROW-13497: This is failing because the default is na.rm = FALSE") expect_dplyr_equal( input %>% group_by(some_grouping) %>% summarize(total = sum(int)) %>% arrange(some_grouping) %>% collect(), - tbl + tbl, + # ARROW-13497: This is failing because the default is na.rm = FALSE + warning = TRUE ) }) @@ -119,7 +121,7 @@ test_that("Group by any/all", { input %>% mutate(has_words = nchar(verses) < 0) %>% group_by(some_grouping) %>% - summarize(any(has_words)) %>% + summarize(any(has_words, na.rm = TRUE)) %>% arrange(some_grouping) %>% collect(), tbl @@ -128,7 +130,7 @@ test_that("Group by any/all", { input %>% mutate(has_words = nchar(verses) < 0) %>% group_by(some_grouping) %>% - summarize(all(has_words)) %>% + summarize(all(has_words, na.rm = TRUE)) %>% arrange(some_grouping) %>% collect(), tbl diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 9297df09490..ed03c58a884 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -1036,7 +1036,7 @@ test_that("log functions", { 
expect_error( nse_funcs$log(Expression$scalar(x), base = 5), - "`base` values other than exp(1), 2 and 10 not supported in Arrow", + "`base` values other than exp(1), 2 and 10 not supported by Arrow", fixed = TRUE ) From 69228153f34052ad7fed0038bed5ddecfd8722cd Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 4 Aug 2021 14:47:13 -0400 Subject: [PATCH 24/24] Suppress warning and style files --- r/R/dplyr-eval.R | 2 +- r/R/dplyr.R | 2 +- r/R/duckdb.R | 2 +- r/R/query-engine.R | 2 +- r/tests/testthat/test-dataset.R | 15 +++++++++------ r/tests/testthat/test-dplyr-aggregate.R | 2 +- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index a60d97657bc..3a1261602a3 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -103,4 +103,4 @@ arrow_mask <- function(.data, aggregation = FALSE) { # (because if we do we get `Error: Can't modify the data pronoun` in mutate()) out$.data <- .data$selected_columns out -} \ No newline at end of file +} diff --git a/r/R/dplyr.R b/r/R/dplyr.R index ff5e30e66c5..b2793bdb3c3 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -229,4 +229,4 @@ abandon_ship <- function(call, .data, msg) { eval.parent(call, 2) } -query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") \ No newline at end of file +query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") diff --git a/r/R/duckdb.R b/r/R/duckdb.R index ba9c4469fea..bc003a6ea8f 100644 --- a/r/R/duckdb.R +++ b/r/R/duckdb.R @@ -115,4 +115,4 @@ duckdb_disconnector <- function(con, tbl_name) { run_duckdb_examples <- function() { arrow_with_dataset() && requireNamespace("duckdb", quietly = TRUE) && packageVersion("duckdb") > "0.2.7" && requireNamespace("dplyr", quietly = TRUE) -} \ No newline at end of file +} diff --git a/r/R/query-engine.R b/r/R/query-engine.R index cb6dc292707..72c35c515db 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -72,4 +72,4 @@ ExecNode <- R6Class("ExecNode", ExecNode_GroupByAggregate(self, group_vars, target_names, aggregations) } ) -) \ No newline at end of file +) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 793ba06c4a8..1a71fea86c7 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -638,12 +638,15 @@ test_that("Creating UnionDataset", { test_that("map_batches", { skip_if_not_available("parquet") ds <- open_dataset(dataset_dir, partitioning = "part") - expect_equivalent( - ds %>% - filter(int > 5) %>% - select(int, lgl) %>% - map_batches(~ summarize(., min_int = min(int))), - tibble(min_int = c(6L, 101L)) + expect_warning( + expect_equivalent( + ds %>% + filter(int > 5) %>% + select(int, lgl) %>% + map_batches(~ summarize(., min_int = min(int))), + tibble(min_int = c(6L, 101L)) + ), + "pulling data into R" # ARROW-13502 ) }) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 2807f0053fa..8235ef29948 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -182,4 +182,4 @@ test_that("Filter and aggregate", { collect(), tbl ) -}) \ No newline at end of file +})
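
Taken together, the final patches in this series leave summarise() with a
single code path: try the aggregation in Arrow, and if translation fails,
abandon_ship() pulls the data into R with a warning instead of erroring.
A minimal sketch of the resulting behavior, for orientation only, assuming
a build with the dataset capability enabled (every name used here appears
in the patches above; exact return types are not shown):

library(arrow)
library(dplyr)

ds <- InMemoryDataset$create(mtcars)

# Translatable: sum() with na.rm = TRUE becomes an Arrow aggregation and
# runs through the ExecPlan machinery in query-engine.R, no warning.
ds %>%
  group_by(cyl) %>%
  summarize(total = sum(mpg, na.rm = TRUE)) %>%
  collect()

# Not yet translatable: R's default na.rm = FALSE is rejected by
# arrow_na_rm() (ARROW-13497), so abandon_ship() warns
# "pulling data into R" and evaluates the summarize() in dplyr instead.
ds %>%
  group_by(cyl) %>%
  summarize(total = sum(mpg)) %>%
  collect()

Gating na.rm rather than mapping it through is deliberate: per the patch
title, passing na.rm = FALSE to the Arrow aggregation gave wrong answers,
so until ARROW-13497 is resolved arrow_na_rm() refuses the translation and
lets the try()/abandon_ship() fallback compute a correct result in R.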