From 867e147e4e1812859c9846d91d6a8c95fb938277 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 24 Aug 2021 15:00:12 -0400 Subject: [PATCH 01/27] Refactor ExecPlan building; use it in collect() --- r/R/dplyr-collect.R | 8 ++-- r/R/dplyr-summarize.R | 46 +++------------------ r/R/query-engine.R | 55 +++++++++++++++++++++++-- r/tests/testthat/test-dplyr-aggregate.R | 47 ++++++++++++++++++++- 4 files changed, 104 insertions(+), 52 deletions(-) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index cec56ab9110..d17b0ddfca2 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -19,11 +19,9 @@ # The following S3 methods are registered on load if dplyr is present collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { - x <- ensure_group_vars(x) - x <- ensure_arrange_vars(x) # this sets x$temp_columns # Pull only the selected rows and cols into R - # See dataset.R for Dataset and Scanner(Builder) classes - tab <- Scanner$create(x)$ToTable() + # See query-engine.R for ExecPlan/Nodes + tab <- do_exec_plan(x) # Arrange rows if (length(x$arrange_vars) > 0) { tab <- tab[ @@ -59,4 +57,4 @@ pull.arrow_dplyr_query <- function(.data, var = -1) { .data$selected_columns <- set_names(.data$selected_columns[var], var) dplyr::collect(.data)[[1]] } -pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query +pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query \ No newline at end of file diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 394e5fe2ac9..a4d6db9cf38 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -82,47 +82,11 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { .data$selected_columns <- inputs # Eventually, we will return .data here if (dataset) but do it eagerly now - do_exec_plan(.data, group_vars = dplyr::group_vars(.data)) + do_exec_plan(.data) } -do_exec_plan <- function(.data, group_vars = NULL) { +do_exec_plan <- function(.data) { plan <- ExecPlan$create() - - grouped <- length(group_vars) > 0 - - # Collect the target names first because we have to add back the group vars - target_names <- names(.data) - - if (grouped) { - .data <- ensure_group_vars(.data) - # We also need to prefix all of the aggregation function names with "hash_" - .data$aggregations <- lapply(.data$aggregations, function(x) { - x[["fun"]] <- paste0("hash_", x[["fun"]]) - x - }) - } - - start_node <- plan$Scan(.data) - # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again - if (inherits(.data$filtered_rows, "Expression")) { - start_node <- start_node$Filter(.data$filtered_rows) - } - # If any columns are derived we need to Project (otherwise this may be no-op) - project_node <- start_node$Project(.data$selected_columns) - - final_node <- project_node$Aggregate( - options = .data$aggregations, - target_names = target_names, - out_field_names = names(.data$aggregations), - key_names = group_vars - ) - - out <- plan$Run(final_node) - if (grouped) { - # The result will have result columns first then the grouping cols. - # dplyr orders group cols first, so adapt the result to meet that expectation. - n_results <- length(.data$aggregations) - out <- out[c((n_results + 1):ncol(out), seq_along(.data$aggregations))] - } - out -} + final_node <- plan$Build(.data) + plan$Run(final_node) +} \ No newline at end of file diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 4de2f87165b..2aaaa2c6597 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -42,7 +42,50 @@ ExecPlan <- R6Class("ExecPlan", } # ScanNode needs the filter to do predicate pushdown and skip partitions, # and it needs to know which fields to materialize (and which are unnecessary) - ExecNode_Scan(self, dataset, filter, colnames) + ExecNode_Scan(self, dataset, filter, colnames %||% character(0)) + }, + Build = function(.data) { + group_vars <- dplyr::group_vars(.data) + grouped <- length(group_vars) > 0 + + # Collect the target names first because we have to add back the group vars + target_names <- names(.data) + .data <- ensure_group_vars(.data) + .data <- ensure_arrange_vars(.data) # this sets x$temp_columns + + node <- self$Scan(.data) + # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again + if (inherits(.data$filtered_rows, "Expression")) { + node <- node$Filter(.data$filtered_rows) + } + # If any columns are derived we need to Project (otherwise this may be no-op) + node <- node$Project(c(.data$selected_columns, .data$temp_columns)) + + if (length(.data$aggregations)) { + if (grouped) { + # We need to prefix all of the aggregation function names with "hash_" + .data$aggregations <- lapply(.data$aggregations, function(x) { + x[["fun"]] <- paste0("hash_", x[["fun"]]) + x + }) + } + + node <- node$Aggregate( + options = .data$aggregations, + target_names = target_names, + out_field_names = names(.data$aggregations), + key_names = group_vars + ) + + if (grouped) { + # The result will have result columns first then the grouping cols. + # dplyr orders group cols first, so adapt the result to meet that expectation. + node <- node$Project( + make_field_refs(c(group_vars, names(.data$aggregations))) + ) + } + } + node }, Run = function(node) { assert_is(node, "ExecNode") @@ -58,8 +101,12 @@ ExecNode <- R6Class("ExecNode", inherit = ArrowObject, public = list( Project = function(cols) { - assert_is_list_of(cols, "Expression") - ExecNode_Project(self, cols, names(cols)) + if (length(cols)) { + assert_is_list_of(cols, "Expression") + ExecNode_Project(self, cols, names(cols)) + } else { + ExecNode_Project(self, character(0), character(0)) + } }, Filter = function(expr) { assert_is(expr, "Expression") @@ -69,4 +116,4 @@ ExecNode <- R6Class("ExecNode", ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) } ) -) +) \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 3a04b6d2314..1aa30654495 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -33,7 +33,8 @@ test_that("summarize", { input %>% select(int, chr) %>% filter(int > 5) %>% - summarize(min_int = min(int)), + summarize(min_int = min(int)) %>% + collect(), tbl, warning = TRUE ) @@ -42,12 +43,28 @@ test_that("summarize", { input %>% select(int, chr) %>% filter(int > 5) %>% - summarize(min_int = min(int) / 2), + summarize(min_int = min(int) / 2) %>% + collect(), tbl, warning = TRUE ) }) +test_that("summarize() doesn't evaluate eagerly", { + skip("TODO") + expect_s3_class( + Table$create(tbl) %>% + summarize(total = sum(int)), + "arrow_dplyr_query" + ) + expect_r6_class( + Table$create(tbl) %>% + summarize(total = sum(int)) %>% + collect(), + "ArrowTabular" + ) +}) + test_that("Can aggregate in Arrow", { expect_dplyr_equal( input %>% @@ -289,3 +306,29 @@ test_that("Filter and aggregate", { tbl ) }) + +test_that("Expressions on aggregations", { + # This is what it effectively is + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize( + any = any(lgl), + all = all(lgl) + ) %>% + arrange(some_grouping) %>% + transmute(some = any & !all) %>% + collect(), + tbl + ) + # More concisely: + skip("Not implemented") + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(any(lgl) & !all(lgl)) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) +}) \ No newline at end of file From 1bf8a074b60386e093ee8210834161edf8402d3d Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 24 Aug 2021 16:51:26 -0400 Subject: [PATCH 02/27] Implement order_by_sink and sort results of summarize --- r/R/arrowExports.R | 5 +- r/R/dplyr-collect.R | 32 +- r/R/dplyr-summarize.R | 6 - r/R/dplyr.R | 22 - r/R/query-engine.R | 38 +- r/src/arrowExports.cpp | 899 ++++++++++++------------ r/src/compute-exec.cpp | 16 +- r/tests/testthat/test-dataset.R | 4 +- r/tests/testthat/test-dplyr-aggregate.R | 15 +- 9 files changed, 530 insertions(+), 507 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 72a5e455858..ce6d2e872d4 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -280,8 +280,8 @@ ExecPlan_create <- function(use_threads) { .Call(`_arrow_ExecPlan_create`, use_threads) } -ExecPlan_run <- function(plan, final_node) { - .Call(`_arrow_ExecPlan_run`, plan, final_node) +ExecPlan_run <- function(plan, final_node, sort_options) { + .Call(`_arrow_ExecPlan_run`, plan, final_node, sort_options) } ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { @@ -1767,3 +1767,4 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } + diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index d17b0ddfca2..7db1b682305 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -22,14 +22,6 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { # Pull only the selected rows and cols into R # See query-engine.R for ExecPlan/Nodes tab <- do_exec_plan(x) - # Arrange rows - if (length(x$arrange_vars) > 0) { - tab <- tab[ - tab$SortIndices(names(x$arrange_vars), x$arrange_desc), - names(x$selected_columns), # this omits x$temp_columns from the result - drop = FALSE - ] - } if (as_data_frame) { df <- as.data.frame(tab) tab$invalidate() @@ -57,4 +49,26 @@ pull.arrow_dplyr_query <- function(.data, var = -1) { .data$selected_columns <- set_names(.data$selected_columns[var], var) dplyr::collect(.data)[[1]] } -pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query \ No newline at end of file +pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query + +restore_dplyr_features <- function(df, query) { + # An arrow_dplyr_query holds some attributes that Arrow doesn't know about + # After calling collect(), make sure these features are carried over + + if (length(query$group_by_vars) > 0) { + # Preserve groupings, if present + if (is.data.frame(df)) { + df <- dplyr::grouped_df( + df, + dplyr::group_vars(query), + drop = dplyr::group_by_drop_default(query) + ) + } else { + # This is a Table, via compute() or collect(as_data_frame = FALSE) + df <- arrow_dplyr_query(df) + df$group_by_vars <- query$group_by_vars + df$drop_empty_groups <- query$drop_empty_groups + } + } + df +} diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index a4d6db9cf38..9a0cfc5bf91 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -84,9 +84,3 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # Eventually, we will return .data here if (dataset) but do it eagerly now do_exec_plan(.data) } - -do_exec_plan <- function(.data) { - plan <- ExecPlan$create() - final_node <- plan$Build(.data) - plan$Run(final_node) -} \ No newline at end of file diff --git a/r/R/dplyr.R b/r/R/dplyr.R index b2793bdb3c3..c3029a114c3 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -191,28 +191,6 @@ ensure_arrange_vars <- function(x) { x } -restore_dplyr_features <- function(df, query) { - # An arrow_dplyr_query holds some attributes that Arrow doesn't know about - # After calling collect(), make sure these features are carried over - - if (length(query$group_by_vars) > 0) { - # Preserve groupings, if present - if (is.data.frame(df)) { - df <- dplyr::grouped_df( - df, - dplyr::group_vars(query), - drop = dplyr::group_by_drop_default(query) - ) - } else { - # This is a Table, via compute() or collect(as_data_frame = FALSE) - df <- arrow_dplyr_query(df) - df$group_by_vars <- query$group_by_vars - df$drop_empty_groups <- query$drop_empty_groups - } - } - df -} - # Helper to handle unsupported dplyr features # * For Table/RecordBatch, we collect() and then call the dplyr method in R # * For Dataset, we just error diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 2aaaa2c6597..c595dd27df3 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -15,6 +15,19 @@ # specific language governing permissions and limitations # under the License. +do_exec_plan <- function(.data) { + plan <- ExecPlan$create() + final_node <- plan$Build(.data) + tab <- plan$Run(final_node) + + if (length(final_node$sort$temp_columns) > 0) { + # If arrange() created $temp_columns, make sure to omit them from the result + tab <- tab[, setdiff(names(tab), final_node$sort$temp_columns), drop = FALSE] + } + + tab +} + ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, public = list( @@ -85,11 +98,33 @@ ExecPlan <- R6Class("ExecPlan", ) } } + + # tab <- tab[ + # tab$SortIndices(names(x$arrange_vars), x$arrange_desc), + # names(x$selected_columns), # this omits x$temp_columns from the result + # drop = FALSE + # ] + + # Apply sorting: this is currently not an ExecNode itself, it is a + # sink node option. + # TODO: error if doing a subsequent operation that would throw away sorting! + if (length(.data$arrange_vars)) { + node$sort <- list( + names = names(.data$arrange_vars), + orders = as.integer(.data$arrange_desc), + temp_columns = names(.data$temp_columns) + ) + } else if (length(.data$aggregations) && grouped) { + node$sort <- list( + names = group_vars, + orders = rep(0L, length(group_vars)) + ) + } node }, Run = function(node) { assert_is(node, "ExecNode") - ExecPlan_run(self, node) + ExecPlan_run(self, node, node$sort %||% list()) } ) ) @@ -100,6 +135,7 @@ ExecPlan$create <- function(use_threads = option_use_threads()) { ExecNode <- R6Class("ExecNode", inherit = ArrowObject, public = list( + sort = NULL, Project = function(cols) { if (length(cols)) { assert_is_list_of(cols, "Expression") diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index cb69ce17442..de8ca36af6c 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1094,16 +1094,17 @@ extern "C" SEXP _arrow_ExecPlan_create(SEXP use_threads_sexp){ // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ExecPlan_run(const std::shared_ptr& plan, const std::shared_ptr& final_node); -extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ +std::shared_ptr ExecPlan_run(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options); +extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); arrow::r::Input&>::type final_node(final_node_sexp); - return cpp11::as_sexp(ExecPlan_run(plan, final_node)); + arrow::r::Input::type sort_options(sort_options_sexp); + return cpp11::as_sexp(ExecPlan_run(plan, final_node, sort_options)); END_CPP11 } #else -extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ +extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp){ Rf_error("Cannot call ExecPlan_run(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -7035,450 +7036,450 @@ static const R_CallMethodDef CallEntries[] = { { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, { "_json_available", (DL_FUNC)& _json_available, 0 }, - { "_arrow_is_altrep", (DL_FUNC) &_arrow_is_altrep, 1}, - { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, - { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, - { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, - { "_arrow_Array__IsValid", (DL_FUNC) &_arrow_Array__IsValid, 2}, - { "_arrow_Array__length", (DL_FUNC) &_arrow_Array__length, 1}, - { "_arrow_Array__offset", (DL_FUNC) &_arrow_Array__offset, 1}, - { "_arrow_Array__null_count", (DL_FUNC) &_arrow_Array__null_count, 1}, - { "_arrow_Array__type", (DL_FUNC) &_arrow_Array__type, 1}, - { "_arrow_Array__ToString", (DL_FUNC) &_arrow_Array__ToString, 1}, - { "_arrow_Array__type_id", (DL_FUNC) &_arrow_Array__type_id, 1}, - { "_arrow_Array__Equals", (DL_FUNC) &_arrow_Array__Equals, 2}, - { "_arrow_Array__ApproxEquals", (DL_FUNC) &_arrow_Array__ApproxEquals, 2}, - { "_arrow_Array__Diff", (DL_FUNC) &_arrow_Array__Diff, 2}, - { "_arrow_Array__data", (DL_FUNC) &_arrow_Array__data, 1}, - { "_arrow_Array__RangeEquals", (DL_FUNC) &_arrow_Array__RangeEquals, 5}, - { "_arrow_Array__View", (DL_FUNC) &_arrow_Array__View, 2}, - { "_arrow_Array__Validate", (DL_FUNC) &_arrow_Array__Validate, 1}, - { "_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1}, - { "_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1}, - { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, - { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, - { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, - { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, - { "_arrow_LargeListArray__value_type", (DL_FUNC) &_arrow_LargeListArray__value_type, 1}, - { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, - { "_arrow_LargeListArray__values", (DL_FUNC) &_arrow_LargeListArray__values, 1}, - { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, - { "_arrow_LargeListArray__value_length", (DL_FUNC) &_arrow_LargeListArray__value_length, 2}, - { "_arrow_FixedSizeListArray__value_length", (DL_FUNC) &_arrow_FixedSizeListArray__value_length, 2}, - { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, - { "_arrow_LargeListArray__value_offset", (DL_FUNC) &_arrow_LargeListArray__value_offset, 2}, - { "_arrow_FixedSizeListArray__value_offset", (DL_FUNC) &_arrow_FixedSizeListArray__value_offset, 2}, - { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1}, - { "_arrow_LargeListArray__raw_value_offsets", (DL_FUNC) &_arrow_LargeListArray__raw_value_offsets, 1}, - { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, - { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 2}, - { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, - { "_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 2}, - { "_arrow_ArrayData__get_type", (DL_FUNC) &_arrow_ArrayData__get_type, 1}, - { "_arrow_ArrayData__get_length", (DL_FUNC) &_arrow_ArrayData__get_length, 1}, - { "_arrow_ArrayData__get_null_count", (DL_FUNC) &_arrow_ArrayData__get_null_count, 1}, - { "_arrow_ArrayData__get_offset", (DL_FUNC) &_arrow_ArrayData__get_offset, 1}, - { "_arrow_ArrayData__buffers", (DL_FUNC) &_arrow_ArrayData__buffers, 1}, - { "_arrow_Buffer__is_mutable", (DL_FUNC) &_arrow_Buffer__is_mutable, 1}, - { "_arrow_Buffer__ZeroPadding", (DL_FUNC) &_arrow_Buffer__ZeroPadding, 1}, - { "_arrow_Buffer__capacity", (DL_FUNC) &_arrow_Buffer__capacity, 1}, - { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1}, - { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1}, - { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1}, - { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2}, - { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1}, - { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1}, - { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) &_arrow_ChunkedArray__num_chunks, 1}, - { "_arrow_ChunkedArray__chunk", (DL_FUNC) &_arrow_ChunkedArray__chunk, 2}, - { "_arrow_ChunkedArray__chunks", (DL_FUNC) &_arrow_ChunkedArray__chunks, 1}, - { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1}, - { "_arrow_ChunkedArray__Slice1", (DL_FUNC) &_arrow_ChunkedArray__Slice1, 2}, - { "_arrow_ChunkedArray__Slice2", (DL_FUNC) &_arrow_ChunkedArray__Slice2, 3}, - { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, - { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, - { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, - { "_arrow_ChunkedArray__ToString", (DL_FUNC) &_arrow_ChunkedArray__ToString, 1}, - { "_arrow_ChunkedArray__from_list", (DL_FUNC) &_arrow_ChunkedArray__from_list, 2}, - { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, - { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, - { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, - { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, - { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, - { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 1}, - { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 2}, - { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, - { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, - { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, - { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5}, - { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, - { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, - { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, - { "_arrow_compute__GetFunctionNames", (DL_FUNC) &_arrow_compute__GetFunctionNames, 0}, - { "_arrow_build_info", (DL_FUNC) &_arrow_build_info, 0}, - { "_arrow_runtime_info", (DL_FUNC) &_arrow_runtime_info, 0}, - { "_arrow_csv___WriteOptions__initialize", (DL_FUNC) &_arrow_csv___WriteOptions__initialize, 1}, - { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, - { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, - { "_arrow_csv___ReadOptions__column_names", (DL_FUNC) &_arrow_csv___ReadOptions__column_names, 1}, - { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, - { "_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, - { "_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, - { "_arrow_TimestampParser__kind", (DL_FUNC) &_arrow_TimestampParser__kind, 1}, - { "_arrow_TimestampParser__format", (DL_FUNC) &_arrow_TimestampParser__format, 1}, - { "_arrow_TimestampParser__MakeStrptime", (DL_FUNC) &_arrow_TimestampParser__MakeStrptime, 1}, - { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC) &_arrow_TimestampParser__MakeISO8601, 0}, - { "_arrow_csv___WriteCSV__Table", (DL_FUNC) &_arrow_csv___WriteCSV__Table, 3}, - { "_arrow_csv___WriteCSV__RecordBatch", (DL_FUNC) &_arrow_csv___WriteCSV__RecordBatch, 3}, - { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, - { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, - { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, - { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, - { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, - { "_arrow_dataset___InMemoryDataset__create", (DL_FUNC) &_arrow_dataset___InMemoryDataset__create, 1}, - { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, - { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, - { "_arrow_dataset___FileSystemDataset__filesystem", (DL_FUNC) &_arrow_dataset___FileSystemDataset__filesystem, 1}, - { "_arrow_dataset___FileSystemDataset__files", (DL_FUNC) &_arrow_dataset___FileSystemDataset__files, 1}, - { "_arrow_dataset___DatasetFactory__Finish1", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish1, 2}, - { "_arrow_dataset___DatasetFactory__Finish2", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish2, 2}, - { "_arrow_dataset___DatasetFactory__Inspect", (DL_FUNC) &_arrow_dataset___DatasetFactory__Inspect, 2}, - { "_arrow_dataset___UnionDatasetFactory__Make", (DL_FUNC) &_arrow_dataset___UnionDatasetFactory__Make, 1}, - { "_arrow_dataset___FileSystemDatasetFactory__Make0", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make0, 3}, - { "_arrow_dataset___FileSystemDatasetFactory__Make2", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make2, 4}, - { "_arrow_dataset___FileSystemDatasetFactory__Make1", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make1, 3}, - { "_arrow_dataset___FileSystemDatasetFactory__Make3", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make3, 4}, - { "_arrow_dataset___FileFormat__type_name", (DL_FUNC) &_arrow_dataset___FileFormat__type_name, 1}, - { "_arrow_dataset___FileFormat__DefaultWriteOptions", (DL_FUNC) &_arrow_dataset___FileFormat__DefaultWriteOptions, 1}, - { "_arrow_dataset___ParquetFileFormat__Make", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__Make, 2}, - { "_arrow_dataset___FileWriteOptions__type_name", (DL_FUNC) &_arrow_dataset___FileWriteOptions__type_name, 1}, - { "_arrow_dataset___ParquetFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___ParquetFileWriteOptions__update, 3}, - { "_arrow_dataset___IpcFileWriteOptions__update2", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update2, 4}, - { "_arrow_dataset___IpcFileWriteOptions__update1", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update1, 3}, - { "_arrow_dataset___CsvFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___CsvFileWriteOptions__update, 2}, - { "_arrow_dataset___IpcFileFormat__Make", (DL_FUNC) &_arrow_dataset___IpcFileFormat__Make, 0}, - { "_arrow_dataset___CsvFileFormat__Make", (DL_FUNC) &_arrow_dataset___CsvFileFormat__Make, 3}, - { "_arrow_dataset___FragmentScanOptions__type_name", (DL_FUNC) &_arrow_dataset___FragmentScanOptions__type_name, 1}, - { "_arrow_dataset___CsvFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___CsvFragmentScanOptions__Make, 2}, - { "_arrow_dataset___ParquetFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 3}, - { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning, 2}, - { "_arrow_dataset___DirectoryPartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning__MakeFactory, 2}, - { "_arrow_dataset___HivePartitioning", (DL_FUNC) &_arrow_dataset___HivePartitioning, 3}, - { "_arrow_dataset___HivePartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___HivePartitioning__MakeFactory, 2}, - { "_arrow_dataset___ScannerBuilder__ProjectNames", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectNames, 2}, - { "_arrow_dataset___ScannerBuilder__ProjectExprs", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectExprs, 3}, - { "_arrow_dataset___ScannerBuilder__Filter", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Filter, 2}, - { "_arrow_dataset___ScannerBuilder__UseThreads", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseThreads, 2}, - { "_arrow_dataset___ScannerBuilder__UseAsync", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseAsync, 2}, - { "_arrow_dataset___ScannerBuilder__BatchSize", (DL_FUNC) &_arrow_dataset___ScannerBuilder__BatchSize, 2}, - { "_arrow_dataset___ScannerBuilder__FragmentScanOptions", (DL_FUNC) &_arrow_dataset___ScannerBuilder__FragmentScanOptions, 2}, - { "_arrow_dataset___ScannerBuilder__schema", (DL_FUNC) &_arrow_dataset___ScannerBuilder__schema, 1}, - { "_arrow_dataset___ScannerBuilder__Finish", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Finish, 1}, - { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1}, - { "_arrow_dataset___Scanner__ScanBatches", (DL_FUNC) &_arrow_dataset___Scanner__ScanBatches, 1}, - { "_arrow_dataset___Scanner__ToRecordBatchReader", (DL_FUNC) &_arrow_dataset___Scanner__ToRecordBatchReader, 1}, - { "_arrow_dataset___Scanner__head", (DL_FUNC) &_arrow_dataset___Scanner__head, 2}, - { "_arrow_dataset___Scanner__schema", (DL_FUNC) &_arrow_dataset___Scanner__schema, 1}, - { "_arrow_dataset___ScanTask__get_batches", (DL_FUNC) &_arrow_dataset___ScanTask__get_batches, 1}, - { "_arrow_dataset___Dataset__Write", (DL_FUNC) &_arrow_dataset___Dataset__Write, 6}, - { "_arrow_dataset___Scanner__TakeRows", (DL_FUNC) &_arrow_dataset___Scanner__TakeRows, 2}, - { "_arrow_dataset___Scanner__CountRows", (DL_FUNC) &_arrow_dataset___Scanner__CountRows, 1}, - { "_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0}, - { "_arrow_Int16__initialize", (DL_FUNC) &_arrow_Int16__initialize, 0}, - { "_arrow_Int32__initialize", (DL_FUNC) &_arrow_Int32__initialize, 0}, - { "_arrow_Int64__initialize", (DL_FUNC) &_arrow_Int64__initialize, 0}, - { "_arrow_UInt8__initialize", (DL_FUNC) &_arrow_UInt8__initialize, 0}, - { "_arrow_UInt16__initialize", (DL_FUNC) &_arrow_UInt16__initialize, 0}, - { "_arrow_UInt32__initialize", (DL_FUNC) &_arrow_UInt32__initialize, 0}, - { "_arrow_UInt64__initialize", (DL_FUNC) &_arrow_UInt64__initialize, 0}, - { "_arrow_Float16__initialize", (DL_FUNC) &_arrow_Float16__initialize, 0}, - { "_arrow_Float32__initialize", (DL_FUNC) &_arrow_Float32__initialize, 0}, - { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, - { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, - { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, - { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0}, - { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, - { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, - { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, - { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0}, - { "_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0}, - { "_arrow_Decimal128Type__initialize", (DL_FUNC) &_arrow_Decimal128Type__initialize, 2}, - { "_arrow_FixedSizeBinary__initialize", (DL_FUNC) &_arrow_FixedSizeBinary__initialize, 1}, - { "_arrow_Timestamp__initialize", (DL_FUNC) &_arrow_Timestamp__initialize, 2}, - { "_arrow_Time32__initialize", (DL_FUNC) &_arrow_Time32__initialize, 1}, - { "_arrow_Time64__initialize", (DL_FUNC) &_arrow_Time64__initialize, 1}, - { "_arrow_list__", (DL_FUNC) &_arrow_list__, 1}, - { "_arrow_large_list__", (DL_FUNC) &_arrow_large_list__, 1}, - { "_arrow_fixed_size_list__", (DL_FUNC) &_arrow_fixed_size_list__, 2}, - { "_arrow_struct__", (DL_FUNC) &_arrow_struct__, 1}, - { "_arrow_DataType__ToString", (DL_FUNC) &_arrow_DataType__ToString, 1}, - { "_arrow_DataType__name", (DL_FUNC) &_arrow_DataType__name, 1}, - { "_arrow_DataType__Equals", (DL_FUNC) &_arrow_DataType__Equals, 2}, - { "_arrow_DataType__num_fields", (DL_FUNC) &_arrow_DataType__num_fields, 1}, - { "_arrow_DataType__fields", (DL_FUNC) &_arrow_DataType__fields, 1}, - { "_arrow_DataType__id", (DL_FUNC) &_arrow_DataType__id, 1}, - { "_arrow_ListType__ToString", (DL_FUNC) &_arrow_ListType__ToString, 1}, - { "_arrow_FixedWidthType__bit_width", (DL_FUNC) &_arrow_FixedWidthType__bit_width, 1}, - { "_arrow_DateType__unit", (DL_FUNC) &_arrow_DateType__unit, 1}, - { "_arrow_TimeType__unit", (DL_FUNC) &_arrow_TimeType__unit, 1}, - { "_arrow_DecimalType__precision", (DL_FUNC) &_arrow_DecimalType__precision, 1}, - { "_arrow_DecimalType__scale", (DL_FUNC) &_arrow_DecimalType__scale, 1}, - { "_arrow_TimestampType__timezone", (DL_FUNC) &_arrow_TimestampType__timezone, 1}, - { "_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1}, - { "_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3}, - { "_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1}, - { "_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1}, - { "_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1}, - { "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, - { "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, - { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, - { "_arrow_StructType__field_names", (DL_FUNC) &_arrow_StructType__field_names, 1}, - { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, - { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, - { "_arrow_LargeListType__value_field", (DL_FUNC) &_arrow_LargeListType__value_field, 1}, - { "_arrow_LargeListType__value_type", (DL_FUNC) &_arrow_LargeListType__value_type, 1}, - { "_arrow_FixedSizeListType__value_field", (DL_FUNC) &_arrow_FixedSizeListType__value_field, 1}, - { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1}, - { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1}, - { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, - { "_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1}, - { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, - { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, - { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, - { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, - { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, - { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, - { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, - { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, - { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, - { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, - { "_arrow_ipc___feather___Reader__schema", (DL_FUNC) &_arrow_ipc___feather___Reader__schema, 1}, - { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, - { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, - { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, - { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 2}, - { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, - { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, - { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1}, - { "_arrow_fs___FileInfo__set_type", (DL_FUNC) &_arrow_fs___FileInfo__set_type, 2}, - { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1}, - { "_arrow_fs___FileInfo__set_path", (DL_FUNC) &_arrow_fs___FileInfo__set_path, 2}, - { "_arrow_fs___FileInfo__size", (DL_FUNC) &_arrow_fs___FileInfo__size, 1}, - { "_arrow_fs___FileInfo__set_size", (DL_FUNC) &_arrow_fs___FileInfo__set_size, 2}, - { "_arrow_fs___FileInfo__base_name", (DL_FUNC) &_arrow_fs___FileInfo__base_name, 1}, - { "_arrow_fs___FileInfo__extension", (DL_FUNC) &_arrow_fs___FileInfo__extension, 1}, - { "_arrow_fs___FileInfo__mtime", (DL_FUNC) &_arrow_fs___FileInfo__mtime, 1}, - { "_arrow_fs___FileInfo__set_mtime", (DL_FUNC) &_arrow_fs___FileInfo__set_mtime, 2}, - { "_arrow_fs___FileSelector__base_dir", (DL_FUNC) &_arrow_fs___FileSelector__base_dir, 1}, - { "_arrow_fs___FileSelector__allow_not_found", (DL_FUNC) &_arrow_fs___FileSelector__allow_not_found, 1}, - { "_arrow_fs___FileSelector__recursive", (DL_FUNC) &_arrow_fs___FileSelector__recursive, 1}, - { "_arrow_fs___FileSelector__create", (DL_FUNC) &_arrow_fs___FileSelector__create, 3}, - { "_arrow_fs___FileSystem__GetTargetInfos_Paths", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_Paths, 2}, - { "_arrow_fs___FileSystem__GetTargetInfos_FileSelector", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_FileSelector, 2}, - { "_arrow_fs___FileSystem__CreateDir", (DL_FUNC) &_arrow_fs___FileSystem__CreateDir, 3}, - { "_arrow_fs___FileSystem__DeleteDir", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDir, 2}, - { "_arrow_fs___FileSystem__DeleteDirContents", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDirContents, 2}, - { "_arrow_fs___FileSystem__DeleteFile", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFile, 2}, - { "_arrow_fs___FileSystem__DeleteFiles", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFiles, 2}, - { "_arrow_fs___FileSystem__Move", (DL_FUNC) &_arrow_fs___FileSystem__Move, 3}, - { "_arrow_fs___FileSystem__CopyFile", (DL_FUNC) &_arrow_fs___FileSystem__CopyFile, 3}, - { "_arrow_fs___FileSystem__OpenInputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputStream, 2}, - { "_arrow_fs___FileSystem__OpenInputFile", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputFile, 2}, - { "_arrow_fs___FileSystem__OpenOutputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenOutputStream, 2}, - { "_arrow_fs___FileSystem__OpenAppendStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenAppendStream, 2}, - { "_arrow_fs___FileSystem__type_name", (DL_FUNC) &_arrow_fs___FileSystem__type_name, 1}, - { "_arrow_fs___LocalFileSystem__create", (DL_FUNC) &_arrow_fs___LocalFileSystem__create, 0}, - { "_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2}, - { "_arrow_fs___SubTreeFileSystem__base_fs", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_fs, 1}, - { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, - { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, - { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, - { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 12}, - { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, - { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, - { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, - { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, - { "_arrow_io___RandomAccessFile__GetSize", (DL_FUNC) &_arrow_io___RandomAccessFile__GetSize, 1}, - { "_arrow_io___RandomAccessFile__supports_zero_copy", (DL_FUNC) &_arrow_io___RandomAccessFile__supports_zero_copy, 1}, - { "_arrow_io___RandomAccessFile__Seek", (DL_FUNC) &_arrow_io___RandomAccessFile__Seek, 2}, - { "_arrow_io___RandomAccessFile__Tell", (DL_FUNC) &_arrow_io___RandomAccessFile__Tell, 1}, - { "_arrow_io___RandomAccessFile__Read0", (DL_FUNC) &_arrow_io___RandomAccessFile__Read0, 1}, - { "_arrow_io___RandomAccessFile__ReadAt", (DL_FUNC) &_arrow_io___RandomAccessFile__ReadAt, 3}, - { "_arrow_io___MemoryMappedFile__Create", (DL_FUNC) &_arrow_io___MemoryMappedFile__Create, 2}, - { "_arrow_io___MemoryMappedFile__Open", (DL_FUNC) &_arrow_io___MemoryMappedFile__Open, 2}, - { "_arrow_io___MemoryMappedFile__Resize", (DL_FUNC) &_arrow_io___MemoryMappedFile__Resize, 2}, - { "_arrow_io___ReadableFile__Open", (DL_FUNC) &_arrow_io___ReadableFile__Open, 1}, - { "_arrow_io___BufferReader__initialize", (DL_FUNC) &_arrow_io___BufferReader__initialize, 1}, - { "_arrow_io___Writable__write", (DL_FUNC) &_arrow_io___Writable__write, 2}, - { "_arrow_io___OutputStream__Tell", (DL_FUNC) &_arrow_io___OutputStream__Tell, 1}, - { "_arrow_io___FileOutputStream__Open", (DL_FUNC) &_arrow_io___FileOutputStream__Open, 1}, - { "_arrow_io___BufferOutputStream__Create", (DL_FUNC) &_arrow_io___BufferOutputStream__Create, 1}, - { "_arrow_io___BufferOutputStream__capacity", (DL_FUNC) &_arrow_io___BufferOutputStream__capacity, 1}, - { "_arrow_io___BufferOutputStream__Finish", (DL_FUNC) &_arrow_io___BufferOutputStream__Finish, 1}, - { "_arrow_io___BufferOutputStream__Tell", (DL_FUNC) &_arrow_io___BufferOutputStream__Tell, 1}, - { "_arrow_io___BufferOutputStream__Write", (DL_FUNC) &_arrow_io___BufferOutputStream__Write, 2}, - { "_arrow_json___ReadOptions__initialize", (DL_FUNC) &_arrow_json___ReadOptions__initialize, 2}, - { "_arrow_json___ParseOptions__initialize1", (DL_FUNC) &_arrow_json___ParseOptions__initialize1, 1}, - { "_arrow_json___ParseOptions__initialize2", (DL_FUNC) &_arrow_json___ParseOptions__initialize2, 2}, - { "_arrow_json___TableReader__Make", (DL_FUNC) &_arrow_json___TableReader__Make, 3}, - { "_arrow_json___TableReader__Read", (DL_FUNC) &_arrow_json___TableReader__Read, 1}, - { "_arrow_MemoryPool__default", (DL_FUNC) &_arrow_MemoryPool__default, 0}, - { "_arrow_MemoryPool__bytes_allocated", (DL_FUNC) &_arrow_MemoryPool__bytes_allocated, 1}, - { "_arrow_MemoryPool__max_memory", (DL_FUNC) &_arrow_MemoryPool__max_memory, 1}, - { "_arrow_MemoryPool__backend_name", (DL_FUNC) &_arrow_MemoryPool__backend_name, 1}, - { "_arrow_supported_memory_backends", (DL_FUNC) &_arrow_supported_memory_backends, 0}, - { "_arrow_ipc___Message__body_length", (DL_FUNC) &_arrow_ipc___Message__body_length, 1}, - { "_arrow_ipc___Message__metadata", (DL_FUNC) &_arrow_ipc___Message__metadata, 1}, - { "_arrow_ipc___Message__body", (DL_FUNC) &_arrow_ipc___Message__body, 1}, - { "_arrow_ipc___Message__Verify", (DL_FUNC) &_arrow_ipc___Message__Verify, 1}, - { "_arrow_ipc___Message__type", (DL_FUNC) &_arrow_ipc___Message__type, 1}, - { "_arrow_ipc___Message__Equals", (DL_FUNC) &_arrow_ipc___Message__Equals, 2}, - { "_arrow_ipc___ReadRecordBatch__Message__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__Message__Schema, 2}, - { "_arrow_ipc___ReadSchema_InputStream", (DL_FUNC) &_arrow_ipc___ReadSchema_InputStream, 1}, - { "_arrow_ipc___ReadSchema_Message", (DL_FUNC) &_arrow_ipc___ReadSchema_Message, 1}, - { "_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, - { "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, - { "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, - { "_arrow_parquet___arrow___ArrowReaderProperties__Make", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1}, - { "_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3}, - { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, - { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, - { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroup1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup1, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroup2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup2, 3}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroups1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups1, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroups2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups2, 3}, - { "_arrow_parquet___arrow___FileReader__num_rows", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_rows, 1}, - { "_arrow_parquet___arrow___FileReader__num_columns", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_columns, 1}, - { "_arrow_parquet___arrow___FileReader__num_row_groups", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_row_groups, 1}, - { "_arrow_parquet___arrow___FileReader__ReadColumn", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadColumn, 2}, - { "_arrow_parquet___ArrowWriterProperties___create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___create, 3}, - { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0}, - { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__version, 2}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2}, - { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1}, - { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4}, - { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3}, - { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, - { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, - { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, - { "_arrow_allocate_arrow_schema", (DL_FUNC) &_arrow_allocate_arrow_schema, 0}, - { "_arrow_delete_arrow_schema", (DL_FUNC) &_arrow_delete_arrow_schema, 1}, - { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0}, - { "_arrow_delete_arrow_array", (DL_FUNC) &_arrow_delete_arrow_array, 1}, - { "_arrow_allocate_arrow_array_stream", (DL_FUNC) &_arrow_allocate_arrow_array_stream, 0}, - { "_arrow_delete_arrow_array_stream", (DL_FUNC) &_arrow_delete_arrow_array_stream, 1}, - { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 2}, - { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 2}, - { "_arrow_ImportSchema", (DL_FUNC) &_arrow_ImportSchema, 1}, - { "_arrow_ImportField", (DL_FUNC) &_arrow_ImportField, 1}, - { "_arrow_ImportType", (DL_FUNC) &_arrow_ImportType, 1}, - { "_arrow_ImportRecordBatchReader", (DL_FUNC) &_arrow_ImportRecordBatchReader, 1}, - { "_arrow_ExportType", (DL_FUNC) &_arrow_ExportType, 2}, - { "_arrow_ExportField", (DL_FUNC) &_arrow_ExportField, 2}, - { "_arrow_ExportSchema", (DL_FUNC) &_arrow_ExportSchema, 2}, - { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 3}, - { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 3}, - { "_arrow_ExportRecordBatchReader", (DL_FUNC) &_arrow_ExportRecordBatchReader, 2}, - { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 3}, - { "_arrow_vec_to_arrow", (DL_FUNC) &_arrow_vec_to_arrow, 2}, - { "_arrow_DictionaryArray__FromArrays", (DL_FUNC) &_arrow_DictionaryArray__FromArrays, 3}, - { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, - { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, - { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, - { "_arrow_RecordBatch__RenameColumns", (DL_FUNC) &_arrow_RecordBatch__RenameColumns, 2}, - { "_arrow_RecordBatch__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_RecordBatch__ReplaceSchemaMetadata, 2}, - { "_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1}, - { "_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, - { "_arrow_RecordBatch__GetColumnByName", (DL_FUNC) &_arrow_RecordBatch__GetColumnByName, 2}, - { "_arrow_RecordBatch__SelectColumns", (DL_FUNC) &_arrow_RecordBatch__SelectColumns, 2}, - { "_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 3}, - { "_arrow_RecordBatch__AddColumn", (DL_FUNC) &_arrow_RecordBatch__AddColumn, 4}, - { "_arrow_RecordBatch__SetColumn", (DL_FUNC) &_arrow_RecordBatch__SetColumn, 4}, - { "_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2}, - { "_arrow_RecordBatch__column_name", (DL_FUNC) &_arrow_RecordBatch__column_name, 2}, - { "_arrow_RecordBatch__names", (DL_FUNC) &_arrow_RecordBatch__names, 1}, - { "_arrow_RecordBatch__Slice1", (DL_FUNC) &_arrow_RecordBatch__Slice1, 2}, - { "_arrow_RecordBatch__Slice2", (DL_FUNC) &_arrow_RecordBatch__Slice2, 3}, - { "_arrow_ipc___SerializeRecordBatch__Raw", (DL_FUNC) &_arrow_ipc___SerializeRecordBatch__Raw, 1}, - { "_arrow_ipc___ReadRecordBatch__InputStream__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__InputStream__Schema, 2}, - { "_arrow_RecordBatch__from_arrays", (DL_FUNC) &_arrow_RecordBatch__from_arrays, 2}, - { "_arrow_RecordBatchReader__schema", (DL_FUNC) &_arrow_RecordBatchReader__schema, 1}, - { "_arrow_RecordBatchReader__ReadNext", (DL_FUNC) &_arrow_RecordBatchReader__ReadNext, 1}, - { "_arrow_RecordBatchReader__batches", (DL_FUNC) &_arrow_RecordBatchReader__batches, 1}, - { "_arrow_Table__from_RecordBatchReader", (DL_FUNC) &_arrow_Table__from_RecordBatchReader, 1}, - { "_arrow_ipc___RecordBatchStreamReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__Open, 1}, - { "_arrow_ipc___RecordBatchFileReader__schema", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__schema, 1}, - { "_arrow_ipc___RecordBatchFileReader__num_record_batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__num_record_batches, 1}, - { "_arrow_ipc___RecordBatchFileReader__ReadRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__ReadRecordBatch, 2}, - { "_arrow_ipc___RecordBatchFileReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__Open, 1}, - { "_arrow_Table__from_RecordBatchFileReader", (DL_FUNC) &_arrow_Table__from_RecordBatchFileReader, 1}, - { "_arrow_ipc___RecordBatchFileReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__batches, 1}, - { "_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 2}, - { "_arrow_ipc___RecordBatchWriter__WriteTable", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteTable, 2}, - { "_arrow_ipc___RecordBatchWriter__Close", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__Close, 1}, - { "_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 4}, - { "_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 4}, - { "_arrow_Array__GetScalar", (DL_FUNC) &_arrow_Array__GetScalar, 2}, - { "_arrow_Scalar__ToString", (DL_FUNC) &_arrow_Scalar__ToString, 1}, - { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2}, - { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2}, - { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1}, - { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 2}, - { "_arrow_Scalar__is_valid", (DL_FUNC) &_arrow_Scalar__is_valid, 1}, - { "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1}, - { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, - { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, - { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, - { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, - { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, - { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, - { "_arrow_Schema__AddField", (DL_FUNC) &_arrow_Schema__AddField, 3}, - { "_arrow_Schema__SetField", (DL_FUNC) &_arrow_Schema__SetField, 3}, - { "_arrow_Schema__RemoveField", (DL_FUNC) &_arrow_Schema__RemoveField, 2}, - { "_arrow_Schema__GetFieldByName", (DL_FUNC) &_arrow_Schema__GetFieldByName, 2}, - { "_arrow_Schema__fields", (DL_FUNC) &_arrow_Schema__fields, 1}, - { "_arrow_Schema__field_names", (DL_FUNC) &_arrow_Schema__field_names, 1}, - { "_arrow_Schema__HasMetadata", (DL_FUNC) &_arrow_Schema__HasMetadata, 1}, - { "_arrow_Schema__metadata", (DL_FUNC) &_arrow_Schema__metadata, 1}, - { "_arrow_Schema__WithMetadata", (DL_FUNC) &_arrow_Schema__WithMetadata, 2}, - { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1}, - { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3}, - { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1}, - { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, - { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, - { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, - { "_arrow_Table__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_Table__ReplaceSchemaMetadata, 2}, - { "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, - { "_arrow_Table__field", (DL_FUNC) &_arrow_Table__field, 2}, - { "_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1}, - { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1}, - { "_arrow_Table__RenameColumns", (DL_FUNC) &_arrow_Table__RenameColumns, 2}, - { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2}, - { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3}, - { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 3}, - { "_arrow_Table__Validate", (DL_FUNC) &_arrow_Table__Validate, 1}, - { "_arrow_Table__ValidateFull", (DL_FUNC) &_arrow_Table__ValidateFull, 1}, - { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, - { "_arrow_Table__RemoveColumn", (DL_FUNC) &_arrow_Table__RemoveColumn, 2}, - { "_arrow_Table__AddColumn", (DL_FUNC) &_arrow_Table__AddColumn, 4}, - { "_arrow_Table__SetColumn", (DL_FUNC) &_arrow_Table__SetColumn, 4}, - { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2}, - { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1}, - { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2}, - { "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0}, - { "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1}, - { "_arrow_GetIOThreadPoolCapacity", (DL_FUNC) &_arrow_GetIOThreadPoolCapacity, 0}, - { "_arrow_SetIOThreadPoolCapacity", (DL_FUNC) &_arrow_SetIOThreadPoolCapacity, 1}, - { "_arrow_Array__infer_type", (DL_FUNC) &_arrow_Array__infer_type, 1}, - { "_arrow_Table__Reset", (DL_FUNC) &_arrow_Table__Reset, 1}, - { "_arrow_RecordBatch__Reset", (DL_FUNC) &_arrow_RecordBatch__Reset, 1}, + { "_arrow_is_altrep", (DL_FUNC) &_arrow_is_altrep, 1}, + { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, + { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, + { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, + { "_arrow_Array__IsValid", (DL_FUNC) &_arrow_Array__IsValid, 2}, + { "_arrow_Array__length", (DL_FUNC) &_arrow_Array__length, 1}, + { "_arrow_Array__offset", (DL_FUNC) &_arrow_Array__offset, 1}, + { "_arrow_Array__null_count", (DL_FUNC) &_arrow_Array__null_count, 1}, + { "_arrow_Array__type", (DL_FUNC) &_arrow_Array__type, 1}, + { "_arrow_Array__ToString", (DL_FUNC) &_arrow_Array__ToString, 1}, + { "_arrow_Array__type_id", (DL_FUNC) &_arrow_Array__type_id, 1}, + { "_arrow_Array__Equals", (DL_FUNC) &_arrow_Array__Equals, 2}, + { "_arrow_Array__ApproxEquals", (DL_FUNC) &_arrow_Array__ApproxEquals, 2}, + { "_arrow_Array__Diff", (DL_FUNC) &_arrow_Array__Diff, 2}, + { "_arrow_Array__data", (DL_FUNC) &_arrow_Array__data, 1}, + { "_arrow_Array__RangeEquals", (DL_FUNC) &_arrow_Array__RangeEquals, 5}, + { "_arrow_Array__View", (DL_FUNC) &_arrow_Array__View, 2}, + { "_arrow_Array__Validate", (DL_FUNC) &_arrow_Array__Validate, 1}, + { "_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1}, + { "_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1}, + { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, + { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, + { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, + { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, + { "_arrow_LargeListArray__value_type", (DL_FUNC) &_arrow_LargeListArray__value_type, 1}, + { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, + { "_arrow_LargeListArray__values", (DL_FUNC) &_arrow_LargeListArray__values, 1}, + { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, + { "_arrow_LargeListArray__value_length", (DL_FUNC) &_arrow_LargeListArray__value_length, 2}, + { "_arrow_FixedSizeListArray__value_length", (DL_FUNC) &_arrow_FixedSizeListArray__value_length, 2}, + { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, + { "_arrow_LargeListArray__value_offset", (DL_FUNC) &_arrow_LargeListArray__value_offset, 2}, + { "_arrow_FixedSizeListArray__value_offset", (DL_FUNC) &_arrow_FixedSizeListArray__value_offset, 2}, + { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1}, + { "_arrow_LargeListArray__raw_value_offsets", (DL_FUNC) &_arrow_LargeListArray__raw_value_offsets, 1}, + { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, + { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 2}, + { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, + { "_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 2}, + { "_arrow_ArrayData__get_type", (DL_FUNC) &_arrow_ArrayData__get_type, 1}, + { "_arrow_ArrayData__get_length", (DL_FUNC) &_arrow_ArrayData__get_length, 1}, + { "_arrow_ArrayData__get_null_count", (DL_FUNC) &_arrow_ArrayData__get_null_count, 1}, + { "_arrow_ArrayData__get_offset", (DL_FUNC) &_arrow_ArrayData__get_offset, 1}, + { "_arrow_ArrayData__buffers", (DL_FUNC) &_arrow_ArrayData__buffers, 1}, + { "_arrow_Buffer__is_mutable", (DL_FUNC) &_arrow_Buffer__is_mutable, 1}, + { "_arrow_Buffer__ZeroPadding", (DL_FUNC) &_arrow_Buffer__ZeroPadding, 1}, + { "_arrow_Buffer__capacity", (DL_FUNC) &_arrow_Buffer__capacity, 1}, + { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1}, + { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1}, + { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1}, + { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2}, + { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1}, + { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1}, + { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) &_arrow_ChunkedArray__num_chunks, 1}, + { "_arrow_ChunkedArray__chunk", (DL_FUNC) &_arrow_ChunkedArray__chunk, 2}, + { "_arrow_ChunkedArray__chunks", (DL_FUNC) &_arrow_ChunkedArray__chunks, 1}, + { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1}, + { "_arrow_ChunkedArray__Slice1", (DL_FUNC) &_arrow_ChunkedArray__Slice1, 2}, + { "_arrow_ChunkedArray__Slice2", (DL_FUNC) &_arrow_ChunkedArray__Slice2, 3}, + { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, + { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, + { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, + { "_arrow_ChunkedArray__ToString", (DL_FUNC) &_arrow_ChunkedArray__ToString, 1}, + { "_arrow_ChunkedArray__from_list", (DL_FUNC) &_arrow_ChunkedArray__from_list, 2}, + { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, + { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, + { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, + { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, + { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, + { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 1}, + { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 3}, + { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, + { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, + { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, + { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5}, + { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, + { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, + { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, + { "_arrow_compute__GetFunctionNames", (DL_FUNC) &_arrow_compute__GetFunctionNames, 0}, + { "_arrow_build_info", (DL_FUNC) &_arrow_build_info, 0}, + { "_arrow_runtime_info", (DL_FUNC) &_arrow_runtime_info, 0}, + { "_arrow_csv___WriteOptions__initialize", (DL_FUNC) &_arrow_csv___WriteOptions__initialize, 1}, + { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, + { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, + { "_arrow_csv___ReadOptions__column_names", (DL_FUNC) &_arrow_csv___ReadOptions__column_names, 1}, + { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, + { "_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, + { "_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, + { "_arrow_TimestampParser__kind", (DL_FUNC) &_arrow_TimestampParser__kind, 1}, + { "_arrow_TimestampParser__format", (DL_FUNC) &_arrow_TimestampParser__format, 1}, + { "_arrow_TimestampParser__MakeStrptime", (DL_FUNC) &_arrow_TimestampParser__MakeStrptime, 1}, + { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC) &_arrow_TimestampParser__MakeISO8601, 0}, + { "_arrow_csv___WriteCSV__Table", (DL_FUNC) &_arrow_csv___WriteCSV__Table, 3}, + { "_arrow_csv___WriteCSV__RecordBatch", (DL_FUNC) &_arrow_csv___WriteCSV__RecordBatch, 3}, + { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, + { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, + { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, + { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, + { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, + { "_arrow_dataset___InMemoryDataset__create", (DL_FUNC) &_arrow_dataset___InMemoryDataset__create, 1}, + { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, + { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, + { "_arrow_dataset___FileSystemDataset__filesystem", (DL_FUNC) &_arrow_dataset___FileSystemDataset__filesystem, 1}, + { "_arrow_dataset___FileSystemDataset__files", (DL_FUNC) &_arrow_dataset___FileSystemDataset__files, 1}, + { "_arrow_dataset___DatasetFactory__Finish1", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish1, 2}, + { "_arrow_dataset___DatasetFactory__Finish2", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish2, 2}, + { "_arrow_dataset___DatasetFactory__Inspect", (DL_FUNC) &_arrow_dataset___DatasetFactory__Inspect, 2}, + { "_arrow_dataset___UnionDatasetFactory__Make", (DL_FUNC) &_arrow_dataset___UnionDatasetFactory__Make, 1}, + { "_arrow_dataset___FileSystemDatasetFactory__Make0", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make0, 3}, + { "_arrow_dataset___FileSystemDatasetFactory__Make2", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make2, 4}, + { "_arrow_dataset___FileSystemDatasetFactory__Make1", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make1, 3}, + { "_arrow_dataset___FileSystemDatasetFactory__Make3", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make3, 4}, + { "_arrow_dataset___FileFormat__type_name", (DL_FUNC) &_arrow_dataset___FileFormat__type_name, 1}, + { "_arrow_dataset___FileFormat__DefaultWriteOptions", (DL_FUNC) &_arrow_dataset___FileFormat__DefaultWriteOptions, 1}, + { "_arrow_dataset___ParquetFileFormat__Make", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__Make, 2}, + { "_arrow_dataset___FileWriteOptions__type_name", (DL_FUNC) &_arrow_dataset___FileWriteOptions__type_name, 1}, + { "_arrow_dataset___ParquetFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___ParquetFileWriteOptions__update, 3}, + { "_arrow_dataset___IpcFileWriteOptions__update2", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update2, 4}, + { "_arrow_dataset___IpcFileWriteOptions__update1", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update1, 3}, + { "_arrow_dataset___CsvFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___CsvFileWriteOptions__update, 2}, + { "_arrow_dataset___IpcFileFormat__Make", (DL_FUNC) &_arrow_dataset___IpcFileFormat__Make, 0}, + { "_arrow_dataset___CsvFileFormat__Make", (DL_FUNC) &_arrow_dataset___CsvFileFormat__Make, 3}, + { "_arrow_dataset___FragmentScanOptions__type_name", (DL_FUNC) &_arrow_dataset___FragmentScanOptions__type_name, 1}, + { "_arrow_dataset___CsvFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___CsvFragmentScanOptions__Make, 2}, + { "_arrow_dataset___ParquetFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 3}, + { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning, 2}, + { "_arrow_dataset___DirectoryPartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning__MakeFactory, 2}, + { "_arrow_dataset___HivePartitioning", (DL_FUNC) &_arrow_dataset___HivePartitioning, 3}, + { "_arrow_dataset___HivePartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___HivePartitioning__MakeFactory, 2}, + { "_arrow_dataset___ScannerBuilder__ProjectNames", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectNames, 2}, + { "_arrow_dataset___ScannerBuilder__ProjectExprs", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectExprs, 3}, + { "_arrow_dataset___ScannerBuilder__Filter", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Filter, 2}, + { "_arrow_dataset___ScannerBuilder__UseThreads", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseThreads, 2}, + { "_arrow_dataset___ScannerBuilder__UseAsync", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseAsync, 2}, + { "_arrow_dataset___ScannerBuilder__BatchSize", (DL_FUNC) &_arrow_dataset___ScannerBuilder__BatchSize, 2}, + { "_arrow_dataset___ScannerBuilder__FragmentScanOptions", (DL_FUNC) &_arrow_dataset___ScannerBuilder__FragmentScanOptions, 2}, + { "_arrow_dataset___ScannerBuilder__schema", (DL_FUNC) &_arrow_dataset___ScannerBuilder__schema, 1}, + { "_arrow_dataset___ScannerBuilder__Finish", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Finish, 1}, + { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1}, + { "_arrow_dataset___Scanner__ScanBatches", (DL_FUNC) &_arrow_dataset___Scanner__ScanBatches, 1}, + { "_arrow_dataset___Scanner__ToRecordBatchReader", (DL_FUNC) &_arrow_dataset___Scanner__ToRecordBatchReader, 1}, + { "_arrow_dataset___Scanner__head", (DL_FUNC) &_arrow_dataset___Scanner__head, 2}, + { "_arrow_dataset___Scanner__schema", (DL_FUNC) &_arrow_dataset___Scanner__schema, 1}, + { "_arrow_dataset___ScanTask__get_batches", (DL_FUNC) &_arrow_dataset___ScanTask__get_batches, 1}, + { "_arrow_dataset___Dataset__Write", (DL_FUNC) &_arrow_dataset___Dataset__Write, 6}, + { "_arrow_dataset___Scanner__TakeRows", (DL_FUNC) &_arrow_dataset___Scanner__TakeRows, 2}, + { "_arrow_dataset___Scanner__CountRows", (DL_FUNC) &_arrow_dataset___Scanner__CountRows, 1}, + { "_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0}, + { "_arrow_Int16__initialize", (DL_FUNC) &_arrow_Int16__initialize, 0}, + { "_arrow_Int32__initialize", (DL_FUNC) &_arrow_Int32__initialize, 0}, + { "_arrow_Int64__initialize", (DL_FUNC) &_arrow_Int64__initialize, 0}, + { "_arrow_UInt8__initialize", (DL_FUNC) &_arrow_UInt8__initialize, 0}, + { "_arrow_UInt16__initialize", (DL_FUNC) &_arrow_UInt16__initialize, 0}, + { "_arrow_UInt32__initialize", (DL_FUNC) &_arrow_UInt32__initialize, 0}, + { "_arrow_UInt64__initialize", (DL_FUNC) &_arrow_UInt64__initialize, 0}, + { "_arrow_Float16__initialize", (DL_FUNC) &_arrow_Float16__initialize, 0}, + { "_arrow_Float32__initialize", (DL_FUNC) &_arrow_Float32__initialize, 0}, + { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, + { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, + { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, + { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0}, + { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, + { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, + { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, + { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0}, + { "_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0}, + { "_arrow_Decimal128Type__initialize", (DL_FUNC) &_arrow_Decimal128Type__initialize, 2}, + { "_arrow_FixedSizeBinary__initialize", (DL_FUNC) &_arrow_FixedSizeBinary__initialize, 1}, + { "_arrow_Timestamp__initialize", (DL_FUNC) &_arrow_Timestamp__initialize, 2}, + { "_arrow_Time32__initialize", (DL_FUNC) &_arrow_Time32__initialize, 1}, + { "_arrow_Time64__initialize", (DL_FUNC) &_arrow_Time64__initialize, 1}, + { "_arrow_list__", (DL_FUNC) &_arrow_list__, 1}, + { "_arrow_large_list__", (DL_FUNC) &_arrow_large_list__, 1}, + { "_arrow_fixed_size_list__", (DL_FUNC) &_arrow_fixed_size_list__, 2}, + { "_arrow_struct__", (DL_FUNC) &_arrow_struct__, 1}, + { "_arrow_DataType__ToString", (DL_FUNC) &_arrow_DataType__ToString, 1}, + { "_arrow_DataType__name", (DL_FUNC) &_arrow_DataType__name, 1}, + { "_arrow_DataType__Equals", (DL_FUNC) &_arrow_DataType__Equals, 2}, + { "_arrow_DataType__num_fields", (DL_FUNC) &_arrow_DataType__num_fields, 1}, + { "_arrow_DataType__fields", (DL_FUNC) &_arrow_DataType__fields, 1}, + { "_arrow_DataType__id", (DL_FUNC) &_arrow_DataType__id, 1}, + { "_arrow_ListType__ToString", (DL_FUNC) &_arrow_ListType__ToString, 1}, + { "_arrow_FixedWidthType__bit_width", (DL_FUNC) &_arrow_FixedWidthType__bit_width, 1}, + { "_arrow_DateType__unit", (DL_FUNC) &_arrow_DateType__unit, 1}, + { "_arrow_TimeType__unit", (DL_FUNC) &_arrow_TimeType__unit, 1}, + { "_arrow_DecimalType__precision", (DL_FUNC) &_arrow_DecimalType__precision, 1}, + { "_arrow_DecimalType__scale", (DL_FUNC) &_arrow_DecimalType__scale, 1}, + { "_arrow_TimestampType__timezone", (DL_FUNC) &_arrow_TimestampType__timezone, 1}, + { "_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1}, + { "_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3}, + { "_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1}, + { "_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1}, + { "_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1}, + { "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, + { "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, + { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, + { "_arrow_StructType__field_names", (DL_FUNC) &_arrow_StructType__field_names, 1}, + { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, + { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, + { "_arrow_LargeListType__value_field", (DL_FUNC) &_arrow_LargeListType__value_field, 1}, + { "_arrow_LargeListType__value_type", (DL_FUNC) &_arrow_LargeListType__value_type, 1}, + { "_arrow_FixedSizeListType__value_field", (DL_FUNC) &_arrow_FixedSizeListType__value_field, 1}, + { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1}, + { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1}, + { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, + { "_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1}, + { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, + { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, + { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, + { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, + { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, + { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, + { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, + { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, + { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, + { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, + { "_arrow_ipc___feather___Reader__schema", (DL_FUNC) &_arrow_ipc___feather___Reader__schema, 1}, + { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, + { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, + { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, + { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 2}, + { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, + { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, + { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1}, + { "_arrow_fs___FileInfo__set_type", (DL_FUNC) &_arrow_fs___FileInfo__set_type, 2}, + { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1}, + { "_arrow_fs___FileInfo__set_path", (DL_FUNC) &_arrow_fs___FileInfo__set_path, 2}, + { "_arrow_fs___FileInfo__size", (DL_FUNC) &_arrow_fs___FileInfo__size, 1}, + { "_arrow_fs___FileInfo__set_size", (DL_FUNC) &_arrow_fs___FileInfo__set_size, 2}, + { "_arrow_fs___FileInfo__base_name", (DL_FUNC) &_arrow_fs___FileInfo__base_name, 1}, + { "_arrow_fs___FileInfo__extension", (DL_FUNC) &_arrow_fs___FileInfo__extension, 1}, + { "_arrow_fs___FileInfo__mtime", (DL_FUNC) &_arrow_fs___FileInfo__mtime, 1}, + { "_arrow_fs___FileInfo__set_mtime", (DL_FUNC) &_arrow_fs___FileInfo__set_mtime, 2}, + { "_arrow_fs___FileSelector__base_dir", (DL_FUNC) &_arrow_fs___FileSelector__base_dir, 1}, + { "_arrow_fs___FileSelector__allow_not_found", (DL_FUNC) &_arrow_fs___FileSelector__allow_not_found, 1}, + { "_arrow_fs___FileSelector__recursive", (DL_FUNC) &_arrow_fs___FileSelector__recursive, 1}, + { "_arrow_fs___FileSelector__create", (DL_FUNC) &_arrow_fs___FileSelector__create, 3}, + { "_arrow_fs___FileSystem__GetTargetInfos_Paths", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_Paths, 2}, + { "_arrow_fs___FileSystem__GetTargetInfos_FileSelector", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_FileSelector, 2}, + { "_arrow_fs___FileSystem__CreateDir", (DL_FUNC) &_arrow_fs___FileSystem__CreateDir, 3}, + { "_arrow_fs___FileSystem__DeleteDir", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDir, 2}, + { "_arrow_fs___FileSystem__DeleteDirContents", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDirContents, 2}, + { "_arrow_fs___FileSystem__DeleteFile", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFile, 2}, + { "_arrow_fs___FileSystem__DeleteFiles", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFiles, 2}, + { "_arrow_fs___FileSystem__Move", (DL_FUNC) &_arrow_fs___FileSystem__Move, 3}, + { "_arrow_fs___FileSystem__CopyFile", (DL_FUNC) &_arrow_fs___FileSystem__CopyFile, 3}, + { "_arrow_fs___FileSystem__OpenInputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputStream, 2}, + { "_arrow_fs___FileSystem__OpenInputFile", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputFile, 2}, + { "_arrow_fs___FileSystem__OpenOutputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenOutputStream, 2}, + { "_arrow_fs___FileSystem__OpenAppendStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenAppendStream, 2}, + { "_arrow_fs___FileSystem__type_name", (DL_FUNC) &_arrow_fs___FileSystem__type_name, 1}, + { "_arrow_fs___LocalFileSystem__create", (DL_FUNC) &_arrow_fs___LocalFileSystem__create, 0}, + { "_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2}, + { "_arrow_fs___SubTreeFileSystem__base_fs", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_fs, 1}, + { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, + { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, + { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, + { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 12}, + { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, + { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, + { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, + { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, + { "_arrow_io___RandomAccessFile__GetSize", (DL_FUNC) &_arrow_io___RandomAccessFile__GetSize, 1}, + { "_arrow_io___RandomAccessFile__supports_zero_copy", (DL_FUNC) &_arrow_io___RandomAccessFile__supports_zero_copy, 1}, + { "_arrow_io___RandomAccessFile__Seek", (DL_FUNC) &_arrow_io___RandomAccessFile__Seek, 2}, + { "_arrow_io___RandomAccessFile__Tell", (DL_FUNC) &_arrow_io___RandomAccessFile__Tell, 1}, + { "_arrow_io___RandomAccessFile__Read0", (DL_FUNC) &_arrow_io___RandomAccessFile__Read0, 1}, + { "_arrow_io___RandomAccessFile__ReadAt", (DL_FUNC) &_arrow_io___RandomAccessFile__ReadAt, 3}, + { "_arrow_io___MemoryMappedFile__Create", (DL_FUNC) &_arrow_io___MemoryMappedFile__Create, 2}, + { "_arrow_io___MemoryMappedFile__Open", (DL_FUNC) &_arrow_io___MemoryMappedFile__Open, 2}, + { "_arrow_io___MemoryMappedFile__Resize", (DL_FUNC) &_arrow_io___MemoryMappedFile__Resize, 2}, + { "_arrow_io___ReadableFile__Open", (DL_FUNC) &_arrow_io___ReadableFile__Open, 1}, + { "_arrow_io___BufferReader__initialize", (DL_FUNC) &_arrow_io___BufferReader__initialize, 1}, + { "_arrow_io___Writable__write", (DL_FUNC) &_arrow_io___Writable__write, 2}, + { "_arrow_io___OutputStream__Tell", (DL_FUNC) &_arrow_io___OutputStream__Tell, 1}, + { "_arrow_io___FileOutputStream__Open", (DL_FUNC) &_arrow_io___FileOutputStream__Open, 1}, + { "_arrow_io___BufferOutputStream__Create", (DL_FUNC) &_arrow_io___BufferOutputStream__Create, 1}, + { "_arrow_io___BufferOutputStream__capacity", (DL_FUNC) &_arrow_io___BufferOutputStream__capacity, 1}, + { "_arrow_io___BufferOutputStream__Finish", (DL_FUNC) &_arrow_io___BufferOutputStream__Finish, 1}, + { "_arrow_io___BufferOutputStream__Tell", (DL_FUNC) &_arrow_io___BufferOutputStream__Tell, 1}, + { "_arrow_io___BufferOutputStream__Write", (DL_FUNC) &_arrow_io___BufferOutputStream__Write, 2}, + { "_arrow_json___ReadOptions__initialize", (DL_FUNC) &_arrow_json___ReadOptions__initialize, 2}, + { "_arrow_json___ParseOptions__initialize1", (DL_FUNC) &_arrow_json___ParseOptions__initialize1, 1}, + { "_arrow_json___ParseOptions__initialize2", (DL_FUNC) &_arrow_json___ParseOptions__initialize2, 2}, + { "_arrow_json___TableReader__Make", (DL_FUNC) &_arrow_json___TableReader__Make, 3}, + { "_arrow_json___TableReader__Read", (DL_FUNC) &_arrow_json___TableReader__Read, 1}, + { "_arrow_MemoryPool__default", (DL_FUNC) &_arrow_MemoryPool__default, 0}, + { "_arrow_MemoryPool__bytes_allocated", (DL_FUNC) &_arrow_MemoryPool__bytes_allocated, 1}, + { "_arrow_MemoryPool__max_memory", (DL_FUNC) &_arrow_MemoryPool__max_memory, 1}, + { "_arrow_MemoryPool__backend_name", (DL_FUNC) &_arrow_MemoryPool__backend_name, 1}, + { "_arrow_supported_memory_backends", (DL_FUNC) &_arrow_supported_memory_backends, 0}, + { "_arrow_ipc___Message__body_length", (DL_FUNC) &_arrow_ipc___Message__body_length, 1}, + { "_arrow_ipc___Message__metadata", (DL_FUNC) &_arrow_ipc___Message__metadata, 1}, + { "_arrow_ipc___Message__body", (DL_FUNC) &_arrow_ipc___Message__body, 1}, + { "_arrow_ipc___Message__Verify", (DL_FUNC) &_arrow_ipc___Message__Verify, 1}, + { "_arrow_ipc___Message__type", (DL_FUNC) &_arrow_ipc___Message__type, 1}, + { "_arrow_ipc___Message__Equals", (DL_FUNC) &_arrow_ipc___Message__Equals, 2}, + { "_arrow_ipc___ReadRecordBatch__Message__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__Message__Schema, 2}, + { "_arrow_ipc___ReadSchema_InputStream", (DL_FUNC) &_arrow_ipc___ReadSchema_InputStream, 1}, + { "_arrow_ipc___ReadSchema_Message", (DL_FUNC) &_arrow_ipc___ReadSchema_Message, 1}, + { "_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, + { "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, + { "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, + { "_arrow_parquet___arrow___ArrowReaderProperties__Make", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1}, + { "_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3}, + { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, + { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, + { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroup1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup1, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroup2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup2, 3}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroups1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups1, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroups2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups2, 3}, + { "_arrow_parquet___arrow___FileReader__num_rows", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_rows, 1}, + { "_arrow_parquet___arrow___FileReader__num_columns", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_columns, 1}, + { "_arrow_parquet___arrow___FileReader__num_row_groups", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_row_groups, 1}, + { "_arrow_parquet___arrow___FileReader__ReadColumn", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadColumn, 2}, + { "_arrow_parquet___ArrowWriterProperties___create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___create, 3}, + { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0}, + { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__version, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2}, + { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1}, + { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4}, + { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3}, + { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, + { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, + { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, + { "_arrow_allocate_arrow_schema", (DL_FUNC) &_arrow_allocate_arrow_schema, 0}, + { "_arrow_delete_arrow_schema", (DL_FUNC) &_arrow_delete_arrow_schema, 1}, + { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0}, + { "_arrow_delete_arrow_array", (DL_FUNC) &_arrow_delete_arrow_array, 1}, + { "_arrow_allocate_arrow_array_stream", (DL_FUNC) &_arrow_allocate_arrow_array_stream, 0}, + { "_arrow_delete_arrow_array_stream", (DL_FUNC) &_arrow_delete_arrow_array_stream, 1}, + { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 2}, + { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 2}, + { "_arrow_ImportSchema", (DL_FUNC) &_arrow_ImportSchema, 1}, + { "_arrow_ImportField", (DL_FUNC) &_arrow_ImportField, 1}, + { "_arrow_ImportType", (DL_FUNC) &_arrow_ImportType, 1}, + { "_arrow_ImportRecordBatchReader", (DL_FUNC) &_arrow_ImportRecordBatchReader, 1}, + { "_arrow_ExportType", (DL_FUNC) &_arrow_ExportType, 2}, + { "_arrow_ExportField", (DL_FUNC) &_arrow_ExportField, 2}, + { "_arrow_ExportSchema", (DL_FUNC) &_arrow_ExportSchema, 2}, + { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 3}, + { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 3}, + { "_arrow_ExportRecordBatchReader", (DL_FUNC) &_arrow_ExportRecordBatchReader, 2}, + { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 3}, + { "_arrow_vec_to_arrow", (DL_FUNC) &_arrow_vec_to_arrow, 2}, + { "_arrow_DictionaryArray__FromArrays", (DL_FUNC) &_arrow_DictionaryArray__FromArrays, 3}, + { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, + { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, + { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, + { "_arrow_RecordBatch__RenameColumns", (DL_FUNC) &_arrow_RecordBatch__RenameColumns, 2}, + { "_arrow_RecordBatch__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_RecordBatch__ReplaceSchemaMetadata, 2}, + { "_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1}, + { "_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, + { "_arrow_RecordBatch__GetColumnByName", (DL_FUNC) &_arrow_RecordBatch__GetColumnByName, 2}, + { "_arrow_RecordBatch__SelectColumns", (DL_FUNC) &_arrow_RecordBatch__SelectColumns, 2}, + { "_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 3}, + { "_arrow_RecordBatch__AddColumn", (DL_FUNC) &_arrow_RecordBatch__AddColumn, 4}, + { "_arrow_RecordBatch__SetColumn", (DL_FUNC) &_arrow_RecordBatch__SetColumn, 4}, + { "_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2}, + { "_arrow_RecordBatch__column_name", (DL_FUNC) &_arrow_RecordBatch__column_name, 2}, + { "_arrow_RecordBatch__names", (DL_FUNC) &_arrow_RecordBatch__names, 1}, + { "_arrow_RecordBatch__Slice1", (DL_FUNC) &_arrow_RecordBatch__Slice1, 2}, + { "_arrow_RecordBatch__Slice2", (DL_FUNC) &_arrow_RecordBatch__Slice2, 3}, + { "_arrow_ipc___SerializeRecordBatch__Raw", (DL_FUNC) &_arrow_ipc___SerializeRecordBatch__Raw, 1}, + { "_arrow_ipc___ReadRecordBatch__InputStream__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__InputStream__Schema, 2}, + { "_arrow_RecordBatch__from_arrays", (DL_FUNC) &_arrow_RecordBatch__from_arrays, 2}, + { "_arrow_RecordBatchReader__schema", (DL_FUNC) &_arrow_RecordBatchReader__schema, 1}, + { "_arrow_RecordBatchReader__ReadNext", (DL_FUNC) &_arrow_RecordBatchReader__ReadNext, 1}, + { "_arrow_RecordBatchReader__batches", (DL_FUNC) &_arrow_RecordBatchReader__batches, 1}, + { "_arrow_Table__from_RecordBatchReader", (DL_FUNC) &_arrow_Table__from_RecordBatchReader, 1}, + { "_arrow_ipc___RecordBatchStreamReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__Open, 1}, + { "_arrow_ipc___RecordBatchFileReader__schema", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__schema, 1}, + { "_arrow_ipc___RecordBatchFileReader__num_record_batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__num_record_batches, 1}, + { "_arrow_ipc___RecordBatchFileReader__ReadRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__ReadRecordBatch, 2}, + { "_arrow_ipc___RecordBatchFileReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__Open, 1}, + { "_arrow_Table__from_RecordBatchFileReader", (DL_FUNC) &_arrow_Table__from_RecordBatchFileReader, 1}, + { "_arrow_ipc___RecordBatchFileReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__batches, 1}, + { "_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 2}, + { "_arrow_ipc___RecordBatchWriter__WriteTable", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteTable, 2}, + { "_arrow_ipc___RecordBatchWriter__Close", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__Close, 1}, + { "_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 4}, + { "_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 4}, + { "_arrow_Array__GetScalar", (DL_FUNC) &_arrow_Array__GetScalar, 2}, + { "_arrow_Scalar__ToString", (DL_FUNC) &_arrow_Scalar__ToString, 1}, + { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2}, + { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2}, + { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1}, + { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 2}, + { "_arrow_Scalar__is_valid", (DL_FUNC) &_arrow_Scalar__is_valid, 1}, + { "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1}, + { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, + { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, + { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, + { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, + { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, + { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, + { "_arrow_Schema__AddField", (DL_FUNC) &_arrow_Schema__AddField, 3}, + { "_arrow_Schema__SetField", (DL_FUNC) &_arrow_Schema__SetField, 3}, + { "_arrow_Schema__RemoveField", (DL_FUNC) &_arrow_Schema__RemoveField, 2}, + { "_arrow_Schema__GetFieldByName", (DL_FUNC) &_arrow_Schema__GetFieldByName, 2}, + { "_arrow_Schema__fields", (DL_FUNC) &_arrow_Schema__fields, 1}, + { "_arrow_Schema__field_names", (DL_FUNC) &_arrow_Schema__field_names, 1}, + { "_arrow_Schema__HasMetadata", (DL_FUNC) &_arrow_Schema__HasMetadata, 1}, + { "_arrow_Schema__metadata", (DL_FUNC) &_arrow_Schema__metadata, 1}, + { "_arrow_Schema__WithMetadata", (DL_FUNC) &_arrow_Schema__WithMetadata, 2}, + { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1}, + { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3}, + { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1}, + { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, + { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, + { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, + { "_arrow_Table__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_Table__ReplaceSchemaMetadata, 2}, + { "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, + { "_arrow_Table__field", (DL_FUNC) &_arrow_Table__field, 2}, + { "_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1}, + { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1}, + { "_arrow_Table__RenameColumns", (DL_FUNC) &_arrow_Table__RenameColumns, 2}, + { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2}, + { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3}, + { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 3}, + { "_arrow_Table__Validate", (DL_FUNC) &_arrow_Table__Validate, 1}, + { "_arrow_Table__ValidateFull", (DL_FUNC) &_arrow_Table__ValidateFull, 1}, + { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, + { "_arrow_Table__RemoveColumn", (DL_FUNC) &_arrow_Table__RemoveColumn, 2}, + { "_arrow_Table__AddColumn", (DL_FUNC) &_arrow_Table__AddColumn, 4}, + { "_arrow_Table__SetColumn", (DL_FUNC) &_arrow_Table__SetColumn, 4}, + { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2}, + { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1}, + { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2}, + { "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0}, + { "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1}, + { "_arrow_GetIOThreadPoolCapacity", (DL_FUNC) &_arrow_GetIOThreadPoolCapacity, 0}, + { "_arrow_SetIOThreadPoolCapacity", (DL_FUNC) &_arrow_SetIOThreadPoolCapacity, 1}, + { "_arrow_Array__infer_type", (DL_FUNC) &_arrow_Array__infer_type, 1}, + { "_arrow_Table__Reset", (DL_FUNC) &_arrow_Table__Reset, 1}, + { "_arrow_RecordBatch__Reset", (DL_FUNC) &_arrow_RecordBatch__Reset, 1}, {NULL, NULL, 0} }; extern "C" void R_init_arrow(DllInfo* dll){ @@ -7490,3 +7491,5 @@ extern "C" void R_init_arrow(DllInfo* dll){ #endif } + + diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 3d0bbca63d2..cab1a09c6ae 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -57,12 +57,22 @@ std::shared_ptr MakeExecNodeOrStop( // [[arrow::export]] std::shared_ptr ExecPlan_run( const std::shared_ptr& plan, - const std::shared_ptr& final_node) { + const std::shared_ptr& final_node, cpp11::list sort_options) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; - MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, - compute::SinkNodeOptions{&sink_gen}); + + // Sorting uses a different sink node; there is no general sort yet + if (sort_options.size() > 0) { + MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, + compute::OrderBySinkNodeOptions{ + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); + } else { + MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, + compute::SinkNodeOptions{&sink_gen}); + } StopIfNotOk(plan->Validate()); StopIfNotOk(plan->StartProducing()); diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index b027dc98702..57569be50fe 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -133,7 +133,7 @@ test_that("Simple interface for datasets", { # Collecting virtual partition column works expect_equal( - collect(ds) %>% pull(part), + ds %>% pull(part), c(rep(1, 10), rep(2, 10)) ) }) @@ -1728,4 +1728,4 @@ test_that("Error if no format specified and files are not parquet", { "Did you mean to specify a 'format'" ) ) -}) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 1aa30654495..e0c5b10d5be 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -85,7 +85,6 @@ test_that("Group by sum on dataset", { input %>% group_by(some_grouping) %>% summarize(total = sum(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -94,7 +93,6 @@ test_that("Group by sum on dataset", { input %>% group_by(some_grouping) %>% summarize(total = sum(int * 4, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -103,7 +101,6 @@ test_that("Group by sum on dataset", { input %>% group_by(some_grouping) %>% summarize(total = sum(int)) %>% - arrange(some_grouping) %>% collect(), tbl, ) @@ -195,7 +192,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(any(lgl, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -203,7 +199,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(all(lgl, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -211,7 +206,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(any(lgl, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -219,7 +213,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(all(lgl, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -229,7 +222,6 @@ test_that("Group by any/all", { mutate(has_words = nchar(verses) < 0) %>% group_by(some_grouping) %>% summarize(any(has_words, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -238,7 +230,6 @@ test_that("Group by any/all", { mutate(has_words = nchar(verses) < 0) %>% group_by(some_grouping) %>% summarize(all(has_words, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -246,7 +237,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(has_words = all(nchar(verses) < 0, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -291,7 +281,6 @@ test_that("Filter and aggregate", { filter(some_grouping == 2) %>% group_by(some_grouping) %>% summarize(total = sum(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -301,7 +290,6 @@ test_that("Filter and aggregate", { filter(int > 5) %>% group_by(some_grouping) %>% summarize(total = sum(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -316,7 +304,7 @@ test_that("Expressions on aggregations", { any = any(lgl), all = all(lgl) ) %>% - arrange(some_grouping) %>% + collect() %>% transmute(some = any & !all) %>% collect(), tbl @@ -327,7 +315,6 @@ test_that("Expressions on aggregations", { input %>% group_by(some_grouping) %>% summarize(any(lgl) & !all(lgl)) %>% - arrange(some_grouping) %>% collect(), tbl ) From d75f4bdb43bfb2f38e1f9acc9737d0930b7bdda5 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 24 Aug 2021 16:53:05 -0400 Subject: [PATCH 03/27] Cleanup --- r/R/query-engine.R | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index c595dd27df3..ec5ff637211 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -99,12 +99,6 @@ ExecPlan <- R6Class("ExecPlan", } } - # tab <- tab[ - # tab$SortIndices(names(x$arrange_vars), x$arrange_desc), - # names(x$selected_columns), # this omits x$temp_columns from the result - # drop = FALSE - # ] - # Apply sorting: this is currently not an ExecNode itself, it is a # sink node option. # TODO: error if doing a subsequent operation that would throw away sorting! @@ -152,4 +146,4 @@ ExecNode <- R6Class("ExecNode", ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) } ) -) \ No newline at end of file +) From 7e9aa033685649fc66e3508047253918169768be Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 24 Aug 2021 17:02:57 -0400 Subject: [PATCH 04/27] Add some comments --- r/R/query-engine.R | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index ec5ff637211..824ad2ea22c 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -58,13 +58,15 @@ ExecPlan <- R6Class("ExecPlan", ExecNode_Scan(self, dataset, filter, colnames %||% character(0)) }, Build = function(.data) { + # This method takes an arrow_dplyr_query and chains together the + # ExecNodes that they produce. It does not evaluate them--that is Run(). group_vars <- dplyr::group_vars(.data) grouped <- length(group_vars) > 0 # Collect the target names first because we have to add back the group vars target_names <- names(.data) .data <- ensure_group_vars(.data) - .data <- ensure_arrange_vars(.data) # this sets x$temp_columns + .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns node <- self$Scan(.data) # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again @@ -96,6 +98,12 @@ ExecPlan <- R6Class("ExecPlan", node <- node$Project( make_field_refs(c(group_vars, names(.data$aggregations))) ) + # Add sorting instructions for the rows too to match dplyr + # (see below about why sorting isn't itself a Node) + node$sort <- list( + names = group_vars, + orders = rep(0L, length(group_vars)) + ) } } @@ -108,11 +116,6 @@ ExecPlan <- R6Class("ExecPlan", orders = as.integer(.data$arrange_desc), temp_columns = names(.data$temp_columns) ) - } else if (length(.data$aggregations) && grouped) { - node$sort <- list( - names = group_vars, - orders = rep(0L, length(group_vars)) - ) } node }, @@ -129,6 +132,9 @@ ExecPlan$create <- function(use_threads = option_use_threads()) { ExecNode <- R6Class("ExecNode", inherit = ArrowObject, public = list( + # `sort` is a slight hack to be able to keep around arrange() params, + # which don't currently yield their own ExecNode but rather are consumed + # in the SinkNode (in ExecPlan$run()) sort = NULL, Project = function(cols) { if (length(cols)) { @@ -146,4 +152,4 @@ ExecNode <- R6Class("ExecNode", ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) } ) -) +) \ No newline at end of file From 2cb3ea600a9ea5da77e716c85d427e015601befb Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 26 Aug 2021 16:03:47 -0400 Subject: [PATCH 05/27] summarize now doesn't evaluate --- r/R/dplyr-summarize.R | 5 ++--- r/tests/testthat/test-dplyr-aggregate.R | 15 +++++---------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 9a0cfc5bf91..5ff2ffc5399 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -81,6 +81,5 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { names(inputs) <- as.character(seq_along(inputs)) .data$selected_columns <- inputs - # Eventually, we will return .data here if (dataset) but do it eagerly now - do_exec_plan(.data) -} + .data +} \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index e0c5b10d5be..3f848f22a1e 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -51,7 +51,6 @@ test_that("summarize", { }) test_that("summarize() doesn't evaluate eagerly", { - skip("TODO") expect_s3_class( Table$create(tbl) %>% summarize(total = sum(int)), @@ -60,7 +59,7 @@ test_that("summarize() doesn't evaluate eagerly", { expect_r6_class( Table$create(tbl) %>% summarize(total = sum(int)) %>% - collect(), + compute(), "ArrowTabular" ) }) @@ -111,7 +110,6 @@ test_that("Group by mean on dataset", { input %>% group_by(some_grouping) %>% summarize(mean = mean(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -120,7 +118,6 @@ test_that("Group by mean on dataset", { input %>% group_by(some_grouping) %>% summarize(mean = mean(int, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -131,7 +128,6 @@ test_that("Group by sd on dataset", { input %>% group_by(some_grouping) %>% summarize(sd = sd(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -141,7 +137,6 @@ test_that("Group by sd on dataset", { input %>% group_by(some_grouping) %>% summarize(sd = sd(int, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -152,7 +147,6 @@ test_that("Group by var on dataset", { input %>% group_by(some_grouping) %>% summarize(var = var(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -162,7 +156,6 @@ test_that("Group by var on dataset", { input %>% group_by(some_grouping) %>% summarize(var = var(int, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -304,8 +297,10 @@ test_that("Expressions on aggregations", { any = any(lgl), all = all(lgl) ) %>% - collect() %>% - transmute(some = any & !all) %>% + compute() %>% + ungroup() %>% # TODO: loosen the restriction on mutate after group_by + mutate(some = any & !all) %>% + select(some_grouping, some) %>% collect(), tbl ) From 9a2cde5a45a65b5b8d072dc035dff724bde16077 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 26 Aug 2021 16:24:49 -0400 Subject: [PATCH 06/27] Make dataset tests not assume row order --- r/tests/testthat/test-dataset.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 57569be50fe..f7a41434e03 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -133,7 +133,7 @@ test_that("Simple interface for datasets", { # Collecting virtual partition column works expect_equal( - ds %>% pull(part), + ds %>% arrange(part) %>% pull(part), c(rep(1, 10), rep(2, 10)) ) }) @@ -348,7 +348,7 @@ test_that("IPC/Feather format data", { # Collecting virtual partition column works expect_equal( - collect(ds) %>% pull(part), + ds %>% arrange(part) %>% pull(part), c(rep(3, 10), rep(4, 10)) ) }) @@ -376,7 +376,7 @@ test_that("CSV dataset", { ) # Collecting virtual partition column works expect_equal( - collect(ds) %>% pull(part), + collect(ds) %>% arrange(part) %>% pull(part), c(rep(5, 10), rep(6, 10)) ) }) @@ -804,7 +804,7 @@ test_that("filter scalar validation doesn't crash (ARROW-7772)", { test_that("collect() on Dataset works (if fits in memory)", { skip_if_not_available("parquet") expect_equal( - collect(open_dataset(dataset_dir)), + collect(open_dataset(dataset_dir)) %>% arrange(int), rbind(df1, df2) ) }) From a1cd90f5e1de2c2c2e6dbb6e7981b10af68e8baa Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 26 Aug 2021 16:52:51 -0400 Subject: [PATCH 07/27] Add support for derived grouping columns in summarize --- r/R/dplyr-summarize.R | 2 +- r/tests/testthat/test-dplyr-aggregate.R | 40 ++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 5ff2ffc5399..764db6d7146 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -79,7 +79,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # This is essentially a projection, and the column names don't matter # (but must exist) names(inputs) <- as.character(seq_along(inputs)) - .data$selected_columns <- inputs + .data$selected_columns <- c(inputs, .data$selected_columns[.data$group_by_vars]) .data } \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 3f848f22a1e..21b4501e3cd 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -161,7 +161,7 @@ test_that("Group by var on dataset", { ) }) -test_that("n()", { +test_that("Group by any/all", { withr::local_options(list(arrow.debug = TRUE)) expect_dplyr_equal( input %>% @@ -288,6 +288,44 @@ test_that("Filter and aggregate", { ) }) +test_that("Group by edge cases", { + expect_dplyr_equal( + input %>% + group_by(some_grouping * 2) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + group_by(alt = some_grouping * 2) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) +}) + +test_that("Do things after summarize", { + group2_sum <- tbl %>% + group_by(some_grouping) %>% + filter(int > 5) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + pull() %>% + tail(1) + + skip("WIP") + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + filter(int > 5) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + filter(total == group2_sum) %>% + collect() %>% print(), + tbl + ) +}) + test_that("Expressions on aggregations", { # This is what it effectively is expect_dplyr_equal( From 90612b5f6c77b55400ef623b9411a3db2b8c7482 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 27 Aug 2021 13:24:55 -0400 Subject: [PATCH 08/27] summarize() collapses the query and we can do things on it after --- r/NAMESPACE | 1 + r/R/arrow-package.R | 2 +- r/R/arrowExports.R | 5 +- r/R/dplyr-functions.R | 15 +++++ r/R/dplyr-summarize.R | 9 ++- r/R/dplyr.R | 38 +++++++++-- r/R/expression.R | 3 + r/R/query-engine.R | 15 ++++- r/src/arrowExports.cpp | 17 +++++ r/src/expression.cpp | 5 ++ r/tests/testthat/test-dplyr-aggregate.R | 4 +- r/tests/testthat/test-dplyr-collapse.R | 84 +++++++++++++++++++++++++ 12 files changed, 180 insertions(+), 18 deletions(-) create mode 100644 r/tests/testthat/test-dplyr-collapse.R diff --git a/r/NAMESPACE b/r/NAMESPACE index 8ce6d162eb0..5e78d04de52 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -291,6 +291,7 @@ importFrom(bit64,print.integer64) importFrom(bit64,str.integer64) importFrom(methods,as) importFrom(purrr,as_mapper) +importFrom(purrr,imap) importFrom(purrr,imap_chr) importFrom(purrr,keep) importFrom(purrr,map) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 537eebb1b1d..04f01faf268 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -17,7 +17,7 @@ #' @importFrom stats quantile median na.omit na.exclude na.pass na.fail #' @importFrom R6 R6Class -#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap_chr +#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr #' @importFrom assertthat assert_that is.string #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind as_label set_names exec diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index ce6d2e872d4..b852a3d8ca9 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -816,6 +816,10 @@ FixedSizeListType__list_size <- function(type) { .Call(`_arrow_FixedSizeListType__list_size`, type) } +compute___expr__equals <- function(lhs, rhs) { + .Call(`_arrow_compute___expr__equals`, lhs, rhs) +} + compute___expr__call <- function(func_name, argument_list, options) { .Call(`_arrow_compute___expr__call`, func_name, argument_list, options) } @@ -1767,4 +1771,3 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } - diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index e535546dd1b..2247d22f368 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -840,3 +840,18 @@ agg_funcs$n <- function() { options = list() ) } + +output_type <- function(fun, input_type) { + # These are quick and dirty heuristics. + if (fun %in% c("any", "all")) { + bool() + } else if (fun %in% "sum") { + # It may upcast to a bigger type but this is close enough + input_type + } else if (fun %in% c("mean", "stddev", "variance")) { + float64() + } else { + # Just so things don't error, assume the resulting type is the same + input_type + } +} \ No newline at end of file diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 764db6d7146..8ca4757a4e2 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -72,14 +72,13 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # Should we: mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] } + # TODO: Should summarize just record the aggregations and leave this projection etc. to do_exec_plan? # Now, from that, split out the data (expressions) and options .data$aggregations <- lapply(results, function(x) x[c("fun", "options")]) - inputs <- lapply(results, function(x) x$data) - # This is essentially a projection, and the column names don't matter - # (but must exist) - names(inputs) <- as.character(seq_along(inputs)) + # TODO: validate that none of names(inputs) are the same as names(group_by_vars) + # dplyr does not error on this but the result it gives isn't great .data$selected_columns <- c(inputs, .data$selected_columns[.data$group_by_vars]) - .data + do_collapse(.data) } \ No newline at end of file diff --git a/r/R/dplyr.R b/r/R/dplyr.R index c3029a114c3..f7e170415d1 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -43,18 +43,21 @@ arrow_dplyr_query <- function(.data) { )) } + .adq(.data) +} + +.adq <- function(.data) { + if (!inherits(.data, c("Dataset", "arrow_dplyr_query"))) { + .data <- InMemoryDataset$create(.data) + } structure( list( - .data = if (inherits(.data, "Dataset")) { - .data$clone() - } else { - InMemoryDataset$create(.data) - }, + .data = .data, # selected_columns is a named list: # * contents are references/expressions pointing to the data # * names are the names they should be in the end (i.e. this # records any renaming) - selected_columns = make_field_refs(names(.data)), + selected_columns = make_field_refs(names(.data$schema)), # filtered_rows will be an Expression filtered_rows = TRUE, # group_by_vars is a character vector of columns (as renamed) @@ -75,6 +78,29 @@ arrow_dplyr_query <- function(.data) { ) } +do_collapse <- function(.data) { + .data$schema <- implicit_schema(.data) + .adq(.data) +} + +implicit_schema <- function(.data) { + # c(.data$group_by_vars, names(.data$aggregations)) + .data <- ensure_group_vars(.data) + old_schm <- .data$.data$schema + new_fields <- map(.data$selected_columns, ~ .$type(old_schm)) + if (is.null(.data$aggregations)) { + return(schema(!!!new_fields)) + } + # * Put group_by_vars first (this can't be done by summarize, they have to be last per the aggregate node signature, and they get projected to this order after aggregation) + # * Infer the output types from the aggregations + group_fields <- new_fields[.data$group_by_vars] + agg_fields <- imap( + new_fields[setdiff(names(new_fields), .data$group_by_vars)], + ~ output_type(.data$aggregations[[.y]][["fun"]], .x) + ) + schema(!!!c(group_fields, agg_fields)) +} + make_field_refs <- function(field_names) { set_names(lapply(field_names, Expression$field_ref), field_names) } diff --git a/r/R/expression.R b/r/R/expression.R index aa9af9270c9..82e21ccf2e1 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -125,6 +125,9 @@ Expression <- R6Class("Expression", inherit = ArrowObject, public = list( ToString = function() compute___expr__ToString(self), + Equals = function(other, ...) { + inherits(other, "Expression") && compute___expr__equals(self, other) + }, # TODO: Implement type determination without storing # schemas in Expression objects (ARROW-13186) schema = NULL, diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 824ad2ea22c..58b3f1ab28e 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -68,13 +68,22 @@ ExecPlan <- R6Class("ExecPlan", .data <- ensure_group_vars(.data) .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns - node <- self$Scan(.data) + if (inherits(.data$.data, "arrow_dplyr_query")) { + # We have a nested query. Recurse. + node <- self$Build(.data$.data) + } else { + node <- self$Scan(.data) + } + # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again if (inherits(.data$filtered_rows, "Expression")) { node <- node$Filter(.data$filtered_rows) } - # If any columns are derived we need to Project (otherwise this may be no-op) - node <- node$Project(c(.data$selected_columns, .data$temp_columns)) + # If any columns are derived, reordered, or renamed we need to Project + projection <- c(.data$selected_columns, .data$temp_columns) + if (!isTRUE(all.equal(projection, make_field_refs(names(.data$.data$schema))))) { + node <- node$Project(projection) + } if (length(.data$aggregations)) { if (grouped) { diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index de8ca36af6c..f33b81c08f0 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3174,6 +3174,22 @@ extern "C" SEXP _arrow_FixedSizeListType__list_size(SEXP type_sexp){ } #endif +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +bool compute___expr__equals(const std::shared_ptr& lhs, const std::shared_ptr& rhs); +extern "C" SEXP _arrow_compute___expr__equals(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type lhs(lhs_sexp); + arrow::r::Input&>::type rhs(rhs_sexp); + return cpp11::as_sexp(compute___expr__equals(lhs, rhs)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_compute___expr__equals(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call compute___expr__equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // expression.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr compute___expr__call(std::string func_name, cpp11::list argument_list, cpp11::list options); @@ -7240,6 +7256,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_FixedSizeListType__value_field", (DL_FUNC) &_arrow_FixedSizeListType__value_field, 1}, { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1}, { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1}, + { "_arrow_compute___expr__equals", (DL_FUNC) &_arrow_compute___expr__equals, 2}, { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, { "_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1}, { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, diff --git a/r/src/expression.cpp b/r/src/expression.cpp index 3fcba46e911..97a8a746bba 100644 --- a/r/src/expression.cpp +++ b/r/src/expression.cpp @@ -27,6 +27,11 @@ namespace compute = ::arrow::compute; std::shared_ptr make_compute_options(std::string func_name, cpp11::list options); +// [[arrow::export]] +bool compute___expr__equals(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + return lhs->Equals(*rhs); +} // [[arrow::export]] std::shared_ptr compute___expr__call(std::string func_name, cpp11::list argument_list, diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-aggregate.R index 21b4501e3cd..32ee3cd20ac 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-aggregate.R @@ -314,14 +314,14 @@ test_that("Do things after summarize", { pull() %>% tail(1) - skip("WIP") expect_dplyr_equal( input %>% group_by(some_grouping) %>% filter(int > 5) %>% summarize(total = sum(int, na.rm = TRUE)) %>% filter(total == group2_sum) %>% - collect() %>% print(), + mutate(extra = total * 5) %>% + collect(), tbl ) }) diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R new file mode 100644 index 00000000000..84287976ced --- /dev/null +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") +tbl$some_grouping <- rep(c(1, 2), 5) + +tab <- Table$create(tbl) + +test_that("implicit_schema with select", { + expect_equal( + tab %>% + select(int, lgl) %>% + implicit_schema(), + schema(int = int32(), lgl = bool()) + ) +}) + +test_that("implicit_schema with rename", { + expect_equal( + tab %>% + select(numbers = int, lgl) %>% + implicit_schema(), + schema(numbers = int32(), lgl = bool()) + ) +}) + +test_that("implicit_schema with mutate", { + expect_equal( + tab %>% + transmute( + numbers = int * 4, + words = as.character(int) + ) %>% + implicit_schema(), + schema(numbers = float64(), words = utf8()) + ) +}) + +test_that("implicit_schema with summarize", { + expect_equal( + tab %>% + summarize( + avg = mean(int) + ) %>% + implicit_schema(), + schema(avg = float64()) + ) +}) + +test_that("implicit_schema with group_by summarize", { + expect_equal( + tab %>% + group_by(some_grouping) %>% + summarize( + avg = mean(int * 5L) + ) %>% + implicit_schema(), + schema(some_grouping = float64(), avg = float64()) + ) +}) \ No newline at end of file From f6cf638cd2cec9789bfa7c0a4e3bcec6e36ea65e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 27 Aug 2021 13:32:03 -0400 Subject: [PATCH 09/27] Rename test file --- .../testthat/{test-dplyr-aggregate.R => test-dplyr-summarize.R} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename r/tests/testthat/{test-dplyr-aggregate.R => test-dplyr-summarize.R} (99%) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-summarize.R similarity index 99% rename from r/tests/testthat/test-dplyr-aggregate.R rename to r/tests/testthat/test-dplyr-summarize.R index 32ee3cd20ac..ec9d281e053 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -343,7 +343,7 @@ test_that("Expressions on aggregations", { tbl ) # More concisely: - skip("Not implemented") + skip("TODO: ARROW-13778") expect_dplyr_equal( input %>% group_by(some_grouping) %>% From bd6e3632f7183fc00987634e4e701a03458360e4 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 27 Aug 2021 14:11:07 -0400 Subject: [PATCH 10/27] Refactor and fix tests --- r/R/dplyr-mutate.R | 15 ++++++---- r/R/dplyr-summarize.R | 24 ++++++---------- r/R/dplyr.R | 25 +++++++++-------- r/R/query-engine.R | 41 ++++++++++++++++++++-------- r/tests/testthat/test-dplyr-filter.R | 2 +- 5 files changed, 62 insertions(+), 45 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index f19505c1958..f2df4a078a4 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -24,7 +24,7 @@ mutate.arrow_dplyr_query <- function(.data, .before = NULL, .after = NULL) { call <- match.call() - exprs <- quos(...) + exprs <- ensure_named_exprs(quos(...)) .keep <- match.arg(.keep) .before <- enquo(.before) @@ -45,11 +45,6 @@ mutate.arrow_dplyr_query <- function(.data, return(abandon_ship(call, .data, "mutate() on grouped data not supported in Arrow")) } - # Check for unnamed expressions and fix if any - unnamed <- !nzchar(names(exprs)) - # Deparse and take the first element in case they're long expressions - names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) - mask <- arrow_mask(.data) results <- list() for (i in seq_along(exprs)) { @@ -133,3 +128,11 @@ check_transmute_args <- function(..., .keep, .before, .after) { } enquos(...) } + +ensure_named_exprs <- function(exprs) { + # Check for unnamed expressions and fix if any + unnamed <- !nzchar(names(exprs)) + # Deparse and take the first element in case they're long expressions + names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) + exprs +} \ No newline at end of file diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 8ca4757a4e2..5fb22200a04 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -47,11 +47,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # ARROW-13550 abort("`summarize()` with `.groups` argument not supported in Arrow") } - exprs <- quos(...) - # Check for unnamed expressions and fix if any - unnamed <- !nzchar(names(exprs)) - # Deparse and take the first element in case they're long expressions - names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) + exprs <- ensure_named_exprs(quos(...)) mask <- arrow_mask(.data, aggregation = TRUE) @@ -68,17 +64,15 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { ) stop(msg, call. = FALSE) } - # Put it in the data mask too? - # Should we: mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] } - # TODO: Should summarize just record the aggregations and leave this projection etc. to do_exec_plan? - # Now, from that, split out the data (expressions) and options - .data$aggregations <- lapply(results, function(x) x[c("fun", "options")]) - inputs <- lapply(results, function(x) x$data) - # TODO: validate that none of names(inputs) are the same as names(group_by_vars) - # dplyr does not error on this but the result it gives isn't great - .data$selected_columns <- c(inputs, .data$selected_columns[.data$group_by_vars]) - + .data$aggregations <- results do_collapse(.data) +} + +summarize_projection <- function(.data) { + c( + map(.data$aggregations, ~ .$data), + .data$selected_columns[.data$group_by_vars] + ) } \ No newline at end of file diff --git a/r/R/dplyr.R b/r/R/dplyr.R index f7e170415d1..a93a0020f82 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -87,18 +87,21 @@ implicit_schema <- function(.data) { # c(.data$group_by_vars, names(.data$aggregations)) .data <- ensure_group_vars(.data) old_schm <- .data$.data$schema - new_fields <- map(.data$selected_columns, ~ .$type(old_schm)) + if (is.null(.data$aggregations)) { - return(schema(!!!new_fields)) + new_fields <- map(.data$selected_columns, ~ .$type(old_schm)) + } else { + new_fields <- map(summarize_projection(.data), ~ .$type(old_schm)) + # * Put group_by_vars first (this can't be done by summarize, they have to be last per the aggregate node signature, and they get projected to this order after aggregation) + # * Infer the output types from the aggregations + group_fields <- new_fields[.data$group_by_vars] + agg_fields <- imap( + new_fields[setdiff(names(new_fields), .data$group_by_vars)], + ~ output_type(.data$aggregations[[.y]][["fun"]], .x) + ) + new_fields <- c(group_fields, agg_fields) } - # * Put group_by_vars first (this can't be done by summarize, they have to be last per the aggregate node signature, and they get projected to this order after aggregation) - # * Infer the output types from the aggregations - group_fields <- new_fields[.data$group_by_vars] - agg_fields <- imap( - new_fields[setdiff(names(new_fields), .data$group_by_vars)], - ~ output_type(.data$aggregations[[.y]][["fun"]], .x) - ) - schema(!!!c(group_fields, agg_fields)) + schema(!!!new_fields) } make_field_refs <- function(field_names) { @@ -233,4 +236,4 @@ abandon_ship <- function(call, .data, msg) { eval.parent(call, 2) } -query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") +query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") \ No newline at end of file diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 58b3f1ab28e..462d58cece0 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -79,13 +79,14 @@ ExecPlan <- R6Class("ExecPlan", if (inherits(.data$filtered_rows, "Expression")) { node <- node$Filter(.data$filtered_rows) } - # If any columns are derived, reordered, or renamed we need to Project - projection <- c(.data$selected_columns, .data$temp_columns) - if (!isTRUE(all.equal(projection, make_field_refs(names(.data$.data$schema))))) { - node <- node$Project(projection) - } - if (length(.data$aggregations)) { + if (!is.null(.data$aggregations)) { + # Project to include just the data required for each aggregation, + # plus group_by_vars (last) + # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) + # dplyr does not error on this but the result it gives isn't great + node <- node$Project(summarize_projection(.data)) + if (grouped) { # We need to prefix all of the aggregation function names with "hash_" .data$aggregations <- lapply(.data$aggregations, function(x) { @@ -95,8 +96,8 @@ ExecPlan <- R6Class("ExecPlan", } node <- node$Aggregate( - options = .data$aggregations, - target_names = target_names, + options = map(.data$aggregations, ~ .[c("fun", "options")]), + target_names = names(.data$aggregations), out_field_names = names(.data$aggregations), key_names = group_vars ) @@ -114,6 +115,16 @@ ExecPlan <- R6Class("ExecPlan", orders = rep(0L, length(group_vars)) ) } + } else { + # If any columns are derived, reordered, or renamed we need to Project + # If there are aggregations, the projection was already handled above + # We have to project at least once to eliminate some junk columns + # that the ExecPlan adds: + # __fragment_index, __batch_index, __last_in_fragment + # Presumably extraneous repeated projection of the same thing + # (as when we've done collapse() and not projected after) is cheap/no-op + projection <- c(.data$selected_columns, .data$temp_columns) + node <- node$Project(projection) } # Apply sorting: this is currently not an ExecNode itself, it is a @@ -145,20 +156,26 @@ ExecNode <- R6Class("ExecNode", # which don't currently yield their own ExecNode but rather are consumed # in the SinkNode (in ExecPlan$run()) sort = NULL, + preserve_sort = function(new_node) { + new_node$sort <- self$sort + new_node + }, Project = function(cols) { if (length(cols)) { assert_is_list_of(cols, "Expression") - ExecNode_Project(self, cols, names(cols)) + self$preserve_sort(ExecNode_Project(self, cols, names(cols))) } else { - ExecNode_Project(self, character(0), character(0)) + self$preserve_sort(ExecNode_Project(self, character(0), character(0))) } }, Filter = function(expr) { assert_is(expr, "Expression") - ExecNode_Filter(self, expr) + self$preserve_sort(ExecNode_Filter(self, expr)) }, Aggregate = function(options, target_names, out_field_names, key_names) { - ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) + self$preserve_sort( + ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) + ) } ) ) \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index e56ee4be462..ea511e88f94 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -396,4 +396,4 @@ test_that("filter() with .data pronoun", { collect(), tbl ) -}) +}) \ No newline at end of file From bcea9c8f953670d2744ac3bee4250c67a3c35af1 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 27 Aug 2021 14:24:58 -0400 Subject: [PATCH 11/27] Clarify comments and add todos for the collapse() work --- r/R/dataset-scan.R | 1 + r/R/dplyr.R | 9 +++++++-- r/R/query-engine.R | 7 +++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/r/R/dataset-scan.R b/r/R/dataset-scan.R index 615b0f945a8..3ea501ea9ec 100644 --- a/r/R/dataset-scan.R +++ b/r/R/dataset-scan.R @@ -82,6 +82,7 @@ Scanner$create <- function(dataset, } if (inherits(dataset, "arrow_dplyr_query")) { + # TODO: update for collapse() if (inherits(dataset$.data, "ArrowTabular")) { # To handle mutate() on Table/RecordBatch, we need to collect(as_data_frame=FALSE) now dataset <- dplyr::collect(dataset, as_data_frame = FALSE) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index a93a0020f82..904792b59cc 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -78,13 +78,13 @@ arrow_dplyr_query <- function(.data) { ) } +# TODO: move to dplyr-collect.R do_collapse <- function(.data) { .data$schema <- implicit_schema(.data) .adq(.data) } implicit_schema <- function(.data) { - # c(.data$group_by_vars, names(.data$aggregations)) .data <- ensure_group_vars(.data) old_schm <- .data$.data$schema @@ -111,6 +111,7 @@ make_field_refs <- function(field_names) { #' @export print.arrow_dplyr_query <- function(x, ...) { schm <- x$.data$schema + # TODO: refactor this to use implicit_schema(x) types <- map_chr(x$selected_columns, function(expr) { name <- expr$field_name if (nzchar(name)) { @@ -125,6 +126,7 @@ print.arrow_dplyr_query <- function(x, ...) { } }) fields <- paste(names(types), types, sep = ": ", collapse = "\n") + # TODO: update for collapse() cat(class(x$.data)[1], " (query)\n", sep = "") cat(fields, "\n", sep = "") cat("\n") @@ -150,6 +152,7 @@ print.arrow_dplyr_query <- function(x, ...) { sep = "" ) } + # TODO: update for collapse() cat("See $.data for the source Arrow object\n") invisible(x) } @@ -163,6 +166,7 @@ dim.arrow_dplyr_query <- function(x) { cols <- length(names(x)) if (isTRUE(x$filtered)) { + # TODO: update for collapse() rows <- x$.data$num_rows } else { rows <- Scanner$create(x)$CountRows() @@ -236,4 +240,5 @@ abandon_ship <- function(call, .data, msg) { eval.parent(call, 2) } -query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") \ No newline at end of file +# TODO: update for collapse() +query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 462d58cece0..f7a5c2888d2 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -43,6 +43,7 @@ ExecPlan <- R6Class("ExecPlan", dataset$selected_columns, field_names_in_expression ))) + # TODO: update for collapse() (assert that is Dataset now?) dataset <- dataset$.data } else { if (inherits(dataset, "ArrowTabular")) { @@ -129,7 +130,9 @@ ExecPlan <- R6Class("ExecPlan", # Apply sorting: this is currently not an ExecNode itself, it is a # sink node option. - # TODO: error if doing a subsequent operation that would throw away sorting! + # TODO: handle some cases: + # (1) arrange > summarize > arrange + # (2) ARROW-13779: arrange then operation where order matters (e.g. cumsum) if (length(.data$arrange_vars)) { node$sort <- list( names = names(.data$arrange_vars), @@ -178,4 +181,4 @@ ExecNode <- R6Class("ExecNode", ) } ) -) \ No newline at end of file +) From 11c7066dc96343cb854e628dfd686bd950a42150 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 30 Aug 2021 12:24:11 -0400 Subject: [PATCH 12/27] Add collapse() --- r/R/arrow-package.R | 2 +- r/R/dataset-scan.R | 17 ++-- r/R/dplyr-arrange.R | 2 +- r/R/dplyr-collect.R | 38 +++++++- r/R/dplyr-filter.R | 2 +- r/R/dplyr-group-by.R | 2 +- r/R/dplyr-mutate.R | 4 +- r/R/dplyr-select.R | 6 +- r/R/dplyr-summarize.R | 7 +- r/R/dplyr.R | 71 ++++++--------- r/R/duckdb.R | 2 +- r/R/query-engine.R | 4 +- r/tests/testthat/test-dplyr-collapse.R | 111 ++++++++++++++++++++++++ r/tests/testthat/test-dplyr-summarize.R | 16 ++++ 14 files changed, 210 insertions(+), 74 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 04f01faf268..c09b8f05319 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -35,7 +35,7 @@ c( "select", "filter", "collect", "summarise", "group_by", "groups", "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute", - "arrange", "rename", "pull", "relocate", "compute" + "arrange", "rename", "pull", "relocate", "compute", "collapse" ) ) for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) { diff --git a/r/R/dataset-scan.R b/r/R/dataset-scan.R index 3ea501ea9ec..dc0686ee605 100644 --- a/r/R/dataset-scan.R +++ b/r/R/dataset-scan.R @@ -73,19 +73,14 @@ Scanner$create <- function(dataset, projection = NULL, filter = TRUE, use_threads = option_use_threads(), - use_async = NULL, + use_async = getOption("arrow.use_async", FALSE), batch_size = NULL, fragment_scan_options = NULL, ...) { - if (is.null(use_async)) { - use_async <- getOption("arrow.use_async", FALSE) - } - if (inherits(dataset, "arrow_dplyr_query")) { - # TODO: update for collapse() - if (inherits(dataset$.data, "ArrowTabular")) { - # To handle mutate() on Table/RecordBatch, we need to collect(as_data_frame=FALSE) now - dataset <- dplyr::collect(dataset, as_data_frame = FALSE) + if (is_collapsed(dataset)) { + # TODO: Is there a way to get a RecordBatchReader rather than evaluating? + dataset$.data <- as_adq(dplyr::compute(dataset$.data))$.data } proj <- c(dataset$selected_columns, dataset$temp_columns) @@ -118,7 +113,7 @@ Scanner$create <- function(dataset, ... )) } - if (inherits(dataset, c("data.frame", "RecordBatch", "Table"))) { + if (inherits(dataset, c("data.frame", "ArrowTabular"))) { dataset <- InMemoryDataset$create(dataset) } assert_is(dataset, "Dataset") @@ -236,4 +231,4 @@ ScannerBuilder <- R6Class("ScannerBuilder", ) #' @export -names.ScannerBuilder <- function(x) names(x$schema) +names.ScannerBuilder <- function(x) names(x$schema) \ No newline at end of file diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R index 345fc183295..017e1d6b302 100644 --- a/r/R/dplyr-arrange.R +++ b/r/R/dplyr-arrange.R @@ -30,7 +30,7 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { # Nothing to do return(.data) } - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) # find and remove any dplyr::desc() and tidy-eval # the arrange expressions inside an Arrow data_mask sorts <- vector("list", length(exprs)) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 7db1b682305..f810a1bd57c 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -19,7 +19,6 @@ # The following S3 methods are registered on load if dplyr is present collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { - # Pull only the selected rows and cols into R # See query-engine.R for ExecPlan/Nodes tab <- do_exec_plan(x) if (as_data_frame) { @@ -37,20 +36,21 @@ collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) { x } } -collect.Dataset <- function(x, ...) dplyr::collect(arrow_dplyr_query(x), ...) +collect.Dataset <- function(x, ...) dplyr::collect(as_adq(x), ...) compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame = FALSE) compute.ArrowTabular <- function(x, ...) x compute.Dataset <- compute.arrow_dplyr_query pull.arrow_dplyr_query <- function(.data, var = -1) { - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) var <- vars_pull(names(.data), !!enquo(var)) .data$selected_columns <- set_names(.data$selected_columns[var], var) dplyr::collect(.data)[[1]] } pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query +# TODO: Correctly handle group_vars after summarize; also in collapse() restore_dplyr_features <- function(df, query) { # An arrow_dplyr_query holds some attributes that Arrow doesn't know about # After calling collect(), make sure these features are carried over @@ -65,10 +65,40 @@ restore_dplyr_features <- function(df, query) { ) } else { # This is a Table, via compute() or collect(as_data_frame = FALSE) - df <- arrow_dplyr_query(df) + df <- as_adq(df) df$group_by_vars <- query$group_by_vars df$drop_empty_groups <- query$drop_empty_groups } } df } + +collapse.arrow_dplyr_query <- function(x, ...) { + # Figure out what schema will result from the query + x$schema <- implicit_schema(x) + # Nest inside a new arrow_dplyr_query + arrow_dplyr_query(x) +} +collapse.Dataset <- collapse.ArrowTabular <- function(x, ...) { + arrow_dplyr_query(x) +} + +implicit_schema <- function(.data) { + .data <- ensure_group_vars(.data) + old_schm <- .data$.data$schema + + if (is.null(.data$aggregations)) { + new_fields <- map(.data$selected_columns, ~ .$type(old_schm)) + } else { + new_fields <- map(summarize_projection(.data), ~ .$type(old_schm)) + # * Put group_by_vars first (this can't be done by summarize, they have to be last per the aggregate node signature, and they get projected to this order after aggregation) + # * Infer the output types from the aggregations + group_fields <- new_fields[.data$group_by_vars] + agg_fields <- imap( + new_fields[setdiff(names(new_fields), .data$group_by_vars)], + ~ output_type(.data$aggregations[[.y]][["fun"]], .x) + ) + new_fields <- c(group_fields, agg_fields) + } + schema(!!!new_fields) +} \ No newline at end of file diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index a44750a9c81..61f27010e77 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -26,7 +26,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) { return(.data) } - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) # tidy-eval the filter expressions inside an Arrow data_mask filters <- lapply(filts, arrow_eval, arrow_mask(.data)) bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index 42cca039022..a89144d6c4a 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -23,7 +23,7 @@ group_by.arrow_dplyr_query <- function(.data, .add = FALSE, add = .add, .drop = dplyr::group_by_drop_default(.data)) { - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) new_groups <- enquos(...) # ... can contain expressions (i.e. can add (or rename?) columns) and so we # need to identify those and add them on to the query with mutate. Specifically, diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index f2df4a078a4..051c5254e50 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -35,7 +35,7 @@ mutate.arrow_dplyr_query <- function(.data, return(.data) } - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) # Restrict the cases we support for now if (length(dplyr::group_vars(.data)) > 0) { @@ -135,4 +135,4 @@ ensure_named_exprs <- function(exprs) { # Deparse and take the first element in case they're long expressions names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) exprs -} \ No newline at end of file +} diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index ee740db4cfb..9a867ced964 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -22,13 +22,13 @@ tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns) select.arrow_dplyr_query <- function(.data, ...) { check_select_helpers(enexprs(...)) - column_select(arrow_dplyr_query(.data), !!!enquos(...)) + column_select(as_adq(.data), !!!enquos(...)) } select.Dataset <- select.ArrowTabular <- select.arrow_dplyr_query rename.arrow_dplyr_query <- function(.data, ...) { check_select_helpers(enexprs(...)) - column_select(arrow_dplyr_query(.data), !!!enquos(...), .FUN = vars_rename) + column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename) } rename.Dataset <- rename.ArrowTabular <- rename.arrow_dplyr_query @@ -60,7 +60,7 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829 - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) # Assign the schema to the expressions map(.data$selected_columns, ~ (.$schema <- .data$.data$schema)) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 5fb22200a04..c6806b27e3b 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -20,7 +20,7 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb")) { call <- match.call() - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) exprs <- quos(...) # Only retain the columns we need to do our aggregations vars_to_keep <- unique(c( @@ -67,7 +67,8 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { } .data$aggregations <- results - do_collapse(.data) + # TODO: should in-memory query evaluate eagerly? + collapse.arrow_dplyr_query(.data) } summarize_projection <- function(.data) { @@ -75,4 +76,4 @@ summarize_projection <- function(.data) { map(.data$aggregations, ~ .$data), .data$selected_columns[.data$group_by_vars] ) -} \ No newline at end of file +} diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 904792b59cc..8010ae8e6fa 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -23,14 +23,10 @@ arrow_dplyr_query <- function(.data) { # An arrow_dplyr_query is a container for an Arrow data object (Table, # RecordBatch, or Dataset) and the state of the user's dplyr query--things # like selected columns, filters, and group vars. - - # For most dplyr methods, - # method.Table == method.RecordBatch == method.Dataset == method.arrow_dplyr_query - # This works because the functions all pass .data through arrow_dplyr_query() - if (inherits(.data, "arrow_dplyr_query")) { - return(.data) + # An arrow_dplyr_query can contain another arrow_dplyr_query in .data + if (!inherits(.data, c("Dataset", "arrow_dplyr_query"))) { + .data <- InMemoryDataset$create(.data) } - # Evaluating expressions on a dataset with duplicated fieldnames will error dupes <- duplicated(names(.data)) if (any(dupes)) { @@ -42,14 +38,6 @@ arrow_dplyr_query <- function(.data) { ) )) } - - .adq(.data) -} - -.adq <- function(.data) { - if (!inherits(.data, c("Dataset", "arrow_dplyr_query"))) { - .data <- InMemoryDataset$create(.data) - } structure( list( .data = .data, @@ -78,30 +66,14 @@ arrow_dplyr_query <- function(.data) { ) } -# TODO: move to dplyr-collect.R -do_collapse <- function(.data) { - .data$schema <- implicit_schema(.data) - .adq(.data) -} - -implicit_schema <- function(.data) { - .data <- ensure_group_vars(.data) - old_schm <- .data$.data$schema - - if (is.null(.data$aggregations)) { - new_fields <- map(.data$selected_columns, ~ .$type(old_schm)) - } else { - new_fields <- map(summarize_projection(.data), ~ .$type(old_schm)) - # * Put group_by_vars first (this can't be done by summarize, they have to be last per the aggregate node signature, and they get projected to this order after aggregation) - # * Infer the output types from the aggregations - group_fields <- new_fields[.data$group_by_vars] - agg_fields <- imap( - new_fields[setdiff(names(new_fields), .data$group_by_vars)], - ~ output_type(.data$aggregations[[.y]][["fun"]], .x) - ) - new_fields <- c(group_fields, agg_fields) +as_adq <- function(.data) { + # For most dplyr methods, + # method.Table == method.RecordBatch == method.Dataset == method.arrow_dplyr_query + # This works because the functions all pass .data through as_adq() + if (inherits(.data, "arrow_dplyr_query")) { + return(.data) } - schema(!!!new_fields) + arrow_dplyr_query(.data) } make_field_refs <- function(field_names) { @@ -111,7 +83,6 @@ make_field_refs <- function(field_names) { #' @export print.arrow_dplyr_query <- function(x, ...) { schm <- x$.data$schema - # TODO: refactor this to use implicit_schema(x) types <- map_chr(x$selected_columns, function(expr) { name <- expr$field_name if (nzchar(name)) { @@ -165,8 +136,10 @@ names.arrow_dplyr_query <- function(x) names(x$selected_columns) dim.arrow_dplyr_query <- function(x) { cols <- length(names(x)) - if (isTRUE(x$filtered)) { - # TODO: update for collapse() + if (is_collapsed(x)) { + # Don't evaluate just for nrow + rows <- NA_integer_ + } else if (isTRUE(x$filtered)) { rows <- x$.data$num_rows } else { rows <- Scanner$create(x)$CountRows() @@ -181,12 +154,14 @@ as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALS #' @export head.arrow_dplyr_query <- function(x, n = 6L, ...) { + # TODO: refactor/rename out <- head.Dataset(x, n, ...) restore_dplyr_features(out, x) } #' @export tail.arrow_dplyr_query <- function(x, n = 6L, ...) { + # TODO: refactor/rename out <- tail.Dataset(x, n, ...) restore_dplyr_features(out, x) } @@ -194,6 +169,7 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' @export `[.arrow_dplyr_query` <- `[.Dataset` # TODO: ^ should also probably restore_dplyr_features, and/or that should be moved down +# TODO: refactor/rename ensure_group_vars <- function(x) { if (inherits(x, "arrow_dplyr_query")) { @@ -228,17 +204,24 @@ ensure_arrange_vars <- function(x) { # * For Table/RecordBatch, we collect() and then call the dplyr method in R # * For Dataset, we just error abandon_ship <- function(call, .data, msg) { + msg <- trimws(msg) dplyr_fun_name <- sub("^(.*?)\\..*", "\\1", as.character(call[[1]])) if (query_on_dataset(.data)) { stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE) } # else, collect and call dplyr method - msg <- sub("\\n$", "", msg) warning(msg, "; pulling data into R", immediate. = TRUE, call. = FALSE) call$.data <- dplyr::collect(.data) call[[1]] <- get(dplyr_fun_name, envir = asNamespace("dplyr")) eval.parent(call, 2) } -# TODO: update for collapse() -query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset") +query_on_dataset <- function(x) { + if (is_collapsed(x)) { + query_on_dataset((x$.data)) + } else { + !inherits(x$.data, "InMemoryDataset") + } +} + +is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query") \ No newline at end of file diff --git a/r/R/duckdb.R b/r/R/duckdb.R index edef5cdc143..87d1b2cfad6 100644 --- a/r/R/duckdb.R +++ b/r/R/duckdb.R @@ -60,7 +60,7 @@ to_duckdb <- function(.data, con = arrow_duck_connection(), table_name = unique_arrow_tablename(), auto_disconnect = TRUE) { - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) duckdb::duckdb_register_arrow(con, table_name, .data) tbl <- tbl(con, table_name) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index f7a5c2888d2..534cd05e299 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -43,8 +43,8 @@ ExecPlan <- R6Class("ExecPlan", dataset$selected_columns, field_names_in_expression ))) - # TODO: update for collapse() (assert that is Dataset now?) dataset <- dataset$.data + assert_is(dataset, "Dataset") } else { if (inherits(dataset, "ArrowTabular")) { dataset <- InMemoryDataset$create(dataset) @@ -181,4 +181,4 @@ ExecNode <- R6Class("ExecNode", ) } ) -) +) \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index 84287976ced..674bcd249d4 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -81,4 +81,115 @@ test_that("implicit_schema with group_by summarize", { implicit_schema(), schema(some_grouping = float64(), avg = float64()) ) +}) + +test_that("collapse", { + q <- tab %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) + expect_false(is_collapsed(q)) + expect_true(is_collapsed(collapse(q))) + + expect_dplyr_equal( + input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + collapse() %>% + filter(int < 5) %>% + select(int, twice) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + collapse() %>% + select(chr, int, lgl) %>% + collapse() %>% + filter(int < 5) %>% + select(int, chr) %>% + collect(), + tbl + ) +}) + +test_that("Properties of collapsed query", { + q <- tab %>% + filter(dbl > 2) %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + group_by(lgl) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + mutate(extra = total * 5) + + # print(tbl %>% + # filter(dbl > 2) %>% + # select(chr, int, lgl) %>% + # mutate(twice = int * 2L) %>% + # group_by(lgl) %>% + # summarize(total = sum(int, na.rm = TRUE)) %>% + # mutate(extra = total * 5)) + + # # A tibble: 3 × 3 + # lgl total extra + # + # 1 FALSE 8 40 + # 2 TRUE 8 40 + # 3 NA 25 125 + + # Avoid evaluating just for nrow + expect_identical(dim(q), c(NA_integer_, 3L)) + + + # TODO: improve print method + # expect_output(print(q), + # "arrow_dplyr_query (query) + # lgl: bool + # total: int32 + # extra: double (multiply_checked(total, 5)) + + # See $.data for the source Arrow object" + # ) + + expect_equal( + head(q, 1) %>% collect(), + tibble::tibble(lgl = FALSE, total = 8L, extra = 40) + ) + expect_equal( + tail(q, 1) %>% collect(), + tibble::tibble(lgl = NA, total = 25L, extra = 125) + ) +}) + +test_that("query_on_dataset handles collapse()", { + expect_false(query_on_dataset( + tab %>% + select(int, chr) + )) + expect_false(query_on_dataset( + tab %>% + select(int, chr) %>% + collapse() %>% + select(int) + )) + + ds_dir <- tempfile() + dir.create(ds_dir) + on.exit(unlink(ds_dir)) + write_parquet(tab, file.path(ds_dir, "file.parquet")) + ds <- open_dataset(ds_dir) + + expect_true(query_on_dataset( + ds %>% + select(int, chr) + )) + expect_true(query_on_dataset( + ds %>% + select(int, chr) %>% + collapse() %>% + select(int) + )) }) \ No newline at end of file diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index ec9d281e053..5d7fc999beb 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -324,6 +324,22 @@ test_that("Do things after summarize", { collect(), tbl ) + + skip("ARROW-13501") + expect_dplyr_equal( + input %>% + filter(dbl > 2) %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + group_by(lgl) %>% + summarize( + count = n(), + total = sum(twice, na.rm = TRUE) + ) %>% + mutate(mean = total / count) %>% + collect(), + tbl + ) }) test_that("Expressions on aggregations", { From cc2f0d7bb43d8dc5cbba89a6c48da1d1a75a4b03 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 30 Aug 2021 14:02:12 -0400 Subject: [PATCH 13/27] Style and unskip test --- r/R/dataset-scan.R | 2 +- r/R/dplyr-collect.R | 2 +- r/R/dplyr-functions.R | 26 ++++++++++++------------- r/R/dplyr.R | 2 +- r/R/query-engine.R | 2 +- r/tests/testthat/test-dataset.R | 2 +- r/tests/testthat/test-dplyr-collapse.R | 2 +- r/tests/testthat/test-dplyr-filter.R | 2 +- r/tests/testthat/test-dplyr-summarize.R | 3 +-- 9 files changed, 21 insertions(+), 22 deletions(-) diff --git a/r/R/dataset-scan.R b/r/R/dataset-scan.R index dc0686ee605..75108df1052 100644 --- a/r/R/dataset-scan.R +++ b/r/R/dataset-scan.R @@ -231,4 +231,4 @@ ScannerBuilder <- R6Class("ScannerBuilder", ) #' @export -names.ScannerBuilder <- function(x) names(x$schema) \ No newline at end of file +names.ScannerBuilder <- function(x) names(x$schema) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index f810a1bd57c..94210276417 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -101,4 +101,4 @@ implicit_schema <- function(.data) { new_fields <- c(group_fields, agg_fields) } schema(!!!new_fields) -} \ No newline at end of file +} diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 2247d22f368..72731216f50 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -842,16 +842,16 @@ agg_funcs$n <- function() { } output_type <- function(fun, input_type) { - # These are quick and dirty heuristics. - if (fun %in% c("any", "all")) { - bool() - } else if (fun %in% "sum") { - # It may upcast to a bigger type but this is close enough - input_type - } else if (fun %in% c("mean", "stddev", "variance")) { - float64() - } else { - # Just so things don't error, assume the resulting type is the same - input_type - } -} \ No newline at end of file + # These are quick and dirty heuristics. + if (fun %in% c("any", "all")) { + bool() + } else if (fun %in% "sum") { + # It may upcast to a bigger type but this is close enough + input_type + } else if (fun %in% c("mean", "stddev", "variance")) { + float64() + } else { + # Just so things don't error, assume the resulting type is the same + input_type + } +} diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 8010ae8e6fa..20e976d7ce8 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -224,4 +224,4 @@ query_on_dataset <- function(x) { } } -is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query") \ No newline at end of file +is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query") diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 534cd05e299..c3621093a8d 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -181,4 +181,4 @@ ExecNode <- R6Class("ExecNode", ) } ) -) \ No newline at end of file +) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index f7a41434e03..d1f4c98a28f 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -1728,4 +1728,4 @@ test_that("Error if no format specified and files are not parquet", { "Did you mean to specify a 'format'" ) ) -}) \ No newline at end of file +}) diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index 674bcd249d4..57e03001d91 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -192,4 +192,4 @@ test_that("query_on_dataset handles collapse()", { collapse() %>% select(int) )) -}) \ No newline at end of file +}) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index ea511e88f94..e56ee4be462 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -396,4 +396,4 @@ test_that("filter() with .data pronoun", { collect(), tbl ) -}) \ No newline at end of file +}) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 5d7fc999beb..daf11767bfe 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -325,7 +325,6 @@ test_that("Do things after summarize", { tbl ) - skip("ARROW-13501") expect_dplyr_equal( input %>% filter(dbl > 2) %>% @@ -367,4 +366,4 @@ test_that("Expressions on aggregations", { collect(), tbl ) -}) \ No newline at end of file +}) From 2ea6d04c24e9a427dc546534bb21ac6670c19a9a Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Tue, 31 Aug 2021 09:33:20 -0500 Subject: [PATCH 14/27] use arrange instead of hardcoding --- r/tests/testthat/test-metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index bc6d285b333..9a540f47b02 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -241,7 +241,7 @@ test_that("metadata of list elements (ARROW-10386)", { df_from_ds <- collect(ds), "Row-level metadata is not compatible with this operation and has been ignored" ) - expect_equal(df_from_ds[c(1, 4, 3, 2), ], df, check.attributes = FALSE) + expect_equal(arrange(df_from_ds, int), arrange(df, int), check.attributes = FALSE) # however there is *no* warning if we don't select the metadata column expect_warning( From 9e26457aa62eb9e22b0fbf6690fd468d9e29f1ca Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 1 Sep 2021 15:09:50 -0400 Subject: [PATCH 15/27] Skip column metadata warning test --- r/tests/testthat/test-metadata.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 9a540f47b02..6ebdcd93283 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -239,7 +239,8 @@ test_that("metadata of list elements (ARROW-10386)", { ds <- open_dataset(dst_dir) expect_warning( df_from_ds <- collect(ds), - "Row-level metadata is not compatible with this operation and has been ignored" + NA # TODO: ARROW-13852 + # "Row-level metadata is not compatible with this operation and has been ignored" ) expect_equal(arrange(df_from_ds, int), arrange(df, int), check.attributes = FALSE) @@ -248,4 +249,4 @@ test_that("metadata of list elements (ARROW-10386)", { df_from_ds <- ds %>% select(int) %>% collect(), NA ) -}) +}) \ No newline at end of file From 92d8d3fb490260f33a01a6fa3dae8958e668088c Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 1 Sep 2021 16:31:41 -0400 Subject: [PATCH 16/27] Note breaking changes before I forget --- r/NEWS.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index 2a22681e457..0421457e067 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,11 @@ # arrow 5.0.0.9000 +## Breaking changes + +* `dplyr::summarize()` on an in-memory Arrow Table or RecordBatch no longer eagerly evaluates. Call `compute()` or `collect()` to evaluate the query. +* Row order of data from a Dataset query is no longer deterministic. If you need a stable sort order, you should explicitly `arrange()` the query. + # arrow 5.0.0 ## More dplyr From 88d07bb35a1ddca274fd6c866ef49e8cbb748cc9 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 2 Sep 2021 11:20:30 -0400 Subject: [PATCH 17/27] Add options(arrow.summarise.sort), default FALSE --- r/R/query-engine.R | 14 ++++++++------ r/tests/testthat/test-dplyr-collapse.R | 2 ++ r/tests/testthat/test-dplyr-summarize.R | 2 ++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index c3621093a8d..a96378671af 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -109,12 +109,14 @@ ExecPlan <- R6Class("ExecPlan", node <- node$Project( make_field_refs(c(group_vars, names(.data$aggregations))) ) - # Add sorting instructions for the rows too to match dplyr - # (see below about why sorting isn't itself a Node) - node$sort <- list( - names = group_vars, - orders = rep(0L, length(group_vars)) - ) + if (getOption("arrow.summarise.sort", FALSE)) { + # Add sorting instructions for the rows too to match dplyr + # (see below about why sorting isn't itself a Node) + node$sort <- list( + names = group_vars, + orders = rep(0L, length(group_vars)) + ) + } } } else { # If any columns are derived, reordered, or renamed we need to Project diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index 57e03001d91..4d3cab86add 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -17,6 +17,8 @@ skip_if_not_available("dataset") +withr::local_options(list(arrow.summarise.sort = TRUE)) + library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index daf11767bfe..213b48e6543 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -17,6 +17,8 @@ skip_if_not_available("dataset") +# withr::local_options(list(arrow.summarise.sort = TRUE)) + library(dplyr) library(stringr) From b7d6313c68c11c5d733ed408ed0b839229b605f6 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 2 Sep 2021 11:27:34 -0400 Subject: [PATCH 18/27] Skip all dataset tests on 32-bit windows rtools35 --- r/tests/testthat/helper-skip.R | 9 +++++++++ r/tests/testthat/test-dataset.R | 10 +--------- r/tests/testthat/test-dplyr-arrange.R | 1 + r/tests/testthat/test-dplyr-collapse.R | 1 + r/tests/testthat/test-dplyr-filter.R | 1 + r/tests/testthat/test-dplyr-group-by.R | 1 + r/tests/testthat/test-dplyr-lubridate.R | 1 + r/tests/testthat/test-dplyr-mutate.R | 1 + r/tests/testthat/test-dplyr-string-functions.R | 1 + r/tests/testthat/test-dplyr-summarize.R | 1 + r/tests/testthat/test-dplyr.R | 1 + r/tests/testthat/test-duckdb.R | 2 ++ 12 files changed, 21 insertions(+), 9 deletions(-) diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index 906963e38d1..9a6bba60f91 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -68,6 +68,15 @@ skip_on_valgrind <- function() { } } +skip_if_multithreading_disabled <- function() { + is_32bit <- .Machine$sizeof.pointer < 8 + is_old_r <- getRversion() < "4.0.0" + is_windows <- tolower(Sys.info()[["sysname"]]) == "windows" + if (is_32bit && is_old_r && is_windows) { + skip("Multithreading does not work properly on this system") + } +} + process_is_running <- function(x) { cmd <- sprintf("ps aux | grep '%s' | grep -v grep", x) tryCatch(system(cmd, ignore.stdout = TRUE) == 0, error = function(e) FALSE) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index d1f4c98a28f..411eb2ef36e 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() context("Dataset") @@ -27,15 +28,6 @@ ipc_dir <- make_temp_dir() csv_dir <- make_temp_dir() tsv_dir <- make_temp_dir() -skip_if_multithreading_disabled <- function() { - is_32bit <- .Machine$sizeof.pointer < 8 - is_old_r <- getRversion() < "4.0.0" - is_windows <- tolower(Sys.info()[["sysname"]]) == "windows" - if (is_32bit && is_old_r && is_windows) { - skip("Multithreading does not work properly on this system") - } -} - first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") df1 <- tibble( diff --git a/r/tests/testthat/test-dplyr-arrange.R b/r/tests/testthat/test-dplyr-arrange.R index fc24df58ca7..f131b8444f5 100644 --- a/r/tests/testthat/test-dplyr-arrange.R +++ b/r/tests/testthat/test-dplyr-arrange.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() library(dplyr) diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index 4d3cab86add..776988f870c 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() withr::local_options(list(arrow.summarise.sort = TRUE)) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index e56ee4be462..fb4951ac45e 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index 18be2a9304a..0a8c76a920d 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index 64bb42a0ecf..e3281436379 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() library(lubridate) library(dplyr) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 44127839108..3f2bb526f0c 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index b6b8f5a714a..5383bcb2376 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -17,6 +17,7 @@ skip_if_not_available("dataset") skip_if_not_available("utf8proc") +skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 213b48e6543..8c60f844e5b 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() # withr::local_options(list(arrow.summarise.sort = TRUE)) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index d3a9994b5f1..295d7778dc0 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -16,6 +16,7 @@ # under the License. skip_if_not_available("dataset") +skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R index cdfcb62d02d..c4fad4f5148 100644 --- a/r/tests/testthat/test-duckdb.R +++ b/r/tests/testthat/test-duckdb.R @@ -18,6 +18,8 @@ skip_if_not_installed("duckdb", minimum_version = "0.2.8") skip_if_not_installed("dbplyr") skip_if_not_available("dataset") +skip_if_multithreading_disabled() + # when we remove this, we should also remove the FALSE in run_duckdb_examples skip("These tests are flaking: https://github.com/duckdb/duckdb/issues/2100") library(duckdb) From 31ec558c9ef52964fb1292f78d5aedbb7ae4a467 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 2 Sep 2021 13:26:21 -0400 Subject: [PATCH 19/27] Correct but not super satisfying print method --- r/R/dplyr-summarize.R | 4 ++++ r/R/dplyr.R | 17 ++++++++----- r/tests/testthat/test-dplyr-collapse.R | 33 ++++++++++++++++++-------- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index c6806b27e3b..cd93e28f07e 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -77,3 +77,7 @@ summarize_projection <- function(.data) { .data$selected_columns[.data$group_by_vars] ) } + +format_aggregation <- function(x) { + paste0(x$fun, "(", x$data$ToString(), ")") +} diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 20e976d7ce8..c650d7ddb0f 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -97,10 +97,14 @@ print.arrow_dplyr_query <- function(x, ...) { } }) fields <- paste(names(types), types, sep = ": ", collapse = "\n") - # TODO: update for collapse() - cat(class(x$.data)[1], " (query)\n", sep = "") + cat(class(source_data(x))[1], " (query)\n", sep = "") cat(fields, "\n", sep = "") cat("\n") + if (length(x$aggregations)) { + cat("* Aggregations:\n") + aggs <- paste0(names(x$aggregations), ": ", map_chr(x$aggregations, format_aggregation), collapse = "\n") + cat(aggs, "\n", sep = "") + } if (!isTRUE(x$filtered_rows)) { filter_string <- x$filtered_rows$ToString() cat("* Filter: ", filter_string, "\n", sep = "") @@ -123,7 +127,6 @@ print.arrow_dplyr_query <- function(x, ...) { sep = "" ) } - # TODO: update for collapse() cat("See $.data for the source Arrow object\n") invisible(x) } @@ -216,11 +219,13 @@ abandon_ship <- function(call, .data, msg) { eval.parent(call, 2) } -query_on_dataset <- function(x) { +query_on_dataset <- function(x) !inherits(source_data(x), "InMemoryDataset") + +source_data <- function(x) { if (is_collapsed(x)) { - query_on_dataset((x$.data)) + source_data(x$.data) } else { - !inherits(x$.data, "InMemoryDataset") + x$.data } } diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index 776988f870c..eb62e967449 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -146,16 +146,29 @@ test_that("Properties of collapsed query", { # Avoid evaluating just for nrow expect_identical(dim(q), c(NA_integer_, 3L)) - - # TODO: improve print method - # expect_output(print(q), - # "arrow_dplyr_query (query) - # lgl: bool - # total: int32 - # extra: double (multiply_checked(total, 5)) - - # See $.data for the source Arrow object" - # ) + expect_output( + print(q), + "InMemoryDataset (query) +lgl: bool +total: int32 +extra: double (multiply_checked(total, 5)) + +See $.data for the source Arrow object", + fixed = TRUE + ) + expect_output( + print(q$.data), + "InMemoryDataset (query) +int: int32 +lgl: bool + +* Aggregations: +total: sum(int) +* Filter: (dbl > 2) +* Grouped by lgl +See $.data for the source Arrow object", + fixed = TRUE + ) expect_equal( head(q, 1) %>% collect(), From bd251357ec17055ee3ed1fed818b1ebacf61c438 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 2 Sep 2021 13:30:30 -0400 Subject: [PATCH 20/27] sort more tests --- r/tests/testthat/test-s3-minio.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 94451e5351a..2ec67ce6290 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -86,7 +86,7 @@ if (arrow_with_s3() && process_is_running("minio server")) { test_that("open_dataset with an S3 file (not directory) URI", { skip_if_not_available("parquet") expect_identical( - open_dataset(minio_uri("test.parquet")) %>% collect(), + open_dataset(minio_uri("test.parquet")) %>% arrange(int) %>% collect(), example_data ) }) @@ -96,7 +96,9 @@ if (arrow_with_s3() && process_is_running("minio server")) { open_dataset( c(minio_uri("test.feather"), minio_uri("test2.feather")), format = "feather" - ) %>% collect(), + ) %>% + arrange(int) %>% + collect(), rbind(example_data, example_data) ) }) @@ -153,8 +155,8 @@ if (arrow_with_s3() && process_is_running("minio server")) { test_that("open_dataset with fs", { ds <- open_dataset(fs$path(minio_path("hive_dir"))) expect_identical( - ds %>% select(dbl, lgl) %>% collect(), - rbind(df1[, c("dbl", "lgl")], df2[, c("dbl", "lgl")]) + ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) ) }) @@ -170,16 +172,16 @@ if (arrow_with_s3() && process_is_running("minio server")) { expect_length(dir(td), 2) ds <- open_dataset(td) expect_identical( - ds %>% select(dbl, lgl) %>% collect(), - rbind(df1[, c("dbl", "lgl")], df2[, c("dbl", "lgl")]) + ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) ) # Let's copy the other way and use a SubTreeFileSystem rather than URI copy_files(td, fs$path(minio_path("hive_dir2"))) ds2 <- open_dataset(fs$path(minio_path("hive_dir2"))) expect_identical( - ds2 %>% select(dbl, lgl) %>% collect(), - rbind(df1[, c("dbl", "lgl")], df2[, c("dbl", "lgl")]) + ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) ) }) } From f07b420f28ee404fd7afff6a38ebebbc88bb2111 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 2 Sep 2021 13:58:24 -0400 Subject: [PATCH 21/27] More sort --- r/tests/testthat/test-s3-minio.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 2ec67ce6290..a2a13cbf887 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -86,8 +86,8 @@ if (arrow_with_s3() && process_is_running("minio server")) { test_that("open_dataset with an S3 file (not directory) URI", { skip_if_not_available("parquet") expect_identical( - open_dataset(minio_uri("test.parquet")) %>% arrange(int) %>% collect(), - example_data + open_dataset(minio_uri("test.parquet")) %>% collect() %>% arrange(int), + example_data %>% arrange(int) ) }) @@ -99,7 +99,7 @@ if (arrow_with_s3() && process_is_running("minio server")) { ) %>% arrange(int) %>% collect(), - rbind(example_data, example_data) + rbind(example_data, example_data) %>% arrange(int) ) }) @@ -156,7 +156,7 @@ if (arrow_with_s3() && process_is_running("minio server")) { ds <- open_dataset(fs$path(minio_path("hive_dir"))) expect_identical( ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) ) }) @@ -173,7 +173,7 @@ if (arrow_with_s3() && process_is_running("minio server")) { ds <- open_dataset(td) expect_identical( ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) ) # Let's copy the other way and use a SubTreeFileSystem rather than URI @@ -181,7 +181,7 @@ if (arrow_with_s3() && process_is_running("minio server")) { ds2 <- open_dataset(fs$path(minio_path("hive_dir2"))) expect_identical( ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) ) }) } From be2499eb22077143fced9da56e44fe0a2cae2d0b Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 3 Sep 2021 08:17:38 -0400 Subject: [PATCH 22/27] Apply suggestions from code review Co-authored-by: Ian Cook Co-authored-by: Jonathan Keane --- r/R/dplyr.R | 5 +++++ r/tests/testthat/test-dplyr-summarize.R | 2 +- r/tests/testthat/test-metadata.R | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index c650d7ddb0f..7d7c748f0a3 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -66,6 +66,11 @@ arrow_dplyr_query <- function(.data) { ) } +# The only difference between `arrow_dplyr_query()` and `as_adq()` is that if +# `.data` is already an `arrow_dplyr_query`, `as_adq()`, will return it as is, but +# `arrow_dplyr_query()` will nest it inside a new `arrow_dplyr_query`. The only +# place where `arrow_dplyr_query()` should be called directly is inside +# `collapse()` methods; everywhere else, call `as_adq()`. as_adq <- function(.data) { # For most dplyr methods, # method.Table == method.RecordBatch == method.Dataset == method.arrow_dplyr_query diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 8c60f844e5b..dd55108ec05 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -164,7 +164,7 @@ test_that("Group by var on dataset", { ) }) -test_that("Group by any/all", { +test_that("n()", { withr::local_options(list(arrow.debug = TRUE)) expect_dplyr_equal( input %>% diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 6ebdcd93283..6ae5b54fbf3 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -249,4 +249,4 @@ test_that("metadata of list elements (ARROW-10386)", { df_from_ds <- ds %>% select(int) %>% collect(), NA ) -}) \ No newline at end of file +}) From f7e3e546f475faa6bbd02120d7d72080d836b941 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 3 Sep 2021 08:41:13 -0400 Subject: [PATCH 23/27] Cleanups --- r/NEWS.md | 2 +- r/tests/testthat/test-dplyr-summarize.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 0421457e067..eb8001d4718 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,7 +22,7 @@ ## Breaking changes * `dplyr::summarize()` on an in-memory Arrow Table or RecordBatch no longer eagerly evaluates. Call `compute()` or `collect()` to evaluate the query. -* Row order of data from a Dataset query is no longer deterministic. If you need a stable sort order, you should explicitly `arrange()` the query. +* Row order of data from a Dataset query is no longer deterministic. If you need a stable sort order, you should explicitly `arrange()` the query. For calls to `summarize()`, you can set `options(arrow.summarise.sort = TRUE)` to match the current `dplyr` behavior of sorting on the grouping columns. # arrow 5.0.0 diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index dd55108ec05..d6ccbc90c04 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -18,7 +18,7 @@ skip_if_not_available("dataset") skip_if_multithreading_disabled() -# withr::local_options(list(arrow.summarise.sort = TRUE)) +withr::local_options(list(arrow.summarise.sort = TRUE)) library(dplyr) library(stringr) @@ -369,4 +369,4 @@ test_that("Expressions on aggregations", { collect(), tbl ) -}) +}) \ No newline at end of file From a63acb9606f006523f19ffd185e6e1790616ee1a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 3 Sep 2021 08:44:58 -0400 Subject: [PATCH 24/27] Improve test verbosity on windows --- .github/workflows/r.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 5acb47a0ae0..8eba1895b8b 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -247,6 +247,7 @@ jobs: Sys.setenv( RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "libarrow.zip"), MAKEFLAGS = paste0("-j", parallel::detectCores()), + ARROW_R_DEV = TRUE, "_R_CHECK_FORCE_SUGGESTS_" = FALSE ) rcmdcheck::rcmdcheck("r", @@ -260,6 +261,15 @@ jobs: shell: cmd run: cat check/arrow.Rcheck/00install.out if: always() + - name: Dump test logs + run: cat r/check/arrow.Rcheck/tests/testthat.Rout* + if: always() + - name: Save the test output + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-output + path: r/check/arrow.Rcheck/tests/testthat.Rout* # We can remove this when we drop support for Rtools 3.5. - name: Ensure using system tar in actions/cache run: | From 3462b24dce8ecddd19340f1bdc0c235e86736d01 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 3 Sep 2021 09:44:04 -0400 Subject: [PATCH 25/27] Skip all dataset tests on old 32-bit windows --- .github/workflows/r.yml | 9 --------- r/tests/testthat/helper-skip.R | 4 ++++ r/tests/testthat/test-dataset.R | 8 -------- r/tests/testthat/test-dplyr-arrange.R | 1 - r/tests/testthat/test-dplyr-collapse.R | 1 - r/tests/testthat/test-dplyr-filter.R | 1 - r/tests/testthat/test-dplyr-group-by.R | 1 - r/tests/testthat/test-dplyr-lubridate.R | 1 - r/tests/testthat/test-dplyr-mutate.R | 1 - r/tests/testthat/test-dplyr-string-functions.R | 1 - r/tests/testthat/test-dplyr-summarize.R | 3 +-- r/tests/testthat/test-dplyr.R | 1 - r/tests/testthat/test-duckdb.R | 1 - 13 files changed, 5 insertions(+), 28 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8eba1895b8b..e160ba8128a 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -261,15 +261,6 @@ jobs: shell: cmd run: cat check/arrow.Rcheck/00install.out if: always() - - name: Dump test logs - run: cat r/check/arrow.Rcheck/tests/testthat.Rout* - if: always() - - name: Save the test output - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-output - path: r/check/arrow.Rcheck/tests/testthat.Rout* # We can remove this when we drop support for Rtools 3.5. - name: Ensure using system tar in actions/cache run: | diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index 9a6bba60f91..9efa1be85b5 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -25,6 +25,10 @@ skip_if_not_available <- function(feature) { if (feature == "re2") { # RE2 does not support valgrind (on purpose): https://github.com/google/re2/issues/177 skip_on_valgrind() + } else if (feature == "dataset") { + # These tests often hang on 32-bit windows rtools35, and we haven't been + # able to figure out how to make them work safely + skip_if_multithreading_disabled() } yes <- feature %in% names(build_features) && build_features[feature] diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 411eb2ef36e..41265f0e638 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() context("Dataset") @@ -346,7 +345,6 @@ test_that("IPC/Feather format data", { }) test_that("CSV dataset", { - skip_if_multithreading_disabled() ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") expect_r6_class(ds$format, "CsvFileFormat") expect_r6_class(ds$filesystem, "LocalFileSystem") @@ -374,7 +372,6 @@ test_that("CSV dataset", { }) test_that("CSV scan options", { - skip_if_multithreading_disabled() options <- FragmentScanOptions$create("text") expect_equal(options$type, "csv") options <- FragmentScanOptions$create("csv", @@ -421,7 +418,6 @@ test_that("CSV scan options", { }) test_that("compressed CSV dataset", { - skip_if_multithreading_disabled() skip_if_not_available("gzip") dst_dir <- make_temp_dir() dst_file <- file.path(dst_dir, "data.csv.gz") @@ -445,7 +441,6 @@ test_that("compressed CSV dataset", { }) test_that("CSV dataset options", { - skip_if_multithreading_disabled() dst_dir <- make_temp_dir() dst_file <- file.path(dst_dir, "data.csv") df <- tibble(chr = letters[1:10]) @@ -473,7 +468,6 @@ test_that("CSV dataset options", { }) test_that("Other text delimited dataset", { - skip_if_multithreading_disabled() ds1 <- open_dataset(tsv_dir, partitioning = "part", format = "tsv") expect_equivalent( ds1 %>% @@ -502,7 +496,6 @@ test_that("Other text delimited dataset", { }) test_that("readr parse options", { - skip_if_multithreading_disabled() arrow_opts <- names(formals(CsvParseOptions$create)) readr_opts <- names(formals(readr_to_csv_parse_options)) @@ -1654,7 +1647,6 @@ test_that("Writing a dataset: Parquet format options", { }) test_that("Writing a dataset: CSV format options", { - skip_if_multithreading_disabled() df <- tibble( int = 1:10, dbl = as.numeric(1:10), diff --git a/r/tests/testthat/test-dplyr-arrange.R b/r/tests/testthat/test-dplyr-arrange.R index f131b8444f5..fc24df58ca7 100644 --- a/r/tests/testthat/test-dplyr-arrange.R +++ b/r/tests/testthat/test-dplyr-arrange.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() library(dplyr) diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index eb62e967449..331f7b7b62c 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() withr::local_options(list(arrow.summarise.sort = TRUE)) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index fb4951ac45e..e56ee4be462 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R index 0a8c76a920d..18be2a9304a 100644 --- a/r/tests/testthat/test-dplyr-group-by.R +++ b/r/tests/testthat/test-dplyr-group-by.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index e3281436379..64bb42a0ecf 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() library(lubridate) library(dplyr) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 3f2bb526f0c..44127839108 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 5383bcb2376..b6b8f5a714a 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -17,7 +17,6 @@ skip_if_not_available("dataset") skip_if_not_available("utf8proc") -skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index d6ccbc90c04..78d36630e56 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() withr::local_options(list(arrow.summarise.sort = TRUE)) @@ -369,4 +368,4 @@ test_that("Expressions on aggregations", { collect(), tbl ) -}) \ No newline at end of file +}) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 295d7778dc0..d3a9994b5f1 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -16,7 +16,6 @@ # under the License. skip_if_not_available("dataset") -skip_if_multithreading_disabled() library(dplyr) library(stringr) diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R index c4fad4f5148..56343ad729e 100644 --- a/r/tests/testthat/test-duckdb.R +++ b/r/tests/testthat/test-duckdb.R @@ -18,7 +18,6 @@ skip_if_not_installed("duckdb", minimum_version = "0.2.8") skip_if_not_installed("dbplyr") skip_if_not_available("dataset") -skip_if_multithreading_disabled() # when we remove this, we should also remove the FALSE in run_duckdb_examples skip("These tests are flaking: https://github.com/duckdb/duckdb/issues/2100") From 4fa268489e7bc1c4e7b5354cb72dd748a58678ce Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 3 Sep 2021 11:12:31 -0400 Subject: [PATCH 26/27] Final final tweaks --- r/R/dplyr-collect.R | 4 +++- r/R/dplyr.R | 8 ++++---- r/tests/testthat/helper-skip.R | 1 + 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 94210276417..8a5488bf599 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -91,7 +91,9 @@ implicit_schema <- function(.data) { new_fields <- map(.data$selected_columns, ~ .$type(old_schm)) } else { new_fields <- map(summarize_projection(.data), ~ .$type(old_schm)) - # * Put group_by_vars first (this can't be done by summarize, they have to be last per the aggregate node signature, and they get projected to this order after aggregation) + # * Put group_by_vars first (this can't be done by summarize, + # they have to be last per the aggregate node signature, + # and they get projected to this order after aggregation) # * Infer the output types from the aggregations group_fields <- new_fields[.data$group_by_vars] agg_fields <- imap( diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 7d7c748f0a3..199120887b9 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -67,7 +67,7 @@ arrow_dplyr_query <- function(.data) { } # The only difference between `arrow_dplyr_query()` and `as_adq()` is that if -# `.data` is already an `arrow_dplyr_query`, `as_adq()`, will return it as is, but +# `.data` is already an `arrow_dplyr_query`, `as_adq()`, will return it as is, but # `arrow_dplyr_query()` will nest it inside a new `arrow_dplyr_query`. The only # place where `arrow_dplyr_query()` should be called directly is inside # `collapse()` methods; everywhere else, call `as_adq()`. @@ -162,14 +162,14 @@ as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALS #' @export head.arrow_dplyr_query <- function(x, n = 6L, ...) { - # TODO: refactor/rename + # TODO (ARROW-13893): refactor out <- head.Dataset(x, n, ...) restore_dplyr_features(out, x) } #' @export tail.arrow_dplyr_query <- function(x, n = 6L, ...) { - # TODO: refactor/rename + # TODO (ARROW-13893): refactor out <- tail.Dataset(x, n, ...) restore_dplyr_features(out, x) } @@ -177,7 +177,7 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' @export `[.arrow_dplyr_query` <- `[.Dataset` # TODO: ^ should also probably restore_dplyr_features, and/or that should be moved down -# TODO: refactor/rename +# TODO (ARROW-13893): refactor ensure_group_vars <- function(x) { if (inherits(x, "arrow_dplyr_query")) { diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index 9efa1be85b5..3ec18a63019 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -39,6 +39,7 @@ skip_if_not_available <- function(feature) { skip_if_no_pyarrow <- function() { skip_on_valgrind() + skip_on_os("windows") skip_if_not_installed("reticulate") if (!reticulate::py_module_available("pyarrow")) { From ceecc8f79b54ea412e3d00063302e325d580b186 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 3 Sep 2021 11:59:50 -0400 Subject: [PATCH 27/27] Fix python skip --- r/tests/testthat/test-python.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-python.R b/r/tests/testthat/test-python.R index 9e67219e19a..d5815247d51 100644 --- a/r/tests/testthat/test-python.R +++ b/r/tests/testthat/test-python.R @@ -20,9 +20,10 @@ context("To/from Python") test_that("install_pyarrow", { skip_on_cran() skip_if_not_dev_mode() - # Python problems on Apple M1 still - skip_if(grepl("arm-apple|aarch64.*darwin", R.Version()$platform)) + # Windows CI machine doesn't pick up the right python or something + skip_on_os("windows") skip_if_not_installed("reticulate") + venv <- try(reticulate::virtualenv_create("arrow-test")) # Bail out if virtualenv isn't available skip_if(inherits(venv, "try-error"))