From bf47992a7ca51492c99324891fa35657ec4dd208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 8 Apr 2020 10:19:10 -0400 Subject: [PATCH 1/3] ARROW-8354: [R] Fix segfault in Table to Array conversion The Converter::Make did not like receiving empty ArrayVector. The bug was introduced in ARROW-8216 which could return an empty selection vector due to a randomly generated fixture in test-dplyr.R --- r/src/array_to_vector.cpp | 13 +++++++++---- r/tests/testthat/helper-arrow.R | 2 ++ r/tests/testthat/test-dplyr.R | 10 ++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 4162c824672..5aefaeefde3 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -656,8 +656,15 @@ class Converter_Null : public Converter { return Status::OK(); } }; + std::shared_ptr Converter::Make(const ArrayVector& arrays) { - switch (arrays[0]->type_id()) { + if (arrays.empty()) { + Rcpp::stop(tfm::format("Must have at least one array to create a converter")); + } + + auto type = arrays[0]->type(); + + switch (type->id()) { // direct support case Type::INT32: return std::make_shared>(arrays); @@ -742,7 +749,7 @@ std::shared_ptr Converter::Make(const ArrayVector& arrays) { break; } - Rcpp::stop(tfm::format("cannot handle Array of type %s", arrays[0]->type()->name())); + Rcpp::stop(tfm::format("cannot handle Array of type %s", type->name())); return nullptr; } @@ -813,7 +820,6 @@ SEXP Array__as_vector(const std::shared_ptr& array) { // [[arrow::export]] SEXP ChunkedArray__as_vector(const std::shared_ptr& chunked_array) { - // NB: this segfaults if there are 0 chunks (presumably something tries chunks[0]) return arrow::r::ArrayVector__as_vector(chunked_array->length(), chunked_array->chunks()); } @@ -849,7 +855,6 @@ Rcpp::List Table__to_dataframe(const std::shared_ptr& table, std::vector> converters(nc); for (int64_t i = 0; i < nc; i++) { - // This crashes if num_chunks == 0 converters[i] = arrow::r::Converter::Make(table->column(i)->chunks()); names[i] = table->field(i)->name(); } diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index bd35e1cb4c4..4e6a47d9672 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -23,6 +23,8 @@ if (tolower(Sys.info()[["sysname"]]) == "windows") { options(arrow.use_threads = FALSE) } +set.seed(1) + test_that <- function(what, code) { testthat::test_that(what, { skip_if(getOption("..skip.tests", TRUE), "arrow C++ library not available") diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index e77bfa3ad07..b65990ba95c 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -80,6 +80,7 @@ tbl <- tibble::tibble( int = 1:10, dbl = as.numeric(1:10), lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + false = logical(10), chr = letters[1:10], fct = factor(letters[1:10]) ) @@ -118,6 +119,15 @@ test_that("filter() with NAs in selection", { ) }) +test_that("Filter returning an empty Table should not segfault (ARROW-8354)", { + expect_dplyr_error( + input %>% + filter(false) %>% + select(chr, int, lgl) %>% + collect() + ) +}) + test_that("filtering with expression", { char_sym <- "b" expect_dplyr_equal( From 3653a08bdce2872ee952c069240a404531470935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 8 Apr 2020 14:43:23 -0400 Subject: [PATCH 2/3] Add failing case --- r/src/array_from_vector.cpp | 4 +++- r/tests/testthat/test-Array.R | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 5e3e9350be9..35bfbae8e9d 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -295,7 +295,9 @@ std::shared_ptr MakeStructArray(SEXP df, const std::shared_ptr& for (int i = 0; i < n; i++) { children[i] = Array__from_vector(VECTOR_ELT(df, i), type->child(i)->type(), true); } - return std::make_shared(type, children[0]->length(), children); + + int64_t rows = n ? children[0]->length() : 0; + return std::make_shared(type, rows, children); } std::shared_ptr MakeListArray(SEXP x, const std::shared_ptr& type) { diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index faf1e741a92..ad8068cc8b8 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -413,6 +413,11 @@ test_that("Array$create() can handle data frame with custom struct type (not inf expect_error(Array$create(df, type = type), regexp = "Expecting a character vector") }) +test_that("Array$create() supports tibble with no columns (ARROW-8354)", { + df <- tibble::tibble() + expect_equal(Array$create(df)$as_vector(), df) +}) + test_that("Array$create() handles vector -> list arrays (ARROW-7662)", { # Should be able to create an empty list with a type hint. expect_is(Array$create(list(), list_of(bool())), "ListArray") From c8e2c9f752f31f5f921bbb70635b4e4c0aa390e9 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 8 Apr 2020 13:00:17 -0700 Subject: [PATCH 3/3] Don't error when collecting 0-row table --- r/R/table.R | 2 +- r/src/compute.cpp | 18 ++++++++++++++++-- r/tests/testthat/test-dplyr.R | 5 +++-- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/r/R/table.R b/r/R/table.R index dd3461b7ed3..b899117c148 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -194,7 +194,7 @@ Table$create <- function(..., schema = NULL) { } #' @export -as.data.frame.Table <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){ +as.data.frame.Table <- function(x, row.names = NULL, optional = FALSE, ...) { Table__to_dataframe(x, use_threads = option_use_threads()) } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index eecbda9ead8..212b27c8a60 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -234,7 +234,14 @@ std::shared_ptr Table__Filter(const std::shared_ptr& options.null_selection_behavior = arrow::compute::FilterOptions::EMIT_NULL; } STOP_IF_NOT_OK(arrow::compute::Filter(&context, table, filter, options, &out)); - return out.table(); + std::shared_ptr tab = out.table(); + if (tab->num_rows() == 0) { + // Slight hack: if there are no rows in the result, instead do a 0-length + // slice so that we get chunked arrays with 1 chunk (itself length 0). + // We need that because the Arrow-to-R converter fails when there are 0 chunks. + return table->Slice(0, 0); + } + return tab; } // [[arrow::export]] @@ -249,6 +256,13 @@ std::shared_ptr Table__FilterChunked( options.null_selection_behavior = arrow::compute::FilterOptions::EMIT_NULL; } STOP_IF_NOT_OK(arrow::compute::Filter(&context, table, filter, options, &out)); - return out.table(); + std::shared_ptr tab = out.table(); + if (tab->num_rows() == 0) { + // Slight hack: if there are no rows in the result, instead do a 0-length + // slice so that we get chunked arrays with 1 chunk (itself length 0). + // We need that because the Arrow-to-R converter fails when there are 0 chunks. + return table->Slice(0, 0); + } + return tab; } #endif diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index b65990ba95c..e531fb30cf6 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -120,11 +120,12 @@ test_that("filter() with NAs in selection", { }) test_that("Filter returning an empty Table should not segfault (ARROW-8354)", { - expect_dplyr_error( + expect_dplyr_equal( input %>% filter(false) %>% select(chr, int, lgl) %>% - collect() + collect(), + tbl ) })