From f17c4ad3f434d87b38bf43b4d35d01fbe7eb7064 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 8 Jun 2021 15:12:10 +0100 Subject: [PATCH 01/23] rebase fix 1 --- r/R/record-batch.R | 11 +++++++++++ r/R/table.R | 10 ++++++++++ r/tests/testthat/test-RecordBatch.R | 5 ++--- r/tests/testthat/test-Table.R | 9 ++++++++- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 1e41d6533a8..4c3b9c90910 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -145,6 +145,7 @@ RecordBatch <- R6Class("RecordBatch", inherit = ArrowTabular, RecordBatch$create <- function(..., schema = NULL) { arrays <- list2(...) + if (length(arrays) == 1 && inherits(arrays[[1]], c("raw", "Buffer", "InputStream", "Message"))) { return(RecordBatch$from_message(arrays[[1]], schema)) } @@ -161,6 +162,16 @@ RecordBatch$create <- function(..., schema = NULL) { out <- RecordBatch__from_arrays(schema, arrays) return(dplyr::group_by(out, !!!dplyr::groups(arrays[[1]]))) } + + # If any arrays are length 1, recycle them + arr_lens <- map(arrays, length) + if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ + arrays <- modify2( + arrays, + arr_lens == 1, + ~if(.y) rep(.x, max(unlist(arr_lens))) else .x + ) + } # TODO: should this also assert that they're all Arrays? RecordBatch__from_arrays(schema, arrays) diff --git a/r/R/table.R b/r/R/table.R index 09be952af61..e5791f957d8 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -173,6 +173,16 @@ Table$create <- function(..., schema = NULL) { return(dplyr::group_by(out, !!!dplyr::groups(dots[[1]]))) } + # If any arrays are length 1, recycle them + arr_lens <- map(dots, length) + if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ + dots <- modify2( + dots, + arr_lens == 1, + ~if(.y) rep(.x, max(unlist(arr_lens))) else .x + ) + } + if (all_record_batches(dots)) { Table__from_record_batches(dots, schema) } else { diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index beb1306ab4f..1625ac7951f 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +library(purrr) test_that("RecordBatch", { # Note that we're reusing `tbl` and `batch` throughout the tests in this file @@ -416,7 +417,6 @@ test_that("record_batch() handles null type (ARROW-7064)", { }) test_that("record_batch() scalar recycling", { - skip("Not implemented (ARROW-11705)") expect_data_frame( record_batch(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) @@ -467,8 +467,7 @@ test_that("RecordBatch name assignment", { test_that("record_batch() with different length arrays", { msg <- "All arrays must have the same length" - expect_error(record_batch(a=1:5, b = 42), msg) - expect_error(record_batch(a=1:5, b = 1:6), msg) + expect_error(record_batch(a = 1:5, b = 1:6), msg) }) test_that("Handling string data with embedded nuls", { diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 1f9628859d0..284c222256f 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+library(purrr) test_that("read_table handles various input streams (ARROW-3450, ARROW-3505)", { tbl <- tibble::tibble( @@ -471,10 +472,16 @@ test_that("Table name assignment", { test_that("Table$create() with different length columns", { msg <- "All columns must have the same length" - expect_error(Table$create(a=1:5, b = 42), msg) expect_error(Table$create(a=1:5, b = 1:6), msg) }) +test_that("Table$create() scalar recycling", { + expect_data_frame( + Table$create(a = 1:10, b = 5), + tibble::tibble(a = 1:10, b = 5) + ) +}) + test_that("ARROW-11769 - grouping preserved in table creation", { skip_if_not_available("dataset") From 9ad72643f939835038ff7d97b0c3a544e9f38574 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 7 May 2021 20:17:58 +0100 Subject: [PATCH 02/23] Call modify2 directly --- r/R/record-batch.R | 3 +-- r/R/table.R | 2 +- r/tests/testthat/test-RecordBatch.R | 2 -- r/tests/testthat/test-Table.R | 2 -- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 4c3b9c90910..4eeb1192387 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -145,7 +145,6 @@ RecordBatch <- R6Class("RecordBatch", inherit = ArrowTabular, RecordBatch$create <- function(..., schema = NULL) { arrays <- list2(...) - if (length(arrays) == 1 && inherits(arrays[[1]], c("raw", "Buffer", "InputStream", "Message"))) { return(RecordBatch$from_message(arrays[[1]], schema)) } @@ -166,7 +165,7 @@ RecordBatch$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map(arrays, length) if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ - arrays <- modify2( + arrays <- purrr::modify2( arrays, arr_lens == 1, ~if(.y) rep(.x, max(unlist(arr_lens))) else .x diff --git a/r/R/table.R b/r/R/table.R index e5791f957d8..174e0611b67 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -176,7 +176,7 @@ Table$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map(dots, length) if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ - dots <- modify2( + dots <- purrr::modify2( dots, arr_lens == 1, ~if(.y) rep(.x, max(unlist(arr_lens))) else .x diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 1625ac7951f..2c6e1211975 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -library(purrr) - test_that("RecordBatch", { # Note that we're reusing `tbl` and `batch` throughout the tests in this file tbl <- tibble::tibble( diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 284c222256f..79cec2d6a80 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-library(purrr) - test_that("read_table handles various input streams (ARROW-3450, ARROW-3505)", { tbl <- tibble::tibble( int = 1:10, dbl = as.numeric(1:10), From ed665958d51a60c079caf0686321983a07295baa Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 10 May 2021 08:42:03 +0100 Subject: [PATCH 03/23] Import modify2 --- r/NAMESPACE | 1 + r/R/record-batch.R | 2 +- r/R/table.R | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index f298ba905ee..51f29c59271 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -296,6 +296,7 @@ importFrom(purrr,map_chr) importFrom(purrr,map_dfr) importFrom(purrr,map_int) importFrom(purrr,map_lgl) +importFrom(purrr,modify2) importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,abort) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 4eeb1192387..8ab63af4c1d 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -165,7 +165,7 @@ RecordBatch$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map(arrays, length) if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ - arrays <- purrr::modify2( + arrays <- modify2( arrays, arr_lens == 1, ~if(.y) rep(.x, max(unlist(arr_lens))) else .x diff --git a/r/R/table.R b/r/R/table.R index 174e0611b67..e5791f957d8 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -176,7 +176,7 @@ Table$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map(dots, length) if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ - dots <- purrr::modify2( + dots <- modify2( dots, arr_lens == 1, ~if(.y) rep(.x, max(unlist(arr_lens))) else .x From 0dc7666a5fb0b0fe5fbf1d9fa1b4e2f3cfafe4fd Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 10 May 2021 09:10:07 +0100 Subject: [PATCH 04/23] Add test for recycling not working if pass in tibble --- r/tests/testthat/test-RecordBatch.R | 10 ++++++++++ r/tests/testthat/test-Table.R | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 2c6e1211975..a7d0af19843 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -421,6 +421,16 @@ test_that("record_batch() scalar recycling", { ) }) +test_that("record_batch() no recycling with tibbles", { + expect_error( + record_batch( + tibble::tibble(a = 1:10, b = 5), + tibble::tibble(a = 1, b = 5) + ), + regexp = "All arrays must have the same length" + ) +}) + test_that("RecordBatch$Equals", { df <- tibble::tibble(x = 1:10, y = letters[1:10]) a <- record_batch(df) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 79cec2d6a80..44213d2d48d 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -470,7 +470,7 @@ test_that("Table name assignment", { test_that("Table$create() with different length columns", { msg <- "All columns must have the same length" - expect_error(Table$create(a=1:5, b = 1:6), msg) + expect_error(Table$create(a = 1:5, b = 1:6), msg) }) test_that("Table$create() scalar recycling", { @@ -480,6 +480,16 @@ test_that("Table$create() scalar recycling", { ) }) +test_that("Table$create() no recycling with tibbles", { + expect_error( + Table$create( + tibble::tibble(a = 1:10, b = 5), + tibble::tibble(a = 1, b = 5) + ), + regexp = "All columns must have the same length" + ) +}) + test_that("ARROW-11769 - grouping preserved in table creation", { skip_if_not_available("dataset") From 
830ff2bd166034e7e61c2d6ccf40e1a98e09c8cf Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 10 May 2021 13:02:22 +0100 Subject: [PATCH 05/23] Add handling of ArrowDatum objects to scalar recycling --- r/R/arrowExports.R | 4 ++-- r/R/record-batch.R | 3 ++- r/R/scalar.R | 2 +- r/R/table.R | 3 ++- r/src/arrowExports.cpp | 11 ++++++----- r/src/scalar.cpp | 4 ++-- r/tests/testthat/test-RecordBatch.R | 21 ++++++++++++++++++++- r/tests/testthat/test-Table.R | 21 ++++++++++++++++++++- 8 files changed, 55 insertions(+), 14 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 45a0ea69c59..577773c42bd 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1548,8 +1548,8 @@ Scalar__as_vector <- function(scalar){ .Call(`_arrow_Scalar__as_vector`, scalar) } -MakeArrayFromScalar <- function(scalar){ - .Call(`_arrow_MakeArrayFromScalar`, scalar) +MakeArrayFromScalar <- function(scalar, n){ + .Call(`_arrow_MakeArrayFromScalar`, scalar, n) } Scalar__is_valid <- function(s){ diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 8ab63af4c1d..06cf0eeaf2a 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -165,10 +165,11 @@ RecordBatch$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map(arrays, length) if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ + max_array_len <- max(unlist(arr_lens)) arrays <- modify2( arrays, arr_lens == 1, - ~if(.y) rep(.x, max(unlist(arr_lens))) else .x + ~if(.y) MakeArrayFromScalar(Scalar$create(as.vector(.x)), max_array_len) else .x ) } diff --git a/r/R/scalar.R b/r/R/scalar.R index 01a50b0f358..b8672d8bdcf 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -58,7 +58,7 @@ Scalar <- R6Class("Scalar", ToString = function() Scalar__ToString(self), type_id = function() Scalar__type(self)$id, as_vector = function() Scalar__as_vector(self), - as_array = function() MakeArrayFromScalar(self), + as_array = function() MakeArrayFromScalar(self, 1L), Equals = function(other, ...) 
{ inherits(other, "Scalar") && Scalar__Equals(self, other) }, diff --git a/r/R/table.R b/r/R/table.R index e5791f957d8..fca203ea91b 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -176,10 +176,11 @@ Table$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map(dots, length) if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ + max_array_len <- max(unlist(arr_lens)) dots <- modify2( dots, arr_lens == 1, - ~if(.y) rep(.x, max(unlist(arr_lens))) else .x + ~if(.y) MakeArrayFromScalar(Scalar$create(as.vector(.x)), max_array_len) else .x ) } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 2024483f47d..024e5c58b0e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -6091,15 +6091,16 @@ extern "C" SEXP _arrow_Scalar__as_vector(SEXP scalar_sexp){ // scalar.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr MakeArrayFromScalar(const std::shared_ptr& scalar); -extern "C" SEXP _arrow_MakeArrayFromScalar(SEXP scalar_sexp){ +std::shared_ptr MakeArrayFromScalar(const std::shared_ptr& scalar, int n); +extern "C" SEXP _arrow_MakeArrayFromScalar(SEXP scalar_sexp, SEXP n_sexp){ BEGIN_CPP11 arrow::r::Input&>::type scalar(scalar_sexp); - return cpp11::as_sexp(MakeArrayFromScalar(scalar)); + arrow::r::Input::type n(n_sexp); + return cpp11::as_sexp(MakeArrayFromScalar(scalar, n)); END_CPP11 } #else -extern "C" SEXP _arrow_MakeArrayFromScalar(SEXP scalar_sexp){ +extern "C" SEXP _arrow_MakeArrayFromScalar(SEXP scalar_sexp, SEXP n_sexp){ Rf_error("Cannot call MakeArrayFromScalar(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -7279,7 +7280,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2}, { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2}, { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1}, - { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 1}, + { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 2}, { "_arrow_Scalar__is_valid", (DL_FUNC) &_arrow_Scalar__is_valid, 1}, { "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1}, { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, diff --git a/r/src/scalar.cpp b/r/src/scalar.cpp index 057e587e7eb..5450a6f0ab7 100644 --- a/r/src/scalar.cpp +++ b/r/src/scalar.cpp @@ -70,8 +70,8 @@ SEXP Scalar__as_vector(const std::shared_ptr& scalar) { // [[arrow::export]] std::shared_ptr MakeArrayFromScalar( - const std::shared_ptr& scalar) { - return ValueOrStop(arrow::MakeArrayFromScalar(*scalar, 1, gc_memory_pool())); + const std::shared_ptr& scalar, int n) { + return ValueOrStop(arrow::MakeArrayFromScalar(*scalar, n, gc_memory_pool())); } // [[arrow::export]] diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index a7d0af19843..a74a80f49f7 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -414,13 +414,32 @@ test_that("record_batch() handles null type (ARROW-7064)", { expect_equivalent(batch$schema, schema(a = int32(), n = null())) }) -test_that("record_batch() scalar recycling", { +test_that("record_batch() scalar recycling with vectors", { expect_data_frame( record_batch(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) ) }) +test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArrays", { + + expect_data_frame( + record_batch(a = 
Array$create(1:10), b = Scalar$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + + expect_data_frame( + record_batch(a = Array$create(1:10), b = Array$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + + expect_data_frame( + record_batch(a = Array$create(1:10), b = ChunkedArray$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + +}) + test_that("record_batch() no recycling with tibbles", { expect_error( record_batch( diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 44213d2d48d..c1c055cbf3f 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -473,13 +473,32 @@ test_that("Table$create() with different length columns", { expect_error(Table$create(a = 1:5, b = 1:6), msg) }) -test_that("Table$create() scalar recycling", { +test_that("Table$create() scalar recycling with vectors", { expect_data_frame( Table$create(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) ) }) +test_that("Table$create() scalar recycling with Scalars, Arrays, and ChunkedArrays", { + + expect_data_frame( + Table$create(a = Array$create(1:10), b = Scalar$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + + expect_data_frame( + Table$create(a = Array$create(1:10), b = Array$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + + expect_data_frame( + Table$create(a = Array$create(1:10), b = ChunkedArray$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + +}) + test_that("Table$create() no recycling with tibbles", { expect_error( Table$create( From b7094cd312ded70ed881e4ef4523ed367d9c3442 Mon Sep 17 00:00:00 2001 From: Nic Date: Mon, 10 May 2021 16:27:26 +0100 Subject: [PATCH 06/23] Update r/R/record-batch.R Co-authored-by: Ian Cook --- r/R/record-batch.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 06cf0eeaf2a..4ae44572541 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -163,9 +163,9 @@ RecordBatch$create <- function(..., schema = NULL) { } # If any arrays are length 1, recycle them - arr_lens <- map(arrays, length) + arr_lens <- map_int(arrays, length) if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ - max_array_len <- max(unlist(arr_lens)) + max_array_len <- max(arr_lens) arrays <- modify2( arrays, arr_lens == 1, From d57c33c29f37cf8f9beae02f71a02f257bdcbaad Mon Sep 17 00:00:00 2001 From: Nic Date: Mon, 10 May 2021 19:15:41 +0100 Subject: [PATCH 07/23] Update r/R/table.R Co-authored-by: Ian Cook --- r/R/table.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/table.R b/r/R/table.R index fca203ea91b..8e2c777a442 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -174,9 +174,9 @@ Table$create <- function(..., schema = NULL) { } # If any arrays are length 1, recycle them - arr_lens <- map(dots, length) + arr_lens <- map_int(dots, length) if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ - max_array_len <- max(unlist(arr_lens)) + max_array_len <- max(arr_lens) dots <- modify2( dots, arr_lens == 1, From c478825415edf9cd1214a9ca68610e865cfe0ea7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 May 2021 11:12:08 +0100 Subject: [PATCH 08/23] Remove call to as.vector --- r/R/record-batch.R | 4 ++-- r/R/table.R | 5 ++++- r/R/util.R | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 4ae44572541..d023314982d 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -169,10 +169,10 @@ RecordBatch$create <- function(..., schema = NULL) { arrays <- 
modify2( arrays, arr_lens == 1, - ~if(.y) MakeArrayFromScalar(Scalar$create(as.vector(.x)), max_array_len) else .x + ~if(.y) repeat_value_as_array(.x, max_array_len) else .x ) } - + # TODO: should this also assert that they're all Arrays? RecordBatch__from_arrays(schema, arrays) } diff --git a/r/R/table.R b/r/R/table.R index 8e2c777a442..9563d1a76b9 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -175,12 +175,13 @@ Table$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map_int(dots, length) + if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ max_array_len <- max(arr_lens) dots <- modify2( dots, arr_lens == 1, - ~if(.y) MakeArrayFromScalar(Scalar$create(as.vector(.x)), max_array_len) else .x + ~if(.y) repeat_value_as_array(.x, max_array_len) else .x ) } @@ -191,5 +192,7 @@ Table$create <- function(..., schema = NULL) { } } + + #' @export names.Table <- function(x) x$ColumnNames() diff --git a/r/R/util.R b/r/R/util.R index 8d1f51bd079..5336c508011 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -139,3 +139,20 @@ attr(is_writable_table, "fail") <- function(call, env){ ) } +#' Take an object of length 1 and repeat it. +#' +#' @param object Object to be repeated - vector, `Scalar`, `Array`, or `ChunkedArray` +#' @param n Number of repetitions +#' +#' @return `Array` of length `n` +#' +#' @keywords internal +repeat_value_as_array <- function(object, n){ + if(length(object) != 1){ + stop("Object to be repeated must be of length 1") + } + if (inherits(object, "ChunkedArray")) { + return(MakeArrayFromScalar(Scalar$create(object$chunks[[1]]), n)) + } + return(MakeArrayFromScalar(Scalar$create(object), n)) +} From 0d298f7e51c4a033c3005fbb7201dff722c23708 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 May 2021 15:58:27 +0100 Subject: [PATCH 09/23] Fix spacing and don't skip test --- r/R/arrow-datum.R | 4 ++-- r/R/arrow-package.R | 2 +- r/R/arrow-tabular.R | 4 ++-- r/R/chunked-array.R | 2 +- r/R/compression.R | 4 ++-- r/R/compute.R | 4 ++-- r/R/csv.R | 6 +++--- r/R/enums.R | 4 ++-- r/R/filesystem.R | 2 +- r/R/metadata.R | 2 +- r/R/parquet.R | 4 ++-- r/R/record-batch.R | 4 ++-- r/R/table.R | 4 ++-- r/R/util.R | 4 ++-- r/data-raw/codegen.R | 8 ++++---- r/extra-tests/helpers.R | 4 ++-- r/extra-tests/write-files.R | 2 +- r/tests/testthat/helper-expectation.R | 6 +++--- r/tests/testthat/test-dataset.R | 2 +- r/tools/winlibs.R | 6 +++--- 20 files changed, 39 insertions(+), 39 deletions(-) diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index 3be8d75af0b..8becc37daf2 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -128,7 +128,7 @@ eval_array_expression <- function(FUN, } #' @export -na.omit.ArrowDatum <- function(object, ...){ +na.omit.ArrowDatum <- function(object, ...) { object$Filter(!is.na(object)) } @@ -136,7 +136,7 @@ na.omit.ArrowDatum <- function(object, ...){ na.exclude.ArrowDatum <- na.omit.ArrowDatum #' @export -na.fail.ArrowDatum <- function(object, ...){ +na.fail.ArrowDatum <- function(object, ...) { if (object$null_count > 0) { stop("missing values in object", call. 
= FALSE) } diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 684382039f1..d2bf81cf5ee 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -279,7 +279,7 @@ ArrowObject <- R6Class("ArrowObject", class_title <- class(self)[[1]] } cat(class_title, "\n", sep = "") - if (!is.null(self$ToString)){ + if (!is.null(self$ToString)) { cat(self$ToString(), "\n", sep = "") } invisible(self) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index f5535f9ac20..440dcea5994 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -212,7 +212,7 @@ head.ArrowTabular <- head.ArrowDatum tail.ArrowTabular <- tail.ArrowDatum #' @export -na.fail.ArrowTabular <- function(object, ...){ +na.fail.ArrowTabular <- function(object, ...) { for (col in seq_len(object$num_columns)) { if (object$column(col - 1L)$null_count > 0) { stop("missing values in object", call. = FALSE) @@ -222,7 +222,7 @@ na.fail.ArrowTabular <- function(object, ...){ } #' @export -na.omit.ArrowTabular <- function(object, ...){ +na.omit.ArrowTabular <- function(object, ...) { not_na <- map(object$columns, ~call_function("is_valid", .x)) not_na_agg <- Reduce("&", not_na) object$Filter(not_na_agg) diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index fac1eeba2b1..c58e5ac94f9 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -83,7 +83,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, type_id = function() ChunkedArray__type(self)$id, chunk = function(i) Array$create(ChunkedArray__chunk(self, i)), as_vector = function() ChunkedArray__as_vector(self), - Slice = function(offset, length = NULL){ + Slice = function(offset, length = NULL) { if (is.null(length)) { ChunkedArray__Slice1(self, offset) } else { diff --git a/r/R/compression.R b/r/R/compression.R index 8fd709f4fda..499a75c83e1 100644 --- a/r/R/compression.R +++ b/r/R/compression.R @@ -99,7 +99,7 @@ compression_from_name <- function(name) { #' @export #' @include arrow-package.R CompressedOutputStream <- R6Class("CompressedOutputStream", inherit = OutputStream) -CompressedOutputStream$create <- function(stream, codec = "gzip", compression_level = NA){ +CompressedOutputStream$create <- function(stream, codec = "gzip", compression_level = NA) { codec <- Codec$create(codec, compression_level = compression_level) if (is.string(stream)) { stream <- FileOutputStream$create(stream) @@ -113,7 +113,7 @@ CompressedOutputStream$create <- function(stream, codec = "gzip", compression_le #' @format NULL #' @export CompressedInputStream <- R6Class("CompressedInputStream", inherit = InputStream) -CompressedInputStream$create <- function(stream, codec = "gzip", compression_level = NA){ +CompressedInputStream$create <- function(stream, codec = "gzip", compression_level = NA) { codec <- Codec$create(codec, compression_level = compression_level) if (is.string(stream)) { stream <- ReadableFile$create(stream) diff --git a/r/R/compute.R b/r/R/compute.R index 4d36f6057b6..5a00e884980 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -202,7 +202,7 @@ unique.ArrowDatum <- function(x, incomparables = FALSE, ...) 
{ } #' @export -any.ArrowDatum <- function(..., na.rm = FALSE){ +any.ArrowDatum <- function(..., na.rm = FALSE) { a <- collect_arrays_from_dots(list(...)) result <- call_function("any", a) @@ -217,7 +217,7 @@ any.ArrowDatum <- function(..., na.rm = FALSE){ } #' @export -all.ArrowDatum <- function(..., na.rm = FALSE){ +all.ArrowDatum <- function(..., na.rm = FALSE) { a <- collect_arrays_from_dots(list(...)) result <- call_function("all", a) diff --git a/r/R/csv.R b/r/R/csv.R index 2708a5370f0..1312a2676ae 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -414,7 +414,7 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(), #' @rdname CsvReadOptions #' @export CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject) -CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L){ +CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) { assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0) csv___WriteOptions__initialize( list( @@ -637,9 +637,9 @@ write_csv_arrow <- function(x, on.exit(sink$close()) } - if(inherits(x, "RecordBatch")){ + if (inherits(x, "RecordBatch")) { csv___WriteCSV__RecordBatch(x, write_options, sink) - } else if(inherits(x, "Table")){ + } else if (inherits(x, "Table")) { csv___WriteCSV__Table(x, write_options, sink) } diff --git a/r/R/enums.R b/r/R/enums.R index ae44ccf2cad..4271f2ad138 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -16,11 +16,11 @@ # under the License. #' @export -`print.arrow-enum` <- function(x, ...){ +`print.arrow-enum` <- function(x, ...) { NextMethod() } -enum <- function(class, ..., .list = list(...)){ +enum <- function(class, ..., .list = list(...)) { structure( .list, class = c(class, "arrow-enum") diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 6761acab30e..283fbbb0ae5 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -203,7 +203,7 @@ FileSystem <- R6Class("FileSystem", inherit = ArrowObject, GetFileInfo = function(x) { if (inherits(x, "FileSelector")) { fs___FileSystem__GetTargetInfos_FileSelector(self, x) - } else if (is.character(x)){ + } else if (is.character(x)) { fs___FileSystem__GetTargetInfos_Paths(self, clean_path_rel(x)) } else { abort("incompatible type for FileSystem$GetFileInfo()") diff --git a/r/R/metadata.R b/r/R/metadata.R index d3e5e2150bb..408c2214a31 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -59,7 +59,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) { x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) } } - } else if(is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { + } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { x <- map2(x, columns_metadata, function(.x, .y) { apply_arrow_r_metadata(.x, .y) }) diff --git a/r/R/parquet.R b/r/R/parquet.R index a9aef2c4d0d..3006fcbbe50 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -296,7 +296,7 @@ ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder", inhe parquet___ArrowWriterProperties___Builder__set_compressions ) }, - set_compression_level = function(table, compression_level){ + set_compression_level = function(table, compression_level) { # cast to integer but keep names compression_level <- set_names(as.integer(compression_level), names(compression_level)) private$.set(table, compression_level, @@ -558,7 +558,7 @@ ParquetArrowReaderProperties <- R6Class("ParquetArrowReaderProperties", ), active = list( use_threads = function(use_threads) { - if(missing(use_threads)) { + if 
(missing(use_threads)) { parquet___arrow___ArrowReaderProperties__get_use_threads(self) } else { parquet___arrow___ArrowReaderProperties__set_use_threads(self, use_threads) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index d023314982d..f3f45c95e53 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -164,12 +164,12 @@ RecordBatch$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map_int(arrays, length) - if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ + if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { max_array_len <- max(arr_lens) arrays <- modify2( arrays, arr_lens == 1, - ~if(.y) repeat_value_as_array(.x, max_array_len) else .x + ~if (.y) repeat_value_as_array(.x, max_array_len) else .x ) } diff --git a/r/R/table.R b/r/R/table.R index 9563d1a76b9..a9a70af8d4f 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -176,12 +176,12 @@ Table$create <- function(..., schema = NULL) { # If any arrays are length 1, recycle them arr_lens <- map_int(dots, length) - if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)){ + if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { max_array_len <- max(arr_lens) dots <- modify2( dots, arr_lens == 1, - ~if(.y) repeat_value_as_array(.x, max_array_len) else .x + ~if (.y) repeat_value_as_array(.x, max_array_len) else .x ) } diff --git a/r/R/util.R b/r/R/util.R index 5336c508011..d8b9903c9c4 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -147,8 +147,8 @@ attr(is_writable_table, "fail") <- function(call, env){ #' @return `Array` of length `n` #' #' @keywords internal -repeat_value_as_array <- function(object, n){ - if(length(object) != 1){ +repeat_value_as_array <- function(object, n) { + if (length(object) != 1) { stop("Object to be repeated must be of length 1") } if (inherits(object, "ChunkedArray")) { diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index ad4514a3124..9b25cb1842c 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -67,13 +67,13 @@ get_exported_functions <- function(decorations, export_tag) { glue_collapse_data <- function(data, ..., sep = ", ", last = "") { res <- glue_collapse(glue_data(data, ...), sep = sep, last = last) - if(length(res) == 0) res <- "" + if (length(res) == 0) res <- "" res } wrap_call <- function(name, return_type, args) { call <- glue::glue('{name}({list_params})', list_params = glue_collapse_data(args, "{name}")) - if(return_type == "void") { + if (return_type == "void") { glue::glue("\t{call};\n\treturn R_NilValue;", .trim = FALSE) } else { glue::glue("\treturn cpp11::as_sexp({call});") @@ -149,7 +149,7 @@ cpp_functions_definitions <- arrow_exports %>% sep = "\n", real_params = glue_collapse_data(args, "{type} {name}"), input_params = glue_collapse_data(args, "\tarrow::r::Input<{type}>::type {name}({name}_sexp);", sep = "\n"), - return_line = if(nrow(args)) "\n" else "") + return_line = if (nrow(args)) "\n" else "") glue::glue(' // {basename(file)} @@ -162,7 +162,7 @@ cpp_functions_definitions <- arrow_exports %>% cpp_functions_registration <- arrow_exports %>% select(name, return_type, args) %>% - pmap_chr(function(name, return_type, args){ + pmap_chr(function(name, return_type, args) { glue('\t\t{{ "_arrow_{name}", (DL_FUNC) &_arrow_{name}, {nrow(args)}}}, ') }) %>% glue_collapse(sep = "\n") diff --git a/r/extra-tests/helpers.R b/r/extra-tests/helpers.R index af57d45e5d2..3fb450ee332 100644 --- a/r/extra-tests/helpers.R +++ b/r/extra-tests/helpers.R @@ -24,13 +24,13 @@ 
if_version_less_than <- function(version) { } skip_if_version_less_than <- function(version, msg) { - if(if_version(version, `<`)) { + if (if_version(version, `<`)) { skip(msg) } } skip_if_version_equals <- function(version, msg) { - if(if_version(version, `==`)) { + if (if_version(version, `==`)) { skip(msg) } } diff --git a/r/extra-tests/write-files.R b/r/extra-tests/write-files.R index 75889b61407..e11405d67bf 100644 --- a/r/extra-tests/write-files.R +++ b/r/extra-tests/write-files.R @@ -26,7 +26,7 @@ source("tests/testthat/helper-data.R") write_parquet(example_with_metadata, "extra-tests/files/ex_data.parquet") for (comp in c("lz4", "uncompressed", "zstd")) { - if(!codec_is_available(comp)) break + if (!codec_is_available(comp)) break name <- paste0("extra-tests/files/ex_data_", comp, ".feather") write_feather(example_with_metadata, name, compression = comp) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 5b6958a9a7a..b815515a4fa 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -16,7 +16,7 @@ # under the License. expect_as_vector <- function(x, y, ignore_attr = FALSE, ...) { - expect_fun <- if(ignore_attr){ + expect_fun <- if (ignore_attr) { expect_equivalent } else { expect_equal @@ -28,7 +28,7 @@ expect_data_frame <- function(x, y, ...) { expect_equal(as.data.frame(x), y, ...) } -expect_r6_class <- function(object, class){ +expect_r6_class <- function(object, class) { expect_s3_class(object, class) expect_s3_class(object, "R6") } @@ -255,7 +255,7 @@ expect_vector_error <- function(expr, # A vectorized R expression containing `in } } -split_vector_as_list <- function(vec){ +split_vector_as_list <- function(vec) { vec_split <- length(vec) %/% 2 vec1 <- vec[seq(from = min(1, length(vec) - 1), to = min(length(vec) - 1, vec_split), by = 1)] vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)] diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index d84ed03c2d2..ad3e7c30f1f 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -90,7 +90,7 @@ test_that("Setup (putting data in the dir)", { expect_length(dir(tsv_dir, recursive = TRUE), 2) }) -if(arrow_with_parquet()) { +if (arrow_with_parquet()) { files <- c( file.path(dataset_dir, 1, "file1.parquet", fsep = "/"), file.path(dataset_dir, 2, "file2.parquet", fsep = "/") diff --git a/r/tools/winlibs.R b/r/tools/winlibs.R index f90becb7649..ccaa5c95d87 100644 --- a/r/tools/winlibs.R +++ b/r/tools/winlibs.R @@ -17,12 +17,12 @@ args <- commandArgs(TRUE) VERSION <- args[1] -if(!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))){ - if(length(args) > 1){ +if (!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) { + if (length(args) > 1) { # Arg 2 would be the path/to/lib.zip localfile <- args[2] cat(sprintf("*** Using RWINLIB_LOCAL %s\n", localfile)) - if(!file.exists(localfile)){ + if (!file.exists(localfile)) { cat(sprintf("*** %s does not exist; build will fail\n", localfile)) } file.copy(localfile, "lib.zip") From 4905839fb1e6c2634f4a8ab43677abee15f5c4c4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 11 May 2021 18:16:26 +0100 Subject: [PATCH 10/23] Run devtools::document() --- r/man/repeat_value_as_array.Rd | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 r/man/repeat_value_as_array.Rd diff --git a/r/man/repeat_value_as_array.Rd b/r/man/repeat_value_as_array.Rd new file mode 100644 index 
00000000000..a546c4cef2e --- /dev/null +++ b/r/man/repeat_value_as_array.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{repeat_value_as_array} +\alias{repeat_value_as_array} +\title{Take an object of length 1 and repeat it.} +\usage{ +repeat_value_as_array(object, n) +} +\arguments{ +\item{object}{Object to be repeated - vector, \code{Scalar}, \code{Array}, or \code{ChunkedArray}} + +\item{n}{Number of repetitions} +} +\value{ +\code{Array} of length \code{n} +} +\description{ +Take an object of length 1 and repeat it. +} +\keyword{internal} From 9ad9a499e7da721ea5dc673025b2ff867fdf2c1f Mon Sep 17 00:00:00 2001 From: Nic Date: Fri, 14 May 2021 20:23:42 +0100 Subject: [PATCH 11/23] Update r/R/scalar.R Co-authored-by: Neal Richardson --- r/R/scalar.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/scalar.R b/r/R/scalar.R index b8672d8bdcf..6e5e63cee3e 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -58,7 +58,7 @@ Scalar <- R6Class("Scalar", ToString = function() Scalar__ToString(self), type_id = function() Scalar__type(self)$id, as_vector = function() Scalar__as_vector(self), - as_array = function() MakeArrayFromScalar(self, 1L), + as_array = function(length = 1L) MakeArrayFromScalar(self, as.integer(length)), Equals = function(other, ...) { inherits(other, "Scalar") && Scalar__Equals(self, other) }, From 65b18e548bf4344479595e98148b2eeb8bc6c7c4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 17 May 2021 13:09:38 +0100 Subject: [PATCH 12/23] Use lapply instead of modify2 --- r/R/record-batch.R | 8 ++------ r/R/table.R | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index f3f45c95e53..4aab497b17a 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -163,14 +163,10 @@ RecordBatch$create <- function(..., schema = NULL) { } # If any arrays are length 1, recycle them - arr_lens <- map_int(arrays, length) + arr_lens <- lengths(arrays) if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { max_array_len <- max(arr_lens) - arrays <- modify2( - arrays, - arr_lens == 1, - ~if (.y) repeat_value_as_array(.x, max_array_len) else .x - ) + arrays[arr_lens == 1] <- lapply(arrays[arr_lens == 1], repeat_value_as_array, max_array_len) } # TODO: should this also assert that they're all Arrays? diff --git a/r/R/table.R b/r/R/table.R index a9a70af8d4f..79ff867a28f 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -174,15 +174,11 @@ Table$create <- function(..., schema = NULL) { } # If any arrays are length 1, recycle them - arr_lens <- map_int(dots, length) + arr_lens <- lengths(dots) if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { max_array_len <- max(arr_lens) - dots <- modify2( - dots, - arr_lens == 1, - ~if (.y) repeat_value_as_array(.x, max_array_len) else .x - ) + dots[arr_lens == 1] <- lapply(dots[arr_lens == 1], repeat_value_as_array, max_array_len) } if (all_record_batches(dots)) { From 789e850af33b93655cf77d393b0f8314f87f4e57 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 17 May 2021 21:36:48 +0100 Subject: [PATCH 13/23] Remove unnecessary check of length --- r/R/util.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index d8b9903c9c4..2ac1a14fc86 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -141,16 +141,13 @@ attr(is_writable_table, "fail") <- function(call, env){ #' Take an object of length 1 and repeat it. 
#' -#' @param object Object to be repeated - vector, `Scalar`, `Array`, or `ChunkedArray` +#' @param object Object of length 1 to be repeated - vector, `Scalar`, `Array`, or `ChunkedArray` #' @param n Number of repetitions #' #' @return `Array` of length `n` #' #' @keywords internal repeat_value_as_array <- function(object, n) { - if (length(object) != 1) { - stop("Object to be repeated must be of length 1") - } if (inherits(object, "ChunkedArray")) { return(MakeArrayFromScalar(Scalar$create(object$chunks[[1]]), n)) } From f071f2eda64c2b42fc15e93a7d11359b0a743801 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 19 May 2021 16:51:34 +0100 Subject: [PATCH 14/23] Make behaviour consistent with tibble --- r/R/record-batch.R | 10 +++++++--- r/tests/testthat/test-RecordBatch.R | 12 ++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 4aab497b17a..d70a90c7ef8 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -161,12 +161,16 @@ RecordBatch$create <- function(..., schema = NULL) { out <- RecordBatch__from_arrays(schema, arrays) return(dplyr::group_by(out, !!!dplyr::groups(arrays[[1]]))) } - - # If any arrays are length 1, recycle them + + # If any arrays are length 1, recycle them + # Get lengths of items in arrays + is_df <- map_lgl(arrays, ~inherits(.x, "data.frame")) arr_lens <- lengths(arrays) + arr_lens[is_df] <- map_int(arrays[is_df], nrow) + if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { max_array_len <- max(arr_lens) - arrays[arr_lens == 1] <- lapply(arrays[arr_lens == 1], repeat_value_as_array, max_array_len) + arrays[arr_lens == 1 && !is_df] <- lapply(arrays[arr_lens == 1 && !is_df], repeat_value_as_array, max_array_len) } # TODO: should this also assert that they're all Arrays? 
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index a74a80f49f7..3f5d4cc51a7 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -443,11 +443,19 @@ test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArra test_that("record_batch() no recycling with tibbles", { expect_error( record_batch( - tibble::tibble(a = 1:10, b = 5), + tibble::tibble(a = 1:10), tibble::tibble(a = 1, b = 5) ), regexp = "All arrays must have the same length" ) + + expect_error( + record_batch( + tibble::tibble(a = 1:10), + tibble::tibble(a = 1) + ), + regexp = "All arrays must have the same length" + ) }) test_that("RecordBatch$Equals", { @@ -462,7 +470,7 @@ test_that("RecordBatch$Equals", { test_that("RecordBatch$Equals(check_metadata)", { df <- tibble::tibble(x = 1:2, y = c("a", "b")) rb1 <- record_batch(df) - rb2 <- record_batch(df, schema = rb1$schema$WithMetadata(list(some="metadata"))) + rb2 <- record_batch(df, schema = rb1$schema$WithMetadata(list(some = "metadata"))) expect_r6_class(rb1, "RecordBatch") expect_r6_class(rb2, "RecordBatch") From 4aa1eebb222ee915d7cab3e36b9cc526e8d1d178 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 19 May 2021 17:57:04 +0100 Subject: [PATCH 15/23] Refactor scalar recycling as own function, and only do on non data.frame objects --- r/NAMESPACE | 1 - r/R/record-batch.R | 10 +--------- r/R/table.R | 7 +------ r/R/util.R | 18 ++++++++++++++++++ r/man/recycle_scalars.Rd | 18 ++++++++++++++++++ r/man/repeat_value_as_array.Rd | 2 +- 6 files changed, 39 insertions(+), 17 deletions(-) create mode 100644 r/man/recycle_scalars.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 51f29c59271..f298ba905ee 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -296,7 +296,6 @@ importFrom(purrr,map_chr) importFrom(purrr,map_dfr) importFrom(purrr,map_int) importFrom(purrr,map_lgl) -importFrom(purrr,modify2) importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,abort) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index d70a90c7ef8..0ba6b4bd45d 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -163,15 +163,7 @@ RecordBatch$create <- function(..., schema = NULL) { } # If any arrays are length 1, recycle them - # Get lengths of items in arrays - is_df <- map_lgl(arrays, ~inherits(.x, "data.frame")) - arr_lens <- lengths(arrays) - arr_lens[is_df] <- map_int(arrays[is_df], nrow) - - if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { - max_array_len <- max(arr_lens) - arrays[arr_lens == 1 && !is_df] <- lapply(arrays[arr_lens == 1 && !is_df], repeat_value_as_array, max_array_len) - } + arrays <- recycle_scalars(arrays) # TODO: should this also assert that they're all Arrays? 
RecordBatch__from_arrays(schema, arrays) diff --git a/r/R/table.R b/r/R/table.R index 79ff867a28f..54614719562 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -174,13 +174,8 @@ Table$create <- function(..., schema = NULL) { } # If any arrays are length 1, recycle them - arr_lens <- lengths(dots) + dots <- recycle_scalars(dots) - if (length(dots) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { - max_array_len <- max(arr_lens) - dots[arr_lens == 1] <- lapply(dots[arr_lens == 1], repeat_value_as_array, max_array_len) - } - if (all_record_batches(dots)) { Table__from_record_batches(dots, schema) } else { diff --git a/r/R/util.R b/r/R/util.R index 2ac1a14fc86..a301d07317d 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -153,3 +153,21 @@ repeat_value_as_array <- function(object, n) { } return(MakeArrayFromScalar(Scalar$create(object), n)) } + +#' Recycle scalar values in a list of arrays +#' +#' @param arrays List of arrays +#' @return List of arrays with any vector/Scalar/Array/ChunkedArray values of length 1 recycled +#' @keywords internal +recycle_scalars <- function(arrays){ + # Get lengths of items in arrays + is_df <- map_lgl(arrays, ~inherits(.x, "data.frame")) + arr_lens <- lengths(arrays) + arr_lens[is_df] <- map_int(arrays[is_df], nrow) + + if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { + max_array_len <- max(arr_lens) + arrays[arr_lens == 1 & !is_df] <- lapply(arrays[arr_lens == 1 & !is_df], repeat_value_as_array, max_array_len) + } + arrays +} \ No newline at end of file diff --git a/r/man/recycle_scalars.Rd b/r/man/recycle_scalars.Rd new file mode 100644 index 00000000000..3d97ecfd79f --- /dev/null +++ b/r/man/recycle_scalars.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{recycle_scalars} +\alias{recycle_scalars} +\title{Recycle scalar values in a list of arrays} +\usage{ +recycle_scalars(arrays) +} +\arguments{ +\item{arrays}{List of arrays} +} +\value{ +List of arrays with any vector/Scalar/Array/ChunkedArray values of length 1 recycled +} +\description{ +Recycle scalar values in a list of arrays +} +\keyword{internal} diff --git a/r/man/repeat_value_as_array.Rd b/r/man/repeat_value_as_array.Rd index a546c4cef2e..a4937326efa 100644 --- a/r/man/repeat_value_as_array.Rd +++ b/r/man/repeat_value_as_array.Rd @@ -7,7 +7,7 @@ repeat_value_as_array(object, n) } \arguments{ -\item{object}{Object to be repeated - vector, \code{Scalar}, \code{Array}, or \code{ChunkedArray}} +\item{object}{Object of length 1 to be repeated - vector, \code{Scalar}, \code{Array}, or \code{ChunkedArray}} \item{n}{Number of repetitions} } From d148a8bab401b5bb046b9d50e9785b76fae7ca8b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 19 May 2021 18:18:46 +0100 Subject: [PATCH 16/23] Add test for tibble with length 1 --- r/tests/testthat/test-Table.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index c1c055cbf3f..1ce57be9ea6 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -507,6 +507,14 @@ test_that("Table$create() no recycling with tibbles", { ), regexp = "All columns must have the same length" ) + + expect_error( + Table$create( + tibble::tibble(a = 1:10, b = 5), + tibble::tibble(a = 1) + ), + regexp = "All columns must have the same length" + ) }) test_that("ARROW-11769 - grouping preserved in table creation", { From c791aeb1c6622879f40df18558c6750efca3c772 Mon Sep 17 00:00:00 2001 From: Nic Date: Thu, 20 May 2021 
08:49:04 +0100 Subject: [PATCH 17/23] Update r/R/util.R Co-authored-by: Neal Richardson --- r/R/util.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index a301d07317d..c99feba8739 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -149,9 +149,9 @@ attr(is_writable_table, "fail") <- function(call, env){ #' @keywords internal repeat_value_as_array <- function(object, n) { if (inherits(object, "ChunkedArray")) { - return(MakeArrayFromScalar(Scalar$create(object$chunks[[1]]), n)) + return(Scalar$create(object$chunks[[1]])$as_array(n)) } - return(MakeArrayFromScalar(Scalar$create(object), n)) + return(Scalar$create(object)$as_array(n)) } #' Recycle scalar values in a list of arrays @@ -170,4 +170,4 @@ recycle_scalars <- function(arrays){ arrays[arr_lens == 1 & !is_df] <- lapply(arrays[arr_lens == 1 & !is_df], repeat_value_as_array, max_array_len) } arrays -} \ No newline at end of file +} From e7ff7d817ef9b1bed719921f2fc1c00703c915e9 Mon Sep 17 00:00:00 2001 From: Nic Date: Thu, 20 May 2021 08:55:07 +0100 Subject: [PATCH 18/23] Update r/R/util.R Co-authored-by: Neal Richardson --- r/R/util.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index c99feba8739..67926b279dc 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -161,9 +161,7 @@ repeat_value_as_array <- function(object, n) { #' @keywords internal recycle_scalars <- function(arrays){ # Get lengths of items in arrays - is_df <- map_lgl(arrays, ~inherits(.x, "data.frame")) - arr_lens <- lengths(arrays) - arr_lens[is_df] <- map_int(arrays[is_df], nrow) + arr_lens <- map_int(arrays, NROW) if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { max_array_len <- max(arr_lens) From e3df47ed591d437bcf59bace6e71905ff8f2e2da Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 20 May 2021 13:02:28 +0100 Subject: [PATCH 19/23] Remove unnecessary check to see if tibble --- r/R/util.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/util.R b/r/R/util.R index 67926b279dc..fd48e4e6c9b 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -165,7 +165,7 @@ recycle_scalars <- function(arrays){ if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { max_array_len <- max(arr_lens) - arrays[arr_lens == 1 & !is_df] <- lapply(arrays[arr_lens == 1 & !is_df], repeat_value_as_array, max_array_len) + arrays[arr_lens == 1] <- lapply(arrays[arr_lens == 1], repeat_value_as_array, max_array_len) } arrays } From 95463e320320e4d7fba035ed8c9a7be5d8a2d1ce Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 8 Jun 2021 15:34:36 +0100 Subject: [PATCH 20/23] Reorder so fewer calls to Table__from_dots (again) --- r/R/table.R | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/r/R/table.R b/r/R/table.R index 54614719562..b2dd3ded7c1 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -166,21 +166,23 @@ Table$create <- function(..., schema = NULL) { names(dots) <- rep_len("", length(dots)) } stopifnot(length(dots) > 0) - - # Preserve any grouping - if (length(dots) == 1 && inherits(dots[[1]], "grouped_df")) { - out <- Table__from_dots(dots, schema, option_use_threads()) - return(dplyr::group_by(out, !!!dplyr::groups(dots[[1]]))) + + if (all_record_batches(dots)) { + return(Table__from_record_batches(dots, schema)) } # If any arrays are length 1, recycle them dots <- recycle_scalars(dots) - if (all_record_batches(dots)) { - Table__from_record_batches(dots, schema) - } else { - Table__from_dots(dots, schema, option_use_threads()) + out 
<- Table__from_dots(dots, schema, option_use_threads()) + + # Preserve any grouping + if (length(dots) == 1 && inherits(dots[[1]], "grouped_df")) { + out <- dplyr::group_by(out, !!!dplyr::groups(dots[[1]])) } + + out + } From 8b6de2c142976fcc6272b54651b7fe8ee3b69b43 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 8 Jun 2021 16:15:52 +0100 Subject: [PATCH 21/23] Give error message with tibbles --- r/R/util.R | 9 +++++++++ r/tests/testthat/test-RecordBatch.R | 4 ++-- r/tests/testthat/test-Table.R | 4 ++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index fd48e4e6c9b..26b45bbe297 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -164,6 +164,15 @@ recycle_scalars <- function(arrays){ arr_lens <- map_int(arrays, NROW) if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { + + # Recycling not supported for tibbles and data.frames + if(all(map_lgl(arrays, ~inherits(.x, "data.frame")))){ + abort(c( + "All input tibbles or data.frames must have the same number of rows", + x = paste("Number of rows in inputs:",oxford_paste(map_int(arrays, ~nrow(.x)))) + )) + } + max_array_len <- max(arr_lens) arrays[arr_lens == 1] <- lapply(arrays[arr_lens == 1], repeat_value_as_array, max_array_len) } diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 3f5d4cc51a7..6617805db54 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -446,7 +446,7 @@ test_that("record_batch() no recycling with tibbles", { tibble::tibble(a = 1:10), tibble::tibble(a = 1, b = 5) ), - regexp = "All arrays must have the same length" + regexp = "All input tibbles or data.frames must have the same number of rows" ) expect_error( @@ -454,7 +454,7 @@ test_that("record_batch() no recycling with tibbles", { tibble::tibble(a = 1:10), tibble::tibble(a = 1) ), - regexp = "All arrays must have the same length" + regexp = "All input tibbles or data.frames must have the same number of rows" ) }) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 1ce57be9ea6..6dd36b248ec 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -505,7 +505,7 @@ test_that("Table$create() no recycling with tibbles", { tibble::tibble(a = 1:10, b = 5), tibble::tibble(a = 1, b = 5) ), - regexp = "All columns must have the same length" + regexp = "All input tibbles or data.frames must have the same number of rows" ) expect_error( @@ -513,7 +513,7 @@ test_that("Table$create() no recycling with tibbles", { tibble::tibble(a = 1:10, b = 5), tibble::tibble(a = 1) ), - regexp = "All columns must have the same length" + regexp = "All input tibbles or data.frames must have the same number of rows" ) }) From fff137720f31080b94582614fbd03566128b3753 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 8 Jun 2021 16:38:34 +0100 Subject: [PATCH 22/23] Remove unnecessary whitespace --- r/R/table.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/r/R/table.R b/r/R/table.R index b2dd3ded7c1..3e5c52d9624 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -180,12 +180,8 @@ Table$create <- function(..., schema = NULL) { if (length(dots) == 1 && inherits(dots[[1]], "grouped_df")) { out <- dplyr::group_by(out, !!!dplyr::groups(dots[[1]])) } - out - } - - #' @export names.Table <- function(x) x$ColumnNames() From d044bfb79e4082371aec135ad331747fff2b4bc6 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 9 Jun 2021 08:34:53 +0100 Subject: [PATCH 23/23] Add spacing, shorten error message, rearrange functions, and 
turn repeated expression into variable --- r/R/util.R | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index 26b45bbe297..884c346e503 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -139,21 +139,6 @@ attr(is_writable_table, "fail") <- function(call, env){ ) } -#' Take an object of length 1 and repeat it. -#' -#' @param object Object of length 1 to be repeated - vector, `Scalar`, `Array`, or `ChunkedArray` -#' @param n Number of repetitions -#' -#' @return `Array` of length `n` -#' -#' @keywords internal -repeat_value_as_array <- function(object, n) { - if (inherits(object, "ChunkedArray")) { - return(Scalar$create(object$chunks[[1]])$as_array(n)) - } - return(Scalar$create(object)$as_array(n)) -} - #' Recycle scalar values in a list of arrays #' #' @param arrays List of arrays @@ -163,18 +148,39 @@ recycle_scalars <- function(arrays){ # Get lengths of items in arrays arr_lens <- map_int(arrays, NROW) - if (length(arrays) > 1 && any(arr_lens == 1) && !all(arr_lens==1)) { + is_scalar <- arr_lens == 1 + + if (length(arrays) > 1 && any(is_scalar) && !all(is_scalar)) { # Recycling not supported for tibbles and data.frames - if(all(map_lgl(arrays, ~inherits(.x, "data.frame")))){ + if (all(map_lgl(arrays, ~inherits(.x, "data.frame")))) { + abort(c( "All input tibbles or data.frames must have the same number of rows", - x = paste("Number of rows in inputs:",oxford_paste(map_int(arrays, ~nrow(.x)))) + x = paste( + "Number of rows in longest and shortest inputs:", + oxford_paste(c(max(arr_lens), min(arr_lens))) + ) )) } max_array_len <- max(arr_lens) - arrays[arr_lens == 1] <- lapply(arrays[arr_lens == 1], repeat_value_as_array, max_array_len) + arrays[is_scalar] <- lapply(arrays[is_scalar], repeat_value_as_array, max_array_len) } arrays } + +#' Take an object of length 1 and repeat it. +#' +#' @param object Object of length 1 to be repeated - vector, `Scalar`, `Array`, or `ChunkedArray` +#' @param n Number of repetitions +#' +#' @return `Array` of length `n` +#' +#' @keywords internal +repeat_value_as_array <- function(object, n) { + if (inherits(object, "ChunkedArray")) { + return(Scalar$create(object$chunks[[1]])$as_array(n)) + } + return(Scalar$create(object)$as_array(n)) +} \ No newline at end of file
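
For reference, a minimal usage sketch of the scalar-recycling behaviour this patch series adds, assuming the patched arrow R package is loaded; the expected results in the comments mirror the test expectations in the patches above rather than verified output.

# Length-1 R vectors are recycled to the length of the longest column.
library(arrow)
library(tibble)

batch <- record_batch(a = 1:10, b = 5)
as.data.frame(batch)   # equivalent to tibble(a = 1:10, b = 5)

# Length-1 Scalars, Arrays, and ChunkedArrays are recycled the same way.
tab <- Table$create(a = Array$create(1:10), b = Scalar$create(5))
tab$num_rows           # 10

# Recycling is deliberately not applied across whole tibbles/data.frames;
# per the tests above this errors with
# "All input tibbles or data.frames must have the same number of rows".
try(Table$create(tibble(a = 1:10, b = 5), tibble(a = 1)))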