From 247775992224e32f1b3ef7128d51992a578b48eb Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 31 Mar 2023 14:19:27 +0100 Subject: [PATCH 01/22] Only preserve metadata if input df is 1 item long --- r/src/table.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/table.cpp b/r/src/table.cpp index 498141cc2f2..5691f3c1a15 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -228,7 +228,7 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, // "top level" attributes, only relevant if the first object is not named and a data // frame cpp11::strings names = Rf_getAttrib(lst, R_NamesSymbol); - if (names[0] == "" && Rf_inherits(VECTOR_ELT(lst, 0), "data.frame")) { + if (names[0] == "" && Rf_inherits(VECTOR_ELT(lst, 0), "data.frame") && Rf_xlength(lst) == 1) { SEXP top_level = metadata[0] = arrow_attributes(VECTOR_ELT(lst, 0), true); if (!Rf_isNull(top_level) && XLENGTH(top_level) > 0) { has_top_level_metadata = true; From 16d67643d97f0d4efd69bb24ce3aa0aa73ae287d Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 31 Mar 2023 14:35:49 +0100 Subject: [PATCH 02/22] Run linter --- r/src/table.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/src/table.cpp b/r/src/table.cpp index 5691f3c1a15..04537000f5d 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -228,7 +228,8 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, // "top level" attributes, only relevant if the first object is not named and a data // frame cpp11::strings names = Rf_getAttrib(lst, R_NamesSymbol); - if (names[0] == "" && Rf_inherits(VECTOR_ELT(lst, 0), "data.frame") && Rf_xlength(lst) == 1) { + if (names[0] == "" && Rf_inherits(VECTOR_ELT(lst, 0), "data.frame") && + Rf_xlength(lst) == 1) { SEXP top_level = metadata[0] = arrow_attributes(VECTOR_ELT(lst, 0), true); if (!Rf_isNull(top_level) && XLENGTH(top_level) > 0) { has_top_level_metadata = true; From 475eecb7e0a9383ef2155b1752f872f8ba84c678 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 3 Apr 2023 14:53:44 +0100 Subject: [PATCH 03/22] Make as.data.frame only return data.frames --- r/R/arrow-tabular.R | 3 ++- r/tests/testthat/test-Table.R | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index ae68cc2118f..058924ba0bd 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -94,7 +94,8 @@ ArrowTabular <- R6Class("ArrowTabular", #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { df <- x$to_data_frame() - apply_arrow_r_metadata(df, x$metadata$r) + out <- apply_arrow_r_metadata(df, x$metadata$r) + as.data.frame(out) } #' @export diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 817b645fad9..ef5e23ba66a 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -711,3 +711,15 @@ test_that("as_arrow_table() errors on data.frame with NULL names", { names(df) <- NULL expect_error(as_arrow_table(df), "Input data frame columns must be named") }) + +test_that("as.data.frame() on an ArrowTabular object returns a vanilla data.frame and not a tibble", { + df <- data.frame(x = 1) + out1 <- as.data.frame(arrow::arrow_table(df, name = "1")) + out2 <- as.data.frame(arrow::arrow_table(name = "1", df)) + out3 <- as.data.frame(arrow::arrow_table(df)) + + expect_s3_class(out1, "data.frame", exact = TRUE) + expect_s3_class(out2, "data.frame", exact = TRUE) + expect_s3_class(out3, "data.frame", exact = TRUE) + +}) From 5accf1def286a65f10d8cb66826cbbbb34099926 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 3 Apr 2023 14:54:42 +0100 Subject: [PATCH 04/22] Lint away... --- r/tests/testthat/test-Table.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index ef5e23ba66a..3bd42399d76 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -721,5 +721,4 @@ test_that("as.data.frame() on an ArrowTabular object returns a vanilla data.fram expect_s3_class(out1, "data.frame", exact = TRUE) expect_s3_class(out2, "data.frame", exact = TRUE) expect_s3_class(out3, "data.frame", exact = TRUE) - }) From 4571af2dd7d9c377b92d366e666509a27733a19a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 06:45:41 +0100 Subject: [PATCH 05/22] Add tibble as a dependendency --- r/DESCRIPTION | 2 +- r/R/arrow-package.R | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index c1114571ebd..f6873935884 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -37,6 +37,7 @@ Imports: R6, rlang (>= 1.0.0), stats, + tibble, tidyselect (>= 1.0.0), utils, vctrs @@ -63,7 +64,6 @@ Suggests: stringr, sys, testthat (>= 3.1.0), - tibble, tzdb, withr LinkingTo: cpp11 (>= 0.4.2) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index a3c860a51c8..d31a078405e 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -27,6 +27,7 @@ #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args #' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure #' @importFrom rlang new_quosures expr_text caller_env check_dots_empty dots_list is_string inform +#' @importFrom tibble as_tibble #' @importFrom tidyselect vars_pull eval_select eval_rename #' @importFrom glue glue #' @useDynLib arrow, .registration = TRUE From 11a3a2395b267690d50aa8e51fea4bff92742cf5 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 06:46:42 +0100 Subject: [PATCH 06/22] Explicitly return data from our reading functions as tibbles --- r/R/csv.R | 2 +- r/R/dplyr-collect.R | 2 +- r/R/feather.R | 2 +- r/R/ipc-stream.R | 2 +- r/R/json.R | 2 +- r/R/parquet.R | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index 82243238662..7885239ad6a 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -248,7 +248,7 @@ read_delim_arrow <- function(file, } if (isTRUE(as_data_frame)) { - tab <- as.data.frame(tab) + tab <- as_tibble(tab) } tab diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 9205a31b14f..ca045f40784 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -24,7 +24,7 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { } collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) { if (as_data_frame) { - as.data.frame(x, ...) + as_tibble(x, ...) } else { x } diff --git a/r/R/feather.R b/r/R/feather.R index 1488db29eb7..67b7934542e 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -196,7 +196,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T ) if (isTRUE(as_data_frame)) { - out <- as.data.frame(out) + out <- as_tibble(out) } out } diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index f0b4a6aae0e..c424a0a6e2b 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -106,7 +106,7 @@ read_ipc_stream <- function(file, as_data_frame = TRUE, ...) { # https://issues.apache.org/jira/browse/ARROW-6830 out <- RecordBatchStreamReader$create(file)$read_table() if (as_data_frame) { - out <- as.data.frame(out) + out <- as_tibble(out) } out } diff --git a/r/R/json.R b/r/R/json.R index cdbe850b32f..868643a815d 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -84,7 +84,7 @@ read_json_arrow <- function(file, } if (isTRUE(as_data_frame)) { - tab <- as.data.frame(tab) + tab <- as_tibble(tab) } tab } diff --git a/r/R/parquet.R b/r/R/parquet.R index f3d384e8c25..93904fa16ff 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -70,7 +70,7 @@ read_parquet <- function(file, } if (as_data_frame) { - tab <- as.data.frame(tab) + tab <- as_tibble(tab) } tab } From a318daea1b2a937c63bd423d315ed0303d614c5c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 06:51:02 +0100 Subject: [PATCH 07/22] Update tests to expect tibbles and not data.frames --- r/tests/testthat/helper-expectation.R | 4 +- r/tests/testthat/test-RecordBatch.R | 74 +++++++++++++------------- r/tests/testthat/test-Table.R | 76 +++++++++++++-------------- 3 files changed, 77 insertions(+), 77 deletions(-) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 303a96ead7d..be9cad1bb59 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -19,8 +19,8 @@ expect_as_vector <- function(x, y, ...) { expect_equal(as.vector(x), y, ...) } -expect_data_frame <- function(x, y, ...) { - expect_equal(as.data.frame(x), y, ...) +expect_tibble <- function(x, y, ...) { + expect_equal(as_tibble(x), y, ...) } expect_r6_class <- function(object, class) { diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 7e7084542d3..72ffd4b4596 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -89,7 +89,7 @@ test_that("RecordBatch", { schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8())) ) expect_equal(batch2$column(0), batch$column(1)) - expect_data_frame(batch2, tbl[, -1]) + expect_tibble(batch2, tbl[, -1]) # input validation expect_error(batch$RemoveColumn(NA), "'i' cannot be NA") @@ -109,10 +109,10 @@ test_that("RecordBatch S3 methods", { test_that("RecordBatch$Slice", { batch3 <- batch$Slice(5) - expect_data_frame(batch3, tbl[6:10, ]) + expect_tibble(batch3, tbl[6:10, ]) batch4 <- batch$Slice(5, 2) - expect_data_frame(batch4, tbl[6:7, ]) + expect_tibble(batch4, tbl[6:7, ]) # Input validation expect_error(batch$Slice("ten")) @@ -131,20 +131,20 @@ test_that("RecordBatch$Slice", { }) test_that("[ on RecordBatch", { - expect_data_frame(batch[6:7, ], tbl[6:7, ]) - expect_data_frame(batch[c(6, 7), ], tbl[6:7, ]) - expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4]) - expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_tibble(batch[6:7, ], tbl[6:7, ]) + expect_tibble(batch[c(6, 7), ], tbl[6:7, ]) + expect_tibble(batch[6:7, 2:4], tbl[6:7, 2:4]) + expect_tibble(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr) - expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) - expect_data_frame( + expect_tibble(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_tibble( batch[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ] ) # bool Array - expect_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ]) + expect_tibble(batch[batch$lgl, ], tbl[tbl$lgl, ]) # int Array - expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + expect_tibble(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) # input validation expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"') @@ -176,15 +176,15 @@ test_that("[[<- assignment", { # can remove a column batch[["chr"]] <- NULL - expect_data_frame(batch, tbl[-4]) + expect_tibble(batch, tbl[-4]) # can remove a column by index batch[[4]] <- NULL - expect_data_frame(batch, tbl[1:3]) + expect_tibble(batch, tbl[1:3]) # can add a named column batch[["new"]] <- letters[10:1] - expect_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) + expect_tibble(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) # can replace a column by index batch[[2]] <- as.numeric(10:1) @@ -239,16 +239,16 @@ test_that("head and tail on RecordBatch", { fct = factor(letters[1:10]) ) batch <- RecordBatch$create(tbl) - expect_data_frame(head(batch), head(tbl)) - expect_data_frame(head(batch, 4), head(tbl, 4)) - expect_data_frame(head(batch, 40), head(tbl, 40)) - expect_data_frame(head(batch, -4), head(tbl, -4)) - expect_data_frame(head(batch, -40), head(tbl, -40)) - expect_data_frame(tail(batch), tail(tbl)) - expect_data_frame(tail(batch, 4), tail(tbl, 4)) - expect_data_frame(tail(batch, 40), tail(tbl, 40)) - expect_data_frame(tail(batch, -4), tail(tbl, -4)) - expect_data_frame(tail(batch, -40), tail(tbl, -40)) + expect_tibble(head(batch), head(tbl)) + expect_tibble(head(batch, 4), head(tbl, 4)) + expect_tibble(head(batch, 40), head(tbl, 40)) + expect_tibble(head(batch, -4), head(tbl, -4)) + expect_tibble(head(batch, -40), head(tbl, -40)) + expect_tibble(tail(batch), tail(tbl)) + expect_tibble(tail(batch, 4), tail(tbl, 4)) + expect_tibble(tail(batch, 40), tail(tbl, 40)) + expect_tibble(tail(batch, -4), tail(tbl, -4)) + expect_tibble(tail(batch, -40), tail(tbl, -40)) }) test_that("RecordBatch print method", { @@ -346,7 +346,7 @@ test_that("record_batch() handles data frame columns", { b = struct(x = int32(), y = int32()) ) ) - out <- as.data.frame(batch) + out <- as_tibble(batch) expect_equal(out, tibble::tibble(a = 1:10, b = tib)) # if not named, columns from tib are auto spliced @@ -355,7 +355,7 @@ test_that("record_batch() handles data frame columns", { batch2$schema, schema(a = int32(), x = int32(), y = int32()) ) - out <- as.data.frame(batch2) + out <- as_tibble(batch2) expect_equal(out, tibble::tibble(a = 1:10, !!!tib)) }) @@ -366,7 +366,7 @@ test_that("record_batch() handles data frame columns with schema spec", { schema <- schema(a = int32(), b = struct(x = int16(), y = float64())) batch <- record_batch(a = 1:10, b = tib, schema = schema) expect_equal(batch$schema, schema) - out <- as.data.frame(batch) + out <- as_tibble(batch) expect_equal(out, tibble::tibble(a = 1:10, b = tib_float)) schema <- schema(a = int32(), b = struct(x = int16(), y = utf8())) @@ -379,14 +379,14 @@ test_that("record_batch() auto splices (ARROW-5718)", { batch2 <- record_batch(!!!df) expect_equal(batch1, batch2) expect_equal(batch1$schema, schema(x = int32(), y = utf8())) - expect_data_frame(batch1, df) + expect_tibble(batch1, df) batch3 <- record_batch(df, z = 1:10) batch4 <- record_batch(!!!df, z = 1:10) expect_equal(batch3, batch4) expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32())) expect_equal( - as.data.frame(batch3), + as_tibble(batch3), tibble::as_tibble(cbind(df, data.frame(z = 1:10))) ) @@ -395,7 +395,7 @@ test_that("record_batch() auto splices (ARROW-5718)", { batch6 <- record_batch(!!!df, schema = s) expect_equal(batch5, batch6) expect_equal(batch5$schema, s) - expect_equal(as.data.frame(batch5), df) + expect_equal(as_tibble(batch5), df) s2 <- schema(x = float64(), y = utf8(), z = int16()) batch7 <- record_batch(df, z = 1:10, schema = s2) @@ -403,7 +403,7 @@ test_that("record_batch() auto splices (ARROW-5718)", { expect_equal(batch7, batch8) expect_equal(batch7$schema, s2) expect_equal( - as.data.frame(batch7), + as_tibble(batch7), tibble::as_tibble(cbind(df, data.frame(z = 1:10))) ) }) @@ -425,24 +425,24 @@ test_that("record_batch() handles null type (ARROW-7064)", { }) test_that("record_batch() scalar recycling with vectors", { - expect_data_frame( + expect_tibble( record_batch(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) ) }) test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArrays", { - expect_data_frame( + expect_tibble( record_batch(a = Array$create(1:10), b = Scalar$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_data_frame( + expect_tibble( record_batch(a = Array$create(1:10), b = Array$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_data_frame( + expect_tibble( record_batch(a = Array$create(1:10), b = ChunkedArray$create(5)), tibble::tibble(a = 1:10, b = 5) ) @@ -627,7 +627,7 @@ test_that("Handling string data with embedded nuls", { # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately # on `as.vector()` where as with it, the error only happens on materialization) skip_on_r_older_than("3.6") - df <- as.data.frame(batch_with_nul) + df <- as_tibble(batch_with_nul) expect_error( df$b[], @@ -648,7 +648,7 @@ test_that("Handling string data with embedded nuls", { suppressWarnings( expect_warning( expect_equal( - as.data.frame(batch_with_nul)$b, + as_tibble(batch_with_nul)$b, c("person", "woman", "man", "camera", "tv"), ignore_attr = TRUE ), diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 3bd42399d76..605903d84ca 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -68,26 +68,26 @@ tab <- Table$create(tbl) test_that("[, [[, $ for Table", { expect_identical(names(tab), names(tbl)) - expect_data_frame(tab[6:7, ], tbl[6:7, ]) - expect_data_frame(tab[6:7, 2:4], tbl[6:7, 2:4]) - expect_data_frame(tab[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_tibble(tab[6:7, ], tbl[6:7, ]) + expect_tibble(tab[6:7, 2:4], tbl[6:7, 2:4]) + expect_tibble(tab[, c("dbl", "fct")], tbl[, c(2, 5)]) expect_as_vector(tab[, "chr", drop = TRUE], tbl$chr) # Take within a single chunk - expect_data_frame(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) - expect_data_frame(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ]) + expect_tibble(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_tibble(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ]) # bool ChunkedArray (with one chunk) - expect_data_frame(tab[tab$lgl, ], tbl[tbl$lgl, ]) + expect_tibble(tab[tab$lgl, ], tbl[tbl$lgl, ]) # ChunkedArray with multiple chunks c1 <- c(TRUE, FALSE, TRUE, TRUE, FALSE) c2 <- c(FALSE, FALSE, TRUE, TRUE, FALSE) ca <- ChunkedArray$create(c1, c2) - expect_data_frame(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ]) + expect_tibble(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ]) # int Array - expect_data_frame(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + expect_tibble(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4]) # ChunkedArray - expect_data_frame(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4]) + expect_tibble(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4]) # Expression - expect_data_frame(tab[tab$int > 6, ], tbl[tbl$int > 6, ]) + expect_tibble(tab[tab$int > 6, ], tbl[tbl$int > 6, ]) expect_as_vector(tab[["int"]], tbl$int) expect_as_vector(tab$int, tbl$int) @@ -95,9 +95,9 @@ test_that("[, [[, $ for Table", { expect_null(tab$qwerty) expect_null(tab[["asdf"]]) # List-like column slicing - expect_data_frame(tab[2:4], tbl[2:4]) - expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)]) - expect_data_frame(tab[-3], tbl[-3]) + expect_tibble(tab[2:4], tbl[2:4]) + expect_tibble(tab[c(2, 1)], tbl[c(2, 1)]) + expect_tibble(tab[-3], tbl[-3]) expect_error(tab[[c(4, 3)]]) expect_error(tab[[NA]], "'i' must be character or numeric, not logical") @@ -112,21 +112,21 @@ test_that("[, [[, $ for Table", { expect_error(tab[, c(6, NA)], "Column indices cannot be NA") skip("Table with 0 cols doesn't know how many rows it should have") - expect_data_frame(tab[0], tbl[0]) + expect_tibble(tab[0], tbl[0]) }) test_that("[[<- assignment", { # can remove a column tab[["chr"]] <- NULL - expect_data_frame(tab, tbl[-4]) + expect_tibble(tab, tbl[-4]) # can remove a column by index tab[[4]] <- NULL - expect_data_frame(tab, tbl[1:3]) + expect_tibble(tab, tbl[1:3]) # can add a named column tab[["new"]] <- letters[10:1] - expect_data_frame(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) + expect_tibble(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) # can replace a column by index tab[[2]] <- as.numeric(10:1) @@ -177,10 +177,10 @@ test_that("[[<- assignment", { test_that("Table$Slice", { tab2 <- tab$Slice(5) - expect_data_frame(tab2, tbl[6:10, ]) + expect_tibble(tab2, tbl[6:10, ]) tab3 <- tab$Slice(5, 2) - expect_data_frame(tab3, tbl[6:7, ]) + expect_tibble(tab3, tbl[6:7, ]) # Input validation expect_error(tab$Slice("ten")) @@ -199,16 +199,16 @@ test_that("Table$Slice", { }) test_that("head and tail on Table", { - expect_data_frame(head(tab), head(tbl)) - expect_data_frame(head(tab, 4), head(tbl, 4)) - expect_data_frame(head(tab, 40), head(tbl, 40)) - expect_data_frame(head(tab, -4), head(tbl, -4)) - expect_data_frame(head(tab, -40), head(tbl, -40)) - expect_data_frame(tail(tab), tail(tbl)) - expect_data_frame(tail(tab, 4), tail(tbl, 4)) - expect_data_frame(tail(tab, 40), tail(tbl, 40)) - expect_data_frame(tail(tab, -4), tail(tbl, -4)) - expect_data_frame(tail(tab, -40), tail(tbl, -40)) + expect_tibble(head(tab), head(tbl)) + expect_tibble(head(tab, 4), head(tbl, 4)) + expect_tibble(head(tab, 40), head(tbl, 40)) + expect_tibble(head(tab, -4), head(tbl, -4)) + expect_tibble(head(tab, -40), head(tbl, -40)) + expect_tibble(tail(tab), tail(tbl)) + expect_tibble(tail(tab, 4), tail(tbl, 4)) + expect_tibble(tail(tab, 40), tail(tbl, 40)) + expect_tibble(tail(tab, -4), tail(tbl, -4)) + expect_tibble(tail(tab, -40), tail(tbl, -40)) }) test_that("Table print method", { @@ -265,7 +265,7 @@ test_that("table() handles ... of arrays, chunked arrays, vectors", { tab$schema, schema(a = int32(), b = int32(), c = float64(), x = int32(), y = utf8()) ) - res <- as.data.frame(tab) + res <- as_tibble(tab) expect_equal(names(res), c("a", "b", "c", "x", "y")) expect_equal( res, @@ -280,14 +280,14 @@ test_that("table() auto splices (ARROW-5718)", { tab2 <- Table$create(!!!df) expect_equal(tab1, tab2) expect_equal(tab1$schema, schema(x = int32(), y = utf8())) - expect_equal(as.data.frame(tab1), df) + expect_equal(as_tibble(tab1), df) s <- schema(x = float64(), y = utf8()) tab3 <- Table$create(df, schema = s) tab4 <- Table$create(!!!df, schema = s) expect_equal(tab3, tab4) expect_equal(tab3$schema, s) - expect_equal(as.data.frame(tab3), df) + expect_equal(as_tibble(tab3), df) }) test_that("Validation when creating table with schema (ARROW-10953)", { @@ -366,7 +366,7 @@ test_that("Can create table with specific dictionary types", { expect_equal(sch, tab$schema) if (i != int64()) { # TODO: same downcast to int32 as we do for int64() type elsewhere - expect_identical(as.data.frame(tab), fact) + expect_identical(as_tibble(tab), fact) } } }) @@ -380,7 +380,7 @@ test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { res <- tibble::tibble(f = factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))) tab <- Table$create(b1, b2, b3, b4) - expect_identical(as.data.frame(tab), res) + expect_identical(as_tibble(tab), res) }) test_that("Table$SelectColumns()", { @@ -410,24 +410,24 @@ test_that("Table$create() with different length columns", { }) test_that("Table$create() scalar recycling with vectors", { - expect_data_frame( + expect_tibble( Table$create(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) ) }) test_that("Table$create() scalar recycling with Scalars, Arrays, and ChunkedArrays", { - expect_data_frame( + expect_tibble( Table$create(a = Array$create(1:10), b = Scalar$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_data_frame( + expect_tibble( Table$create(a = Array$create(1:10), b = Array$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_data_frame( + expect_tibble( Table$create(a = Array$create(1:10), b = ChunkedArray$create(5)), tibble::tibble(a = 1:10, b = 5) ) From 467f9b42632bc266c1ac4c856930377a4048c951 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 07:05:42 +0100 Subject: [PATCH 08/22] Map a few more as.data.frames to the correct thing --- r/R/dataset.R | 3 ++- r/R/dplyr-glimpse.R | 2 +- r/R/dplyr.R | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/r/R/dataset.R b/r/R/dataset.R index 30d3ed5ae15..3ab440ec812 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -524,7 +524,8 @@ c.Dataset <- function(...) Dataset$create(list(...)) #' @export as.data.frame.Dataset <- function(x, row.names = NULL, optional = FALSE, ...) { - collect.Dataset(x) + out <- collect.Dataset(x) + as.data.frame(out) } #' @export diff --git a/r/R/dplyr-glimpse.R b/r/R/dplyr-glimpse.R index 8a70f4c5b7b..727a313b3e6 100644 --- a/r/R/dplyr-glimpse.R +++ b/r/R/dplyr-glimpse.R @@ -71,7 +71,7 @@ glimpse.ArrowTabular <- function(x, var_headings <- paste("$", center_pad(tickify(names(x)), var_types)) # Assemble the data glimpse - df <- as.data.frame(head_tab) + df <- as_tibble(head_tab) formatted_data <- map_chr(df, function(.) { tryCatch( paste(pillar::format_glimpse(.), collapse = ", "), diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 72e74809689..042bececc12 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -216,7 +216,8 @@ unique.RecordBatchReader <- unique.arrow_dplyr_query #' @export as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) { - collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...) + out <- collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...) + as.data.frame(out) } #' @export From 16ba00e1f857d766f2cda6400670dd4d4a420676 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 07:45:30 +0100 Subject: [PATCH 09/22] Add as_tibble.ArrowTabular --- r/R/arrow-tabular.R | 8 ++++++-- r/R/dplyr-collect.R | 1 + r/tests/testthat/test-Table.R | 10 ++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index 058924ba0bd..7e991cc1c71 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -93,9 +93,13 @@ ArrowTabular <- R6Class("ArrowTabular", #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { + as.data.frame(as_tibble(x)) +} + +#' @export +as_tibble.ArrowTabular <- function(x, ...) { df <- x$to_data_frame() - out <- apply_arrow_r_metadata(df, x$metadata$r) - as.data.frame(out) + apply_arrow_r_metadata(df, x$metadata$r) } #' @export diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index ca045f40784..683f05fa62f 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -22,6 +22,7 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { out <- compute.arrow_dplyr_query(x) collect.ArrowTabular(out, as_data_frame) } + collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) { if (as_data_frame) { as_tibble(x, ...) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 605903d84ca..6d88793946d 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -722,3 +722,13 @@ test_that("as.data.frame() on an ArrowTabular object returns a vanilla data.fram expect_s3_class(out2, "data.frame", exact = TRUE) expect_s3_class(out3, "data.frame", exact = TRUE) }) + +test_that("as_tibble.ArrowTabular retains groups", { + # calling as_tibble.default on ArrowTabular objects results in any grouping being dropped, which is why + # we need as_tibble.ArrowTabular + df <- data.frame(x = 1:4, y = c("a", "b")) + df_grouped <- dplyr::group_by(df, y) + arrow_grouped <- arrow_table(df_grouped) + expect_tibble(arrow_grouped, df_grouped) + +}) From e7d48385777f0e5449c4161e4cc5b2d969fb6d9d Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 07:46:19 +0100 Subject: [PATCH 10/22] Update NAMESPACE --- r/NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index 70b8fc3fc85..d55a22f001c 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -89,6 +89,7 @@ S3method(as_record_batch_reader,pyarrow.lib.Table) S3method(as_schema,Schema) S3method(as_schema,StructType) S3method(as_schema,pyarrow.lib.Schema) +S3method(as_tibble,ArrowTabular) S3method(c,Array) S3method(c,ChunkedArray) S3method(c,Dataset) @@ -489,6 +490,7 @@ importFrom(stats,na.omit) importFrom(stats,na.pass) importFrom(stats,quantile) importFrom(stats,runif) +importFrom(tibble,as_tibble) importFrom(tidyselect,all_of) importFrom(tidyselect,contains) importFrom(tidyselect,ends_with) From ebb089ee8d49b6e29ccef33e79afd04b779820e7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 07:54:37 +0100 Subject: [PATCH 11/22] Fix more tests --- r/tests/testthat/test-compute-sort.R | 4 ++-- r/tests/testthat/test-dataset-csv.R | 2 +- r/tests/testthat/test-dataset.R | 31 ++++++++++++++-------------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/r/tests/testthat/test-compute-sort.R b/r/tests/testthat/test-compute-sort.R index f521efeddc5..0b2f6c2e6da 100644 --- a/r/tests/testthat/test-compute-sort.R +++ b/r/tests/testthat/test-compute-sort.R @@ -141,7 +141,7 @@ test_that("Table$SortIndices()", { sort(tbl$chr, na.last = TRUE) ) expect_identical( - as.data.frame(x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE)))), + as_tibble(x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE)))), tbl %>% arrange(int, dbl) ) }) @@ -149,7 +149,7 @@ test_that("Table$SortIndices()", { test_that("RecordBatch$SortIndices()", { x <- record_batch(tbl) expect_identical( - as.data.frame(x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE))), + as_tibble(x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE))), tbl %>% arrange(desc(chr), desc(int), desc(dbl)) ) }) diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index df58f853a1e..a8cd5bfe33b 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -91,7 +91,7 @@ test_that("CSV scan options", { sb$FragmentScanOptions(options) tab <- sb$Finish()$ToTable() - expect_equal(as.data.frame(tab), tibble(chr = c("foo", NA))) + expect_equal(as_tibble(tab), tibble(chr = c("foo", NA))) # Set default convert options in CsvFileFormat csv_format <- CsvFileFormat$create( diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 91b405fc01c..06ecc3db43b 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -86,7 +86,7 @@ expect_scan_result <- function(ds, schm) { expect_r6_class(tab, "Table") expect_equal( - as.data.frame(tab), + as_tibble(tab), df1[8, c("chr", "lgl")] ) } @@ -706,7 +706,7 @@ test_that("streaming map_batches into an ExecPlan", { select(int) %>% map_batches( # as_mapper() can't handle %>%? - ~ mutate(as.data.frame(.), lets = letters[int]) + ~ mutate(as_tibble(.), lets = letters[int]) ) %>% arrange(int) %>% collect(), @@ -806,19 +806,19 @@ test_that("head/tail", { big_df <- rbind(df1, df2) # No n provided (default is 6, all from one batch) - expect_equal(as.data.frame(head(ds)), head(df1)) - expect_equal(as.data.frame(tail(ds)), tail(df2)) + expect_equal(as_tibble(head(ds)), head(df1)) + expect_equal(as_tibble(tail(ds)), tail(df2)) # n = 0: have to drop `fct` because factor levels don't come through from # arrow when there are 0 rows zero_df <- big_df[FALSE, names(big_df) != "fct"] - expect_equal(as.data.frame(head(ds, 0))[, names(ds) != "fct"], zero_df) - expect_equal(as.data.frame(tail(ds, 0))[, names(ds) != "fct"], zero_df) + expect_equal(as_tibble(head(ds, 0))[, names(ds) != "fct"], zero_df) + expect_equal(as_tibble(tail(ds, 0))[, names(ds) != "fct"], zero_df) # Two more cases: more than 1 batch, and more than nrow for (n in c(12, 1000)) { - expect_equal(as.data.frame(head(ds, n)), head(big_df, n)) - expect_equal(as.data.frame(tail(ds, n)), tail(big_df, n)) + expect_equal(as_tibble(head(ds, n)), head(big_df, n)) + expect_equal(as_tibble(tail(ds, n)), tail(big_df, n)) } expect_error(head(ds, -1)) # Not yet implemented expect_error(tail(ds, -1)) # Not yet implemented @@ -865,17 +865,17 @@ test_that("Dataset [ (take by index)", { ds <- open_dataset(dataset_dir) # Taking only from one file expect_equal( - as.data.frame(ds[c(4, 5, 9), 3:4]), + as_tibble(ds[c(4, 5, 9), 3:4]), df1[c(4, 5, 9), 3:4] ) # Taking from more than one expect_equal( - as.data.frame(ds[c(4, 5, 9, 12, 13), 3:4]), + as_tibble(ds[c(4, 5, 9, 12, 13), 3:4]), rbind(df1[c(4, 5, 9), 3:4], df2[2:3, 3:4]) ) # Taking out of order expect_equal( - as.data.frame(ds[c(4, 13, 9, 12, 5), ]), + as_tibble(ds[c(4, 13, 9, 12, 5), ]), rbind( df1[4, ], df2[3, ], @@ -890,7 +890,7 @@ test_that("Dataset [ (take by index)", { filter(int > 6) %>% select(int, lgl) expect_equal( - as.data.frame(ds2[c(2, 5), ]), + as_tibble(ds2[c(2, 5), ]), rbind( df1[8, c("int", "lgl")], df2[1, c("int", "lgl")] @@ -985,11 +985,11 @@ test_that("Scanner$ScanBatches", { ds <- open_dataset(ipc_dir, format = "feather") batches <- ds$NewScan()$Finish()$ScanBatches() table <- Table$create(!!!batches) - expect_equal(as.data.frame(table), rbind(df1, df2)) + expect_equal(as_tibble(table), rbind(df1, df2)) batches <- ds$NewScan()$Finish()$ScanBatches() table <- Table$create(!!!batches) - expect_equal(as.data.frame(table), rbind(df1, df2)) + expect_equal(as_tibble(table), rbind(df1, df2)) }) test_that("Scanner$ToRecordBatchReader()", { @@ -1002,7 +1002,7 @@ test_that("Scanner$ToRecordBatchReader()", { reader <- scan$ToRecordBatchReader() expect_r6_class(reader, "RecordBatchReader") expect_identical( - as.data.frame(reader$read_table()), + as_tibble(reader$read_table()), df1[df1$int > 6, c("int", "lgl")] ) }) @@ -1298,7 +1298,6 @@ test_that("FileSystemFactoryOptions with DirectoryPartitioning", { expect_equal( ds %>% summarize(sum(gear)) %>% - collect() %>% as.data.frame(), mtcars %>% summarize(sum(gear)) From 88a1c55c38fe337f09057c9860b85242d90aa4c4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 08:16:58 +0100 Subject: [PATCH 12/22] Clean up some as.data.frames which should be as_tibble and implement methods for RecordBatchReaders --- r/R/arrow-tabular.R | 2 +- r/R/dplyr.R | 9 +++++++-- r/R/record-batch-reader.R | 7 ++++++- r/tests/testthat/test-dplyr-query.R | 24 ++++++++++++------------ 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index 7e991cc1c71..b6a077343e1 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -93,7 +93,7 @@ ArrowTabular <- R6Class("ArrowTabular", #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { - as.data.frame(as_tibble(x)) + as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) } #' @export diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 042bececc12..096e621d645 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -216,8 +216,13 @@ unique.RecordBatchReader <- unique.arrow_dplyr_query #' @export as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) { - out <- collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...) - as.data.frame(out) + as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) +} + + +#' @export +as_tibble.arrow_dplyr_query <- function(x, ...) { + collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...) } #' @export diff --git a/r/R/record-batch-reader.R b/r/R/record-batch-reader.R index 184a77df36b..7a8b5e5d67d 100644 --- a/r/R/record-batch-reader.R +++ b/r/R/record-batch-reader.R @@ -129,7 +129,12 @@ dim.RecordBatchReader <- function(x) c(NA_integer_, length(x$schema)) #' @export as.data.frame.RecordBatchReader <- function(x, row.names = NULL, optional = FALSE, ...) { - as.data.frame(x$read_table(), row.names = row.names, optional = optional, ...) + as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) +} + +#' @export +as_tibble.RecordBatchReader <- function(x, ...){ + x$read_table() } #' @export diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 0b2b23ec860..d3c08d6856b 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -117,14 +117,14 @@ test_that("collect(as_data_frame=FALSE)", { # collect(as_data_frame = FALSE) always returns Table now expect_r6_class(b2, "Table") expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] - expect_equal(as.data.frame(b2), expected) + expect_equal(as_tibble(b2), expected) b3 <- batch %>% select(int, strng = chr) %>% filter(int > 5) %>% collect(as_data_frame = FALSE) expect_r6_class(b3, "Table") - expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + expect_equal(as_tibble(b3), set_names(expected, c("int", "strng"))) b4 <- batch %>% select(int, strng = chr) %>% @@ -133,7 +133,7 @@ test_that("collect(as_data_frame=FALSE)", { collect(as_data_frame = FALSE) expect_r6_class(b4, "Table") expect_equal( - as.data.frame(b4), + as_tibble(b4), expected %>% rename(strng = chr) %>% group_by(int) @@ -154,14 +154,14 @@ test_that("compute()", { expect_r6_class(b2, "Table") expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] - expect_equal(as.data.frame(b2), expected) + expect_equal(as_tibble(b2), expected) b3 <- batch %>% select(int, strng = chr) %>% filter(int > 5) %>% compute() expect_r6_class(b3, "Table") - expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + expect_equal(as_tibble(b3), set_names(expected, c("int", "strng"))) b4 <- batch %>% select(int, strng = chr) %>% @@ -170,7 +170,7 @@ test_that("compute()", { compute() expect_r6_class(b4, "Table") expect_equal( - as.data.frame(b4), + as_tibble(b4), expected %>% rename(strng = chr) %>% group_by(int) @@ -193,7 +193,7 @@ test_that("head", { filter(int > 5) %>% head(2) expect_s3_class(b3, "arrow_dplyr_query") - expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + expect_equal(as_tibble(b3), set_names(expected, c("int", "strng"))) b4 <- batch %>% select(int, strng = chr) %>% @@ -202,7 +202,7 @@ test_that("head", { head(2) expect_s3_class(b4, "arrow_dplyr_query") expect_equal( - as.data.frame(b4), + as_tibble(b4), expected %>% rename(strng = chr) %>% group_by(int) @@ -268,7 +268,7 @@ test_that("tail", { expect_s3_class(b2, "arrow_dplyr_query") expected <- tail(tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")], 2) - expect_equal(as.data.frame(b2), expected) + expect_equal(as_tibble(b2), expected) b3 <- batch %>% select(int, strng = chr) %>% @@ -276,7 +276,7 @@ test_that("tail", { arrange(int) %>% tail(2) expect_s3_class(b3, "arrow_dplyr_query") - expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + expect_equal(as_tibble(b3), set_names(expected, c("int", "strng"))) b4 <- batch %>% select(int, strng = chr) %>% @@ -286,7 +286,7 @@ test_that("tail", { tail(2) expect_s3_class(b4, "arrow_dplyr_query") expect_equal( - as.data.frame(b4), + as_tibble(b4), expected %>% rename(strng = chr) %>% group_by(int) @@ -598,7 +598,7 @@ test_that("compute() on a grouped query returns a Table with groups in metadata" compute() expect_r6_class(tab1, "Table") expect_equal( - as.data.frame(tab1), + as_tibble(tab1), tbl %>% group_by(int) ) From 52d2be235a209d18cb188c2653851d23b48b00be Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 08:46:07 +0100 Subject: [PATCH 13/22] Fix metadata tests --- r/tests/testthat/test-metadata.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 4cf8e49af1b..d9e855ebe19 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -57,7 +57,7 @@ test_that("Table R metadata", { "$r$columns$c$columns$c1$attributes$extra_attr", fixed = TRUE ) - expect_identical(as.data.frame(tab), example_with_metadata) + expect_identical(as_tibble(tab), example_with_metadata) }) test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", { @@ -94,7 +94,7 @@ test_that("Garbage R metadata doesn't break things", { tab <- Table$create(example_data[1:6]) tab$metadata$r <- "garbage" expect_warning( - expect_identical(as.data.frame(tab), example_data[1:6]), + expect_identical(as_tibble(tab), example_data[1:6]), "Invalid metadata$r", fixed = TRUE ) @@ -103,7 +103,7 @@ test_that("Garbage R metadata doesn't break things", { tab <- Table$create(example_data[1:6]) tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE)) expect_warning( - expect_identical(as.data.frame(tab), example_data[1:6]), + expect_identical(as_tibble(tab), example_data[1:6]), "Invalid metadata$r", fixed = TRUE ) @@ -164,7 +164,7 @@ test_that("RecordBatch metadata", { }) test_that("RecordBatch R metadata", { - expect_identical(as.data.frame(record_batch(example_with_metadata)), example_with_metadata) + expect_identical(as_tibble(record_batch(example_with_metadata)), example_with_metadata) }) test_that("R metadata roundtrip via parquet", { @@ -195,14 +195,14 @@ test_that("haven types roundtrip via feather", { test_that("Date/time type roundtrip", { rb <- record_batch(example_with_times) expect_r6_class(rb$schema$posixlt$type, "VctrsExtensionType") - expect_identical(as.data.frame(rb), example_with_times) + expect_identical(as_tibble(rb), example_with_times) }) test_that("metadata keeps attribute of top level data frame", { df <- structure(data.frame(x = 1, y = 2), foo = "bar") tab <- Table$create(df) - expect_identical(attr(as.data.frame(tab), "foo"), "bar") - expect_identical(as.data.frame(tab), df) + expect_identical(attr(as_tibble(tab), "foo"), "bar") + expect_identical(as_tibble(tab), df) }) @@ -223,7 +223,7 @@ test_that("metadata drops readr's problems attribute", { ) tab <- Table$create(readr_like) - expect_null(attr(as.data.frame(tab), "problems")) + expect_null(attr(as_tibble(tab), "problems")) }) test_that("Row-level metadata (does not by default) roundtrip", { @@ -241,8 +241,8 @@ test_that("Row-level metadata (does not by default) roundtrip", { list("arrow.preserve_row_level_metadata" = TRUE), { tab <- Table$create(df) - expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") - expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") + expect_identical(attr(as_tibble(tab)$x[[1]], "foo"), "bar") + expect_identical(attr(as_tibble(tab)$x[[2]], "baz"), "qux") } ) }) @@ -387,7 +387,7 @@ test_that("grouped_df non-arrow metadata is preserved", { grouped_tab <- arrow_table(grouped) expect_equal( - attributes(as.data.frame(grouped_tab))$other_metadata, + attributes(as_tibble(grouped_tab))$other_metadata, "look I'm still here!" ) }) From 0380f8a6714eb91ff074d14ae491b2bdaf1675b0 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 09:51:07 +0100 Subject: [PATCH 14/22] Add new methods --- r/NAMESPACE | 4 ++++ r/R/array.R | 7 ++++++- r/R/dataset.R | 8 ++++++-- r/R/schema.R | 2 +- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index d55a22f001c..b0adec7725b 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -90,6 +90,10 @@ S3method(as_schema,Schema) S3method(as_schema,StructType) S3method(as_schema,pyarrow.lib.Schema) S3method(as_tibble,ArrowTabular) +S3method(as_tibble,Dataset) +S3method(as_tibble,RecordBatchReader) +S3method(as_tibble,StructArray) +S3method(as_tibble,arrow_dplyr_query) S3method(c,Array) S3method(c,ChunkedArray) S3method(c,Dataset) diff --git a/r/R/array.R b/r/R/array.R index 109f6daaa2b..09760dd10b0 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -473,10 +473,15 @@ names.StructArray <- function(x, ...) StructType__field_names(x$type) dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields) #' @export -as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { +as_tibble.StructArray <- function(x, ...) { as.vector(x) } +#' @export +as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { + as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) +} + #' @rdname array #' @usage NULL #' @format NULL diff --git a/r/R/dataset.R b/r/R/dataset.R index 3ab440ec812..46a7e051704 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -522,10 +522,14 @@ dim.Dataset <- function(x) c(x$num_rows, x$num_cols) #' @export c.Dataset <- function(...) Dataset$create(list(...)) +#' @export +as_tibble.Dataset <- function(x, ...) { + collect.Dataset(x) +} + #' @export as.data.frame.Dataset <- function(x, row.names = NULL, optional = FALSE, ...) { - out <- collect.Dataset(x) - as.data.frame(out) + as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) } #' @export diff --git a/r/R/schema.R b/r/R/schema.R index 93e826eff28..39ffddfea83 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -386,5 +386,5 @@ as_schema.StructType <- function(x, ...) { #' @export as.data.frame.Schema <- function(x, row.names = NULL, optional = FALSE, ...) { - as.data.frame(Table__from_schema(x)) + as.data.frame(Table__from_schema(x), row.names = row.names, optional = optional, ...) } From c9e50cee978d55404b61f07f2b59cc059570a903 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 09:52:02 +0100 Subject: [PATCH 15/22] Fix utf tests --- r/tests/testthat/test-utf.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-utf.R b/r/tests/testthat/test-utf.R index f7553da5b4a..38de11b7c37 100644 --- a/r/tests/testthat/test-utf.R +++ b/r/tests/testthat/test-utf.R @@ -45,12 +45,12 @@ test_that("We handle non-UTF strings", { expect_identical(as.vector(ChunkedArray$create(df)), df) # Table (including field name) - expect_identical(as.data.frame(Table$create(df)), df) - expect_identical(as.data.frame(Table$create(df_struct)), df_struct) + expect_identical(as_tibble(Table$create(df)), df) + expect_identical(as_tibble(Table$create(df_struct)), df_struct) # RecordBatch - expect_identical(as.data.frame(record_batch(df)), df) - expect_identical(as.data.frame(record_batch(df_struct)), df_struct) + expect_identical(as_tibble(record_batch(df)), df) + expect_identical(as_tibble(record_batch(df_struct)), df_struct) # Schema field name df_schema <- do.call(schema, raw_schema) @@ -59,10 +59,10 @@ test_that("We handle non-UTF strings", { df_struct_schema <- schema(a = do.call(struct, raw_schema)) # Create table/batch with schema - expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df) - expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct) - expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df) - expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct) + expect_identical(as_tibble(Table$create(df, schema = df_schema)), df) + expect_identical(as_tibble(Table$create(df_struct, schema = df_struct_schema)), df_struct) + expect_identical(as_tibble(record_batch(df, schema = df_schema)), df) + expect_identical(as_tibble(record_batch(df_struct, schema = df_struct_schema)), df_struct) # Serialization feather_file <- tempfile() From 6cca6413bf229b82ca818bef819c56d9e5773eff Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 09:58:39 +0100 Subject: [PATCH 16/22] Update other uses of as.data.frame in tests --- r/tests/testthat/test-compute-aggregate.R | 2 +- r/tests/testthat/test-feather.R | 2 +- r/tests/testthat/test-na-omit.R | 8 ++++---- r/tests/testthat/test-python.R | 8 ++++---- r/tests/testthat/test-read-write.R | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 98face44ff9..3ca28c65a37 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -377,7 +377,7 @@ test_that("value_counts", { type = struct(values = float64(), counts = int64()) ) expect_equal(value_counts(a), result) - expect_identical(as.data.frame(value_counts(a)), result_df) + expect_identical(as_tibble(value_counts(a)), result_df) expect_identical(as.vector(value_counts(a)$counts), result_df$counts) }) diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 4caadc27c4b..88e890d408d 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -177,7 +177,7 @@ test_that("feather read/write round trip", { tab1 <- read_feather(feather_file, as_data_frame = FALSE) expect_r6_class(tab1, "Table") - expect_equal(tib, as.data.frame(tab1)) + expect_equal(tib, as_tibble(tab1)) }) test_that("Read feather from raw vector", { diff --git a/r/tests/testthat/test-na-omit.R b/r/tests/testthat/test-na-omit.R index c2d0fd1b71a..4c26d4275b6 100644 --- a/r/tests/testthat/test-na-omit.R +++ b/r/tests/testthat/test-na-omit.R @@ -43,7 +43,7 @@ test_that("na.fail on Array and ChunkedArray", { test_that("na.omit on Table", { tbl <- Table$create(example_data) expect_equal( - as.data.frame(na.omit(tbl)), + as_tibble(na.omit(tbl)), na.omit(example_data), # We don't include an attribute with the rows omitted ignore_attr = "na.action" @@ -53,7 +53,7 @@ test_that("na.omit on Table", { test_that("na.exclude on Table", { tbl <- Table$create(example_data) expect_equal( - as.data.frame(na.exclude(tbl)), + as_tibble(na.exclude(tbl)), na.exclude(example_data), ignore_attr = "na.action" ) @@ -67,7 +67,7 @@ test_that("na.fail on Table", { test_that("na.omit on RecordBatch", { batch <- record_batch(example_data) expect_equal( - as.data.frame(na.omit(batch)), + as_tibble(na.omit(batch)), na.omit(example_data), ignore_attr = "na.action" ) @@ -76,7 +76,7 @@ test_that("na.omit on RecordBatch", { test_that("na.exclude on RecordBatch", { batch <- record_batch(example_data) expect_equal( - as.data.frame(na.exclude(batch)), + as_tibble(na.exclude(batch)), na.omit(example_data), ignore_attr = "na.action" ) diff --git a/r/tests/testthat/test-python.R b/r/tests/testthat/test-python.R index 968d72119c5..87f364d6e2b 100644 --- a/r/tests/testthat/test-python.R +++ b/r/tests/testthat/test-python.R @@ -108,7 +108,7 @@ test_that("RecordBatch with metadata roundtrip", { expect_identical(rbatch$metadata, batch$metadata) expect_equal(rbatch$a, batch$a) expect_equal(rbatch[c("b", "c", "d")], batch[c("b", "c", "d")]) - expect_identical(as.data.frame(rbatch), example_with_metadata) + expect_identical(as_tibble(rbatch), example_with_metadata) }) test_that("Table with metadata roundtrip", { @@ -123,7 +123,7 @@ test_that("Table with metadata roundtrip", { expect_identical(rtab$metadata, tab$metadata) expect_equal(rtab$a, tab$a) expect_equal(rtab[c("b", "c", "d")], tab[c("b", "c", "d")]) - expect_identical(as.data.frame(rtab), example_with_metadata) + expect_identical(as_tibble(rtab), example_with_metadata) }) test_that("DataType roundtrip", { @@ -161,7 +161,7 @@ test_that("RecordBatchReader to python", { back_to_r <- reticulate::py_to_r(pytab) expect_r6_class(back_to_r, "Table") expect_identical( - as.data.frame(back_to_r), + as_tibble(back_to_r), example_data %>% select(int, lgl) %>% filter(int > 6) @@ -178,7 +178,7 @@ test_that("RecordBatchReader from python", { back_to_r <- reticulate::py_to_r(pyreader) rt_table <- back_to_r$read_table() expect_r6_class(rt_table, "Table") - expect_identical(as.data.frame(rt_table), example_data) + expect_identical(as_tibble(rt_table), example_data) scan <- Scanner$create(tab) reader <- scan$ToRecordBatchReader() diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index 66f6db56d90..1bd5826090a 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -115,7 +115,7 @@ test_that("table round trip handles NA in integer and numeric", { test_that("reading/writing a raw vector (sparklyr integration)", { # These are effectively what sparklyr calls to get data to/from Spark read_from_raw_test <- function(x) { - as.data.frame(RecordBatchStreamReader$create(x)$read_next_batch()) + as_tibble(RecordBatchStreamReader$create(x)$read_next_batch()) } bytes <- write_to_raw(example_data) expect_type(bytes, "raw") From d6020ad0d4e67fb86bb6420fbd72c12b6846a148 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 4 Apr 2023 11:13:53 +0100 Subject: [PATCH 17/22] Fix Flight tests --- r/tests/testthat/test-python-flight.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-python-flight.R b/r/tests/testthat/test-python-flight.R index d2b6fd491e1..f8519fcabd3 100644 --- a/r/tests/testthat/test-python-flight.R +++ b/r/tests/testthat/test-python-flight.R @@ -53,13 +53,13 @@ if (process_is_running("demo_flight_server")) { }) test_that("flight_get", { - expect_identical(as.data.frame(flight_get(client, flight_obj)), example_data) + expect_identical(as_tibble(flight_get(client, flight_obj)), example_data) }) test_that("flight_put with RecordBatch", { flight_obj2 <- tempfile() flight_put(client, RecordBatch$create(example_data), path = flight_obj2) - expect_identical(as.data.frame(flight_get(client, flight_obj2)), example_data) + expect_identical(as_tibble(flight_get(client, flight_obj2)), example_data) }) test_that("flight_put with overwrite = FALSE", { @@ -69,7 +69,7 @@ if (process_is_running("demo_flight_server")) { ) # Default is TRUE so this will overwrite flight_put(client, example_with_times, path = flight_obj) - expect_identical(as.data.frame(flight_get(client, flight_obj)), example_with_times) + expect_identical(as_tibble(flight_get(client, flight_obj)), example_with_times) }) test_that("flight_disconnect", { From 9cf9ba213eb6e78b433570f97088781bbb57f0b1 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 Apr 2023 13:31:04 +0100 Subject: [PATCH 18/22] Revert tibble dependency and update internal function usage to as_df --- r/DESCRIPTION | 2 +- r/NAMESPACE | 6 ------ r/R/array.R | 5 ++--- r/R/arrow-package.R | 1 - r/R/arrow-tabular.R | 5 ++--- r/R/csv.R | 2 +- r/R/dataset.R | 5 ++--- r/R/dplyr-collect.R | 2 +- r/R/dplyr-glimpse.R | 2 +- r/R/dplyr.R | 6 ++---- r/R/feather.R | 2 +- r/R/ipc-stream.R | 2 +- r/R/json.R | 2 +- r/R/parquet.R | 2 +- r/R/record-batch-reader.R | 5 ++--- 15 files changed, 18 insertions(+), 31 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index f6873935884..c1114571ebd 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -37,7 +37,6 @@ Imports: R6, rlang (>= 1.0.0), stats, - tibble, tidyselect (>= 1.0.0), utils, vctrs @@ -64,6 +63,7 @@ Suggests: stringr, sys, testthat (>= 3.1.0), + tibble, tzdb, withr LinkingTo: cpp11 (>= 0.4.2) diff --git a/r/NAMESPACE b/r/NAMESPACE index b0adec7725b..70b8fc3fc85 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -89,11 +89,6 @@ S3method(as_record_batch_reader,pyarrow.lib.Table) S3method(as_schema,Schema) S3method(as_schema,StructType) S3method(as_schema,pyarrow.lib.Schema) -S3method(as_tibble,ArrowTabular) -S3method(as_tibble,Dataset) -S3method(as_tibble,RecordBatchReader) -S3method(as_tibble,StructArray) -S3method(as_tibble,arrow_dplyr_query) S3method(c,Array) S3method(c,ChunkedArray) S3method(c,Dataset) @@ -494,7 +489,6 @@ importFrom(stats,na.omit) importFrom(stats,na.pass) importFrom(stats,quantile) importFrom(stats,runif) -importFrom(tibble,as_tibble) importFrom(tidyselect,all_of) importFrom(tidyselect,contains) importFrom(tidyselect,ends_with) diff --git a/r/R/array.R b/r/R/array.R index 09760dd10b0..ca03d2b5b96 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -472,14 +472,13 @@ names.StructArray <- function(x, ...) StructType__field_names(x$type) #' @export dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields) -#' @export -as_tibble.StructArray <- function(x, ...) { +as_df.StructArray <- function(x, ...) { as.vector(x) } #' @export as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { - as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) + as.data.frame(as_df(x), row.names = row.names, optional = optional, ...) } #' @rdname array diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index d31a078405e..a3c860a51c8 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -27,7 +27,6 @@ #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args #' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure #' @importFrom rlang new_quosures expr_text caller_env check_dots_empty dots_list is_string inform -#' @importFrom tibble as_tibble #' @importFrom tidyselect vars_pull eval_select eval_rename #' @importFrom glue glue #' @useDynLib arrow, .registration = TRUE diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index b6a077343e1..8770e206032 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -93,11 +93,10 @@ ArrowTabular <- R6Class("ArrowTabular", #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { - as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) + as.data.frame(as_df(x), row.names = row.names, optional = optional, ...) } -#' @export -as_tibble.ArrowTabular <- function(x, ...) { +as_df.ArrowTabular <- function(x, ...) { df <- x$to_data_frame() apply_arrow_r_metadata(df, x$metadata$r) } diff --git a/r/R/csv.R b/r/R/csv.R index 7885239ad6a..237a2153e5b 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -248,7 +248,7 @@ read_delim_arrow <- function(file, } if (isTRUE(as_data_frame)) { - tab <- as_tibble(tab) + tab <- as_df(tab) } tab diff --git a/r/R/dataset.R b/r/R/dataset.R index 46a7e051704..fe9cdeba4ac 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -522,14 +522,13 @@ dim.Dataset <- function(x) c(x$num_rows, x$num_cols) #' @export c.Dataset <- function(...) Dataset$create(list(...)) -#' @export -as_tibble.Dataset <- function(x, ...) { +as_df.Dataset <- function(x, ...) { collect.Dataset(x) } #' @export as.data.frame.Dataset <- function(x, row.names = NULL, optional = FALSE, ...) { - as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) + as.data.frame(as_df(x), row.names = row.names, optional = optional, ...) } #' @export diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 683f05fa62f..61db1d2a1bd 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -25,7 +25,7 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) { if (as_data_frame) { - as_tibble(x, ...) + as_df(x, ...) } else { x } diff --git a/r/R/dplyr-glimpse.R b/r/R/dplyr-glimpse.R index 727a313b3e6..ff7a3ef2508 100644 --- a/r/R/dplyr-glimpse.R +++ b/r/R/dplyr-glimpse.R @@ -71,7 +71,7 @@ glimpse.ArrowTabular <- function(x, var_headings <- paste("$", center_pad(tickify(names(x)), var_types)) # Assemble the data glimpse - df <- as_tibble(head_tab) + df <- as_df(head_tab) formatted_data <- map_chr(df, function(.) { tryCatch( paste(pillar::format_glimpse(.), collapse = ", "), diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 096e621d645..a52124befbb 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -216,12 +216,10 @@ unique.RecordBatchReader <- unique.arrow_dplyr_query #' @export as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) { - as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) + as.data.frame(as_df(x), row.names = row.names, optional = optional, ...) } - -#' @export -as_tibble.arrow_dplyr_query <- function(x, ...) { +as_df.arrow_dplyr_query <- function(x, ...) { collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...) } diff --git a/r/R/feather.R b/r/R/feather.R index 67b7934542e..09b04a1f0a6 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -196,7 +196,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T ) if (isTRUE(as_data_frame)) { - out <- as_tibble(out) + out <- as_df(out) } out } diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index c424a0a6e2b..59d690db5dc 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -106,7 +106,7 @@ read_ipc_stream <- function(file, as_data_frame = TRUE, ...) { # https://issues.apache.org/jira/browse/ARROW-6830 out <- RecordBatchStreamReader$create(file)$read_table() if (as_data_frame) { - out <- as_tibble(out) + out <- as_df(out) } out } diff --git a/r/R/json.R b/r/R/json.R index 868643a815d..470b52f3f88 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -84,7 +84,7 @@ read_json_arrow <- function(file, } if (isTRUE(as_data_frame)) { - tab <- as_tibble(tab) + tab <- as_df(tab) } tab } diff --git a/r/R/parquet.R b/r/R/parquet.R index 93904fa16ff..586f6cd3c88 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -70,7 +70,7 @@ read_parquet <- function(file, } if (as_data_frame) { - tab <- as_tibble(tab) + tab <- as_df(tab) } tab } diff --git a/r/R/record-batch-reader.R b/r/R/record-batch-reader.R index 7a8b5e5d67d..88118a2f70a 100644 --- a/r/R/record-batch-reader.R +++ b/r/R/record-batch-reader.R @@ -129,11 +129,10 @@ dim.RecordBatchReader <- function(x) c(NA_integer_, length(x$schema)) #' @export as.data.frame.RecordBatchReader <- function(x, row.names = NULL, optional = FALSE, ...) { - as.data.frame(as_tibble(x), row.names = row.names, optional = optional, ...) + as.data.frame(as_df(x), row.names = row.names, optional = optional, ...) } -#' @export -as_tibble.RecordBatchReader <- function(x, ...){ +as_df.RecordBatchReader <- function(x, ...){ x$read_table() } From 02dde1f31ac70913af8b06c2705e56ccc266b2fe Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 Apr 2023 13:46:48 +0100 Subject: [PATCH 19/22] Revert expect_tibble --- r/tests/testthat/helper-expectation.R | 4 +- r/tests/testthat/test-RecordBatch.R | 58 +++++++++++------------ r/tests/testthat/test-Table.R | 68 +++++++++++++-------------- r/tests/testthat/test-metadata.R | 4 +- 4 files changed, 67 insertions(+), 67 deletions(-) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index be9cad1bb59..303a96ead7d 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -19,8 +19,8 @@ expect_as_vector <- function(x, y, ...) { expect_equal(as.vector(x), y, ...) } -expect_tibble <- function(x, y, ...) { - expect_equal(as_tibble(x), y, ...) +expect_data_frame <- function(x, y, ...) { + expect_equal(as.data.frame(x), y, ...) } expect_r6_class <- function(object, class) { diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 72ffd4b4596..4198628986e 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -89,7 +89,7 @@ test_that("RecordBatch", { schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8())) ) expect_equal(batch2$column(0), batch$column(1)) - expect_tibble(batch2, tbl[, -1]) + expect_data_frame(batch2, tbl[, -1]) # input validation expect_error(batch$RemoveColumn(NA), "'i' cannot be NA") @@ -109,10 +109,10 @@ test_that("RecordBatch S3 methods", { test_that("RecordBatch$Slice", { batch3 <- batch$Slice(5) - expect_tibble(batch3, tbl[6:10, ]) + expect_data_frame(batch3, tbl[6:10, ]) batch4 <- batch$Slice(5, 2) - expect_tibble(batch4, tbl[6:7, ]) + expect_data_frame(batch4, tbl[6:7, ]) # Input validation expect_error(batch$Slice("ten")) @@ -131,20 +131,20 @@ test_that("RecordBatch$Slice", { }) test_that("[ on RecordBatch", { - expect_tibble(batch[6:7, ], tbl[6:7, ]) - expect_tibble(batch[c(6, 7), ], tbl[6:7, ]) - expect_tibble(batch[6:7, 2:4], tbl[6:7, 2:4]) - expect_tibble(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_data_frame(batch[6:7, ], tbl[6:7, ]) + expect_data_frame(batch[c(6, 7), ], tbl[6:7, ]) + expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4]) + expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr) - expect_tibble(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) - expect_tibble( + expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_data_frame( batch[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ] ) # bool Array - expect_tibble(batch[batch$lgl, ], tbl[tbl$lgl, ]) + expect_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ]) # int Array - expect_tibble(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) # input validation expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"') @@ -176,15 +176,15 @@ test_that("[[<- assignment", { # can remove a column batch[["chr"]] <- NULL - expect_tibble(batch, tbl[-4]) + expect_data_frame(batch, tbl[-4]) # can remove a column by index batch[[4]] <- NULL - expect_tibble(batch, tbl[1:3]) + expect_data_frame(batch, tbl[1:3]) # can add a named column batch[["new"]] <- letters[10:1] - expect_tibble(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) + expect_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) # can replace a column by index batch[[2]] <- as.numeric(10:1) @@ -239,16 +239,16 @@ test_that("head and tail on RecordBatch", { fct = factor(letters[1:10]) ) batch <- RecordBatch$create(tbl) - expect_tibble(head(batch), head(tbl)) - expect_tibble(head(batch, 4), head(tbl, 4)) - expect_tibble(head(batch, 40), head(tbl, 40)) - expect_tibble(head(batch, -4), head(tbl, -4)) - expect_tibble(head(batch, -40), head(tbl, -40)) - expect_tibble(tail(batch), tail(tbl)) - expect_tibble(tail(batch, 4), tail(tbl, 4)) - expect_tibble(tail(batch, 40), tail(tbl, 40)) - expect_tibble(tail(batch, -4), tail(tbl, -4)) - expect_tibble(tail(batch, -40), tail(tbl, -40)) + expect_data_frame(head(batch), head(tbl)) + expect_data_frame(head(batch, 4), head(tbl, 4)) + expect_data_frame(head(batch, 40), head(tbl, 40)) + expect_data_frame(head(batch, -4), head(tbl, -4)) + expect_data_frame(head(batch, -40), head(tbl, -40)) + expect_data_frame(tail(batch), tail(tbl)) + expect_data_frame(tail(batch, 4), tail(tbl, 4)) + expect_data_frame(tail(batch, 40), tail(tbl, 40)) + expect_data_frame(tail(batch, -4), tail(tbl, -4)) + expect_data_frame(tail(batch, -40), tail(tbl, -40)) }) test_that("RecordBatch print method", { @@ -379,7 +379,7 @@ test_that("record_batch() auto splices (ARROW-5718)", { batch2 <- record_batch(!!!df) expect_equal(batch1, batch2) expect_equal(batch1$schema, schema(x = int32(), y = utf8())) - expect_tibble(batch1, df) + expect_data_frame(batch1, df) batch3 <- record_batch(df, z = 1:10) batch4 <- record_batch(!!!df, z = 1:10) @@ -425,24 +425,24 @@ test_that("record_batch() handles null type (ARROW-7064)", { }) test_that("record_batch() scalar recycling with vectors", { - expect_tibble( + expect_data_frame( record_batch(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) ) }) test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArrays", { - expect_tibble( + expect_data_frame( record_batch(a = Array$create(1:10), b = Scalar$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_tibble( + expect_data_frame( record_batch(a = Array$create(1:10), b = Array$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_tibble( + expect_data_frame( record_batch(a = Array$create(1:10), b = ChunkedArray$create(5)), tibble::tibble(a = 1:10, b = 5) ) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 6d88793946d..1dd1c1faf15 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -68,26 +68,26 @@ tab <- Table$create(tbl) test_that("[, [[, $ for Table", { expect_identical(names(tab), names(tbl)) - expect_tibble(tab[6:7, ], tbl[6:7, ]) - expect_tibble(tab[6:7, 2:4], tbl[6:7, 2:4]) - expect_tibble(tab[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_data_frame(tab[6:7, ], tbl[6:7, ]) + expect_data_frame(tab[6:7, 2:4], tbl[6:7, 2:4]) + expect_data_frame(tab[, c("dbl", "fct")], tbl[, c(2, 5)]) expect_as_vector(tab[, "chr", drop = TRUE], tbl$chr) # Take within a single chunk - expect_tibble(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) - expect_tibble(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ]) + expect_data_frame(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_data_frame(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ]) # bool ChunkedArray (with one chunk) - expect_tibble(tab[tab$lgl, ], tbl[tbl$lgl, ]) + expect_data_frame(tab[tab$lgl, ], tbl[tbl$lgl, ]) # ChunkedArray with multiple chunks c1 <- c(TRUE, FALSE, TRUE, TRUE, FALSE) c2 <- c(FALSE, FALSE, TRUE, TRUE, FALSE) ca <- ChunkedArray$create(c1, c2) - expect_tibble(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ]) + expect_data_frame(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ]) # int Array - expect_tibble(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + expect_data_frame(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4]) # ChunkedArray - expect_tibble(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4]) + expect_data_frame(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4]) # Expression - expect_tibble(tab[tab$int > 6, ], tbl[tbl$int > 6, ]) + expect_data_frame(tab[tab$int > 6, ], tbl[tbl$int > 6, ]) expect_as_vector(tab[["int"]], tbl$int) expect_as_vector(tab$int, tbl$int) @@ -95,9 +95,9 @@ test_that("[, [[, $ for Table", { expect_null(tab$qwerty) expect_null(tab[["asdf"]]) # List-like column slicing - expect_tibble(tab[2:4], tbl[2:4]) - expect_tibble(tab[c(2, 1)], tbl[c(2, 1)]) - expect_tibble(tab[-3], tbl[-3]) + expect_data_frame(tab[2:4], tbl[2:4]) + expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)]) + expect_data_frame(tab[-3], tbl[-3]) expect_error(tab[[c(4, 3)]]) expect_error(tab[[NA]], "'i' must be character or numeric, not logical") @@ -112,21 +112,21 @@ test_that("[, [[, $ for Table", { expect_error(tab[, c(6, NA)], "Column indices cannot be NA") skip("Table with 0 cols doesn't know how many rows it should have") - expect_tibble(tab[0], tbl[0]) + expect_data_frame(tab[0], tbl[0]) }) test_that("[[<- assignment", { # can remove a column tab[["chr"]] <- NULL - expect_tibble(tab, tbl[-4]) + expect_data_frame(tab, tbl[-4]) # can remove a column by index tab[[4]] <- NULL - expect_tibble(tab, tbl[1:3]) + expect_data_frame(tab, tbl[1:3]) # can add a named column tab[["new"]] <- letters[10:1] - expect_tibble(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) + expect_data_frame(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) # can replace a column by index tab[[2]] <- as.numeric(10:1) @@ -177,10 +177,10 @@ test_that("[[<- assignment", { test_that("Table$Slice", { tab2 <- tab$Slice(5) - expect_tibble(tab2, tbl[6:10, ]) + expect_data_frame(tab2, tbl[6:10, ]) tab3 <- tab$Slice(5, 2) - expect_tibble(tab3, tbl[6:7, ]) + expect_data_frame(tab3, tbl[6:7, ]) # Input validation expect_error(tab$Slice("ten")) @@ -199,16 +199,16 @@ test_that("Table$Slice", { }) test_that("head and tail on Table", { - expect_tibble(head(tab), head(tbl)) - expect_tibble(head(tab, 4), head(tbl, 4)) - expect_tibble(head(tab, 40), head(tbl, 40)) - expect_tibble(head(tab, -4), head(tbl, -4)) - expect_tibble(head(tab, -40), head(tbl, -40)) - expect_tibble(tail(tab), tail(tbl)) - expect_tibble(tail(tab, 4), tail(tbl, 4)) - expect_tibble(tail(tab, 40), tail(tbl, 40)) - expect_tibble(tail(tab, -4), tail(tbl, -4)) - expect_tibble(tail(tab, -40), tail(tbl, -40)) + expect_data_frame(head(tab), head(tbl)) + expect_data_frame(head(tab, 4), head(tbl, 4)) + expect_data_frame(head(tab, 40), head(tbl, 40)) + expect_data_frame(head(tab, -4), head(tbl, -4)) + expect_data_frame(head(tab, -40), head(tbl, -40)) + expect_data_frame(tail(tab), tail(tbl)) + expect_data_frame(tail(tab, 4), tail(tbl, 4)) + expect_data_frame(tail(tab, 40), tail(tbl, 40)) + expect_data_frame(tail(tab, -4), tail(tbl, -4)) + expect_data_frame(tail(tab, -40), tail(tbl, -40)) }) test_that("Table print method", { @@ -410,24 +410,24 @@ test_that("Table$create() with different length columns", { }) test_that("Table$create() scalar recycling with vectors", { - expect_tibble( + expect_data_frame( Table$create(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) ) }) test_that("Table$create() scalar recycling with Scalars, Arrays, and ChunkedArrays", { - expect_tibble( + expect_data_frame( Table$create(a = Array$create(1:10), b = Scalar$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_tibble( + expect_data_frame( Table$create(a = Array$create(1:10), b = Array$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_tibble( + expect_data_frame( Table$create(a = Array$create(1:10), b = ChunkedArray$create(5)), tibble::tibble(a = 1:10, b = 5) ) @@ -729,6 +729,6 @@ test_that("as_tibble.ArrowTabular retains groups", { df <- data.frame(x = 1:4, y = c("a", "b")) df_grouped <- dplyr::group_by(df, y) arrow_grouped <- arrow_table(df_grouped) - expect_tibble(arrow_grouped, df_grouped) + expect_data_frame(arrow_grouped, df_grouped) }) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index d9e855ebe19..f97fcc0c588 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -201,8 +201,8 @@ test_that("Date/time type roundtrip", { test_that("metadata keeps attribute of top level data frame", { df <- structure(data.frame(x = 1, y = 2), foo = "bar") tab <- Table$create(df) - expect_identical(attr(as_tibble(tab), "foo"), "bar") - expect_identical(as_tibble(tab), df) + expect_identical(attr(as.data.frame(tab), "foo"), "bar") + expect_identical(as.data.frame(tab), df) }) From 9876de2e633b938ae0c7e76f5a41c30913064444 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 Apr 2023 13:48:49 +0100 Subject: [PATCH 20/22] Define as_df generic --- r/R/arrow-tabular.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index 8770e206032..9e5b3c4a1a4 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -263,3 +263,7 @@ na.omit.ArrowTabular <- function(object, ...) { #' @export na.exclude.ArrowTabular <- na.omit.ArrowTabular + +as_df <- function(x) { + UseMethod("as_df") +} From ee12b68eaf70d4bdc3877c09639c512de843d364 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 Apr 2023 13:49:03 +0100 Subject: [PATCH 21/22] Remove redundant test --- r/tests/testthat/test-metadata.R | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index f97fcc0c588..8c8c72b10c0 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -379,15 +379,3 @@ test_that("grouped_df metadata is recorded (efficiently)", { expect_r6_class(grouped_tab, "Table") expect_equal(grouped_tab$metadata$r$attributes$.group_vars, "a") }) - -test_that("grouped_df non-arrow metadata is preserved", { - simple_tbl <- tibble(a = 1:2, b = 3:4) - attr(simple_tbl, "other_metadata") <- "look I'm still here!" - grouped <- group_by(simple_tbl, a) - grouped_tab <- arrow_table(grouped) - - expect_equal( - attributes(as_tibble(grouped_tab))$other_metadata, - "look I'm still here!" - ) -}) From 2ea92caaf25d5253b3e9db7f2c81a097abddafce Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 6 Apr 2023 13:49:13 +0100 Subject: [PATCH 22/22] Update test to not expect tibble --- r/tests/testthat/test-compute-aggregate.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 3ca28c65a37..a8de6bfc36c 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -368,7 +368,7 @@ test_that("is_in", { test_that("value_counts", { a <- Array$create(c(1, 4, 3, 1, 1, 3, 4)) - result_df <- tibble::tibble( + result_df <- data.frame( values = c(1, 4, 3), counts = c(3L, 2L, 2L) ) @@ -377,7 +377,7 @@ test_that("value_counts", { type = struct(values = float64(), counts = int64()) ) expect_equal(value_counts(a), result) - expect_identical(as_tibble(value_counts(a)), result_df) + expect_identical(as.data.frame(value_counts(a)), result_df) expect_identical(as.vector(value_counts(a)$counts), result_df$counts) })