From 110863380312df087002c074c61790f4c4f1741e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sun, 16 Apr 2023 13:45:02 +0100 Subject: [PATCH 01/18] Only save default metadata --- r/R/metadata.R | 27 +++++++++++++++++++++++++++ r/tests/testthat/test-metadata.R | 12 ++++++++++++ 2 files changed, 39 insertions(+) diff --git a/r/R/metadata.R b/r/R/metadata.R index 6a54b3e3842..69b3982bcec 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -22,6 +22,8 @@ # drop problems attributes (most likely from readr) x[["attributes"]][["problems"]] <- NULL + x <- remove_default_df_metadata(x) + out <- serialize(x, NULL, ascii = TRUE) # if the metadata is over 100 kB, compress @@ -62,6 +64,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) { expr = { columns_metadata <- r_metadata$columns if (is.data.frame(x)) { + # if columns metadata exists, apply it here if (length(names(x)) && !is.null(columns_metadata)) { for (name in intersect(names(columns_metadata), names(x))) { x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) @@ -222,3 +225,27 @@ get_r_metadata_from_old_schema <- function(new_schema, old_schema) { } r_meta } + +remove_default_df_metadata <- function(x) { + # if the column attributes are NULL (the default), drop them + if (all(vapply(x$columns, is.null, logical(1)))) { + x$columns <- NULL + } + + # don't need to preserve data.frame on roundtrip + if (identical(x$attributes$class, "data.frame")) { + x$attributes$class <- NULL + + # if there are no other attributes, remove this entirely + if (is_empty(x$attributes)) { + x$attributes <- NULL + } + } + + # if there are no elements left in x, set to NULL + if(is_empty(x)){ + x <- NULL + } + + x +} diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 4cf8e49af1b..c95a847e9a7 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -391,3 +391,15 @@ test_that("grouped_df non-arrow metadata is preserved", { "look I'm still here!" ) }) + +test_that("Only non-default metadata is saved", { + df <- data.frame(x = 1:5) + df_arrow <- arrow_table(df) + expect_null(df_arrow$r_metadata) + + df <- data.frame(x = 1:5) + attributes(df)$foo = "bar" + df_arrow <- arrow_table(df) + expect_identical(df_arrow$r_metadata, list(attributes = list(foo = "bar"))) + +}) From 3e7e5fb1b9aaedce64b565eef9dcaf448a6d267c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Apr 2023 01:01:08 +0300 Subject: [PATCH 02/18] Implement collect.StructArray and update as.data.frame.StructArray to return data.frames --- r/NAMESPACE | 1 + r/R/array.R | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index 7a8efe0ca30..b1a439545b2 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -293,6 +293,7 @@ export(call_function) export(cast_options) export(chunked_array) export(codec_is_available) +export(collect.StructArray) export(concat_arrays) export(concat_tables) export(contains) diff --git a/r/R/array.R b/r/R/array.R index 109f6daaa2b..906e71617fe 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -474,6 +474,11 @@ dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields) #' @export as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { + as.data.frame(collect.StructArray(x), row.names = row.names, optional = optional, ...) +} + +#' @export +collect.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { as.vector(x) } From cb3a0d23dd84d21b26b01b623af46776c397f07a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Apr 2023 01:02:07 +0300 Subject: [PATCH 03/18] as.data.frame.ArrowTabular returns data.frames and collect.ArrowTabular does not --- r/R/arrow-tabular.R | 3 ++- r/R/dplyr-collect.R | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index ae68cc2118f..058924ba0bd 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -94,7 +94,8 @@ ArrowTabular <- R6Class("ArrowTabular", #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { df <- x$to_data_frame() - apply_arrow_r_metadata(df, x$metadata$r) + out <- apply_arrow_r_metadata(df, x$metadata$r) + as.data.frame(out) } #' @export diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 9205a31b14f..5ac670c6c49 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -24,7 +24,8 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { } collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) { if (as_data_frame) { - as.data.frame(x, ...) + df <- x$to_data_frame() + apply_arrow_r_metadata(df, x$metadata$r) } else { x } From 12eeff32a49dab9e5d3f6381bea4f984fb411c50 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Apr 2023 01:03:21 +0300 Subject: [PATCH 04/18] as.data.frame.arrow_dplyr_query returns data.frames --- r/R/dplyr.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 54ecc80aad1..62b345e1cea 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -216,7 +216,8 @@ unique.RecordBatchReader <- unique.arrow_dplyr_query #' @export as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) { - collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...) + out <- collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...) + as.data.frame(out) } #' @export From 85c6dc255ffd17a8ba21fb82c89442d49375865a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Apr 2023 01:09:44 +0300 Subject: [PATCH 05/18] Pass through additional arguments --- r/R/arrow-tabular.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index 058924ba0bd..e62547d291b 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -95,7 +95,7 @@ ArrowTabular <- R6Class("ArrowTabular", as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { df <- x$to_data_frame() out <- apply_arrow_r_metadata(df, x$metadata$r) - as.data.frame(out) + as.data.frame(out, row.names = row.names, optional = optional, ...) } #' @export From 42b02d1b8e5965bd70eefabd1f88bd48534a221e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Apr 2023 01:25:31 +0300 Subject: [PATCH 06/18] Call collect not as.data.frame inside reader functions --- r/R/csv.R | 2 +- r/R/feather.R | 2 +- r/R/ipc-stream.R | 2 +- r/R/json.R | 2 +- r/R/parquet.R | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index 82243238662..88008bd6822 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -248,7 +248,7 @@ read_delim_arrow <- function(file, } if (isTRUE(as_data_frame)) { - tab <- as.data.frame(tab) + tab <- collect.ArrowTabular(tab) } tab diff --git a/r/R/feather.R b/r/R/feather.R index 1488db29eb7..58d0949fb16 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -196,7 +196,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T ) if (isTRUE(as_data_frame)) { - out <- as.data.frame(out) + out <- collect.ArrowTabular(out) } out } diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index f0b4a6aae0e..71441323934 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -106,7 +106,7 @@ read_ipc_stream <- function(file, as_data_frame = TRUE, ...) { # https://issues.apache.org/jira/browse/ARROW-6830 out <- RecordBatchStreamReader$create(file)$read_table() if (as_data_frame) { - out <- as.data.frame(out) + out <- collect.ArrowTabular(out) } out } diff --git a/r/R/json.R b/r/R/json.R index cdbe850b32f..e8131b37f25 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -84,7 +84,7 @@ read_json_arrow <- function(file, } if (isTRUE(as_data_frame)) { - tab <- as.data.frame(tab) + tab <- collect.ArrowTabular(tab) } tab } diff --git a/r/R/parquet.R b/r/R/parquet.R index f3d384e8c25..1335e852192 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -70,7 +70,7 @@ read_parquet <- function(file, } if (as_data_frame) { - tab <- as.data.frame(tab) + tab <- collect.ArrowTabular(tab) } tab } From dd07cda22eaa0358ba686c105b7e00869f17450c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Apr 2023 01:25:58 +0300 Subject: [PATCH 07/18] Implement helper function for comparing metadata --- r/tests/testthat/helper-expectation.R | 4 ++++ r/tests/testthat/test-metadata.R | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 303a96ead7d..21890cd1248 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -23,6 +23,10 @@ expect_data_frame <- function(x, y, ...) { expect_equal(as.data.frame(x), y, ...) } +expect_metadata <- function(x, y, ...) { + expect_equal(as.data.frame(collect.ArrowTabular(x)), as.data.frame(y), ...) +} + expect_r6_class <- function(object, class) { expect_s3_class(object, class) expect_s3_class(object, "R6") diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index c95a847e9a7..e9ffcd53f78 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -57,7 +57,7 @@ test_that("Table R metadata", { "$r$columns$c$columns$c1$attributes$extra_attr", fixed = TRUE ) - expect_identical(as.data.frame(tab), example_with_metadata) + expect_metadata(tab, example_with_metadata) }) test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", { @@ -94,7 +94,7 @@ test_that("Garbage R metadata doesn't break things", { tab <- Table$create(example_data[1:6]) tab$metadata$r <- "garbage" expect_warning( - expect_identical(as.data.frame(tab), example_data[1:6]), + as.data.frame(tab), "Invalid metadata$r", fixed = TRUE ) @@ -103,7 +103,7 @@ test_that("Garbage R metadata doesn't break things", { tab <- Table$create(example_data[1:6]) tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE)) expect_warning( - expect_identical(as.data.frame(tab), example_data[1:6]), + as.data.frame(tab), "Invalid metadata$r", fixed = TRUE ) @@ -164,7 +164,7 @@ test_that("RecordBatch metadata", { }) test_that("RecordBatch R metadata", { - expect_identical(as.data.frame(record_batch(example_with_metadata)), example_with_metadata) + expect_metadata(record_batch(example_with_metadata), example_with_metadata) }) test_that("R metadata roundtrip via parquet", { @@ -195,7 +195,7 @@ test_that("haven types roundtrip via feather", { test_that("Date/time type roundtrip", { rb <- record_batch(example_with_times) expect_r6_class(rb$schema$posixlt$type, "VctrsExtensionType") - expect_identical(as.data.frame(rb), example_with_times) + expect_metadata(rb, example_with_times) }) test_that("metadata keeps attribute of top level data frame", { @@ -387,7 +387,7 @@ test_that("grouped_df non-arrow metadata is preserved", { grouped_tab <- arrow_table(grouped) expect_equal( - attributes(as.data.frame(grouped_tab))$other_metadata, + attributes(collect.ArrowTabular(grouped_tab))$other_metadata, "look I'm still here!" ) }) From 46ad76a6dd2bbb88dc9d6ff6b94672d18c1e7b1f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 21 Apr 2023 18:57:33 +0300 Subject: [PATCH 08/18] Don't use collect.ArrowTabular --- r/R/feather.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/feather.R b/r/R/feather.R index 58d0949fb16..24971669fc5 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -196,7 +196,8 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T ) if (isTRUE(as_data_frame)) { - out <- collect.ArrowTabular(out) + df <- out$to_data_frame() + out <- apply_arrow_r_metadata(df, out$metadata$r) } out } From 8d7c78cf00650e4f3d4d035859703603af882068 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 21 Apr 2023 18:16:26 +0100 Subject: [PATCH 09/18] Update expect_data_frame to convert object and expected into data.frames and update tests to use it --- r/tests/testthat/helper-expectation.R | 3 +- r/tests/testthat/test-RecordBatch.R | 25 +++++----- r/tests/testthat/test-Table.R | 44 ++++++++++------- r/tests/testthat/test-compute-aggregate.R | 2 +- r/tests/testthat/test-compute-sort.R | 8 ++-- r/tests/testthat/test-dataset-csv.R | 2 +- r/tests/testthat/test-dataset.R | 50 +++++++++++--------- r/tests/testthat/test-dplyr-funcs-datetime.R | 8 ++-- r/tests/testthat/test-dplyr-query.R | 26 +++++----- r/tests/testthat/test-duckdb.R | 3 +- r/tests/testthat/test-feather.R | 2 +- r/tests/testthat/test-na-omit.R | 16 +++---- r/tests/testthat/test-read-write.R | 4 +- 13 files changed, 105 insertions(+), 88 deletions(-) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 21890cd1248..b2611987085 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -19,8 +19,9 @@ expect_as_vector <- function(x, y, ...) { expect_equal(as.vector(x), y, ...) } +# expect both objects to contain equal values when converted to data.frame objects expect_data_frame <- function(x, y, ...) { - expect_equal(as.data.frame(x), y, ...) + expect_equal(as.data.frame(x), as.data.frame(y), ...) } expect_metadata <- function(x, y, ...) { diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 7e7084542d3..26d6acc6616 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -346,8 +346,8 @@ test_that("record_batch() handles data frame columns", { b = struct(x = int32(), y = int32()) ) ) - out <- as.data.frame(batch) - expect_equal(out, tibble::tibble(a = 1:10, b = tib)) + + expect_data_frame(batch, tibble::tibble(a = 1:10, b = tib)) # if not named, columns from tib are auto spliced batch2 <- record_batch(a = 1:10, tib) @@ -355,8 +355,8 @@ test_that("record_batch() handles data frame columns", { batch2$schema, schema(a = int32(), x = int32(), y = int32()) ) - out <- as.data.frame(batch2) - expect_equal(out, tibble::tibble(a = 1:10, !!!tib)) + + expect_data_frame(batch2, tibble::tibble(a = 1:10, !!!tib)) }) test_that("record_batch() handles data frame columns with schema spec", { @@ -366,8 +366,7 @@ test_that("record_batch() handles data frame columns with schema spec", { schema <- schema(a = int32(), b = struct(x = int16(), y = float64())) batch <- record_batch(a = 1:10, b = tib, schema = schema) expect_equal(batch$schema, schema) - out <- as.data.frame(batch) - expect_equal(out, tibble::tibble(a = 1:10, b = tib_float)) + expect_data_frame(batch, tibble::tibble(a = 1:10, b = tib_float)) schema <- schema(a = int32(), b = struct(x = int16(), y = utf8())) expect_error(record_batch(a = 1:10, b = tib, schema = schema)) @@ -385,9 +384,9 @@ test_that("record_batch() auto splices (ARROW-5718)", { batch4 <- record_batch(!!!df, z = 1:10) expect_equal(batch3, batch4) expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32())) - expect_equal( - as.data.frame(batch3), - tibble::as_tibble(cbind(df, data.frame(z = 1:10))) + expect_data_frame( + batch3, + cbind(df, data.frame(z = 1:10)) ) s <- schema(x = float64(), y = utf8()) @@ -395,16 +394,16 @@ test_that("record_batch() auto splices (ARROW-5718)", { batch6 <- record_batch(!!!df, schema = s) expect_equal(batch5, batch6) expect_equal(batch5$schema, s) - expect_equal(as.data.frame(batch5), df) + expect_data_frame(batch5, df) s2 <- schema(x = float64(), y = utf8(), z = int16()) batch7 <- record_batch(df, z = 1:10, schema = s2) batch8 <- record_batch(!!!df, z = 1:10, schema = s2) expect_equal(batch7, batch8) expect_equal(batch7$schema, s2) - expect_equal( - as.data.frame(batch7), - tibble::as_tibble(cbind(df, data.frame(z = 1:10))) + expect_data_frame( + batch7, + cbind(df, data.frame(z = 1:10)) ) }) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 233705323e7..75e49f2139c 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -265,10 +265,9 @@ test_that("table() handles ... of arrays, chunked arrays, vectors", { tab$schema, schema(a = int32(), b = int32(), c = float64(), x = int32(), y = utf8()) ) - res <- as.data.frame(tab) - expect_equal(names(res), c("a", "b", "c", "x", "y")) - expect_equal( - res, + + expect_data_frame( + tab, tibble::tibble(a = 1:10, b = 1:10, c = v, x = 1:10, y = letters[1:10]) ) }) @@ -280,14 +279,14 @@ test_that("table() auto splices (ARROW-5718)", { tab2 <- Table$create(!!!df) expect_equal(tab1, tab2) expect_equal(tab1$schema, schema(x = int32(), y = utf8())) - expect_equal(as.data.frame(tab1), df) + expect_data_frame(tab1, df) s <- schema(x = float64(), y = utf8()) tab3 <- Table$create(df, schema = s) tab4 <- Table$create(!!!df, schema = s) expect_equal(tab3, tab4) expect_equal(tab3$schema, s) - expect_equal(as.data.frame(tab3), df) + expect_data_frame(tab3, df) }) test_that("Validation when creating table with schema (ARROW-10953)", { @@ -366,7 +365,7 @@ test_that("Can create table with specific dictionary types", { expect_equal(sch, tab$schema) if (i != int64()) { # TODO: same downcast to int32 as we do for int64() type elsewhere - expect_identical(as.data.frame(tab), fact) + expect_data_frame(tab, fact) } } }) @@ -380,7 +379,7 @@ test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { res <- tibble::tibble(f = factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))) tab <- Table$create(b1, b2, b3, b4) - expect_identical(as.data.frame(tab), res) + expect_data_frame(tab, res) }) test_that("Table$SelectColumns()", { @@ -712,21 +711,32 @@ test_that("as_arrow_table() errors on data.frame with NULL names", { expect_error(as_arrow_table(df), "Input data frame columns must be named") }) -test_that("we only preserve metadata of input to arrow_table when passed a single data.frame", { - # data.frame in, data.frame out +test_that("# GH-35038 - passing in multiple arguments doesn't affect return type", { + + df <- data.frame(x = 1) + out1 <- as.data.frame(arrow_table(df, name = "1")) + out2 <- as.data.frame(arrow_table(name = "1", df)) + + expect_s3_class(out1, c("data.frame"), exact = TRUE) + expect_s3_class(out2, c("data.frame"), exact = TRUE) +}) + +test_that("as.data.frame() on ArrowTabular objects returns a base R data.frame regardless of input type", { df <- data.frame(x = 1) out1 <- as.data.frame(arrow_table(df)) expect_s3_class(out1, "data.frame", exact = TRUE) - # tibble in, tibble out tib <- tibble::tibble(x = 1) out2 <- as.data.frame(arrow_table(tib)) - expect_s3_class(out2, c("tbl_df", "tbl", "data.frame"), exact = TRUE) + expect_s3_class(out2, "data.frame", exact = TRUE) +}) - # GH-35038 - passing in multiple arguments doesn't affect return type - out3 <- as.data.frame(arrow_table(df, name = "1")) - out4 <- as.data.frame(arrow_table(name = "1", df)) +test_that("collect() on ArrowTabular objects returns a tibble regardless of input type", { + df <- data.frame(x = 1) + out1 <- dplyr::collect(arrow_table(df)) + expect_s3_class(out1, c("tbl_df", "tbl", "data.frame"), exact = TRUE) - expect_s3_class(out3, c("tbl_df", "tbl", "data.frame"), exact = TRUE) - expect_s3_class(out4, c("tbl_df", "tbl", "data.frame"), exact = TRUE) + tib <- tibble::tibble(x = 1) + out2 <- dplyr::collect(arrow_table(tib)) + expect_s3_class(out2, c("tbl_df", "tbl", "data.frame"), exact = TRUE) }) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 98face44ff9..918b527d8ae 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -377,7 +377,7 @@ test_that("value_counts", { type = struct(values = float64(), counts = int64()) ) expect_equal(value_counts(a), result) - expect_identical(as.data.frame(value_counts(a)), result_df) + expect_data_frame(value_counts(a), result_df) expect_identical(as.vector(value_counts(a)$counts), result_df$counts) }) diff --git a/r/tests/testthat/test-compute-sort.R b/r/tests/testthat/test-compute-sort.R index f521efeddc5..4f593e6d636 100644 --- a/r/tests/testthat/test-compute-sort.R +++ b/r/tests/testthat/test-compute-sort.R @@ -140,16 +140,16 @@ test_that("Table$SortIndices()", { as.vector(x$Take(x$SortIndices("chr"))$chr), sort(tbl$chr, na.last = TRUE) ) - expect_identical( - as.data.frame(x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE)))), + expect_data_frame( + x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE))), tbl %>% arrange(int, dbl) ) }) test_that("RecordBatch$SortIndices()", { x <- record_batch(tbl) - expect_identical( - as.data.frame(x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE))), + expect_data_frame( + x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE)), tbl %>% arrange(desc(chr), desc(int), desc(dbl)) ) }) diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index df58f853a1e..330b2d5ec98 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -91,7 +91,7 @@ test_that("CSV scan options", { sb$FragmentScanOptions(options) tab <- sb$Finish()$ToTable() - expect_equal(as.data.frame(tab), tibble(chr = c("foo", NA))) + expect_data_frame(tab, data.frame(chr = c("foo", NA))) # Set default convert options in CsvFileFormat csv_format <- CsvFileFormat$create( diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 91b405fc01c..23d70fd8e12 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -85,8 +85,8 @@ expect_scan_result <- function(ds, schm) { tab <- scn$ToTable() expect_r6_class(tab, "Table") - expect_equal( - as.data.frame(tab), + expect_data_frame( + tab, df1[8, c("chr", "lgl")] ) } @@ -806,19 +806,19 @@ test_that("head/tail", { big_df <- rbind(df1, df2) # No n provided (default is 6, all from one batch) - expect_equal(as.data.frame(head(ds)), head(df1)) - expect_equal(as.data.frame(tail(ds)), tail(df2)) + expect_data_frame(head(ds), head(df1)) + expect_data_frame(tail(ds), tail(df2)) # n = 0: have to drop `fct` because factor levels don't come through from # arrow when there are 0 rows zero_df <- big_df[FALSE, names(big_df) != "fct"] - expect_equal(as.data.frame(head(ds, 0))[, names(ds) != "fct"], zero_df) - expect_equal(as.data.frame(tail(ds, 0))[, names(ds) != "fct"], zero_df) + expect_data_frame(as.data.frame(head(ds, 0))[, names(ds) != "fct"], zero_df) + expect_data_frame(as.data.frame(tail(ds, 0))[, names(ds) != "fct"], zero_df) # Two more cases: more than 1 batch, and more than nrow for (n in c(12, 1000)) { - expect_equal(as.data.frame(head(ds, n)), head(big_df, n)) - expect_equal(as.data.frame(tail(ds, n)), tail(big_df, n)) + expect_data_frame(head(ds, n), head(big_df, n)) + expect_data_frame(tail(ds, n), tail(big_df, n)) } expect_error(head(ds, -1)) # Not yet implemented expect_error(tail(ds, -1)) # Not yet implemented @@ -864,18 +864,18 @@ test_that("unique()", { test_that("Dataset [ (take by index)", { ds <- open_dataset(dataset_dir) # Taking only from one file - expect_equal( - as.data.frame(ds[c(4, 5, 9), 3:4]), + expect_data_frame( + ds[c(4, 5, 9), 3:4], df1[c(4, 5, 9), 3:4] ) # Taking from more than one - expect_equal( - as.data.frame(ds[c(4, 5, 9, 12, 13), 3:4]), + expect_data_frame( + ds[c(4, 5, 9, 12, 13), 3:4], rbind(df1[c(4, 5, 9), 3:4], df2[2:3, 3:4]) ) # Taking out of order - expect_equal( - as.data.frame(ds[c(4, 13, 9, 12, 5), ]), + expect_data_frame( + ds[c(4, 13, 9, 12, 5), ], rbind( df1[4, ], df2[3, ], @@ -889,8 +889,8 @@ test_that("Dataset [ (take by index)", { ds2 <- ds %>% filter(int > 6) %>% select(int, lgl) - expect_equal( - as.data.frame(ds2[c(2, 5), ]), + expect_data_frame( + ds2[c(2, 5), ], rbind( df1[8, c("int", "lgl")], df2[1, c("int", "lgl")] @@ -957,7 +957,9 @@ test_that("Can delete filesystem dataset files after collection", { write_dataset(ds0, dataset_dir2) ds <- open_dataset(dataset_dir2) - collected <- ds %>% arrange(int) %>% collect() + collected <- ds %>% + arrange(int) %>% + collect() unlink(dataset_dir2, recursive = TRUE) expect_false(dir.exists(dataset_dir2)) @@ -971,7 +973,11 @@ test_that("Can delete filesystem dataset files after collection", { # dataset write_dataset(ds0, dataset_dir2) ds <- open_dataset(dataset_dir2) - collected <- ds %>% arrange(int) %>% head() %>% arrange(int) %>% collect() + collected <- ds %>% + arrange(int) %>% + head() %>% + arrange(int) %>% + collect() unlink(dataset_dir2, recursive = TRUE) expect_false(dir.exists(dataset_dir2)) @@ -985,11 +991,11 @@ test_that("Scanner$ScanBatches", { ds <- open_dataset(ipc_dir, format = "feather") batches <- ds$NewScan()$Finish()$ScanBatches() table <- Table$create(!!!batches) - expect_equal(as.data.frame(table), rbind(df1, df2)) + expect_data_frame(table, rbind(df1, df2)) batches <- ds$NewScan()$Finish()$ScanBatches() table <- Table$create(!!!batches) - expect_equal(as.data.frame(table), rbind(df1, df2)) + expect_data_frame(table, rbind(df1, df2)) }) test_that("Scanner$ToRecordBatchReader()", { @@ -1001,8 +1007,8 @@ test_that("Scanner$ToRecordBatchReader()", { Scanner$create() reader <- scan$ToRecordBatchReader() expect_r6_class(reader, "RecordBatchReader") - expect_identical( - as.data.frame(reader$read_table()), + expect_data_frame( + reader$read_table(), df1[df1$int > 6, c("int", "lgl")] ) }) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index d47f9232111..bb5e96df948 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -310,7 +310,7 @@ test_that("timestamp round trip correctly via strftime and strptime", { fmt2 <- paste(base_format2, fmt) fmt <- paste(base_format, paste0("%", fmt)) test_df <- tibble::tibble(x = strftime(times, format = fmt)) - expect_equal( + expect_data_frame( test_df %>% arrow_table() %>% mutate(!!fmt := strptime(x, format = fmt2)) %>% @@ -1028,7 +1028,7 @@ test_that("leap_year mirror lubridate", { .input %>% mutate(x = leap_year(test_year)) %>% collect(), - data.frame( + tibble::tibble( test_year = as.Date(c( "1998-01-01", # not leap year "1996-01-01", # leap year (divide by 4 rule) @@ -1048,7 +1048,9 @@ test_that("am/pm mirror lubridate", { am2 = lubridate::am(test_time), pm2 = lubridate::pm(test_time) ) %>% - collect(), + # can't use collect() here due to how tibbles store datetimes + # TODO: add better explanation above + as.data.frame(), data.frame( test_time = strptime( x = c( diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index e478d0e4c40..39a6ba792f3 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -119,14 +119,14 @@ test_that("collect(as_data_frame=FALSE)", { # collect(as_data_frame = FALSE) always returns Table now expect_r6_class(b2, "Table") expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] - expect_equal(as.data.frame(b2), expected) + expect_data_frame(b2, expected) b3 <- batch %>% select(int, strng = chr) %>% filter(int > 5) %>% collect(as_data_frame = FALSE) expect_r6_class(b3, "Table") - expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + expect_data_frame(b3, set_names(expected, c("int", "strng"))) b4 <- batch %>% select(int, strng = chr) %>% @@ -134,8 +134,8 @@ test_that("collect(as_data_frame=FALSE)", { group_by(int) %>% collect(as_data_frame = FALSE) expect_r6_class(b4, "Table") - expect_equal( - as.data.frame(b4), + expect_data_frame( + b4, expected %>% rename(strng = chr) %>% group_by(int) @@ -156,14 +156,14 @@ test_that("compute()", { expect_r6_class(b2, "Table") expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] - expect_equal(as.data.frame(b2), expected) + expect_data_frame(b2, expected) b3 <- batch %>% select(int, strng = chr) %>% filter(int > 5) %>% compute() expect_r6_class(b3, "Table") - expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + expect_data_frame(b3, set_names(expected, c("int", "strng"))) b4 <- batch %>% select(int, strng = chr) %>% @@ -171,8 +171,8 @@ test_that("compute()", { group_by(int) %>% compute() expect_r6_class(b4, "Table") - expect_equal( - as.data.frame(b4), + expect_data_frame( + b4, expected %>% rename(strng = chr) %>% group_by(int) @@ -210,8 +210,7 @@ test_that("arrange then head returns the right data (ARROW-14162)", { arrange(mpg, disp) %>% head(4) %>% collect(), - mtcars, - ignore_attr = "row.names" + tibble::as_tibble(mtcars) ) }) @@ -222,8 +221,7 @@ test_that("arrange then tail returns the right data", { arrange(mpg, disp) %>% tail(4) %>% collect(), - mtcars, - ignore_attr = "row.names" + tibble::as_tibble(mtcars) ) }) @@ -559,8 +557,8 @@ test_that("compute() on a grouped query returns a Table with groups in metadata" group_by(int) %>% compute() expect_r6_class(tab1, "Table") - expect_equal( - as.data.frame(tab1), + expect_data_frame( + tab1, tbl %>% group_by(int) ) diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R index 24e8cadf2e3..409e99b70fb 100644 --- a/r/tests/testthat/test-duckdb.R +++ b/r/tests/testthat/test-duckdb.R @@ -168,7 +168,8 @@ test_that("to_arrow roundtrip, with dataset", { filter(int > 5 & part > 1) %>% mutate(dbl_plus = dbl + 1) %>% collect() %>% - arrange(part, int) + arrange(part, int) %>% + as.data.frame() ) }) diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 4caadc27c4b..67ae8f941d6 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -177,7 +177,7 @@ test_that("feather read/write round trip", { tab1 <- read_feather(feather_file, as_data_frame = FALSE) expect_r6_class(tab1, "Table") - expect_equal(tib, as.data.frame(tab1)) + expect_data_frame(tib, tab1) }) test_that("Read feather from raw vector", { diff --git a/r/tests/testthat/test-na-omit.R b/r/tests/testthat/test-na-omit.R index c2d0fd1b71a..813bcb04855 100644 --- a/r/tests/testthat/test-na-omit.R +++ b/r/tests/testthat/test-na-omit.R @@ -42,8 +42,8 @@ test_that("na.fail on Array and ChunkedArray", { test_that("na.omit on Table", { tbl <- Table$create(example_data) - expect_equal( - as.data.frame(na.omit(tbl)), + expect_data_frame( + na.omit(tbl), na.omit(example_data), # We don't include an attribute with the rows omitted ignore_attr = "na.action" @@ -52,8 +52,8 @@ test_that("na.omit on Table", { test_that("na.exclude on Table", { tbl <- Table$create(example_data) - expect_equal( - as.data.frame(na.exclude(tbl)), + expect_data_frame( + na.exclude(tbl), na.exclude(example_data), ignore_attr = "na.action" ) @@ -66,8 +66,8 @@ test_that("na.fail on Table", { test_that("na.omit on RecordBatch", { batch <- record_batch(example_data) - expect_equal( - as.data.frame(na.omit(batch)), + expect_data_frame( + na.omit(batch), na.omit(example_data), ignore_attr = "na.action" ) @@ -75,8 +75,8 @@ test_that("na.omit on RecordBatch", { test_that("na.exclude on RecordBatch", { batch <- record_batch(example_data) - expect_equal( - as.data.frame(na.exclude(batch)), + expect_data_frame( + na.exclude(batch), na.omit(example_data), ignore_attr = "na.action" ) diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index 66f6db56d90..38bc33004fd 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -119,7 +119,7 @@ test_that("reading/writing a raw vector (sparklyr integration)", { } bytes <- write_to_raw(example_data) expect_type(bytes, "raw") - expect_identical(read_from_raw_test(bytes), example_data) + expect_data_frame(read_from_raw_test(bytes), example_data) # this could just be `read_ipc_stream(x)`; propose that - expect_identical(read_ipc_stream(bytes), example_data) + expect_data_frame(read_ipc_stream(bytes), example_data) }) From ab93ba5924d1b9fae90340ef8502d23295aeed81 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 21 Apr 2023 18:26:40 +0100 Subject: [PATCH 10/18] Rename function for clarity --- r/tests/testthat/helper-expectation.R | 2 +- r/tests/testthat/test-RecordBatch.R | 70 +++++++++--------- r/tests/testthat/test-Table.R | 76 ++++++++++---------- r/tests/testthat/test-compute-aggregate.R | 2 +- r/tests/testthat/test-compute-sort.R | 4 +- r/tests/testthat/test-dataset-csv.R | 2 +- r/tests/testthat/test-dataset.R | 28 ++++---- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- r/tests/testthat/test-dplyr-query.R | 14 ++-- r/tests/testthat/test-feather.R | 2 +- r/tests/testthat/test-na-omit.R | 8 +-- r/tests/testthat/test-read-write.R | 4 +- 12 files changed, 107 insertions(+), 107 deletions(-) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index b2611987085..1f4194a6c56 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -20,7 +20,7 @@ expect_as_vector <- function(x, y, ...) { } # expect both objects to contain equal values when converted to data.frame objects -expect_data_frame <- function(x, y, ...) { +expect_equal_data_frame <- function(x, y, ...) { expect_equal(as.data.frame(x), as.data.frame(y), ...) } diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 26d6acc6616..f29b75dbf40 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -89,7 +89,7 @@ test_that("RecordBatch", { schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8())) ) expect_equal(batch2$column(0), batch$column(1)) - expect_data_frame(batch2, tbl[, -1]) + expect_equal_data_frame(batch2, tbl[, -1]) # input validation expect_error(batch$RemoveColumn(NA), "'i' cannot be NA") @@ -109,10 +109,10 @@ test_that("RecordBatch S3 methods", { test_that("RecordBatch$Slice", { batch3 <- batch$Slice(5) - expect_data_frame(batch3, tbl[6:10, ]) + expect_equal_data_frame(batch3, tbl[6:10, ]) batch4 <- batch$Slice(5, 2) - expect_data_frame(batch4, tbl[6:7, ]) + expect_equal_data_frame(batch4, tbl[6:7, ]) # Input validation expect_error(batch$Slice("ten")) @@ -131,20 +131,20 @@ test_that("RecordBatch$Slice", { }) test_that("[ on RecordBatch", { - expect_data_frame(batch[6:7, ], tbl[6:7, ]) - expect_data_frame(batch[c(6, 7), ], tbl[6:7, ]) - expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4]) - expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_equal_data_frame(batch[6:7, ], tbl[6:7, ]) + expect_equal_data_frame(batch[c(6, 7), ], tbl[6:7, ]) + expect_equal_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4]) + expect_equal_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr) - expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) - expect_data_frame( + expect_equal_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_equal_data_frame( batch[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ] ) # bool Array - expect_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ]) + expect_equal_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ]) # int Array - expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + expect_equal_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) # input validation expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"') @@ -176,15 +176,15 @@ test_that("[[<- assignment", { # can remove a column batch[["chr"]] <- NULL - expect_data_frame(batch, tbl[-4]) + expect_equal_data_frame(batch, tbl[-4]) # can remove a column by index batch[[4]] <- NULL - expect_data_frame(batch, tbl[1:3]) + expect_equal_data_frame(batch, tbl[1:3]) # can add a named column batch[["new"]] <- letters[10:1] - expect_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) + expect_equal_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) # can replace a column by index batch[[2]] <- as.numeric(10:1) @@ -239,16 +239,16 @@ test_that("head and tail on RecordBatch", { fct = factor(letters[1:10]) ) batch <- RecordBatch$create(tbl) - expect_data_frame(head(batch), head(tbl)) - expect_data_frame(head(batch, 4), head(tbl, 4)) - expect_data_frame(head(batch, 40), head(tbl, 40)) - expect_data_frame(head(batch, -4), head(tbl, -4)) - expect_data_frame(head(batch, -40), head(tbl, -40)) - expect_data_frame(tail(batch), tail(tbl)) - expect_data_frame(tail(batch, 4), tail(tbl, 4)) - expect_data_frame(tail(batch, 40), tail(tbl, 40)) - expect_data_frame(tail(batch, -4), tail(tbl, -4)) - expect_data_frame(tail(batch, -40), tail(tbl, -40)) + expect_equal_data_frame(head(batch), head(tbl)) + expect_equal_data_frame(head(batch, 4), head(tbl, 4)) + expect_equal_data_frame(head(batch, 40), head(tbl, 40)) + expect_equal_data_frame(head(batch, -4), head(tbl, -4)) + expect_equal_data_frame(head(batch, -40), head(tbl, -40)) + expect_equal_data_frame(tail(batch), tail(tbl)) + expect_equal_data_frame(tail(batch, 4), tail(tbl, 4)) + expect_equal_data_frame(tail(batch, 40), tail(tbl, 40)) + expect_equal_data_frame(tail(batch, -4), tail(tbl, -4)) + expect_equal_data_frame(tail(batch, -40), tail(tbl, -40)) }) test_that("RecordBatch print method", { @@ -347,7 +347,7 @@ test_that("record_batch() handles data frame columns", { ) ) - expect_data_frame(batch, tibble::tibble(a = 1:10, b = tib)) + expect_equal_data_frame(batch, tibble::tibble(a = 1:10, b = tib)) # if not named, columns from tib are auto spliced batch2 <- record_batch(a = 1:10, tib) @@ -356,7 +356,7 @@ test_that("record_batch() handles data frame columns", { schema(a = int32(), x = int32(), y = int32()) ) - expect_data_frame(batch2, tibble::tibble(a = 1:10, !!!tib)) + expect_equal_data_frame(batch2, tibble::tibble(a = 1:10, !!!tib)) }) test_that("record_batch() handles data frame columns with schema spec", { @@ -366,7 +366,7 @@ test_that("record_batch() handles data frame columns with schema spec", { schema <- schema(a = int32(), b = struct(x = int16(), y = float64())) batch <- record_batch(a = 1:10, b = tib, schema = schema) expect_equal(batch$schema, schema) - expect_data_frame(batch, tibble::tibble(a = 1:10, b = tib_float)) + expect_equal_data_frame(batch, tibble::tibble(a = 1:10, b = tib_float)) schema <- schema(a = int32(), b = struct(x = int16(), y = utf8())) expect_error(record_batch(a = 1:10, b = tib, schema = schema)) @@ -378,13 +378,13 @@ test_that("record_batch() auto splices (ARROW-5718)", { batch2 <- record_batch(!!!df) expect_equal(batch1, batch2) expect_equal(batch1$schema, schema(x = int32(), y = utf8())) - expect_data_frame(batch1, df) + expect_equal_data_frame(batch1, df) batch3 <- record_batch(df, z = 1:10) batch4 <- record_batch(!!!df, z = 1:10) expect_equal(batch3, batch4) expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32())) - expect_data_frame( + expect_equal_data_frame( batch3, cbind(df, data.frame(z = 1:10)) ) @@ -394,14 +394,14 @@ test_that("record_batch() auto splices (ARROW-5718)", { batch6 <- record_batch(!!!df, schema = s) expect_equal(batch5, batch6) expect_equal(batch5$schema, s) - expect_data_frame(batch5, df) + expect_equal_data_frame(batch5, df) s2 <- schema(x = float64(), y = utf8(), z = int16()) batch7 <- record_batch(df, z = 1:10, schema = s2) batch8 <- record_batch(!!!df, z = 1:10, schema = s2) expect_equal(batch7, batch8) expect_equal(batch7$schema, s2) - expect_data_frame( + expect_equal_data_frame( batch7, cbind(df, data.frame(z = 1:10)) ) @@ -424,24 +424,24 @@ test_that("record_batch() handles null type (ARROW-7064)", { }) test_that("record_batch() scalar recycling with vectors", { - expect_data_frame( + expect_equal_data_frame( record_batch(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) ) }) test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArrays", { - expect_data_frame( + expect_equal_data_frame( record_batch(a = Array$create(1:10), b = Scalar$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_data_frame( + expect_equal_data_frame( record_batch(a = Array$create(1:10), b = Array$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_data_frame( + expect_equal_data_frame( record_batch(a = Array$create(1:10), b = ChunkedArray$create(5)), tibble::tibble(a = 1:10, b = 5) ) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 75e49f2139c..ce3254a158e 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -68,26 +68,26 @@ tab <- Table$create(tbl) test_that("[, [[, $ for Table", { expect_identical(names(tab), names(tbl)) - expect_data_frame(tab[6:7, ], tbl[6:7, ]) - expect_data_frame(tab[6:7, 2:4], tbl[6:7, 2:4]) - expect_data_frame(tab[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_equal_data_frame(tab[6:7, ], tbl[6:7, ]) + expect_equal_data_frame(tab[6:7, 2:4], tbl[6:7, 2:4]) + expect_equal_data_frame(tab[, c("dbl", "fct")], tbl[, c(2, 5)]) expect_as_vector(tab[, "chr", drop = TRUE], tbl$chr) # Take within a single chunk - expect_data_frame(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) - expect_data_frame(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ]) + expect_equal_data_frame(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_equal_data_frame(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ]) # bool ChunkedArray (with one chunk) - expect_data_frame(tab[tab$lgl, ], tbl[tbl$lgl, ]) + expect_equal_data_frame(tab[tab$lgl, ], tbl[tbl$lgl, ]) # ChunkedArray with multiple chunks c1 <- c(TRUE, FALSE, TRUE, TRUE, FALSE) c2 <- c(FALSE, FALSE, TRUE, TRUE, FALSE) ca <- ChunkedArray$create(c1, c2) - expect_data_frame(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ]) + expect_equal_data_frame(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ]) # int Array - expect_data_frame(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + expect_equal_data_frame(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4]) # ChunkedArray - expect_data_frame(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4]) + expect_equal_data_frame(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4]) # Expression - expect_data_frame(tab[tab$int > 6, ], tbl[tbl$int > 6, ]) + expect_equal_data_frame(tab[tab$int > 6, ], tbl[tbl$int > 6, ]) expect_as_vector(tab[["int"]], tbl$int) expect_as_vector(tab$int, tbl$int) @@ -95,9 +95,9 @@ test_that("[, [[, $ for Table", { expect_null(tab$qwerty) expect_null(tab[["asdf"]]) # List-like column slicing - expect_data_frame(tab[2:4], tbl[2:4]) - expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)]) - expect_data_frame(tab[-3], tbl[-3]) + expect_equal_data_frame(tab[2:4], tbl[2:4]) + expect_equal_data_frame(tab[c(2, 1)], tbl[c(2, 1)]) + expect_equal_data_frame(tab[-3], tbl[-3]) expect_error(tab[[c(4, 3)]]) expect_error(tab[[NA]], "'i' must be character or numeric, not logical") @@ -112,21 +112,21 @@ test_that("[, [[, $ for Table", { expect_error(tab[, c(6, NA)], "Column indices cannot be NA") skip("Table with 0 cols doesn't know how many rows it should have") - expect_data_frame(tab[0], tbl[0]) + expect_equal_data_frame(tab[0], tbl[0]) }) test_that("[[<- assignment", { # can remove a column tab[["chr"]] <- NULL - expect_data_frame(tab, tbl[-4]) + expect_equal_data_frame(tab, tbl[-4]) # can remove a column by index tab[[4]] <- NULL - expect_data_frame(tab, tbl[1:3]) + expect_equal_data_frame(tab, tbl[1:3]) # can add a named column tab[["new"]] <- letters[10:1] - expect_data_frame(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) + expect_equal_data_frame(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) # can replace a column by index tab[[2]] <- as.numeric(10:1) @@ -177,10 +177,10 @@ test_that("[[<- assignment", { test_that("Table$Slice", { tab2 <- tab$Slice(5) - expect_data_frame(tab2, tbl[6:10, ]) + expect_equal_data_frame(tab2, tbl[6:10, ]) tab3 <- tab$Slice(5, 2) - expect_data_frame(tab3, tbl[6:7, ]) + expect_equal_data_frame(tab3, tbl[6:7, ]) # Input validation expect_error(tab$Slice("ten")) @@ -199,16 +199,16 @@ test_that("Table$Slice", { }) test_that("head and tail on Table", { - expect_data_frame(head(tab), head(tbl)) - expect_data_frame(head(tab, 4), head(tbl, 4)) - expect_data_frame(head(tab, 40), head(tbl, 40)) - expect_data_frame(head(tab, -4), head(tbl, -4)) - expect_data_frame(head(tab, -40), head(tbl, -40)) - expect_data_frame(tail(tab), tail(tbl)) - expect_data_frame(tail(tab, 4), tail(tbl, 4)) - expect_data_frame(tail(tab, 40), tail(tbl, 40)) - expect_data_frame(tail(tab, -4), tail(tbl, -4)) - expect_data_frame(tail(tab, -40), tail(tbl, -40)) + expect_equal_data_frame(head(tab), head(tbl)) + expect_equal_data_frame(head(tab, 4), head(tbl, 4)) + expect_equal_data_frame(head(tab, 40), head(tbl, 40)) + expect_equal_data_frame(head(tab, -4), head(tbl, -4)) + expect_equal_data_frame(head(tab, -40), head(tbl, -40)) + expect_equal_data_frame(tail(tab), tail(tbl)) + expect_equal_data_frame(tail(tab, 4), tail(tbl, 4)) + expect_equal_data_frame(tail(tab, 40), tail(tbl, 40)) + expect_equal_data_frame(tail(tab, -4), tail(tbl, -4)) + expect_equal_data_frame(tail(tab, -40), tail(tbl, -40)) }) test_that("Table print method", { @@ -266,7 +266,7 @@ test_that("table() handles ... of arrays, chunked arrays, vectors", { schema(a = int32(), b = int32(), c = float64(), x = int32(), y = utf8()) ) - expect_data_frame( + expect_equal_data_frame( tab, tibble::tibble(a = 1:10, b = 1:10, c = v, x = 1:10, y = letters[1:10]) ) @@ -279,14 +279,14 @@ test_that("table() auto splices (ARROW-5718)", { tab2 <- Table$create(!!!df) expect_equal(tab1, tab2) expect_equal(tab1$schema, schema(x = int32(), y = utf8())) - expect_data_frame(tab1, df) + expect_equal_data_frame(tab1, df) s <- schema(x = float64(), y = utf8()) tab3 <- Table$create(df, schema = s) tab4 <- Table$create(!!!df, schema = s) expect_equal(tab3, tab4) expect_equal(tab3$schema, s) - expect_data_frame(tab3, df) + expect_equal_data_frame(tab3, df) }) test_that("Validation when creating table with schema (ARROW-10953)", { @@ -365,7 +365,7 @@ test_that("Can create table with specific dictionary types", { expect_equal(sch, tab$schema) if (i != int64()) { # TODO: same downcast to int32 as we do for int64() type elsewhere - expect_data_frame(tab, fact) + expect_equal_data_frame(tab, fact) } } }) @@ -379,7 +379,7 @@ test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { res <- tibble::tibble(f = factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))) tab <- Table$create(b1, b2, b3, b4) - expect_data_frame(tab, res) + expect_equal_data_frame(tab, res) }) test_that("Table$SelectColumns()", { @@ -409,24 +409,24 @@ test_that("Table$create() with different length columns", { }) test_that("Table$create() scalar recycling with vectors", { - expect_data_frame( + expect_equal_data_frame( Table$create(a = 1:10, b = 5), tibble::tibble(a = 1:10, b = 5) ) }) test_that("Table$create() scalar recycling with Scalars, Arrays, and ChunkedArrays", { - expect_data_frame( + expect_equal_data_frame( Table$create(a = Array$create(1:10), b = Scalar$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_data_frame( + expect_equal_data_frame( Table$create(a = Array$create(1:10), b = Array$create(5)), tibble::tibble(a = 1:10, b = 5) ) - expect_data_frame( + expect_equal_data_frame( Table$create(a = Array$create(1:10), b = ChunkedArray$create(5)), tibble::tibble(a = 1:10, b = 5) ) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 918b527d8ae..2732cdef3ed 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -377,7 +377,7 @@ test_that("value_counts", { type = struct(values = float64(), counts = int64()) ) expect_equal(value_counts(a), result) - expect_data_frame(value_counts(a), result_df) + expect_equal_data_frame(value_counts(a), result_df) expect_identical(as.vector(value_counts(a)$counts), result_df$counts) }) diff --git a/r/tests/testthat/test-compute-sort.R b/r/tests/testthat/test-compute-sort.R index 4f593e6d636..a80f6107781 100644 --- a/r/tests/testthat/test-compute-sort.R +++ b/r/tests/testthat/test-compute-sort.R @@ -140,7 +140,7 @@ test_that("Table$SortIndices()", { as.vector(x$Take(x$SortIndices("chr"))$chr), sort(tbl$chr, na.last = TRUE) ) - expect_data_frame( + expect_equal_data_frame( x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE))), tbl %>% arrange(int, dbl) ) @@ -148,7 +148,7 @@ test_that("Table$SortIndices()", { test_that("RecordBatch$SortIndices()", { x <- record_batch(tbl) - expect_data_frame( + expect_equal_data_frame( x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE)), tbl %>% arrange(desc(chr), desc(int), desc(dbl)) ) diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 330b2d5ec98..98858a7d166 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -91,7 +91,7 @@ test_that("CSV scan options", { sb$FragmentScanOptions(options) tab <- sb$Finish()$ToTable() - expect_data_frame(tab, data.frame(chr = c("foo", NA))) + expect_equal_data_frame(tab, data.frame(chr = c("foo", NA))) # Set default convert options in CsvFileFormat csv_format <- CsvFileFormat$create( diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 23d70fd8e12..b9972901a70 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -85,7 +85,7 @@ expect_scan_result <- function(ds, schm) { tab <- scn$ToTable() expect_r6_class(tab, "Table") - expect_data_frame( + expect_equal_data_frame( tab, df1[8, c("chr", "lgl")] ) @@ -806,19 +806,19 @@ test_that("head/tail", { big_df <- rbind(df1, df2) # No n provided (default is 6, all from one batch) - expect_data_frame(head(ds), head(df1)) - expect_data_frame(tail(ds), tail(df2)) + expect_equal_data_frame(head(ds), head(df1)) + expect_equal_data_frame(tail(ds), tail(df2)) # n = 0: have to drop `fct` because factor levels don't come through from # arrow when there are 0 rows zero_df <- big_df[FALSE, names(big_df) != "fct"] - expect_data_frame(as.data.frame(head(ds, 0))[, names(ds) != "fct"], zero_df) - expect_data_frame(as.data.frame(tail(ds, 0))[, names(ds) != "fct"], zero_df) + expect_equal_data_frame(as.data.frame(head(ds, 0))[, names(ds) != "fct"], zero_df) + expect_equal_data_frame(as.data.frame(tail(ds, 0))[, names(ds) != "fct"], zero_df) # Two more cases: more than 1 batch, and more than nrow for (n in c(12, 1000)) { - expect_data_frame(head(ds, n), head(big_df, n)) - expect_data_frame(tail(ds, n), tail(big_df, n)) + expect_equal_data_frame(head(ds, n), head(big_df, n)) + expect_equal_data_frame(tail(ds, n), tail(big_df, n)) } expect_error(head(ds, -1)) # Not yet implemented expect_error(tail(ds, -1)) # Not yet implemented @@ -864,17 +864,17 @@ test_that("unique()", { test_that("Dataset [ (take by index)", { ds <- open_dataset(dataset_dir) # Taking only from one file - expect_data_frame( + expect_equal_data_frame( ds[c(4, 5, 9), 3:4], df1[c(4, 5, 9), 3:4] ) # Taking from more than one - expect_data_frame( + expect_equal_data_frame( ds[c(4, 5, 9, 12, 13), 3:4], rbind(df1[c(4, 5, 9), 3:4], df2[2:3, 3:4]) ) # Taking out of order - expect_data_frame( + expect_equal_data_frame( ds[c(4, 13, 9, 12, 5), ], rbind( df1[4, ], @@ -889,7 +889,7 @@ test_that("Dataset [ (take by index)", { ds2 <- ds %>% filter(int > 6) %>% select(int, lgl) - expect_data_frame( + expect_equal_data_frame( ds2[c(2, 5), ], rbind( df1[8, c("int", "lgl")], @@ -991,11 +991,11 @@ test_that("Scanner$ScanBatches", { ds <- open_dataset(ipc_dir, format = "feather") batches <- ds$NewScan()$Finish()$ScanBatches() table <- Table$create(!!!batches) - expect_data_frame(table, rbind(df1, df2)) + expect_equal_data_frame(table, rbind(df1, df2)) batches <- ds$NewScan()$Finish()$ScanBatches() table <- Table$create(!!!batches) - expect_data_frame(table, rbind(df1, df2)) + expect_equal_data_frame(table, rbind(df1, df2)) }) test_that("Scanner$ToRecordBatchReader()", { @@ -1007,7 +1007,7 @@ test_that("Scanner$ToRecordBatchReader()", { Scanner$create() reader <- scan$ToRecordBatchReader() expect_r6_class(reader, "RecordBatchReader") - expect_data_frame( + expect_equal_data_frame( reader$read_table(), df1[df1$int > 6, c("int", "lgl")] ) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index bb5e96df948..d59356ad659 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -310,7 +310,7 @@ test_that("timestamp round trip correctly via strftime and strptime", { fmt2 <- paste(base_format2, fmt) fmt <- paste(base_format, paste0("%", fmt)) test_df <- tibble::tibble(x = strftime(times, format = fmt)) - expect_data_frame( + expect_equal_data_frame( test_df %>% arrow_table() %>% mutate(!!fmt := strptime(x, format = fmt2)) %>% diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 39a6ba792f3..bab81a463e9 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -119,14 +119,14 @@ test_that("collect(as_data_frame=FALSE)", { # collect(as_data_frame = FALSE) always returns Table now expect_r6_class(b2, "Table") expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] - expect_data_frame(b2, expected) + expect_equal_data_frame(b2, expected) b3 <- batch %>% select(int, strng = chr) %>% filter(int > 5) %>% collect(as_data_frame = FALSE) expect_r6_class(b3, "Table") - expect_data_frame(b3, set_names(expected, c("int", "strng"))) + expect_equal_data_frame(b3, set_names(expected, c("int", "strng"))) b4 <- batch %>% select(int, strng = chr) %>% @@ -134,7 +134,7 @@ test_that("collect(as_data_frame=FALSE)", { group_by(int) %>% collect(as_data_frame = FALSE) expect_r6_class(b4, "Table") - expect_data_frame( + expect_equal_data_frame( b4, expected %>% rename(strng = chr) %>% @@ -156,14 +156,14 @@ test_that("compute()", { expect_r6_class(b2, "Table") expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] - expect_data_frame(b2, expected) + expect_equal_data_frame(b2, expected) b3 <- batch %>% select(int, strng = chr) %>% filter(int > 5) %>% compute() expect_r6_class(b3, "Table") - expect_data_frame(b3, set_names(expected, c("int", "strng"))) + expect_equal_data_frame(b3, set_names(expected, c("int", "strng"))) b4 <- batch %>% select(int, strng = chr) %>% @@ -171,7 +171,7 @@ test_that("compute()", { group_by(int) %>% compute() expect_r6_class(b4, "Table") - expect_data_frame( + expect_equal_data_frame( b4, expected %>% rename(strng = chr) %>% @@ -557,7 +557,7 @@ test_that("compute() on a grouped query returns a Table with groups in metadata" group_by(int) %>% compute() expect_r6_class(tab1, "Table") - expect_data_frame( + expect_equal_data_frame( tab1, tbl %>% group_by(int) diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 67ae8f941d6..188a562fe81 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -177,7 +177,7 @@ test_that("feather read/write round trip", { tab1 <- read_feather(feather_file, as_data_frame = FALSE) expect_r6_class(tab1, "Table") - expect_data_frame(tib, tab1) + expect_equal_data_frame(tib, tab1) }) test_that("Read feather from raw vector", { diff --git a/r/tests/testthat/test-na-omit.R b/r/tests/testthat/test-na-omit.R index 813bcb04855..cfc71445d44 100644 --- a/r/tests/testthat/test-na-omit.R +++ b/r/tests/testthat/test-na-omit.R @@ -42,7 +42,7 @@ test_that("na.fail on Array and ChunkedArray", { test_that("na.omit on Table", { tbl <- Table$create(example_data) - expect_data_frame( + expect_equal_data_frame( na.omit(tbl), na.omit(example_data), # We don't include an attribute with the rows omitted @@ -52,7 +52,7 @@ test_that("na.omit on Table", { test_that("na.exclude on Table", { tbl <- Table$create(example_data) - expect_data_frame( + expect_equal_data_frame( na.exclude(tbl), na.exclude(example_data), ignore_attr = "na.action" @@ -66,7 +66,7 @@ test_that("na.fail on Table", { test_that("na.omit on RecordBatch", { batch <- record_batch(example_data) - expect_data_frame( + expect_equal_data_frame( na.omit(batch), na.omit(example_data), ignore_attr = "na.action" @@ -75,7 +75,7 @@ test_that("na.omit on RecordBatch", { test_that("na.exclude on RecordBatch", { batch <- record_batch(example_data) - expect_data_frame( + expect_equal_data_frame( na.exclude(batch), na.omit(example_data), ignore_attr = "na.action" diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index 38bc33004fd..9475788ee5a 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -119,7 +119,7 @@ test_that("reading/writing a raw vector (sparklyr integration)", { } bytes <- write_to_raw(example_data) expect_type(bytes, "raw") - expect_data_frame(read_from_raw_test(bytes), example_data) + expect_equal_data_frame(read_from_raw_test(bytes), example_data) # this could just be `read_ipc_stream(x)`; propose that - expect_data_frame(read_ipc_stream(bytes), example_data) + expect_equal_data_frame(read_ipc_stream(bytes), example_data) }) From 33e7c6fa1a50d847748e6e255f9bef3d1dbf7693 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 21 Apr 2023 18:34:34 +0100 Subject: [PATCH 11/18] Update test-utf to use expect_equal_data_frame --- r/tests/testthat/test-utf.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/r/tests/testthat/test-utf.R b/r/tests/testthat/test-utf.R index f7553da5b4a..660b2a4784f 100644 --- a/r/tests/testthat/test-utf.R +++ b/r/tests/testthat/test-utf.R @@ -45,24 +45,24 @@ test_that("We handle non-UTF strings", { expect_identical(as.vector(ChunkedArray$create(df)), df) # Table (including field name) - expect_identical(as.data.frame(Table$create(df)), df) - expect_identical(as.data.frame(Table$create(df_struct)), df_struct) + expect_equal_data_frame(Table$create(df), df) + expect_equal_data_frame(Table$create(df_struct), df_struct) # RecordBatch - expect_identical(as.data.frame(record_batch(df)), df) - expect_identical(as.data.frame(record_batch(df_struct)), df_struct) + expect_equal_data_frame(record_batch(df), df) + expect_equal_data_frame(record_batch(df_struct), df_struct) # Schema field name - df_schema <- do.call(schema, raw_schema) + df_schema <- schema(raw_schema) expect_identical(names(df_schema), names(df)) df_struct_schema <- schema(a = do.call(struct, raw_schema)) # Create table/batch with schema - expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df) - expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct) - expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df) - expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct) + expect_equal_data_frame(Table$create(df, schema = df_schema), df) + expect_equal_data_frame(Table$create(df_struct, schema = df_struct_schema), df_struct) + expect_equal_data_frame(record_batch(df, schema = df_schema), df) + expect_equal_data_frame(record_batch(df_struct, schema = df_struct_schema), df_struct) # Serialization feather_file <- tempfile() From 4e0562fb3406a9f761b2a92b9a054d325c4b0307 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 21 Apr 2023 19:07:41 +0100 Subject: [PATCH 12/18] Only remove metadata if it is definitely default metadata --- r/R/metadata.R | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index 69b3982bcec..4b340876996 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -22,7 +22,9 @@ # drop problems attributes (most likely from readr) x[["attributes"]][["problems"]] <- NULL - x <- remove_default_df_metadata(x) + if (has_default_df_metadata(x)) { + x <- NULL + } out <- serialize(x, NULL, ascii = TRUE) @@ -226,26 +228,8 @@ get_r_metadata_from_old_schema <- function(new_schema, old_schema) { r_meta } -remove_default_df_metadata <- function(x) { - # if the column attributes are NULL (the default), drop them - if (all(vapply(x$columns, is.null, logical(1)))) { - x$columns <- NULL - } - - # don't need to preserve data.frame on roundtrip - if (identical(x$attributes$class, "data.frame")) { - x$attributes$class <- NULL - - # if there are no other attributes, remove this entirely - if (is_empty(x$attributes)) { - x$attributes <- NULL - } - } - - # if there are no elements left in x, set to NULL - if(is_empty(x)){ - x <- NULL - } - - x +has_default_df_metadata <- function(x){ + identical(names(x), c("attributes", "columns")) && + all(vapply(x$columns, is.null, logical(1))) && + identical(x$attributes$class, "data.frame") } From 652952ca955e28d1acf2d308a72d0a10b2ed8c7e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 21 Apr 2023 19:08:10 +0100 Subject: [PATCH 13/18] Update schema test so it uses an object where even default metadata is restored --- r/tests/testthat/test-schema.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R index 24776e6d0c1..d26b2d275fd 100644 --- a/r/tests/testthat/test-schema.R +++ b/r/tests/testthat/test-schema.R @@ -271,6 +271,7 @@ test_that("schema name assignment", { # Test that R metadata is updated appropriately df <- data.frame(x = 1:3, y = c("a", "b", "c")) + class(df) <- c("data.frame", "special_df") schm2 <- arrow_table(df)$schema names(schm2) <- c("col1", "col2") expect_identical(names(schm2), c("col1", "col2")) From e52d3cfa7c80a80ae2879990264d3697627eb942 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 24 Apr 2023 09:15:08 +0100 Subject: [PATCH 14/18] Remove expect_metadata function --- r/tests/testthat/helper-expectation.R | 4 ---- r/tests/testthat/test-metadata.R | 9 ++++----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 1f4194a6c56..090ed36aa7f 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -24,10 +24,6 @@ expect_equal_data_frame <- function(x, y, ...) { expect_equal(as.data.frame(x), as.data.frame(y), ...) } -expect_metadata <- function(x, y, ...) { - expect_equal(as.data.frame(collect.ArrowTabular(x)), as.data.frame(y), ...) -} - expect_r6_class <- function(object, class) { expect_s3_class(object, class) expect_s3_class(object, "R6") diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index e9ffcd53f78..159228ce83f 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -57,7 +57,7 @@ test_that("Table R metadata", { "$r$columns$c$columns$c1$attributes$extra_attr", fixed = TRUE ) - expect_metadata(tab, example_with_metadata) + expect_equal_data_frame(tab, example_with_metadata) }) test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", { @@ -164,7 +164,7 @@ test_that("RecordBatch metadata", { }) test_that("RecordBatch R metadata", { - expect_metadata(record_batch(example_with_metadata), example_with_metadata) + expect_equal_data_frame(record_batch(example_with_metadata), example_with_metadata) }) test_that("R metadata roundtrip via parquet", { @@ -195,7 +195,7 @@ test_that("haven types roundtrip via feather", { test_that("Date/time type roundtrip", { rb <- record_batch(example_with_times) expect_r6_class(rb$schema$posixlt$type, "VctrsExtensionType") - expect_metadata(rb, example_with_times) + expect_equal_data_frame(rb, example_with_times) }) test_that("metadata keeps attribute of top level data frame", { @@ -398,8 +398,7 @@ test_that("Only non-default metadata is saved", { expect_null(df_arrow$r_metadata) df <- data.frame(x = 1:5) - attributes(df)$foo = "bar" + attributes(df)$foo <- "bar" df_arrow <- arrow_table(df) expect_identical(df_arrow$r_metadata, list(attributes = list(foo = "bar"))) - }) From a1ace5f480b1f80bdf25d2f773618cac756f1e01 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 25 Apr 2023 12:35:09 +0100 Subject: [PATCH 15/18] Update Python tests to compare dfs --- r/tests/testthat/test-python-flight.R | 6 +++--- r/tests/testthat/test-python.R | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-python-flight.R b/r/tests/testthat/test-python-flight.R index d2b6fd491e1..2e60957d2ff 100644 --- a/r/tests/testthat/test-python-flight.R +++ b/r/tests/testthat/test-python-flight.R @@ -53,13 +53,13 @@ if (process_is_running("demo_flight_server")) { }) test_that("flight_get", { - expect_identical(as.data.frame(flight_get(client, flight_obj)), example_data) + expect_equal_data_frame(flight_get(client, flight_obj), example_data) }) test_that("flight_put with RecordBatch", { flight_obj2 <- tempfile() flight_put(client, RecordBatch$create(example_data), path = flight_obj2) - expect_identical(as.data.frame(flight_get(client, flight_obj2)), example_data) + expect_equal_data_frame(flight_get(client, flight_obj2), example_data) }) test_that("flight_put with overwrite = FALSE", { @@ -69,7 +69,7 @@ if (process_is_running("demo_flight_server")) { ) # Default is TRUE so this will overwrite flight_put(client, example_with_times, path = flight_obj) - expect_identical(as.data.frame(flight_get(client, flight_obj)), example_with_times) + expect_equal_data_frame(flight_get(client, flight_obj), example_with_times) }) test_that("flight_disconnect", { diff --git a/r/tests/testthat/test-python.R b/r/tests/testthat/test-python.R index 968d72119c5..58eb84f5d65 100644 --- a/r/tests/testthat/test-python.R +++ b/r/tests/testthat/test-python.R @@ -108,7 +108,7 @@ test_that("RecordBatch with metadata roundtrip", { expect_identical(rbatch$metadata, batch$metadata) expect_equal(rbatch$a, batch$a) expect_equal(rbatch[c("b", "c", "d")], batch[c("b", "c", "d")]) - expect_identical(as.data.frame(rbatch), example_with_metadata) + expect_equal_data_frame(rbatch, example_with_metadata) }) test_that("Table with metadata roundtrip", { @@ -123,7 +123,7 @@ test_that("Table with metadata roundtrip", { expect_identical(rtab$metadata, tab$metadata) expect_equal(rtab$a, tab$a) expect_equal(rtab[c("b", "c", "d")], tab[c("b", "c", "d")]) - expect_identical(as.data.frame(rtab), example_with_metadata) + expect_equal_data_frame(rtab, example_with_metadata) }) test_that("DataType roundtrip", { @@ -160,8 +160,8 @@ test_that("RecordBatchReader to python", { expect_s3_class(pytab, "pyarrow.lib.Table") back_to_r <- reticulate::py_to_r(pytab) expect_r6_class(back_to_r, "Table") - expect_identical( - as.data.frame(back_to_r), + expect_equal_data_frame( + back_to_r, example_data %>% select(int, lgl) %>% filter(int > 6) @@ -178,7 +178,7 @@ test_that("RecordBatchReader from python", { back_to_r <- reticulate::py_to_r(pyreader) rt_table <- back_to_r$read_table() expect_r6_class(rt_table, "Table") - expect_identical(as.data.frame(rt_table), example_data) + expect_equal_data_frame(rt_table, example_data) scan <- Scanner$create(tab) reader <- scan$ToRecordBatchReader() From e9d21eee6c955621b513ceef1ff17c1aa4270677 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 25 Apr 2023 13:07:07 +0100 Subject: [PATCH 16/18] Remove default metadata and nothing else --- r/R/metadata.R | 29 ++++++++++++++++++++++------- r/tests/testthat/test-metadata.R | 2 +- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index 4b340876996..8bf924d79d8 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -22,9 +22,7 @@ # drop problems attributes (most likely from readr) x[["attributes"]][["problems"]] <- NULL - if (has_default_df_metadata(x)) { - x <- NULL - } + x <- remove_default_df_metadata(x) out <- serialize(x, NULL, ascii = TRUE) @@ -228,8 +226,25 @@ get_r_metadata_from_old_schema <- function(new_schema, old_schema) { r_meta } -has_default_df_metadata <- function(x){ - identical(names(x), c("attributes", "columns")) && - all(vapply(x$columns, is.null, logical(1))) && - identical(x$attributes$class, "data.frame") +remove_default_df_metadata <- function(x){ + + # remove the class if it's just data.frame + if (identical(x$attributes$class, "data.frame")) { + x$attributes <- x$attributes[names(x$attributes) != "class"] + if (is_empty(x$attributes)) { + x$attributes <- NULL + } + } + + # remove columns attributes if all NULL + if (all(map_lgl(x$columns, is.null))) { + x$columns <- NULL + } + + if (is_empty(x)) { + x <- NULL + } + + x + } diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 159228ce83f..3972cf05322 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -202,7 +202,7 @@ test_that("metadata keeps attribute of top level data frame", { df <- structure(data.frame(x = 1, y = 2), foo = "bar") tab <- Table$create(df) expect_identical(attr(as.data.frame(tab), "foo"), "bar") - expect_identical(as.data.frame(tab), df) + expect_equal_data_frame(tab, df) }) From 5761ffa905345a2ed7cda07a491791ff79a326ae Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 25 Apr 2023 13:38:17 +0100 Subject: [PATCH 17/18] Update code to only remove class --- r/R/metadata.R | 31 +++++++------------------------ r/tests/testthat/test-metadata.R | 6 +++--- r/tests/testthat/test-schema.R | 1 - 3 files changed, 10 insertions(+), 28 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index 8bf924d79d8..3ae2db4eaa7 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -22,7 +22,13 @@ # drop problems attributes (most likely from readr) x[["attributes"]][["problems"]] <- NULL - x <- remove_default_df_metadata(x) + # remove the class if it's just data.frame + if (identical(x$attributes$class, "data.frame")) { + x$attributes <- x$attributes[names(x$attributes) != "class"] + if (is_empty(x$attributes)) { + x <- x[names(x) != "attributes"] + } + } out <- serialize(x, NULL, ascii = TRUE) @@ -225,26 +231,3 @@ get_r_metadata_from_old_schema <- function(new_schema, old_schema) { } r_meta } - -remove_default_df_metadata <- function(x){ - - # remove the class if it's just data.frame - if (identical(x$attributes$class, "data.frame")) { - x$attributes <- x$attributes[names(x$attributes) != "class"] - if (is_empty(x$attributes)) { - x$attributes <- NULL - } - } - - # remove columns attributes if all NULL - if (all(map_lgl(x$columns, is.null))) { - x$columns <- NULL - } - - if (is_empty(x)) { - x <- NULL - } - - x - -} diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 3972cf05322..e44cd710380 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -392,13 +392,13 @@ test_that("grouped_df non-arrow metadata is preserved", { ) }) -test_that("Only non-default metadata is saved", { +test_that("data.frame class attribute is not saved", { df <- data.frame(x = 1:5) df_arrow <- arrow_table(df) - expect_null(df_arrow$r_metadata) + expect_null(df_arrow$r_metadata$attributes) df <- data.frame(x = 1:5) attributes(df)$foo <- "bar" df_arrow <- arrow_table(df) - expect_identical(df_arrow$r_metadata, list(attributes = list(foo = "bar"))) + expect_identical(df_arrow$r_metadata, list(attributes = list(foo = "bar"), columns = list(x = NULL))) }) diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R index d26b2d275fd..24776e6d0c1 100644 --- a/r/tests/testthat/test-schema.R +++ b/r/tests/testthat/test-schema.R @@ -271,7 +271,6 @@ test_that("schema name assignment", { # Test that R metadata is updated appropriately df <- data.frame(x = 1:3, y = c("a", "b", "c")) - class(df) <- c("data.frame", "special_df") schm2 <- arrow_table(df)$schema names(schm2) <- c("col1", "col2") expect_identical(names(schm2), c("col1", "col2")) From 090b5de72fa5b6da472b7d015c7502be77391a38 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 26 Apr 2023 12:35:15 +0100 Subject: [PATCH 18/18] Relocate collect.StructArray --- r/NAMESPACE | 1 - r/R/array.R | 5 ----- r/R/dplyr-collect.R | 4 ++++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index b1a439545b2..7a8efe0ca30 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -293,7 +293,6 @@ export(call_function) export(cast_options) export(chunked_array) export(codec_is_available) -export(collect.StructArray) export(concat_arrays) export(concat_tables) export(contains) diff --git a/r/R/array.R b/r/R/array.R index 906e71617fe..3e9e0eae1ab 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -477,11 +477,6 @@ as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ... as.data.frame(collect.StructArray(x), row.names = row.names, optional = optional, ...) } -#' @export -collect.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { - as.vector(x) -} - #' @rdname array #' @usage NULL #' @format NULL diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 5ac670c6c49..970722e86a5 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -35,6 +35,10 @@ collect.Dataset <- function(x, as_data_frame = TRUE, ...) { } collect.RecordBatchReader <- collect.Dataset +collect.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { + as.vector(x) +} + compute.ArrowTabular <- function(x, ...) x compute.arrow_dplyr_query <- function(x, ...) { # TODO: should this tryCatch move down into as_arrow_table()?