From ee1efa6589eecea5ec7ba4609cae700afbf04fb1 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 4 Jan 2021 10:33:43 -0600 Subject: [PATCH 1/2] Proactively remove "problems" attributes --- r/R/record-batch.R | 6 ++++++ r/tests/testthat/test-metadata.R | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 331a7a77253..711999e476a 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -274,6 +274,12 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ... } .serialize_arrow_r_metadata <- function(x) { + # drop problems attributes (most likely from readr) + if ("attributes" %in% names(x) && + "problems" %in% names(x[["attributes"]]) ) { + x[["attributes"]][["problems"]] <- NULL + } + rawToChar(serialize(x, NULL, ascii = TRUE)) } diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 1cd6fbc4599..b76fe44ef04 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -134,3 +134,23 @@ test_that("metadata keeps attribute of top level data frame", { expect_identical(attr(as.data.frame(tab), "foo"), "bar") expect_identical(as.data.frame(tab), df) }) + +test_that("metadata drops readr's problems attribute", { + readr_like <- tibble::tibble( + dbl = 1.1, + not_here = NA_character_ + ) + attributes(readr_like) <- append( + attributes(readr_like), + list(problems = tibble::tibble( + row = 1L, + col = NA_character_, + expected = "2 columns", + actual = "1 columns", + file = "'test'" + )) + ) + + tab <- Table$create(readr_like) + expect_null(attr(as.data.frame(tab), "problems")) +}) From 94456b4744b38345b42e7b548a1ef5529ce4e6f1 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 4 Jan 2021 15:05:16 -0600 Subject: [PATCH 2/2] =?UTF-8?q?PR=20comments,=20=F0=9F=93=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- r/NEWS.md | 6 ++++-- r/R/record-batch.R | 7 +++---- r/tests/testthat/test-metadata.R | 4 +++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 66f37bbe561..ebf80ee1d81 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -29,10 +29,11 @@ ## Enhancements -* Table columns can now be added, replaced, or removed by assigning `<-` with either `$` or `[[` +* Table columns can now be added, replaced, or removed by assigning (`<-`) with either `$` or `[[` * Column names of Tables and RecordBatches can be renamed by assigning `names()` * Large string types can now be written to Parquet files -* The [pronouns `.data` and `.env`](https://rlang.r-lib.org/reference/tidyeval-data.html) are now fully supported in Arrow-dplyr pipelines. +* The [pronouns `.data` and `.env`](https://rlang.r-lib.org/reference/tidyeval-data.html) are now fully supported in Arrow `dplyr` pipelines. +* Option `arrow.skip_nul` (default `FALSE`, as in `base::scan()`) allows conversion of Arrow string (`utf8()`) type data containing embedded nul `\0` characters to R. If set to `TRUE`, nuls will be stripped and a warning is emitted if any are found. ## Bug fixes @@ -40,6 +41,7 @@ * C++ functions now trigger garbage collection when needed * `write_parquet()` can now write RecordBatches * Reading a Table from a RecordBatchStreamReader containing 0 batches no longer crashes +* `readr`'s `problems` attribute is removed when converting to Arrow RecordBatch and table to prevent large amounts of metadata from accumulating inadvertently [ARROW-10624](https://issues.apache.org/jira/browse/ARROW-10624) ## Packaging and installation diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 711999e476a..ef42c8de7fb 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -274,11 +274,10 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ... } .serialize_arrow_r_metadata <- function(x) { + assert_is(x, "list") + # drop problems attributes (most likely from readr) - if ("attributes" %in% names(x) && - "problems" %in% names(x[["attributes"]]) ) { - x[["attributes"]][["problems"]] <- NULL - } + x[["attributes"]][["problems"]] <- NULL rawToChar(serialize(x, NULL, ascii = TRUE)) } diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index b76fe44ef04..53ee4279b85 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -73,7 +73,9 @@ test_that("Garbage R metadata doesn't break things", { "Invalid metadata$r", fixed = TRUE ) - tab$metadata$r <- .serialize_arrow_r_metadata("garbage") + # serialize data like .serialize_arrow_r_metadata does, but don't call that + # directly since it checks to ensure that the data is a list + tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE)) expect_warning( expect_identical(as.data.frame(tab), example_data[1:6]), "Invalid metadata$r",