diff --git a/r/NAMESPACE b/r/NAMESPACE index 9ce89ca1f1c..fdc84aa5189 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -332,6 +332,7 @@ importFrom(tidyselect,vars_select) importFrom(utils,head) importFrom(utils,install.packages) importFrom(utils,modifyList) +importFrom(utils,object.size) importFrom(utils,packageVersion) importFrom(utils,tail) importFrom(vctrs,s3_register) diff --git a/r/NEWS.md b/r/NEWS.md index 40a943c39ff..521af3eacd4 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -37,6 +37,8 @@ * Option `arrow.skip_nul` (default `FALSE`, as in `base::scan()`) allows conversion of Arrow string (`utf8()`) type data containing embedded nul `\0` characters to R. If set to `TRUE`, nuls will be stripped and a warning is emitted if any are found. * `arrow_info()` for an overview of various run-time and build-time Arrow configurations, useful for debugging * Set environment variable `ARROW_DEFAULT_MEMORY_POOL` before loading the Arrow package to change memory allocators. Windows packages are built with `mimalloc`; most others have `jemalloc`. These are used by default if they were built, and they're generally much faster than the system malloc, but sometimes it is useful to turn them off for debugging purposes. To disable them, set `ARROW_DEFAULT_MEMORY_POOL=system`. +* List columns that have attributes on each element are now also included with the metadata that is saved when creating Arrow tables. This allows `sf` tibbles to faithfully preserved and roundtripped (ARROW-10386)[https://issues.apache.org/jira/browse/ARROW-10386]. +* R metadata that exceeds 100Kb is now compressed before being written to a table; see `schema()` for more details. ## Bug fixes diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 8743037f5d3..540cbcd8645 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -147,6 +147,10 @@ print.arrow_info <- function(x, ...) { invisible(x) } +option_compress_metadata <- function() { + !is_false(getOption("arrow.compress_metadata")) +} + #' @include enums.R ArrowObject <- R6Class("ArrowObject", public = list( diff --git a/r/R/feather.R b/r/R/feather.R index 6d29b7d0b89..5aaf340c6db 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -44,6 +44,7 @@ #' the stream will be left open. #' @export #' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data. +#' @seealso [Schema] for information about schemas and metadata handling. #' @examples #' \donttest{ #' tf <- tempfile() diff --git a/r/R/parquet.R b/r/R/parquet.R index ccf87c2f511..4fe321666af 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -275,6 +275,7 @@ make_valid_version <- function(version, valid_versions = valid_parquet_version) #' "snappy" for the `compression` argument. #' #' @seealso [write_parquet] +#' @seealso [Schema] for information about schemas and metadata handling. #' #' @export ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = ArrowObject) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index ef42c8de7fb..6b89c01408c 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -66,7 +66,7 @@ #' - `$schema` #' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list. #' Modify or replace by assigning in (`batch$metadata <- new_metadata`). -#' All list elements are coerced to string. +#' All list elements are coerced to string. See `schema()` for more information. #' - `$columns`: Returns a list of `Array`s #' @rdname RecordBatch #' @name RecordBatch @@ -273,17 +273,36 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ... df } +#' @importFrom utils object.size .serialize_arrow_r_metadata <- function(x) { assert_is(x, "list") # drop problems attributes (most likely from readr) x[["attributes"]][["problems"]] <- NULL - rawToChar(serialize(x, NULL, ascii = TRUE)) + out <- serialize(x, NULL, ascii = TRUE) + + # if the metadata is over 100 kB, compress + if (option_compress_metadata() && object.size(out) > 100000) { + out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE) + + # but ensure that the compression+serialization is effective. + if (object.size(out) > object.size(out_comp)) out <- out_comp + } + + rawToChar(out) } .unserialize_arrow_r_metadata <- function(x) { - tryCatch(unserialize(charToRaw(x)), error = function(e) { + tryCatch({ + out <- unserialize(charToRaw(x)) + + # if this is still raw, try decompressing + if (is.raw(out)) { + out <- unserialize(memDecompress(out, type = "gzip")) + } + out + }, error = function(e) { warning("Invalid metadata$r", call. = FALSE) NULL }) @@ -291,6 +310,20 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ... apply_arrow_r_metadata <- function(x, r_metadata) { tryCatch({ + columns_metadata <- r_metadata$columns + if (is.data.frame(x)) { + if (length(names(x)) && !is.null(columns_metadata)) { + for (name in intersect(names(columns_metadata), names(x))) { + x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) + } + } + } else if(is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { + x <- map2(x, columns_metadata, function(.x, .y) { + apply_arrow_r_metadata(.x, .y) + }) + x + } + if (!is.null(r_metadata$attributes)) { attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes if (inherits(x, "POSIXlt")) { @@ -302,12 +335,6 @@ apply_arrow_r_metadata <- function(x, r_metadata) { } } - columns_metadata <- r_metadata$columns - if (length(names(x)) && !is.null(columns_metadata)) { - for (name in intersect(names(columns_metadata), names(x))) { - x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) - } - } }, error = function(e) { warning("Invalid metadata$r", call. = FALSE) }) diff --git a/r/R/schema.R b/r/R/schema.R index 9a0ad85acac..46eab693bec 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -50,6 +50,31 @@ #' - `$metadata`: returns the key-value metadata as a named list. #' Modify or replace by assigning in (`sch$metadata <- new_metadata`). #' All list elements are coerced to string. +#' +#' @section R Metadata: +#' +#' When converting a data.frame to an Arrow Table or RecordBatch, attributes +#' from the `data.frame` are saved alongside tables so that the object can be +#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata +#' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or +#' at the column (e.g. `attributes(df$col_a)`) or for list columns only: +#' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows +#' for storing `haven` columns in a table and being able to faithfully +#' re-create them when pulled back into R. This metadata is separate from the +#' schema (column names and types) which is compatible with other Arrow +#' clients. The R metadata is only read by R and is ignored by other clients +#' (e.g. Pandas has its own custom metadata). This metadata is stored in +#' `$metadata$r`. +#' +#' Since Schema metadata keys and values must be strings, this metadata is +#' saved by serializing R's attribute list structure to a string. If the +#' serialized metadata exceeds 100Kb in size, by default it is compressed +#' starting in version 3.0.0. To disable this compression (e.g. for tables +#' that are compatible with Arrow versions before 3.0.0 and include large +#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`. +#' Files with compressed metadata are readable by older versions of arrow, but +#' the metadata is dropped. +#' #' @rdname Schema #' @name Schema #' @examples diff --git a/r/R/table.R b/r/R/table.R index 1d2190589f7..af79ab7809a 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -75,7 +75,7 @@ #' - `$schema` #' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list. #' Modify or replace by assigning in (`tab$metadata <- new_metadata`). -#' All list elements are coerced to string. +#' All list elements are coerced to string. See `schema()` for more information. #' - `$columns`: Returns a list of `ChunkedArray`s #' @rdname Table #' @name Table @@ -210,11 +210,24 @@ arrow_attributes <- function(x, only_top_level = FALSE) { if (is.data.frame(x)) { columns <- map(x, arrow_attributes) - if (length(att) || !all(map_lgl(columns, is.null))) { + out <- if (length(att) || !all(map_lgl(columns, is.null))) { list(attributes = att, columns = columns) } - } else if (length(att)) { - list(attributes = att, columns = NULL) + return(out) + } + + columns <- NULL + if (is.list(x) && !inherits(x, "POSIXlt")) { + # for list columns, we also keep attributes of each + # element in columns + columns <- map(x, arrow_attributes) + if (all(map_lgl(columns, is.null))) { + columns <- NULL + } + } + + if (length(att) || !is.null(columns)) { + list(attributes = att, columns = columns) } else { NULL } diff --git a/r/extra-tests/helpers.R b/r/extra-tests/helpers.R index 61b7da4ec25..af57d45e5d2 100644 --- a/r/extra-tests/helpers.R +++ b/r/extra-tests/helpers.R @@ -19,6 +19,10 @@ if_version <- function(version, op = `==`) { op(packageVersion("arrow"), version) } +if_version_less_than <- function(version) { + if_version(version, op = `<`) +} + skip_if_version_less_than <- function(version, msg) { if(if_version(version, `<`)) { skip(msg) diff --git a/r/extra-tests/test-read-files.R b/r/extra-tests/test-read-files.R index 90efce3d791..10e9f957920 100644 --- a/r/extra-tests/test-read-files.R +++ b/r/extra-tests/test-read-files.R @@ -162,4 +162,36 @@ test_that("Can see the metadata (stream)", { ) }) +test_that("Can see the extra metadata (parquet)", { + pq_file <- "files/ex_data_extra_metadata.parquet" + + if (if_version_less_than("3.0.0")) { + expect_warning( + df <- read_parquet(pq_file), + "Invalid metadata$r", + fixed = TRUE + ) + expect_s3_class(df, "tbl") + } else { + # version 3.0.0 and greater + df <- read_parquet(pq_file) + expect_s3_class(df, "tbl") + expect_equal( + attributes(df), + list( + names = letters[1:4], + row.names = 1L, + class = c("tbl_df", "tbl", "data.frame"), + top_level = list( + field_one = 12, + field_two = "more stuff" + ) + ) + ) + + # column-level attributes for the large column. + expect_named(attributes(df$b), "lots") + expect_length(attributes(df$b)$lots, 100) + } +}) diff --git a/r/extra-tests/write-files.R b/r/extra-tests/write-files.R index e0927ead4eb..75889b61407 100644 --- a/r/extra-tests/write-files.R +++ b/r/extra-tests/write-files.R @@ -37,3 +37,6 @@ example_with_metadata_v1$c <- NULL write_feather(example_with_metadata_v1, "extra-tests/files/ex_data_v1.feather", version = 1) write_ipc_stream(example_with_metadata, "extra-tests/files/ex_data.stream") + +write_parquet(example_with_extra_metadata, "extra-tests/files/ex_data_extra_metadata.parquet") + diff --git a/r/man/ParquetWriterProperties.Rd b/r/man/ParquetWriterProperties.Rd index a2fab2a96ae..7beb8a82a46 100644 --- a/r/man/ParquetWriterProperties.Rd +++ b/r/man/ParquetWriterProperties.Rd @@ -44,4 +44,6 @@ size of data pages within a column chunk (in bytes). Default 1 MiB. \seealso{ \link{write_parquet} + +\link{Schema} for information about schemas and metadata handling. } diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd index c9cdb343ef8..4653c55814d 100644 --- a/r/man/RecordBatch.Rd +++ b/r/man/RecordBatch.Rd @@ -68,7 +68,7 @@ There are also some active bindings \item \verb{$schema} \item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list. Modify or replace by assigning in (\code{batch$metadata <- new_metadata}). -All list elements are coerced to string. +All list elements are coerced to string. See \code{schema()} for more information. \item \verb{$columns}: Returns a list of \code{Array}s } } diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd index 1c1f75e2dd2..c2fb2fac681 100644 --- a/r/man/Schema.Rd +++ b/r/man/Schema.Rd @@ -48,6 +48,32 @@ All list elements are coerced to string. } } +\section{R Metadata}{ + + +When converting a data.frame to an Arrow Table or RecordBatch, attributes +from the \code{data.frame} are saved alongside tables so that the object can be +reconstructed faithfully in R (e.g. with \code{as.data.frame()}). This metadata +can be both at the top-level of the \code{data.frame} (e.g. \code{attributes(df)}) or +at the column (e.g. \code{attributes(df$col_a)}) or for list columns only: +element level (e.g. \code{attributes(df[1, "col_a"])}). For example, this allows +for storing \code{haven} columns in a table and being able to faithfully +re-create them when pulled back into R. This metadata is separate from the +schema (column names and types) which is compatible with other Arrow +clients. The R metadata is only read by R and is ignored by other clients +(e.g. Pandas has its own custom metadata). This metadata is stored in +\verb{$metadata$r}. + +Since Schema metadata keys and values must be strings, this metadata is +saved by serializing R's attribute list structure to a string. If the +serialized metadata exceeds 100Kb in size, by default it is compressed +starting in version 3.0.0. To disable this compression (e.g. for tables +that are compatible with Arrow versions before 3.0.0 and include large +amounts of metadata), set the option \code{arrow.compress_metadata} to \code{FALSE}. +Files with compressed metadata are readable by older versions of arrow, but +the metadata is dropped. +} + \examples{ \donttest{ df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5)) diff --git a/r/man/Table.Rd b/r/man/Table.Rd index 18c7da12393..46e9afeaf53 100644 --- a/r/man/Table.Rd +++ b/r/man/Table.Rd @@ -68,7 +68,7 @@ There are also some active bindings: \item \verb{$schema} \item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list. Modify or replace by assigning in (\code{tab$metadata <- new_metadata}). -All list elements are coerced to string. +All list elements are coerced to string. See \code{schema()} for more information. \item \verb{$columns}: Returns a list of \code{ChunkedArray}s } } diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 277c8197475..691adbeef05 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -56,4 +56,6 @@ write_feather(mtcars, tf) } \seealso{ \link{RecordBatchWriter} for lower-level access to writing Arrow IPC data. + +\link{Schema} for information about schemas and metadata handling. } diff --git a/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet b/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet new file mode 100644 index 00000000000..bf95f23cd86 Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet differ diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R index 26b1cf0e108..ecce77336b3 100644 --- a/r/tests/testthat/helper-data.R +++ b/r/tests/testthat/helper-data.R @@ -67,3 +67,10 @@ make_big_string <- function() { # This creates a character vector that would exceed the capacity of BinaryArray rep(purrr::map_chr(2047:2050, ~paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18) } + +make_string_of_size <- function(size = 1) { + purrr::map_chr(1000*size, ~paste(sample(letters, ., replace = TRUE), collapse = "")) +} + +example_with_extra_metadata <- example_with_metadata +attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100)) diff --git a/r/tests/testthat/test-backwards-compatibility.R b/r/tests/testthat/test-backwards-compatibility.R index fb06897e75a..c6bd51498cf 100644 --- a/r/tests/testthat/test-backwards-compatibility.R +++ b/r/tests/testthat/test-backwards-compatibility.R @@ -45,6 +45,15 @@ expect_identical_with_metadata <- function(object, expected, ..., top_level = TR expect_identical(object, expected, ...) } +test_that("reading a known Parquet file to dataframe with 3.0.0", { + skip_if_not_available("snappy") + pq_file <- test_path("golden-files/data-arrow-extra-meta_3.0.0.parquet") + + df <- read_parquet(pq_file) + # this is equivalent to `expect_identical()` + expect_identical_with_metadata(df, example_with_extra_metadata) +}) + test_that("reading a known Parquet file to dataframe with 2.0.0", { skip_if_not_available("snappy") pq_file <- test_path("golden-files/data-arrow_2.0.0.parquet") diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 53ee4279b85..17c43bb28ca 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -83,6 +83,46 @@ test_that("Garbage R metadata doesn't break things", { ) }) +test_that("Metadata serialization compression", { + # attributes that (when serialized) are just under 100kb are not compressed, + # and simply serialized + strings <- as.list(rep(make_string_of_size(1), 98)) + small <- .serialize_arrow_r_metadata(strings) + expect_equal( + object.size(small), + object.size(rawToChar(serialize(strings, NULL, ascii = TRUE))) + ) + + # Large strings will be compressed + large_strings <- as.list(rep(make_string_of_size(1), 100)) + large <- .serialize_arrow_r_metadata(large_strings) + expect_lt( + object.size(large), + object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE))) + ) + # and this compression ends up being smaller than even the "small" strings + expect_lt(object.size(large), object.size(small)) + + # However strings where compression + serialization is not effective are no + # worse than only serialization alone + large_few_strings <- as.list(rep(make_string_of_size(50), 2)) + large_few <- .serialize_arrow_r_metadata(large_few_strings) + expect_equal( + object.size(large_few), + object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE))) + ) + + # But we can disable compression + op <- options(arrow.compress_metadata = FALSE); on.exit(options(op)) + + large_strings <- as.list(rep(make_string_of_size(1), 100)) + large <- .serialize_arrow_r_metadata(large_strings) + expect_equal( + object.size(large), + object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE))) + ) +}) + test_that("RecordBatch metadata", { rb <- RecordBatch$create(x = 1:2, y = c("a", "b")) expect_equivalent(rb$metadata, list()) @@ -137,6 +177,7 @@ test_that("metadata keeps attribute of top level data frame", { expect_identical(as.data.frame(tab), df) }) + test_that("metadata drops readr's problems attribute", { readr_like <- tibble::tibble( dbl = 1.1, @@ -156,3 +197,10 @@ test_that("metadata drops readr's problems attribute", { tab <- Table$create(readr_like) expect_null(attr(as.data.frame(tab), "problems")) }) + +test_that("metadata of list elements (ARROW-10386)", { + df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux")))) + tab <- Table$create(df) + expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") + expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") +}) diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd index 9ea977b7e55..a1604cb2358 100644 --- a/r/vignettes/arrow.Rmd +++ b/r/vignettes/arrow.Rmd @@ -154,7 +154,7 @@ Arrow supports custom key-value metadata attached to Schemas. When we convert a This metadata is preserved when writing the table to Feather or Parquet, and when reading those files into R, or when calling `as.data.frame()` on a Table/RecordBatch, the column attributes are restored to the columns of the resulting `data.frame`. This means that custom data types, including `haven::labelled`, `vctrs` annotations, and others, are preserved when doing a round-trip through Arrow. -Note that the `attributes()` stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. (Similarly, Pandas writes its own custom metadata, which the R package does not consume.) You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys. +Note that the `attributes()` stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. (Similarly, Pandas writes its own custom metadata, which the R package does not consume.) You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys. For more details, see the documentation for `schema()`. ## Class structure and package conventions