From 05bf8579f20f589f3d516af3bf2cff985828c6a7 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Thu, 29 Oct 2020 12:17:28 +0100 Subject: [PATCH 01/13] store metadata for each element of a list column too, not just the list itself. ARROW-10386. --- r/R/record-batch.R | 20 ++++++++++++++------ r/R/table.R | 19 ++++++++++++++++--- r/tests/testthat/test-metadata.R | 8 ++++++++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index ef42c8de7fb..afff6af34ce 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -291,6 +291,20 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ... apply_arrow_r_metadata <- function(x, r_metadata) { tryCatch({ + columns_metadata <- r_metadata$columns + if (is.data.frame(x)) { + if (length(names(x)) && !is.null(columns_metadata)) { + for (name in intersect(names(columns_metadata), names(x))) { + x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) + } + } + } else if(is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { + x <- map2(x, columns_metadata, function(.x, .y) { + apply_arrow_r_metadata(.x, .y) + }) + x + } + if (!is.null(r_metadata$attributes)) { attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes if (inherits(x, "POSIXlt")) { @@ -302,12 +316,6 @@ apply_arrow_r_metadata <- function(x, r_metadata) { } } - columns_metadata <- r_metadata$columns - if (length(names(x)) && !is.null(columns_metadata)) { - for (name in intersect(names(columns_metadata), names(x))) { - x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) - } - } }, error = function(e) { warning("Invalid metadata$r", call. = FALSE) }) diff --git a/r/R/table.R b/r/R/table.R index 1d2190589f7..172e7bceab7 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -210,11 +210,24 @@ arrow_attributes <- function(x, only_top_level = FALSE) { if (is.data.frame(x)) { columns <- map(x, arrow_attributes) - if (length(att) || !all(map_lgl(columns, is.null))) { + out <- if (length(att) || !all(map_lgl(columns, is.null))) { list(attributes = att, columns = columns) } - } else if (length(att)) { - list(attributes = att, columns = NULL) + return(out) + } + + columns <- NULL + if (is.list(x) && !inherits(x, "POSIXlt")) { + # for list columns, we also keep attributes of each + # element in columns + columns <- map(x, arrow_attributes) + if (all(map_lgl(columns, is.null))) { + columns <- NULL + } + } + + if (length(att) || !is.null(columns)) { + list(attributes = att, columns = columns) } else { NULL } diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 53ee4279b85..f869091ddd4 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -137,6 +137,7 @@ test_that("metadata keeps attribute of top level data frame", { expect_identical(as.data.frame(tab), df) }) + test_that("metadata drops readr's problems attribute", { readr_like <- tibble::tibble( dbl = 1.1, @@ -156,3 +157,10 @@ test_that("metadata drops readr's problems attribute", { tab <- Table$create(readr_like) expect_null(attr(as.data.frame(tab), "problems")) }) + +test_that("metadata of list elements (ARROW-10386)", { + df <- data.frame(x = list(structure(1, foo = "bar"), structure(2, foo = "bar"))) + tab <- Table$create(df) + expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") + expect_identical(attr(as.data.frame(tab)$x[[2]], "foo"), "bar") +}) From 0c6065a2bae8f600da4b7ca686e1db4dfa3d1191 Mon Sep 17 00:00:00 2001 From: Romain 
Francois Date: Thu, 29 Oct 2020 12:25:54 +0100 Subject: [PATCH 02/13] update test --- r/tests/testthat/test-metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index f869091ddd4..b01620cd4a5 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -159,7 +159,7 @@ test_that("metadata drops readr's problems attribute", { }) test_that("metadata of list elements (ARROW-10386)", { - df <- data.frame(x = list(structure(1, foo = "bar"), structure(2, foo = "bar"))) + df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, foo = "bar")))) tab <- Table$create(df) expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") expect_identical(attr(as.data.frame(tab)$x[[2]], "foo"), "bar") From 57f05e2140be27bbd35c6d63ade6a245c59a0733 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 11 Jan 2021 17:10:49 -0600 Subject: [PATCH 03/13] Slight clarification on test --- r/tests/testthat/test-metadata.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index b01620cd4a5..479cb4ea058 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -159,8 +159,8 @@ test_that("metadata drops readr's problems attribute", { }) test_that("metadata of list elements (ARROW-10386)", { - df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, foo = "bar")))) + df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux")))) tab <- Table$create(df) expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") - expect_identical(attr(as.data.frame(tab)$x[[2]], "foo"), "bar") + expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") }) From a92ed0d98310547413668c0315214b08be62601e Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Tue, 12 Jan 2021 18:07:05 -0600 Subject: [PATCH 04/13] Try some compression --- r/R/record-batch.R | 22 ++++++++++++++++++++-- r/tests/testthat/helper-data.R | 5 +++++ r/tests/testthat/test-metadata.R | 30 ++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index afff6af34ce..71dfafdbe9c 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -279,11 +279,29 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ... # drop problems attributes (most likely from readr) x[["attributes"]][["problems"]] <- NULL - rawToChar(serialize(x, NULL, ascii = TRUE)) + out <- serialize(x, NULL, ascii = TRUE) + + # if the metadata is over 100 kB, compress + if (object.size(out) > 100000) { + out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE) + + # but ensure that the compression+serialization is effective. + if (object.size(out) > object.size(out_comp)) out <- out_comp + } + + rawToChar(out) } .unserialize_arrow_r_metadata <- function(x) { - tryCatch(unserialize(charToRaw(x)), error = function(e) { + tryCatch({ + out <- unserialize(charToRaw(x)) + + # if this is still raw, try decompressing + if (is.raw(out)) { + out <- unserialize(memDecompress(out, type = "gzip")) + } + out + }, error = function(e) { warning("Invalid metadata$r", call. 
= FALSE) NULL }) diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R index 26b1cf0e108..06f0b48cb8e 100644 --- a/r/tests/testthat/helper-data.R +++ b/r/tests/testthat/helper-data.R @@ -67,3 +67,8 @@ make_big_string <- function() { # This creates a character vector that would exceed the capacity of BinaryArray rep(purrr::map_chr(2047:2050, ~paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18) } + +make_string_of_size <- function(size = 1) { + purrr::map_chr(1000*size, ~paste(sample(letters, ., replace = TRUE), collapse = "")) +} + diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 479cb4ea058..f9fac3b90c6 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -83,6 +83,36 @@ test_that("Garbage R metadata doesn't break things", { ) }) +test_that("Metadata serialization compression", { + # attributes that (when serialized) are just under 100kb are not compressed, + # and simply serialized + strings <- rep(make_string_of_size(1), 98) + small <- .serialize_arrow_r_metadata(strings) + expect_equal( + object.size(small), + object.size(rawToChar(serialize(strings, NULL, ascii = TRUE))) + ) + + # Large strings will be compressed + large_strings <- rep(make_string_of_size(1), 100) + large <- .serialize_arrow_r_metadata(large_strings) + expect_lt( + object.size(large), + object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE))) + ) + # and this compression ends up being smaller than even the "small" strings + expect_lt(object.size(large), object.size(small)) + + # However strings where compression + serialization is not effective are no + # worse than only serialization alone + large_few_strings <- rep(make_string_of_size(50), 2) + large_few <- .serialize_arrow_r_metadata(large_few_strings) + expect_equal( + object.size(large_few), + object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE))) + ) +}) + test_that("RecordBatch metadata", { rb <- RecordBatch$create(x = 1:2, y = c("a", "b")) expect_equivalent(rb$metadata, list()) From 95aaa304030d2bb5d76262cf82d7cb99a298bf25 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Tue, 12 Jan 2021 18:53:04 -0600 Subject: [PATCH 05/13] Oops, attributes must be lists. 
--- r/tests/testthat/test-metadata.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index f9fac3b90c6..80d76e5612c 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -86,7 +86,7 @@ test_that("Garbage R metadata doesn't break things", { test_that("Metadata serialization compression", { # attributes that (when serialized) are just under 100kb are not compressed, # and simply serialized - strings <- rep(make_string_of_size(1), 98) + strings <- as.list(rep(make_string_of_size(1), 98)) small <- .serialize_arrow_r_metadata(strings) expect_equal( object.size(small), @@ -94,7 +94,7 @@ test_that("Metadata serialization compression", { ) # Large strings will be compressed - large_strings <- rep(make_string_of_size(1), 100) + large_strings <- as.list(rep(make_string_of_size(1), 100)) large <- .serialize_arrow_r_metadata(large_strings) expect_lt( object.size(large), @@ -105,7 +105,7 @@ test_that("Metadata serialization compression", { # However strings where compression + serialization is not effective are no # worse than only serialization alone - large_few_strings <- rep(make_string_of_size(50), 2) + large_few_strings <- as.list(rep(make_string_of_size(50), 2)) large_few <- .serialize_arrow_r_metadata(large_few_strings) expect_equal( object.size(large_few), From 6fd2d35254bb4e5bda8ea9e69701715614e6cc44 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 13 Jan 2021 08:27:01 -0600 Subject: [PATCH 06/13] Add option for disabling compression --- r/R/arrow-package.R | 4 ++++ r/R/record-batch.R | 2 +- r/tests/testthat/test-metadata.R | 10 ++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 8743037f5d3..540cbcd8645 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -147,6 +147,10 @@ print.arrow_info <- function(x, ...) { invisible(x) } +option_compress_metadata <- function() { + !is_false(getOption("arrow.compress_metadata")) +} + #' @include enums.R ArrowObject <- R6Class("ArrowObject", public = list( diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 71dfafdbe9c..bd2dff0b76f 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -282,7 +282,7 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ... out <- serialize(x, NULL, ascii = TRUE) # if the metadata is over 100 kB, compress - if (object.size(out) > 100000) { + if (option_compress_metadata() && object.size(out) > 100000) { out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE) # but ensure that the compression+serialization is effective. 
diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 80d76e5612c..17c43bb28ca 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -111,6 +111,16 @@ test_that("Metadata serialization compression", { object.size(large_few), object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE))) ) + + # But we can disable compression + op <- options(arrow.compress_metadata = FALSE); on.exit(options(op)) + + large_strings <- as.list(rep(make_string_of_size(1), 100)) + large <- .serialize_arrow_r_metadata(large_strings) + expect_equal( + object.size(large), + object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE))) + ) }) test_that("RecordBatch metadata", { From 5649500df3fc7fb0875c9763f91047ddb907d834 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 13 Jan 2021 11:36:32 -0600 Subject: [PATCH 07/13] Updated documentation --- r/NAMESPACE | 1 + r/R/feather.R | 1 + r/R/parquet.R | 1 + r/R/record-batch.R | 3 +- r/R/schema.R | 28 ++++++++++++++ r/R/table.R | 2 +- r/man/ParquetWriterProperties.Rd | 2 + r/man/RecordBatch.Rd | 2 +- r/man/Schema.Rd | 64 ++++++++++++++++++++++---------- r/man/Table.Rd | 2 +- r/man/write_feather.Rd | 2 + r/vignettes/arrow.Rmd | 2 +- 12 files changed, 86 insertions(+), 24 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 9ce89ca1f1c..25434ee7fc4 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -333,6 +333,7 @@ importFrom(utils,head) importFrom(utils,install.packages) importFrom(utils,modifyList) importFrom(utils,packageVersion) +importFrom(utils,object.size) importFrom(utils,tail) importFrom(vctrs,s3_register) importFrom(vctrs,vec_cast) diff --git a/r/R/feather.R b/r/R/feather.R index 6d29b7d0b89..5aaf340c6db 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -44,6 +44,7 @@ #' the stream will be left open. #' @export #' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data. +#' @seealso [Schema] for information about schemas and metadata handling. #' @examples #' \donttest{ #' tf <- tempfile() diff --git a/r/R/parquet.R b/r/R/parquet.R index ccf87c2f511..4fe321666af 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -275,6 +275,7 @@ make_valid_version <- function(version, valid_versions = valid_parquet_version) #' "snappy" for the `compression` argument. #' #' @seealso [write_parquet] +#' @seealso [Schema] for information about schemas and metadata handling. #' #' @export ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = ArrowObject) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index bd2dff0b76f..6b89c01408c 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -66,7 +66,7 @@ #' - `$schema` #' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list. #' Modify or replace by assigning in (`batch$metadata <- new_metadata`). -#' All list elements are coerced to string. +#' All list elements are coerced to string. See `schema()` for more information. #' - `$columns`: Returns a list of `Array`s #' @rdname RecordBatch #' @name RecordBatch @@ -273,6 +273,7 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ... df } +#' @importFrom utils object.size .serialize_arrow_r_metadata <- function(x) { assert_is(x, "list") diff --git a/r/R/schema.R b/r/R/schema.R index 9a0ad85acac..57d8614baad 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -50,6 +50,34 @@ #' - `$metadata`: returns the key-value metadata as a named list. 
#' Modify or replace by assigning in (`sch$metadata <- new_metadata`). #' All list elements are coerced to string. +#' +#' @section Metadata: +#' +#' Attributes from the `data.frame` are saved alongside tables so that the +#' object can be reconstructed faithfully in R (e.g. with `as.data.frame()`). +#' This metadata can be both at the top-level of the `data.frame` (e.g. +#' `attributes(df)`) or at the column (e.g. `attributes(df$col_a)`) or element +#' level (e.g. `attributes(df[1, "col_a"])`). For example, this allows for +#' storing `haven` columns in a table and being able to faithfully re-create +#' them when pulled back into R. This metadata is separate from the schema +#' (e.g. types of the columns) which is compatible with other Arrow clients. +#' The R metadata is only read by R and is ignored by other clients (e.g. +#' pyarrow which has its own custom metadata for things like Pandas metadata). +#' This metadata is stored (and can be accessed with) `table$metadata$r`. +#' +#' This metadata is saved by serializing R's attribute list structure to a +#' serialized string. Because of this, large amounts of metadata can quickly +#' increase the size of tables (and therefore the size of tables written to +#' parquet or feather files). If the (serialized) metadata exceeds 100Kbs in +#' size, it is first compressed before saving. To disable this compression +#' (e.g. for tables that are compatible with Arrow versions before 3.0.0 and +#' include large amounts of metadata) you can set the option +#' `arrow.compress_metadata` to `FALSE`. +#' +#' One exception to storing all metadata: `readr`'s `problems` attribute if it +#' exists is not saved with the metadata in order to prevent what are +#' sometimes excessively large when serialized. +#' #' @rdname Schema #' @name Schema #' @examples diff --git a/r/R/table.R b/r/R/table.R index 172e7bceab7..af79ab7809a 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -75,7 +75,7 @@ #' - `$schema` #' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list. #' Modify or replace by assigning in (`tab$metadata <- new_metadata`). -#' All list elements are coerced to string. +#' All list elements are coerced to string. See `schema()` for more information. #' - `$columns`: Returns a list of `ChunkedArray`s #' @rdname Table #' @name Table diff --git a/r/man/ParquetWriterProperties.Rd b/r/man/ParquetWriterProperties.Rd index a2fab2a96ae..7beb8a82a46 100644 --- a/r/man/ParquetWriterProperties.Rd +++ b/r/man/ParquetWriterProperties.Rd @@ -44,4 +44,6 @@ size of data pages within a column chunk (in bytes). Default 1 MiB. \seealso{ \link{write_parquet} + +\link{Schema} for information about schemas and metadata handling. } diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd index c9cdb343ef8..4653c55814d 100644 --- a/r/man/RecordBatch.Rd +++ b/r/man/RecordBatch.Rd @@ -68,7 +68,7 @@ There are also some active bindings \item \verb{$schema} \item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list. Modify or replace by assigning in (\code{batch$metadata <- new_metadata}). -All list elements are coerced to string. +All list elements are coerced to string. See \code{schema()} for more information. \item \verb{$columns}: Returns a list of \code{Array}s } } diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd index 1c1f75e2dd2..7471757115c 100644 --- a/r/man/Schema.Rd +++ b/r/man/Schema.Rd @@ -12,22 +12,20 @@ schema(...) 
\item{...}{named list of \link[=data-type]{data types}} } \description{ -A \code{Schema} is a list of \link{Field}s, which map names to -Arrow \link[=data-type]{data types}. Create a \code{Schema} when you -want to convert an R \code{data.frame} to Arrow but don't want to rely on the -default mapping of R types to Arrow types, such as when you want to choose a -specific numeric precision, or when creating a \link{Dataset} and you want to -ensure a specific schema rather than inferring it from the various files. +A \code{Schema} is a list of \link{Field}s, which map names to Arrow \link[=data-type]{data types}. Create a \code{Schema} when you want to convert an R +\code{data.frame} to Arrow but don't want to rely on the default mapping of R +types to Arrow types, such as when you want to choose a specific numeric +precision, or when creating a \link{Dataset} and you want to ensure a specific +schema rather than inferring it from the various files. -Many Arrow objects, including \link{Table} and \link{Dataset}, have a \verb{$schema} method -(active binding) that lets you access their schema. +Many Arrow objects, including \link{Table} and \link{Dataset}, have a \verb{$schema} +method (active binding) that lets you access their schema. } \section{Methods}{ \itemize{ -\item \verb{$ToString()}: convert to a string -\item \verb{$field(i)}: returns the field at index \code{i} (0-based) -\item \verb{$GetFieldByName(x)}: returns the field with name \code{x} +\item \verb{$ToString()}: convert to a string - \verb{$field(i)}: returns the field at +index \code{i} (0-based) - \verb{$GetFieldByName(x)}: returns the field with name \code{x} \item \verb{$WithMetadata(metadata)}: returns a new \code{Schema} with the key-value \code{metadata} set. Note that all list elements in \code{metadata} will be coerced to \code{character}. @@ -37,17 +35,45 @@ to \code{character}. \section{Active bindings}{ \itemize{ -\item \verb{$names}: returns the field names (called in \code{names(Schema)}) -\item \verb{$num_fields}: returns the number of fields (called in \code{length(Schema)}) -\item \verb{$fields}: returns the list of \code{Field}s in the \code{Schema}, suitable for -iterating over -\item \verb{$HasMetadata}: logical: does this \code{Schema} have extra metadata? -\item \verb{$metadata}: returns the key-value metadata as a named list. -Modify or replace by assigning in (\code{sch$metadata <- new_metadata}). -All list elements are coerced to string. +\item \verb{$names}: returns the field names (called in \code{names(Schema)}) - +\verb{$num_fields}: returns the number of fields (called in \code{length(Schema)}) - +\verb{$fields}: returns the list of \code{Field}s in the \code{Schema}, suitable for +iterating over - \verb{$HasMetadata}: logical: does this \code{Schema} have extra +metadata? - \verb{$metadata}: returns the key-value metadata as a named list. +Modify or replace by assigning in (\code{sch$metadata <- new_metadata}). All +list elements are coerced to string. } } +\section{Metadata}{ + + +Attributes from the \code{data.frame} are saved alongside tables so that the +object can be reconstructed faithfully in R (e.g. with \code{as.data.frame()}). +This metadata can be both at the top-level of the \code{data.frame} (e.g. +\code{attributes(df)}) or at the column (e.g. \code{attributes(df$col_a)}) or element +level (e.g. \code{attributes(df[1, "col_a"])}). For example, this allows for +storing \code{haven} columns in a table and being able to faithfully re-create +them when pulled back into R. 
This metadata is separate from the schema +(e.g. types of the columns) which is compatible with other Arrow clients. +The R metadata is only read by R and is ignored by other clients (e.g. +pyarrow which has its own custom metadata for things like Pandas metadata). +This metadata is stored (and can be accessed with) \code{table$metadata$r}. + +This metadata is saved by serializing R's attribute list structure to a +serialized string. Because of this, large amounts of metadata can quickly +increase the size of tables (and therefore the size of tables written to +parquet or feather files). If the (serialized) metadata exceeds 100Kbs in +size, it is first compressed before saving. To disable this compression +(e.g. for tables that are compatible with Arrow versions before 3.0.0 and +include large amounts of metadata) you can set the option +\code{arrow.compress_metadata} to \code{FALSE}. + +One exception to storing all metadata: \code{readr}'s \code{problems} attribute if it +exists is not saved with the metadata in order to prevent what are +sometimes excessively large when serialized. +} + \examples{ \donttest{ df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5)) diff --git a/r/man/Table.Rd b/r/man/Table.Rd index 18c7da12393..46e9afeaf53 100644 --- a/r/man/Table.Rd +++ b/r/man/Table.Rd @@ -68,7 +68,7 @@ There are also some active bindings: \item \verb{$schema} \item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list. Modify or replace by assigning in (\code{tab$metadata <- new_metadata}). -All list elements are coerced to string. +All list elements are coerced to string. See \code{schema()} for more information. \item \verb{$columns}: Returns a list of \code{ChunkedArray}s } } diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 277c8197475..691adbeef05 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -56,4 +56,6 @@ write_feather(mtcars, tf) } \seealso{ \link{RecordBatchWriter} for lower-level access to writing Arrow IPC data. + +\link{Schema} for information about schemas and metadata handling. } diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd index 9ea977b7e55..a1604cb2358 100644 --- a/r/vignettes/arrow.Rmd +++ b/r/vignettes/arrow.Rmd @@ -154,7 +154,7 @@ Arrow supports custom key-value metadata attached to Schemas. When we convert a This metadata is preserved when writing the table to Feather or Parquet, and when reading those files into R, or when calling `as.data.frame()` on a Table/RecordBatch, the column attributes are restored to the columns of the resulting `data.frame`. This means that custom data types, including `haven::labelled`, `vctrs` annotations, and others, are preserved when doing a round-trip through Arrow. -Note that the `attributes()` stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. (Similarly, Pandas writes its own custom metadata, which the R package does not consume.) You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys. +Note that the `attributes()` stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. (Similarly, Pandas writes its own custom metadata, which the R package does not consume.) 
You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys. For more details, see the documentation for `schema()`. ## Class structure and package conventions From 92fa1f393091eb7e3cf99e7b6e8a7b3397ea7eb8 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 13 Jan 2021 11:47:31 -0600 Subject: [PATCH 08/13] CI bump From 82679fa37caee0f448a895f27e00093e768023ca Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 13 Jan 2021 13:27:49 -0600 Subject: [PATCH 09/13] PR comments --- r/NAMESPACE | 2 +- r/R/schema.R | 45 +++++++++++++--------------- r/man/Schema.Rd | 80 ++++++++++++++++++++++++------------------------- 3 files changed, 62 insertions(+), 65 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 25434ee7fc4..fdc84aa5189 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -332,8 +332,8 @@ importFrom(tidyselect,vars_select) importFrom(utils,head) importFrom(utils,install.packages) importFrom(utils,modifyList) -importFrom(utils,packageVersion) importFrom(utils,object.size) +importFrom(utils,packageVersion) importFrom(utils,tail) importFrom(vctrs,s3_register) importFrom(vctrs,vec_cast) diff --git a/r/R/schema.R b/r/R/schema.R index 57d8614baad..46eab693bec 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -51,32 +51,29 @@ #' Modify or replace by assigning in (`sch$metadata <- new_metadata`). #' All list elements are coerced to string. #' -#' @section Metadata: +#' @section R Metadata: #' -#' Attributes from the `data.frame` are saved alongside tables so that the -#' object can be reconstructed faithfully in R (e.g. with `as.data.frame()`). -#' This metadata can be both at the top-level of the `data.frame` (e.g. -#' `attributes(df)`) or at the column (e.g. `attributes(df$col_a)`) or element -#' level (e.g. `attributes(df[1, "col_a"])`). For example, this allows for -#' storing `haven` columns in a table and being able to faithfully re-create -#' them when pulled back into R. This metadata is separate from the schema -#' (e.g. types of the columns) which is compatible with other Arrow clients. -#' The R metadata is only read by R and is ignored by other clients (e.g. -#' pyarrow which has its own custom metadata for things like Pandas metadata). -#' This metadata is stored (and can be accessed with) `table$metadata$r`. +#' When converting a data.frame to an Arrow Table or RecordBatch, attributes +#' from the `data.frame` are saved alongside tables so that the object can be +#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata +#' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or +#' at the column (e.g. `attributes(df$col_a)`) or for list columns only: +#' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows +#' for storing `haven` columns in a table and being able to faithfully +#' re-create them when pulled back into R. This metadata is separate from the +#' schema (column names and types) which is compatible with other Arrow +#' clients. The R metadata is only read by R and is ignored by other clients +#' (e.g. Pandas has its own custom metadata). This metadata is stored in +#' `$metadata$r`. #' -#' This metadata is saved by serializing R's attribute list structure to a -#' serialized string. Because of this, large amounts of metadata can quickly -#' increase the size of tables (and therefore the size of tables written to -#' parquet or feather files). 
If the (serialized) metadata exceeds 100Kbs in -#' size, it is first compressed before saving. To disable this compression -#' (e.g. for tables that are compatible with Arrow versions before 3.0.0 and -#' include large amounts of metadata) you can set the option -#' `arrow.compress_metadata` to `FALSE`. -#' -#' One exception to storing all metadata: `readr`'s `problems` attribute if it -#' exists is not saved with the metadata in order to prevent what are -#' sometimes excessively large when serialized. +#' Since Schema metadata keys and values must be strings, this metadata is +#' saved by serializing R's attribute list structure to a string. If the +#' serialized metadata exceeds 100Kb in size, by default it is compressed +#' starting in version 3.0.0. To disable this compression (e.g. for tables +#' that are compatible with Arrow versions before 3.0.0 and include large +#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`. +#' Files with compressed metadata are readable by older versions of arrow, but +#' the metadata is dropped. #' #' @rdname Schema #' @name Schema diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd index 7471757115c..c2fb2fac681 100644 --- a/r/man/Schema.Rd +++ b/r/man/Schema.Rd @@ -12,20 +12,22 @@ schema(...) \item{...}{named list of \link[=data-type]{data types}} } \description{ -A \code{Schema} is a list of \link{Field}s, which map names to Arrow \link[=data-type]{data types}. Create a \code{Schema} when you want to convert an R -\code{data.frame} to Arrow but don't want to rely on the default mapping of R -types to Arrow types, such as when you want to choose a specific numeric -precision, or when creating a \link{Dataset} and you want to ensure a specific -schema rather than inferring it from the various files. +A \code{Schema} is a list of \link{Field}s, which map names to +Arrow \link[=data-type]{data types}. Create a \code{Schema} when you +want to convert an R \code{data.frame} to Arrow but don't want to rely on the +default mapping of R types to Arrow types, such as when you want to choose a +specific numeric precision, or when creating a \link{Dataset} and you want to +ensure a specific schema rather than inferring it from the various files. -Many Arrow objects, including \link{Table} and \link{Dataset}, have a \verb{$schema} -method (active binding) that lets you access their schema. +Many Arrow objects, including \link{Table} and \link{Dataset}, have a \verb{$schema} method +(active binding) that lets you access their schema. } \section{Methods}{ \itemize{ -\item \verb{$ToString()}: convert to a string - \verb{$field(i)}: returns the field at -index \code{i} (0-based) - \verb{$GetFieldByName(x)}: returns the field with name \code{x} +\item \verb{$ToString()}: convert to a string +\item \verb{$field(i)}: returns the field at index \code{i} (0-based) +\item \verb{$GetFieldByName(x)}: returns the field with name \code{x} \item \verb{$WithMetadata(metadata)}: returns a new \code{Schema} with the key-value \code{metadata} set. Note that all list elements in \code{metadata} will be coerced to \code{character}. @@ -35,43 +37,41 @@ to \code{character}. \section{Active bindings}{ \itemize{ -\item \verb{$names}: returns the field names (called in \code{names(Schema)}) - -\verb{$num_fields}: returns the number of fields (called in \code{length(Schema)}) - -\verb{$fields}: returns the list of \code{Field}s in the \code{Schema}, suitable for -iterating over - \verb{$HasMetadata}: logical: does this \code{Schema} have extra -metadata? 
- \verb{$metadata}: returns the key-value metadata as a named list. -Modify or replace by assigning in (\code{sch$metadata <- new_metadata}). All -list elements are coerced to string. +\item \verb{$names}: returns the field names (called in \code{names(Schema)}) +\item \verb{$num_fields}: returns the number of fields (called in \code{length(Schema)}) +\item \verb{$fields}: returns the list of \code{Field}s in the \code{Schema}, suitable for +iterating over +\item \verb{$HasMetadata}: logical: does this \code{Schema} have extra metadata? +\item \verb{$metadata}: returns the key-value metadata as a named list. +Modify or replace by assigning in (\code{sch$metadata <- new_metadata}). +All list elements are coerced to string. } } -\section{Metadata}{ +\section{R Metadata}{ -Attributes from the \code{data.frame} are saved alongside tables so that the -object can be reconstructed faithfully in R (e.g. with \code{as.data.frame()}). -This metadata can be both at the top-level of the \code{data.frame} (e.g. -\code{attributes(df)}) or at the column (e.g. \code{attributes(df$col_a)}) or element -level (e.g. \code{attributes(df[1, "col_a"])}). For example, this allows for -storing \code{haven} columns in a table and being able to faithfully re-create -them when pulled back into R. This metadata is separate from the schema -(e.g. types of the columns) which is compatible with other Arrow clients. -The R metadata is only read by R and is ignored by other clients (e.g. -pyarrow which has its own custom metadata for things like Pandas metadata). -This metadata is stored (and can be accessed with) \code{table$metadata$r}. +When converting a data.frame to an Arrow Table or RecordBatch, attributes +from the \code{data.frame} are saved alongside tables so that the object can be +reconstructed faithfully in R (e.g. with \code{as.data.frame()}). This metadata +can be both at the top-level of the \code{data.frame} (e.g. \code{attributes(df)}) or +at the column (e.g. \code{attributes(df$col_a)}) or for list columns only: +element level (e.g. \code{attributes(df[1, "col_a"])}). For example, this allows +for storing \code{haven} columns in a table and being able to faithfully +re-create them when pulled back into R. This metadata is separate from the +schema (column names and types) which is compatible with other Arrow +clients. The R metadata is only read by R and is ignored by other clients +(e.g. Pandas has its own custom metadata). This metadata is stored in +\verb{$metadata$r}. -This metadata is saved by serializing R's attribute list structure to a -serialized string. Because of this, large amounts of metadata can quickly -increase the size of tables (and therefore the size of tables written to -parquet or feather files). If the (serialized) metadata exceeds 100Kbs in -size, it is first compressed before saving. To disable this compression -(e.g. for tables that are compatible with Arrow versions before 3.0.0 and -include large amounts of metadata) you can set the option -\code{arrow.compress_metadata} to \code{FALSE}. - -One exception to storing all metadata: \code{readr}'s \code{problems} attribute if it -exists is not saved with the metadata in order to prevent what are -sometimes excessively large when serialized. +Since Schema metadata keys and values must be strings, this metadata is +saved by serializing R's attribute list structure to a string. If the +serialized metadata exceeds 100Kb in size, by default it is compressed +starting in version 3.0.0. To disable this compression (e.g. 
for tables +that are compatible with Arrow versions before 3.0.0 and include large +amounts of metadata), set the option \code{arrow.compress_metadata} to \code{FALSE}. +Files with compressed metadata are readable by older versions of arrow, but +the metadata is dropped. } \examples{ From 920cfb1306ec8be407422f12e502e90b86436ec2 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 13 Jan 2021 15:01:17 -0600 Subject: [PATCH 10/13] =?UTF-8?q?=F0=9F=93=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- r/NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index 40a943c39ff..521af3eacd4 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -37,6 +37,8 @@ * Option `arrow.skip_nul` (default `FALSE`, as in `base::scan()`) allows conversion of Arrow string (`utf8()`) type data containing embedded nul `\0` characters to R. If set to `TRUE`, nuls will be stripped and a warning is emitted if any are found. * `arrow_info()` for an overview of various run-time and build-time Arrow configurations, useful for debugging * Set environment variable `ARROW_DEFAULT_MEMORY_POOL` before loading the Arrow package to change memory allocators. Windows packages are built with `mimalloc`; most others have `jemalloc`. These are used by default if they were built, and they're generally much faster than the system malloc, but sometimes it is useful to turn them off for debugging purposes. To disable them, set `ARROW_DEFAULT_MEMORY_POOL=system`. +* List columns that have attributes on each element are now also included with the metadata that is saved when creating Arrow tables. This allows `sf` tibbles to be faithfully preserved and round-tripped [ARROW-10386](https://issues.apache.org/jira/browse/ARROW-10386). +* R metadata that exceeds 100Kb is now compressed before being written to a table; see `schema()` for more details. ## Bug fixes From a66818d3185398aa1c9daf8e70694ec1aa11fee6 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 13 Jan 2021 15:12:30 -0600 Subject: [PATCH 11/13] add extra-tests for compressed metadata --- r/extra-tests/test-read-files.R | 22 ++++++++++++++++++++++ r/extra-tests/write-files.R | 3 +++ r/tests/testthat/helper-data.R | 2 ++ 3 files changed, 27 insertions(+) diff --git a/r/extra-tests/test-read-files.R b/r/extra-tests/test-read-files.R index 90efce3d791..5aa3a7b2fd2 100644 --- a/r/extra-tests/test-read-files.R +++ b/r/extra-tests/test-read-files.R @@ -162,4 +162,26 @@ test_that("Can see the metadata (stream)", { ) }) +test_that("Can see the extra metadata (parquet)", { + pq_file <- "files/ex_data_extra_metadata.parquet" + df <- read_parquet(pq_file) + expect_s3_class(df, "tbl") + + expect_equal( + attributes(df), + list( + names = letters[1:4], + row.names = 1L, + class = c("tbl_df", "tbl", "data.frame"), + top_level = list( + field_one = 12, + field_two = "more stuff" + ) + ) + ) + + # column-level attributes for the large column.
+ expect_named(attributes(df$b), "lots") + expect_length(attributes(df$b)$lots, 100) +}) diff --git a/r/extra-tests/write-files.R b/r/extra-tests/write-files.R index e0927ead4eb..75889b61407 100644 --- a/r/extra-tests/write-files.R +++ b/r/extra-tests/write-files.R @@ -37,3 +37,6 @@ example_with_metadata_v1$c <- NULL write_feather(example_with_metadata_v1, "extra-tests/files/ex_data_v1.feather", version = 1) write_ipc_stream(example_with_metadata, "extra-tests/files/ex_data.stream") + +write_parquet(example_with_extra_metadata, "extra-tests/files/ex_data_extra_metadata.parquet") + diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R index 06f0b48cb8e..ecce77336b3 100644 --- a/r/tests/testthat/helper-data.R +++ b/r/tests/testthat/helper-data.R @@ -72,3 +72,5 @@ make_string_of_size <- function(size = 1) { purrr::map_chr(1000*size, ~paste(sample(letters, ., replace = TRUE), collapse = "")) } +example_with_extra_metadata <- example_with_metadata +attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100)) From 306751f0de58cae320ec38ad50564b2181b93f20 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 13 Jan 2021 15:41:30 -0600 Subject: [PATCH 12/13] expect warning for compressed metadata prior to 3.0.0 --- r/extra-tests/helpers.R | 4 ++++ r/extra-tests/test-read-files.R | 39 ++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/r/extra-tests/helpers.R b/r/extra-tests/helpers.R index 61b7da4ec25..af57d45e5d2 100644 --- a/r/extra-tests/helpers.R +++ b/r/extra-tests/helpers.R @@ -19,6 +19,10 @@ if_version <- function(version, op = `==`) { op(packageVersion("arrow"), version) } +if_version_less_than <- function(version) { + if_version(version, op = `<`) +} + skip_if_version_less_than <- function(version, msg) { if(if_version(version, `<`)) { skip(msg) diff --git a/r/extra-tests/test-read-files.R b/r/extra-tests/test-read-files.R index 5aa3a7b2fd2..9de224aff43 100644 --- a/r/extra-tests/test-read-files.R +++ b/r/extra-tests/test-read-files.R @@ -165,23 +165,32 @@ test_that("Can see the metadata (stream)", { test_that("Can see the extra metadata (parquet)", { pq_file <- "files/ex_data_extra_metadata.parquet" - df <- read_parquet(pq_file) - expect_s3_class(df, "tbl") + if (if_version_less_than("3.0.0")) { + expect_warning( + df <- read_parquet(pq_file), + "Invalid metadata$r" + ) + expect_s3_class(df, "tbl") + } else { + # version 3.0.0 and greater + df <- read_parquet(pq_file) + expect_s3_class(df, "tbl") - expect_equal( - attributes(df), - list( - names = letters[1:4], - row.names = 1L, - class = c("tbl_df", "tbl", "data.frame"), - top_level = list( - field_one = 12, - field_two = "more stuff" + expect_equal( + attributes(df), + list( + names = letters[1:4], + row.names = 1L, + class = c("tbl_df", "tbl", "data.frame"), + top_level = list( + field_one = 12, + field_two = "more stuff" + ) ) ) - ) - # column-level attributes for the large column. - expect_named(attributes(df$b), "lots") - expect_length(attributes(df$b)$lots, 100) + # column-level attributes for the large column. 
+ expect_named(attributes(df$b), "lots") + expect_length(attributes(df$b)$lots, 100) + } }) From fa0041b6c1a7a4e81bdfdabc70a5ccb562239f3e Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 13 Jan 2021 16:07:05 -0600 Subject: [PATCH 13/13] backwards compatibility + fixed = TRUE --- r/extra-tests/test-read-files.R | 3 ++- .../data-arrow-extra-meta_3.0.0.parquet | Bin 0 -> 13263 bytes r/tests/testthat/test-backwards-compatibility.R | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet diff --git a/r/extra-tests/test-read-files.R b/r/extra-tests/test-read-files.R index 9de224aff43..10e9f957920 100644 --- a/r/extra-tests/test-read-files.R +++ b/r/extra-tests/test-read-files.R @@ -168,7 +168,8 @@ test_that("Can see the extra metadata (parquet)", { if (if_version_less_than("3.0.0")) { expect_warning( df <- read_parquet(pq_file), - "Invalid metadata$r" + "Invalid metadata$r", + fixed = TRUE ) expect_s3_class(df, "tbl") } else { diff --git a/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet b/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bf95f23cd8655e0953d999e8f7f77692bfd0dbf1 GIT binary patch literal 13263 [13263 bytes of base85-encoded parquet data omitted]
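
For reference, a minimal sketch of the round-trip behaviour these patches add and test (this assumes the arrow R package at version 3.0.0 or later, where the changes above are available; the data frame and attribute names are invented for the example):

library(arrow)

# Attributes attached to individual elements of a list column...
df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux"))))
tab <- Table$create(df)

# ...are carried in the serialized R metadata on the Table's schema...
tab$metadata$r

# ...and restored when converting back to a data frame (ARROW-10386)
attr(as.data.frame(tab)$x[[1]], "foo")  # "bar"
attr(as.data.frame(tab)$x[[2]], "baz")  # "qux"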
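
And a sketch of the metadata compression behaviour under the same assumptions, using a deliberately oversized attribute so that the serialized R metadata crosses the 100 kB threshold described in schema.R (the column and attribute names are again invented):

library(arrow)

df <- data.frame(a = 1)
# an attribute whose ASCII serialization is comfortably over 100 kB
attr(df$a, "lots") <- paste(sample(letters, 2e5, replace = TRUE), collapse = "")

# by default the serialized R metadata is gzip-compressed before being
# stored in the schema's key-value metadata...
tab <- Table$create(df)

# ...unless compression is disabled, e.g. so that arrow versions before
# 3.0.0 can still read the R metadata from files written from this table
options(arrow.compress_metadata = FALSE)
tab_uncompressed <- Table$create(df)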