From 67ce55e05a302c2c765f3fbeca9e37720643db81 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 3 Jun 2024 17:47:12 -0400 Subject: [PATCH 01/10] Add some protections and options around unserialize() --- r/R/arrowExports.R | 4 ++ r/R/metadata.R | 74 +++++++++++++++++++++++---- r/src/arrowExports.cpp | 10 ++++ r/src/schema.cpp | 88 ++++++++++++++++++++++++++++++++ r/tests/testthat/test-metadata.R | 37 ++++++++++++++ 5 files changed, 203 insertions(+), 10 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 62e2182ffcd..50820069205 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -2052,6 +2052,10 @@ arrow__UnifySchemas <- function(schemas) { .Call(`_arrow_arrow__UnifySchemas`, schemas) } +safe_r_metadata <- function(x) { + invisible(.Call(`_arrow_safe_r_metadata`, x)) +} + Table__num_columns <- function(x) { .Call(`_arrow_Table__num_columns`, x) } diff --git a/r/R/metadata.R b/r/R/metadata.R index 3ae2db4eaa7..e73e7354cba 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -44,23 +44,77 @@ } .deserialize_arrow_r_metadata <- function(x) { - tryCatch( - expr = { - out <- unserialize(charToRaw(x)) - - # if this is still raw, try decompressing - if (is.raw(out)) { - out <- unserialize(memDecompress(out, type = "gzip")) - } - out - }, + tryCatch(safe_unserialize_ascii(x), error = function(e) { + if (getOption("arrow.debug", FALSE)) { + print(conditionMessage(e)) + } warning("Invalid metadata$r", call. = FALSE) NULL } ) } +safe_unserialize_ascii <- function(x) { + # First, check if we can call unserialize() at all + # By default, we only call unserialize() in R 4.4.0 or newer + # but you can enable it in older versions by setting the option + r_4.4_or_newer <- getRversion() >= "4.4" + if (!isTRUE(getOption("arrow.unserialize_metadata", r_4.4_or_newer))) { + opts <- c(">" = "To enable, set `options(arrow.unserialize_metadata = TRUE)`") + if (!r_4.4_or_newer) { + opts <- c( + opts, + ">" = "Or, upgrade to R 4.4.0 or newer" + ) + } + rlang::warn( + "Unserialization of R metadata is disabled.", + body = opts, + .frequency = "once", + .frequency_id = "arrow.unserialize_metadata" + ) + return(NULL) + } + + # Check that this is ASCII serialized data (as in, what we wrote) + if (!identical(substr(unclass(x), 1, 1), "A")) { + stop("Invalid serialized data") + } + out <- unserialize(charToRaw(x)) + # If it's still raw, check for the gzip magic number and uncompress + if (is.raw(out) && identical(out[1:2], as.raw(c(31, 139)))) { + decompressed <- memDecompress(out, type = "gzip") + if (!identical(substr(decompressed, 1, 1), "A")) { + stop("Invalid serialized data") + } + out <- unserialize(decompressed) + } + if (!is.list(out)) { + stop("Invalid serialized data") + } + + tryCatch(safe_r_metadata(out), error = function(e) { + # This C function will error if the metadata contains elements of types + # that are not allowed. + if (getOption("arrow.debug", FALSE)) { + print(conditionMessage(e)) + } + if (getOption("arrow.unsafe_metadata", FALSE)) { + # We've opted-in to unsafe metadata, so we'll just return the metadata + # TODO: should we warn here anyway? + } else { + rlang::warn( + "R metadata may contain unsafe elements and has been discarded.", + body = c("i" = "If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to use it.") + ) + out <<- NULL + } + }) + + out +} + #' @importFrom rlang trace_back apply_arrow_r_metadata <- function(x, r_metadata) { if (is.null(r_metadata)) { diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index d5aec50219e..907fc6b4d0f 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -5347,6 +5347,15 @@ BEGIN_CPP11 return cpp11::as_sexp(arrow__UnifySchemas(schemas)); END_CPP11 } +// schema.cpp +void safe_r_metadata(SEXP x); +extern "C" SEXP _arrow_safe_r_metadata(SEXP x_sexp){ +BEGIN_CPP11 + arrow::r::Input::type x(x_sexp); + safe_r_metadata(x); + return R_NilValue; +END_CPP11 +} // table.cpp int Table__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_Table__num_columns(SEXP x_sexp){ @@ -6183,6 +6192,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1}, { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3}, { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1}, + { "_arrow_safe_r_metadata", (DL_FUNC) &_arrow_safe_r_metadata, 1}, { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, diff --git a/r/src/schema.cpp b/r/src/schema.cpp index 41d3d38d2ed..79cab491675 100644 --- a/r/src/schema.cpp +++ b/r/src/schema.cpp @@ -167,3 +167,91 @@ std::shared_ptr arrow__UnifySchemas( const std::vector>& schemas) { return ValueOrStop(arrow::UnifySchemas(schemas)); } + +void check_r_metadata_attributes(SEXP x); +void check_r_metadata_columns(SEXP x); + +// [[arrow::export]] +void safe_r_metadata(SEXP x) { + if (Rf_isNull(x)) { + return; + } + if (TYPEOF(x) != VECSXP) { + cpp11::stop("R metadata must be a list, not a %s", Rf_type2char(TYPEOF(x))); + } + // Check the names of the list + SEXP names = Rf_getAttrib(x, R_NamesSymbol); + if (Rf_isNull(names)) { + cpp11::stop("R metadata must have names"); + } + const char* name_str; + for (R_xlen_t i = 0; i < Rf_length(names); i++) { + name_str = CHAR(STRING_ELT(names, i)); + // The names may have "attributes" and "columns" only + if (strcmp(name_str, "attributes") == 0) { + check_r_metadata_attributes(VECTOR_ELT(x, i)); + } else if (strcmp(name_str, "columns") == 0) { + check_r_metadata_columns(VECTOR_ELT(x, i)); + } else { + cpp11::stop("Invalid name '%s' for R metadata", name_str); + } + } +} + +void check_r_metadata_attributes(SEXP x) { + if (Rf_isNull(x)) { + return; + } + if (TYPEOF(x) != VECSXP) { + cpp11::stop("attributes must be a list, not a %s", Rf_type2char(TYPEOF(x))); + } + SEXP attr; + for (R_xlen_t i = 0; i < Rf_length(x); i++) { + attr = VECTOR_ELT(x, i); + switch (TYPEOF(attr)) { + // This is effectively an allowlist of types + // We could switch this to a blocklist and maybe just exclude environment? + // As that's the only thing that could contain an active binding? + // TODO: whatever we restrict here on load, we should also restrict when + // saving metadata. + case VECSXP: + // Recurse + return check_r_metadata_attributes(attr); + case STRSXP: + break; + case REALSXP: + break; + case INTSXP: + break; + case LGLSXP: + break; + case CPLXSXP: + break; + case RAWSXP: + break; + case NILSXP: + break; + // Should we error on externalptr (which won't be useful) or just allow it? + // Should we error on environment (which could be an R6 object) + // or inspect it and make sure there are no active bindings in it? + // Should we error on EXPRSXP (e.g. a formula) or just allow it? + // Should we error on OBJSXP (S4 objects)? + default: + cpp11::stop("Invalid attribute type: %s", Rf_type2char(TYPEOF(attr))); + } + } +} + +void check_r_metadata_columns(SEXP x) { + // "columns" is named and each element of "columns" is a list + // with "attributes" and/or "columns" elements--i.e. the same as the top-level + if (Rf_isNull(x)) { + return; + } + if (TYPEOF(x) != VECSXP) { + cpp11::stop("columns must be a list, not a %s", Rf_type2char(TYPEOF(x))); + } + for (R_xlen_t i = 0; i < Rf_length(x); i++) { + safe_r_metadata(VECTOR_ELT(x, i)); + } +} diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index e44cd710380..409caebf326 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +withr::local_options(list(arrow.unserialize_metadata = TRUE)) + test_that("Schema metadata", { s <- schema(b = double()) expect_equal(s$metadata, empty_named_list()) @@ -107,6 +109,39 @@ test_that("Garbage R metadata doesn't break things", { "Invalid metadata$r", fixed = TRUE ) + + bad <- new.env(parent = emptyenv()) + makeActiveBinding("columns", function() stop("This should not run"), bad) + tab$metadata <- list(r = rawToChar(serialize(bad, NULL, ascii = TRUE))) + expect_warning( + as.data.frame(tab), + "Invalid metadata$r", + fixed = TRUE + ) +}) + +test_that("On older R versions, metadata serialization is off by default", { + skip_if(getRversion() >= "4.4") + + rlang::reset_warning_verbosity("arrow.unserialize_metadata") + op <- options(arrow.unserialize_metadata = NULL) + on.exit(options(op)) + + tab <- Table$create(example_with_metadata) + expect_warning( + expect_null(tab$metadata$r), + "Unserialization of R metadata is disabled. +> To enable, set `options(arrow.unserialize_metadata = TRUE)` +> Or, upgrade to R 4.4.0 or newer", + fixed = TRUE + ) + rlang::reset_warning_verbosity("arrow.unserialize_metadata") + + options(arrow.unserialize_metadata = TRUE) + expect_warning( + expect_type(tab$metadata$r, "list"), + NA + ) }) test_that("Metadata serialization compression", { @@ -254,6 +289,8 @@ test_that("Row-level metadata (does not) roundtrip in datasets", { skip_if_not_available("dataset") skip_if_not_available("parquet") + library(dplyr, warn.conflicts = FALSE) + df <- tibble::tibble( metadata = list( structure(1, my_value_as_attr = 1), From b46db7ec5efeed3554a7efca79572b9e8ff02a51 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Jun 2024 13:20:14 -0400 Subject: [PATCH 02/10] Just do it in R --- r/R/arrowExports.R | 4 -- r/R/metadata.R | 51 +++++++++++++++++------- r/src/arrowExports.cpp | 10 ----- r/src/schema.cpp | 88 ------------------------------------------ 4 files changed, 37 insertions(+), 116 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 50820069205..62e2182ffcd 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -2052,10 +2052,6 @@ arrow__UnifySchemas <- function(schemas) { .Call(`_arrow_arrow__UnifySchemas`, schemas) } -safe_r_metadata <- function(x) { - invisible(.Call(`_arrow_safe_r_metadata`, x)) -} - Table__num_columns <- function(x) { .Call(`_arrow_Table__num_columns`, x) } diff --git a/r/R/metadata.R b/r/R/metadata.R index e73e7354cba..befed792e9c 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -30,7 +30,7 @@ } } - out <- serialize(x, NULL, ascii = TRUE) + out <- serialize(safe_r_metadata(x, on_save = TRUE), NULL, ascii = TRUE) # if the metadata is over 100 kB, compress if (option_compress_metadata() && object.size(out) > 100000) { @@ -93,26 +93,49 @@ safe_unserialize_ascii <- function(x) { if (!is.list(out)) { stop("Invalid serialized data") } + safe_r_metadata(out) +} - tryCatch(safe_r_metadata(out), error = function(e) { - # This C function will error if the metadata contains elements of types - # that are not allowed. - if (getOption("arrow.debug", FALSE)) { - print(conditionMessage(e)) +safe_r_metadata <- function(metadata, on_save = FALSE) { + # This function recurses through the list x and checks that all elements are + # of types that are allowed in R metadata. If it finds an element that is not + # allowed, it removes it. If any elements are removed, warn at the end. + any_removed <- FALSE + # Internal function that we'll recursively apply, + # and mutate the `any_removed` variable outside of it + check_r_metadata_types_recursive <- function(x) { + allowed_types <- c("character", "double", "integer", "logical", "complex", "list", "NULL") + if (is.list(x)) { + types <- map_chr(x, typeof) + x[types == "list"] <- map(x[types == "list"], check_r_metadata_types_recursive) + ok <- types %in% allowed_types + if (!all(ok)) { + any_removed <<- TRUE + x <- x[ok] + } } + x + } + new <- check_r_metadata_types_recursive(metadata) + + # On save: don't warn, just save the filtered metadata + if (on_save) { + return(new) + } + # On load: warn if any elements were removed + if (any_removed) { if (getOption("arrow.unsafe_metadata", FALSE)) { - # We've opted-in to unsafe metadata, so we'll just return the metadata - # TODO: should we warn here anyway? + # We've opted-in to unsafe metadata, so warn but return the original metadata + rlang::warn("R metadata may have unsafe elements") + new <- metadata } else { rlang::warn( - "R metadata may contain unsafe elements and has been discarded.", - body = c("i" = "If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to use it.") + "Potentially unsafe elements have been discarded from R metadata.", + body = c("i" = "If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.") ) - out <<- NULL } - }) - - out + } + new } #' @importFrom rlang trace_back diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 907fc6b4d0f..d5aec50219e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -5347,15 +5347,6 @@ BEGIN_CPP11 return cpp11::as_sexp(arrow__UnifySchemas(schemas)); END_CPP11 } -// schema.cpp -void safe_r_metadata(SEXP x); -extern "C" SEXP _arrow_safe_r_metadata(SEXP x_sexp){ -BEGIN_CPP11 - arrow::r::Input::type x(x_sexp); - safe_r_metadata(x); - return R_NilValue; -END_CPP11 -} // table.cpp int Table__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_Table__num_columns(SEXP x_sexp){ @@ -6192,7 +6183,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1}, { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3}, { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1}, - { "_arrow_safe_r_metadata", (DL_FUNC) &_arrow_safe_r_metadata, 1}, { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, diff --git a/r/src/schema.cpp b/r/src/schema.cpp index 79cab491675..41d3d38d2ed 100644 --- a/r/src/schema.cpp +++ b/r/src/schema.cpp @@ -167,91 +167,3 @@ std::shared_ptr arrow__UnifySchemas( const std::vector>& schemas) { return ValueOrStop(arrow::UnifySchemas(schemas)); } - -void check_r_metadata_attributes(SEXP x); -void check_r_metadata_columns(SEXP x); - -// [[arrow::export]] -void safe_r_metadata(SEXP x) { - if (Rf_isNull(x)) { - return; - } - if (TYPEOF(x) != VECSXP) { - cpp11::stop("R metadata must be a list, not a %s", Rf_type2char(TYPEOF(x))); - } - // Check the names of the list - SEXP names = Rf_getAttrib(x, R_NamesSymbol); - if (Rf_isNull(names)) { - cpp11::stop("R metadata must have names"); - } - const char* name_str; - for (R_xlen_t i = 0; i < Rf_length(names); i++) { - name_str = CHAR(STRING_ELT(names, i)); - // The names may have "attributes" and "columns" only - if (strcmp(name_str, "attributes") == 0) { - check_r_metadata_attributes(VECTOR_ELT(x, i)); - } else if (strcmp(name_str, "columns") == 0) { - check_r_metadata_columns(VECTOR_ELT(x, i)); - } else { - cpp11::stop("Invalid name '%s' for R metadata", name_str); - } - } -} - -void check_r_metadata_attributes(SEXP x) { - if (Rf_isNull(x)) { - return; - } - if (TYPEOF(x) != VECSXP) { - cpp11::stop("attributes must be a list, not a %s", Rf_type2char(TYPEOF(x))); - } - SEXP attr; - for (R_xlen_t i = 0; i < Rf_length(x); i++) { - attr = VECTOR_ELT(x, i); - switch (TYPEOF(attr)) { - // This is effectively an allowlist of types - // We could switch this to a blocklist and maybe just exclude environment? - // As that's the only thing that could contain an active binding? - // TODO: whatever we restrict here on load, we should also restrict when - // saving metadata. - case VECSXP: - // Recurse - return check_r_metadata_attributes(attr); - case STRSXP: - break; - case REALSXP: - break; - case INTSXP: - break; - case LGLSXP: - break; - case CPLXSXP: - break; - case RAWSXP: - break; - case NILSXP: - break; - // Should we error on externalptr (which won't be useful) or just allow it? - // Should we error on environment (which could be an R6 object) - // or inspect it and make sure there are no active bindings in it? - // Should we error on EXPRSXP (e.g. a formula) or just allow it? - // Should we error on OBJSXP (S4 objects)? - default: - cpp11::stop("Invalid attribute type: %s", Rf_type2char(TYPEOF(attr))); - } - } -} - -void check_r_metadata_columns(SEXP x) { - // "columns" is named and each element of "columns" is a list - // with "attributes" and/or "columns" elements--i.e. the same as the top-level - if (Rf_isNull(x)) { - return; - } - if (TYPEOF(x) != VECSXP) { - cpp11::stop("columns must be a list, not a %s", Rf_type2char(TYPEOF(x))); - } - for (R_xlen_t i = 0; i < Rf_length(x); i++) { - safe_r_metadata(VECTOR_ELT(x, i)); - } -} From 3f0773f0d0883559c5f4acecb18cc81e8bb8933f Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Jun 2024 16:58:23 -0400 Subject: [PATCH 03/10] Add test for data with promise in it. With workaround, we can make this safe in older R --- r/R/metadata.R | 39 +++++++++--------------- r/tests/testthat/test-metadata.R | 52 ++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 48 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index befed792e9c..f4c377d72aa 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -44,7 +44,7 @@ } .deserialize_arrow_r_metadata <- function(x) { - tryCatch(safe_unserialize_ascii(x), + tryCatch(unserialize_r_metadata(x), error = function(e) { if (getOption("arrow.debug", FALSE)) { print(conditionMessage(e)) @@ -55,40 +55,19 @@ ) } -safe_unserialize_ascii <- function(x) { - # First, check if we can call unserialize() at all - # By default, we only call unserialize() in R 4.4.0 or newer - # but you can enable it in older versions by setting the option - r_4.4_or_newer <- getRversion() >= "4.4" - if (!isTRUE(getOption("arrow.unserialize_metadata", r_4.4_or_newer))) { - opts <- c(">" = "To enable, set `options(arrow.unserialize_metadata = TRUE)`") - if (!r_4.4_or_newer) { - opts <- c( - opts, - ">" = "Or, upgrade to R 4.4.0 or newer" - ) - } - rlang::warn( - "Unserialization of R metadata is disabled.", - body = opts, - .frequency = "once", - .frequency_id = "arrow.unserialize_metadata" - ) - return(NULL) - } - +unserialize_r_metadata <- function(x) { # Check that this is ASCII serialized data (as in, what we wrote) if (!identical(substr(unclass(x), 1, 1), "A")) { stop("Invalid serialized data") } - out <- unserialize(charToRaw(x)) + out <- safe_unserialize(charToRaw(x)) # If it's still raw, check for the gzip magic number and uncompress if (is.raw(out) && identical(out[1:2], as.raw(c(31, 139)))) { decompressed <- memDecompress(out, type = "gzip") if (!identical(substr(decompressed, 1, 1), "A")) { stop("Invalid serialized data") } - out <- unserialize(decompressed) + out <- safe_unserialize(decompressed) } if (!is.list(out)) { stop("Invalid serialized data") @@ -96,6 +75,16 @@ safe_unserialize_ascii <- function(x) { safe_r_metadata(out) } +safe_unserialize <- function(x) { + # By capturing the data in a list, we can inspect it for promises without + # triggering their evaluation. + out <- list(unserialize(x)) + if (typeof(out[[1]]) == "promise") { + stop("Serialized data contains a promise object") + } + out[[1]] +} + safe_r_metadata <- function(metadata, on_save = FALSE) { # This function recurses through the list x and checks that all elements are # of types that are allowed in R metadata. If it finds an element that is not diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 409caebf326..139d885d304 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -withr::local_options(list(arrow.unserialize_metadata = TRUE)) - test_that("Schema metadata", { s <- schema(b = double()) expect_equal(s$metadata, empty_named_list()) @@ -118,32 +116,40 @@ test_that("Garbage R metadata doesn't break things", { "Invalid metadata$r", fixed = TRUE ) -}) -test_that("On older R versions, metadata serialization is off by default", { - skip_if(getRversion() >= "4.4") - - rlang::reset_warning_verbosity("arrow.unserialize_metadata") - op <- options(arrow.unserialize_metadata = NULL) - on.exit(options(op)) - - tab <- Table$create(example_with_metadata) - expect_warning( - expect_null(tab$metadata$r), - "Unserialization of R metadata is disabled. -> To enable, set `options(arrow.unserialize_metadata = TRUE)` -> Or, upgrade to R 4.4.0 or newer", - fixed = TRUE - ) - rlang::reset_warning_verbosity("arrow.unserialize_metadata") - - options(arrow.unserialize_metadata = TRUE) - expect_warning( - expect_type(tab$metadata$r, "list"), + # https://hiddenlayer.com/research/r-bitrary-code-execution/ + tab$metadata <- list(r = "A +3 +262913 +197888 +5 +UTF-8 +5 +252 +6 +1 +262153 +7 +message +2 +16 +1 +262153 +32 +arbitrary\040code\040was\040just\040executed +254 +") + expect_message( + expect_warning( + as.data.frame(tab), + "Invalid metadata$r", + fixed = TRUE + ), NA ) }) + test_that("Metadata serialization compression", { # attributes that (when serialized) are just under 100kb are not compressed, # and simply serialized From df7075f335515af1c83d4bf72718f866a29e10bd Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Jun 2024 17:18:26 -0400 Subject: [PATCH 04/10] moar test --- r/tests/testthat/test-metadata.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 139d885d304..e874dd90753 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -149,6 +149,30 @@ arbitrary\040code\040was\040just\040executed ) }) +test_that("Complex or unsafe attributes are pruned from R metadata, if they exist", { + tab <- Table$create(example_data[1:6]) + bad <- new.env() + makeActiveBinding("class", function() stop("This should not run"), bad) + tab$metadata <- list(r = rawToChar(serialize(list(attributes = bad), NULL, ascii = TRUE))) + expect_warning( + as.data.frame(tab), + "Potentially unsafe elements have been discarded from R metadata. +i If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.", + fixed = TRUE + ) + # You can set an option to allow them through. + # It still warns, just differently, and it doesn't prune the attributes + withr::local_options(list("arrow.unsafe_metadata" = TRUE)) + expect_warning( + expect_warning( + as.data.frame(tab), + "R metadata may have unsafe elements" + ), + # This particular example ultimately fails because it's not a list + "Invalid metadata$r", + fixed = TRUE + ) +}) test_that("Metadata serialization compression", { # attributes that (when serialized) are just under 100kb are not compressed, From 7f0271c29fdd5822f4917f1b76ce7e1510ff37e2 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Jun 2024 19:40:42 -0400 Subject: [PATCH 05/10] Report which types have been dropped --- r/R/metadata.R | 19 +++++++++++++------ r/tests/testthat/test-metadata.R | 8 +++++--- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index f4c377d72aa..e82f73ef6b0 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -89,7 +89,7 @@ safe_r_metadata <- function(metadata, on_save = FALSE) { # This function recurses through the list x and checks that all elements are # of types that are allowed in R metadata. If it finds an element that is not # allowed, it removes it. If any elements are removed, warn at the end. - any_removed <- FALSE + types_removed <- c() # Internal function that we'll recursively apply, # and mutate the `any_removed` variable outside of it check_r_metadata_types_recursive <- function(x) { @@ -99,7 +99,7 @@ safe_r_metadata <- function(metadata, on_save = FALSE) { x[types == "list"] <- map(x[types == "list"], check_r_metadata_types_recursive) ok <- types %in% allowed_types if (!all(ok)) { - any_removed <<- TRUE + types_removed <<- c(types_removed, setdiff(types, allowed_types)) x <- x[ok] } } @@ -112,15 +112,22 @@ safe_r_metadata <- function(metadata, on_save = FALSE) { return(new) } # On load: warn if any elements were removed - if (any_removed) { + if (length(types_removed)) { + types_msg <- paste("Type:", oxford_paste(unique(types_removed))) if (getOption("arrow.unsafe_metadata", FALSE)) { # We've opted-in to unsafe metadata, so warn but return the original metadata - rlang::warn("R metadata may have unsafe elements") + rlang::warn( + "R metadata may have unsafe or invalid elements", + body = c("i" = types_msg) + ) new <- metadata } else { rlang::warn( - "Potentially unsafe elements have been discarded from R metadata.", - body = c("i" = "If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.") + "Potentially unsafe or invalid elements have been discarded from R metadata.", + body = c( + "i" = types_msg, + ">" = "If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them." + ) ) } } diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index e874dd90753..175e7ef3b6b 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -156,8 +156,9 @@ test_that("Complex or unsafe attributes are pruned from R metadata, if they exis tab$metadata <- list(r = rawToChar(serialize(list(attributes = bad), NULL, ascii = TRUE))) expect_warning( as.data.frame(tab), - "Potentially unsafe elements have been discarded from R metadata. -i If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.", + "Potentially unsafe or invalid elements have been discarded from R metadata. +i Type: \"environment\" +> If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.", fixed = TRUE ) # You can set an option to allow them through. @@ -166,7 +167,8 @@ i If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` t expect_warning( expect_warning( as.data.frame(tab), - "R metadata may have unsafe elements" + "R metadata may have unsafe or invalid elements +i Type: \"environment\"" ), # This particular example ultimately fails because it's not a list "Invalid metadata$r", From 860333bcb90a9d53f9dd70aac4693fcbb5525868 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Jun 2024 21:05:06 -0400 Subject: [PATCH 06/10] Fix for backwards compat tests --- r/R/metadata.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index e82f73ef6b0..dbed614f84d 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -61,16 +61,16 @@ unserialize_r_metadata <- function(x) { stop("Invalid serialized data") } out <- safe_unserialize(charToRaw(x)) - # If it's still raw, check for the gzip magic number and uncompress - if (is.raw(out) && identical(out[1:2], as.raw(c(31, 139)))) { + # If it's still raw, decompress and unserialize again + if (is.raw(out)) { decompressed <- memDecompress(out, type = "gzip") - if (!identical(substr(decompressed, 1, 1), "A")) { - stop("Invalid serialized data") + if (!identical(rawToChar(decompressed[1]), "A")) { + stop("Invalid serialized compressed data") } out <- safe_unserialize(decompressed) } if (!is.list(out)) { - stop("Invalid serialized data") + stop("Invalid serialized data: must be a list") } safe_r_metadata(out) } From f1c4cddb2322aee931cc816186aa391e3d55301c Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Jun 2024 21:09:23 -0400 Subject: [PATCH 07/10] Make VctrsExtensionType safe too --- r/R/extension.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/extension.R b/r/R/extension.R index 59a02121fd1..4d1d7987c00 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -429,7 +429,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", paste0(capture.output(print(self$ptype())), collapse = "\n") }, deserialize_instance = function() { - private$.ptype <- unserialize(self$extension_metadata()) + private$.ptype <- safe_unserialize(self$extension_metadata()) }, ExtensionEquals = function(other) { inherits(other, "VctrsExtensionType") && identical(self$ptype(), other$ptype()) From 07155c80999d204ffdba2f7df834299ea88a0d6d Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Jun 2024 21:14:14 -0400 Subject: [PATCH 08/10] MORE SAFETY --- r/R/extension.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/extension.R b/r/R/extension.R index 4d1d7987c00..3529144e115 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -429,7 +429,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", paste0(capture.output(print(self$ptype())), collapse = "\n") }, deserialize_instance = function() { - private$.ptype <- safe_unserialize(self$extension_metadata()) + private$.ptype <- safe_r_metadata(safe_unserialize(self$extension_metadata())) }, ExtensionEquals = function(other) { inherits(other, "VctrsExtensionType") && identical(self$ptype(), other$ptype()) From d69c31656f46ec74bb292a66b72e3cd7181c1573 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Jun 2024 23:21:16 -0400 Subject: [PATCH 09/10] More commenting --- r/R/metadata.R | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index dbed614f84d..ba73f085788 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -86,12 +86,25 @@ safe_unserialize <- function(x) { } safe_r_metadata <- function(metadata, on_save = FALSE) { - # This function recurses through the list x and checks that all elements are - # of types that are allowed in R metadata. If it finds an element that is not - # allowed, it removes it. If any elements are removed, warn at the end. + # This function recurses through the metadata list and checks that all + # elements are of types that are allowed in R metadata. + # If it finds an element that is not allowed, it removes it. + # + # This function is used both when saving and loading metadata. + # @param on_save: If TRUE, the function will not warn if it removes elements: + # we're just cleaning up the metadata for saving. If FALSE, it means we're + # loading the metadata, and we'll warn if we find invalid elements. + # + # When loading metadata, you can optionally keep the invalid elements by + # setting `options(arrow.unsafe_metadata = TRUE)`. It will still check + # for invalid elements and warn if any are found, though. + + # This variable will be used to store the types of elements that were removed, + # if any, so we can give an informative warning if needed. types_removed <- c() + # Internal function that we'll recursively apply, - # and mutate the `any_removed` variable outside of it + # and mutate the `types_removed` variable outside of it. check_r_metadata_types_recursive <- function(x) { allowed_types <- c("character", "double", "integer", "logical", "complex", "list", "NULL") if (is.list(x)) { @@ -99,6 +112,7 @@ safe_r_metadata <- function(metadata, on_save = FALSE) { x[types == "list"] <- map(x[types == "list"], check_r_metadata_types_recursive) ok <- types %in% allowed_types if (!all(ok)) { + # Record the invalid types, then remove the offending elements types_removed <<- c(types_removed, setdiff(types, allowed_types)) x <- x[ok] } From e64b85fd196858dfcd6fd7586bf4c97bdaee9108 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 13 Jun 2024 09:17:26 -0400 Subject: [PATCH 10/10] news --- r/NEWS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index dc89fa266e3..317e546a1b7 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,11 +19,10 @@ # arrow 16.1.0.9000 -# arrow 16.1.0 - * R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. Previously, only functions that used arithmetic operators worked. For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are automatic translations rather than true user-defined functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. * The `na_matches` argument to the `dplyr::*_join()` functions is now supported. This argument controls whether `NA` values are considered equal when joining. (#41358) +* R metadata, stored in the Arrow schema to support round-tripping data between R and Arrow/Parquet, is now serialized and deserialized more strictly. This makes it safer to load data from files from unknown sources into R data.frames. (#41969) # arrow 16.1.0