diff --git a/r/NEWS.md b/r/NEWS.md index dc89fa266e3..317e546a1b7 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,11 +19,10 @@ # arrow 16.1.0.9000 -# arrow 16.1.0 - * R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. Previously, only functions that used arithmetic operators worked. For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are automatic translations rather than true user-defined functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. * The `na_matches` argument to the `dplyr::*_join()` functions is now supported. This argument controls whether `NA` values are considered equal when joining. (#41358) +* R metadata, stored in the Arrow schema to support round-tripping data between R and Arrow/Parquet, is now serialized and deserialized more strictly. This makes it safer to load data from files from unknown sources into R data.frames. (#41969) # arrow 16.1.0 diff --git a/r/R/extension.R b/r/R/extension.R index 59a02121fd1..3529144e115 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -429,7 +429,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", paste0(capture.output(print(self$ptype())), collapse = "\n") }, deserialize_instance = function() { - private$.ptype <- unserialize(self$extension_metadata()) + private$.ptype <- safe_r_metadata(safe_unserialize(self$extension_metadata())) }, ExtensionEquals = function(other) { inherits(other, "VctrsExtensionType") && identical(self$ptype(), other$ptype()) diff --git a/r/R/metadata.R b/r/R/metadata.R index 3ae2db4eaa7..ba73f085788 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -30,7 +30,7 @@ } } - out <- serialize(x, NULL, ascii = TRUE) + out <- serialize(safe_r_metadata(x, on_save = TRUE), NULL, ascii = TRUE) # if the metadata is over 100 kB, compress if (option_compress_metadata() && object.size(out) > 100000) { @@ -44,23 +44,110 @@ } .deserialize_arrow_r_metadata <- function(x) { - tryCatch( - expr = { - out <- unserialize(charToRaw(x)) - - # if this is still raw, try decompressing - if (is.raw(out)) { - out <- unserialize(memDecompress(out, type = "gzip")) - } - out - }, + tryCatch(unserialize_r_metadata(x), error = function(e) { + if (getOption("arrow.debug", FALSE)) { + print(conditionMessage(e)) + } warning("Invalid metadata$r", call. = FALSE) NULL } ) } +unserialize_r_metadata <- function(x) { + # Check that this is ASCII serialized data (as in, what we wrote) + if (!identical(substr(unclass(x), 1, 1), "A")) { + stop("Invalid serialized data") + } + out <- safe_unserialize(charToRaw(x)) + # If it's still raw, decompress and unserialize again + if (is.raw(out)) { + decompressed <- memDecompress(out, type = "gzip") + if (!identical(rawToChar(decompressed[1]), "A")) { + stop("Invalid serialized compressed data") + } + out <- safe_unserialize(decompressed) + } + if (!is.list(out)) { + stop("Invalid serialized data: must be a list") + } + safe_r_metadata(out) +} + +safe_unserialize <- function(x) { + # By capturing the data in a list, we can inspect it for promises without + # triggering their evaluation. + out <- list(unserialize(x)) + if (typeof(out[[1]]) == "promise") { + stop("Serialized data contains a promise object") + } + out[[1]] +} + +safe_r_metadata <- function(metadata, on_save = FALSE) { + # This function recurses through the metadata list and checks that all + # elements are of types that are allowed in R metadata. + # If it finds an element that is not allowed, it removes it. + # + # This function is used both when saving and loading metadata. + # @param on_save: If TRUE, the function will not warn if it removes elements: + # we're just cleaning up the metadata for saving. If FALSE, it means we're + # loading the metadata, and we'll warn if we find invalid elements. + # + # When loading metadata, you can optionally keep the invalid elements by + # setting `options(arrow.unsafe_metadata = TRUE)`. It will still check + # for invalid elements and warn if any are found, though. + + # This variable will be used to store the types of elements that were removed, + # if any, so we can give an informative warning if needed. + types_removed <- c() + + # Internal function that we'll recursively apply, + # and mutate the `types_removed` variable outside of it. + check_r_metadata_types_recursive <- function(x) { + allowed_types <- c("character", "double", "integer", "logical", "complex", "list", "NULL") + if (is.list(x)) { + types <- map_chr(x, typeof) + x[types == "list"] <- map(x[types == "list"], check_r_metadata_types_recursive) + ok <- types %in% allowed_types + if (!all(ok)) { + # Record the invalid types, then remove the offending elements + types_removed <<- c(types_removed, setdiff(types, allowed_types)) + x <- x[ok] + } + } + x + } + new <- check_r_metadata_types_recursive(metadata) + + # On save: don't warn, just save the filtered metadata + if (on_save) { + return(new) + } + # On load: warn if any elements were removed + if (length(types_removed)) { + types_msg <- paste("Type:", oxford_paste(unique(types_removed))) + if (getOption("arrow.unsafe_metadata", FALSE)) { + # We've opted-in to unsafe metadata, so warn but return the original metadata + rlang::warn( + "R metadata may have unsafe or invalid elements", + body = c("i" = types_msg) + ) + new <- metadata + } else { + rlang::warn( + "Potentially unsafe or invalid elements have been discarded from R metadata.", + body = c( + "i" = types_msg, + ">" = "If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them." + ) + ) + } + } + new +} + #' @importFrom rlang trace_back apply_arrow_r_metadata <- function(x, r_metadata) { if (is.null(r_metadata)) { diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index e44cd710380..175e7ef3b6b 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -107,6 +107,73 @@ test_that("Garbage R metadata doesn't break things", { "Invalid metadata$r", fixed = TRUE ) + + bad <- new.env(parent = emptyenv()) + makeActiveBinding("columns", function() stop("This should not run"), bad) + tab$metadata <- list(r = rawToChar(serialize(bad, NULL, ascii = TRUE))) + expect_warning( + as.data.frame(tab), + "Invalid metadata$r", + fixed = TRUE + ) + + # https://hiddenlayer.com/research/r-bitrary-code-execution/ + tab$metadata <- list(r = "A +3 +262913 +197888 +5 +UTF-8 +5 +252 +6 +1 +262153 +7 +message +2 +16 +1 +262153 +32 +arbitrary\040code\040was\040just\040executed +254 +") + expect_message( + expect_warning( + as.data.frame(tab), + "Invalid metadata$r", + fixed = TRUE + ), + NA + ) +}) + +test_that("Complex or unsafe attributes are pruned from R metadata, if they exist", { + tab <- Table$create(example_data[1:6]) + bad <- new.env() + makeActiveBinding("class", function() stop("This should not run"), bad) + tab$metadata <- list(r = rawToChar(serialize(list(attributes = bad), NULL, ascii = TRUE))) + expect_warning( + as.data.frame(tab), + "Potentially unsafe or invalid elements have been discarded from R metadata. +i Type: \"environment\" +> If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.", + fixed = TRUE + ) + # You can set an option to allow them through. + # It still warns, just differently, and it doesn't prune the attributes + withr::local_options(list("arrow.unsafe_metadata" = TRUE)) + expect_warning( + expect_warning( + as.data.frame(tab), + "R metadata may have unsafe or invalid elements +i Type: \"environment\"" + ), + # This particular example ultimately fails because it's not a list + "Invalid metadata$r", + fixed = TRUE + ) }) test_that("Metadata serialization compression", { @@ -254,6 +321,8 @@ test_that("Row-level metadata (does not) roundtrip in datasets", { skip_if_not_available("dataset") skip_if_not_available("parquet") + library(dplyr, warn.conflicts = FALSE) + df <- tibble::tibble( metadata = list( structure(1, my_value_as_attr = 1),