From d0f248390a622a1a08fcd0919e8d59fe72ddc88b Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 1 Jul 2021 17:43:26 -0500 Subject: [PATCH 1/5] Don't even try to apply row-level metadata with datasets --- r/NAMESPACE | 1 + r/R/metadata.R | 16 ++++++++++++++++ r/tests/testthat/helper-arrow.R | 6 ++++++ r/tests/testthat/test-dataset.R | 6 ------ r/tests/testthat/test-metadata.R | 24 ++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index ab45aa9985e..e616c589b46 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -301,6 +301,7 @@ importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,abort) importFrom(rlang,as_label) +importFrom(rlang,call_stack) importFrom(rlang,caller_env) importFrom(rlang,dots_n) importFrom(rlang,enexpr) diff --git a/r/R/metadata.R b/r/R/metadata.R index 408c2214a31..100e7995742 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -50,6 +50,7 @@ }) } +#' @importFrom rlang call_stack apply_arrow_r_metadata <- function(x, r_metadata) { tryCatch({ columns_metadata <- r_metadata$columns @@ -60,6 +61,21 @@ apply_arrow_r_metadata <- function(x, r_metadata) { } } } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { + # If we have a list and "columns_metadata" this is apply row-level metadata + # inside of a column in a dataframe. + + # However, if we are inside of a dataset collection, we cannot apply this + # row-level metadata, since the order of the rows is not gauranteed to be + # the same, so don't even try, but warn what's going on + stack <- call_stack() + in_dataset_collect <- any(map_lgl(stack, function(x) { + !is.null(x$fn_name) && x$fn_name == "collect.arrow_dplyr_query" + })) + if (in_dataset_collect) { + warning("Row-level metadata has been discarded") + break + } + x <- map2(x, columns_metadata, function(.x, .y) { apply_arrow_r_metadata(.x, .y) }) diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index 0abbfb6a13a..5f2dad841a1 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -67,3 +67,9 @@ test_that <- function(what, code) { r_only <- function(code) { withr::with_options(list(..skip.tests = FALSE), code) } + +make_temp_dir <- function() { + path <- tempfile() + dir.create(path) + normalizePath(path, winslash = "/") +} diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index a0b1bdae022..66493376e74 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -21,12 +21,6 @@ context("Dataset") library(dplyr) -make_temp_dir <- function() { - path <- tempfile() - dir.create(path) - normalizePath(path, winslash = "/") -} - dataset_dir <- make_temp_dir() hive_dir <- make_temp_dir() ipc_dir <- make_temp_dir() diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index afce1c2244c..fad99e22d9d 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -205,3 +205,27 @@ test_that("metadata of list elements (ARROW-10386)", { expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") }) + + +test_that("metadata of list elements (ARROW-10386)", { + skip_if_not_available("dataset") + skip_if_not_available("parquet") + df <- tibble::tibble( + metadata = list( + structure(1, my_value_as_attr = 1), + structure(2, my_value_as_attr = 2), + structure(3, my_value_as_attr = 3), + structure(4, my_value_as_attr = 3)), + part = c(1, 3, 2, 1) + ) + + dst_dir <- make_temp_dir() + write_dataset(df, dst_dir, partitioning = "part") + ds <- open_dataset(dst_dir) + expect_warning( + df_from_ds <- dplyr::collect(ds), + "Row-level metadata has been discarded" + ) + + expect_equal(df_from_ds[c(1, 4, 3, 2), ], df, check.attributes = FALSE) +}) From ad23d0795b0e0939ac18f4f2dbf612917facb691 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 1 Jul 2021 18:41:00 -0500 Subject: [PATCH 2/5] oops --- r/R/metadata.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index 100e7995742..9be12d79ad1 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -73,12 +73,11 @@ apply_arrow_r_metadata <- function(x, r_metadata) { })) if (in_dataset_collect) { warning("Row-level metadata has been discarded") - break + } else { + x <- map2(x, columns_metadata, function(.x, .y) { + apply_arrow_r_metadata(.x, .y) + }) } - - x <- map2(x, columns_metadata, function(.x, .y) { - apply_arrow_r_metadata(.x, .y) - }) x } From af9ee1851751ff83ad149b38cf8f20e6ba2b7329 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Tue, 13 Jul 2021 11:04:50 -0500 Subject: [PATCH 3/5] typo --- r/R/metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index 9be12d79ad1..fb58a83135d 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -61,7 +61,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) { } } } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { - # If we have a list and "columns_metadata" this is apply row-level metadata + # If we have a list and "columns_metadata" this applies row-level metadata # inside of a column in a dataframe. # However, if we are inside of a dataset collection, we cannot apply this From 0569f3ecc4415994637abce737f5c87f34b5efcd Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 15 Jul 2021 13:05:45 -0500 Subject: [PATCH 4/5] Update r/R/metadata.R Co-authored-by: Neal Richardson --- r/R/metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index fb58a83135d..72de201037d 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -65,7 +65,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) { # inside of a column in a dataframe. # However, if we are inside of a dataset collection, we cannot apply this - # row-level metadata, since the order of the rows is not gauranteed to be + # row-level metadata, since the order of the rows is not guaranteed to be # the same, so don't even try, but warn what's going on stack <- call_stack() in_dataset_collect <- any(map_lgl(stack, function(x) { From f9bc74d72b74b7d68582cac32d341b794c9b3938 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 15 Jul 2021 14:29:31 -0500 Subject: [PATCH 5/5] use `trace_back` instead of `stack_trace` which has now been deprecated. Also warn/don't save row-level metadata with datasets. --- r/NAMESPACE | 2 +- r/R/metadata.R | 42 +++++++++++++++++++++++--------- r/tests/testthat/test-metadata.R | 26 +++++++++++++++++--- 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index e616c589b46..814868d8ade 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -301,7 +301,6 @@ importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,abort) importFrom(rlang,as_label) -importFrom(rlang,call_stack) importFrom(rlang,caller_env) importFrom(rlang,dots_n) importFrom(rlang,enexpr) @@ -328,6 +327,7 @@ importFrom(rlang,quos) importFrom(rlang,seq2) importFrom(rlang,set_names) importFrom(rlang,syms) +importFrom(rlang,trace_back) importFrom(rlang,warn) importFrom(stats,median) importFrom(stats,na.exclude) diff --git a/r/R/metadata.R b/r/R/metadata.R index 72de201037d..505d0653b4a 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -50,7 +50,7 @@ }) } -#' @importFrom rlang call_stack +#' @importFrom rlang trace_back apply_arrow_r_metadata <- function(x, r_metadata) { tryCatch({ columns_metadata <- r_metadata$columns @@ -64,15 +64,19 @@ apply_arrow_r_metadata <- function(x, r_metadata) { # If we have a list and "columns_metadata" this applies row-level metadata # inside of a column in a dataframe. - # However, if we are inside of a dataset collection, we cannot apply this - # row-level metadata, since the order of the rows is not guaranteed to be - # the same, so don't even try, but warn what's going on - stack <- call_stack() - in_dataset_collect <- any(map_lgl(stack, function(x) { - !is.null(x$fn_name) && x$fn_name == "collect.arrow_dplyr_query" + # However, if we are inside of a dplyr collection (including all datasets), + # we cannot apply this row-level metadata, since the order of the rows is + # not guaranteed to be the same, so don't even try, but warn what's going on + trace <- trace_back() + in_dplyr_collect <- any(map_lgl(trace$calls, function(x) { + grepl("collect.arrow_dplyr_query", x, fixed = TRUE)[[1]] })) - if (in_dataset_collect) { - warning("Row-level metadata has been discarded") + if (in_dplyr_collect) { + warning( + "Row-level metadata is not compatible with this operation and has ", + "been ignored", + call. = FALSE + ) } else { x <- map2(x, columns_metadata, function(.x, .y) { apply_arrow_r_metadata(.x, .y) @@ -131,9 +135,23 @@ arrow_attributes <- function(x, only_top_level = FALSE) { columns <- NULL if (is.list(x) && !inherits(x, "POSIXlt")) { - # for list columns, we also keep attributes of each - # element in columns - columns <- map(x, arrow_attributes) + # However, if we are inside of a dplyr collection (including all datasets), + # we cannot apply this row-level metadata, since the order of the rows is + # not guaranteed to be the same, so don't even try, but warn what's going on + trace <- trace_back() + in_dataset_write <- any(map_lgl(trace$calls, function(x) { + grepl("write_dataset", x, fixed = TRUE)[[1]] + })) + if (in_dataset_write) { + warning( + "Row-level metadata is not compatible with datasets and will be discarded", + call. = FALSE + ) + } else { + # for list columns, we also keep attributes of each + # element in columns + columns <- map(x, arrow_attributes) + } if (all(map_lgl(columns, is.null))) { columns <- NULL } diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index fad99e22d9d..de3542b1c60 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -210,22 +210,40 @@ test_that("metadata of list elements (ARROW-10386)", { test_that("metadata of list elements (ARROW-10386)", { skip_if_not_available("dataset") skip_if_not_available("parquet") + + library(dplyr) + df <- tibble::tibble( metadata = list( structure(1, my_value_as_attr = 1), structure(2, my_value_as_attr = 2), structure(3, my_value_as_attr = 3), structure(4, my_value_as_attr = 3)), + int = 1L:4L, part = c(1, 3, 2, 1) ) dst_dir <- make_temp_dir() - write_dataset(df, dst_dir, partitioning = "part") - ds <- open_dataset(dst_dir) expect_warning( - df_from_ds <- dplyr::collect(ds), - "Row-level metadata has been discarded" + write_dataset(df, dst_dir, partitioning = "part"), + "Row-level metadata is not compatible with datasets and will be discarded" ) + # but we need to write a dataset with row-level metadata to make sure when + # reading ones that have been written with them we warn appropriately + fake_func_name <- write_dataset + fake_func_name(df, dst_dir, partitioning = "part") + + ds <- open_dataset(dst_dir) + expect_warning( + df_from_ds <- collect(ds), + "Row-level metadata is not compatible with this operation and has been ignored" + ) expect_equal(df_from_ds[c(1, 4, 3, 2), ], df, check.attributes = FALSE) + + # however there is *no* warning if we don't select the metadata column + expect_warning( + df_from_ds <- ds %>% select(int) %>% collect(), + NA + ) })