Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions r/R/dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -235,3 +235,8 @@ source_data <- function(x) {
}

is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query")

has_aggregation <- function(x) {
# TODO: update with joins (check right side data too)
!is.null(x$aggregations) || (is_collapsed(x) && has_aggregation(x$.data))
}
27 changes: 26 additions & 1 deletion r/R/query-engine.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,36 @@ do_exec_plan <- function(.data) {
final_node <- plan$Build(.data)
tab <- plan$Run(final_node)

# If arrange() created $temp_columns, make sure to omit them from the result
# We can't currently handle this in the ExecPlan itself because sorting
# happens in the end (SinkNode) so nothing comes after it.
if (length(final_node$sort$temp_columns) > 0) {
# If arrange() created $temp_columns, make sure to omit them from the result
tab <- tab[, setdiff(names(tab), final_node$sort$temp_columns), drop = FALSE]
}

if (ncol(tab)) {
# Apply any column metadata from the original schema, where appropriate
original_schema <- source_data(.data)$schema
# TODO: do we care about other (non-R) metadata preservation?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we detect that it is there and warn/message that we've discarded it? It might be a little bit confusing, but I think that would be preferable to "Arrow lost all my metadata!"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already do discard other metadata when you convert from Arrow to R. Check out the example parquet file in inst/, for example: it has "pandas" metadata. I suspect it would be more surprising/alarming if we started messaging about this metadata all of the sudden.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, Arrow to R conversion losing the metadata sounds like what I expected. TBH I am a little surprised that the following happens silently, since the tab below does have the pandas metadata there. Though that's been in Arrow for a while and if no one has complained, I guess it's not that important.

> tab <- read_parquet(system.file("v0.7.1.parquet", package = "arrow"), as_data_frame = FALSE)
> 
> dplyr::select(tab, carat)$metadata
NULL

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, on 5.0:

new <- tab %>% 
  select(carat) %>% 
  compute()
new$metadata

$pandas
[1] "{\"index_columns\": [\"__index_level_0__\"], \"column_indexes\": [{\"name\":
...

I still don't think this is necessarily meaningful--especially when you're dealing with multiple files with potentially different metadata.

# How would we know if it were meaningful?
r_meta <- original_schema$metadata$r
if (!is.null(r_meta)) {
r_meta <- .unserialize_arrow_r_metadata(r_meta)
# Filter r_metadata$columns on columns with name _and_ type match
new_schema <- tab$schema
common_names <- intersect(names(r_meta$columns), names(tab))
keep <- common_names[
map_lgl(common_names, ~ original_schema[[.]] == new_schema[[.]])
]
r_meta$columns <- r_meta$columns[keep]
if (has_aggregation(.data)) {
# dplyr drops top-level attributes if you do summarize
r_meta$attributes <- NULL
}
tab$metadata$r <- .serialize_arrow_r_metadata(r_meta)
}
}

tab
}

Expand Down
72 changes: 62 additions & 10 deletions r/tests/testthat/test-metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
# specific language governing permissions and limitations
# under the License.

context("Schema metadata and R attributes")
# local_edition(3)

test_that("Schema metadata", {
s <- schema(b = double())
expect_equivalent(s$metadata, list())
expect_equal(s$metadata, empty_named_list())
expect_false(s$HasMetadata)
s$metadata <- list(test = TRUE)
expect_identical(s$metadata, list(test = "TRUE"))
Expand All @@ -31,7 +31,7 @@ test_that("Schema metadata", {
expect_identical(s$metadata, list(test = "TRUE"))
expect_true(s$HasMetadata)
s$metadata <- NULL
expect_equivalent(s$metadata, list())
expect_equal(s$metadata, empty_named_list())
expect_false(s$HasMetadata)
expect_error(
s$metadata <- 4,
Expand All @@ -41,15 +41,15 @@ test_that("Schema metadata", {

test_that("Table metadata", {
tab <- Table$create(x = 1:2, y = c("a", "b"))
expect_equivalent(tab$metadata, list())
expect_equal(tab$metadata, empty_named_list())
tab$metadata <- list(test = TRUE)
expect_identical(tab$metadata, list(test = "TRUE"))
tab$metadata$foo <- 42
expect_identical(tab$metadata, list(test = "TRUE", foo = "42"))
tab$metadata$foo <- NULL
expect_identical(tab$metadata, list(test = "TRUE"))
tab$metadata <- NULL
expect_equivalent(tab$metadata, list())
expect_equal(tab$metadata, empty_named_list())
})

test_that("Table R metadata", {
Expand Down Expand Up @@ -126,15 +126,15 @@ test_that("Metadata serialization compression", {

test_that("RecordBatch metadata", {
rb <- RecordBatch$create(x = 1:2, y = c("a", "b"))
expect_equivalent(rb$metadata, list())
expect_equal(rb$metadata, empty_named_list())
rb$metadata <- list(test = TRUE)
expect_identical(rb$metadata, list(test = "TRUE"))
rb$metadata$foo <- 42
expect_identical(rb$metadata, list(test = "TRUE", foo = "42"))
rb$metadata$foo <- NULL
expect_identical(rb$metadata, list(test = "TRUE"))
rb$metadata <- NULL
expect_equivalent(rb$metadata, list())
expect_equal(rb$metadata, empty_named_list())
})

test_that("RecordBatch R metadata", {
Expand Down Expand Up @@ -212,6 +212,7 @@ test_that("metadata of list elements (ARROW-10386)", {
skip_if_not_available("dataset")
skip_if_not_available("parquet")

local_edition(3)
library(dplyr)

df <- tibble::tibble(
Expand Down Expand Up @@ -239,14 +240,65 @@ test_that("metadata of list elements (ARROW-10386)", {
ds <- open_dataset(dst_dir)
expect_warning(
df_from_ds <- collect(ds),
NA # TODO: ARROW-13852
# "Row-level metadata is not compatible with this operation and has been ignored"
"Row-level metadata is not compatible with this operation and has been ignored"
)
expect_equal(
arrange(df_from_ds, int),
arrange(df, int),
ignore_attr = TRUE
)
expect_equal(arrange(df_from_ds, int), arrange(df, int), check.attributes = FALSE)

# however there is *no* warning if we don't select the metadata column
expect_warning(
df_from_ds <- ds %>% select(int) %>% collect(),
NA
)
})

test_that("dplyr with metadata", {
skip_if_not_available("dataset")

expect_dplyr_equal(
input %>%
collect(),
example_with_metadata
)
expect_dplyr_equal(
input %>%
select(a) %>%
collect(),
example_with_metadata
)
expect_dplyr_equal(
input %>%
mutate(z = b * 4) %>%
select(z, a) %>%
collect(),
example_with_metadata
)
expect_dplyr_equal(
input %>%
mutate(z = nchar(a)) %>%
select(z, a) %>%
collect(),
example_with_metadata
)
# dplyr drops top-level attributes if you do summarize, though attributes
# of grouping columns appear to come through
expect_dplyr_equal(
input %>%
group_by(a) %>%
summarize(n()) %>%
collect(),
example_with_metadata
)
# Same name in output but different data, so the column metadata shouldn't
# carry through
expect_dplyr_equal(
input %>%
mutate(a = nchar(a)) %>%
select(a) %>%
collect(),
example_with_metadata
)
})