diff --git a/dev/tasks/r/github.linux.version.compatibility.yml b/dev/tasks/r/github.linux.version.compatibility.yml new file mode 100644 index 00000000000..2f64227eb8d --- /dev/null +++ b/dev/tasks/r/github.linux.version.compatibility.yml @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# NOTE: must set "Crossbow" as name to have the badge links working in the +# github comment reports! 
+name: Crossbow + +on: + push + +jobs: + write-files: + name: "Write files" + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + env: + ARROW_R_DEV: "TRUE" + RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" + steps: + - name: Checkout Arrow + run: | + git clone --no-checkout {{ arrow.remote }} arrow + git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} + git -C arrow checkout FETCH_HEAD + git -C arrow submodule update --init --recursive + - name: Free Up Disk Space + shell: bash + run: arrow/ci/scripts/util_cleanup.sh + - name: Fetch Submodules and Tags + shell: bash + run: cd arrow && ci/scripts/util_checkout.sh + - uses: r-lib/actions/setup-r@v1 + - name: Install dependencies + run: | + install.packages(c("remotes", "glue", "sys")) + remotes::install_deps("arrow/r", dependencies = TRUE) + shell: Rscript {0} + - name: Install Arrow + run: | + cd arrow/r + R CMD INSTALL . + shell: bash + - name: Write files + run: | + cd arrow/r + R -f extra-tests/write-files.R + shell: bash + + - name: Upload the parquet artifacts + uses: actions/upload-artifact@v2 + with: + name: files + path: arrow/r/extra-tests/files + + read-files: + name: "Read files with Arrow {{ '${{ matrix.old_arrow_version }}' }}" + needs: [write-files] + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + old_arrow_version: + - "2.0.0" + - "1.0.1" + env: + RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" + OLD_ARROW_VERSION: {{ '${{ matrix.old_arrow_version }}' }} + steps: + - name: Checkout Arrow + run: | + git clone --no-checkout {{ arrow.remote }} arrow + git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} + git -C arrow checkout FETCH_HEAD + git -C arrow submodule update --init --recursive + - uses: r-lib/actions/setup-r@v1 + - name: Install old Arrow + run: | + install.packages(c("remotes", "testthat")) + remotes::install_version("arrow", "{{ '${{ matrix.old_arrow_version }}' }}") + shell: Rscript {0} + - name: Setup our 
testing directory, copy only the tests to it. + run: | + mkdir -p extra-tests/files + cp arrow/r/extra-tests/helper*.R extra-tests/ + cp arrow/r/extra-tests/test-*.R extra-tests/ + - name: Download artifacts + uses: actions/download-artifact@v2 + with: + name: files + path: extra-tests/files + - name: Test reading + run: | + testthat::test_dir("extra-tests") + shell: Rscript {0} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 569e59f80dd..79bc120cf72 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1752,6 +1752,10 @@ tasks: template: r/github.linux.cran.yml params: MATRIX: "${{ matrix.r_image }}" + + test-r-version-compatibility: + ci: github + template: r/github.linux.version.compatibility.yml test-r-rhub-ubuntu-gcc-release: ci: azure diff --git a/r/.Rbuildignore b/r/.Rbuildignore index 91a8d741a8e..cf4b7ce31ba 100644 --- a/r/.Rbuildignore +++ b/r/.Rbuildignore @@ -23,3 +23,4 @@ clang_format.sh ^autobrew$ ^apache-arrow.rb$ ^.*\.Rhistory$ +^extra-tests diff --git a/r/.gitignore b/r/.gitignore index e5ab1197071..76e8a8dd0bd 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -17,3 +17,4 @@ revdep/ vignettes/nyc-taxi/ arrow_*.tar.gz arrow_*.tgz +extra-tests/files diff --git a/r/extra-tests/helpers.R b/r/extra-tests/helpers.R new file mode 100644 index 00000000000..61b7da4ec25 --- /dev/null +++ b/r/extra-tests/helpers.R @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Compare the installed arrow package version against `version`.
+#
+# version: the version to compare with.
+# op: binary comparison function; it receives the installed version as its
+#   first argument and `version` as its second. Defaults to `==`.
+# Returns whatever `op` returns.
+if_version <- function(version, op = `==`) {
+  installed <- packageVersion("arrow")
+  op(installed, version)
+}
+
+# Call skip(msg) when the installed arrow version is lower than `version`;
+# otherwise do nothing.
+skip_if_version_less_than <- function(version, msg) {
+  too_old <- if_version(version, `<`)
+  if (too_old) {
+    skip(msg)
+  }
+}
+
+# Call skip(msg) when the installed arrow version equals `version`;
+# otherwise do nothing.
+skip_if_version_equals <- function(version, msg) {
+  # if_version() compares with `==` by default
+  same_version <- if_version(version)
+  if (same_version) {
+    skip(msg)
+  }
+}
diff --git a/r/extra-tests/test-read-files.R b/r/extra-tests/test-read-files.R
new file mode 100644
index 00000000000..90efce3d791
--- /dev/null
+++ b/r/extra-tests/test-read-files.R
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(arrow)
+library(testthat)
+
+# These tests read the files produced by write-files.R (written with the
+# version of arrow under development) using whichever arrow version is
+# installed, and check that the data and its metadata come back intact.
+
+# Assert that `df` carries the metadata of the example data: it is a tibble,
+# has the `top_level` attribute, and columns `a` and `c` kept their
+# column-level attributes.
+expect_example_metadata <- function(df) {
+  expect_s3_class(df, "tbl")
+
+  expect_equal(
+    attributes(df),
+    list(
+      names = letters[1:4],
+      row.names = 1L,
+      top_level = list(
+        field_one = 12,
+        field_two = "more stuff"
+      ),
+      class = c("tbl_df", "tbl", "data.frame")
+    )
+  )
+
+  # column-level attributes
+  expect_equal(attributes(df$a), list(class = "special_string"))
+  expect_equal(
+    attributes(df$c),
+    list(
+      row.names = 1L,
+      names = c("c1", "c2", "c3"),
+      class = c("tbl_df", "tbl", "data.frame")
+    )
+  )
+}
+
+### Parquet
+pq_file <- "files/ex_data.parquet"
+
+test_that("Can read the file (parquet)", {
+  # We can read with no error, we assert metadata below
+  expect_error(
+    df <- read_parquet(pq_file),
+    NA
+  )
+})
+
+test_that("Can see the metadata (parquet)", {
+  skip_if_version_less_than("2.0.0", "Version 1.0.1 can't read new version metadata.")
+
+  df <- read_parquet(pq_file)
+  expect_example_metadata(df)
+})
+
+### Feather
+for (comp in c("lz4", "uncompressed", "zstd")) {
+  feather_file <- paste0("files/ex_data_", comp, ".feather")
+
+  test_that(paste0("Can read the file (feather ", comp, ")"), {
+    # We can read with no error, we assert metadata below
+    expect_error(
+      df <- read_feather(feather_file),
+      NA
+    )
+  })
+
+  test_that(paste0("Can see the metadata (feather ", comp, ")"), {
+    skip_if_version_less_than("2.0.0", "Version 1.0.1 can't read new version metadata.")
+
+    df <- read_feather(feather_file)
+    expect_example_metadata(df)
+  })
+}
+
+test_that("Can read feather version 1", {
+  feather_v1_file <- "files/ex_data_v1.feather"
+
+  df <- read_feather(feather_v1_file)
+  expect_s3_class(df, "tbl")
+
+  # The v1 file is written without column `c`, and no `top_level` attribute
+  # is expected on the result.
+  expect_equal(
+    attributes(df),
+    list(
+      names = c("a", "b", "d"),
+      class = c("tbl_df", "tbl", "data.frame"),
+      row.names = 1L
+    )
+  )
+})
+
+### IPC Stream
+stream_file <- "files/ex_data.stream"
+
+# The description names the stream format so a failure here is not reported
+# under the same name as the Parquet read test above.
+test_that("Can read the file (stream)", {
+  # We can read with no error, we assert metadata below
+  expect_error(
+    df <- read_ipc_stream(stream_file),
+    NA
+  )
+})
+
+test_that("Can see the metadata (stream)", {
+  skip_if_version_less_than("2.0.0", "Version 1.0.1 can't read new version metadata.")
+
+  df <- read_ipc_stream(stream_file)
+  expect_example_metadata(df)
+})
+
+
diff --git a/r/extra-tests/write-files.R b/r/extra-tests/write-files.R
new file mode 100644
index 00000000000..e0927ead4eb
--- /dev/null
+++ b/r/extra-tests/write-files.R
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Writes the example data (with metadata) to extra-tests/files in each
+# supported format: Parquet, Feather (one file per compression codec plus a
+# version 1 file) and IPC stream. Paths are relative to the working
+# directory, which must contain both extra-tests/ and tests/testthat/.
+
+library(arrow)
+
+if (!dir.exists("extra-tests/files")) {
+  dir.create("extra-tests/files")
+}
+
+# Provides `example_with_metadata`
+source("tests/testthat/helper-data.R")
+
+write_parquet(example_with_metadata, "extra-tests/files/ex_data.parquet")
+
+for (comp in c("lz4", "uncompressed", "zstd")) {
+  # Skip only the codec that is unavailable. `break` here would abandon the
+  # loop and leave every later codec's file unwritten as well.
+  if (!codec_is_available(comp)) next
+
+  name <- paste0("extra-tests/files/ex_data_", comp, ".feather")
+  write_feather(example_with_metadata, name, compression = comp)
+}
+
+# The version 1 Feather file is written without column `c`
+example_with_metadata_v1 <- example_with_metadata
+example_with_metadata_v1$c <- NULL
+write_feather(example_with_metadata_v1, "extra-tests/files/ex_data_v1.feather", version = 1)
+
+write_ipc_stream(example_with_metadata, "extra-tests/files/ex_data.stream")
diff --git a/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather b/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather
new file mode 100644
index 00000000000..d91acd0cc9e
Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather differ
diff --git a/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather b/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather
new file mode 100644
index 00000000000..0198024ec74
Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather differ
diff --git a/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather b/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather
new file mode 100644
index 00000000000..f6788231c8a
Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather differ
diff --git a/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet b/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet
new file mode 100644
index 00000000000..e1d589bf099
Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet differ
diff --git
a/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather b/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather new file mode 100644 index 00000000000..f3a71435a6c Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather differ diff --git a/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather b/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather new file mode 100644 index 00000000000..1188ac66959 Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather differ diff --git a/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather b/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather new file mode 100644 index 00000000000..056b26c1743 Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather differ diff --git a/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet b/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet new file mode 100644 index 00000000000..6c59115608c Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet differ diff --git a/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather b/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather new file mode 100644 index 00000000000..b65da723466 Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather differ diff --git a/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather b/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather new file mode 100644 index 00000000000..508903cb49c Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather differ diff --git a/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather b/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather new file mode 100644 index 00000000000..39c829fda20 Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather differ 
diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R index a810aa36781..26b1cf0e108 100644 --- a/r/tests/testthat/helper-data.R +++ b/r/tests/testthat/helper-data.R @@ -34,12 +34,11 @@ example_with_metadata <- tibble::tibble( ), d = "four" ) -# TODO: collect top-level dataset metadata -# https://issues.apache.org/jira/browse/ARROW-9271 -# attr(example_with_metadata, "top_level") <- list( -# field_one = 12, -# field_two = "more stuff" -# ) + +attr(example_with_metadata, "top_level") <- list( + field_one = 12, + field_two = "more stuff" +) haven_data <- tibble::tibble( num = structure(c(5.1, 4.9), diff --git a/r/tests/testthat/test-backwards-compatibility.R b/r/tests/testthat/test-backwards-compatibility.R new file mode 100644 index 00000000000..73c25cdceb7 --- /dev/null +++ b/r/tests/testthat/test-backwards-compatibility.R @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+# To write a new version of a test file for a current version:
+# write_parquet(example_with_metadata, test_path("golden-files/data-arrow_2.0.0.parquet"))
+
+# To write a new version of a test file for an old version, use docker(-compose)
+# to set up a Linux distribution and use RStudio's public package manager binary
+# repo to install the old version. The following commands should be run at the
+# root of the arrow repo directory and might need slight adjustments.
+# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose build --no-cache r
+# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose run r /bin/bash
+# R
+# options(repos = "https://packagemanager.rstudio.com/all/__linux__/focal/latest")
+# remotes::install_version("arrow", version = "1.0.1")
+# # get example data into the global env
+# write_parquet(example_with_metadata, "arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet")
+# quit()/exit
+
+# Expect `object` to be identical to `expected`, optionally ignoring the
+# extra (top-level) attributes of `expected`.
+#
+# object, expected: the values handed to expect_identical().
+# ...: further arguments forwarded to expect_identical().
+# top_level: when FALSE, every attribute of `expected` other than "names",
+#   "class" and "row.names" is removed before the comparison.
+expect_identical_with_metadata <- function(object, expected, ..., top_level = TRUE) {
+  attrs_to_keep <- c("names", "class", "row.names")
+  if (!top_level) {
+    # remove not-tbl and not-data.frame attributes
+    for (attribute in names(attributes(expected))) {
+      if (attribute %in% attrs_to_keep) next
+      attributes(expected)[[attribute]] <- NULL
+    }
+  }
+  expect_identical(object, expected, ...)
+}
+
+test_that("reading a known Parquet file to dataframe with 2.0.0", {
+  skip_if_not_available("snappy")
+  pq_file <- test_path("golden-files/data-arrow_2.0.0.parquet")
+
+  df <- read_parquet(pq_file)
+  # this is equivalent to `expect_identical()`
+  expect_identical_with_metadata(df, example_with_metadata)
+})
+
+test_that("reading a known Parquet file to dataframe with 1.0.1", {
+  skip_if_not_available("snappy")
+  pq_file <- test_path("golden-files/data-arrow_1.0.1.parquet")
+
+  df <- read_parquet(pq_file)
+  # 1.0.1 didn't save top-level metadata, so we need to remove it.
+  expect_identical_with_metadata(df, example_with_metadata, top_level = FALSE)
+})
+
+for (comp in c("lz4", "uncompressed", "zstd")) {
+  # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_lz4.feather"), compression = "lz4")
+  # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_uncompressed.feather"), compression = "uncompressed")
+  # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_zstd.feather"), compression = "zstd")
+
+  # Each description includes the codec so that a failure or skip can be
+  # attributed to one specific golden file.
+  test_that(paste0("reading a known Feather file to dataframe with 2.0.0 (", comp, ")"), {
+    skip_if_not_available(comp)
+    feather_file <- test_path(paste0("golden-files/data-arrow_2.0.0_", comp, ".feather"))
+
+    df <- read_feather(feather_file)
+    expect_identical_with_metadata(df, example_with_metadata)
+  })
+
+  test_that(paste0("reading a known Feather file to dataframe with 1.0.1 (", comp, ")"), {
+    skip_if_not_available(comp)
+    feather_file <- test_path(paste0("golden-files/data-arrow_1.0.1_", comp, ".feather"))
+
+    df <- read_feather(feather_file)
+    # 1.0.1 didn't save top-level metadata, so we need to remove it.
+    expect_identical_with_metadata(df, example_with_metadata, top_level = FALSE)
+  })
+
+  test_that(paste0("reading a known Feather file to dataframe with 0.17.0 (", comp, ")"), {
+    skip_if_not_available(comp)
+    feather_file <- test_path(paste0("golden-files/data-arrow_0.17.0_", comp, ".feather"))
+
+    df <- read_feather(feather_file)
+    # The metadata from 0.17.0 doesn't have the top level, the special class
+    # is not maintained, and the embedded tibble's attributes are read in a
+    # different order. Since this is prior to 1.0.0 we punt on checking the
+    # attributes. Classes are always checked, though, so the special class
+    # must be removed from the expected data before comparing.
+    example_with_metadata_sans_special_class <- example_with_metadata
+    example_with_metadata_sans_special_class$a <- unclass(example_with_metadata_sans_special_class$a)
+    expect_equal(df, example_with_metadata_sans_special_class, check.attributes = FALSE)
+  })
+}
+
+# TODO: streams(?)