From 29b9e32fff1bd4d8b653e4a7bd6fe924d651ef73 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Mon, 5 Apr 2021 11:07:43 -0400 Subject: [PATCH 01/11] fix merge in arrow11766v3 --- r/R/feather.R | 7 +++++-- r/R/parquet.R | 10 ++++++++-- r/R/util.R | 13 +++++++++++++ r/tests/testthat/test-feather.R | 14 ++++++++++++++ 4 files changed, 40 insertions(+), 4 deletions(-) diff --git a/r/R/feather.R b/r/R/feather.R index 637ce23234a..e72f2eab504 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -147,14 +147,17 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { file <- make_readable_file(file) on.exit(file$close()) } - reader <- FeatherReader$create(file, ...) + reader <- FeatherReader$create(file) col_select <- enquo(col_select) columns <- if (!quo_is_null(col_select)) { vars_select(names(reader), !!col_select) } - out <- reader$Read(columns) + out <- tryCatch( + reader$Read(columns), + error = function(e) { read_compressed_error(e) } + ) if (isTRUE(as_data_frame)) { out <- as.data.frame(out) diff --git a/r/R/parquet.R b/r/R/parquet.R index 45751b16170..689324113c2 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -52,10 +52,16 @@ read_parquet <- function(file, schema <- reader$GetSchema() names <- names(schema) indices <- match(vars_select(names, !!col_select), names) - 1L - tab <- reader$ReadTable(indices) + tab <- tryCatch( + reader$ReadTable(indices), + error = function(e) { read_compressed_error(e) } + ) } else { # read all columns - tab <- reader$ReadTable() + tab <- tryCatch( + reader$ReadTable(), + error = function(e) { read_compressed_error(e) } + ) } if (as_data_frame) { diff --git a/r/R/util.R b/r/R/util.R index 4680381e909..d5a3bc8ab80 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -86,3 +86,16 @@ all_names <- function(expr) { is_constant <- function(expr) { length(all_vars(expr)) == 0 } + +read_compressed_error <- function(e) { + e <- as.character(e) + compression <- sub(".*Support for codec '(.*)'.*", "\\1", e) + msg <- c( + sprintf("Unsupported compressed format %s", compression), + "\nTry setting the environment variable LIBARROW_MINIMAL=false and reinstalling", + "\nfor a more complete installation ", + sprintf("(including %s) or setting", compression), + sprintf("\nARROW_WITH_%s=ON and reinstalling to enable support for this codec.", toupper(compression)) + ) + stop(msg, call. = FALSE) +} diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 52325c7f410..30a92f5e302 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -196,3 +196,17 @@ test_that("Character vectors > 2GB can write to feather", { }) unlink(feather_file) + +ft_file <- test_path("golden-files/data-arrow_2.0.0_lz4.feather") + +test_that("Error messages are shown when the compression algorithm lz4 is not found", { + if (codec_is_available("lz4")) { + d <- read_feather(ft_file) + expect_is(d, "data.frame") + } else { + expect_error( + read_feather(ft_file), + "Unsupported compressed format" + ) + } +}) From 2f29d58d195c125b39b28c66010e3fb3b1246a76 Mon Sep 17 00:00:00 2001 From: Pachamaltese Date: Mon, 5 Apr 2021 17:59:59 -0400 Subject: [PATCH 02/11] Update r/R/parquet.R Co-authored-by: Neal Richardson --- r/R/parquet.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/parquet.R b/r/R/parquet.R index 689324113c2..7e72074f32e 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -54,7 +54,7 @@ read_parquet <- function(file, indices <- match(vars_select(names, !!col_select), names) - 1L tab <- tryCatch( reader$ReadTable(indices), - error = function(e) { read_compressed_error(e) } + error = read_compressed_error ) } else { # read all columns From 0e72f3e3fb1b01750d48676bc5f067602da02974 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Mon, 5 Apr 2021 18:59:37 -0400 Subject: [PATCH 03/11] requested changes in https://github.com/apache/arrow/pull/9893#discussion_r607342996 and https://github.com/apache/arrow/pull/9893#discussion_r607344455 --- r/R/feather.R | 2 +- r/R/parquet.R | 2 +- r/tests/testthat/test-feather.R | 7 +++++++ r/tests/testthat/test-parquet.R | 19 +++++++++++++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/r/R/feather.R b/r/R/feather.R index e72f2eab504..cc6c3b10e18 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -156,7 +156,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { out <- tryCatch( reader$Read(columns), - error = function(e) { read_compressed_error(e) } + error = read_compressed_error ) if (isTRUE(as_data_frame)) { diff --git a/r/R/parquet.R b/r/R/parquet.R index 7e72074f32e..169d9f57f52 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -60,7 +60,7 @@ read_parquet <- function(file, # read all columns tab <- tryCatch( reader$ReadTable(), - error = function(e) { read_compressed_error(e) } + error = read_compressed_error ) } diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 30a92f5e302..a6b74f2e2f5 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -210,3 +210,10 @@ test_that("Error messages are shown when the compression algorithm lz4 is not fo ) } }) + +test_that("Error is created when feather reads a parquet file", { + expect_error( + read_feather(system.file("v0.7.1.parquet", package = "arrow")), + "Not a Feather V1 or Arrow IPC file" + ) +}) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index 4ac356f004d..9ef5a10b549 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -234,3 +234,22 @@ test_that("ParquetFileReader $ReadRowGroup(s) methods", { expect_true(reader$ReadRowGroups(c(0, 1), 0) == Table$create(x = 1:20)) expect_error(reader$ReadRowGroups(c(0, 1), 1)) }) + +test_that("Error messages are shown when the compression algorithm snappy is not found", { + if (codec_is_available("snappy")) { + d <- read_parquet(pq_file) + expect_is(d, "data.frame") + } else { + expect_error( + read_feather(pq_file), + "Unsupported compressed format" + ) + } +}) + +test_that("Error is created when parquet reads a feather file", { + expect_error( + read_parquet(test_path("golden-files/data-arrow_2.0.0_lz4.feather")), + "Parquet magic bytes not found in footer" + ) +}) From 008bfc08a763c302fbb9f004a3a4c6bf7dcc0bf1 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Mon, 5 Apr 2021 19:18:46 -0400 Subject: [PATCH 04/11] changes asked in https://github.com/apache/arrow/pull/9893#discussion_r607363061 --- r/tests/testthat/test-feather.R | 12 ++++++++---- r/tests/testthat/test-parquet.R | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index a6b74f2e2f5..f91391e9104 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -200,14 +200,18 @@ unlink(feather_file) ft_file <- test_path("golden-files/data-arrow_2.0.0_lz4.feather") test_that("Error messages are shown when the compression algorithm lz4 is not found", { + msg <- c( + "Unsupported compressed format lz4", + "Try setting the environment variable LIBARROW_MINIMAL=false and reinstalling", + "for a more complete installation (including lz4) or setting", + "ARROW_WITH_LZ4=ON and reinstalling to enable support for this codec." + ) + if (codec_is_available("lz4")) { d <- read_feather(ft_file) expect_is(d, "data.frame") } else { - expect_error( - read_feather(ft_file), - "Unsupported compressed format" - ) + expect_error(read_feather(ft_file), paste(msg, collapse = "\n"), fixed = TRUE) } }) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index 9ef5a10b549..563dd3ec688 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -236,14 +236,18 @@ test_that("ParquetFileReader $ReadRowGroup(s) methods", { }) test_that("Error messages are shown when the compression algorithm snappy is not found", { + msg <- c( + "Unsupported compressed format snappy", + "Try setting the environment variable LIBARROW_MINIMAL=false and reinstalling", + "for a more complete installation (including snappy) or setting", + "ARROW_WITH_SNAPPY=ON and reinstalling to enable support for this codec." + ) + if (codec_is_available("snappy")) { d <- read_parquet(pq_file) expect_is(d, "data.frame") } else { - expect_error( - read_feather(pq_file), - "Unsupported compressed format" - ) + expect_error(read_parquet(pq_file), paste(msg, collapse = "\n"), fixed = TRUE) } }) From 0ecb176ef3b0c88f939835905c78e2ce941dc4ea Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Mon, 5 Apr 2021 23:21:34 -0400 Subject: [PATCH 05/11] compressed error in the style of https://github.com/apache/arrow/pull/9899/files#diff-f6235d4767fc4a7ee1bb726d816b9742ef0bc07503dceb678fd3bc55ee15454bR90-R96 --- r/R/util.R | 21 +++++++++++---------- r/tests/testthat/test-feather.R | 15 +++++++++------ r/tests/testthat/test-parquet.R | 15 +++++++++------ 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index d5a3bc8ab80..5b8efe3bb0e 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -88,14 +88,15 @@ is_constant <- function(expr) { } read_compressed_error <- function(e) { - e <- as.character(e) - compression <- sub(".*Support for codec '(.*)'.*", "\\1", e) - msg <- c( - sprintf("Unsupported compressed format %s", compression), - "\nTry setting the environment variable LIBARROW_MINIMAL=false and reinstalling", - "\nfor a more complete installation ", - sprintf("(including %s) or setting", compression), - sprintf("\nARROW_WITH_%s=ON and reinstalling to enable support for this codec.", toupper(compression)) - ) - stop(msg, call. = FALSE) + msg <- conditionMessage(e) + if (grepl(" codec ", msg)) { + compression <- sub(".*Support for codec '(.*)'.*", "\\1", msg) + e$message <- paste0( + msg, + "\nTry setting the environment variable LIBARROW_MINIMAL=false and reinstalling", + sprintf(" for a more complete installation (including '%s') or setting", compression), + sprintf(" ARROW_WITH_%s=ON and reinstalling to enable support for this codec.", toupper(compression)) + ) + } + stop(e) } diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index f91391e9104..52ef5dc0d0c 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -200,18 +200,21 @@ unlink(feather_file) ft_file <- test_path("golden-files/data-arrow_2.0.0_lz4.feather") test_that("Error messages are shown when the compression algorithm lz4 is not found", { - msg <- c( - "Unsupported compressed format lz4", - "Try setting the environment variable LIBARROW_MINIMAL=false and reinstalling", - "for a more complete installation (including lz4) or setting", - "ARROW_WITH_LZ4=ON and reinstalling to enable support for this codec." + msg <- paste0( + "NotImplemented: Support for codec 'lz4' not built", + paste( + "\nTry setting the environment variable LIBARROW_MINIMAL=false and reinstalling", + "for a more complete installation (including 'lz4') or setting", + "ARROW_WITH_LZ4=ON and reinstalling to enable support for this codec.", + collapse = " " + ) ) if (codec_is_available("lz4")) { d <- read_feather(ft_file) expect_is(d, "data.frame") } else { - expect_error(read_feather(ft_file), paste(msg, collapse = "\n"), fixed = TRUE) + expect_error(read_feather(ft_file), msg, fixed = TRUE) } }) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index 563dd3ec688..2f3be527947 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -236,18 +236,21 @@ test_that("ParquetFileReader $ReadRowGroup(s) methods", { }) test_that("Error messages are shown when the compression algorithm snappy is not found", { - msg <- c( - "Unsupported compressed format snappy", - "Try setting the environment variable LIBARROW_MINIMAL=false and reinstalling", - "for a more complete installation (including snappy) or setting", - "ARROW_WITH_SNAPPY=ON and reinstalling to enable support for this codec." + msg <- paste0( + "NotImplemented: Support for codec 'snappy' not built", + paste( + "\nTry setting the environment variable LIBARROW_MINIMAL=false and reinstalling", + "for a more complete installation (including 'snappy') or setting", + "ARROW_WITH_SNAPPY=ON and reinstalling to enable support for this codec.", + collapse = " " + ) ) if (codec_is_available("snappy")) { d <- read_parquet(pq_file) expect_is(d, "data.frame") } else { - expect_error(read_parquet(pq_file), paste(msg, collapse = "\n"), fixed = TRUE) + expect_error(read_parquet(pq_file), msg, fixed = TRUE) } }) From 30f052aa5f2281fdc005ae8f57ce04c1412ad8b8 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Tue, 6 Apr 2021 12:39:26 -0400 Subject: [PATCH 06/11] make readable file --- r/R/feather.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/feather.R b/r/R/feather.R index cc6c3b10e18..6da15c004c0 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -144,7 +144,7 @@ write_feather <- function(x, #' } read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { if (!inherits(file, "RandomAccessFile")) { - file <- make_readable_file(file) + file <- make_readable_file(file, ...) on.exit(file$close()) } reader <- FeatherReader$create(file) From 3859a176f38e0bf85055fb186e30dde9c93eca75 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Tue, 6 Apr 2021 22:42:48 -0400 Subject: [PATCH 07/11] https://github.com/apache/arrow/pull/9893#discussion_r608160062 --- r/R/feather.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/feather.R b/r/R/feather.R index 6da15c004c0..00fd1345e09 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -125,7 +125,7 @@ write_feather <- function(x, #' #' @inheritParams read_ipc_stream #' @inheritParams read_delim_arrow -#' @param ... additional parameters, passed to [FeatherReader$create()][FeatherReader] +#' @param ... additional parameters. #' #' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an #' Arrow [Table] otherwise From 9fa8bd7b3c9d45ee58bc37a638811fd64517ee65 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Wed, 7 Apr 2021 10:29:24 -0400 Subject: [PATCH 08/11] feather additional args --- r/R/feather.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/feather.R b/r/R/feather.R index 00fd1345e09..5bb3792e777 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -125,7 +125,8 @@ write_feather <- function(x, #' #' @inheritParams read_ipc_stream #' @inheritParams read_delim_arrow -#' @param ... additional parameters. +#' @param ... additional parameters, passed to the internal function +#' `make_readable_file()` #' #' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an #' Arrow [Table] otherwise From a426a4dae5ca4bf4160275e5bfa3a371a00f55cd Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Wed, 7 Apr 2021 10:41:15 -0400 Subject: [PATCH 09/11] updated docs --- r/man/read_feather.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index c5467c3a22f..0489b9d088a 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -21,7 +21,8 @@ of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or an Arrow \link{Table}?} -\item{...}{additional parameters, passed to \link[=FeatherReader]{FeatherReader$create()}} +\item{...}{additional parameters, passed to the internal function +\code{make_readable_file()}} } \value{ A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an From 1ea9141036551909028d7a5129461b48e65b58b2 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Thu, 8 Apr 2021 00:16:40 -0400 Subject: [PATCH 10/11] make_readable_file linked in Rd --- r/R/feather.R | 3 +-- r/man/read_feather.Rd | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/r/R/feather.R b/r/R/feather.R index 5bb3792e777..7c545eeda56 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -125,8 +125,7 @@ write_feather <- function(x, #' #' @inheritParams read_ipc_stream #' @inheritParams read_delim_arrow -#' @param ... additional parameters, passed to the internal function -#' `make_readable_file()` +#' @param ... additional parameters, passed to [make_readable_file()]. #' #' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an #' Arrow [Table] otherwise diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index 0489b9d088a..fe3a7f1e23d 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -21,8 +21,7 @@ of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or an Arrow \link{Table}?} -\item{...}{additional parameters, passed to the internal function -\code{make_readable_file()}} +\item{...}{additional parameters, passed to \code{\link[=make_readable_file]{make_readable_file()}}.} } \value{ A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an From 2daa458c75fee59a59286d796bb7d21b4c827da7 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Fri, 9 Apr 2021 16:29:21 -0400 Subject: [PATCH 11/11] Neal's error msg --- r/R/util.R | 8 +++++--- r/tests/testthat/test-feather.R | 10 +--------- r/tests/testthat/test-parquet.R | 10 +--------- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index 35d335050a4..f5b505f352a 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -93,9 +93,11 @@ read_compressed_error <- function(e) { compression <- sub(".*Support for codec '(.*)'.*", "\\1", msg) e$message <- paste0( msg, - "\nTry setting the environment variable LIBARROW_MINIMAL=false and reinstalling", - sprintf(" for a more complete installation (including '%s') or setting", compression), - sprintf(" ARROW_WITH_%s=ON and reinstalling to enable support for this codec.", toupper(compression)) + "\nIn order to read this file, you will need to reinstall arrow with additional features enabled.", + "\nSet one of these environment variables before installing:", + sprintf("\n\n * LIBARROW_MINIMAL=false (for all optional features, including '%s')", compression), + sprintf("\n * ARROW_WITH_%s=ON (for just '%s')", toupper(compression), compression), + "\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details" ) } stop(e) diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 52ef5dc0d0c..e33fda64641 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -200,15 +200,7 @@ unlink(feather_file) ft_file <- test_path("golden-files/data-arrow_2.0.0_lz4.feather") test_that("Error messages are shown when the compression algorithm lz4 is not found", { - msg <- paste0( - "NotImplemented: Support for codec 'lz4' not built", - paste( - "\nTry setting the environment variable LIBARROW_MINIMAL=false and reinstalling", - "for a more complete installation (including 'lz4') or setting", - "ARROW_WITH_LZ4=ON and reinstalling to enable support for this codec.", - collapse = " " - ) - ) + msg <- "NotImplemented: Support for codec 'lz4' not built\nIn order to read this file, you will need to reinstall arrow with additional features enabled.\nSet one of these environment variables before installing:\n\n * LIBARROW_MINIMAL=false (for all optional features, including 'lz4')\n * ARROW_WITH_LZ4=ON (for just 'lz4')\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details" if (codec_is_available("lz4")) { d <- read_feather(ft_file) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index 2f3be527947..14e7aa78e05 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -236,15 +236,7 @@ test_that("ParquetFileReader $ReadRowGroup(s) methods", { }) test_that("Error messages are shown when the compression algorithm snappy is not found", { - msg <- paste0( - "NotImplemented: Support for codec 'snappy' not built", - paste( - "\nTry setting the environment variable LIBARROW_MINIMAL=false and reinstalling", - "for a more complete installation (including 'snappy') or setting", - "ARROW_WITH_SNAPPY=ON and reinstalling to enable support for this codec.", - collapse = " " - ) - ) + msg <- "NotImplemented: Support for codec 'snappy' not built\nIn order to read this file, you will need to reinstall arrow with additional features enabled.\nSet one of these environment variables before installing:\n\n * LIBARROW_MINIMAL=false (for all optional features, including 'snappy')\n * ARROW_WITH_SNAPPY=ON (for just 'snappy')\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details" if (codec_is_available("snappy")) { d <- read_parquet(pq_file)