From 297c63dc94c71618b457f1eb66a47ea25236faa6 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 8 Jan 2021 13:49:55 -0500 Subject: [PATCH 1/8] Use formals() to get supported readr parse options --- r/R/dataset-format.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index eb57b893e0c..67e1dd15a66 100644 --- a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -105,7 +105,7 @@ CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...)) csv_file_format_parse_options <- function(...) { # Support both the readr spelling of options and the arrow spelling - readr_opts <- c("delim", "quote", "escape_double", "escape_backslash", "skip_empty_rows") + readr_opts <- names(formals(readr_to_csv_parse_options)) if (any(readr_opts %in% names(list(...)))) { readr_to_csv_parse_options(...) } else { From a2aedde36fadd1df3bfe233c93dc398150fea68b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 8 Jan 2021 15:47:21 -0500 Subject: [PATCH 2/8] Improve checking of parse options --- r/R/dataset-format.R | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index 67e1dd15a66..1c320beafb4 100644 --- a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -104,9 +104,31 @@ CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...)) } csv_file_format_parse_options <- function(...) { + opt_names <- names(list(...)) # Support both the readr spelling of options and the arrow spelling + arrow_opts <- names(formals(CsvParseOptions$create)) readr_opts <- names(formals(readr_to_csv_parse_options)) - if (any(readr_opts %in% names(list(...)))) { + is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts)) + is_readr_opt <- !is.na(pmatch(opt_names, readr_opts)) + bad_opts <- opt_names[!is_arrow_opt & !is_readr_opt] + if (length(bad_opts)) { + stop("Unsupported options: ", + paste(bad_opts, collapse = ", "), + call. = FALSE) + } + is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts))) + ambig_opts <- opt_names[is_ambig_opt] + if (length(ambig_opts)) { + stop("Ambiguous arguments: ", + paste(ambig_opts, collapse = ", "), + ". Use full argument names", + call. = FALSE) + } + if (any(is_readr_opt)) { + if (!all(is_readr_opt)) { + stop("Use either Arrow parse options or readr parse options, not both", + call. = FALSE) + } readr_to_csv_parse_options(...) } else { CsvParseOptions$create(...) From 707357fc878af942be701c30e5aff633e35dfa6b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 8 Jan 2021 15:55:23 -0500 Subject: [PATCH 3/8] Add readr parse option tests --- r/tests/testthat/test-dataset.R | 49 +++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 5bdbc42410e..aae4ee63e43 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -303,11 +303,54 @@ test_that("Other text delimited dataset", { filter(integer > 6) %>% summarize(mean = mean(integer)) ) +}) + +test_that("readr parse options", { + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + + # Arrow and readr options are mutually exclusive + expect_equal( + intersect(arrow_opts, readr_opts), + character(0) + ) - # Now with readr option spelling (and omitting format = "text") - ds3 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t") + # With unsupported readr parse options + # (remove this after ARROW-8631) + if (!"na" %in% readr_opts) { + expect_error( + open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"), + "Unsupported" + ) + } + + # With both Arrow and readr parse options (disallowed) + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + quote = "\"", + quoting = TRUE + ), + "either" + ) + + # With ambiguous partial option names (disallowed) + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + quo = "\"", + ), + "Ambiguous" + ) + + # With only readr parse options (and omitting format = "text") + ds1 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t") expect_equivalent( - ds3 %>% + ds1 %>% select(string = chr, integer = int, part) %>% filter(integer > 6 & part == 5) %>% collect() %>% From 09706a758edfe2062349e06768e354c472682c62 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 11 Jan 2021 13:27:07 -0500 Subject: [PATCH 4/8] Mention unsupported readr options in docs --- r/R/dataset-factory.R | 4 +++- r/R/dataset-format.R | 4 +++- r/man/FileFormat.Rd | 4 +++- r/man/dataset_factory.Rd | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/r/R/dataset-factory.R b/r/R/dataset-factory.R index c289cf0c8a6..30622b8a6d0 100644 --- a/r/R/dataset-factory.R +++ b/r/R/dataset-factory.R @@ -107,7 +107,9 @@ DatasetFactory$create <- function(x, #' @param ... Additional format-specific options, passed to #' `FileFormat$create()`. For CSV options, note that you can specify them either #' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the -#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.) +#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.). +#' Not all `readr` options are currently supported; please file an issue if you +#' encounter one that `arrow` should support. #' @return A `DatasetFactory` object. Pass this to [open_dataset()], #' in a list potentially with other `DatasetFactory` objects, to create #' a `Dataset`. diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index 1c320beafb4..a9f17face10 100644 --- a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -42,7 +42,9 @@ #' #' `format = "text"`: see [CsvReadOptions]. Note that you can specify them either #' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the -#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.) +#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.). +#' Not all `readr` options are currently supported; please file an issue if +#' you encounter one that `arrow` should support. #' #' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`) #' @rdname FileFormat diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd index c17e98bce55..9bb6f9b0608 100644 --- a/r/man/FileFormat.Rd +++ b/r/man/FileFormat.Rd @@ -37,7 +37,9 @@ to reduce memory overhead. Disabled by default. \code{format = "text"}: see \link{CsvReadOptions}. Note that you can specify them either with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the -\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.) +\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.). +Not all \code{readr} options are currently supported; please file an issue if +you encounter one that \code{arrow} should support. } It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat}) diff --git a/r/man/dataset_factory.Rd b/r/man/dataset_factory.Rd index 3216c5b0a16..4bf41021267 100644 --- a/r/man/dataset_factory.Rd +++ b/r/man/dataset_factory.Rd @@ -53,7 +53,9 @@ Hive-style path segments \item{...}{Additional format-specific options, passed to \code{FileFormat$create()}. For CSV options, note that you can specify them either with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the -\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.)} +\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.). +Not all \code{readr} options are currently supported; please file an issue if you +encounter one that \code{arrow} should support.} } \value{ A \code{DatasetFactory} object. Pass this to \code{\link[=open_dataset]{open_dataset()}}, From 205d1e740da77d91f1c78dd7df98f01ab09115cf Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 11 Jan 2021 13:32:36 -0500 Subject: [PATCH 5/8] Implement changes discussed in apache/arrow#9143 --- r/R/dataset-format.R | 52 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index a9f17face10..f1bf601c720 100644 --- a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -105,35 +105,67 @@ CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...)) dataset___CsvFileFormat__Make(opts) } +# Support both readr-style option names and Arrow C++ option names csv_file_format_parse_options <- function(...) { opt_names <- names(list(...)) - # Support both the readr spelling of options and the arrow spelling + # Catch any readr-style options specified with full option names that are + # supported by read_delim_arrow() (and its wrappers) but are not yet + # supported here + unsup_readr_opts <- setdiff( + names(formals(read_delim_arrow)), + names(formals(readr_to_csv_parse_options)) + ) + is_unsup_opt <- opt_names %in% unsup_readr_opts + unsup_opts <- opt_names[is_unsup_opt] + if (length(unsup_opts)) { + stop( + "The following ", + ngettext(length(unsup_opts), "option is ", "options are "), + "supported in \"read_delim_arrow\" functions ", + "but not yet supported here: ", + oxford_paste(unsup_opts), + call. = FALSE + ) + } + # Catch any options with full or partial names that do not match any of the + # recognized Arrow C++ option names or readr-style option names arrow_opts <- names(formals(CsvParseOptions$create)) readr_opts <- names(formals(readr_to_csv_parse_options)) is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts)) is_readr_opt <- !is.na(pmatch(opt_names, readr_opts)) - bad_opts <- opt_names[!is_arrow_opt & !is_readr_opt] - if (length(bad_opts)) { - stop("Unsupported options: ", - paste(bad_opts, collapse = ", "), - call. = FALSE) + unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt] + if (length(unrec_opts)) { + stop( + "Unrecognized ", + ngettext(length(unrec_opts), "option", "options"), + ": ", + oxford_paste(unrec_opts), + call. = FALSE + ) } + # Catch options with ambiguous partial names (such as "del") that make it + # unclear whether the user is specifying Arrow C++ options ("delimiter") or + # readr-style options ("delim") is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts))) ambig_opts <- opt_names[is_ambig_opt] if (length(ambig_opts)) { - stop("Ambiguous arguments: ", - paste(ambig_opts, collapse = ", "), + stop("Ambiguous ", + ngettext(length(ambig_opts), "option", "options"), + ": ", + oxford_paste(ambig_opts), ". Use full argument names", call. = FALSE) } if (any(is_readr_opt)) { + # Catch cases when the user specifies a mix of Arrow C++ options and + # readr-style options if (!all(is_readr_opt)) { stop("Use either Arrow parse options or readr parse options, not both", call. = FALSE) } - readr_to_csv_parse_options(...) + readr_to_csv_parse_options(...) # all options have readr-style names } else { - CsvParseOptions$create(...) + CsvParseOptions$create(...) # all options have Arrow C++ names } } From 7312ef7d13a6f5cb4f6c4bec83ece9fae2d4d7c6 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 11 Jan 2021 13:33:40 -0500 Subject: [PATCH 6/8] Update tests --- r/tests/testthat/test-dataset.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index aae4ee63e43..4cb88b42547 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -315,15 +315,26 @@ test_that("readr parse options", { character(0) ) - # With unsupported readr parse options + # With not yet supported readr parse options # (remove this after ARROW-8631) if (!"na" %in% readr_opts) { expect_error( open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"), - "Unsupported" + "supported" ) } + # With unrecognized (garbage) parse options + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + asdfg = "\\" + ), + "Unrecognized" + ) + # With both Arrow and readr parse options (disallowed) expect_error( open_dataset( From 63ec2018c064ac35fe4439d2bb66358cb84282fc Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 11 Jan 2021 13:33:55 -0500 Subject: [PATCH 7/8] Improve comment in tests --- r/tests/testthat/test-dataset.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 4cb88b42547..066c3cdafb7 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -309,7 +309,10 @@ test_that("readr parse options", { arrow_opts <- names(formals(CsvParseOptions$create)) readr_opts <- names(formals(readr_to_csv_parse_options)) - # Arrow and readr options are mutually exclusive + # Arrow and readr parse options must be mutually exclusive, or else the code + # in `csv_file_format_parse_options()` will error or behave incorrectly. A + # failure of this test indicates that these two sets of option names are not + # mutually exclusive. expect_equal( intersect(arrow_opts, readr_opts), character(0) From b39a6a8f8b7e98a297456e035cb9ce0e2b1bad51 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 11 Jan 2021 14:50:43 -0500 Subject: [PATCH 8/8] Remove conditional around test Co-authored-by: Neal Richardson --- r/tests/testthat/test-dataset.R | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 066c3cdafb7..a8902ab8988 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -318,14 +318,11 @@ test_that("readr parse options", { character(0) ) - # With not yet supported readr parse options - # (remove this after ARROW-8631) - if (!"na" %in% readr_opts) { - expect_error( - open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"), - "supported" - ) - } + # With not yet supported readr parse options (ARROW-8631) + expect_error( + open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"), + "supported" + ) # With unrecognized (garbage) parse options expect_error(