diff --git a/r/R/dataset-factory.R b/r/R/dataset-factory.R
index c289cf0c8a6..30622b8a6d0 100644
--- a/r/R/dataset-factory.R
+++ b/r/R/dataset-factory.R
@@ -107,7 +107,9 @@ DatasetFactory$create <- function(x,
 #' @param ... Additional format-specific options, passed to
 #' `FileFormat$create()`. For CSV options, note that you can specify them either
 #' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
-#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.)
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if you
+#' encounter one that `arrow` should support.
 #' @return A `DatasetFactory` object. Pass this to [open_dataset()],
 #' in a list potentially with other `DatasetFactory` objects, to create
 #' a `Dataset`.
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index eb57b893e0c..f1bf601c720 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -42,7 +42,9 @@
 #'
 #' `format = "text"`: see [CsvReadOptions]. Note that you can specify them either
 #' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
-#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.)
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if
+#' you encounter one that `arrow` should support.
 #'
 #' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
 #' @rdname FileFormat
@@ -103,13 +105,89 @@ CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...))
   dataset___CsvFileFormat__Make(opts)
 }
 
+# Support both readr-style option names and Arrow C++ option names
+#
+# `...` holds the named parse options. A name may be abbreviated as long as it
+# is a prefix of exactly one option name across both naming styles. Returns
+# the result of readr_to_csv_parse_options(...) when readr-style names are
+# used and of CsvParseOptions$create(...) otherwise. Stops with an error when
+# an option is supported by read_delim_arrow() but not here, is unrecognized,
+# is ambiguous, or when the two naming styles are mixed
 csv_file_format_parse_options <- function(...) {
-  # Support both the readr spelling of options and the arrow spelling
-  readr_opts <- c("delim", "quote", "escape_double", "escape_backslash", "skip_empty_rows")
-  if (any(readr_opts %in% names(list(...)))) {
-    readr_to_csv_parse_options(...)
+  opt_names <- names(list(...))
+  # Catch any readr-style options specified with full option names that are
+  # supported by read_delim_arrow() (and its wrappers) but are not yet
+  # supported here
+  unsup_readr_opts <- setdiff(
+    names(formals(read_delim_arrow)),
+    names(formals(readr_to_csv_parse_options))
+  )
+  is_unsup_opt <- opt_names %in% unsup_readr_opts
+  unsup_opts <- opt_names[is_unsup_opt]
+  if (length(unsup_opts) > 0) {
+    stop(
+      "The following ",
+      ngettext(length(unsup_opts), "option is ", "options are "),
+      "supported in \"read_delim_arrow\" functions ",
+      "but not yet supported here: ",
+      oxford_paste(unsup_opts),
+      call. = FALSE
+    )
+  }
+  # Catch any options whose names are neither a full name nor a prefix of any
+  # recognized Arrow C++ or readr-style option name. A plain prefix test is
+  # used rather than pmatch() because pmatch() also returns NA for a partial
+  # name that matches several options of one style (such as "esc", which
+  # matches both "escape_double" and "escape_backslash"); such names are
+  # ambiguous rather than unrecognized and are reported by the next check
+  arrow_opts <- names(formals(CsvParseOptions$create))
+  readr_opts <- names(formals(readr_to_csv_parse_options))
+  all_opts <- c(arrow_opts, readr_opts)
+  is_known_opt <- vapply(
+    opt_names,
+    function(nm) nzchar(nm) && any(substr(all_opts, 1L, nchar(nm)) == nm),
+    logical(1),
+    USE.NAMES = FALSE
+  )
+  unrec_opts <- opt_names[!is_known_opt]
+  if (length(unrec_opts) > 0) {
+    stop(
+      "Unrecognized ",
+      ngettext(length(unrec_opts), "option", "options"),
+      ": ",
+      oxford_paste(unrec_opts),
+      call. = FALSE
+    )
+  }
+  # Catch options with ambiguous partial names (such as "del") that make it
+  # unclear whether the user is specifying Arrow C++ options ("delimiter") or
+  # readr-style options ("delim"). duplicates.ok = TRUE matches each name
+  # independently of the other names supplied in the same call
+  is_ambig_opt <- is.na(pmatch(opt_names, all_opts, duplicates.ok = TRUE))
+  ambig_opts <- opt_names[is_ambig_opt]
+  if (length(ambig_opts) > 0) {
+    stop(
+      "Ambiguous ",
+      ngettext(length(ambig_opts), "option", "options"),
+      ": ",
+      oxford_paste(ambig_opts),
+      ". Use full argument names",
+      call. = FALSE
+    )
+  }
+  is_readr_opt <- !is.na(pmatch(opt_names, readr_opts, duplicates.ok = TRUE))
+  if (any(is_readr_opt)) {
+    # Catch cases when the user specifies a mix of Arrow C++ options and
+    # readr-style options
+    if (!all(is_readr_opt)) {
+      stop(
+        "Use either Arrow parse options or readr parse options, not both",
+        call. = FALSE
+      )
+    }
+    readr_to_csv_parse_options(...) # all options have readr-style names
   } else {
-    CsvParseOptions$create(...)
+    CsvParseOptions$create(...) # all options have Arrow C++ names
   }
 }
 
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
index c17e98bce55..9bb6f9b0608 100644
--- a/r/man/FileFormat.Rd
+++ b/r/man/FileFormat.Rd
@@ -37,7 +37,9 @@ to reduce memory overhead. Disabled by default.
 
 \code{format = "text"}: see \link{CsvReadOptions}. Note that you can specify them either
 with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
-\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.)
+\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
+Not all \code{readr} options are currently supported; please file an issue if
+you encounter one that \code{arrow} should support.
 }
 
 It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat})
diff --git a/r/man/dataset_factory.Rd b/r/man/dataset_factory.Rd
index 3216c5b0a16..4bf41021267 100644
--- a/r/man/dataset_factory.Rd
+++ b/r/man/dataset_factory.Rd
@@ -53,7 +53,9 @@ Hive-style path segments
 \item{...}{Additional format-specific options, passed to
 \code{FileFormat$create()}. For CSV options, note that you can specify them either
 with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
-\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.)}
+\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
+Not all \code{readr} options are currently supported; please file an issue if you +encounter one that \code{arrow} should support.} } \value{ A \code{DatasetFactory} object. Pass this to \code{\link[=open_dataset]{open_dataset()}}, diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 5bdbc42410e..a8902ab8988 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -303,11 +303,65 @@ test_that("Other text delimited dataset", { filter(integer > 6) %>% summarize(mean = mean(integer)) ) +}) + +test_that("readr parse options", { + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + + # Arrow and readr parse options must be mutually exclusive, or else the code + # in `csv_file_format_parse_options()` will error or behave incorrectly. A + # failure of this test indicates that these two sets of option names are not + # mutually exclusive. + expect_equal( + intersect(arrow_opts, readr_opts), + character(0) + ) - # Now with readr option spelling (and omitting format = "text") - ds3 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t") + # With not yet supported readr parse options (ARROW-8631) + expect_error( + open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"), + "supported" + ) + + # With unrecognized (garbage) parse options + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + asdfg = "\\" + ), + "Unrecognized" + ) + + # With both Arrow and readr parse options (disallowed) + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + quote = "\"", + quoting = TRUE + ), + "either" + ) + + # With ambiguous partial option names (disallowed) + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + quo = "\"", + ), + "Ambiguous" + ) + + # With only readr parse options (and omitting format = "text") + ds1 <- open_dataset(tsv_dir, 
partitioning = "part", delim = "\t") expect_equivalent( - ds3 %>% + ds1 %>% select(string = chr, integer = int, part) %>% filter(integer > 6 & part == 5) %>% collect() %>%