From 297c63dc94c71618b457f1eb66a47ea25236faa6 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 8 Jan 2021 13:49:55 -0500
Subject: [PATCH 1/8] Use formals() to get supported readr parse options

---
 r/R/dataset-format.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index eb57b893e0c..67e1dd15a66 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -105,7 +105,7 @@ CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...))
 
 csv_file_format_parse_options <- function(...) {
   # Support both the readr spelling of options and the arrow spelling
-  readr_opts <- c("delim", "quote", "escape_double", "escape_backslash", "skip_empty_rows")
+  readr_opts <- names(formals(readr_to_csv_parse_options))
   if (any(readr_opts %in% names(list(...)))) {
     readr_to_csv_parse_options(...)
   } else {

From a2aedde36fadd1df3bfe233c93dc398150fea68b Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 8 Jan 2021 15:47:21 -0500
Subject: [PATCH 2/8] Improve checking of parse options

---
 r/R/dataset-format.R | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index 67e1dd15a66..1c320beafb4 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -104,9 +104,31 @@ CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...))
 }
 
 csv_file_format_parse_options <- function(...) {
+  opt_names <- names(list(...))
   # Support both the readr spelling of options and the arrow spelling
+  arrow_opts <- names(formals(CsvParseOptions$create))
   readr_opts <- names(formals(readr_to_csv_parse_options))
-  if (any(readr_opts %in% names(list(...)))) {
+  is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
+  is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
+  bad_opts <- opt_names[!is_arrow_opt & !is_readr_opt]
+  if (length(bad_opts)) {
+    stop("Unsupported options: ",
+         paste(bad_opts, collapse = ", "),
+         call. = FALSE)
+  }
+  is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts)))
+  ambig_opts <- opt_names[is_ambig_opt]
+  if (length(ambig_opts)) {
+    stop("Ambiguous arguments: ",
+         paste(ambig_opts, collapse = ", "),
+         ". Use full argument names",
+         call. = FALSE)
+  }
+  if (any(is_readr_opt)) {
+    if (!all(is_readr_opt)) {
+      stop("Use either Arrow parse options or readr parse options, not both",
+           call. = FALSE)
+    }
     readr_to_csv_parse_options(...)
   } else {
     CsvParseOptions$create(...)

From 707357fc878af942be701c30e5aff633e35dfa6b Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Fri, 8 Jan 2021 15:55:23 -0500
Subject: [PATCH 3/8] Add readr parse option tests

---
 r/tests/testthat/test-dataset.R | 49 +++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index 5bdbc42410e..aae4ee63e43 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -303,11 +303,54 @@ test_that("Other text delimited dataset", {
       filter(integer > 6) %>%
       summarize(mean = mean(integer))
   )
+})
+
+test_that("readr parse options", {
+  arrow_opts <- names(formals(CsvParseOptions$create))
+  readr_opts <- names(formals(readr_to_csv_parse_options))
+
+  # Arrow and readr options are mutually exclusive
+  expect_equal(
+    intersect(arrow_opts, readr_opts),
+    character(0)
+  )
 
-  # Now with readr option spelling (and omitting format = "text")
-  ds3 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t")
+  # With unsupported readr parse options
+  # (remove this after ARROW-8631)
+  if (!"na" %in% readr_opts) {
+    expect_error(
+      open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"),
+      "Unsupported"
+    )
+  }
+
+  # With both Arrow and readr parse options (disallowed)
+  expect_error(
+    open_dataset(
+      tsv_dir,
+      partitioning = "part",
+      format = "text",
+      quote = "\"",
+      quoting = TRUE
+    ),
+    "either"
+  )
+
+  # With ambiguous partial option names (disallowed)
+  expect_error(
+    open_dataset(
+      tsv_dir,
+      partitioning = "part",
+      format = "text",
+      quo = "\"",
+    ),
+    "Ambiguous"
+  )
+
+  # With only readr parse options (and omitting format = "text")
+  ds1 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t")
   expect_equivalent(
-    ds3 %>%
+    ds1 %>%
       select(string = chr, integer = int, part) %>%
       filter(integer > 6 & part == 5) %>%
       collect() %>%

From 09706a758edfe2062349e06768e354c472682c62 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Mon, 11 Jan 2021 13:27:07 -0500
Subject: [PATCH 4/8] Mention unsupported readr options in docs

---
 r/R/dataset-factory.R    | 4 +++-
 r/R/dataset-format.R     | 4 +++-
 r/man/FileFormat.Rd      | 4 +++-
 r/man/dataset_factory.Rd | 4 +++-
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/r/R/dataset-factory.R b/r/R/dataset-factory.R
index c289cf0c8a6..30622b8a6d0 100644
--- a/r/R/dataset-factory.R
+++ b/r/R/dataset-factory.R
@@ -107,7 +107,9 @@ DatasetFactory$create <- function(x,
 #' @param ... Additional format-specific options, passed to
 #' `FileFormat$create()`. For CSV options, note that you can specify them either
 #' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
-#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.)
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if you
+#' encounter one that `arrow` should support.
 #' @return A `DatasetFactory` object. Pass this to [open_dataset()],
 #' in a list potentially with other `DatasetFactory` objects, to create
 #' a `Dataset`.
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index 1c320beafb4..a9f17face10 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -42,7 +42,9 @@
 #'
 #'   `format = "text"`: see [CsvReadOptions]. Note that you can specify them either
 #'   with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
-#'   `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.)
+#'   `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#'   Not all `readr` options are currently supported; please file an issue if
+#'   you encounter one that `arrow` should support.
 #'
 #' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
 #' @rdname FileFormat
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
index c17e98bce55..9bb6f9b0608 100644
--- a/r/man/FileFormat.Rd
+++ b/r/man/FileFormat.Rd
@@ -37,7 +37,9 @@ to reduce memory overhead. Disabled by default.
 
 \code{format = "text"}: see \link{CsvReadOptions}. Note that you can specify them either
 with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
-\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.)
+\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
+Not all \code{readr} options are currently supported; please file an issue if
+you encounter one that \code{arrow} should support.
 }
 
 It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat})
diff --git a/r/man/dataset_factory.Rd b/r/man/dataset_factory.Rd
index 3216c5b0a16..4bf41021267 100644
--- a/r/man/dataset_factory.Rd
+++ b/r/man/dataset_factory.Rd
@@ -53,7 +53,9 @@ Hive-style path segments
 \item{...}{Additional format-specific options, passed to
 \code{FileFormat$create()}. For CSV options, note that you can specify them either
 with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
-\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.)}
+\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
+Not all \code{readr} options are currently supported; please file an issue if you
+encounter one that \code{arrow} should support.}
 }
 \value{
 A \code{DatasetFactory} object. Pass this to \code{\link[=open_dataset]{open_dataset()}},

From 205d1e740da77d91f1c78dd7df98f01ab09115cf Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Mon, 11 Jan 2021 13:32:36 -0500
Subject: [PATCH 5/8] Implement changes discussed in apache/arrow#9143

---
 r/R/dataset-format.R | 52 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 10 deletions(-)

diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index a9f17face10..f1bf601c720 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -105,35 +105,67 @@ CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...))
   dataset___CsvFileFormat__Make(opts)
 }
 
+# Support both readr-style option names and Arrow C++ option names
 csv_file_format_parse_options <- function(...) {
   opt_names <- names(list(...))
-  # Support both the readr spelling of options and the arrow spelling
+  # Catch any readr-style options specified with full option names that are
+  # supported by read_delim_arrow() (and its wrappers) but are not yet
+  # supported here
+  unsup_readr_opts <- setdiff(
+    names(formals(read_delim_arrow)),
+    names(formals(readr_to_csv_parse_options))
+  )
+  is_unsup_opt <- opt_names %in% unsup_readr_opts
+  unsup_opts <- opt_names[is_unsup_opt]
+  if (length(unsup_opts)) {
+    stop(
+      "The following ",
+      ngettext(length(unsup_opts), "option is ", "options are "),
+      "supported in \"read_delim_arrow\" functions ",
+      "but not yet supported here: ",
+      oxford_paste(unsup_opts),
+      call. = FALSE
+    )
+  }
+  # Catch any options with full or partial names that do not match any of the
+  # recognized Arrow C++ option names or readr-style option names
   arrow_opts <- names(formals(CsvParseOptions$create))
   readr_opts <- names(formals(readr_to_csv_parse_options))
   is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
   is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
-  bad_opts <- opt_names[!is_arrow_opt & !is_readr_opt]
-  if (length(bad_opts)) {
-    stop("Unsupported options: ",
-         paste(bad_opts, collapse = ", "),
-         call. = FALSE)
+  unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt]
+  if (length(unrec_opts)) {
+    stop(
+      "Unrecognized ",
+      ngettext(length(unrec_opts), "option", "options"),
+      ": ",
+      oxford_paste(unrec_opts),
+      call. = FALSE
+    )
   }
+  # Catch options with ambiguous partial names (such as "del") that make it
+  # unclear whether the user is specifying Arrow C++ options ("delimiter") or
+  # readr-style options ("delim")
   is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts)))
   ambig_opts <- opt_names[is_ambig_opt]
   if (length(ambig_opts)) {
-    stop("Ambiguous arguments: ",
-         paste(ambig_opts, collapse = ", "),
+    stop("Ambiguous ",
+         ngettext(length(ambig_opts), "option", "options"),
+         ": ",
+         oxford_paste(ambig_opts),
          ". Use full argument names",
          call. = FALSE)
   }
   if (any(is_readr_opt)) {
+    # Catch cases when the user specifies a mix of Arrow C++ options and
+    # readr-style options
     if (!all(is_readr_opt)) {
       stop("Use either Arrow parse options or readr parse options, not both",
            call. = FALSE)
     }
-    readr_to_csv_parse_options(...)
+    readr_to_csv_parse_options(...) # all options have readr-style names
   } else {
-    CsvParseOptions$create(...)
+    CsvParseOptions$create(...) # all options have Arrow C++ names
   }
 }
 

From 7312ef7d13a6f5cb4f6c4bec83ece9fae2d4d7c6 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Mon, 11 Jan 2021 13:33:40 -0500
Subject: [PATCH 6/8] Update tests

---
 r/tests/testthat/test-dataset.R | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index aae4ee63e43..4cb88b42547 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -315,15 +315,26 @@ test_that("readr parse options", {
     character(0)
   )
 
-  # With unsupported readr parse options
+  # With not yet supported readr parse options
   # (remove this after ARROW-8631)
   if (!"na" %in% readr_opts) {
     expect_error(
       open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"),
-      "Unsupported"
+      "supported"
     )
   }
 
+  # With unrecognized (garbage) parse options
+  expect_error(
+    open_dataset(
+      tsv_dir,
+      partitioning = "part",
+      format = "text",
+      asdfg = "\\"
+    ),
+    "Unrecognized"
+  )
+
   # With both Arrow and readr parse options (disallowed)
   expect_error(
     open_dataset(

From 63ec2018c064ac35fe4439d2bb66358cb84282fc Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Mon, 11 Jan 2021 13:33:55 -0500
Subject: [PATCH 7/8] Improve comment in tests

---
 r/tests/testthat/test-dataset.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index 4cb88b42547..066c3cdafb7 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -309,7 +309,10 @@ test_that("readr parse options", {
   arrow_opts <- names(formals(CsvParseOptions$create))
   readr_opts <- names(formals(readr_to_csv_parse_options))
 
-  # Arrow and readr options are mutually exclusive
+  # Arrow and readr parse options must be mutually exclusive, or else the code
+  # in `csv_file_format_parse_options()` will error or behave incorrectly. A
+  # failure of this test indicates that these two sets of option names are not
+  # mutually exclusive.
   expect_equal(
     intersect(arrow_opts, readr_opts),
     character(0)

From b39a6a8f8b7e98a297456e035cb9ce0e2b1bad51 Mon Sep 17 00:00:00 2001
From: Ian Cook <ianmcook@gmail.com>
Date: Mon, 11 Jan 2021 14:50:43 -0500
Subject: [PATCH 8/8] Remove conditional around test

Co-authored-by: Neal Richardson <neal.p.richardson@gmail.com>
---
 r/tests/testthat/test-dataset.R | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index 066c3cdafb7..a8902ab8988 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -318,14 +318,11 @@ test_that("readr parse options", {
     character(0)
   )
 
-  # With not yet supported readr parse options
-  # (remove this after ARROW-8631)
-  if (!"na" %in% readr_opts) {
-    expect_error(
-      open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"),
-      "supported"
-    )
-  }
+  # With not yet supported readr parse options (ARROW-8631)
+  expect_error(
+    open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"),
+    "supported"
+  )
 
   # With unrecognized (garbage) parse options
   expect_error(