From e381a416c1183c69e50add86a7ce3ef9145fde27 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 8 Jul 2022 13:26:54 -0400 Subject: [PATCH 1/4] ARROW-16715: [R] Bump default parquet version --- r/NAMESPACE | 1 + r/R/arrow-package.R | 2 +- r/R/enums.R | 2 +- r/R/parquet.R | 100 ++++++++++++----------- r/man/enums.Rd | 2 +- r/man/write_parquet.Rd | 46 ++++++----- r/tests/testthat/_snaps/dataset-write.md | 2 +- r/tests/testthat/test-parquet.R | 52 ++++++++++-- 8 files changed, 127 insertions(+), 80 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 5762df9eb0c..023e9bb8318 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -395,6 +395,7 @@ importFrom(rlang,"%||%") importFrom(rlang,":=") importFrom(rlang,.data) importFrom(rlang,abort) +importFrom(rlang,arg_match) importFrom(rlang,as_function) importFrom(rlang,as_label) importFrom(rlang,as_quosure) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 7b59854f1e1..05270ef6bb7 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -23,7 +23,7 @@ #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure -#' @importFrom rlang is_list call2 is_empty as_function as_label +#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/enums.R b/r/R/enums.R index 17d0484b997..727ca9388c3 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -122,7 +122,7 @@ FileType <- enum("FileType", #' @export #' @rdname enums ParquetVersionType <- enum("ParquetVersionType", - PARQUET_1_0 = 0L, PARQUET_2_0 = 1L + PARQUET_1_0 = 0L, PARQUET_2_0 = 1L, PARQUET_2_4 = 2L, PARQUET_2_6 = 3L ) #' @export diff --git a/r/R/parquet.R b/r/R/parquet.R index 62da28fd1ea..2a009fdf05c 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -83,28 +83,34 @@ read_parquet <- function(file, #' @param sink A string file path, URI, or [OutputStream], or path in a file #' system (`SubTreeFileSystem`) #' @param chunk_size how many rows of data to write to disk at once. This -#' directly corresponds to how many rows will be in each row group in parquet. -#' If `NULL`, a best guess will be made for optimal size (based on the number of -#' columns and number of rows), though if the data has fewer than 250 million -#' cells (rows x cols), then the total number of rows is used. -#' @param version parquet version, "1.0" or "2.0". Default "1.0". Numeric values -#' are coerced to character. +#' directly corresponds to how many rows will be in each row group in +#' parquet. If `NULL`, a best guess will be made for optimal size (based on +#' the number of columns and number of rows), though if the data has fewer +#' than 250 million cells (rows x cols), then the total number of rows is +#' used. +#' @param version parquet version: "1.0", "2.0" (deprecated), "2.4" (default), +#' "2.6", or "latest" (currently equivalent to 2.6). Numeric values are +#' coerced to character. #' @param compression compression algorithm. Default "snappy". See details. -#' @param compression_level compression level. Meaning depends on compression algorithm -#' @param use_dictionary Specify if we should use dictionary encoding. Default `TRUE` -#' @param write_statistics Specify if we should write statistics. Default `TRUE` +#' @param compression_level compression level. Meaning depends on compression +#' algorithm +#' @param use_dictionary logical: use dictionary encoding? Default `TRUE` +#' @param write_statistics logical: include statistics? Default `TRUE` #' @param data_page_size Set a target threshold for the approximate encoded #' size of data pages within a column chunk (in bytes). Default 1 MiB. -#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format. Default `FALSE`. +#' @param use_deprecated_int96_timestamps logical: write timestamps to INT96 +#' Parquet format, which has been deprecated? Default `FALSE`. #' @param coerce_timestamps Cast timestamps a particular resolution. Can be #' `NULL`, "ms" or "us". Default `NULL` (no casting) -#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a -#' particular resolution. E.g. if microsecond or nanosecond data is lost when coercing -#' to "ms", do not raise an exception -#' @param properties A `ParquetWriterProperties` object, used instead of the options -#' enumerated in this function's signature. Providing `properties` as an argument -#' is deprecated; if you need to assemble `ParquetWriterProperties` outside -#' of `write_parquet()`, use `ParquetFileWriter` instead. +#' @param allow_truncated_timestamps logical: Allow loss of data when coercing +#' timestamps to a particular resolution. E.g. if microsecond or nanosecond +#' data is lost when coercing to "ms", do not raise an exception. Default +#' `FALSE`. +#' @param properties A `ParquetWriterProperties` object, used instead of the +#' options enumerated in this function's signature. Providing `properties` +#' as an argument is deprecated; if you need to assemble +#' `ParquetWriterProperties` outside of `write_parquet()`, use +#' `ParquetFileWriter` instead. #' @param arrow_properties A `ParquetArrowWriterProperties` object. Like #' `properties`, this argument is deprecated. #' @@ -143,7 +149,7 @@ write_parquet <- function(x, sink, chunk_size = NULL, # writer properties - version = NULL, + version = "2.4", compression = default_parquet_compression(), compression_level = NULL, use_dictionary = NULL, @@ -152,9 +158,7 @@ write_parquet <- function(x, # arrow writer properties use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, - allow_truncated_timestamps = FALSE, - properties = NULL, - arrow_properties = NULL) { + allow_truncated_timestamps = FALSE) { x_out <- x x <- as_writable_table(x) @@ -163,24 +167,10 @@ write_parquet <- function(x, on.exit(sink$close()) } - # Deprecation warnings - if (!is.null(properties)) { - warning( - "Providing 'properties' is deprecated. If you need to assemble properties outside ", - "this function, use ParquetFileWriter instead." - ) - } - if (!is.null(arrow_properties)) { - warning( - "Providing 'arrow_properties' is deprecated. If you need to assemble arrow_properties ", - "outside this function, use ParquetFileWriter instead." - ) - } - writer <- ParquetFileWriter$create( x$schema, sink, - properties = properties %||% ParquetWriterProperties$create( + properties = ParquetWriterProperties$create( names(x), version = version, compression = compression, @@ -189,7 +179,7 @@ write_parquet <- function(x, write_statistics = write_statistics, data_page_size = data_page_size ), - arrow_properties = arrow_properties %||% ParquetArrowWriterProperties$create( + arrow_properties = ParquetArrowWriterProperties$create( use_deprecated_int96_timestamps = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, allow_truncated_timestamps = allow_truncated_timestamps @@ -238,19 +228,35 @@ ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps valid_parquet_version <- c( "1.0" = ParquetVersionType$PARQUET_1_0, - "2.0" = ParquetVersionType$PARQUET_2_0 + "2.0" = ParquetVersionType$PARQUET_2_0, + "2.4" = ParquetVersionType$PARQUET_2_4, + "2.6" = ParquetVersionType$PARQUET_2_6, + "latest" = ParquetVersionType$PARQUET_2_6 ) -make_valid_version <- function(version, valid_versions = valid_parquet_version) { +make_valid_parquet_version <- function(version, valid_versions = valid_parquet_version) { if (is_integerish(version)) { - version <- as.character(version) + version <- as.numeric(version) } - tryCatch( - valid_versions[[match.arg(version, choices = names(valid_versions))]], - error = function(cond) { - stop('"version" should be one of ', oxford_paste(names(valid_versions), "or"), call. = FALSE) - } - ) + if (is.numeric(version)) { + version <- format(version, nsmall = 1) + } + + if (!is.string(version)) { + stop( + '`version` must be one of ', oxford_paste(names(valid_versions), "or"), + call. = FALSE + ) + } + out <- valid_versions[[arg_match(version, values = names(valid_versions))]] + + if (identical(out, ParquetVersionType$PARQUET_2_0)) { + warning( + 'Parquet format version "2.0" is deprecated. Use "2.4" or "2.6" to select format features.', + call. = FALSE + ) + } + out } #' @title ParquetWriterProperties class @@ -300,7 +306,7 @@ ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder", inherit = ArrowObject, public = list( set_version = function(version) { - parquet___WriterProperties___Builder__version(self, make_valid_version(version)) + parquet___WriterProperties___Builder__version(self, make_valid_parquet_version(version)) }, set_compression = function(column_names, compression) { compression <- compression_from_name(compression) diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 7ec126a0198..614c196fdee 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -36,7 +36,7 @@ An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4. -An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2. +An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 4. An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5. diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index efc6856e5e8..5b08db05565 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -8,7 +8,7 @@ write_parquet( x, sink, chunk_size = NULL, - version = NULL, + version = "2.4", compression = default_parquet_compression(), compression_level = NULL, use_dictionary = NULL, @@ -16,9 +16,7 @@ write_parquet( data_page_size = NULL, use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, - allow_truncated_timestamps = FALSE, - properties = NULL, - arrow_properties = NULL + allow_truncated_timestamps = FALSE ) } \arguments{ @@ -28,38 +26,44 @@ write_parquet( system (\code{SubTreeFileSystem})} \item{chunk_size}{how many rows of data to write to disk at once. This -directly corresponds to how many rows will be in each row group in parquet. -If \code{NULL}, a best guess will be made for optimal size (based on the number of -columns and number of rows), though if the data has fewer than 250 million -cells (rows x cols), then the total number of rows is used.} +directly corresponds to how many rows will be in each row group in +parquet. If \code{NULL}, a best guess will be made for optimal size (based on +the number of columns and number of rows), though if the data has fewer +than 250 million cells (rows x cols), then the total number of rows is +used.} -\item{version}{parquet version, "1.0" or "2.0". Default "1.0". Numeric values -are coerced to character.} +\item{version}{parquet version: "1.0", "2.0" (deprecated), "2.4" (default), +"2.6", or "latest" (currently equivalent to 2.6). Numeric values are +coerced to character.} \item{compression}{compression algorithm. Default "snappy". See details.} -\item{compression_level}{compression level. Meaning depends on compression algorithm} +\item{compression_level}{compression level. Meaning depends on compression +algorithm} -\item{use_dictionary}{Specify if we should use dictionary encoding. Default \code{TRUE}} +\item{use_dictionary}{logical: use dictionary encoding? Default \code{TRUE}} -\item{write_statistics}{Specify if we should write statistics. Default \code{TRUE}} +\item{write_statistics}{logical: include statistics? Default \code{TRUE}} \item{data_page_size}{Set a target threshold for the approximate encoded size of data pages within a column chunk (in bytes). Default 1 MiB.} -\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format. Default \code{FALSE}.} +\item{use_deprecated_int96_timestamps}{logical: write timestamps to INT96 +Parquet format, which has been deprecated? Default \code{FALSE}.} \item{coerce_timestamps}{Cast timestamps a particular resolution. Can be \code{NULL}, "ms" or "us". Default \code{NULL} (no casting)} -\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a -particular resolution. E.g. if microsecond or nanosecond data is lost when coercing -to "ms", do not raise an exception} +\item{allow_truncated_timestamps}{logical: Allow loss of data when coercing +timestamps to a particular resolution. E.g. if microsecond or nanosecond +data is lost when coercing to "ms", do not raise an exception. Default +\code{FALSE}.} -\item{properties}{A \code{ParquetWriterProperties} object, used instead of the options -enumerated in this function's signature. Providing \code{properties} as an argument -is deprecated; if you need to assemble \code{ParquetWriterProperties} outside -of \code{write_parquet()}, use \code{ParquetFileWriter} instead.} +\item{properties}{A \code{ParquetWriterProperties} object, used instead of the +options enumerated in this function's signature. Providing \code{properties} +as an argument is deprecated; if you need to assemble +\code{ParquetWriterProperties} outside of \code{write_parquet()}, use +\code{ParquetFileWriter} instead.} \item{arrow_properties}{A \code{ParquetArrowWriterProperties} object. Like \code{properties}, this argument is deprecated.} diff --git a/r/tests/testthat/_snaps/dataset-write.md b/r/tests/testthat/_snaps/dataset-write.md index 34cb46e9cb7..e9ca7e09989 100644 --- a/r/tests/testthat/_snaps/dataset-write.md +++ b/r/tests/testthat/_snaps/dataset-write.md @@ -45,5 +45,5 @@ write_dataset(df, dst_dir, format = "parquet", nonsensical_arg = "blah-blah") Error `nonsensical_arg` is not a valid argument for your chosen `format`. - i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, `allow_truncated_timestamps`, `properties`, and `arrow_properties`. + i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, and `allow_truncated_timestamps`. diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index 482f508575e..1f73ad7192a 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -129,15 +129,51 @@ test_that("write_parquet() can truncate timestamps", { expect_equal(as.data.frame(tab), as.data.frame(new)) }) -test_that("make_valid_version()", { - expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0) - expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0) +test_that("make_valid_parquet_version()", { + expect_equal( + make_valid_parquet_version("1.0"), + ParquetVersionType$PARQUET_1_0 + ) + expect_deprecated( + expect_equal( + make_valid_parquet_version("2.0"), + ParquetVersionType$PARQUET_2_0 + ) + ) + expect_equal( + make_valid_parquet_version("2.4"), + ParquetVersionType$PARQUET_2_4 + ) + expect_equal( + make_valid_parquet_version("2.6"), + ParquetVersionType$PARQUET_2_6 + ) + expect_equal( + make_valid_parquet_version("latest"), + ParquetVersionType$PARQUET_2_6 + ) - expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0) - expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0) + expect_equal(make_valid_parquet_version(1), ParquetVersionType$PARQUET_1_0) + expect_deprecated( + expect_equal(make_valid_parquet_version(2), ParquetVersionType$PARQUET_2_0) + ) + expect_equal(make_valid_parquet_version(1.0), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_parquet_version(2.4), ParquetVersionType$PARQUET_2_4) +}) - expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0) - expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0) +test_that("make_valid_parquet_version() input validation", { + expect_error( + make_valid_parquet_version("0.3.14"), + '`version` must be one of' + ) + expect_error( + make_valid_parquet_version(NULL), + '`version` must be one of' + ) + expect_error( + make_valid_parquet_version(c("2", "4")), + '`version` must be one of' + ) }) test_that("write_parquet() defaults to snappy compression", { @@ -239,7 +275,7 @@ test_that("write_parquet() handles version argument", { tf <- tempfile() on.exit(unlink(tf)) - purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ { + purrr::walk(list("1.0", "2.4", "2.6", "latest", 1.0, 2.4, 2.6, 1L), ~ { write_parquet(df, tf, version = .x) expect_identical(read_parquet(tf), df) }) From c27f59e501dd2f5aec96b24f5169527e70aca178 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 8 Jul 2022 13:30:42 -0400 Subject: [PATCH 2/4] Actually remove docs for deprecated args --- r/R/parquet.R | 11 ++--------- r/man/write_parquet.Rd | 12 +++--------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/r/R/parquet.R b/r/R/parquet.R index 2a009fdf05c..8cd9daa8575 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -106,13 +106,6 @@ read_parquet <- function(file, #' timestamps to a particular resolution. E.g. if microsecond or nanosecond #' data is lost when coercing to "ms", do not raise an exception. Default #' `FALSE`. -#' @param properties A `ParquetWriterProperties` object, used instead of the -#' options enumerated in this function's signature. Providing `properties` -#' as an argument is deprecated; if you need to assemble -#' `ParquetWriterProperties` outside of `write_parquet()`, use -#' `ParquetFileWriter` instead. -#' @param arrow_properties A `ParquetArrowWriterProperties` object. Like -#' `properties`, this argument is deprecated. #' #' @details The parameters `compression`, `compression_level`, `use_dictionary` and #' `write_statistics` support various patterns: @@ -134,7 +127,7 @@ read_parquet <- function(file, #' Note that "uncompressed" columns may still have dictionary encoding. #' #' @return the input `x` invisibly. -#' +#' @seealso [ParquetFileWriter] for a lower-level interface to Parquet writing. #' @examplesIf arrow_with_parquet() #' tf1 <- tempfile(fileext = ".parquet") #' write_parquet(data.frame(x = 1:5), tf1) @@ -244,7 +237,7 @@ make_valid_parquet_version <- function(version, valid_versions = valid_parquet_v if (!is.string(version)) { stop( - '`version` must be one of ', oxford_paste(names(valid_versions), "or"), + "`version` must be one of ", oxford_paste(names(valid_versions), "or"), call. = FALSE ) } diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index 5b08db05565..ff57e4c8e9a 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -58,15 +58,6 @@ Parquet format, which has been deprecated? Default \code{FALSE}.} timestamps to a particular resolution. E.g. if microsecond or nanosecond data is lost when coercing to "ms", do not raise an exception. Default \code{FALSE}.} - -\item{properties}{A \code{ParquetWriterProperties} object, used instead of the -options enumerated in this function's signature. Providing \code{properties} -as an argument is deprecated; if you need to assemble -\code{ParquetWriterProperties} outside of \code{write_parquet()}, use -\code{ParquetFileWriter} instead.} - -\item{arrow_properties}{A \code{ParquetArrowWriterProperties} object. Like -\code{properties}, this argument is deprecated.} } \value{ the input \code{x} invisibly. @@ -114,3 +105,6 @@ if (codec_is_available("gzip")) { } \dontshow{\}) # examplesIf} } +\seealso{ +\link{ParquetFileWriter} for a lower-level interface to Parquet writing. +} From f375949f1951b78641fa28124fe99a1812da4c6b Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 8 Jul 2022 13:35:33 -0400 Subject: [PATCH 3/4] Add NEWS bullet --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 45a963ca48e..3d8391092e8 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -23,6 +23,7 @@ * `orders` with year, month, day, hours, minutes, and seconds components are supported. * the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`). * `read_arrow()` and `write_arrow()`, deprecated since 1.0.0 (July 2020), have been removed. Use the `read/write_feather()` and `read/write_ipc_stream()` functions depending on whether you're working with the Arrow IPC file or stream format, respectively. +* `write_parquet()` now defaults to writing Parquet format version 2.4 (was 1.0). Previously deprecated arguments `properties` and `arrow_properties` have been removed; if you need to deal with these lower-level properties objects directly, use `ParquetFileWriter`, which `write_parquet()` wraps. # arrow 8.0.0 From 1776407331668227ff1ae1c2c359bc51b4eae94a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 8 Jul 2022 15:00:56 -0400 Subject: [PATCH 4/4] s/'/"/g Co-authored-by: Dewey Dunnington --- r/tests/testthat/test-parquet.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index 1f73ad7192a..b75892bc843 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -164,15 +164,15 @@ test_that("make_valid_parquet_version()", { test_that("make_valid_parquet_version() input validation", { expect_error( make_valid_parquet_version("0.3.14"), - '`version` must be one of' + "`version` must be one of" ) expect_error( make_valid_parquet_version(NULL), - '`version` must be one of' + "`version` must be one of" ) expect_error( make_valid_parquet_version(c("2", "4")), - '`version` must be one of' + "`version` must be one of" ) })