Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,7 @@ importFrom(rlang,"%||%")
importFrom(rlang,":=")
importFrom(rlang,.data)
importFrom(rlang,abort)
importFrom(rlang,arg_match)
importFrom(rlang,as_function)
importFrom(rlang,as_label)
importFrom(rlang,as_quosure)
Expand Down
1 change: 1 addition & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
* `orders` with year, month, day, hours, minutes, and seconds components are supported.
* the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`).
* `read_arrow()` and `write_arrow()`, deprecated since 1.0.0 (July 2020), have been removed. Use the `read/write_feather()` and `read/write_ipc_stream()` functions depending on whether you're working with the Arrow IPC file or stream format, respectively.
* `write_parquet()` now defaults to writing Parquet format version 2.4 (was 1.0). Previously deprecated arguments `properties` and `arrow_properties` have been removed; if you need to deal with these lower-level properties objects directly, use `ParquetFileWriter`, which `write_parquet()` wraps.

# arrow 8.0.0

Expand Down
2 changes: 1 addition & 1 deletion r/R/arrow-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match
#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
Expand Down
2 changes: 1 addition & 1 deletion r/R/enums.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ FileType <- enum("FileType",
#' @export
#' @rdname enums
# Parquet format versions accepted by the writer properties builder.
# Integer values are the codes passed through to the underlying
# implementation; see make_valid_parquet_version() for the user-facing
# string-to-enum mapping ("1.0", "2.0", "2.4", "2.6", "latest").
ParquetVersionType <- enum("ParquetVersionType",
  PARQUET_1_0 = 0L, PARQUET_2_0 = 1L, PARQUET_2_4 = 2L, PARQUET_2_6 = 3L
)

#' @export
Expand Down
99 changes: 49 additions & 50 deletions r/R/parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,30 +83,29 @@ read_parquet <- function(file,
#' @param sink A string file path, URI, or [OutputStream], or path in a file
#' system (`SubTreeFileSystem`)
#' @param chunk_size how many rows of data to write to disk at once. This
#' directly corresponds to how many rows will be in each row group in parquet.
#' If `NULL`, a best guess will be made for optimal size (based on the number of
#' columns and number of rows), though if the data has fewer than 250 million
#' cells (rows x cols), then the total number of rows is used.
#' @param version parquet version, "1.0" or "2.0". Default "1.0". Numeric values
#' are coerced to character.
#' directly corresponds to how many rows will be in each row group in
#' parquet. If `NULL`, a best guess will be made for optimal size (based on
#' the number of columns and number of rows), though if the data has fewer
#' than 250 million cells (rows x cols), then the total number of rows is
#' used.
#' @param version parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
#' "2.6", or "latest" (currently equivalent to 2.6). Numeric values are
#' coerced to character.
#' @param compression compression algorithm. Default "snappy". See details.
#' @param compression_level compression level. Meaning depends on compression algorithm
#' @param use_dictionary Specify if we should use dictionary encoding. Default `TRUE`
#' @param write_statistics Specify if we should write statistics. Default `TRUE`
#' @param compression_level compression level. Meaning depends on compression
#' algorithm
#' @param use_dictionary logical: use dictionary encoding? Default `TRUE`
#' @param write_statistics logical: include statistics? Default `TRUE`
#' @param data_page_size Set a target threshold for the approximate encoded
#' size of data pages within a column chunk (in bytes). Default 1 MiB.
#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format. Default `FALSE`.
#' @param use_deprecated_int96_timestamps logical: write timestamps to INT96
#' Parquet format, which has been deprecated? Default `FALSE`.
#' @param coerce_timestamps Cast timestamps a particular resolution. Can be
#' `NULL`, "ms" or "us". Default `NULL` (no casting)
#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a
#' particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
#' to "ms", do not raise an exception
#' @param properties A `ParquetWriterProperties` object, used instead of the options
#' enumerated in this function's signature. Providing `properties` as an argument
#' is deprecated; if you need to assemble `ParquetWriterProperties` outside
#' of `write_parquet()`, use `ParquetFileWriter` instead.
#' @param arrow_properties A `ParquetArrowWriterProperties` object. Like
#' `properties`, this argument is deprecated.
#' @param allow_truncated_timestamps logical: Allow loss of data when coercing
#' timestamps to a particular resolution. E.g. if microsecond or nanosecond
#' data is lost when coercing to "ms", do not raise an exception. Default
#' `FALSE`.
#'
#' @details The parameters `compression`, `compression_level`, `use_dictionary` and
#' `write_statistics` support various patterns:
Expand All @@ -128,7 +127,7 @@ read_parquet <- function(file,
#' Note that "uncompressed" columns may still have dictionary encoding.
#'
#' @return the input `x` invisibly.
#'
#' @seealso [ParquetFileWriter] for a lower-level interface to Parquet writing.
#' @examplesIf arrow_with_parquet()
#' tf1 <- tempfile(fileext = ".parquet")
#' write_parquet(data.frame(x = 1:5), tf1)
Expand All @@ -143,7 +142,7 @@ write_parquet <- function(x,
sink,
chunk_size = NULL,
# writer properties
version = NULL,
version = "2.4",
compression = default_parquet_compression(),
compression_level = NULL,
use_dictionary = NULL,
Expand All @@ -152,9 +151,7 @@ write_parquet <- function(x,
# arrow writer properties
use_deprecated_int96_timestamps = FALSE,
coerce_timestamps = NULL,
allow_truncated_timestamps = FALSE,
properties = NULL,
arrow_properties = NULL) {
allow_truncated_timestamps = FALSE) {
x_out <- x
x <- as_writable_table(x)

Expand All @@ -163,24 +160,10 @@ write_parquet <- function(x,
on.exit(sink$close())
}

# Deprecation warnings
if (!is.null(properties)) {
warning(
"Providing 'properties' is deprecated. If you need to assemble properties outside ",
"this function, use ParquetFileWriter instead."
)
}
if (!is.null(arrow_properties)) {
warning(
"Providing 'arrow_properties' is deprecated. If you need to assemble arrow_properties ",
"outside this function, use ParquetFileWriter instead."
)
}

writer <- ParquetFileWriter$create(
x$schema,
sink,
properties = properties %||% ParquetWriterProperties$create(
properties = ParquetWriterProperties$create(
names(x),
version = version,
compression = compression,
Expand All @@ -189,7 +172,7 @@ write_parquet <- function(x,
write_statistics = write_statistics,
data_page_size = data_page_size
),
arrow_properties = arrow_properties %||% ParquetArrowWriterProperties$create(
arrow_properties = ParquetArrowWriterProperties$create(
use_deprecated_int96_timestamps = use_deprecated_int96_timestamps,
coerce_timestamps = coerce_timestamps,
allow_truncated_timestamps = allow_truncated_timestamps
Expand Down Expand Up @@ -238,19 +221,35 @@ ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps

# Mapping from user-facing `version` strings to ParquetVersionType values.
# "latest" is an alias for the newest supported format version (currently
# the same as "2.6"). "2.0" remains accepted but is deprecated; see
# make_valid_parquet_version(), which warns when it is selected.
valid_parquet_version <- c(
  "1.0" = ParquetVersionType$PARQUET_1_0,
  "2.0" = ParquetVersionType$PARQUET_2_0,
  "2.4" = ParquetVersionType$PARQUET_2_4,
  "2.6" = ParquetVersionType$PARQUET_2_6,
  "latest" = ParquetVersionType$PARQUET_2_6
)

# Normalize a user-supplied Parquet `version` to a ParquetVersionType value.
#
# Accepts integers (1L), doubles (2.4), and strings ("2.4", "latest").
# Numeric inputs are formatted with one decimal place so that `1` and `1.0`
# both become "1.0" before matching against names(valid_versions).
#
# Errors (with the full list of valid choices) if `version` is not a single
# string after normalization, or does not match a known version. Warns if the
# deprecated "2.0" format version is selected.
make_valid_parquet_version <- function(version, valid_versions = valid_parquet_version) {
  # `2L` -> 2 so that both integer and double input take the same path below
  if (is_integerish(version)) {
    version <- as.numeric(version)
  }

  # 2.4 -> "2.4", 1 -> "1.0": nsmall = 1 guarantees one decimal place
  if (is.numeric(version)) {
    version <- format(version, nsmall = 1)
  }

  # Anything that is not a length-1 string by now (NULL, character vectors,
  # lists, ...) cannot be matched against the valid names
  if (!is.string(version)) {
    stop(
      "`version` must be one of ", oxford_paste(names(valid_versions), "or"),
      call. = FALSE
    )
  }
  out <- valid_versions[[arg_match(version, values = names(valid_versions))]]

  # "2.0" still works but is deprecated in favor of the feature-selecting
  # versions "2.4" / "2.6"
  if (identical(out, ParquetVersionType$PARQUET_2_0)) {
    warning(
      'Parquet format version "2.0" is deprecated. Use "2.4" or "2.6" to select format features.',
      call. = FALSE
    )
  }
  out
}

#' @title ParquetWriterProperties class
Expand Down Expand Up @@ -300,7 +299,7 @@ ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder",
inherit = ArrowObject,
public = list(
set_version = function(version) {
parquet___WriterProperties___Builder__version(self, make_valid_version(version))
parquet___WriterProperties___Builder__version(self, make_valid_parquet_version(version))
},
set_compression = function(column_names, compression) {
compression <- compression_from_name(compression)
Expand Down
2 changes: 1 addition & 1 deletion r/man/enums.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 23 additions & 25 deletions r/man/write_parquet.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/tests/testthat/_snaps/dataset-write.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,5 @@
write_dataset(df, dst_dir, format = "parquet", nonsensical_arg = "blah-blah")
Error <rlang_error>
`nonsensical_arg` is not a valid argument for your chosen `format`.
i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, `allow_truncated_timestamps`, `properties`, and `arrow_properties`.
i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, and `allow_truncated_timestamps`.

52 changes: 44 additions & 8 deletions r/tests/testthat/test-parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,51 @@ test_that("write_parquet() can truncate timestamps", {
expect_equal(as.data.frame(tab), as.data.frame(new))
})

test_that("make_valid_parquet_version()", {
  # String inputs map to the matching enum value
  expect_equal(
    make_valid_parquet_version("1.0"),
    ParquetVersionType$PARQUET_1_0
  )
  # "2.0" still resolves, but emits a deprecation warning
  expect_deprecated(
    expect_equal(
      make_valid_parquet_version("2.0"),
      ParquetVersionType$PARQUET_2_0
    )
  )
  expect_equal(
    make_valid_parquet_version("2.4"),
    ParquetVersionType$PARQUET_2_4
  )
  expect_equal(
    make_valid_parquet_version("2.6"),
    ParquetVersionType$PARQUET_2_6
  )
  # "latest" is an alias for the newest supported version
  expect_equal(
    make_valid_parquet_version("latest"),
    ParquetVersionType$PARQUET_2_6
  )

  # Integer and double inputs are normalized ("1" -> "1.0", 2.4 -> "2.4")
  expect_equal(make_valid_parquet_version(1), ParquetVersionType$PARQUET_1_0)
  expect_deprecated(
    expect_equal(make_valid_parquet_version(2), ParquetVersionType$PARQUET_2_0)
  )
  expect_equal(make_valid_parquet_version(1.0), ParquetVersionType$PARQUET_1_0)
  expect_equal(make_valid_parquet_version(2.4), ParquetVersionType$PARQUET_2_4)
})

test_that("make_valid_parquet_version() input validation", {
  # Unknown version strings are rejected with the list of valid choices
  expect_error(
    make_valid_parquet_version("0.3.14"),
    "`version` must be one of"
  )
  # NULL is not a string, so it fails the same validation
  expect_error(
    make_valid_parquet_version(NULL),
    "`version` must be one of"
  )
  # Vectors of length > 1 are rejected: `version` must be a single value
  expect_error(
    make_valid_parquet_version(c("2", "4")),
    "`version` must be one of"
  )
})

test_that("write_parquet() defaults to snappy compression", {
Expand Down Expand Up @@ -239,7 +275,7 @@ test_that("write_parquet() handles version argument", {
tf <- tempfile()
on.exit(unlink(tf))

purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ {
purrr::walk(list("1.0", "2.4", "2.6", "latest", 1.0, 2.4, 2.6, 1L), ~ {
write_parquet(df, tf, version = .x)
expect_identical(read_parquet(tf), df)
})
Expand Down