From f95da5da4d8806b6fc5db2c122491900ac001ac6 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 20 Jun 2024 23:12:10 -0800 Subject: [PATCH 01/14] Fix crash in ParquetFileWriter$WriteTable --- r/R/parquet.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/parquet.R b/r/R/parquet.R index 0ee6c62601c..623d8ee908a 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -428,6 +428,7 @@ ParquetFileWriter <- R6Class("ParquetFileWriter", inherit = ArrowObject, public = list( WriteTable = function(table, chunk_size) { + assert_is(table, "Table") parquet___arrow___FileWriter__WriteTable(self, table, chunk_size) }, Close = function() parquet___arrow___FileWriter__Close(self) From 6eb400dc5b7cf58a5f96a3dc73f300c16848cff6 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 20 Jun 2024 23:12:22 -0800 Subject: [PATCH 02/14] Add WriteBatch method to ParquetFileWriter --- r/R/parquet.R | 5 +++++ r/tests/testthat/test-parquet.R | 28 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/r/R/parquet.R b/r/R/parquet.R index 623d8ee908a..be0c718f1a5 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -431,6 +431,11 @@ ParquetFileWriter <- R6Class("ParquetFileWriter", assert_is(table, "Table") parquet___arrow___FileWriter__WriteTable(self, table, chunk_size) }, + WriteBatch = function(batch, chunk_size) { + assert_is(batch, "RecordBatch") + table <- Table$create(batch) + parquet___arrow___FileWriter__WriteTable(self, table, chunk_size) + }, Close = function() parquet___arrow___FileWriter__Close(self) ) ) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index f2359116fda..cc57022600f 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -530,3 +530,31 @@ test_that("thrift string and container size can be specified when reading Parque data <- reader_container$ReadTable() expect_identical(collect.ArrowTabular(data), example_data) }) + +test_that("We can use WriteBatch on ParquetFileWriter", { + tf <- tempfile() + on.exit(unlink(tf)) + sink <- FileOutputStream$create(tf) + sch <- schema(a = int32()) + props <- ParquetWriterProperties$create(column_names = names(sch)) + writer <- ParquetFileWriter$create(schema = sch, sink = sink, properties = props) + + batch <- RecordBatch$create(data.frame(a = 1:10)) + writer$WriteBatch(batch, chunk_size = 10) + writer$WriteBatch(batch, chunk_size = 10) + writer$WriteBatch(batch, chunk_size = 10) + writer$Close() + + tbl <- read_parquet(tf) + expect_equal(nrow(tbl), 30) +}) + +test_that("WriteBatch on ParquetFileWriter errors when called on closed sink", { + sink <- FileOutputStream$create(tempfile()) + sch <- schema(a = int32()) + props <- ParquetWriterProperties$create(column_names = names(sch)) + writer <- ParquetFileWriter$create(schema = sch, sink = sink, properties = props) + writer$Close() + batch <- RecordBatch$create(data.frame(a = 1:10)) + expect_error(writer$WriteBatch(batch, chunk_size = 10), "Operation on closed file") +}) From c1ee601f960dd3af9d5f5d015bb43005b4bbef19 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 2 Jul 2024 13:25:16 -0700 Subject: [PATCH 03/14] Make WriteBatch just call WriteTable --- r/R/parquet.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/parquet.R b/r/R/parquet.R index be0c718f1a5..01d64f30655 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -431,10 +431,10 @@ ParquetFileWriter <- R6Class("ParquetFileWriter", assert_is(table, "Table") parquet___arrow___FileWriter__WriteTable(self, table, chunk_size) }, - WriteBatch = function(batch, chunk_size) { + WriteBatch = function(batch, ...) { assert_is(batch, "RecordBatch") table <- Table$create(batch) - parquet___arrow___FileWriter__WriteTable(self, table, chunk_size) + self$WriteTable(table, ...) }, Close = function() parquet___arrow___FileWriter__Close(self) ) From 7adf3fa7c4495c58d5cd2d37e0bfe8f4db1ecae8 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 2 Jul 2024 13:25:25 -0700 Subject: [PATCH 04/14] Document WriteBatch --- r/R/parquet.R | 1 + r/man/ParquetFileWriter.Rd | 1 + 2 files changed, 2 insertions(+) diff --git a/r/R/parquet.R b/r/R/parquet.R index 01d64f30655..f95a90820dc 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -419,6 +419,7 @@ ParquetWriterProperties$create <- function(column_names, #' @section Methods: #' #' - `WriteTable` Write a [Table] to `sink` +#' - `WriteBatch` Write a [Batch] to `sink` #' - `Close` Close the writer. Note: does not close the `sink`. #' [arrow::io::OutputStream][OutputStream] has its own `close()` method. #' diff --git a/r/man/ParquetFileWriter.Rd b/r/man/ParquetFileWriter.Rd index f36e85ab6c4..62c00c5e429 100644 --- a/r/man/ParquetFileWriter.Rd +++ b/r/man/ParquetFileWriter.Rd @@ -24,6 +24,7 @@ takes the following arguments: \itemize{ \item \code{WriteTable} Write a \link{Table} to \code{sink} +\item \code{WriteBatch} Write a \link{Batch} to \code{sink} \item \code{Close} Close the writer. Note: does not close the \code{sink}. \link[=OutputStream]{arrow::io::OutputStream} has its own \code{close()} method. } From 7ff1cffe46ef8b5c851b9d4f33bbe61abe4e7d96 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 3 Jul 2024 08:55:14 -0700 Subject: [PATCH 05/14] Fix doc issue --- r/R/parquet.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/parquet.R b/r/R/parquet.R index f95a90820dc..88ce1c77128 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -419,7 +419,7 @@ ParquetWriterProperties$create <- function(column_names, #' @section Methods: #' #' - `WriteTable` Write a [Table] to `sink` -#' - `WriteBatch` Write a [Batch] to `sink` +#' - `WriteBatch` Write a [RecordBatch] to `sink` #' - `Close` Close the writer. Note: does not close the `sink`. #' [arrow::io::OutputStream][OutputStream] has its own `close()` method. #' From 806e469c1e65ea04bab4b5b118de4c04b15dbe28 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 3 Jul 2024 08:55:24 -0700 Subject: [PATCH 06/14] Update ParquetFileWriter.Rd --- r/man/ParquetFileWriter.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/man/ParquetFileWriter.Rd b/r/man/ParquetFileWriter.Rd index 62c00c5e429..5779e574d46 100644 --- a/r/man/ParquetFileWriter.Rd +++ b/r/man/ParquetFileWriter.Rd @@ -24,7 +24,7 @@ takes the following arguments: \itemize{ \item \code{WriteTable} Write a \link{Table} to \code{sink} -\item \code{WriteBatch} Write a \link{Batch} to \code{sink} +\item \code{WriteBatch} Write a \link{RecordBatch} to \code{sink} \item \code{Close} Close the writer. Note: does not close the \code{sink}. \link[=OutputStream]{arrow::io::OutputStream} has its own \code{close()} method. } From 4c31dcde17772b3c8581840917f53d320cb97228 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 3 Jul 2024 12:20:01 -0700 Subject: [PATCH 07/14] Skip test on older Arrow C++ --- r/tests/testthat/helper-skip.R | 12 ++++++++++++ r/tests/testthat/test-parquet.R | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index bd290808481..472f2b5a3b0 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -110,6 +110,18 @@ skip_on_r_older_than <- function(r_version) { } } +skip_on_cpp_older_than <- function(cpp_version) { + if (force_tests()) { + return() + } + + current_version <- arrow_info()$build_info$cpp_version + + if (current_version < cpp_version) { + skip(paste("C++ version:", current_version)) + } +} + skip_on_python_older_than <- function(python_version) { if (force_tests()) { return() diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index cc57022600f..e601d648bc8 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -550,6 +550,11 @@ test_that("We can use WriteBatch on ParquetFileWriter", { }) test_that("WriteBatch on ParquetFileWriter errors when called on closed sink", { + # Skip this test if the Arrow C++ version is <15.0.0 because a check for the + # writer being in the closed state wasn't added until 15 which means this will + # segfault on lower versions. + skip_on_cpp_older_than("15.0.0") + sink <- FileOutputStream$create(tempfile()) sch <- schema(a = int32()) props <- ParquetWriterProperties$create(column_names = names(sch)) From a6312a8b6086c3db1d36e4e20943371722fc9bf0 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 9 Jul 2024 13:28:50 -0700 Subject: [PATCH 08/14] Bump minimum supported R version to 15.0.0 --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 6bd940f8067..b1f7cadeccb 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -62,7 +62,7 @@ jobs: strategy: matrix: include: - - cpp_version: "13.0.0" + - cpp_version: "15.0.0" steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 From e3c460dca8d6fb2ee32a468f7dfdacb9113afffb Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 9 Jul 2024 13:29:29 -0700 Subject: [PATCH 09/14] Remove skip_on_cpp_older_than --- r/tests/testthat/helper-skip.R | 12 ------------ r/tests/testthat/test-parquet.R | 5 ----- 2 files changed, 17 deletions(-) diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index 472f2b5a3b0..bd290808481 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -110,18 +110,6 @@ skip_on_r_older_than <- function(r_version) { } } -skip_on_cpp_older_than <- function(cpp_version) { - if (force_tests()) { - return() - } - - current_version <- arrow_info()$build_info$cpp_version - - if (current_version < cpp_version) { - skip(paste("C++ version:", current_version)) - } -} - skip_on_python_older_than <- function(python_version) { if (force_tests()) { return() diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index e601d648bc8..cc57022600f 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -550,11 +550,6 @@ test_that("We can use WriteBatch on ParquetFileWriter", { }) test_that("WriteBatch on ParquetFileWriter errors when called on closed sink", { - # Skip this test if the Arrow C++ version is <15.0.0 because a check for the - # writer being in the closed state wasn't added until 15 which means this will - # segfault on lower versions. - skip_on_cpp_older_than("15.0.0") - sink <- FileOutputStream$create(tempfile()) sch <- schema(a = int32()) props <- ParquetWriterProperties$create(column_names = names(sch)) From c41392f2599e3d9e49b74ef6b4372258938e3005 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 9 Jul 2024 13:32:25 -0700 Subject: [PATCH 10/14] Remove ARROW_VERSION_MAJOR >= 15 guard --- r/src/r_to_arrow.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index a81210f0ad9..d2db11e14a7 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1050,7 +1050,6 @@ class RDictionaryConverter> template struct RConverterTrait; -#if ARROW_VERSION_MAJOR >= 15 template struct RConverterTrait< T, enable_if_t::value && !is_interval_type::value && @@ -1062,14 +1061,6 @@ template struct RConverterTrait> { // not implemented }; -#else -template -struct RConverterTrait< - T, enable_if_t::value && !is_interval_type::value && - !is_extension_type::value>> { - using type = RPrimitiveConverter; -}; -#endif template struct RConverterTrait> { From 69f61e740bcd8af49117a687516450d0cc8ef2bc Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 9 Jul 2024 14:02:26 -0700 Subject: [PATCH 11/14] Use 15.0.2 for minimum Arrow C++ version If you use 15.0.0 here like you'd expect you could, you get root@0347b6360c69:/# apt install -y -V libarrow-dev=15.0.0-1 \ libarrow-acero-dev=15.0.0-1 \ libparquet-dev=15.0.0-1 \ libarrow-dataset-dev=15.0.0-1 Reading package lists... Done Building dependency tree... Done Reading state information... Done Some packages could not be installed. This may mean that you have requested an impossible situation or if you are using the unstable distribution that some required packages have not yet been created or been moved out of Incoming. The following information may help to resolve the situation: The following packages have unmet dependencies: libarrow-acero-dev : Depends: libarrow-acero1500 (= 15.0.0-1) but 15.0.2-1 is to be installed libarrow-dataset-dev : Depends: libarrow-dataset1500 (= 15.0.0-1) but 15.0.2-1 is to be installed libarrow-dev : Depends: libarrow1500 (= 15.0.0-1) but 15.0.2-1 is to be installed libparquet-dev : Depends: libparquet1500 (= 15.0.0-1) but 15.0.2-1 is to be installed E: Unable to correct problems, you have held broken packages. --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index b1f7cadeccb..e8f57db99c2 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -62,7 +62,7 @@ jobs: strategy: matrix: include: - - cpp_version: "15.0.0" + - cpp_version: "15.0.2" steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 From 69ea5762db3e7110701054229821754154d507ab Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 9 Jul 2024 15:41:35 -0700 Subject: [PATCH 12/14] Bump min version in check-versions.R --- r/tools/check-versions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tools/check-versions.R b/r/tools/check-versions.R index 34b2ef680c5..57c4e100f9d 100644 --- a/r/tools/check-versions.R +++ b/r/tools/check-versions.R @@ -24,7 +24,7 @@ release_version_supported <- function(r_version, cpp_version) { r_version <- package_version(r_version) cpp_version <- package_version(cpp_version) major <- function(x) as.numeric(x[1, 1]) - minimum_cpp_version <- package_version("13.0.0") + minimum_cpp_version <- package_version("15.0.0") allow_mismatch <- identical(tolower(Sys.getenv("ARROW_R_ALLOW_CPP_VERSION_MISMATCH", "false")), "true") # If we allow a version mismatch we still need to cover the minimum version (13.0.0 for now) From 203697617fa63c755dab4d16104c8ca91f7ce9a3 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 9 Jul 2024 15:41:52 -0700 Subject: [PATCH 13/14] Include NEWS.md entry --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 1e8a480ef5f..c2690e6248d 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,6 +24,7 @@ * `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. * The `na_matches` argument to the `dplyr::*_join()` functions is now supported. This argument controls whether `NA` values are considered equal when joining. (#41358) * R metadata, stored in the Arrow schema to support round-tripping data between R and Arrow/Parquet, is now serialized and deserialized more strictly. This makes it safer to load data from files from unknown sources into R data.frames. (#41969) +* The minimum version of the Arrow C++ library the Arrow R package can be built with has been bumped to 15.0.0 (#42241) # arrow 16.1.0 From c2d39d0dff6fa3f14389352b6436e94d7affa73c Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 9 Jul 2024 16:48:32 -0700 Subject: [PATCH 14/14] Bump min version in more places --- r/tools/check-versions.R | 2 +- r/tools/test-check-versions.R | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/r/tools/check-versions.R b/r/tools/check-versions.R index 57c4e100f9d..ea7fe93c524 100644 --- a/r/tools/check-versions.R +++ b/r/tools/check-versions.R @@ -27,7 +27,7 @@ release_version_supported <- function(r_version, cpp_version) { minimum_cpp_version <- package_version("15.0.0") allow_mismatch <- identical(tolower(Sys.getenv("ARROW_R_ALLOW_CPP_VERSION_MISMATCH", "false")), "true") - # If we allow a version mismatch we still need to cover the minimum version (13.0.0 for now) + # If we allow a version mismatch we still need to cover the minimum version (15.0.0 for now) # we don't allow newer C++ versions as new features without additional feature gates are likely to # break the R package version_valid <- cpp_version >= minimum_cpp_version && major(cpp_version) <= major(r_version) diff --git a/r/tools/test-check-versions.R b/r/tools/test-check-versions.R index f558648bed1..14c0bee3fd8 100644 --- a/r/tools/test-check-versions.R +++ b/r/tools/test-check-versions.R @@ -61,16 +61,24 @@ test_that("check_versions without mismatch", { test_that("check_versions with mismatch", { withr::local_envvar(.new = c(ARROW_R_ALLOW_CPP_VERSION_MISMATCH = "false")) + expect_true( + release_version_supported("15.0.0", "15.0.0") + ) + expect_false( release_version_supported("15.0.0", "13.0.0") ) withr::local_envvar(.new = c(ARROW_R_ALLOW_CPP_VERSION_MISMATCH = "true")) - expect_true( + expect_false( release_version_supported("15.0.0", "13.0.0") ) + expect_true( + release_version_supported("16.0.0", "15.0.0") + ) + expect_false( release_version_supported("15.0.0", "16.0.0") )