From 12dd08e05147778af521d452868590d77fcc01d1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 4 Jan 2022 08:10:27 -0400 Subject: [PATCH 1/3] add int96 timestamp unit to parquet reader --- r/R/arrowExports.R | 8 ++++++++ r/R/parquet.R | 6 ++++++ r/src/arrowExports.cpp | 34 +++++++++++++++++++++++++++++++++ r/src/parquet.cpp | 12 ++++++++++++ r/tests/testthat/test-parquet.R | 28 +++++++++++++++++++++++++++ 5 files changed, 88 insertions(+) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 46948097388..2980e6e5f3a 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1364,6 +1364,14 @@ parquet___arrow___ArrowReaderProperties__set_read_dictionary <- function(propert invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary`, properties, column_index, read_dict)) } +parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit <- function(properties, unit) { + invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit`, properties, unit)) +} + +parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit <- function(properties) { + .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit`, properties) +} + parquet___arrow___FileReader__OpenFile <- function(file, props) { .Call(`_arrow_parquet___arrow___FileReader__OpenFile`, file, props) } diff --git a/r/R/parquet.R b/r/R/parquet.R index 11be2c051de..3a07c224ed6 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -577,6 +577,12 @@ ParquetArrowReaderProperties <- R6Class("ParquetArrowReaderProperties", }, set_read_dictionary = function(column_index, read_dict) { parquet___arrow___ArrowReaderProperties__set_read_dictionary(self, column_index, read_dict) + }, + coerce_int96_timestamp_unit = function() { + parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit(self) + }, + set_coerce_int96_timestamp_unit = function(unit) { + parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit(self, unit) } ), active = list( diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 86b74234eca..9c7229a696a 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -5351,6 +5351,38 @@ extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_read_diction } #endif +// parquet.cpp +#if defined(ARROW_R_WITH_PARQUET) +void parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit(const std::shared_ptr& properties, arrow::TimeUnit::type unit); +extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit(SEXP properties_sexp, SEXP unit_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type properties(properties_sexp); + arrow::r::Input::type unit(unit_sexp); + parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit(properties, unit); + return R_NilValue; +END_CPP11 +} +#else +extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit(SEXP properties_sexp, SEXP unit_sexp){ + Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_PARQUET) +arrow::TimeUnit::type parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit(const std::shared_ptr& properties); +extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit(SEXP properties_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type properties(properties_sexp); + return cpp11::as_sexp(parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit(properties)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit(SEXP properties_sexp){ + Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // parquet.cpp #if defined(ARROW_R_WITH_PARQUET) std::shared_ptr parquet___arrow___FileReader__OpenFile(const std::shared_ptr& file, const std::shared_ptr& props); @@ -7618,6 +7650,8 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2}, { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2}, { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3}, + { "_arrow_parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit, 1}, { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp index 5de7ca8fac4..5ede9a60aa1 100644 --- a/r/src/parquet.cpp +++ b/r/src/parquet.cpp @@ -69,6 +69,18 @@ void parquet___arrow___ArrowReaderProperties__set_read_dictionary( properties->set_read_dictionary(column_index, read_dict); } +// [[parquet::export]] +void parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit( + const std::shared_ptr& properties, arrow::TimeUnit::type unit) { + properties->set_coerce_int96_timestamp_unit(unit); +} + +// [[parquet::export]] +arrow::TimeUnit::type parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit( + const std::shared_ptr& properties) { + return properties->coerce_int96_timestamp_unit(); +} + // [[parquet::export]] std::shared_ptr parquet___arrow___FileReader__OpenFile( const std::shared_ptr& file, diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index c6533de91ec..07e6ae5caa7 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -324,3 +324,31 @@ test_that("ParquetFileWrite chunk_size calculation doesn't have integer overflow # but our max_chunks is respected expect_equal(calculate_chunk_size(101, 1, 25, 2), 51) }) + +test_that("deprecated int96 timestamp unit can be specified when reading Parquet files", { + tf <- tempfile() + on.exit(unlink(tf)) + + table <- Table$create( + some_datetime = as.POSIXct("2001-01-01 12:34:56.789") + ) + + write_parquet( + table, + tf, + use_deprecated_int96_timestamps = TRUE + ) + + props <- ParquetArrowReaderProperties$create() + props$set_coerce_int96_timestamp_unit(TimeUnit$MILLI) + expect_identical(props$coerce_int96_timestamp_unit(), TimeUnit$MILLI) + + result <- read_parquet( + tf, + as_data_frame = FALSE, + props = props + ) + + expect_identical(result$some_datetime$type$unit(), TimeUnit$MILLI) + expect_true(result$some_datetime == table$some_datetime) +}) From 056d42a2b772a6a03f45bc83d3827e0632222c58 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 4 Jan 2022 09:49:20 -0400 Subject: [PATCH 2/3] better test object, clang-format parquet.cpp --- r/src/parquet.cpp | 6 ++++-- r/tests/testthat/test-parquet.R | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp index 5ede9a60aa1..5d5fd9b7f46 100644 --- a/r/src/parquet.cpp +++ b/r/src/parquet.cpp @@ -71,12 +71,14 @@ void parquet___arrow___ArrowReaderProperties__set_read_dictionary( // [[parquet::export]] void parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit( - const std::shared_ptr& properties, arrow::TimeUnit::type unit) { + const std::shared_ptr& properties, + arrow::TimeUnit::type unit) { properties->set_coerce_int96_timestamp_unit(unit); } // [[parquet::export]] -arrow::TimeUnit::type parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit( +arrow::TimeUnit::type +parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit( const std::shared_ptr& properties) { return properties->coerce_int96_timestamp_unit(); } diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index 07e6ae5caa7..01bfe4d0d87 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -330,7 +330,7 @@ test_that("deprecated int96 timestamp unit can be specified when reading Parquet on.exit(unlink(tf)) table <- Table$create( - some_datetime = as.POSIXct("2001-01-01 12:34:56.789") + some_datetime = as.POSIXct("2001-01-01 12:34:56.789", tz = "UTC") ) write_parquet( From bb57b06d390490e80afa6aba7598db2fe40e3342 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 4 Jan 2022 10:01:41 -0400 Subject: [PATCH 3/3] revert test object --- r/tests/testthat/test-parquet.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index 01bfe4d0d87..07e6ae5caa7 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -330,7 +330,7 @@ test_that("deprecated int96 timestamp unit can be specified when reading Parquet on.exit(unlink(tf)) table <- Table$create( - some_datetime = as.POSIXct("2001-01-01 12:34:56.789", tz = "UTC") + some_datetime = as.POSIXct("2001-01-01 12:34:56.789") ) write_parquet(