From eb5c5ce3f83e2e9b0c5f541c973fc45d0ffa3761 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 30 Jun 2022 18:44:11 +0100 Subject: [PATCH 01/12] Add code from Romaine's suggestions --- r/src/arrow_cpp11.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index f1338c02ca6..8a31f3f4551 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -428,4 +428,21 @@ SEXP as_sexp(const std::shared_ptr& ptr) { return cpp11::to_r6(ptr); } +struct r_vec_size{ + r_vec_size(R_xlen_t x) : value(x){} + + R_xlen_t value; +}; + + +inline SEXP as_sexp(r_vec_size size) { + R_xlen_t x = size.value; + if (x > std::numeric_limits::max()) { + return Rf_ScalarReal(x); + } else { + return Rf_ScalarInteger(x); + } +} + + } // namespace cpp11 From 92a8a86cd108759ba2e65865927f0ea2035415dd Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 30 Jun 2022 18:44:22 +0100 Subject: [PATCH 02/12] Try to use function --- r/src/table.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/table.cpp b/r/src/table.cpp index 051647979f5..89c932d5355 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -28,7 +28,7 @@ int Table__num_columns(const std::shared_ptr& x) { } // [[arrow::export]] -int Table__num_rows(const std::shared_ptr& x) { return x->num_rows(); } +int Table__num_rows(const std::shared_ptr& x) { return r_vec_size(x->num_rows()); } // [[arrow::export]] std::shared_ptr Table__schema(const std::shared_ptr& x) { From d68e39466642b1298bc444696c808ef439f4b182 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 30 Jun 2022 18:54:43 +0100 Subject: [PATCH 03/12] Move definition of r_vec_size and change Table__num_rows to be an object of that type --- r/src/arrow_cpp11.h | 14 +++++++------- r/src/table.cpp | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index 8a31f3f4551..082106f4e52 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -407,6 +407,12 @@ cpp11::writable::list to_r_list(const std::vector>& x) { } // namespace r } // namespace arrow +struct r_vec_size{ + r_vec_size(R_xlen_t x) : value(x){} + + R_xlen_t value; +}; + namespace cpp11 { template @@ -428,13 +434,6 @@ SEXP as_sexp(const std::shared_ptr& ptr) { return cpp11::to_r6(ptr); } -struct r_vec_size{ - r_vec_size(R_xlen_t x) : value(x){} - - R_xlen_t value; -}; - - inline SEXP as_sexp(r_vec_size size) { R_xlen_t x = size.value; if (x > std::numeric_limits::max()) { @@ -445,4 +444,5 @@ inline SEXP as_sexp(r_vec_size size) { } + } // namespace cpp11 diff --git a/r/src/table.cpp b/r/src/table.cpp index 89c932d5355..f55866902bf 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -28,7 +28,9 @@ int Table__num_columns(const std::shared_ptr& x) { } // [[arrow::export]] -int Table__num_rows(const std::shared_ptr& x) { return r_vec_size(x->num_rows()); } +r_vec_size Table__num_rows(const std::shared_ptr& x) { + return x->num_rows(); +} // [[arrow::export]] std::shared_ptr Table__schema(const std::shared_ptr& x) { From 4c43483f1d5759ce8f746b5f0828750394121263 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 30 Jun 2022 19:54:32 +0100 Subject: [PATCH 04/12] Add test for num_rows on large table --- r/tests/testthat/test-Table.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 791e3ce2988..6da6565dcda 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -696,3 +696,18 @@ test_that("as_arrow_table() errors for invalid input", { class = "arrow_no_method_as_arrow_table" ) }) + +test_that("ARROW-14989: num_rows method not susceptible to integer overflow", { + skip_on_cran() + + test_array1 <- Array$create(raw(.Machine$integer.max)) + test_array2 <- Array$create(raw(1)) + big_chunked <- chunked_array(test_array1, test_array2) + + small_table <- Table$create(col = test_array2) + expect_type(small_table$num_rows, "integer") + + big_table <- Table$create(col = big_chunked) + expect_type(big_table$num_rows, "double") + +}) From 5e2cfd12bb916ef263c5edf65cbec2ee4eb4299d Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 30 Jun 2022 20:06:52 +0100 Subject: [PATCH 05/12] Implement for RecordBatches too --- r/src/recordbatch.cpp | 2 +- r/tests/testthat/test-RecordBatch.R | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index 558628a9ccb..ba1ff57894a 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -32,7 +32,7 @@ int RecordBatch__num_columns(const std::shared_ptr& x) { } // [[arrow::export]] -int RecordBatch__num_rows(const std::shared_ptr& x) { +r_vec_size RecordBatch__num_rows(const std::shared_ptr& x) { return x->num_rows(); } diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index a39aa0f0fb9..da0cd2d8312 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -830,3 +830,18 @@ test_that("as_record_batch() works for data.frame()", { record_batch(col1 = Array$create(1, type = float64()), col2 = "two") ) }) + +test_that("ARROW-14989: num_rows method not susceptible to integer overflow", { + skip_on_cran() + + test_array1 <- Array$create(raw(.Machine$integer.max)) + test_array2 <- Array$create(raw(1)) + big_chunked <- chunked_array(test_array1, test_array2) + + small_table <- record_batch(col = test_array2) + expect_type(small_table$num_rows, "integer") + + big_table <- record_batch(col = big_chunked) + expect_type(big_table$num_rows, "double") + +}) From 5036dd2d5ad85d47cc621644fd9b2e6cf11a7cf3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 30 Jun 2022 20:08:53 +0100 Subject: [PATCH 06/12] Remove unnecessary whitespace --- r/src/arrow_cpp11.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index 082106f4e52..acb0ff9ea0f 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -443,6 +443,4 @@ inline SEXP as_sexp(r_vec_size size) { } } - - } // namespace cpp11 From e92c42749d9a61e566da819517380cd25010c089 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 30 Jun 2022 20:14:36 +0100 Subject: [PATCH 07/12] Use values on border instead of small ones --- r/tests/testthat/test-RecordBatch.R | 2 +- r/tests/testthat/test-Table.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index da0cd2d8312..22812f97960 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -838,7 +838,7 @@ test_that("ARROW-14989: num_rows method not susceptible to integer overflow", { test_array2 <- Array$create(raw(1)) big_chunked <- chunked_array(test_array1, test_array2) - small_table <- record_batch(col = test_array2) + small_table <- record_batch(col = test_array1) expect_type(small_table$num_rows, "integer") big_table <- record_batch(col = big_chunked) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 6da6565dcda..2814317a12c 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -704,7 +704,7 @@ test_that("ARROW-14989: num_rows method not susceptible to integer overflow", { test_array2 <- Array$create(raw(1)) big_chunked <- chunked_array(test_array1, test_array2) - small_table <- Table$create(col = test_array2) + small_table <- Table$create(col = test_array1) expect_type(small_table$num_rows, "integer") big_table <- Table$create(col = big_chunked) From dbfa34f4e0cf85dcab6069ea6d23a2047c4ef685 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 30 Jun 2022 20:27:29 +0100 Subject: [PATCH 08/12] Lint the C++ --- r/src/arrow_cpp11.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index acb0ff9ea0f..f8a34d9accf 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -407,8 +407,8 @@ cpp11::writable::list to_r_list(const std::vector>& x) { } // namespace r } // namespace arrow -struct r_vec_size{ - r_vec_size(R_xlen_t x) : value(x){} +struct r_vec_size { + r_vec_size(R_xlen_t x) : value(x) {} R_xlen_t value; }; From e7cf8a66beab6d1b7d85304362086b6205a31279 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 1 Jul 2022 09:59:52 +0100 Subject: [PATCH 09/12] Remove tests that won't run on CI as too slow --- r/tests/testthat/test-RecordBatch.R | 15 --------------- r/tests/testthat/test-Table.R | 15 --------------- 2 files changed, 30 deletions(-) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 22812f97960..a39aa0f0fb9 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -830,18 +830,3 @@ test_that("as_record_batch() works for data.frame()", { record_batch(col1 = Array$create(1, type = float64()), col2 = "two") ) }) - -test_that("ARROW-14989: num_rows method not susceptible to integer overflow", { - skip_on_cran() - - test_array1 <- Array$create(raw(.Machine$integer.max)) - test_array2 <- Array$create(raw(1)) - big_chunked <- chunked_array(test_array1, test_array2) - - small_table <- record_batch(col = test_array1) - expect_type(small_table$num_rows, "integer") - - big_table <- record_batch(col = big_chunked) - expect_type(big_table$num_rows, "double") - -}) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 2814317a12c..791e3ce2988 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -696,18 +696,3 @@ test_that("as_arrow_table() errors for invalid input", { class = "arrow_no_method_as_arrow_table" ) }) - -test_that("ARROW-14989: num_rows method not susceptible to integer overflow", { - skip_on_cran() - - test_array1 <- Array$create(raw(.Machine$integer.max)) - test_array2 <- Array$create(raw(1)) - big_chunked <- chunked_array(test_array1, test_array2) - - small_table <- Table$create(col = test_array1) - expect_type(small_table$num_rows, "integer") - - big_table <- Table$create(col = big_chunked) - expect_type(big_table$num_rows, "double") - -}) From cc269de7fe28a3a1208a019fea2de7d04aaccbb8 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 1 Jul 2022 10:20:59 +0100 Subject: [PATCH 10/12] Mark variable 'explicit' because the linter told me to --- r/src/arrow_cpp11.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index f8a34d9accf..123875325c6 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -408,7 +408,7 @@ cpp11::writable::list to_r_list(const std::vector>& x) { } // namespace arrow struct r_vec_size { - r_vec_size(R_xlen_t x) : value(x) {} + explicit r_vec_size(R_xlen_t x) : value(x) {} R_xlen_t value; }; From 402f2e80f54e0937cee46c0edca0bd970dd65dfa Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 4 Jul 2022 13:10:40 +0100 Subject: [PATCH 11/12] Update r/src/recordbatch.cpp Co-authored-by: Dewey Dunnington --- r/src/recordbatch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index ba1ff57894a..01bd8a3f35b 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -33,7 +33,7 @@ int RecordBatch__num_columns(const std::shared_ptr& x) { // [[arrow::export]] r_vec_size RecordBatch__num_rows(const std::shared_ptr& x) { - return x->num_rows(); + return r_vec_size(x->num_rows()); } // [[arrow::export]] From 33a3ceb66145f2f90ba5d5d73eb752c9ed981dae Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 4 Jul 2022 13:10:47 +0100 Subject: [PATCH 12/12] Update r/src/table.cpp Co-authored-by: Dewey Dunnington --- r/src/table.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/table.cpp b/r/src/table.cpp index f55866902bf..07bf44750a1 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -29,7 +29,7 @@ int Table__num_columns(const std::shared_ptr& x) { // [[arrow::export]] r_vec_size Table__num_rows(const std::shared_ptr& x) { - return x->num_rows(); + return r_vec_size(x->num_rows()); } // [[arrow::export]]