From dbdacb790b0a17a903f2c9d1d2545e001342778e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 5 Jul 2022 14:15:56 +0100 Subject: [PATCH 01/11] Add in r_vec_size for dataset scanner row counting --- r/src/dataset.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index aa44a0721e3..713c0d3b237 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -511,8 +511,8 @@ std::shared_ptr dataset___Scanner__TakeRows( } // [[dataset::export]] -int64_t dataset___Scanner__CountRows(const std::shared_ptr& scanner) { - return ValueOrStop(scanner->CountRows()); +r_vec_size dataset___Scanner__CountRows(const std::shared_ptr& scanner) { + return r_vec_size(ValueOrStop(scanner->CountRows())); } #endif From 545e7e91e12cc34762e1bf559cee627b97247869 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 7 Jul 2022 14:36:59 +0100 Subject: [PATCH 02/11] Update generated file --- r/src/arrowExports.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 62c8b6695c8..24c26cb35b7 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -2062,7 +2062,7 @@ extern "C" SEXP _arrow_dataset___Scanner__TakeRows(SEXP scanner_sexp, SEXP indic // dataset.cpp #if defined(ARROW_R_WITH_DATASET) -int64_t dataset___Scanner__CountRows(const std::shared_ptr& scanner); +r_vec_size dataset___Scanner__CountRows(const std::shared_ptr& scanner); extern "C" SEXP _arrow_dataset___Scanner__CountRows(SEXP scanner_sexp){ BEGIN_CPP11 arrow::r::Input&>::type scanner(scanner_sexp); @@ -4247,7 +4247,7 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -int RecordBatch__num_rows(const std::shared_ptr& x); +r_vec_size RecordBatch__num_rows(const std::shared_ptr& x); extern "C" SEXP _arrow_RecordBatch__num_rows(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -4842,7 +4842,7 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -int Table__num_rows(const std::shared_ptr& x); +r_vec_size Table__num_rows(const std::shared_ptr& x); extern "C" SEXP _arrow_Table__num_rows(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); From fc061bc7b10eb37fcd834570b7a7d2861ad35070 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 7 Jul 2022 14:50:45 +0100 Subject: [PATCH 03/11] Swap out int64 for r_vec_size --- r/src/array.cpp | 24 ++++++++++++------------ r/src/arrowExports.cpp | 38 +++++++++++++++++++------------------- r/src/buffer.cpp | 8 ++++---- r/src/chunkedarray.cpp | 4 ++-- r/src/filesystem.cpp | 4 +++- r/src/io.cpp | 21 +++++++++++---------- r/src/message.cpp | 9 +++++---- r/src/parquet.cpp | 4 ++-- r/src/recordbatch.cpp | 4 ++-- r/src/table.cpp | 4 ++-- 10 files changed, 62 insertions(+), 58 deletions(-) diff --git a/r/src/array.cpp b/r/src/array.cpp index 1503d3e76dd..8ba0c569ea4 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -244,15 +244,15 @@ int32_t ListArray__value_length(const std::shared_ptr& array, } // [[arrow::export]] -int64_t LargeListArray__value_length(const std::shared_ptr& array, - int64_t i) { - return array->value_length(i); +r_vec_size LargeListArray__value_length( + const std::shared_ptr& array, int64_t i) { + return r_vec_size(array->value_length(i)); } // [[arrow::export]] -int64_t FixedSizeListArray__value_length( +r_vec_size FixedSizeListArray__value_length( const std::shared_ptr& array, int64_t i) { - return array->value_length(i); + return r_vec_size(array->value_length(i)); } // [[arrow::export]] @@ -262,15 +262,15 @@ int32_t ListArray__value_offset(const std::shared_ptr& array, } // [[arrow::export]] -int64_t LargeListArray__value_offset(const std::shared_ptr& array, - int64_t i) { - return array->value_offset(i); +r_vec_size LargeListArray__value_offset( + const std::shared_ptr& array, int64_t i) { + return r_vec_size(array->value_offset(i)); } // [[arrow::export]] -int64_t FixedSizeListArray__value_offset( +r_vec_size FixedSizeListArray__value_offset( const std::shared_ptr& array, int64_t i) { - return array->value_offset(i); + return r_vec_size(array->value_offset(i)); } // [[arrow::export]] @@ -319,8 +319,8 @@ bool Array__Same(const std::shared_ptr& x, } // [[arrow::export]] -int64_t Array__ReferencedBufferSize(const std::shared_ptr& x) { - return ValueOrStop(arrow::util::ReferencedBufferSize(*x)); +r_vec_size Array__ReferencedBufferSize(const std::shared_ptr& x) { + return r_vec_size(ValueOrStop(arrow::util::ReferencedBufferSize(*x))); } // [[arrow::export]] diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 24c26cb35b7..e5e3a66d52e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -255,7 +255,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int64_t LargeListArray__value_length(const std::shared_ptr& array, int64_t i); +r_vec_size LargeListArray__value_length(const std::shared_ptr& array, int64_t i); extern "C" SEXP _arrow_LargeListArray__value_length(SEXP array_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -264,7 +264,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int64_t FixedSizeListArray__value_length(const std::shared_ptr& array, int64_t i); +r_vec_size FixedSizeListArray__value_length(const std::shared_ptr& array, int64_t i); extern "C" SEXP _arrow_FixedSizeListArray__value_length(SEXP array_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -282,7 +282,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int64_t LargeListArray__value_offset(const std::shared_ptr& array, int64_t i); +r_vec_size LargeListArray__value_offset(const std::shared_ptr& array, int64_t i); extern "C" SEXP _arrow_LargeListArray__value_offset(SEXP array_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -291,7 +291,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int64_t FixedSizeListArray__value_offset(const std::shared_ptr& array, int64_t i); +r_vec_size FixedSizeListArray__value_offset(const std::shared_ptr& array, int64_t i); extern "C" SEXP _arrow_FixedSizeListArray__value_offset(SEXP array_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -357,7 +357,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int64_t Array__ReferencedBufferSize(const std::shared_ptr& x); +r_vec_size Array__ReferencedBufferSize(const std::shared_ptr& x); extern "C" SEXP _arrow_Array__ReferencedBufferSize(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -657,7 +657,7 @@ BEGIN_CPP11 END_CPP11 } // buffer.cpp -int64_t Buffer__capacity(const std::shared_ptr& buffer); +r_vec_size Buffer__capacity(const std::shared_ptr& buffer); extern "C" SEXP _arrow_Buffer__capacity(SEXP buffer_sexp){ BEGIN_CPP11 arrow::r::Input&>::type buffer(buffer_sexp); @@ -665,7 +665,7 @@ BEGIN_CPP11 END_CPP11 } // buffer.cpp -int64_t Buffer__size(const std::shared_ptr& buffer); +r_vec_size Buffer__size(const std::shared_ptr& buffer); extern "C" SEXP _arrow_Buffer__size(SEXP buffer_sexp){ BEGIN_CPP11 arrow::r::Input&>::type buffer(buffer_sexp); @@ -810,7 +810,7 @@ BEGIN_CPP11 END_CPP11 } // chunkedarray.cpp -int64_t ChunkedArray__ReferencedBufferSize(const std::shared_ptr& chunked_array); +r_vec_size ChunkedArray__ReferencedBufferSize(const std::shared_ptr& chunked_array); extern "C" SEXP _arrow_ChunkedArray__ReferencedBufferSize(SEXP chunked_array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type chunked_array(chunked_array_sexp); @@ -2902,7 +2902,7 @@ BEGIN_CPP11 END_CPP11 } // filesystem.cpp -int64_t fs___FileInfo__size(const std::shared_ptr& x); +r_vec_size fs___FileInfo__size(const std::shared_ptr& x); extern "C" SEXP _arrow_fs___FileInfo__size(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -3264,7 +3264,7 @@ BEGIN_CPP11 END_CPP11 } // io.cpp -int64_t io___RandomAccessFile__GetSize(const std::shared_ptr& x); +r_vec_size io___RandomAccessFile__GetSize(const std::shared_ptr& x); extern "C" SEXP _arrow_io___RandomAccessFile__GetSize(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -3290,7 +3290,7 @@ BEGIN_CPP11 END_CPP11 } // io.cpp -int64_t io___RandomAccessFile__Tell(const std::shared_ptr& x); +r_vec_size io___RandomAccessFile__Tell(const std::shared_ptr& x); extern "C" SEXP _arrow_io___RandomAccessFile__Tell(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -3378,7 +3378,7 @@ BEGIN_CPP11 END_CPP11 } // io.cpp -int64_t io___OutputStream__Tell(const std::shared_ptr& stream); +r_vec_size io___OutputStream__Tell(const std::shared_ptr& stream); extern "C" SEXP _arrow_io___OutputStream__Tell(SEXP stream_sexp){ BEGIN_CPP11 arrow::r::Input&>::type stream(stream_sexp); @@ -3402,7 +3402,7 @@ BEGIN_CPP11 END_CPP11 } // io.cpp -int64_t io___BufferOutputStream__capacity(const std::shared_ptr& stream); +r_vec_size io___BufferOutputStream__capacity(const std::shared_ptr& stream); extern "C" SEXP _arrow_io___BufferOutputStream__capacity(SEXP stream_sexp){ BEGIN_CPP11 arrow::r::Input&>::type stream(stream_sexp); @@ -3418,7 +3418,7 @@ BEGIN_CPP11 END_CPP11 } // io.cpp -int64_t io___BufferOutputStream__Tell(const std::shared_ptr& stream); +r_vec_size io___BufferOutputStream__Tell(const std::shared_ptr& stream); extern "C" SEXP _arrow_io___BufferOutputStream__Tell(SEXP stream_sexp){ BEGIN_CPP11 arrow::r::Input&>::type stream(stream_sexp); @@ -3586,7 +3586,7 @@ BEGIN_CPP11 END_CPP11 } // message.cpp -int64_t ipc___Message__body_length(const std::unique_ptr& message); +r_vec_size ipc___Message__body_length(const std::unique_ptr& message); extern "C" SEXP _arrow_ipc___Message__body_length(SEXP message_sexp){ BEGIN_CPP11 arrow::r::Input&>::type message(message_sexp); @@ -3610,7 +3610,7 @@ BEGIN_CPP11 END_CPP11 } // message.cpp -int64_t ipc___Message__Verify(const std::unique_ptr& message); +r_vec_size ipc___Message__Verify(const std::unique_ptr& message); extern "C" SEXP _arrow_ipc___Message__Verify(SEXP message_sexp){ BEGIN_CPP11 arrow::r::Input&>::type message(message_sexp); @@ -3912,7 +3912,7 @@ extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroups2(SEXP reader_ // parquet.cpp #if defined(ARROW_R_WITH_PARQUET) -int64_t parquet___arrow___FileReader__num_rows(const std::shared_ptr& reader); +r_vec_size parquet___arrow___FileReader__num_rows(const std::shared_ptr& reader); extern "C" SEXP _arrow_parquet___arrow___FileReader__num_rows(SEXP reader_sexp){ BEGIN_CPP11 arrow::r::Input&>::type reader(reader_sexp); @@ -4419,7 +4419,7 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -int64_t RecordBatch__ReferencedBufferSize(const std::shared_ptr& batch); +r_vec_size RecordBatch__ReferencedBufferSize(const std::shared_ptr& batch); extern "C" SEXP _arrow_RecordBatch__ReferencedBufferSize(SEXP batch_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); @@ -5021,7 +5021,7 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -int64_t Table__ReferencedBufferSize(const std::shared_ptr& table); +r_vec_size Table__ReferencedBufferSize(const std::shared_ptr& table); extern "C" SEXP _arrow_Table__ReferencedBufferSize(SEXP table_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); diff --git a/r/src/buffer.cpp b/r/src/buffer.cpp index 11a80a8d56d..97f6db5535a 100644 --- a/r/src/buffer.cpp +++ b/r/src/buffer.cpp @@ -28,13 +28,13 @@ void Buffer__ZeroPadding(const std::shared_ptr& buffer) { } // [[arrow::export]] -int64_t Buffer__capacity(const std::shared_ptr& buffer) { - return buffer->capacity(); +r_vec_size Buffer__capacity(const std::shared_ptr& buffer) { + return r_vec_size(buffer->capacity()); } // [[arrow::export]] -int64_t Buffer__size(const std::shared_ptr& buffer) { - return buffer->size(); +r_vec_size Buffer__size(const std::shared_ptr& buffer) { + return r_vec_size(buffer->size()); } // [[arrow::export]] diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index b495f3d1edf..1aadeeb0fb8 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -144,7 +144,7 @@ std::shared_ptr ChunkedArray__from_list(cpp11::list chunks, } // [[arrow::export]] -int64_t ChunkedArray__ReferencedBufferSize( +r_vec_size ChunkedArray__ReferencedBufferSize( const std::shared_ptr& chunked_array) { - return ValueOrStop(arrow::util::ReferencedBufferSize(*chunked_array)); + return r_vec_size(ValueOrStop(arrow::util::ReferencedBufferSize(*chunked_array))); } diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index cdf536b3bc8..f6c5499bd3d 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -71,7 +71,9 @@ void fs___FileInfo__set_path(const std::shared_ptr& x, } // [[arrow::export]] -int64_t fs___FileInfo__size(const std::shared_ptr& x) { return x->size(); } +r_vec_size fs___FileInfo__size(const std::shared_ptr& x) { + return r_vec_size(x->size()); +} // [[arrow::export]] void fs___FileInfo__set_size(const std::shared_ptr& x, int64_t size) { diff --git a/r/src/io.cpp b/r/src/io.cpp index a0f53478a0d..42766ddd2f5 100644 --- a/r/src/io.cpp +++ b/r/src/io.cpp @@ -51,9 +51,9 @@ void io___OutputStream__Close(const std::shared_ptr& x) // ------ arrow::io::RandomAccessFile // [[arrow::export]] -int64_t io___RandomAccessFile__GetSize( +r_vec_size io___RandomAccessFile__GetSize( const std::shared_ptr& x) { - return ValueOrStop(x->GetSize()); + return r_vec_size(ValueOrStop(x->GetSize())); } // [[arrow::export]] @@ -69,9 +69,9 @@ void io___RandomAccessFile__Seek(const std::shared_ptr& x) { - return ValueOrStop(x->Tell()); + return r_vec_size(ValueOrStop(x->Tell())); } // [[arrow::export]] @@ -161,8 +161,9 @@ void io___Writable__write(const std::shared_ptr& stream, // ------- arrow::io::OutputStream // [[arrow::export]] -int64_t io___OutputStream__Tell(const std::shared_ptr& stream) { - return ValueOrStop(stream->Tell()); +r_vec_size io___OutputStream__Tell( + const std::shared_ptr& stream) { + return r_vec_size(ValueOrStop(stream->Tell())); } // ------ arrow::io::FileOutputStream @@ -183,9 +184,9 @@ std::shared_ptr io___BufferOutputStream__Create( } // [[arrow::export]] -int64_t io___BufferOutputStream__capacity( +r_vec_size io___BufferOutputStream__capacity( const std::shared_ptr& stream) { - return stream->capacity(); + return r_vec_size(stream->capacity()); } // [[arrow::export]] @@ -195,9 +196,9 @@ std::shared_ptr io___BufferOutputStream__Finish( } // [[arrow::export]] -int64_t io___BufferOutputStream__Tell( +r_vec_size io___BufferOutputStream__Tell( const std::shared_ptr& stream) { - return ValueOrStop(stream->Tell()); + return r_vec_size(ValueOrStop(stream->Tell())); } // [[arrow::export]] diff --git a/r/src/message.cpp b/r/src/message.cpp index 16044d9a92d..d9832ddc22a 100644 --- a/r/src/message.cpp +++ b/r/src/message.cpp @@ -21,8 +21,9 @@ #include // [[arrow::export]] -int64_t ipc___Message__body_length(const std::unique_ptr& message) { - return message->body_length(); +r_vec_size ipc___Message__body_length( + const std::unique_ptr& message) { + return r_vec_size(message->body_length()); } // [[arrow::export]] @@ -38,8 +39,8 @@ std::shared_ptr ipc___Message__body( } // [[arrow::export]] -int64_t ipc___Message__Verify(const std::unique_ptr& message) { - return message->Verify(); +r_vec_size ipc___Message__Verify(const std::unique_ptr& message) { + return r_vec_size(message->Verify()); } // [[arrow::export]] diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp index 5d5fd9b7f46..d92650252c8 100644 --- a/r/src/parquet.cpp +++ b/r/src/parquet.cpp @@ -148,9 +148,9 @@ std::shared_ptr parquet___arrow___FileReader__ReadRowGroups2( } // [[parquet::export]] -int64_t parquet___arrow___FileReader__num_rows( +r_vec_size parquet___arrow___FileReader__num_rows( const std::shared_ptr& reader) { - return reader->parquet_reader()->metadata()->num_rows(); + return r_vec_size(reader->parquet_reader()->metadata()->num_rows()); } // [[parquet::export]] diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index 01bd8a3f35b..adbc2370670 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -307,7 +307,7 @@ std::shared_ptr RecordBatch__from_arrays(SEXP schema_sxp, SE } // [[arrow::export]] -int64_t RecordBatch__ReferencedBufferSize( +r_vec_size RecordBatch__ReferencedBufferSize( const std::shared_ptr& batch) { - return ValueOrStop(arrow::util::ReferencedBufferSize(*batch)); + return r_vec_size(ValueOrStop(arrow::util::ReferencedBufferSize(*batch))); } diff --git a/r/src/table.cpp b/r/src/table.cpp index 07bf44750a1..975ca1ea32f 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -303,8 +303,8 @@ std::shared_ptr Table__from_record_batches( } // [[arrow::export]] -int64_t Table__ReferencedBufferSize(const std::shared_ptr& table) { - return ValueOrStop(arrow::util::ReferencedBufferSize(*table)); +r_vec_size Table__ReferencedBufferSize(const std::shared_ptr& table) { + return r_vec_size(ValueOrStop(arrow::util::ReferencedBufferSize(*table))); } // [[arrow::export]] From 34e9062f673ae1b255289b524e4cd202be6fe99b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 8 Jul 2022 15:46:57 +0100 Subject: [PATCH 04/11] Add tests for Tables and RecordBatches --- r/tests/testthat/test-RecordBatch.R | 15 +++++++++++++++ r/tests/testthat/test-Table.R | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index a39aa0f0fb9..7679860de2f 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -830,3 +830,18 @@ test_that("as_record_batch() works for data.frame()", { record_batch(col1 = Array$create(1, type = float64()), col2 = "two") ) }) + +test_that("num_rows method not susceptible to integer overflow", { + skip_if_not_running_large_memory_tests() + + test_array1 <- Array$create(raw(.Machine$integer.max)) + test_array2 <- Array$create(raw(1)) + big_chunked <- chunked_array(test_array1, test_array2) + + small_table <- record_batch(col = test_array1) + expect_type(small_table$num_rows, "integer") + + big_table <- record_batch(col = big_chunked) + expect_type(big_table$num_rows, "double") + +}) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 791e3ce2988..ad30047b5da 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -696,3 +696,18 @@ test_that("as_arrow_table() errors for invalid input", { class = "arrow_no_method_as_arrow_table" ) }) + +test_that("num_rows method not susceptible to integer overflow", { + skip_if_not_running_large_memory_tests() + + test_array1 <- Array$create(raw(.Machine$integer.max)) + test_array2 <- Array$create(raw(1)) + big_chunked <- chunked_array(test_array1, test_array2) + + small_table <- Table$create(col = test_array1) + expect_type(small_table$num_rows, "integer") + + big_table <- Table$create(col = big_chunked) + expect_type(big_table$num_rows, "double") + +}) From 21337ea230dd9b5b577f7ed477ba04887eabb021 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 8 Jul 2022 16:07:05 +0100 Subject: [PATCH 05/11] Add test for buffer size method --- r/tests/testthat/test-buffer.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/r/tests/testthat/test-buffer.R b/r/tests/testthat/test-buffer.R index 9b3ebc6de9c..c0a337cff37 100644 --- a/r/tests/testthat/test-buffer.R +++ b/r/tests/testthat/test-buffer.R @@ -95,3 +95,14 @@ test_that("Buffer$Equals", { expect_true(buf1$Equals(buf2)) expect_false(buf1$Equals(vec)) }) + +test_that("size method not susceptible to integer overflow", { + skip_if_not_running_large_memory_tests() + + small_buffer <- buffer(raw(1)) + big_buffer <- buffer(raw(.Machine$integer.max + 1)) + + expect_type(small_buffer$size, "integer") + expect_type(big_buffer$size, "double") + +}) From 63c5acc14c50791949ca4978c5a0c8003d564bcd Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 12 Jul 2022 11:01:24 +0100 Subject: [PATCH 06/11] Remove RecordBatch test --- r/tests/testthat/test-RecordBatch.R | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 7679860de2f..a39aa0f0fb9 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -830,18 +830,3 @@ test_that("as_record_batch() works for data.frame()", { record_batch(col1 = Array$create(1, type = float64()), col2 = "two") ) }) - -test_that("num_rows method not susceptible to integer overflow", { - skip_if_not_running_large_memory_tests() - - test_array1 <- Array$create(raw(.Machine$integer.max)) - test_array2 <- Array$create(raw(1)) - big_chunked <- chunked_array(test_array1, test_array2) - - small_table <- record_batch(col = test_array1) - expect_type(small_table$num_rows, "integer") - - big_table <- record_batch(col = big_chunked) - expect_type(big_table$num_rows, "double") - -}) From 2e19218b1a84e7fe8a57e4402b7617e0f4e556b8 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 12 Jul 2022 11:27:11 +0100 Subject: [PATCH 07/11] Use r_vec_size for more return types --- r/src/arrowExports.cpp | 10 +++++----- r/src/chunkedarray.cpp | 15 +++++++++------ r/src/recordbatch.cpp | 4 ++-- r/src/table.cpp | 4 ++-- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index e5e3a66d52e..e89718144ab 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -698,7 +698,7 @@ BEGIN_CPP11 END_CPP11 } // chunkedarray.cpp -int ChunkedArray__length(const std::shared_ptr& chunked_array); +r_vec_size ChunkedArray__length(const std::shared_ptr& chunked_array); extern "C" SEXP _arrow_ChunkedArray__length(SEXP chunked_array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type chunked_array(chunked_array_sexp); @@ -706,7 +706,7 @@ BEGIN_CPP11 END_CPP11 } // chunkedarray.cpp -int ChunkedArray__null_count(const std::shared_ptr& chunked_array); +r_vec_size ChunkedArray__null_count(const std::shared_ptr& chunked_array); extern "C" SEXP _arrow_ChunkedArray__null_count(SEXP chunked_array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type chunked_array(chunked_array_sexp); @@ -714,7 +714,7 @@ BEGIN_CPP11 END_CPP11 } // chunkedarray.cpp -int ChunkedArray__num_chunks(const std::shared_ptr& chunked_array); +r_vec_size ChunkedArray__num_chunks(const std::shared_ptr& chunked_array); extern "C" SEXP _arrow_ChunkedArray__num_chunks(SEXP chunked_array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type chunked_array(chunked_array_sexp); @@ -4239,7 +4239,7 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -int RecordBatch__num_columns(const std::shared_ptr& x); +r_vec_size RecordBatch__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_RecordBatch__num_columns(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -4834,7 +4834,7 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -int Table__num_columns(const std::shared_ptr& x); +r_vec_size Table__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_Table__num_columns(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index 1aadeeb0fb8..36884bb531b 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -22,18 +22,21 @@ #include // [[arrow::export]] -int ChunkedArray__length(const std::shared_ptr& chunked_array) { - return chunked_array->length(); +r_vec_size ChunkedArray__length( + const std::shared_ptr& chunked_array) { + return r_vec_size(chunked_array->length()); } // [[arrow::export]] -int ChunkedArray__null_count(const std::shared_ptr& chunked_array) { - return chunked_array->null_count(); +r_vec_size ChunkedArray__null_count( + const std::shared_ptr& chunked_array) { + return r_vec_size(chunked_array->null_count()); } // [[arrow::export]] -int ChunkedArray__num_chunks(const std::shared_ptr& chunked_array) { - return chunked_array->num_chunks(); +r_vec_size ChunkedArray__num_chunks( + const std::shared_ptr& chunked_array) { + return r_vec_size(chunked_array->num_chunks()); } // [[arrow::export]] diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index adbc2370670..aca3a74fd81 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -27,8 +27,8 @@ #include // [[arrow::export]] -int RecordBatch__num_columns(const std::shared_ptr& x) { - return x->num_columns(); +r_vec_size RecordBatch__num_columns(const std::shared_ptr& x) { + return r_vec_size(x->num_columns()); } // [[arrow::export]] diff --git a/r/src/table.cpp b/r/src/table.cpp index 975ca1ea32f..f31aac33eff 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -23,8 +23,8 @@ #include // [[arrow::export]] -int Table__num_columns(const std::shared_ptr& x) { - return x->num_columns(); +r_vec_size Table__num_columns(const std::shared_ptr& x) { + return r_vec_size(x->num_columns()); } // [[arrow::export]] From dc92b505642e40020f06a2d9d45bf5482b1317a2 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 12 Jul 2022 11:55:35 +0100 Subject: [PATCH 08/11] Use map_dbl not map_int for utility functions which use lengths --- r/R/arrow-package.R | 2 +- r/R/record-batch.R | 4 ++-- r/R/util.R | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 7b59854f1e1..7e1e5451cc7 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -17,7 +17,7 @@ #' @importFrom stats quantile median na.omit na.exclude na.pass na.fail #' @importFrom R6 R6Class -#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr flatten +#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dbl map_dfr map_int map_lgl keep imap imap_chr flatten #' @importFrom assertthat assert_that is.string #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 2a924afa56a..528ecef2f3d 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -196,8 +196,8 @@ rbind.RecordBatch <- function(...) { } cbind_check_length <- function(inputs, call = caller_env()) { - sizes <- map_int(inputs, NROW) - ok_lengths <- sizes %in% c(head(sizes, 1), 1L) + sizes <- map_dbl(inputs, NROW) + ok_lengths <- sizes %in% c(head(sizes, 1), 1) if (!all(ok_lengths)) { first_bad_one <- which.min(ok_lengths) abort( diff --git a/r/R/util.R b/r/R/util.R index 4aff69e471a..a51fde0c2d6 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -158,7 +158,7 @@ as_writable_table <- function(x) { #' @keywords internal recycle_scalars <- function(arrays) { # Get lengths of items in arrays - arr_lens <- map_int(arrays, NROW) + arr_lens <- map_dbl(arrays, NROW) is_scalar <- arr_lens == 1 From e8c441a2d1bdd95fcff48e1b5f7fdac8fe120221 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 12 Jul 2022 11:56:01 +0100 Subject: [PATCH 09/11] Test more fields --- r/tests/testthat/test-Table.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index ad30047b5da..a7f11153d65 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -703,11 +703,20 @@ test_that("num_rows method not susceptible to integer overflow", { test_array1 <- Array$create(raw(.Machine$integer.max)) test_array2 <- Array$create(raw(1)) big_chunked <- chunked_array(test_array1, test_array2) - small_table <- Table$create(col = test_array1) + big_table <- Table$create(col = big_chunked) + + expect_type(test_array1$nbytes(), "integer") + expect_type(test_array2$nbytes(), "integer") + + big_chunked$length() + + length(big_chunked) + + expect_type(big_chunked$nbytes(), "double") expect_type(small_table$num_rows, "integer") - big_table <- Table$create(col = big_chunked) + expect_type(big_chunked$nbytes(), "double") expect_type(big_table$num_rows, "double") }) From c71c5e142f18a78539fcc912c057de0929a562e6 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 13 Jul 2022 10:54:09 +0100 Subject: [PATCH 10/11] Add test for buffers too --- r/NAMESPACE | 1 + r/tests/testthat/test-Table.R | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index e98cdd51fb7..053c75742f2 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -388,6 +388,7 @@ importFrom(purrr,map) importFrom(purrr,map2) importFrom(purrr,map2_chr) importFrom(purrr,map_chr) +importFrom(purrr,map_dbl) importFrom(purrr,map_dfr) importFrom(purrr,map_int) importFrom(purrr,map_lgl) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index a7f11153d65..b9bbe5b559e 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -700,23 +700,24 @@ test_that("as_arrow_table() errors for invalid input", { test_that("num_rows method not susceptible to integer overflow", { skip_if_not_running_large_memory_tests() - test_array1 <- Array$create(raw(.Machine$integer.max)) - test_array2 <- Array$create(raw(1)) - big_chunked <- chunked_array(test_array1, test_array2) - small_table <- Table$create(col = test_array1) - big_table <- Table$create(col = big_chunked) + small_array <- Array$create(raw(1)) + big_array <- Array$create(raw(.Machine$integer.max)) + big_chunked_array <- chunked_array(big_array, small_array) + # LargeString array with data buffer > MAX_INT32 + big_string_array <- Array$create(make_big_string()) - expect_type(test_array1$nbytes(), "integer") - expect_type(test_array2$nbytes(), "integer") + small_table <- Table$create(col = small_array) + big_table <- Table$create(col = big_chunked_array) - big_chunked$length() + expect_type(big_array$nbytes(), "integer") + expect_type(big_chunked_array$nbytes(), "double") - length(big_chunked) + expect_type(length(big_array), "integer") + expect_type(length(big_chunked_array), "double") - expect_type(big_chunked$nbytes(), "double") expect_type(small_table$num_rows, "integer") - - expect_type(big_chunked$nbytes(), "double") expect_type(big_table$num_rows, "double") + expect_identical(big_string_array$data()$buffers[[3]]$size, 2148007936) + }) From c12da57cde95534a18518d318c9007a00092217e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 13 Jul 2022 11:20:05 +0100 Subject: [PATCH 11/11] Remove redundant buffer test --- r/tests/testthat/test-buffer.R | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/r/tests/testthat/test-buffer.R b/r/tests/testthat/test-buffer.R index c0a337cff37..9b3ebc6de9c 100644 --- a/r/tests/testthat/test-buffer.R +++ b/r/tests/testthat/test-buffer.R @@ -95,14 +95,3 @@ test_that("Buffer$Equals", { expect_true(buf1$Equals(buf2)) expect_false(buf1$Equals(vec)) }) - -test_that("size method not susceptible to integer overflow", { - skip_if_not_running_large_memory_tests() - - small_buffer <- buffer(raw(1)) - big_buffer <- buffer(raw(.Machine$integer.max + 1)) - - expect_type(small_buffer$size, "integer") - expect_type(big_buffer$size, "double") - -})