From 48e3812fa636e854ec846fbbb63a1d796f7d2610 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 30 Jun 2020 15:18:33 -0700 Subject: [PATCH 1/2] Check if integers can fit (need to update tests and add tests where they can't fit) --- r/src/array_to_vector.cpp | 44 ++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index bdef69d3c21..0d3829e60c5 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -19,15 +19,18 @@ #if defined(ARROW_R_WITH_ARROW) #include +#include #include #include #include +#include #include #include namespace arrow { using internal::checked_cast; +using internal::IntegersCanFit; namespace r { @@ -673,6 +676,17 @@ class Converter_Null : public Converter { } }; +bool arrays_can_fit_integer(ArrayVector arrays) { + bool out = false; + auto i32 = arrow::int32(); + for (const auto& array : arrays) { + if (!out) { + out = arrow::IntegersCanFit(arrow::Datum(array), *i32).ok(); + } + } + return out; +} + std::shared_ptr Converter::Make(const std::shared_ptr& type, ArrayVector arrays) { if (arrays.empty()) { @@ -724,14 +738,24 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type return std::make_shared>( std::move(arrays)); - // promotions to numeric vector + // promotions to numeric vector, if they don't fit into int32 case Type::UINT32: - return std::make_shared>( - std::move(arrays)); + if (arrays_can_fit_integer(arrays)) { + return std::make_shared>( + std::move(arrays)); + } else { + return std::make_shared< + arrow::r::Converter_Promotion>(std::move(arrays)); + } case Type::UINT64: - return std::make_shared>( - std::move(arrays)); + if (arrays_can_fit_integer(arrays)) { + return std::make_shared>( + std::move(arrays)); + } else { + return std::make_shared< + arrow::r::Converter_Promotion>(std::move(arrays)); + } case Type::HALF_FLOAT: return std::make_shared< @@ -742,7 +766,7 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type return std::make_shared>( std::move(arrays)); - // time32 ane time64 + // time32 and time64 case Type::TIME32: return std::make_shared>(std::move(arrays)); @@ -753,7 +777,13 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type return std::make_shared>(std::move(arrays)); case Type::INT64: - return std::make_shared(std::move(arrays)); + // Prefer integer if it fits + if (arrays_can_fit_integer(arrays)) { + return std::make_shared>( + std::move(arrays)); + } else { + return std::make_shared(std::move(arrays)); + } case Type::DECIMAL: return std::make_shared(std::move(arrays)); From 6abe0571171f163e21d0599a52188f5b5c25403e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 1 Jul 2020 08:00:04 -0700 Subject: [PATCH 2/2] Tests and other cleanup --- r/src/array_to_vector.cpp | 8 ++++---- r/tests/testthat/helper-arrow.R | 2 ++ r/tests/testthat/test-Array.R | 23 ++++++++++++++++++----- r/tests/testthat/test-chunked-array.R | 2 +- r/vignettes/arrow.Rmd | 12 +++++++----- 5 files changed, 32 insertions(+), 15 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 0d3829e60c5..196d491ca7b 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -676,7 +676,7 @@ class Converter_Null : public Converter { } }; -bool arrays_can_fit_integer(ArrayVector arrays) { +bool ArraysCanFitInteger(ArrayVector arrays) { bool out = false; auto i32 = arrow::int32(); for (const auto& array : arrays) { @@ -740,7 +740,7 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type // promotions to numeric vector, if they don't fit into int32 case Type::UINT32: - if (arrays_can_fit_integer(arrays)) { + if (ArraysCanFitInteger(arrays)) { return std::make_shared>( std::move(arrays)); } else { @@ -749,7 +749,7 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type } case Type::UINT64: - if (arrays_can_fit_integer(arrays)) { + if (ArraysCanFitInteger(arrays)) { return std::make_shared>( std::move(arrays)); } else { @@ -778,7 +778,7 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type case Type::INT64: // Prefer integer if it fits - if (arrays_can_fit_integer(arrays)) { + if (ArraysCanFitInteger(arrays)) { return std::make_shared>( std::move(arrays)); } else { diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index dcbd6284f62..06a134e60ed 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -26,6 +26,8 @@ if (tolower(Sys.info()[["sysname"]]) == "windows") { set.seed(1) +MAX_INT <- 2147483647L + test_that <- function(what, code) { testthat::test_that(what, { skip_if(getOption("..skip.tests", TRUE), "arrow C++ library not available") diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 3dc1f83af01..2ae6505cc2d 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -243,14 +243,16 @@ test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", { }) test_that("array supports integer64", { - x <- bit64::as.integer64(1:10) + x <- bit64::as.integer64(1:10) + MAX_INT expect_array_roundtrip(x, int64()) x[4] <- NA expect_array_roundtrip(x, int64()) # all NA int64 (ARROW-3795) - expect_array_roundtrip(bit64::as.integer64(NA), int64()) + all_na <- Array$create(bit64::as.integer64(NA)) + expect_type_equal(all_na, int64()) + expect_true(as.vector(is.na(all_na))) }) test_that("array supports difftime", { @@ -381,12 +383,23 @@ test_that("Array$as_vector() converts to integer (ARROW-3794)", { expect_equal(a$as_vector(), 0:255) }) -test_that("Arrays of uint{32,64} convert to numeric", { +test_that("Arrays of {,u}int{32,64} convert to integer if they can fit", { u32 <- Array$create(1L)$cast(uint32()) - expect_identical(as.vector(u32), 1) + expect_identical(as.vector(u32), 1L) u64 <- Array$create(1L)$cast(uint64()) - expect_identical(as.vector(u64), 1) + expect_identical(as.vector(u64), 1L) + + i64 <- Array$create(bit64::as.integer64(1:10)) + expect_identical(as.vector(i64), 1:10) +}) + +test_that("Arrays of uint{32,64} convert to numeric if they can't fit integer", { + u32 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint32()) + expect_identical(as.vector(u32), 1 + MAX_INT) + + u64 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint64()) + expect_identical(as.vector(u64), 1 + MAX_INT) }) test_that("Array$create() recognise arrow::Array (ARROW-3815)", { diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index c627fff3af1..60294ce0e3c 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -198,7 +198,7 @@ test_that("ChunkedArray supports POSIXct (ARROW-3716)", { }) test_that("ChunkedArray supports integer64 (ARROW-3716)", { - x <- bit64::as.integer64(1:10) + x <- bit64::as.integer64(1:10) + MAX_INT expect_chunked_roundtrip(list(x, x), int64()) }) diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd index 234d5802c95..a67f3a23219 100644 --- a/r/vignettes/arrow.Rmd +++ b/r/vignettes/arrow.Rmd @@ -109,11 +109,11 @@ In the tables, entries with a `-` are not currently implemented. | int8 | integer | | int16 | integer | | int32 | integer | -| int64 | bit64::integer64 | +| int64 | integer^++^ | | uint8 | integer | | uint16 | integer | -| uint32 | double | -| uint64 | double | +| uint32 | integer^++^ | +| uint64 | integer^++^ | | float16 | - | | float32 | double | | float64 | double | @@ -127,7 +127,7 @@ In the tables, entries with a `-` are not currently implemented. | timestamp | POSIXct | | duration | - | | decimal | double | -| dictionary | factor^++^ | +| dictionary | factor^+++^ | | list | list | | fixed_size_list | - | | struct | data.frame | @@ -138,7 +138,9 @@ In the tables, entries with a `-` are not currently implemented. | large_binary | - | | large_list | - | -^++^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings. +^++^: These integer types may contain values that exceed the range of R's `integer` type (32-bit signed integer). When they do, `uint32` and `uint64` are converted to `double` ("numeric") and `int64` is converted to `bit64::integer64`. + +^+++^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings. ## Class structure and package conventions