apache · nealrichardson · Jun 30, 2020 · Jul 1, 2020
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
@@ -19,15 +19,18 @@
 #if defined(ARROW_R_WITH_ARROW)
 
 #include <arrow/array.h>
+#include <arrow/datum.h>
 #include <arrow/table.h>
 #include <arrow/util/bitmap_reader.h>
 #include <arrow/util/bitmap_writer.h>
+#include <arrow/util/int_util.h>
 #include <arrow/util/parallel.h>
 #include <arrow/util/task_group.h>
 
 namespace arrow {
 
 using internal::checked_cast;
+using internal::IntegersCanFit;
 
 namespace r {
 
@@ -673,6 +676,17 @@ class Converter_Null : public Converter {
   }
 };
 
+bool ArraysCanFitInteger(ArrayVector arrays) {
+  bool out = false;
+  auto i32 = arrow::int32();
+  for (const auto& array : arrays) {
+    if (!out) {
+      out = arrow::IntegersCanFit(arrow::Datum(array), *i32).ok();
+    }
+  }
+  return out;
+}
+
 std::shared_ptr<Converter> Converter::Make(const std::shared_ptr<DataType>& type,
                                            ArrayVector arrays) {
   if (arrays.empty()) {
@@ -724,14 +738,24 @@ std::shared_ptr<Converter> Converter::Make(const std::shared_ptr<DataType>& type
       return std::make_shared<arrow::r::Converter_Promotion<INTSXP, arrow::UInt16Type>>(
           std::move(arrays));
 
-      // promotions to numeric vector
+      // promotions to numeric vector, if they don't fit into int32
     case Type::UINT32:
-      return std::make_shared<arrow::r::Converter_Promotion<REALSXP, arrow::UInt32Type>>(
-          std::move(arrays));
+      if (ArraysCanFitInteger(arrays)) {
+        return std::make_shared<arrow::r::Converter_Promotion<INTSXP, arrow::UInt32Type>>(
+            std::move(arrays));
+      } else {
+        return std::make_shared<
+            arrow::r::Converter_Promotion<REALSXP, arrow::UInt32Type>>(std::move(arrays));
+      }
 
     case Type::UINT64:
-      return std::make_shared<arrow::r::Converter_Promotion<REALSXP, arrow::UInt64Type>>(
-          std::move(arrays));
+      if (ArraysCanFitInteger(arrays)) {
+        return std::make_shared<arrow::r::Converter_Promotion<INTSXP, arrow::UInt64Type>>(
+            std::move(arrays));
+      } else {
+        return std::make_shared<
+            arrow::r::Converter_Promotion<REALSXP, arrow::UInt64Type>>(std::move(arrays));
+      }
 
     case Type::HALF_FLOAT:
       return std::make_shared<
@@ -742,7 +766,7 @@ std::shared_ptr<Converter> Converter::Make(const std::shared_ptr<DataType>& type
       return std::make_shared<arrow::r::Converter_Promotion<REALSXP, arrow::FloatType>>(
           std::move(arrays));
 
-      // time32 ane time64
+      // time32 and time64
     case Type::TIME32:
       return std::make_shared<arrow::r::Converter_Time<int32_t>>(std::move(arrays));
 
@@ -753,7 +777,13 @@ std::shared_ptr<Converter> Converter::Make(const std::shared_ptr<DataType>& type
       return std::make_shared<arrow::r::Converter_Timestamp<int64_t>>(std::move(arrays));
 
     case Type::INT64:
-      return std::make_shared<arrow::r::Converter_Int64>(std::move(arrays));
+      // Prefer integer if it fits
+      if (ArraysCanFitInteger(arrays)) {
+        return std::make_shared<arrow::r::Converter_Promotion<INTSXP, arrow::Int64Type>>(
+            std::move(arrays));
+      } else {
+        return std::make_shared<arrow::r::Converter_Int64>(std::move(arrays));
+      }
 
     case Type::DECIMAL:
       return std::make_shared<arrow::r::Converter_Decimal>(std::move(arrays));

diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R
@@ -26,6 +26,8 @@ if (tolower(Sys.info()[["sysname"]]) == "windows") {
 
 set.seed(1)
 
+MAX_INT <- 2147483647L
+
 test_that <- function(what, code) {
   testthat::test_that(what, {
     skip_if(getOption("..skip.tests", TRUE), "arrow C++ library not available")

diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
@@ -243,14 +243,16 @@ test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", {
 })
 
 test_that("array supports integer64", {
-  x <- bit64::as.integer64(1:10)
+  x <- bit64::as.integer64(1:10) + MAX_INT
   expect_array_roundtrip(x, int64())
 
   x[4] <- NA
   expect_array_roundtrip(x, int64())
 
   # all NA int64 (ARROW-3795)
-  expect_array_roundtrip(bit64::as.integer64(NA), int64())
+  all_na <- Array$create(bit64::as.integer64(NA))
+  expect_type_equal(all_na, int64())
+  expect_true(as.vector(is.na(all_na)))
 })
 
 test_that("array supports difftime", {
@@ -381,12 +383,23 @@ test_that("Array<int8>$as_vector() converts to integer (ARROW-3794)", {
   expect_equal(a$as_vector(), 0:255)
 })
 
-test_that("Arrays of uint{32,64} convert to numeric", {
+test_that("Arrays of {,u}int{32,64} convert to integer if they can fit", {
   u32 <- Array$create(1L)$cast(uint32())
-  expect_identical(as.vector(u32), 1)
+  expect_identical(as.vector(u32), 1L)
 
   u64 <- Array$create(1L)$cast(uint64())
-  expect_identical(as.vector(u64), 1)
+  expect_identical(as.vector(u64), 1L)
+
+  i64 <- Array$create(bit64::as.integer64(1:10))
+  expect_identical(as.vector(i64), 1:10)
+})
+
+test_that("Arrays of uint{32,64} convert to numeric if they can't fit integer", {
+  u32 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint32())
+  expect_identical(as.vector(u32), 1 + MAX_INT)
+
+  u64 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint64())
+  expect_identical(as.vector(u64), 1 + MAX_INT)
 })
 
 test_that("Array$create() recognise arrow::Array (ARROW-3815)", {

diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R
@@ -198,7 +198,7 @@ test_that("ChunkedArray supports POSIXct (ARROW-3716)", {
 })
 
 test_that("ChunkedArray supports integer64 (ARROW-3716)", {
-  x <- bit64::as.integer64(1:10)
+  x <- bit64::as.integer64(1:10) + MAX_INT
   expect_chunked_roundtrip(list(x, x), int64())
 })
 

diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd
@@ -109,11 +109,11 @@ In the tables, entries with a `-` are not currently implemented.
 | int8              | integer                  |
 | int16             | integer                  |
 | int32             | integer                  |
-| int64             | bit64::integer64         |
+| int64             | integer^++^              |
 | uint8             | integer                  |
 | uint16            | integer                  |
-| uint32            | double                   |
-| uint64            | double                   |
+| uint32            | integer^++^              |
+| uint64            | integer^++^              |
 | float16           | -                        |
 | float32           | double                   |
 | float64           | double                   |
@@ -127,7 +127,7 @@ In the tables, entries with a `-` are not currently implemented.
 | timestamp         | POSIXct                  |
 | duration          | -                        |
 | decimal           | double                   |
-| dictionary        | factor^++^               |
+| dictionary        | factor^+++^              |
 | list              | list                     |
 | fixed_size_list   | -                        |
 | struct            | data.frame               |
@@ -138,7 +138,9 @@ In the tables, entries with a `-` are not currently implemented.
 | large_binary      | -                        |
 | large_list        | -                        |
 
-^++^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings.
+^++^: These integer types may contain values that exceed the range of R's `integer` type (32-bit signed integer). When they do, `uint32` and `uint64` are converted to `double` ("numeric") and `int64` is converted to `bit64::integer64`.
+
+^+++^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings.
 
 ## Class structure and package conventions