Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 37 additions & 7 deletions r/src/array_to_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,18 @@
#if defined(ARROW_R_WITH_ARROW)

#include <arrow/array.h>
#include <arrow/datum.h>
#include <arrow/table.h>
#include <arrow/util/bitmap_reader.h>
#include <arrow/util/bitmap_writer.h>
#include <arrow/util/int_util.h>
#include <arrow/util/parallel.h>
#include <arrow/util/task_group.h>

namespace arrow {

using internal::checked_cast;
using internal::IntegersCanFit;

namespace r {

Expand Down Expand Up @@ -673,6 +676,17 @@ class Converter_Null : public Converter {
}
};

bool ArraysCanFitInteger(ArrayVector arrays) {
bool out = false;
auto i32 = arrow::int32();
for (const auto& array : arrays) {
if (!out) {
out = arrow::IntegersCanFit(arrow::Datum(array), *i32).ok();
}
}
return out;
}

std::shared_ptr<Converter> Converter::Make(const std::shared_ptr<DataType>& type,
ArrayVector arrays) {
if (arrays.empty()) {
Expand Down Expand Up @@ -724,14 +738,24 @@ std::shared_ptr<Converter> Converter::Make(const std::shared_ptr<DataType>& type
return std::make_shared<arrow::r::Converter_Promotion<INTSXP, arrow::UInt16Type>>(
std::move(arrays));

// promotions to numeric vector
// promotions to numeric vector, if they don't fit into int32
case Type::UINT32:
return std::make_shared<arrow::r::Converter_Promotion<REALSXP, arrow::UInt32Type>>(
std::move(arrays));
if (ArraysCanFitInteger(arrays)) {
return std::make_shared<arrow::r::Converter_Promotion<INTSXP, arrow::UInt32Type>>(
std::move(arrays));
} else {
return std::make_shared<
arrow::r::Converter_Promotion<REALSXP, arrow::UInt32Type>>(std::move(arrays));
}

case Type::UINT64:
return std::make_shared<arrow::r::Converter_Promotion<REALSXP, arrow::UInt64Type>>(
std::move(arrays));
if (ArraysCanFitInteger(arrays)) {
return std::make_shared<arrow::r::Converter_Promotion<INTSXP, arrow::UInt64Type>>(
std::move(arrays));
} else {
return std::make_shared<
arrow::r::Converter_Promotion<REALSXP, arrow::UInt64Type>>(std::move(arrays));
}

case Type::HALF_FLOAT:
return std::make_shared<
Expand All @@ -742,7 +766,7 @@ std::shared_ptr<Converter> Converter::Make(const std::shared_ptr<DataType>& type
return std::make_shared<arrow::r::Converter_Promotion<REALSXP, arrow::FloatType>>(
std::move(arrays));

// time32 ane time64
// time32 and time64
case Type::TIME32:
return std::make_shared<arrow::r::Converter_Time<int32_t>>(std::move(arrays));

Expand All @@ -753,7 +777,13 @@ std::shared_ptr<Converter> Converter::Make(const std::shared_ptr<DataType>& type
return std::make_shared<arrow::r::Converter_Timestamp<int64_t>>(std::move(arrays));

case Type::INT64:
return std::make_shared<arrow::r::Converter_Int64>(std::move(arrays));
// Prefer integer if it fits
if (ArraysCanFitInteger(arrays)) {
return std::make_shared<arrow::r::Converter_Promotion<INTSXP, arrow::Int64Type>>(
std::move(arrays));
} else {
return std::make_shared<arrow::r::Converter_Int64>(std::move(arrays));
}

case Type::DECIMAL:
return std::make_shared<arrow::r::Converter_Decimal>(std::move(arrays));
Expand Down
2 changes: 2 additions & 0 deletions r/tests/testthat/helper-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ if (tolower(Sys.info()[["sysname"]]) == "windows") {

set.seed(1)

MAX_INT <- 2147483647L

test_that <- function(what, code) {
testthat::test_that(what, {
skip_if(getOption("..skip.tests", TRUE), "arrow C++ library not available")
Expand Down
23 changes: 18 additions & 5 deletions r/tests/testthat/test-Array.R
Original file line number Diff line number Diff line change
Expand Up @@ -243,14 +243,16 @@ test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", {
})

test_that("array supports integer64", {
x <- bit64::as.integer64(1:10)
x <- bit64::as.integer64(1:10) + MAX_INT
expect_array_roundtrip(x, int64())

x[4] <- NA
expect_array_roundtrip(x, int64())

# all NA int64 (ARROW-3795)
expect_array_roundtrip(bit64::as.integer64(NA), int64())
all_na <- Array$create(bit64::as.integer64(NA))
expect_type_equal(all_na, int64())
expect_true(as.vector(is.na(all_na)))
})

test_that("array supports difftime", {
Expand Down Expand Up @@ -381,12 +383,23 @@ test_that("Array<int8>$as_vector() converts to integer (ARROW-3794)", {
expect_equal(a$as_vector(), 0:255)
})

test_that("Arrays of uint{32,64} convert to numeric", {
test_that("Arrays of {,u}int{32,64} convert to integer if they can fit", {
u32 <- Array$create(1L)$cast(uint32())
expect_identical(as.vector(u32), 1)
expect_identical(as.vector(u32), 1L)

u64 <- Array$create(1L)$cast(uint64())
expect_identical(as.vector(u64), 1)
expect_identical(as.vector(u64), 1L)

i64 <- Array$create(bit64::as.integer64(1:10))
expect_identical(as.vector(i64), 1:10)
})

test_that("Arrays of uint{32,64} convert to numeric if they can't fit integer", {
u32 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint32())
expect_identical(as.vector(u32), 1 + MAX_INT)

u64 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint64())
expect_identical(as.vector(u64), 1 + MAX_INT)
})

test_that("Array$create() recognise arrow::Array (ARROW-3815)", {
Expand Down
2 changes: 1 addition & 1 deletion r/tests/testthat/test-chunked-array.R
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ test_that("ChunkedArray supports POSIXct (ARROW-3716)", {
})

test_that("ChunkedArray supports integer64 (ARROW-3716)", {
x <- bit64::as.integer64(1:10)
x <- bit64::as.integer64(1:10) + MAX_INT
expect_chunked_roundtrip(list(x, x), int64())
})

Expand Down
12 changes: 7 additions & 5 deletions r/vignettes/arrow.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,11 @@ In the tables, entries with a `-` are not currently implemented.
| int8 | integer |
| int16 | integer |
| int32 | integer |
| int64 | bit64::integer64 |
| int64 | integer^++^ |
| uint8 | integer |
| uint16 | integer |
| uint32 | double |
| uint64 | double |
| uint32 | integer^++^ |
| uint64 | integer^++^ |
| float16 | - |
| float32 | double |
| float64 | double |
Expand All @@ -127,7 +127,7 @@ In the tables, entries with a `-` are not currently implemented.
| timestamp | POSIXct |
| duration | - |
| decimal | double |
| dictionary | factor^++^ |
| dictionary | factor^+++^ |
| list | list |
| fixed_size_list | - |
| struct | data.frame |
Expand All @@ -138,7 +138,9 @@ In the tables, entries with a `-` are not currently implemented.
| large_binary | - |
| large_list | - |

^++^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings.
^++^: These integer types may contain values that exceed the range of R's `integer` type (32-bit signed integer). When they do, `uint32` and `uint64` are converted to `double` ("numeric") and `int64` is converted to `bit64::integer64`.

^+++^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings.

## Class structure and package conventions

Expand Down