From 055e964e9f8e0c318769c08a0f15cd0f60094bd5 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 13:50:28 +0100 Subject: [PATCH 01/13] Add 32 and 64 bit decimals --- r/R/dplyr-funcs-type.R | 2 +- r/R/type.R | 20 ++++++++++++++++++++ r/src/array_to_vector.cpp | 6 ++++++ r/src/datatype.cpp | 18 ++++++++++++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 85c26ec05c8..1b42e52baac 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -186,7 +186,7 @@ register_bindings_type_inspect <- function() { is.numeric(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE", - "DECIMAL128", "DECIMAL256" + "DECIMAL32", "DECIMAL64", "DECIMAL128", "DECIMAL256" )]) }) register_binding("base::is.double", function(x) { diff --git a/r/R/type.R b/r/R/type.R index d6db6f146ed..f6f757030d8 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -300,6 +300,10 @@ DecimalType <- R6Class("DecimalType", ) ) +Decimal32Type <- R6Class("Decimal32Type", inherit = DecimalType) + +Decimal64Type <- R6Class("Decimal64Type", inherit = DecimalType) + Decimal128Type <- R6Class("Decimal128Type", inherit = DecimalType) Decimal256Type <- R6Class("Decimal256Type", inherit = DecimalType) @@ -591,6 +595,20 @@ decimal <- function(precision, scale) { } } +#' @rdname data-type +#' @export +decimal32 <- function(precision, scale) { + args <- check_decimal_args(precision, scale) + Decimal32Type__initialize(args$precision, args$scale) +} + +#' @rdname data-type +#' @export +decimal64 <- function(precision, scale) { + args <- check_decimal_args(precision, scale) + Decimal64Type__initialize(args$precision, args$scale) +} + #' @rdname data-type #' @export decimal128 <- function(precision, scale) { @@ -768,6 +786,8 @@ canonical_type_str <- function(type_str) { time64 = "time64", null = "null", timestamp = "timestamp", + 
decimal32 = "decimal32", + decimal64 = "decimal64", decimal128 = "decimal128", decimal256 = "decimal256", struct = "struct", diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 2f0508eb7a4..c995322382d 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -1313,6 +1313,12 @@ std::shared_ptr Converter::Make( return std::make_shared(chunked_array); } + case Type::DECIMAL32: + return std::make_shared>(chunked_array); + + case Type::DECIMAL64: + return std::make_shared>(chunked_array); + case Type::DECIMAL128: return std::make_shared>(chunked_array); diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index 2f2b89d658d..96790644176 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -81,6 +81,10 @@ const char* r6_class_name::get( case Type::DURATION: return "DurationType"; + case Type::DECIMAL32: + return "Decimal32Type"; + case Type::DECIMAL64: + return "Decimal64Type"; case Type::DECIMAL128: return "Decimal128Type"; case Type::DECIMAL256: @@ -181,6 +185,20 @@ std::shared_ptr Date64__initialize() { return arrow::date64(); // [[arrow::export]] std::shared_ptr Null__initialize() { return arrow::null(); } +// [[arrow::export]] +std::shared_ptr Decimal32Type__initialize(int32_t precision, + int32_t scale) { +// Use the builder that validates inputs +return ValueOrStop(arrow::Decimal32Type::Make(precision, scale)); +} + +// [[arrow::export]] +std::shared_ptr Decimal64Type__initialize(int32_t precision, + int32_t scale) { +// Use the builder that validates inputs +return ValueOrStop(arrow::Decimal64Type::Make(precision, scale)); +} + // [[arrow::export]] std::shared_ptr Decimal128Type__initialize(int32_t precision, int32_t scale) { From bba783856297f35fb528cd7e607cdd6d39b142db Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 13:50:50 +0100 Subject: [PATCH 02/13] Expand tests to account for new decimal types --- r/tests/testthat/test-Array.R | 30 ++++++++++++++++++++++++++- r/tests/testthat/test-chunked-array.R 
| 1 + r/tests/testthat/test-data-type.R | 2 +- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index ecb0f65dd75..abd03c2970d 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -1321,7 +1321,14 @@ test_that("Array to C-interface", { }) test_that("Can convert R integer/double to decimal (ARROW-11631)", { - # Check both decimal128 and decimal256 + # Check all of decimal32, decimal64, decimal128 and decimal256 + + + decimal32_from_dbl <- arrow_array(c(1, NA_real_), type = decimal32(12, 2)) + decimal64_from_dbl <- arrow_array(c(1, NA_real_), type = decimal64(12, 2)) + decimal32_from_int <- arrow_array(c(1L, NA_integer_), type = decimal32(12, 2)) + decimal64_from_int <- arrow_array(c(1L, NA_integer_), type = decimal64(12, 2)) + decimal128_from_dbl <- arrow_array(c(1, NA_real_), type = decimal128(12, 2)) decimal256_from_dbl <- arrow_array(c(1, NA_real_), type = decimal256(12, 2)) decimal128_from_int <- arrow_array(c(1L, NA_integer_), type = decimal128(12, 2)) @@ -1333,6 +1340,16 @@ test_that("Can convert R integer/double to decimal (ARROW-11631)", { decimal_from_altrep_dbl <- arrow_array(altrep_dbl, type = decimal128(12, 2)) decimal_from_altrep_int <- arrow_array(altrep_int, type = decimal128(12, 2)) + expect_equal( + decimal32_from_dbl, + arrow_array(c(1, NA))$cast(decimal32(12, 2)) + ) + + expect_equal( + decimal64_from_dbl, + arrow_array(c(1, NA))$cast(decimal64()(12, 2)) + ) + expect_equal( decimal128_from_dbl, arrow_array(c(1, NA))$cast(decimal128(12, 2)) @@ -1343,6 +1360,17 @@ test_that("Can convert R integer/double to decimal (ARROW-11631)", { arrow_array(c(1, NA))$cast(decimal256(12, 2)) ) + expect_equal( + decimal32_from_int, + arrow_array(c(1, NA))$cast(decimal32(12, 2)) + ) + + expect_equal( + decimal64_from_int, + arrow_array(c(1, NA))$cast(decimal64(12, 2)) + ) + + expect_equal( decimal128_from_int, arrow_array(c(1, NA))$cast(decimal128(12, 2)) diff 
--git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index 4ee71260799..560b2459790 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -248,6 +248,7 @@ test_that("ChunkedArray supports empty arrays (ARROW-13761)", { int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), uint64(), float32(), float64(), timestamp("ns"), binary(), large_binary(), fixed_size_binary(32), date32(), date64(), + decimal32(4, 2), decimal64(4, 2), decimal128(4, 2), decimal256(4, 2), dictionary(), struct(x = int32()) ) diff --git a/r/tests/testthat/test-data-type.R b/r/tests/testthat/test-data-type.R index e7212eb61b5..3cf2a87b6cf 100644 --- a/r/tests/testthat/test-data-type.R +++ b/r/tests/testthat/test-data-type.R @@ -497,7 +497,7 @@ test_that("decimal type and validation", { expect_error(decimal128(4, NA), "`scale` must be an integer") expect_error(decimal128(3:4, NA), "`precision` must have size 1. not size 2") expect_error(decimal128(4, 2:3), "`scale` must have size 1. 
not size 2") - # TODO remove precision range tests below once functionality is tested in C++ (ARROW-15162) + # TODO remove precision range tests below once functionality is tested in C++ (ARROW-15162) - CAN REMOVE THESE IN THIS PR BUT CHECK expect_error(decimal128(0, 2), "Invalid: Decimal precision out of range [1, 38]: 0", fixed = TRUE) expect_error(decimal128(100, 2), "Invalid: Decimal precision out of range [1, 38]: 100", fixed = TRUE) From a3b7fe19b182a3e145b5b906fb1161a8df238e53 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 13:57:01 +0100 Subject: [PATCH 03/13] Update more tests --- r/tests/testthat/test-dplyr-funcs-type.R | 32 +++++++++++++++++++++--- r/tests/testthat/test-type.R | 8 ++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 08fa6ddc64c..8b3927fbd46 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -253,6 +253,8 @@ test_that("type checks with is() giving Arrow types", { Table$create( i32 = Array$create(1, int32()), dec = Array$create(pi)$cast(decimal(3, 2)), + dec32 = Array$create(pi)$cast(decimal32(3, 2)), + dec64 = Array$create(pi)$cast(decimal64(3, 2)), dec128 = Array$create(pi)$cast(decimal128(3, 2)), dec256 = Array$create(pi)$cast(decimal256(3, 2)), f64 = Array$create(1.1, float64()), @@ -261,16 +263,34 @@ test_that("type checks with is() giving Arrow types", { transmute( i32_is_i32 = is(i32, int32()), i32_is_dec = is(i32, decimal(3, 2)), + i32_is_dec32 = is(i32, decimal32(3, 2)), + i32_is_dec64 = is(i32, decimal64(3, 2)), i32_is_dec128 = is(i32, decimal128(3, 2)), i32_is_dec256 = is(i32, decimal256(3, 2)), i32_is_f64 = is(i32, float64()), i32_is_str = is(i32, string()), dec_is_i32 = is(dec, int32()), dec_is_dec = is(dec, decimal(3, 2)), + dec_is_dec32 = is(dec, decimal32(3, 2)), + dec_is_dec64 = is(dec, decimal64(3, 2)), dec_is_dec128 = is(dec, decimal128(3, 2)), 
dec_is_dec256 = is(dec, decimal256(3, 2)), dec_is_f64 = is(dec, float64()), dec_is_str = is(dec, string()), + dec32_is_i32 = is(dec32, int32()), + dec32_is_dec32 = is(dec32, decimal32(3, 2)), + dec32_is_dec64 = is(dec32, decimal64(3, 2)), + dec32_is_dec128 = is(dec32, decimal128(3, 2)), + dec32_is_dec256 = is(dec32, decimal256(3, 2)), + dec32_is_f64 = is(dec32, float64()), + dec32_is_str = is(dec32, string()), + dec64_is_i32 = is(dec64, int32()), + dec64_is_dec32 = is(dec64, decimal32(3, 2)), + dec64_is_dec64 = is(dec64, decimal64(3, 2)), + dec64_is_dec128 = is(dec64, decimal128(3, 2)), + dec64_is_dec256 = is(dec64, decimal256(3, 2)), + dec64_is_f64 = is(dec64, float64()), + dec64_is_str = is(dec64, string()), dec128_is_i32 = is(dec128, int32()), dec128_is_dec128 = is(dec128, decimal128(3, 2)), dec128_is_dec256 = is(dec128, decimal256(3, 2)), @@ -283,11 +303,15 @@ test_that("type checks with is() giving Arrow types", { dec256_is_str = is(dec128, string()), f64_is_i32 = is(f64, int32()), f64_is_dec = is(f64, decimal(3, 2)), + f64_is_dec32 = is(f64, decimal32(3, 2)), + f64_is_dec64 = is(f64, decimal64(3, 2)), f64_is_dec128 = is(f64, decimal128(3, 2)), f64_is_dec256 = is(f64, decimal256(3, 2)), f64_is_f64 = is(f64, float64()), f64_is_str = is(f64, string()), str_is_i32 = is(str, int32()), + str_is_dec32 = is(str, decimal32(3, 2)), + str_is_dec64 = is(str, decimal64(3, 2)), str_is_dec128 = is(str, decimal128(3, 2)), str_is_dec256 = is(str, decimal256(3, 2)), str_is_i64 = is(str, float64()), @@ -297,9 +321,11 @@ test_that("type checks with is() giving Arrow types", { t() %>% as.vector(), c( - TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, - FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, - FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE + TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, + FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, 
FALSE, + FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, + FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, + TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE ) ) # with class2=string diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index 4f6210c29c1..be543f1ad2f 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -200,6 +200,14 @@ test_that("Type strings are correctly canonicalized", { canonical_type_str("timestamp"), sub("^([^([<]+).*$", "\\1", timestamp()$ToString()) ) + expect_equal( + canonical_type_str("decimal32"), + sub("^([^([<]+).*$", "\\1", decimal32(3, 2)$ToString()) + ) + expect_equal( + canonical_type_str("decimal64"), + sub("^([^([<]+).*$", "\\1", decimal64(3, 2)$ToString()) + ) expect_equal( canonical_type_str("decimal128"), sub("^([^([<]+).*$", "\\1", decimal(3, 2)$ToString()) From b7b350b97aa80acc5c7dd707d9ed6528fd6e0c6f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 14:28:04 +0100 Subject: [PATCH 04/13] Call devtools::document() to add the new types --- r/NAMESPACE | 2 ++ r/R/arrowExports.R | 8 ++++++++ r/man/data-type.Rd | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index 412d70ed22c..cdeb27c4067 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -319,6 +319,8 @@ export(date64) export(decimal) export(decimal128) export(decimal256) +export(decimal32) +export(decimal64) export(default_memory_pool) export(dictionary) export(duration) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 4ed612fc734..a988cfb4af7 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -924,6 +924,14 @@ Null__initialize <- function() { .Call(`_arrow_Null__initialize`) } +Decimal32Type__initialize <- function(precision, scale) { + .Call(`_arrow_Decimal32Type__initialize`, precision, scale) +} + +Decimal64Type__initialize <- function(precision, scale) { + .Call(`_arrow_Decimal64Type__initialize`, precision, 
scale) +} + Decimal128Type__initialize <- function(precision, scale) { .Call(`_arrow_Decimal128Type__initialize`, precision, scale) } diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index 214e8ddc1f6..4a410e5cb45 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -31,6 +31,8 @@ \alias{null} \alias{timestamp} \alias{decimal} +\alias{decimal32} +\alias{decimal64} \alias{decimal128} \alias{decimal256} \alias{struct} @@ -100,6 +102,10 @@ timestamp(unit = c("s", "ms", "us", "ns"), timezone = "") decimal(precision, scale) +decimal32(precision, scale) + +decimal64(precision, scale) + decimal128(precision, scale) decimal256(precision, scale) From 23d2ba83626be1868ba21bb95946c166287401cc Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 14:28:46 +0100 Subject: [PATCH 05/13] Add new decimal types in a few more missed places --- r/R/dplyr-funcs-simple.R | 2 +- r/R/type.R | 6 +++++- r/src/array_to_vector.cpp | 4 ++-- r/src/arrowExports.cpp | 20 ++++++++++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-simple.R b/r/R/dplyr-funcs-simple.R index 05780721f24..ac8a7a46ca4 100644 --- a/r/R/dplyr-funcs-simple.R +++ b/r/R/dplyr-funcs-simple.R @@ -190,7 +190,7 @@ common_type <- function(exprs) { cast_or_parse <- function(x, type) { to_type_id <- type$id - if (to_type_id %in% c(Type[["DECIMAL128"]], Type[["DECIMAL256"]])) { + if (to_type_id %in% c(Type[["DECIMAL32"]], Type[["DECIMAL64"]], Type[["DECIMAL128"]], Type[["DECIMAL256"]])) { # TODO: determine the minimum size of decimal (or integer) required to # accommodate x # We would like to keep calculations on decimal if that's what the data has diff --git a/r/R/type.R b/r/R/type.R index f6f757030d8..1552d1dfa58 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -590,8 +590,12 @@ decimal <- function(precision, scale) { if (args$precision > 38) { decimal256(args$precision, args$scale) - } else { + } else if (args$precision > 18) { decimal128(args$precision, args$scale) + } else 
if (args$precision > 9) { + decimal64(args$precision, args$scale) + } else { + decimal32(args$precision, args$scale) } } diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index c995322382d..d27e1b93a93 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -1314,10 +1314,10 @@ std::shared_ptr Converter::Make( } case Type::DECIMAL32: - return std::make_shared>(chunked_array); + return std::make_shared>(chunked_array); case Type::DECIMAL64: - return std::make_shared>(chunked_array); + return std::make_shared>(chunked_array); case Type::DECIMAL128: return std::make_shared>(chunked_array); diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index c71d1c77305..c31cc6dc9c1 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -2551,6 +2551,24 @@ BEGIN_CPP11 END_CPP11 } // datatype.cpp +std::shared_ptr Decimal32Type__initialize(int32_t precision, int32_t scale); +extern "C" SEXP _arrow_Decimal32Type__initialize(SEXP precision_sexp, SEXP scale_sexp){ +BEGIN_CPP11 + arrow::r::Input::type precision(precision_sexp); + arrow::r::Input::type scale(scale_sexp); + return cpp11::as_sexp(Decimal32Type__initialize(precision, scale)); +END_CPP11 +} +// datatype.cpp +std::shared_ptr Decimal64Type__initialize(int32_t precision, int32_t scale); +extern "C" SEXP _arrow_Decimal64Type__initialize(SEXP precision_sexp, SEXP scale_sexp){ +BEGIN_CPP11 + arrow::r::Input::type precision(precision_sexp); + arrow::r::Input::type scale(scale_sexp); + return cpp11::as_sexp(Decimal64Type__initialize(precision, scale)); +END_CPP11 +} +// datatype.cpp std::shared_ptr Decimal128Type__initialize(int32_t precision, int32_t scale); extern "C" SEXP _arrow_Decimal128Type__initialize(SEXP precision_sexp, SEXP scale_sexp){ BEGIN_CPP11 @@ -5910,6 +5928,8 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0}, { 
"_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0}, + { "_arrow_Decimal32Type__initialize", (DL_FUNC) &_arrow_Decimal32Type__initialize, 2}, + { "_arrow_Decimal64Type__initialize", (DL_FUNC) &_arrow_Decimal64Type__initialize, 2}, { "_arrow_Decimal128Type__initialize", (DL_FUNC) &_arrow_Decimal128Type__initialize, 2}, { "_arrow_Decimal256Type__initialize", (DL_FUNC) &_arrow_Decimal256Type__initialize, 2}, { "_arrow_DayTimeInterval__initialize", (DL_FUNC) &_arrow_DayTimeInterval__initialize, 0}, From 1249367c0ab33919b52cb2916d17dd1938b647a3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 14:28:56 +0100 Subject: [PATCH 06/13] Add new enums --- r/R/enums.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/r/R/enums.R b/r/R/enums.R index 98995b2a2e7..4d0bda96da5 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -80,7 +80,13 @@ Type <- enum("Type::type", LARGE_BINARY = 35L, LARGE_LIST = 36L, INTERVAL_MONTH_DAY_NANO = 37L, - RUN_END_ENCODED = 38L + RUN_END_ENCODED = 38L, + STRING_VIEW = 39L, + BINARY_VIEW = 40L, + LIST_VIEW = 41L, + LARGE_LIST_VIEW = 42L, + DECIMAL32 = 43L, + DECIMAL64 = 44L ) TYPES_WITH_NAN <- Type[c("HALF_FLOAT", "FLOAT", "DOUBLE")] @@ -88,7 +94,7 @@ TYPES_NUMERIC <- Type[ c( "INT8", "UINT8", "INT16", "UINT16", "INT32", "UINT32", "INT64", "UINT64", "HALF_FLOAT", "FLOAT", "DOUBLE", - "DECIMAL128", "DECIMAL256" + "DECIMAL32", "DECIMAL64", "DECIMAL128", "DECIMAL256" ) ] From 6174645b4a28f6e051a6aaefe8e16bc42fd22642 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 14:42:34 +0100 Subject: [PATCH 07/13] Fix Array test failures --- r/tests/testthat/test-Array.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index abd03c2970d..cde6877c4da 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -1324,9 +1324,9 @@ test_that("Can convert R integer/double to decimal (ARROW-11631)", { 
# Check all of decimal32, decimal64, decimal128 and decimal256 - decimal32_from_dbl <- arrow_array(c(1, NA_real_), type = decimal32(12, 2)) + decimal32_from_dbl <- arrow_array(c(1, NA_real_), type = decimal32(9, 2)) decimal64_from_dbl <- arrow_array(c(1, NA_real_), type = decimal64(12, 2)) - decimal32_from_int <- arrow_array(c(1L, NA_integer_), type = decimal32(12, 2)) + decimal32_from_int <- arrow_array(c(1L, NA_integer_), type = decimal32(9, 2)) decimal64_from_int <- arrow_array(c(1L, NA_integer_), type = decimal64(12, 2)) decimal128_from_dbl <- arrow_array(c(1, NA_real_), type = decimal128(12, 2)) @@ -1342,12 +1342,12 @@ test_that("Can convert R integer/double to decimal (ARROW-11631)", { expect_equal( decimal32_from_dbl, - arrow_array(c(1, NA))$cast(decimal32(12, 2)) + arrow_array(c(1, NA))$cast(decimal32(9, 2)) ) expect_equal( decimal64_from_dbl, - arrow_array(c(1, NA))$cast(decimal64()(12, 2)) + arrow_array(c(1, NA))$cast(decimal64(12, 2)) ) expect_equal( @@ -1362,7 +1362,7 @@ test_that("Can convert R integer/double to decimal (ARROW-11631)", { expect_equal( decimal32_from_int, - arrow_array(c(1, NA))$cast(decimal32(12, 2)) + arrow_array(c(1, NA))$cast(decimal32(9, 2)) ) expect_equal( From 79f188634845792d597d66020b294bd7a1d325db Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 14:48:26 +0100 Subject: [PATCH 08/13] Enable filters on decimal32 and decimal64 --- .../arrow/compute/kernels/vector_selection_filter_internal.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 194c3591337..1c2eacb9a76 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -1096,6 +1096,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(match::LargeBinaryLike()), plain_filter, BinaryFilterExec}, {InputType(null()), 
plain_filter, NullFilterExec}, {InputType(Type::FIXED_SIZE_BINARY), plain_filter, PrimitiveFilterExec}, + {InputType(Type::DECIMAL32), plain_filter, PrimitiveFilterExec}, + {InputType(Type::DECIMAL64), plain_filter, PrimitiveFilterExec}, {InputType(Type::DECIMAL128), plain_filter, PrimitiveFilterExec}, {InputType(Type::DECIMAL256), plain_filter, PrimitiveFilterExec}, {InputType(Type::DICTIONARY), plain_filter, DictionaryFilterExec}, @@ -1116,6 +1118,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(match::LargeBinaryLike()), ree_filter, BinaryFilterExec}, {InputType(null()), ree_filter, NullFilterExec}, {InputType(Type::FIXED_SIZE_BINARY), ree_filter, PrimitiveFilterExec}, + {InputType(Type::DECIMAL32), ree_filter, PrimitiveFilterExec}, + {InputType(Type::DECIMAL64), ree_filter, PrimitiveFilterExec}, {InputType(Type::DECIMAL128), ree_filter, PrimitiveFilterExec}, {InputType(Type::DECIMAL256), ree_filter, PrimitiveFilterExec}, {InputType(Type::DICTIONARY), ree_filter, DictionaryFilterExec}, From b75c573e1b87b89ce10a23af13f0da58d8308a89 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 15:30:25 +0100 Subject: [PATCH 09/13] Make tests less horrifying to update --- r/tests/testthat/test-dplyr-funcs-type.R | 114 ++++++++++++++++------- 1 file changed, 78 insertions(+), 36 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 8b3927fbd46..80cab8aa9ab 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -249,17 +249,15 @@ test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", { test_that("type checks with is() giving Arrow types", { # with class2=DataType + extract_logicals <- function(x){ + x %>% + collect() %>% + t() %>% + as.vector() + } + expect_equal( - Table$create( - i32 = Array$create(1, int32()), - dec = Array$create(pi)$cast(decimal(3, 2)), - dec32 = Array$create(pi)$cast(decimal32(3, 2)), - dec64 = 
Array$create(pi)$cast(decimal64(3, 2)), - dec128 = Array$create(pi)$cast(decimal128(3, 2)), - dec256 = Array$create(pi)$cast(decimal256(3, 2)), - f64 = Array$create(1.1, float64()), - str = Array$create("a", arrow::string()) - ) %>% + Table$create(i32 = Array$create(1, int32())) %>% transmute( i32_is_i32 = is(i32, int32()), i32_is_dec = is(i32, decimal(3, 2)), @@ -268,7 +266,15 @@ test_that("type checks with is() giving Arrow types", { i32_is_dec128 = is(i32, decimal128(3, 2)), i32_is_dec256 = is(i32, decimal256(3, 2)), i32_is_f64 = is(i32, float64()), - i32_is_str = is(i32, string()), + i32_is_str = is(i32, string()) + ) %>% + extract_logicals(), + c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE) + ) + + expect_equal( + Table$create(dec = Array$create(pi)$cast(decimal(3, 2))) %>% + transmute( dec_is_i32 = is(dec, int32()), dec_is_dec = is(dec, decimal(3, 2)), dec_is_dec32 = is(dec, decimal32(3, 2)), @@ -276,31 +282,78 @@ test_that("type checks with is() giving Arrow types", { dec_is_dec128 = is(dec, decimal128(3, 2)), dec_is_dec256 = is(dec, decimal256(3, 2)), dec_is_f64 = is(dec, float64()), - dec_is_str = is(dec, string()), + dec_is_str = is(dec, string()) + ) %>% + extract_logicals(), + c(FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE) + ) + + expect_equal( + Table$create(dec32 = Array$create(pi)$cast(decimal32(3, 2))) %>% + transmute( dec32_is_i32 = is(dec32, int32()), dec32_is_dec32 = is(dec32, decimal32(3, 2)), dec32_is_dec64 = is(dec32, decimal64(3, 2)), dec32_is_dec128 = is(dec32, decimal128(3, 2)), dec32_is_dec256 = is(dec32, decimal256(3, 2)), dec32_is_f64 = is(dec32, float64()), - dec32_is_str = is(dec32, string()), + dec32_is_str = is(dec32, string()) + ) %>% + extract_logicals(), + c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE) + ) + + expect_equal( + Table$create(dec64 = Array$create(pi)$cast(decimal64(3, 2))) %>% + transmute( dec64_is_i32 = is(dec64, int32()), dec64_is_dec32 = is(dec64, decimal32(3, 2)), dec64_is_dec64 = is(dec64, 
decimal64(3, 2)), dec64_is_dec128 = is(dec64, decimal128(3, 2)), dec64_is_dec256 = is(dec64, decimal256(3, 2)), dec64_is_f64 = is(dec64, float64()), - dec64_is_str = is(dec64, string()), + dec64_is_str = is(dec64, string()) + ) %>% + extract_logicals(), + c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE) + ) + + expect_equal( + Table$create(dec128 = Array$create(pi)$cast(decimal128(3, 2))) %>% + transmute( dec128_is_i32 = is(dec128, int32()), + dec128_is_dec32 = is(dec128, decimal32(3, 2)), + dec128_is_dec64 = is(dec128, decimal64(3, 2)), dec128_is_dec128 = is(dec128, decimal128(3, 2)), dec128_is_dec256 = is(dec128, decimal256(3, 2)), dec128_is_f64 = is(dec128, float64()), - dec128_is_str = is(dec128, string()), - dec256_is_i32 = is(dec128, int32()), - dec256_is_dec128 = is(dec128, decimal128(3, 2)), - dec256_is_dec256 = is(dec128, decimal256(3, 2)), - dec256_is_f64 = is(dec128, float64()), - dec256_is_str = is(dec128, string()), + dec128_is_str = is(dec128, string()) + ) %>% + extract_logicals(), + c(FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + ) + + expect_equal( + Table$create(dec256 = Array$create(pi)$cast(decimal256(3, 2))) %>% + transmute( + dec256_is_i32 = is(dec256, int32()), + dec256_is_dec32 = is(dec256, decimal32(3, 2)), + dec256_is_dec64 = is(dec256, decimal64(3, 2)), + dec256_is_dec128 = is(dec256, decimal128(3, 2)), + dec256_is_dec256 = is(dec256, decimal256(3, 2)), + dec256_is_f64 = is(dec256, float64()), + dec256_is_str = is(dec256, string()) + ) %>% + extract_logicals(), + c(FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE) + ) + + expect_equal( + Table$create( + f64 = Array$create(1.1, float64()), + str = Array$create("a", arrow::string()) + ) %>% + transmute( f64_is_i32 = is(f64, int32()), f64_is_dec = is(f64, decimal(3, 2)), f64_is_dec32 = is(f64, decimal32(3, 2)), @@ -317,16 +370,9 @@ test_that("type checks with is() giving Arrow types", { str_is_i64 = is(str, float64()), str_is_str = is(str, string()) ) %>% - collect() %>% - t() %>% - 
as.vector(), - c( - TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, - FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, - FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, - FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, - TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE - ) + extract_logicals(), + c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, TRUE) ) # with class2=string expect_equal( @@ -348,9 +394,7 @@ test_that("type checks with is() giving Arrow types", { str_is_i64 = is(str, "double"), str_is_str = is(str, "string") ) %>% - collect() %>% - t() %>% - as.vector(), + extract_logicals(), c(TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE) ) # with class2=string alias @@ -388,9 +432,7 @@ test_that("type checks with is() giving Arrow types", { str_is_lgl = is(str, "boolean"), str_is_str = is(str, "utf8") ) %>% - collect() %>% - t() %>% - as.vector(), + extract_logicals(), c( TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, From e1e657b4cd261a01f7fe4151a84f22ab3ad06e14 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 15:35:56 +0100 Subject: [PATCH 10/13] lint --- r/R/enums.R | 4 ++-- r/src/datatype.cpp | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/r/R/enums.R b/r/R/enums.R index 4d0bda96da5..a28728552f8 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -95,8 +95,8 @@ TYPES_NUMERIC <- Type[ "INT8", "UINT8", "INT16", "UINT16", "INT32", "UINT32", "INT64", "UINT64", "HALF_FLOAT", "FLOAT", "DOUBLE", "DECIMAL32", "DECIMAL64", "DECIMAL128", "DECIMAL256" - ) - ] + ) +] #' @rdname enums #' @export diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index 96790644176..ea407fc7776 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ 
-187,16 +187,16 @@ std::shared_ptr Null__initialize() { return arrow::null(); } // [[arrow::export]] std::shared_ptr Decimal32Type__initialize(int32_t precision, - int32_t scale) { -// Use the builder that validates inputs -return ValueOrStop(arrow::Decimal32Type::Make(precision, scale)); + int32_t scale) { + // Use the builder that validates inputs + return ValueOrStop(arrow::Decimal32Type::Make(precision, scale)); } // [[arrow::export]] std::shared_ptr Decimal64Type__initialize(int32_t precision, - int32_t scale) { -// Use the builder that validates inputs -return ValueOrStop(arrow::Decimal64Type::Make(precision, scale)); + int32_t scale) { + // Use the builder that validates inputs + return ValueOrStop(arrow::Decimal64Type::Make(precision, scale)); } // [[arrow::export]] From f6b31b4ce357d167d26b3a6e11d12484c2a19f88 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 15:57:08 +0100 Subject: [PATCH 11/13] Fix tests where decimal range update makes them fail --- r/tests/testthat/test-data-type.R | 14 +++----------- r/tests/testthat/test-type.R | 2 +- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/r/tests/testthat/test-data-type.R b/r/tests/testthat/test-data-type.R index 3cf2a87b6cf..fa2e5bcd6e8 100644 --- a/r/tests/testthat/test-data-type.R +++ b/r/tests/testthat/test-data-type.R @@ -474,16 +474,15 @@ test_that("DictionaryType validation", { }) test_that("decimal type and validation", { - expect_r6_class(decimal(4, 2), "Decimal128Type") + expect_r6_class(decimal(4, 2), "Decimal32Type") + expect_r6_class(decimal(14, 2), "Decimal64Type") + expect_r6_class(decimal(22, 2), "Decimal128Type") expect_r6_class(decimal(39, 2), "Decimal256Type") expect_error(decimal("four"), "`precision` must be an integer") expect_error(decimal(4, "two"), "`scale` must be an integer") expect_error(decimal(NA, 2), "`precision` must be an integer") expect_error(decimal(4, NA), "`scale` must be an integer") - # TODO remove precision range tests below once 
functionality is tested in C++ (ARROW-15162) - expect_error(decimal(0, 2), "Invalid: Decimal precision out of range [1, 38]: 0", fixed = TRUE) - expect_error(decimal(100, 2), "Invalid: Decimal precision out of range [1, 76]: 100", fixed = TRUE) # decimal() creates either decimal128 or decimal256 based on precision expect_identical(class(decimal(38, 2)), class(decimal128(38, 2))) @@ -497,10 +496,6 @@ test_that("decimal type and validation", { expect_error(decimal128(4, NA), "`scale` must be an integer") expect_error(decimal128(3:4, NA), "`precision` must have size 1. not size 2") expect_error(decimal128(4, 2:3), "`scale` must have size 1. not size 2") - # TODO remove precision range tests below once functionality is tested in C++ (ARROW-15162) - CAN REMOVE THESE IN THIS PR BUT CHECK - expect_error(decimal128(0, 2), "Invalid: Decimal precision out of range [1, 38]: 0", fixed = TRUE) - expect_error(decimal128(100, 2), "Invalid: Decimal precision out of range [1, 38]: 100", fixed = TRUE) - expect_r6_class(decimal256(4, 2), "Decimal256Type") @@ -510,9 +505,6 @@ test_that("decimal type and validation", { expect_error(decimal256(4, NA), "`scale` must be an integer") expect_error(decimal256(3:4, NA), "`precision` must have size 1. not size 2") expect_error(decimal256(4, 2:3), "`scale` must have size 1. 
not size 2") - # TODO remove precision range tests below once functionality is tested in C++ (ARROW-15162) - expect_error(decimal256(0, 2), "Invalid: Decimal precision out of range [1, 76]: 0", fixed = TRUE) - expect_error(decimal256(100, 2), "Invalid: Decimal precision out of range [1, 76]: 100", fixed = TRUE) }) test_that("Binary", { diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index be543f1ad2f..b01af80dc67 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -210,7 +210,7 @@ test_that("Type strings are correctly canonicalized", { ) expect_equal( canonical_type_str("decimal128"), - sub("^([^([<]+).*$", "\\1", decimal(3, 2)$ToString()) + sub("^([^([<]+).*$", "\\1", decimal(31, 2)$ToString()) ) expect_equal( canonical_type_str("decimal128"), From 6df375dcbc34a495569ce385b9df83be330089c4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 5 Jun 2025 15:58:36 +0100 Subject: [PATCH 12/13] Linter fun --- r/tests/testthat/test-dplyr-funcs-type.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 80cab8aa9ab..dbbd2ab206f 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -249,7 +249,7 @@ test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", { test_that("type checks with is() giving Arrow types", { # with class2=DataType - extract_logicals <- function(x){ + extract_logicals <- function(x) { x %>% collect() %>% t() %>% @@ -371,8 +371,10 @@ test_that("type checks with is() giving Arrow types", { str_is_str = is(str, string()) ) %>% extract_logicals(), - c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, - FALSE, FALSE, FALSE, FALSE, FALSE, TRUE) + c( + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, TRUE + ) ) # with class2=string expect_equal( From 
1a6dff66cc764bcc64608fd380dce6c0ac1146de Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 13 Jun 2025 14:47:41 +0000 Subject: [PATCH 13/13] Add new types to vignette --- r/vignettes/data_types.Rmd | 123 +++++++++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 32 deletions(-) diff --git a/r/vignettes/data_types.Rmd b/r/vignettes/data_types.Rmd index 4b5ee01b6ab..d5c70a8f02d 100644 --- a/r/vignettes/data_types.Rmd +++ b/r/vignettes/data_types.Rmd @@ -6,11 +6,16 @@ description: > output: rmarkdown::html_vignette --- -Arrow has a rich data type system that includes direct analogs of many R data types, and many data types that do not have a counterpart in R. This article describes the Arrow type system, compares it to R data types, and outlines the default mappings used when data are transferred from Arrow to R. At the end of the article there are two lookup tables: one describing the default "R to Arrow" type mappings and the other describing the "Arrow to R" mappings. +Arrow has a rich data type system that includes direct analogs of many R data types, and many data types that do not have a counterpart +in R. This article describes the Arrow type system, compares it to R data types, and outlines the default mappings used when data are + transferred from Arrow to R. At the end of the article there are two lookup tables: one describing the default "R to Arrow" type mappings + and the other describing the "Arrow to R" mappings. 
## Motivating example -To illustrate the conversion that needs to take place, consider the differences between the output when obtain we use `dplyr::glimpse()` to inspect the `starwars` data in its original format -- as a data frame in R -- and the output we obtain when we convert it to an Arrow Table first by calling `arrow_table()`: +To illustrate the conversion that needs to take place, consider the differences between the output we obtain when we use `dplyr::glimpse()` + to inspect the `starwars` data in its original format -- as a data frame in R -- and the output we obtain when we convert it to an Arrow + Table first by calling `arrow_table()`: ```{r} library(dplyr, warn.conflicts = FALSE) @@ -22,33 +27,51 @@ glimpse(arrow_table(starwars)) The data represented are essentially the same, but the descriptions of the data types for the columns have changed. For example: -- `name` is labelled `<chr>` (character vector) in the data frame; it is labelled `<string>` (a string type, also referred to as utf8 type) in the Arrow Table +- `name` is labelled `<chr>` (character vector) in the data frame; it is labelled `<string>` (a string type, also referred to as utf8 +type) in the Arrow Table - `height` is labelled `<int>` (integer vector) in the data frame; it is labelled `<int32>` (32 bit signed integer) in the Arrow Table - `mass` is labelled `<dbl>` (numeric vector) in the data frame; it is labelled `<double>` (64 bit floating point number) in the Arrow Table -Some of these differences are purely cosmetic: integers in R are in fact 32 bit signed integers, so the underlying data types in Arrow and R are direct analogs of one another. In other cases the differences are purely about the implementation: Arrow and R have different ways to store a vector of strings, but at a high level of abstraction the R character type and the Arrow string type can be viewed as direct analogs.
In some cases, however, there are no clear analogs: while Arrow has an analog of POSIXct (the timestamp type) it does not have an analog of POSIXlt; conversely, while R can represent 32 bit signed integers, it does not have an equivalent of a 64 bit unsigned integer. +Some of these differences are purely cosmetic: integers in R are in fact 32 bit signed integers, so the underlying data types in Arrow +and R are direct analogs of one another. In other cases the differences are purely about the implementation: Arrow and R have different + ways to store a vector of strings, but at a high level of abstraction the R character type and the Arrow string type can be viewed as + direct analogs. In some cases, however, there are no clear analogs: while Arrow has an analog of POSIXct (the timestamp type) it does + not have an analog of POSIXlt; conversely, while R can represent 32 bit signed integers, it does not have an equivalent of a 64 bit + unsigned integer. -When the arrow package converts between R data and Arrow data, it will first check to see if a Schema has been provided -- see `schema()` for more information -- and if none is available it will attempt to guess the appropriate type by following the default mappings. A complete listing of these mappings is provided at the end of the article, but the most common cases are depicted in the illustration below: +When the arrow package converts between R data and Arrow data, it will first check to see if a Schema has been provided -- see +`schema()` for more information -- and if none is available it will attempt to guess the appropriate type by following the default +mappings. A complete listing of these mappings is provided at the end of the article, but the most common cases are depicted in +the illustration below: ```{r, echo=FALSE, out.width="100%"} knitr::include_graphics("./data_types.png") ``` -In this image, black boxes refer to R data types and light blue boxes refer to Arrow data types. 
Directional arrows specify conversions (e.g., the bidirectional arrow between the logical R type and the boolean Arrow type means that the logical R converts to an Arrow boolean and vice versa). Solid lines indicate that this conversion rule is always the default; dashed lines mean that it only sometimes applies (the rules and special cases are described below). +In this image, black boxes refer to R data types and light blue boxes refer to Arrow data types. Directional arrows specify +conversions (e.g., the bidirectional arrow between the logical R type and the boolean Arrow type means that the logical R +converts to an Arrow boolean and vice versa). Solid lines indicate that this conversion rule is always the default; dashed lines +mean that it only sometimes applies (the rules and special cases are described below). ## Logical/boolean types -Arrow and R both use three-valued logic. In R, logical values can be `TRUE` or `FALSE`, with `NA` used to represent missing data. In Arrow, the corresponding boolean type can take values `true`, `false`, or `null`, as shown below: +Arrow and R both use three-valued logic. In R, logical values can be `TRUE` or `FALSE`, with `NA` used to represent missing data. +In Arrow, the corresponding boolean type can take values `true`, `false`, or `null`, as shown below: ```{r} chunked_array(c(TRUE, FALSE, NA), type = boolean()) # default ``` -It is not strictly necessary to set `type = boolean()` in this example because the default behavior in arrow is to translate R logical vectors to Arrow booleans and vice versa. However, for the sake of clarity we will specify the data types explicitly throughout this article. We will likewise use `chunked_array()` to create Arrow data from R objects and `as.vector()` to create R data from Arrow objects, but similar results are obtained if we use other methods. 
+It is not strictly necessary to set `type = boolean()` in this example because the default behavior in arrow is to translate R + logical vectors to Arrow booleans and vice versa. However, for the sake of clarity we will specify the data types explicitly + throughout this article. We will likewise use `chunked_array()` to create Arrow data from R objects and `as.vector()` to create + R data from Arrow objects, but similar results are obtained if we use other methods. ## Integer types -Base R natively supports only one type of integer, using 32 bits to represent signed numbers between -2147483648 and 2147483647, though R can also support 64 bit integers via the [`bit64`](https://cran.r-project.org/package=bit64) package. Arrow inherits signed and unsigned integer types from C++ in 8 bit, 16 bit, 32 bit, and 64 bit versions: +Base R natively supports only one type of integer, using 32 bits to represent signed numbers between -2147483648 and 2147483647, + though R can also support 64 bit integers via the [`bit64`](https://cran.r-project.org/package=bit64) package. 
Arrow inherits + signed and unsigned integer types from C++ in 8 bit, 16 bit, 32 bit, and 64 bit versions: | Description | Data Type Function | Smallest Value | Largest Value | | --------------- | -----------------: | -------------------: | -------------------: | @@ -78,11 +101,14 @@ When translating from Arrow to R, integer types alway translate to R integers un - If the value of an Arrow uint32 or uint64 falls outside the range allowed for R integers, the result will be a numeric vector in R - If the value of an Arrow int64 variable falls outside the range allowed for R integers, the result will be a `bit64::integer64` vector in R -- If the user sets `options(arrow.int64_downcast = FALSE)`, the Arrow int64 type always yields a `bit64::integer64` vector in R regardless of the value +- If the user sets `options(arrow.int64_downcast = FALSE)`, the Arrow int64 type always yields a `bit64::integer64` vector in R + regardless of the value ## Floating point numeric types -R has one double-precision (64 bit) numeric type, which translates to the Arrow 64 bit floating point type by default. Arrow supports both single-precision (32 bit) and double-precision (64 bit) floating point numbers, specified using the `float32()` and `float64()` data type functions. Both of these are translated to doubles in R. Examples are shown below: +R has one double-precision (64 bit) numeric type, which translates to the Arrow 64 bit floating point type by default. Arrow supports + both single-precision (32 bit) and double-precision (64 bit) floating point numbers, specified using the `float32()` and `float64()` + data type functions. Both of these are translated to doubles in R. 
Examples are shown below: ```{r} chunked_array(c(0.1, 0.2, 0.3), type = float64()) # default @@ -96,14 +122,22 @@ Note that the Arrow specification also permits half-precision (16 bit) floating ## Fixed point decimal types -Arrow also contains `decimal()` data types, in which numeric values are specified in decimal format rather than binary. Decimals in Arrow come in two varieties, a 128 bit version and a 256 bit version, but in most cases users should be able to use the more general `decimal()` data type function rather than the specific `decimal128()` and `decimal256()` functions. +Arrow also contains `decimal()` data types, in which numeric values are specified in decimal format rather than binary. +Decimals in Arrow come in four varieties -- 32 bit, 64 bit, 128 bit, and 256 bit versions -- but in most cases users should be able +to use the more general `decimal()` data type function rather than the specific `decimal32()`, `decimal64()`, `decimal128()`, + and `decimal256()` functions. -The decimal types in Arrow are fixed-precision numbers (rather than floating-point), which means it is necessary to explicitly specify the `precision` and `scale` arguments: +The decimal types in Arrow are fixed-precision numbers (rather than floating-point), which means it is necessary to explicitly +specify the `precision` and `scale` arguments: - `precision` specifies the number of significant digits to store. -- `scale` specifies the number of digits that should be stored after the decimal point. If you set `scale = 2`, exactly two digits will be stored after the decimal point. If you set `scale = 0`, values will be rounded to the nearest whole number. Negative scales are also permitted (handy when dealing with extremely large numbers), so `scale = -2` stores the value to the nearest 100. +- `scale` specifies the number of digits that should be stored after the decimal point. If you set `scale = 2`, exactly two +digits will be stored after the decimal point.
If you set `scale = 0`, values will be rounded to the nearest whole number. +Negative scales are also permitted (handy when dealing with extremely large numbers), so `scale = -2` stores the value to the nearest 100. -Because R does not have any way to create decimal types natively, the example below is a little circuitous. First we create some floating point numbers as Chunked Arrays, and then explicitly cast these to decimal types within Arrow. This is possible because Chunked Array objects possess a `cast()` method: +Because R does not have any way to create decimal types natively, the example below is a little circuitous. First we create +some floating point numbers as Chunked Arrays, and then explicitly cast these to decimal types within Arrow. +This is possible because Chunked Array objects possess a `cast()` method: ```{r} arrow_floating <- chunked_array(c(.01, .1, 1, 10, 100)) @@ -111,11 +145,15 @@ arrow_decimals <- arrow_floating$cast(decimal(precision = 5, scale = 2)) arrow_decimals ``` -Though not natively used in R, decimal types can be useful in situations where it is especially important to avoid problems that arise in floating point arithmetic. +Though not natively used in R, decimal types can be useful in situations where it is especially important to avoid problems that arise + in floating point arithmetic. ## String/character types -R uses a single character type to represent strings whereas Arrow has two types. In the Arrow C++ library these types are referred to as strings and large_strings, but to avoid ambiguity in the arrow R package they are defined using the `utf8()` and `large_utf8()` data type functions. The distinction between these two Arrow types is unlikely to be important for R users, though the difference is discussed in the article on [data object layout](./developers/data_object_layout.html). +R uses a single character type to represent strings whereas Arrow has two types. 
In the Arrow C++ library these types are referred to +as strings and large_strings, but to avoid ambiguity in the arrow R package they are defined using the `utf8()` and `large_utf8()` data + type functions. The distinction between these two Arrow types is unlikely to be important for R users, though the difference is discussed + in the article on [data object layout](./developers/data_object_layout.html). The default behavior is to translate R character vectors to the utf8/string type, and to translate both Arrow types to R character vectors: @@ -127,7 +165,8 @@ as.vector(strings) ## Factor/dictionary types -The analog of R factors in Arrow is the dictionary type. Factors translate to dictionaries and vice versa. To illustrate this, let's create a small factor object in R: +The analog of R factors in Arrow is the dictionary type. Factors translate to dictionaries and vice versa. To illustrate this, let's +create a small factor object in R: ```{r} fct <- factor(c("cat", "dog", "pig", "dog")) @@ -147,11 +186,15 @@ When translated back to R, we recover the original factor: as.vector(dict) ``` -Arrow dictionaries are slightly more flexible than R factors: values in a dictionary do not necessarily have to be strings, but labels in a factor do. As a consequence, non-string values in an Arrow dictionary are coerced to strings when translated to R. +Arrow dictionaries are slightly more flexible than R factors: values in a dictionary do not necessarily have to be strings, but labels +in a factor do. As a consequence, non-string values in an Arrow dictionary are coerced to strings when translated to R. ## Date types -In R, dates are typically represented using the Date class. Internally a Date object is a numeric type whose value counts the number of days since the beginning of the Unix epoch (1 January 1970). Arrow supplies two data types that can be used to represent dates: the date32 type and the date64 type. 
The date32 type is similar to the Date class in R: internally it stores a 32 bit integer that counts the number of days since 1 January 1970. The default in arrow is to translate R Date objects to Arrow date32 types: +In R, dates are typically represented using the Date class. Internally a Date object is a numeric type whose value counts the number +of days since the beginning of the Unix epoch (1 January 1970). Arrow supplies two data types that can be used to represent dates: + the date32 type and the date64 type. The date32 type is similar to the Date class in R: internally it stores a 32 bit integer that + counts the number of days since 1 January 1970. The default in arrow is to translate R Date objects to Arrow date32 types: ```{r} nirvana_album_dates <- as.Date(c("1989-06-15", "1991-09-24", "1993-09-13")) @@ -160,7 +203,8 @@ nirvana_32 <- chunked_array(nirvana_album_dates, type = date32()) # default nirvana_32 ``` -Arrow also supplies a higher-precision date64 type, in which the date is represented as a 64 bit integer that encodes the number of *milliseconds* since 1970-01-01 00:00 UTC: +Arrow also supplies a higher-precision date64 type, in which the date is represented as a 64 bit integer that encodes the number of + *milliseconds* since 1970-01-01 00:00 UTC: ```{r} nirvana_64 <- chunked_array(nirvana_album_dates, type = date64()) @@ -173,7 +217,8 @@ The translation from Arrow to R differs. 
Internally the date32 type is very simi class(as.vector(nirvana_32)) ``` -However, because date64 types are specified to millisecond-level precision, they are translated to R as POSIXct times to avoid the possibility of losing relevant information: +However, because date64 types are specified to millisecond-level precision, they are translated to R as POSIXct times to avoid the +possibility of losing relevant information: ```{r} class(as.vector(nirvana_64)) @@ -181,35 +226,45 @@ class(as.vector(nirvana_64)) ## Temporal/timestamp types -In R there are two classes used to represent date and time information, POSIXct and POSIXlt. Arrow only has one: the timestamp type. Arrow timestamps are loosely analogous to the POSIXct class. Internally, a POSIXct object represents the date with as a numeric variable that stores the number of seconds since 1970-01-01 00:00 UTC. Internally, an Arrow timestamp is a 64 bit integer counting the number of milliseconds since 1970-01-01 00:00 UTC. +In R there are two classes used to represent date and time information, POSIXct and POSIXlt. Arrow only has one: the timestamp type. +Arrow timestamps are loosely analogous to the POSIXct class. Internally, a POSIXct object represents the date as a numeric variable +that stores the number of seconds since 1970-01-01 00:00 UTC. Internally, an Arrow timestamp is a 64 bit integer counting the number of + milliseconds since 1970-01-01 00:00 UTC. -Arrow and R both support timezone information, but display it differently in the printed object. In R, local time is printed with the timezone name adjacent to it: +Arrow and R both support timezone information, but display it differently in the printed object. In R, local time is printed with the +timezone name adjacent to it: ```{r} sydney_newyear <- as.POSIXct("2000-01-01 00:01", tz = "Australia/Sydney") sydney_newyear ``` -When translated to Arrow, this POSIXct object becomes an Arrow timestamp object.
When printed, however, the temporal instant is always displayed in UTC rather than local time: +When translated to Arrow, this POSIXct object becomes an Arrow timestamp object. When printed, however, the temporal instant is always +displayed in UTC rather than local time: ```{r} sydney_newyear_arrow <- chunked_array(sydney_newyear, type = timestamp()) sydney_newyear_arrow ``` -The timezone information is not lost, however, which we can easily see by translating the `sydney_newyear_arrow` object back to an R POSIXct object: +The timezone information is not lost, however, which we can easily see by translating the `sydney_newyear_arrow` object back to an + R POSIXct object: ```{r} as.vector(sydney_newyear_arrow) ``` -For POSIXlt objects the behaviour is different. Internally a POSIXlt object is a list specifying the "local time" in terms of a variety of human-relevant fields. There is no analogous class to this in Arrow, so the default behaviour is to translate it to an Arrow list. +For POSIXlt objects the behaviour is different. Internally a POSIXlt object is a list specifying the "local time" in terms of a +variety of human-relevant fields. There is no analogous class to this in Arrow, so the default behaviour is to translate it to an Arrow list. ## Time of day types -Base R does not have a class to represent the time of day independent of the date (i.e., it is not possible to specify "3pm" without referring to a specific day), but it can be done with the help of the [`hms`](https://hms.tidyverse.org/) package. Internally, hms objects are always stored as the number of seconds since 00:00:00. +Base R does not have a class to represent the time of day independent of the date (i.e., it is not possible to specify "3pm" without + referring to a specific day), but it can be done with the help of the [`hms`](https://hms.tidyverse.org/) package. Internally, + hms objects are always stored as the number of seconds since 00:00:00. 
-Arrow has two data types for this purposes. For time32 types, data are stored as a 32 bit integer that is interpreted either as the number of seconds or the number of milliseconds since 00:00:00. Note the difference between the following: +Arrow has two data types for this purpose. For time32 types, data are stored as a 32 bit integer that is interpreted either as the + number of seconds or the number of milliseconds since 00:00:00. Note the difference between the following: ```{r} time_of_day <- hms::hms(56, 34, 12) @@ -217,7 +272,8 @@ chunked_array(time_of_day, type = time32(unit = "s")) chunked_array(time_of_day, type = time32(unit = "ms")) ``` -A time64 object is similar, but stores the time of day using a 64 bit integer and can represent the time at higher precision. It is possible to choose microseconds (`unit = "us"`) or nanoseconds (`unit = "ns"`), as shown below: +A time64 object is similar, but stores the time of day using a 64 bit integer and can represent the time at higher precision. It is + possible to choose microseconds (`unit = "us"`) or nanoseconds (`unit = "ns"`), as shown below: ```{r} chunked_array(time_of_day, type = time64(unit = "us")) @@ -228,7 +284,9 @@ All versions of time32 and time64 objects in Arrow translate to hms times in R. ## Duration types -Lengths of time are represented as difftime objects in R. The analogous data type in Arrow is the duration type. A duration type is stored as a 64 bit integer, which can represent the number of seconds (the default, `unit = "s"`), milliseconds (`unit = "ms"`), microseconds (`unit = "us"`), or nanoseconds (`unit = "ns"`). To illustrate this we'll create a difftime in R corresponding to 278 seconds: +Lengths of time are represented as difftime objects in R. The analogous data type in Arrow is the duration type.
A duration type +is stored as a 64 bit integer, which can represent the number of seconds (the default, `unit = "s"`), milliseconds (`unit = "ms"`), + microseconds (`unit = "us"`), or nanoseconds (`unit = "ns"`). To illustrate this we'll create a difftime in R corresponding to 278 seconds: ```{r} len <- as.difftime(278, unit = "secs") @@ -247,7 +305,8 @@ Regardless of the underlying unit, duration objects in Arrow translate to diffti ## List of default translations -The discussion above covers the most common cases. The two tables in this section provide a more complete list of how arrow translates between R data types and Arrow data types. In these table, entries with a `-` are not currently implemented. +The discussion above covers the most common cases. The two tables in this section provide a more complete list of how arrow + translates between R data types and Arrow data types. In these tables, entries with a `-` are not currently implemented. ### Translations from R to Arrow