From 2f835b39c81e12c0dd97efce0c562ddd27e6e33f Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Mon, 22 Jun 2020 14:24:59 +0200 Subject: [PATCH 01/22] R support for binary arrays --- r/R/type.R | 2 +- r/src/array_from_vector.cpp | 64 +++++++++++++++++++++++++++++++---- r/src/array_to_vector.cpp | 51 ++++++++++++++++++++++++++++ r/src/arrow_rcpp.h | 3 ++ r/src/symbols.cpp | 18 ++++++++++ r/tests/testthat/test-Array.R | 6 ++++ 6 files changed, 137 insertions(+), 7 deletions(-) diff --git a/r/R/type.R b/r/R/type.R index ab30524ed3f..6ceabf5270e 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -60,7 +60,7 @@ DataType <- R6Class("DataType", FLOAT = float32(), DOUBLE = float64(), STRING = utf8(), - BINARY = stop("Type BINARY not implemented yet"), + BINARY = binary(), DATE32 = date32(), DATE64 = date64(), TIMESTAMP = shared_ptr(Timestamp, self$pointer()), diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 7035dd3e170..21e0c7a9dbb 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -142,6 +142,15 @@ struct VectorToArrayConverter { return Status::OK(); } + Status Visit(const arrow::BinaryType& type) { + if (!(Rf_inherits(x, "vctrs_list_of") && + TYPEOF(Rf_getAttrib(x, symbols::ptype)) == RAWSXP)) { + return Status::RError("Expecting a list of raw vectors"); + } + + return Status::OK(); + } + template arrow::enable_if_base_binary Visit(const T& type) { using BuilderType = typename TypeTraits::BuilderType; @@ -916,6 +925,39 @@ class Time64Converter : public TimeConverter { } }; +class BinaryVectorConverter : public VectorConverter { + public: + ~BinaryVectorConverter() {} + + Status Init(ArrayBuilder* builder) { + typed_builder_ = checked_cast(builder); + return Status::OK(); + } + + Status Ingest(SEXP obj) { + ARROW_RETURN_IF(TYPEOF(obj) != VECSXP, Status::RError("Expecting a list")); + R_xlen_t n = XLENGTH(obj); + for (R_xlen_t i = 0; i < n; i++) { + SEXP obj_i = VECTOR_ELT(obj, i); + if (obj_i == R_NilValue) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + } else { + ARROW_RETURN_IF(TYPEOF(obj_i) != RAWSXP, + Status::RError("Expecting a raw vector")); + RETURN_NOT_OK(typed_builder_->Append(RAW(obj_i), XLENGTH(obj_i))); + } + } + return Status::OK(); + } + + Status GetResult(std::shared_ptr* result) { + return typed_builder_->Finish(result); + } + + private: + BinaryBuilder* typed_builder_; +}; + #define NUMERIC_CONVERTER(TYPE_ENUM, TYPE) \ case Type::TYPE_ENUM: \ *out = \ @@ -936,6 +978,7 @@ class Time64Converter : public TimeConverter { Status GetConverter(const std::shared_ptr& type, std::unique_ptr* out) { switch (type->id()) { + SIMPLE_CONVERTER_CASE(BINARY, BinaryVectorConverter); SIMPLE_CONVERTER_CASE(BOOL, BooleanVectorConverter); NUMERIC_CONVERTER(INT8, Int8Type); NUMERIC_CONVERTER(INT16, Int16Type); @@ -1066,12 +1109,22 @@ std::shared_ptr InferArrowTypeFromVector(SEXP x) { if (Rf_inherits(x, "data.frame") || Rf_inherits(x, "POSIXlt")) { return InferArrowTypeFromDataFrame(x); } else { - if (XLENGTH(x) == 0) { - Rcpp::stop( - "Requires at least one element to infer the values' type of a list vector"); - } + SEXP ptype = Rf_getAttrib(x, symbols::ptype); + if (ptype == R_NilValue) { + if (XLENGTH(x) == 0) { + Rcpp::stop( + "Requires at least one element to infer the values' type of a list vector"); + } - return arrow::list(InferArrowType(VECTOR_ELT(x, 0))); + return arrow::list(InferArrowType(VECTOR_ELT(x, 0))); + } else { + // special case list(raw()) -> BinaryArray + if (TYPEOF(ptype) == RAWSXP) { + return arrow::binary(); + } + + return arrow::list(InferArrowType(ptype)); + } } } @@ -1285,7 +1338,6 @@ std::shared_ptr Array__from_vector( StopIfNotOk(converter->Ingest(x)); std::shared_ptr result; StopIfNotOk(converter->GetResult(&result)); - return result; } diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index ebcdb184821..2d2dff0123c 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -285,6 +285,54 @@ class Converter_Boolean : public Converter { } }; +class Converter_Binary : public Converter { + public: + explicit Converter_Binary(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { + SEXP res = PROTECT(Rf_allocVector(VECSXP, n)); + Rf_setAttrib(res, R_ClassSymbol, data::classes_vctrs_list_of); + Rf_setAttrib(res, symbols::ptype, data::empty_raw); + UNPROTECT(1); + return res; + } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return Status::OK(); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + const BinaryArray* binary_array = checked_cast(array.get()); + + auto ingest_one = [&](R_xlen_t i) { + int ni; + auto value = binary_array->GetValue(i, &ni); + SEXP raw = PROTECT(Rf_allocVector(RAWSXP, ni)); + std::copy(value, value + ni, RAW(raw)); + + SET_VECTOR_ELT(data, i, raw); + UNPROTECT(1); + }; + + if (array->null_count()) { + internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), + n); + + for (R_xlen_t i = 0; i < n; i++, bitmap_reader.Next()) { + if (bitmap_reader.IsSet()) ingest_one(i); + } + + } else { + for (R_xlen_t i = 0; i < n; i++) { + ingest_one(i); + } + } + + return Status::OK(); + } +}; + class Converter_Dictionary : public Converter { public: explicit Converter_Dictionary(const ArrayVector& arrays) : Converter(arrays) {} @@ -711,6 +759,9 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type case Type::BOOL: return std::make_shared(std::move(arrays)); + case Type::BINARY: + return std::make_shared(std::move(arrays)); + // handle memory dense strings case Type::STRING: return std::make_shared(std::move(arrays)); diff --git a/r/src/arrow_rcpp.h b/r/src/arrow_rcpp.h index e66f14e70a8..9a038bc1096 100644 --- a/r/src/arrow_rcpp.h +++ b/r/src/arrow_rcpp.h @@ -34,6 +34,7 @@ struct symbols { static SEXP row_names; static SEXP serialize_arrow_r_metadata; static SEXP as_list; + static SEXP ptype; }; struct data { @@ -41,6 +42,8 @@ struct data { static SEXP classes_metadata_r; static SEXP names_metadata; + static SEXP classes_vctrs_list_of; + static SEXP empty_raw; }; struct ns { diff --git a/r/src/symbols.cpp b/r/src/symbols.cpp index be06fdfb533..001a2dd941c 100644 --- a/r/src/symbols.cpp +++ b/r/src/symbols.cpp @@ -27,6 +27,7 @@ SEXP symbols::inspect = Rf_install("inspect"); SEXP symbols::row_names = Rf_install("row.names"); SEXP symbols::serialize_arrow_r_metadata = Rf_install(".serialize_arrow_r_metadata"); SEXP symbols::as_list = Rf_install("as.list"); +SEXP symbols::ptype = Rf_install("ptype"); SEXP get_classes_POSIXct() { SEXP classes = Rf_allocVector(STRSXP, 2); @@ -50,9 +51,26 @@ SEXP get_names_metadata() { return names; } +SEXP get_classes_vctrs_list_of() { + SEXP classes = Rf_allocVector(STRSXP, 3); + R_PreserveObject(classes); + SET_STRING_ELT(classes, 0, Rf_mkChar("vctrs_list_of")); + SET_STRING_ELT(classes, 1, Rf_mkChar("vctrs_vctr")); + SET_STRING_ELT(classes, 2, Rf_mkChar("list")); + return classes; +} + +SEXP get_empty_raw() { + SEXP res = Rf_allocVector(RAWSXP, 0); + R_PreserveObject(res); + return res; +} + SEXP data::classes_POSIXct = get_classes_POSIXct(); SEXP data::classes_metadata_r = get_classes_metadata_r(); SEXP data::names_metadata = get_names_metadata(); +SEXP data::classes_vctrs_list_of = get_classes_vctrs_list_of(); +SEXP data::empty_raw = get_empty_raw(); void inspect(SEXP obj) { Rcpp::Shield call_inspect(Rf_lang2(symbols::inspect, obj)); diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 2ae6505cc2d..694943c4152 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -49,6 +49,12 @@ test_that("Integer Array", { x <- expect_array_roundtrip(ints, int32()) }) +test_that("binary Array", { + bin <- vctrs::list_of(as.raw(1:10), as.raw(0:255), .ptype = raw()) + expect_array_roundtrip(bin, binary()) +}) + + test_that("Slice() and RangeEquals()", { ints <- c(1:10, 101:110, 201:205) x <- Array$create(ints) From ba32ce31bc00dd2bcb0bb430651371622b7663fe Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Mon, 22 Jun 2020 14:33:52 +0200 Subject: [PATCH 02/22] added degenerate test case --- r/tests/testthat/test-Array.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 694943c4152..d95b9e5dfe3 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -52,8 +52,11 @@ test_that("Integer Array", { test_that("binary Array", { bin <- vctrs::list_of(as.raw(1:10), as.raw(0:255), .ptype = raw()) expect_array_roundtrip(bin, binary()) -}) + # degenerate + bin <- structure(list(1L), ptype = raw()) + expect_error(Array$create(bin)) +}) test_that("Slice() and RangeEquals()", { ints <- c(1:10, 101:110, 201:205) From 34ac48444fe2a1a92e36e76f9c584ed9d390d7ed Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 23 Jun 2020 17:29:33 +0200 Subject: [PATCH 03/22] using Rf_isNull() --- r/src/array_from_vector.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 21e0c7a9dbb..b20e9369e9c 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -191,7 +191,7 @@ struct VectorToArrayConverter { RETURN_NOT_OK(builder->Reserve(n)); for (R_xlen_t i = 0; i < n; i++) { SEXP vector = VECTOR_ELT(x, i); - if (vector == R_NilValue) { + if (Rf_isNull(vector)) { RETURN_NOT_OK(list_builder->AppendNull()); continue; } @@ -939,7 +939,7 @@ class BinaryVectorConverter : public VectorConverter { R_xlen_t n = XLENGTH(obj); for (R_xlen_t i = 0; i < n; i++) { SEXP obj_i = VECTOR_ELT(obj, i); - if (obj_i == R_NilValue) { + if (Rf_isNull(obj_i)) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { ARROW_RETURN_IF(TYPEOF(obj_i) != RAWSXP, @@ -1110,7 +1110,7 @@ std::shared_ptr InferArrowTypeFromVector(SEXP x) { return InferArrowTypeFromDataFrame(x); } else { SEXP ptype = Rf_getAttrib(x, symbols::ptype); - if (ptype == R_NilValue) { + if (Rf_isNull(ptype)) { if (XLENGTH(x) == 0) { Rcpp::stop( "Requires at least one element to infer the values' type of a list vector"); From 2bfa333260f2e7d87793439aaa49b8521916cc11 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 08:34:54 +0200 Subject: [PATCH 04/22] reserve before append --- r/src/array_from_vector.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index b20e9369e9c..79cbf46fb70 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -937,13 +937,24 @@ class BinaryVectorConverter : public VectorConverter { Status Ingest(SEXP obj) { ARROW_RETURN_IF(TYPEOF(obj) != VECSXP, Status::RError("Expecting a list")); R_xlen_t n = XLENGTH(obj); + + // Reserve enough space before appending + int64_t size = 0; + for (R_xlen_t i = 0; i < n; i++) { + SEXP obj_i = VECTOR_ELT(obj, i); + if (!Rf_isNull(obj_i)) { + ARROW_RETURN_IF(TYPEOF(obj_i) != RAWSXP, + Status::RError("Expecting a raw vector")); + size += XLENGTH(obj_i); + } + } + + // append for (R_xlen_t i = 0; i < n; i++) { SEXP obj_i = VECTOR_ELT(obj, i); if (Rf_isNull(obj_i)) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { - ARROW_RETURN_IF(TYPEOF(obj_i) != RAWSXP, - Status::RError("Expecting a raw vector")); RETURN_NOT_OK(typed_builder_->Append(RAW(obj_i), XLENGTH(obj_i))); } } From ad8e72e13e20067e3fc75b435062d92a192d43c9 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 08:36:07 +0200 Subject: [PATCH 05/22] ... actually Reserve() --- r/src/array_from_vector.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 79cbf46fb70..3d74659afbe 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -948,6 +948,7 @@ class BinaryVectorConverter : public VectorConverter { size += XLENGTH(obj_i); } } + RETURN_NOT_OK(typed_builder_->Reserve(size)); // append for (R_xlen_t i = 0; i < n; i++) { From da6fb1af7b0f4caabd68ba42a664fef35c9d4198 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 08:47:53 +0200 Subject: [PATCH 06/22] Less return() --- r/src/array_from_vector.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 3d74659afbe..08b3fe82d42 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -1128,15 +1128,15 @@ std::shared_ptr InferArrowTypeFromVector(SEXP x) { "Requires at least one element to infer the values' type of a list vector"); } - return arrow::list(InferArrowType(VECTOR_ELT(x, 0))); - } else { - // special case list(raw()) -> BinaryArray - if (TYPEOF(ptype) == RAWSXP) { - return arrow::binary(); - } + ptype = VECTOR_ELT(x, 0); + } - return arrow::list(InferArrowType(ptype)); + // special case list(raw()) -> BinaryArray + if (TYPEOF(ptype) == RAWSXP) { + return arrow::binary(); } + + return arrow::list(InferArrowType(ptype)); } } From cd5e2c9827b3570c636f390b8780f7ba79a9459c Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 10:02:23 +0200 Subject: [PATCH 07/22] make vctrs::list_of() from List arrays to keep a ptype that matches the value_type() of the list. --- r/src/array_to_vector.cpp | 30 +++++++++++++++++++++++++++--- r/tests/testthat/test-Array.R | 4 ++-- r/tests/testthat/test-json.R | 2 +- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 2d2dff0123c..48104f5fa39 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -19,6 +19,7 @@ #if defined(ARROW_R_WITH_ARROW) #include +#include #include #include #include @@ -628,10 +629,31 @@ class Converter_Decimal : public Converter { }; class Converter_List : public Converter { + private: + std::shared_ptr value_type_; + public: - explicit Converter_List(const ArrayVector& arrays) : Converter(arrays) {} + explicit Converter_List(const ArrayVector& arrays, + const std::shared_ptr& value_type) + : Converter(arrays), value_type_(value_type) {} + + SEXP Allocate(R_xlen_t n) const { + Rcpp::List res(no_init(n)); + Rf_setAttrib(res, R_ClassSymbol, arrow::r::data::classes_vctrs_list_of); + + // Build an empty array to match value_type + std::unique_ptr builder; + StopIfNotOk(arrow::MakeBuilder(arrow::default_memory_pool(), value_type_, &builder)); - SEXP Allocate(R_xlen_t n) const { return Rcpp::List(no_init(n)); } + std::shared_ptr array; + StopIfNotOk(builder->Finish(&array)); + + // convert to an R object to store as the list' ptype + SEXP ptype = Array__as_vector(array); + Rf_setAttrib(res, arrow::r::symbols::ptype, ptype); + + return res; + } Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { // nothing to do, list contain NULL by default @@ -847,7 +869,9 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type return std::make_shared(std::move(arrays)); case Type::LIST: - return std::make_shared(std::move(arrays)); + return std::make_shared( + std::move(arrays), + checked_cast(type.get())->value_type()); case Type::NA: return std::make_shared(std::move(arrays)); diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index d95b9e5dfe3..32caab96124 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -27,7 +27,7 @@ expect_array_roundtrip <- function(x, type) { # Is there some vctrs thing we should do on the roundtrip back to R? expect_identical(is.na(a), is.na(x)) } - expect_equal(as.vector(a), x) + expect_equivalent(as.vector(a), x) # Make sure the storage mode is the same on roundtrip (esp. integer vs. numeric) expect_identical(typeof(as.vector(a)), typeof(x)) @@ -39,7 +39,7 @@ expect_array_roundtrip <- function(x, type) { if (!inherits(type, "ListType")) { expect_identical(is.na(a_sliced), is.na(x_sliced)) } - expect_equal(as.vector(a_sliced), x_sliced) + expect_equivalent(as.vector(a_sliced), x_sliced) } invisible(a) } diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R index d4a783089bc..a35a465bf0b 100644 --- a/r/tests/testthat/test-json.R +++ b/r/tests/testthat/test-json.R @@ -133,7 +133,7 @@ test_that("Can read json file with nested columns (ARROW-5503)", { c(5, 6) ) list_array <- tab1$column(0) - expect_identical( + expect_equivalent( list_array$as_vector(), list_array_r ) From 0a368aa65ba1ddb84621b3a617cffc3afad7f650 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 11:13:52 +0200 Subject: [PATCH 08/22] additional DataType types, currently not handled --- r/R/enums.R | 8 +++++++- r/R/type.R | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/r/R/enums.R b/r/R/enums.R index 223d5a799a4..e9420ce4562 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -71,7 +71,13 @@ Type <- enum("Type::type", SPARSE_UNION = 26L, DENSE_UNION = 27L, DICTIONARY = 28L, - MAP = 29L + MAP = 29L, + EXTENSION = 30L, + FIXED_SIZE_LIST = 31L, + DURATION = 32L, + LARGE_STRING = 33L, + LARGE_BINARY = 34L, + LARGE_LIST = 35L ) #' @rdname enums diff --git a/r/R/type.R b/r/R/type.R index 6ceabf5270e..74feae9591b 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -73,7 +73,13 @@ DataType <- R6Class("DataType", SPARSE_UNION = stop("Type SPARSE_UNION not implemented yet"), DENSE_UNION = stop("Type DENSE_UNION not implemented yet"), DICTIONARY = shared_ptr(DictionaryType, self$pointer()), - MAP = stop("Type MAP not implemented yet") + MAP = stop("Type MAP not implemented yet"), + EXTENSION = stop("Type EXTENSION not implemented yet"), + FIXED_SIZE_LIST = stop("Type FIXED_SIZE_LIST not implemented yet"), + DURATION = stop("Type DURATION not implemented yet"), + LARGE_STRING = stop("Type LARGE_STRING not implemented yet"), + LARGE_BINARY = stop("Type LARGE_BINARY not implemented yet"), + LARGE_LIST = stop("Type LARGE_LIST not implemented yet") ) } ), From 77e9e46bebc5499382458871ba0be51a79a12b58 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 11:19:26 +0200 Subject: [PATCH 09/22] + large_binary() --- r/NAMESPACE | 1 + r/R/arrowExports.R | 4 ++++ r/R/type.R | 9 ++++++++- r/src/arrowExports.cpp | 15 +++++++++++++++ r/src/datatype.cpp | 3 +++ 5 files changed, 31 insertions(+), 1 deletion(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index d0ab7c6fdf4..e8da0c02beb 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -186,6 +186,7 @@ export(int16) export(int32) export(int64) export(int8) +export(large_binary) export(last_col) export(list_of) export(map_batches) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 54fadcb8055..2c2709e10ba 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -488,6 +488,10 @@ Binary__initialize <- function(){ .Call(`_arrow_Binary__initialize` ) } +LargeBinary__initialize <- function(){ + .Call(`_arrow_LargeBinary__initialize` ) +} + Date32__initialize <- function(){ .Call(`_arrow_Date32__initialize` ) } diff --git a/r/R/type.R b/r/R/type.R index 74feae9591b..4007adec217 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -78,7 +78,7 @@ DataType <- R6Class("DataType", FIXED_SIZE_LIST = stop("Type FIXED_SIZE_LIST not implemented yet"), DURATION = stop("Type DURATION not implemented yet"), LARGE_STRING = stop("Type LARGE_STRING not implemented yet"), - LARGE_BINARY = stop("Type LARGE_BINARY not implemented yet"), + LARGE_BINARY = large_binary(), LARGE_LIST = stop("Type LARGE_LIST not implemented yet") ) } @@ -149,6 +149,7 @@ Boolean <- R6Class("Boolean", inherit = FixedWidthType) Utf8 <- R6Class("Utf8", inherit = DataType) Binary <- R6Class("Binary", inherit = DataType) FixedSizeBinary <- R6Class("FixedSizeBinary", inherit = FixedWidthType) +LargeBinary <- R6Class("LargeBinary", inherit = DataType) DateType <- R6Class("DateType", inherit = FixedWidthType, @@ -302,6 +303,12 @@ binary <- function(byte_width = NULL) { } } +#' @rdname data-type +#' @export +large_binary <- function() { + shared_ptr(LargeBinary, LargeBinary__initialize()) +} + #' @rdname data-type #' @export string <- utf8 diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 3fa77c24adc..120ef9399c8 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1891,6 +1891,20 @@ RcppExport SEXP _arrow_Binary__initialize(){ } #endif +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr LargeBinary__initialize(); +RcppExport SEXP _arrow_LargeBinary__initialize(){ +BEGIN_RCPP + return Rcpp::wrap(LargeBinary__initialize()); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeBinary__initialize(){ + Rf_error("Cannot call LargeBinary__initialize(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // datatype.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr Date32__initialize(); @@ -5787,6 +5801,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, + { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0}, { "_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0}, diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index 9f505986d84..f74d03414b4 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -83,6 +83,9 @@ std::shared_ptr Utf8__initialize() { return arrow::utf8(); } // [[arrow::export]] std::shared_ptr Binary__initialize() { return arrow::binary(); } +// [[arrow::export]] +std::shared_ptr LargeBinary__initialize() { return arrow::large_binary(); } + // [[arrow::export]] std::shared_ptr Date32__initialize() { return arrow::date32(); } From 140dc24da45f21a7144f3d8dfd85e51a307d9531 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 11:38:17 +0200 Subject: [PATCH 10/22] Converter_Binary becomes a template --- r/src/array_to_vector.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 48104f5fa39..6fe93299dda 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -286,8 +286,10 @@ class Converter_Boolean : public Converter { } }; +template class Converter_Binary : public Converter { public: + using offset_type = typename ArrayType::offset_type; explicit Converter_Binary(const ArrayVector& arrays) : Converter(arrays) {} SEXP Allocate(R_xlen_t n) const { @@ -304,16 +306,21 @@ class Converter_Binary : public Converter { Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) const { - const BinaryArray* binary_array = checked_cast(array.get()); + const ArrayType* binary_array = checked_cast(array.get()); auto ingest_one = [&](R_xlen_t i) { - int ni; + offset_type ni; auto value = binary_array->GetValue(i, &ni); + if (ni > R_XLEN_T_MAX) { + return Status::RError("Array too big to be represented as a raw vector"); + } SEXP raw = PROTECT(Rf_allocVector(RAWSXP, ni)); std::copy(value, value + ni, RAW(raw)); SET_VECTOR_ELT(data, i, raw); UNPROTECT(1); + + return Status::OK(); }; if (array->null_count()) { @@ -321,12 +328,12 @@ class Converter_Binary : public Converter { n); for (R_xlen_t i = 0; i < n; i++, bitmap_reader.Next()) { - if (bitmap_reader.IsSet()) ingest_one(i); + if (bitmap_reader.IsSet()) RETURN_NOT_OK(ingest_one(i)); } } else { for (R_xlen_t i = 0; i < n; i++) { - ingest_one(i); + RETURN_NOT_OK(ingest_one(i)); } } @@ -782,7 +789,10 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type return std::make_shared(std::move(arrays)); case Type::BINARY: - return std::make_shared(std::move(arrays)); + return std::make_shared>(std::move(arrays)); + + case Type::LARGE_BINARY: + return std::make_shared>(std::move(arrays)); // handle memory dense strings case Type::STRING: From 0b3051fcb155ff2eb46380d27f311f1154415210 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 13:03:49 +0200 Subject: [PATCH 11/22] BinaryVectorConverter becomes a template to handle both Binary and LargeBinary arrays --- r/src/array_from_vector.cpp | 8 +++++--- r/src/array_to_vector.cpp | 6 ++++-- r/src/datatype.cpp | 4 +++- r/tests/testthat/test-Array.R | 3 ++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 08b3fe82d42..3706d9e3f41 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -925,12 +925,13 @@ class Time64Converter : public TimeConverter { } }; +template class BinaryVectorConverter : public VectorConverter { public: ~BinaryVectorConverter() {} Status Init(ArrayBuilder* builder) { - typed_builder_ = checked_cast(builder); + typed_builder_ = checked_cast(builder); return Status::OK(); } @@ -967,7 +968,7 @@ class BinaryVectorConverter : public VectorConverter { } private: - BinaryBuilder* typed_builder_; + Builder* typed_builder_; }; #define NUMERIC_CONVERTER(TYPE_ENUM, TYPE) \ @@ -990,7 +991,8 @@ class BinaryVectorConverter : public VectorConverter { Status GetConverter(const std::shared_ptr& type, std::unique_ptr* out) { switch (type->id()) { - SIMPLE_CONVERTER_CASE(BINARY, BinaryVectorConverter); + SIMPLE_CONVERTER_CASE(BINARY, BinaryVectorConverter); + SIMPLE_CONVERTER_CASE(LARGE_BINARY, BinaryVectorConverter); SIMPLE_CONVERTER_CASE(BOOL, BooleanVectorConverter); NUMERIC_CONVERTER(INT8, Int8Type); NUMERIC_CONVERTER(INT16, Int16Type); diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 6fe93299dda..1ba8bd44996 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -789,10 +789,12 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type return std::make_shared(std::move(arrays)); case Type::BINARY: - return std::make_shared>(std::move(arrays)); + return std::make_shared>( + std::move(arrays)); case Type::LARGE_BINARY: - return std::make_shared>(std::move(arrays)); + return std::make_shared>( + std::move(arrays)); // handle memory dense strings case Type::STRING: diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index f74d03414b4..a63188fa0d4 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -84,7 +84,9 @@ std::shared_ptr Utf8__initialize() { return arrow::utf8(); } std::shared_ptr Binary__initialize() { return arrow::binary(); } // [[arrow::export]] -std::shared_ptr LargeBinary__initialize() { return arrow::large_binary(); } +std::shared_ptr LargeBinary__initialize() { + return arrow::large_binary(); +} // [[arrow::export]] std::shared_ptr Date32__initialize() { return arrow::date32(); } diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 32caab96124..6d7d9ffbfcb 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -18,7 +18,7 @@ context("Array") expect_array_roundtrip <- function(x, type) { - a <- Array$create(x) + a <- Array$create(x, type = type) expect_type_equal(a$type, type) expect_identical(length(a), length(x)) if (!inherits(type, "ListType")) { @@ -52,6 +52,7 @@ test_that("Integer Array", { test_that("binary Array", { bin <- vctrs::list_of(as.raw(1:10), as.raw(0:255), .ptype = raw()) expect_array_roundtrip(bin, binary()) + expect_array_roundtrip(bin, large_binary()) # degenerate bin <- structure(list(1L), ptype = raw()) From a24b8407e325cba334655039cc918d2f85f52cfc Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 13:19:32 +0200 Subject: [PATCH 12/22] + large_utf8() --- r/NAMESPACE | 1 + r/R/type.R | 7 ++++++- r/man/data-type.Rd | 6 ++++++ r/man/enums.Rd | 2 +- r/src/datatype.cpp | 3 +++ 5 files changed, 17 insertions(+), 2 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index e8da0c02beb..b8e15311263 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -187,6 +187,7 @@ export(int32) export(int64) export(int8) export(large_binary) +export(large_utf8) export(last_col) export(list_of) export(map_batches) diff --git a/r/R/type.R b/r/R/type.R index 4007adec217..0bbb842a74a 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -77,7 +77,7 @@ DataType <- R6Class("DataType", EXTENSION = stop("Type EXTENSION not implemented yet"), FIXED_SIZE_LIST = stop("Type FIXED_SIZE_LIST not implemented yet"), DURATION = stop("Type DURATION not implemented yet"), - LARGE_STRING = stop("Type LARGE_STRING not implemented yet"), + LARGE_STRING = large_utf8(), LARGE_BINARY = large_binary(), LARGE_LIST = stop("Type LARGE_LIST not implemented yet") ) @@ -147,6 +147,7 @@ Float32 <- R6Class("Float32", inherit = FixedWidthType) Float64 <- R6Class("Float64", inherit = FixedWidthType) Boolean <- R6Class("Boolean", inherit = FixedWidthType) Utf8 <- R6Class("Utf8", inherit = DataType) +LargeUtf8 <- R6Class("LargeUtf8", inherit = DataType) Binary <- R6Class("Binary", inherit = DataType) FixedSizeBinary <- R6Class("FixedSizeBinary", inherit = FixedWidthType) LargeBinary <- R6Class("LargeBinary", inherit = DataType) @@ -293,6 +294,10 @@ bool <- boolean #' @export utf8 <- function() shared_ptr(Utf8, Utf8__initialize()) +#' @rdname data-type +#' @export +large_utf8 <- function() shared_ptr(LargeUtf8, LargeUtf8__initialize()) + #' @rdname data-type #' @export binary <- function(byte_width = NULL) { diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index 6881b9603b2..fc8abd33297 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -18,7 +18,9 @@ \alias{boolean} \alias{bool} \alias{utf8} +\alias{large_utf8} \alias{binary} +\alias{large_binary} \alias{string} \alias{date32} \alias{date64} @@ -63,8 +65,12 @@ bool() utf8() +large_utf8() + binary(byte_width = NULL) +large_binary() + string() date32() diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 0b47b66cc23..b592af916bb 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -18,7 +18,7 @@ An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of le An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2. -An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 30. +An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 36. An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 17. diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index a63188fa0d4..fcc4757d30f 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -80,6 +80,9 @@ std::shared_ptr Boolean__initialize() { return arrow::boolean() // [[arrow::export]] std::shared_ptr Utf8__initialize() { return arrow::utf8(); } +// [[arrow::export]] +std::shared_ptr LargeUtf8__initialize() { return arrow::large_utf8(); } + // [[arrow::export]] std::shared_ptr Binary__initialize() { return arrow::binary(); } From 97155304e8b2cfa8f4512100f1574ba638d94e67 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 13:38:54 +0200 Subject: [PATCH 13/22] string vector -> large string array --- r/R/arrowExports.R | 4 ++++ r/src/array_from_vector.cpp | 2 +- r/src/arrowExports.cpp | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 2c2709e10ba..7b54b6a1a9a 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -484,6 +484,10 @@ Utf8__initialize <- function(){ .Call(`_arrow_Utf8__initialize` ) } +LargeUtf8__initialize <- function(){ + .Call(`_arrow_LargeUtf8__initialize` ) +} + Binary__initialize <- function(){ .Call(`_arrow_Binary__initialize` ) } diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 3706d9e3f41..b8900f6cbbf 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -1303,7 +1303,7 @@ std::shared_ptr Array__from_vector( } // treat strings separately for now - if (type->id() == Type::STRING) { + if (type->id() == Type::STRING || type->id() == Type::LARGE_STRING) { return VectorToArrayConverter::Visit(x, type); } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 120ef9399c8..7d56848f5bf 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1877,6 +1877,20 @@ RcppExport SEXP _arrow_Utf8__initialize(){ } #endif +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr LargeUtf8__initialize(); +RcppExport SEXP _arrow_LargeUtf8__initialize(){ +BEGIN_RCPP + return Rcpp::wrap(LargeUtf8__initialize()); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeUtf8__initialize(){ + Rf_error("Cannot call LargeUtf8__initialize(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // datatype.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr Binary__initialize(); @@ -5800,6 +5814,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, + { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0}, { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, From e934e9ac64446d437e60e4c42f5b9f4c99f75f90 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 13:50:00 +0200 Subject: [PATCH 14/22] Large string -> character vector --- r/src/array_to_vector.cpp | 10 ++++++++-- r/tests/testthat/test-Array.R | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 1ba8bd44996..c5d51ffdb55 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -187,6 +187,7 @@ class Converter_Date32 : public Converter_SimpleArray { } }; +template struct Converter_String : public Converter { public: explicit Converter_String(const ArrayVector& arrays) : Converter(arrays) {} @@ -221,7 +222,7 @@ struct Converter_String : public Converter { return Status::OK(); } - arrow::StringArray* string_array = static_cast(array.get()); + StringArrayType* string_array = static_cast(array.get()); if (array->null_count()) { // need to watch for nulls arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), @@ -798,7 +799,12 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type // handle memory dense strings case Type::STRING: - return std::make_shared(std::move(arrays)); + return std::make_shared>( + std::move(arrays)); + + case Type::LARGE_STRING: + return std::make_shared>( + std::move(arrays)); case Type::DICTIONARY: return std::make_shared(std::move(arrays)); diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 6d7d9ffbfcb..11144d9ed66 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -160,12 +160,16 @@ test_that("Array supports logical vectors (ARROW-3341)", { test_that("Array supports character vectors (ARROW-3339)", { # without NA expect_array_roundtrip(c("itsy", "bitsy", "spider"), utf8()) + expect_array_roundtrip(c("itsy", "bitsy", "spider"), large_utf8()) + # with NA expect_array_roundtrip(c("itsy", NA, "spider"), utf8()) + expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8()) }) test_that("empty arrays are supported", { expect_array_roundtrip(character(), utf8()) + expect_array_roundtrip(character(), large_utf8()) expect_array_roundtrip(integer(), int32()) expect_array_roundtrip(numeric(), float64()) expect_array_roundtrip(factor(character()), dictionary(int8(), utf8())) From d00ccc256b717afbc99dd468bb392eb7f92530a8 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 14:07:39 +0200 Subject: [PATCH 15/22] + StringVectorConverter<> to remove special case for strings --- r/src/array_from_vector.cpp | 52 +++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index b8900f6cbbf..84027c05f2b 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -971,6 +971,51 @@ class BinaryVectorConverter : public VectorConverter { Builder* typed_builder_; }; +template +class StringVectorConverter : public VectorConverter { + public: + ~StringVectorConverter() {} + + Status Init(ArrayBuilder* builder) { + typed_builder_ = checked_cast(builder); + return Status::OK(); + } + + Status Ingest(SEXP obj) { + ARROW_RETURN_IF(TYPEOF(obj) != STRSXP, + Status::RError("Expecting a character vector")); + R_xlen_t n = XLENGTH(obj); + + // Reserve enough space before appending + int64_t size = 0; + for (R_xlen_t i = 0; i < n; i++) { + SEXP string_i = STRING_ELT(obj, i); + if (string_i != NA_STRING) { + size += XLENGTH(string_i); + } + } + RETURN_NOT_OK(typed_builder_->Reserve(size)); + + // append + for (R_xlen_t i = 0; i < n; i++) { + SEXP string_i = STRING_ELT(obj, i); + if (string_i == NA_STRING) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + } else { + RETURN_NOT_OK(typed_builder_->Append(CHAR(string_i), XLENGTH(string_i))); + } + } + return Status::OK(); + } + + Status GetResult(std::shared_ptr* result) { + return typed_builder_->Finish(result); + } + + private: + Builder* typed_builder_; +}; + #define NUMERIC_CONVERTER(TYPE_ENUM, TYPE) \ case Type::TYPE_ENUM: \ *out = \ @@ -994,6 +1039,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(BINARY, BinaryVectorConverter); SIMPLE_CONVERTER_CASE(LARGE_BINARY, BinaryVectorConverter); SIMPLE_CONVERTER_CASE(BOOL, BooleanVectorConverter); + SIMPLE_CONVERTER_CASE(STRING, StringVectorConverter); + SIMPLE_CONVERTER_CASE(LARGE_STRING, StringVectorConverter); NUMERIC_CONVERTER(INT8, Int8Type); NUMERIC_CONVERTER(INT16, Int16Type); NUMERIC_CONVERTER(INT32, Int32Type); @@ -1302,11 +1349,6 @@ std::shared_ptr Array__from_vector( return arrow::r::Array__from_vector_reuse_memory(x); } - // treat strings separately for now - if (type->id() == Type::STRING || type->id() == Type::LARGE_STRING) { - return VectorToArrayConverter::Visit(x, type); - } - // factors only when type has been inferred if (type->id() == Type::DICTIONARY) { if (type_inferred || arrow::r::CheckCompatibleFactor(x, type)) { From e4653d695d0793c6d3e238e2e27f1d4129f7b702 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 15:38:14 +0200 Subject: [PATCH 16/22] large_list_of() --- r/NAMESPACE | 1 + r/R/arrowExports.R | 12 +++++++++++ r/R/list.R | 12 +++++++++++ r/man/data-type.Rd | 3 +++ r/src/arrowExports.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++ r/src/datatype.cpp | 28 ++++++++++++++++++++++++ 6 files changed, 104 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index b8e15311263..82e7e4c59f8 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -187,6 +187,7 @@ export(int32) export(int64) export(int8) export(large_binary) +export(large_list_of) export(large_utf8) export(last_col) export(list_of) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 7b54b6a1a9a..b037639815b 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -532,6 +532,10 @@ list__ <- function(x){ .Call(`_arrow_list__` , x) } +large_list__ <- function(x){ + .Call(`_arrow_large_list__` , x) +} + struct_ <- function(fields){ .Call(`_arrow_struct_` , fields) } @@ -628,6 +632,14 @@ ListType__value_type <- function(type){ .Call(`_arrow_ListType__value_type` , type) } +LargeListType__value_field <- function(type){ + .Call(`_arrow_LargeListType__value_field` , type) +} + +LargeListType__value_type <- function(type){ + .Call(`_arrow_LargeListType__value_type` , type) +} + dataset___expr__field_ref <- function(name){ .Call(`_arrow_dataset___expr__field_ref` , name) } diff --git a/r/R/list.R b/r/R/list.R index 31c72bb3034..ada47e5bc65 100644 --- a/r/R/list.R +++ b/r/R/list.R @@ -28,3 +28,15 @@ ListType <- R6Class("ListType", #' @rdname data-type #' @export list_of <- function(type) shared_ptr(ListType, list__(type)) + +LargeListType <- R6Class("LargeListType", + inherit = NestedType, + active = list( + value_field = function() shared_ptr(Field, LargeListType__value_field(self)), + value_type = function() DataType$create(LargeListType__value_type(self)) + ) +) + +#' @rdname data-type +#' @export +large_list_of <- function(type) shared_ptr(LargeListType, large_list__(type)) diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index fc8abd33297..6f71140c863 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -30,6 +30,7 @@ \alias{timestamp} \alias{decimal} \alias{list_of} +\alias{large_list_of} \alias{struct} \title{Apache Arrow data types} \usage{ @@ -89,6 +90,8 @@ decimal(precision, scale) list_of(type) +large_list_of(type) + struct(...) } \arguments{ diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 7d56848f5bf..1c0b95831a3 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -2053,6 +2053,21 @@ RcppExport SEXP _arrow_list__(SEXP x_sexp){ } #endif +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +SEXP large_list__(SEXP x); +RcppExport SEXP _arrow_large_list__(SEXP x_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter::type x(x_sexp); + return Rcpp::wrap(large_list__(x)); +END_RCPP +} +#else +RcppExport SEXP _arrow_large_list__(SEXP x_sexp){ + Rf_error("Cannot call large_list__(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // datatype.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr struct_(List fields); @@ -2418,6 +2433,36 @@ RcppExport SEXP _arrow_ListType__value_type(SEXP type_sexp){ } #endif +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr LargeListType__value_field(const std::shared_ptr& type); +RcppExport SEXP _arrow_LargeListType__value_field(SEXP type_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type type(type_sexp); + return Rcpp::wrap(LargeListType__value_field(type)); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeListType__value_field(SEXP type_sexp){ + Rf_error("Cannot call LargeListType__value_field(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr LargeListType__value_type(const std::shared_ptr& type); +RcppExport SEXP _arrow_LargeListType__value_type(SEXP type_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type type(type_sexp); + return Rcpp::wrap(LargeListType__value_type(type)); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeListType__value_type(SEXP type_sexp){ + Rf_error("Cannot call LargeListType__value_type(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // expression.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr dataset___expr__field_ref(std::string name); @@ -5826,6 +5871,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Time32__initialize", (DL_FUNC) &_arrow_Time32__initialize, 1}, { "_arrow_Time64__initialize", (DL_FUNC) &_arrow_Time64__initialize, 1}, { "_arrow_list__", (DL_FUNC) &_arrow_list__, 1}, + { "_arrow_large_list__", (DL_FUNC) &_arrow_large_list__, 1}, { "_arrow_struct_", (DL_FUNC) &_arrow_struct_, 1}, { "_arrow_DataType__ToString", (DL_FUNC) &_arrow_DataType__ToString, 1}, { "_arrow_DataType__name", (DL_FUNC) &_arrow_DataType__name, 1}, @@ -5850,6 +5896,8 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, + { "_arrow_LargeListType__value_field", (DL_FUNC) &_arrow_LargeListType__value_field, 1}, + { "_arrow_LargeListType__value_type", (DL_FUNC) &_arrow_LargeListType__value_type, 1}, { "_arrow_dataset___expr__field_ref", (DL_FUNC) &_arrow_dataset___expr__field_ref, 1}, { "_arrow_dataset___expr__equal", (DL_FUNC) &_arrow_dataset___expr__equal, 2}, { "_arrow_dataset___expr__not_equal", (DL_FUNC) &_arrow_dataset___expr__not_equal, 2}, diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index fcc4757d30f..c22afd1a1b0 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -150,6 +150,22 @@ SEXP list__(SEXP x) { return R_NilValue; } +// [[arrow::export]] +SEXP large_list__(SEXP x) { + if (Rf_inherits(x, "Field")) { + Rcpp::ConstReferenceSmartPtrInputParameter> field(x); + return wrap(arrow::large_list(field)); + } + + if (Rf_inherits(x, "DataType")) { + Rcpp::ConstReferenceSmartPtrInputParameter> type(x); + return wrap(arrow::large_list(type)); + } + + stop("incompatible"); + return R_NilValue; +} + // [[arrow::export]] std::shared_ptr struct_(List fields) { return arrow::struct_(arrow::r::List_to_shared_ptr_vector(fields)); @@ -280,4 +296,16 @@ std::shared_ptr ListType__value_type( return type->value_type(); } +// [[arrow::export]] +std::shared_ptr LargeListType__value_field( + const std::shared_ptr& type) { + return type->value_field(); +} + +// [[arrow::export]] +std::shared_ptr LargeListType__value_type( + const std::shared_ptr& type) { + return type->value_type(); +} + #endif From 64f5b9f1310051d9e7cf61c82d404a9438bcffd0 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 26 Jun 2020 16:18:20 +0200 Subject: [PATCH 17/22] LargeListArray support --- r/R/array.R | 18 ++++++++ r/R/arrowExports.R | 20 +++++++++ r/R/type.R | 2 +- r/src/array.cpp | 31 +++++++++++++ r/src/array_from_vector.cpp | 8 +--- r/src/array_to_vector.cpp | 10 ++++- r/src/arrowExports.cpp | 82 +++++++++++++++++++++++++++++++++++ r/tests/testthat/test-Array.R | 47 +++++++++++++++++++- 8 files changed, 207 insertions(+), 11 deletions(-) diff --git a/r/R/array.R b/r/R/array.R index 194cdeb42f8..5d4d8c66aad 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -93,6 +93,8 @@ Array <- R6Class("Array", shared_ptr(StructArray, self$pointer()) } else if (type_id == Type$LIST) { shared_ptr(ListArray, self$pointer()) + } else if (type_id == Type$LARGE_LIST){ + shared_ptr(LargeListArray, self$pointer()) } else { self } @@ -230,6 +232,22 @@ ListArray <- R6Class("ListArray", inherit = Array, ) ) +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +LargeListArray <- R6Class("LargeListArray", inherit = Array, + public = list( + values = function() Array$create(LargeListArray__values(self)), + value_length = function(i) LargeListArray__value_length(self, i), + value_offset = function(i) LargeListArray__value_offset(self, i), + raw_value_offsets = function() LargeListArray__raw_value_offsets(self) + ), + active = list( + value_type = function() DataType$create(LargeListArray__value_type(self)) + ) +) + #' @export length.Array <- function(x) x$length() diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index b037639815b..37059c70fb2 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -92,22 +92,42 @@ ListArray__value_type <- function(array){ .Call(`_arrow_ListArray__value_type` , array) } +LargeListArray__value_type <- function(array){ + .Call(`_arrow_LargeListArray__value_type` , array) +} + ListArray__values <- function(array){ .Call(`_arrow_ListArray__values` , array) } +LargeListArray__values <- function(array){ + .Call(`_arrow_LargeListArray__values` , array) +} + ListArray__value_length <- function(array, i){ .Call(`_arrow_ListArray__value_length` , array, i) } +LargeListArray__value_length <- function(array, i){ + .Call(`_arrow_LargeListArray__value_length` , array, i) +} + ListArray__value_offset <- function(array, i){ .Call(`_arrow_ListArray__value_offset` , array, i) } +LargeListArray__value_offset <- function(array, i){ + .Call(`_arrow_LargeListArray__value_offset` , array, i) +} + ListArray__raw_value_offsets <- function(array){ .Call(`_arrow_ListArray__raw_value_offsets` , array) } +LargeListArray__raw_value_offsets <- function(array){ + .Call(`_arrow_LargeListArray__raw_value_offsets` , array) +} + Array__infer_type <- function(x){ .Call(`_arrow_Array__infer_type` , x) } diff --git a/r/R/type.R b/r/R/type.R index 0bbb842a74a..430440d5779 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -79,7 +79,7 @@ DataType <- R6Class("DataType", DURATION = stop("Type DURATION not implemented yet"), LARGE_STRING = large_utf8(), LARGE_BINARY = large_binary(), - LARGE_LIST = stop("Type LARGE_LIST not implemented yet") + LARGE_LIST = shared_ptr(LargeListType, self$pointer()) ) } ), diff --git a/r/src/array.cpp b/r/src/array.cpp index 9ada75d9b51..da980602bce 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -206,24 +206,48 @@ std::shared_ptr ListArray__value_type( return array->value_type(); } +// [[arrow::export]] +std::shared_ptr LargeListArray__value_type( + const std::shared_ptr& array) { + return array->value_type(); +} + // [[arrow::export]] std::shared_ptr ListArray__values( const std::shared_ptr& array) { return array->values(); } +// [[arrow::export]] +std::shared_ptr LargeListArray__values( + const std::shared_ptr& array) { + return array->values(); +} + // [[arrow::export]] int32_t ListArray__value_length(const std::shared_ptr& array, int64_t i) { return array->value_length(i); } +// [[arrow::export]] +int64_t LargeListArray__value_length(const std::shared_ptr& array, + int64_t i) { + return array->value_length(i); +} + // [[arrow::export]] int32_t ListArray__value_offset(const std::shared_ptr& array, int64_t i) { return array->value_offset(i); } +// [[arrow::export]] +int64_t LargeListArray__value_offset(const std::shared_ptr& array, + int64_t i) { + return array->value_offset(i); +} + // [[arrow::export]] Rcpp::IntegerVector ListArray__raw_value_offsets( const std::shared_ptr& array) { @@ -231,4 +255,11 @@ Rcpp::IntegerVector ListArray__raw_value_offsets( return Rcpp::IntegerVector(offsets, offsets + array->length()); } +// [[arrow::export]] +Rcpp::IntegerVector LargeListArray__raw_value_offsets( + const std::shared_ptr& array) { + auto offsets = array->raw_value_offsets(); + return Rcpp::IntegerVector(offsets, offsets + array->length()); +} + #endif diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 84027c05f2b..44241049be5 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -373,10 +373,6 @@ std::shared_ptr MakeStructArray(SEXP df, const std::shared_ptr& return std::make_shared(type, rows, children); } -std::shared_ptr MakeListArray(SEXP x, const std::shared_ptr& type) { - return VectorToArrayConverter::Visit(x, type); -} - template int64_t time_cast(T value); @@ -1362,8 +1358,8 @@ std::shared_ptr Array__from_vector( Rcpp::stop("Object incompatible with dictionary type"); } - if (type->id() == Type::LIST) { - return arrow::r::MakeListArray(x, type); + if (type->id() == Type::LIST || type->id() == Type::LARGE_LIST) { + return VectorToArrayConverter::Visit(x, type); } // struct types diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index c5d51ffdb55..9586c839970 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -636,6 +636,7 @@ class Converter_Decimal : public Converter { } }; +template class Converter_List : public Converter { private: std::shared_ptr value_type_; @@ -670,7 +671,7 @@ class Converter_List : public Converter { Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) const { - auto list_array = checked_cast(array.get()); + auto list_array = checked_cast(array.get()); auto values_array = list_array->values(); auto ingest_one = [&](R_xlen_t i) { @@ -887,10 +888,15 @@ std::shared_ptr Converter::Make(const std::shared_ptr& type return std::make_shared(std::move(arrays)); case Type::LIST: - return std::make_shared( + return std::make_shared>( std::move(arrays), checked_cast(type.get())->value_type()); + case Type::LARGE_LIST: + return std::make_shared>( + std::move(arrays), + checked_cast(type.get())->value_type()); + case Type::NA: return std::make_shared(std::move(arrays)); diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 1c0b95831a3..63b6524976c 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -364,6 +364,21 @@ RcppExport SEXP _arrow_ListArray__value_type(SEXP array_sexp){ } #endif +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr LargeListArray__value_type(const std::shared_ptr& array); +RcppExport SEXP _arrow_LargeListArray__value_type(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + return Rcpp::wrap(LargeListArray__value_type(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeListArray__value_type(SEXP array_sexp){ + Rf_error("Cannot call LargeListArray__value_type(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // array.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr ListArray__values(const std::shared_ptr& array); @@ -379,6 +394,21 @@ RcppExport SEXP _arrow_ListArray__values(SEXP array_sexp){ } #endif +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr LargeListArray__values(const std::shared_ptr& array); +RcppExport SEXP _arrow_LargeListArray__values(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + return Rcpp::wrap(LargeListArray__values(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeListArray__values(SEXP array_sexp){ + Rf_error("Cannot call LargeListArray__values(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // array.cpp #if defined(ARROW_R_WITH_ARROW) int32_t ListArray__value_length(const std::shared_ptr& array, int64_t i); @@ -395,6 +425,22 @@ RcppExport SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){ } #endif +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +int64_t LargeListArray__value_length(const std::shared_ptr& array, int64_t i); +RcppExport SEXP _arrow_LargeListArray__value_length(SEXP array_sexp, SEXP i_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + Rcpp::traits::input_parameter::type i(i_sexp); + return Rcpp::wrap(LargeListArray__value_length(array, i)); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeListArray__value_length(SEXP array_sexp, SEXP i_sexp){ + Rf_error("Cannot call LargeListArray__value_length(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // array.cpp #if defined(ARROW_R_WITH_ARROW) int32_t ListArray__value_offset(const std::shared_ptr& array, int64_t i); @@ -411,6 +457,22 @@ RcppExport SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){ } #endif +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +int64_t LargeListArray__value_offset(const std::shared_ptr& array, int64_t i); +RcppExport SEXP _arrow_LargeListArray__value_offset(SEXP array_sexp, SEXP i_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + Rcpp::traits::input_parameter::type i(i_sexp); + return Rcpp::wrap(LargeListArray__value_offset(array, i)); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeListArray__value_offset(SEXP array_sexp, SEXP i_sexp){ + Rf_error("Cannot call LargeListArray__value_offset(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // array.cpp #if defined(ARROW_R_WITH_ARROW) Rcpp::IntegerVector ListArray__raw_value_offsets(const std::shared_ptr& array); @@ -426,6 +488,21 @@ RcppExport SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){ } #endif +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +Rcpp::IntegerVector LargeListArray__raw_value_offsets(const std::shared_ptr& array); +RcppExport SEXP _arrow_LargeListArray__raw_value_offsets(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + return Rcpp::wrap(LargeListArray__raw_value_offsets(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_LargeListArray__raw_value_offsets(SEXP array_sexp){ + Rf_error("Cannot call LargeListArray__raw_value_offsets(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // array_from_vector.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr Array__infer_type(SEXP x); @@ -5761,10 +5838,15 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, + { "_arrow_LargeListArray__value_type", (DL_FUNC) &_arrow_LargeListArray__value_type, 1}, { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, + { "_arrow_LargeListArray__values", (DL_FUNC) &_arrow_LargeListArray__values, 1}, { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, + { "_arrow_LargeListArray__value_length", (DL_FUNC) &_arrow_LargeListArray__value_length, 2}, { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, + { "_arrow_LargeListArray__value_offset", (DL_FUNC) &_arrow_LargeListArray__value_offset, 2}, { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1}, + { "_arrow_LargeListArray__raw_value_offsets", (DL_FUNC) &_arrow_LargeListArray__raw_value_offsets, 1}, { "_arrow_Array__infer_type", (DL_FUNC) &_arrow_Array__infer_type, 1}, { "_arrow_Array__from_vector", (DL_FUNC) &_arrow_Array__from_vector, 2}, { "_arrow_ChunkedArray__from_list", (DL_FUNC) &_arrow_ChunkedArray__from_list, 2}, diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 11144d9ed66..8392e4bbf2e 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -21,7 +21,7 @@ expect_array_roundtrip <- function(x, type) { a <- Array$create(x, type = type) expect_type_equal(a$type, type) expect_identical(length(a), length(x)) - if (!inherits(type, "ListType")) { + if (!inherits(type, "ListType") && !inherits(type, "LargeListType")) { # TODO: revisit how missingness works with ListArrays # R list objects don't handle missingness the same way as other vectors. # Is there some vctrs thing we should do on the roundtrip back to R? @@ -36,7 +36,7 @@ expect_array_roundtrip <- function(x, type) { x_sliced <- x[-1] expect_type_equal(a_sliced$type, type) expect_identical(length(a_sliced), length(x_sliced)) - if (!inherits(type, "ListType")) { + if (!inherits(type, "ListType") && !inherits(type, "LargeListType")) { expect_identical(is.na(a_sliced), is.na(x_sliced)) } expect_equivalent(as.vector(a_sliced), x_sliced) @@ -495,6 +495,49 @@ test_that("Array$create() handles vector -> list arrays (ARROW-7662)", { expect_error(Array$create(list(df))) }) +test_that("Array$create() handles vector -> large list arrays", { + # Should be able to create an empty list with a type hint. + expect_is(Array$create(list(), large_list_of(bool())), "LargeListArray") + + # logical + expect_array_roundtrip(list(NA), large_list_of(bool())) + expect_array_roundtrip(list(logical(0)), large_list_of(bool())) + expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), large_list_of(bool())) + expect_array_roundtrip(list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), large_list_of(bool())) + + # integer + expect_array_roundtrip(list(NA_integer_), large_list_of(int32())) + expect_array_roundtrip(list(integer(0)), large_list_of(int32())) + expect_array_roundtrip(list(1:2, 3:4, 12:18), large_list_of(int32())) + expect_array_roundtrip(list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), large_list_of(int32())) + + # numeric + expect_array_roundtrip(list(NA_real_), large_list_of(float64())) + expect_array_roundtrip(list(numeric(0)), large_list_of(float64())) + expect_array_roundtrip(list(1, c(2, 3), 4), large_list_of(float64())) + expect_array_roundtrip(list(1, numeric(0), c(2, 3, NA_real_), 4), large_list_of(float64())) + + # character + expect_array_roundtrip(list(NA_character_), large_list_of(utf8())) + expect_array_roundtrip(list(character(0)), large_list_of(utf8())) + expect_array_roundtrip(list("itsy", c("bitsy", "spider"), c("is")), large_list_of(utf8())) + expect_array_roundtrip(list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), large_list_of(utf8())) + + # factor + expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), large_list_of(dictionary(int8(), utf8()))) + expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), large_list_of(dictionary(int8(), utf8()))) + + # struct + expect_array_roundtrip( + list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))), + large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())) + ) + expect_array_roundtrip( + list(tibble::tibble(a = list(integer()))), + large_list_of(struct(a = list_of(int32()))) + ) +}) + test_that("Array$create() should have helpful error on lists with type hint", { expect_error(Array$create(list(numeric(0)), list_of(bool())), regexp = "List vector expecting elements vector of type") From a3385e45db2346fe65f40f4c7fa891a2bb791e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romain=20Fran=C3=A7ois?= Date: Fri, 26 Jun 2020 17:56:34 +0200 Subject: [PATCH 18/22] Update r/tests/testthat/test-Array.R Co-authored-by: Neal Richardson --- r/tests/testthat/test-Array.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 8392e4bbf2e..420553617d9 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -36,7 +36,7 @@ expect_array_roundtrip <- function(x, type) { x_sliced <- x[-1] expect_type_equal(a_sliced$type, type) expect_identical(length(a_sliced), length(x_sliced)) - if (!inherits(type, "ListType") && !inherits(type, "LargeListType")) { + if (!inherits(type, c("ListType", "LargeListType"))) { expect_identical(is.na(a_sliced), is.na(x_sliced)) } expect_equivalent(as.vector(a_sliced), x_sliced) From 249bf513444c4cdbdbfe5797adac2eb693d1123d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romain=20Fran=C3=A7ois?= Date: Fri, 26 Jun 2020 17:56:50 +0200 Subject: [PATCH 19/22] Update r/tests/testthat/test-Array.R Co-authored-by: Neal Richardson --- r/tests/testthat/test-Array.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 420553617d9..174a19c0874 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -21,7 +21,7 @@ expect_array_roundtrip <- function(x, type) { a <- Array$create(x, type = type) expect_type_equal(a$type, type) expect_identical(length(a), length(x)) - if (!inherits(type, "ListType") && !inherits(type, "LargeListType")) { + if (!inherits(type, c("ListType", "LargeListType"))) { # TODO: revisit how missingness works with ListArrays # R list objects don't handle missingness the same way as other vectors. # Is there some vctrs thing we should do on the roundtrip back to R? From ff10327578b9ad5672a56d26211111cf53879ca8 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Sat, 27 Jun 2020 06:37:16 +0200 Subject: [PATCH 20/22] add as= argument to expect_array_roundtrip() --- r/tests/testthat/test-Array.R | 56 ++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 174a19c0874..66908325b9e 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -17,8 +17,8 @@ context("Array") -expect_array_roundtrip <- function(x, type) { - a <- Array$create(x, type = type) +expect_array_roundtrip <- function(x, type, as = NULL) { + a <- Array$create(x, type = as) expect_type_equal(a$type, type) expect_identical(length(a), length(x)) if (!inherits(type, c("ListType", "LargeListType"))) { @@ -52,7 +52,7 @@ test_that("Integer Array", { test_that("binary Array", { bin <- vctrs::list_of(as.raw(1:10), as.raw(0:255), .ptype = raw()) expect_array_roundtrip(bin, binary()) - expect_array_roundtrip(bin, large_binary()) + expect_array_roundtrip(bin, large_binary(), as = large_binary()) # degenerate bin <- structure(list(1L), ptype = raw()) @@ -160,16 +160,16 @@ test_that("Array supports logical vectors (ARROW-3341)", { test_that("Array supports character vectors (ARROW-3339)", { # without NA expect_array_roundtrip(c("itsy", "bitsy", "spider"), utf8()) - expect_array_roundtrip(c("itsy", "bitsy", "spider"), large_utf8()) + expect_array_roundtrip(c("itsy", "bitsy", "spider"), large_utf8(), as = large_utf8()) # with NA expect_array_roundtrip(c("itsy", NA, "spider"), utf8()) - expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8()) + expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8()) }) test_that("empty arrays are supported", { expect_array_roundtrip(character(), utf8()) - expect_array_roundtrip(character(), large_utf8()) + expect_array_roundtrip(character(), large_utf8(), as = large_utf8()) expect_array_roundtrip(integer(), int32()) expect_array_roundtrip(numeric(), float64()) expect_array_roundtrip(factor(character()), dictionary(int8(), utf8())) @@ -497,44 +497,46 @@ test_that("Array$create() handles vector -> list arrays (ARROW-7662)", { test_that("Array$create() handles vector -> large list arrays", { # Should be able to create an empty list with a type hint. - expect_is(Array$create(list(), large_list_of(bool())), "LargeListArray") + expect_is(Array$create(list(), type = large_list_of(bool())), "LargeListArray") # logical - expect_array_roundtrip(list(NA), large_list_of(bool())) - expect_array_roundtrip(list(logical(0)), large_list_of(bool())) - expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), large_list_of(bool())) - expect_array_roundtrip(list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), large_list_of(bool())) + expect_array_roundtrip(list(NA), large_list_of(bool()), as = large_list_of(bool())) + expect_array_roundtrip(list(logical(0)), large_list_of(bool()), as = large_list_of(bool())) + expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), large_list_of(bool()), as = large_list_of(bool())) + expect_array_roundtrip(list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), large_list_of(bool()), as = large_list_of(bool())) # integer - expect_array_roundtrip(list(NA_integer_), large_list_of(int32())) - expect_array_roundtrip(list(integer(0)), large_list_of(int32())) - expect_array_roundtrip(list(1:2, 3:4, 12:18), large_list_of(int32())) - expect_array_roundtrip(list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), large_list_of(int32())) + expect_array_roundtrip(list(NA_integer_), large_list_of(int32()), as = large_list_of(int32())) + expect_array_roundtrip(list(integer(0)), large_list_of(int32()), as = large_list_of(int32())) + expect_array_roundtrip(list(1:2, 3:4, 12:18), large_list_of(int32()), as = large_list_of(int32())) + expect_array_roundtrip(list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), large_list_of(int32()), as = large_list_of(int32())) # numeric - expect_array_roundtrip(list(NA_real_), large_list_of(float64())) - expect_array_roundtrip(list(numeric(0)), large_list_of(float64())) - expect_array_roundtrip(list(1, c(2, 3), 4), large_list_of(float64())) - expect_array_roundtrip(list(1, numeric(0), c(2, 3, NA_real_), 4), large_list_of(float64())) + expect_array_roundtrip(list(NA_real_), large_list_of(float64()), as = large_list_of(float64())) + expect_array_roundtrip(list(numeric(0)), large_list_of(float64()), as = large_list_of(float64())) + expect_array_roundtrip(list(1, c(2, 3), 4), large_list_of(float64()), as = large_list_of(float64())) + expect_array_roundtrip(list(1, numeric(0), c(2, 3, NA_real_), 4), large_list_of(float64()), as = large_list_of(float64())) # character - expect_array_roundtrip(list(NA_character_), large_list_of(utf8())) - expect_array_roundtrip(list(character(0)), large_list_of(utf8())) - expect_array_roundtrip(list("itsy", c("bitsy", "spider"), c("is")), large_list_of(utf8())) - expect_array_roundtrip(list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), large_list_of(utf8())) + expect_array_roundtrip(list(NA_character_), large_list_of(utf8()), as = large_list_of(utf8())) + expect_array_roundtrip(list(character(0)), large_list_of(utf8()), as = large_list_of(utf8())) + expect_array_roundtrip(list("itsy", c("bitsy", "spider"), c("is")), large_list_of(utf8()), as = large_list_of(utf8())) + expect_array_roundtrip(list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), large_list_of(utf8()), as = large_list_of(utf8())) # factor - expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), large_list_of(dictionary(int8(), utf8()))) - expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), large_list_of(dictionary(int8(), utf8()))) + expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), large_list_of(dictionary(int8(), utf8())), as = large_list_of(dictionary(int8(), utf8()))) + expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), large_list_of(dictionary(int8(), utf8())), as = large_list_of(dictionary(int8(), utf8()))) # struct expect_array_roundtrip( list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))), - large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())) + large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())), + as = large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())) ) expect_array_roundtrip( list(tibble::tibble(a = list(integer()))), - large_list_of(struct(a = list_of(int32()))) + large_list_of(struct(a = list_of(int32()))), + as = large_list_of(struct(a = list_of(int32()))) ) }) From db601142ad148fefec48d803e055e88415aab072 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 1 Jul 2020 09:29:58 -0700 Subject: [PATCH 21/22] (Repeatedly) convert to UTF-8; tests pass --- r/src/array_from_vector.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 44241049be5..19322309a6b 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -987,7 +987,7 @@ class StringVectorConverter : public VectorConverter { for (R_xlen_t i = 0; i < n; i++) { SEXP string_i = STRING_ELT(obj, i); if (string_i != NA_STRING) { - size += XLENGTH(string_i); + size += XLENGTH(Rf_mkCharCE(Rf_translateCharUTF8(string_i), CE_UTF8)); } } RETURN_NOT_OK(typed_builder_->Reserve(size)); @@ -998,6 +998,8 @@ class StringVectorConverter : public VectorConverter { if (string_i == NA_STRING) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { + // Make sure we're ingesting UTF-8 + string_i = Rf_mkCharCE(Rf_translateCharUTF8(string_i), CE_UTF8); RETURN_NOT_OK(typed_builder_->Append(CHAR(string_i), XLENGTH(string_i))); } } From 3dc462c7f9375ebc88e27a49e75523cc66302d96 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 1 Jul 2020 09:34:09 -0700 Subject: [PATCH 22/22] Update vignette --- r/vignettes/arrow.Rmd | 68 +++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd index 8602ea260e0..b3accd00fc3 100644 --- a/r/vignettes/arrow.Rmd +++ b/r/vignettes/arrow.Rmd @@ -103,40 +103,40 @@ In the tables, entries with a `-` are not currently implemented. ### Arrow to R -| Arrow type | R type | -|-------------------|--------------------------| -| boolean | logical | -| int8 | integer | -| int16 | integer | -| int32 | integer | -| int64 | integer^++^ | -| uint8 | integer | -| uint16 | integer | -| uint32 | integer^++^ | -| uint64 | integer^++^ | -| float16 | - | -| float32 | double | -| float64 | double | -| utf8 | character | -| binary | - | -| fixed_size_binary | - | -| date32 | Date | -| date64 | POSIXct | -| time32 | hms::difftime | -| time64 | hms::difftime | -| timestamp | POSIXct | -| duration | - | -| decimal | double | -| dictionary | factor^+++^ | -| list | list | -| fixed_size_list | - | -| struct | data.frame | -| null | vctrs::vctrs_unspecified | -| map | - | -| union | - | -| large_utf8 | - | -| large_binary | - | -| large_list | - | +| Arrow type | R type | +|-------------------|---------------------------| +| boolean | logical | +| int8 | integer | +| int16 | integer | +| int32 | integer | +| int64 | integer^++^ | +| uint8 | integer | +| uint16 | integer | +| uint32 | integer^++^ | +| uint64 | integer^++^ | +| float16 | - | +| float32 | double | +| float64 | double | +| utf8 | character | +| binary | vctrs::vctrs_list_of(raw) | +| fixed_size_binary | - | +| date32 | Date | +| date64 | POSIXct | +| time32 | hms::difftime | +| time64 | hms::difftime | +| timestamp | POSIXct | +| duration | - | +| decimal | double | +| dictionary | factor^+++^ | +| list | vctrs::vctrs_list_of | +| fixed_size_list | - | +| struct | data.frame | +| null | vctrs::vctrs_unspecified | +| map | - | +| union | - | +| large_utf8 | character | +| large_binary | vctrs::vctrs_list_of(raw) | +| large_list | vctrs::vctrs_list_of | ^++^: These integer types may contain values that exceed the range of R's `integer` type (32-bit signed integer). When they do, `uint32` and `uint64` are converted to `double` ("numeric") and `int64` is converted to `bit64::integer64`.