diff --git a/r/R/List.R b/r/R/List.R index cc8c2b1c9e1..efd8839f39b 100644 --- a/r/R/List.R +++ b/r/R/List.R @@ -18,7 +18,11 @@ #' @include R6.R `arrow::ListType` <- R6Class("arrow::ListType", - inherit = `arrow::NestedType` + inherit = `arrow::NestedType`, + active = list( + value_field = function() shared_ptr(`arrow::Field`, ListType__value_field(self)), + value_type = function() `arrow::DataType`$dispatch(ListType__value_type(self)) + ) ) #' @rdname DataType diff --git a/r/R/array.R b/r/R/array.R index b6e21ef8e69..7e5e9552172 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -118,12 +118,26 @@ ) ) +`arrow::ListArray` <- R6Class("arrow::ListArray", inherit = `arrow::Array`, + public = list( + values = function() `arrow::Array`$dispatch(ListArray__values(self)), + value_length = function(i) ListArray__value_length(self, i), + value_offset = function(i) ListArray__value_offset(self, i), + raw_value_offsets = function() ListArray__raw_value_offsets(self) + ), + active = list( + value_type = function() `arrow::DataType`$dispatch(ListArray__value_type(self)) + ) +) + `arrow::Array`$dispatch <- function(xp){ a <- shared_ptr(`arrow::Array`, xp) if(a$type_id() == Type$DICTIONARY){ a <- shared_ptr(`arrow::DictionaryArray`, xp) } else if (a$type_id() == Type$STRUCT) { a <- shared_ptr(`arrow::StructArray`, xp) + } else if(a$type_id() == Type$LIST) { + a <- shared_ptr(`arrow::ListArray`, xp) } a } diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 8609f9b85f1..a3edd1a5f26 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -80,6 +80,26 @@ StructArray__Flatten <- function(array){ .Call(`_arrow_StructArray__Flatten` , array) } +ListArray__value_type <- function(array){ + .Call(`_arrow_ListArray__value_type` , array) +} + +ListArray__values <- function(array){ + .Call(`_arrow_ListArray__values` , array) +} + +ListArray__value_length <- function(array, i){ + .Call(`_arrow_ListArray__value_length` , array, i) +} + +ListArray__value_offset <- function(array, i){ + .Call(`_arrow_ListArray__value_offset` , array, i) +} + +ListArray__raw_value_offsets <- function(array){ + .Call(`_arrow_ListArray__raw_value_offsets` , array) +} + Array__as_vector <- function(array){ .Call(`_arrow_Array__as_vector` , array) } @@ -456,6 +476,14 @@ StructType__GetFieldIndex <- function(type, name){ .Call(`_arrow_StructType__GetFieldIndex` , type, name) } +ListType__value_field <- function(type){ + .Call(`_arrow_ListType__value_field` , type) +} + +ListType__value_type <- function(type){ + .Call(`_arrow_ListType__value_type` , type) +} + ipc___feather___TableWriter__SetDescription <- function(writer, description){ invisible(.Call(`_arrow_ipc___feather___TableWriter__SetDescription` , writer, description)) } diff --git a/r/src/array.cpp b/r/src/array.cpp index 35da4b1e4b3..7e4fa6f4f14 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -140,4 +140,35 @@ arrow::ArrayVector StructArray__Flatten( return out; } +// [[arrow::export]] +std::shared_ptr ListArray__value_type( + const std::shared_ptr& array) { + return array->value_type(); +} + +// [[arrow::export]] +std::shared_ptr ListArray__values( + const std::shared_ptr& array) { + return array->values(); +} + +// [[arrow::export]] +int32_t ListArray__value_length(const std::shared_ptr& array, + int64_t i) { + return array->value_length(i); +} + +// [[arrow::export]] +int32_t ListArray__value_offset(const std::shared_ptr& array, + int64_t i) { + return array->value_offset(i); +} + +// [[arrow::export]] +Rcpp::IntegerVector ListArray__raw_value_offsets( + const std::shared_ptr& array) { + auto offsets = array->raw_value_offsets(); + return Rcpp::IntegerVector(offsets, offsets + array->length()); +} + #endif diff --git a/r/src/array__to_vector.cpp b/r/src/array__to_vector.cpp index 4e26f8d53f5..7fcb02bef3c 100644 --- a/r/src/array__to_vector.cpp +++ b/r/src/array__to_vector.cpp @@ -547,6 +547,47 @@ class Converter_Decimal : public Converter { } }; +class Converter_List : public Converter { + public: + explicit Converter_List(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { return Rcpp::List(no_init(n)); } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + // nothing to do, list contain NULL by default + return Status::OK(); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + using internal::checked_cast; + auto list_array = checked_cast(array.get()); + auto values_array = list_array->values(); + + auto ingest_one = [&](R_xlen_t i) { + auto slice = + values_array->Slice(list_array->value_offset(i), list_array->value_length(i)); + SET_VECTOR_ELT(data, i + start, Array__as_vector(slice)); + }; + + if (array->null_count()) { + internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), + n); + + for (R_xlen_t i = 0; i < n; i++, bitmap_reader.Next()) { + if (bitmap_reader.IsSet()) ingest_one(i); + } + + } else { + for (R_xlen_t i = 0; i < n; i++) { + ingest_one(i); + } + } + + return Status::OK(); + } +}; + class Converter_Int64 : public Converter { public: explicit Converter_Int64(const ArrayVector& arrays) : Converter(arrays) {} @@ -658,9 +699,13 @@ std::shared_ptr Converter::Make(const ArrayVector& arrays) { case Type::DECIMAL: return std::make_shared(arrays); + // nested case Type::STRUCT: return std::make_shared(arrays); + case Type::LIST: + return std::make_shared(arrays); + default: break; } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 2352184ec83..5460526bd78 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -317,6 +317,83 @@ RcppExport SEXP _arrow_StructArray__Flatten(SEXP array_sexp){ } #endif +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ListArray__value_type(const std::shared_ptr& array); +RcppExport SEXP _arrow_ListArray__value_type(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + return Rcpp::wrap(ListArray__value_type(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ListArray__value_type(SEXP array_sexp){ + Rf_error("Cannot call ListArray__value_type(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ListArray__values(const std::shared_ptr& array); +RcppExport SEXP _arrow_ListArray__values(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + return Rcpp::wrap(ListArray__values(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ListArray__values(SEXP array_sexp){ + Rf_error("Cannot call ListArray__values(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +int32_t ListArray__value_length(const std::shared_ptr& array, int64_t i); +RcppExport SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + Rcpp::traits::input_parameter::type i(i_sexp); + return Rcpp::wrap(ListArray__value_length(array, i)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){ + Rf_error("Cannot call ListArray__value_length(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +int32_t ListArray__value_offset(const std::shared_ptr& array, int64_t i); +RcppExport SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + Rcpp::traits::input_parameter::type i(i_sexp); + return Rcpp::wrap(ListArray__value_offset(array, i)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){ + Rf_error("Cannot call ListArray__value_offset(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +Rcpp::IntegerVector ListArray__raw_value_offsets(const std::shared_ptr& array); +RcppExport SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + return Rcpp::wrap(ListArray__raw_value_offsets(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){ + Rf_error("Cannot call ListArray__raw_value_offsets(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // array__to_vector.cpp #if defined(ARROW_R_WITH_ARROW) SEXP Array__as_vector(const std::shared_ptr& array); @@ -1743,6 +1820,36 @@ RcppExport SEXP _arrow_StructType__GetFieldIndex(SEXP type_sexp, SEXP name_sexp) } #endif +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ListType__value_field(const std::shared_ptr& type); +RcppExport SEXP _arrow_ListType__value_field(SEXP type_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type type(type_sexp); + return Rcpp::wrap(ListType__value_field(type)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ListType__value_field(SEXP type_sexp){ + Rf_error("Cannot call ListType__value_field(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ListType__value_type(const std::shared_ptr& type); +RcppExport SEXP _arrow_ListType__value_type(SEXP type_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type type(type_sexp); + return Rcpp::wrap(ListType__value_type(type)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ListType__value_type(SEXP type_sexp){ + Rf_error("Cannot call ListType__value_type(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // feather.cpp #if defined(ARROW_R_WITH_ARROW) void ipc___feather___TableWriter__SetDescription(const std::unique_ptr& writer, const std::string& description); @@ -3371,6 +3478,11 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, + { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, + { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, + { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, + { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, + { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1}, { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 1}, { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, @@ -3465,6 +3577,8 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, { "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, + { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, + { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, { "_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2}, { "_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2}, { "_arrow_ipc___feather___TableWriter__Append", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Append, 3}, diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index 18920f22713..f4a4b09003b 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -281,4 +281,16 @@ int StructType__GetFieldIndex(const std::shared_ptr& type, return type->GetFieldIndex(name); } +// [[arrow::export]] +std::shared_ptr ListType__value_field( + const std::shared_ptr& type) { + return type->value_field(); +} + +// [[arrow::export]] +std::shared_ptr ListType__value_type( + const std::shared_ptr& type) { + return type->value_type(); +} + #endif diff --git a/r/tests/testthat/test-DataType.R b/r/tests/testthat/test-DataType.R index 6f77b3b87e3..dfc0d53f40d 100644 --- a/r/tests/testthat/test-DataType.R +++ b/r/tests/testthat/test-DataType.R @@ -297,6 +297,8 @@ test_that("list type works as expected", { x$children(), list(field("item", int32())) ) + expect_equal(x$value_type, int32()) + expect_equal(x$value_field, field("item", int32())) }) test_that("struct type works as expected", { diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R index 38b20a84f4c..206dfdd22a8 100644 --- a/r/tests/testthat/test-json.R +++ b/r/tests/testthat/test-json.R @@ -75,12 +75,12 @@ test_that("read_json_arrow() converts to tibble", { test_that("Can read json file with nested columns (ARROW-5503)", { tf <- tempfile() writeLines(' - { "nuf": {} } - { "nuf": null } - { "nuf": { "ps": 78.0, "hello": "hi" } } - { "nuf": { "ps": 90.0, "hello": "bonjour" } } - { "nuf": { "hello": "ciao" } } - { "nuf": { "ps": 19 } } + { "arr": [1.0, 2.0, 3.0], "nuf": {} } + { "arr": [2.0], "nuf": null } + { "arr": [], "nuf": { "ps": 78.0, "hello": "hi" } } + { "arr": null, "nuf": { "ps": 90.0, "hello": "bonjour" } } + { "arr": [5.0], "nuf": { "hello": "ciao" } } + { "arr": [5.0, 6.0], "nuf": { "ps": 19 } } ', tf) tab1 <- read_json_arrow(tf, as_tibble = FALSE) @@ -93,11 +93,12 @@ test_that("Can read json file with nested columns (ARROW-5503)", { expect_equal( tab1$schema, schema( + arr = list_of(float64()), nuf = struct(ps = float64(), hello = utf8()) ) ) - struct_array <- tab1$column(0)$data()$chunk(0) + struct_array <- tab1$column(1)$data()$chunk(0) ps <- array(c(NA, NA, 78, 90, NA, 19)) hello <- array(c(NA, NA, "hi", "bonjour", "ciao", NA)) expect_equal(struct_array$field(0L), ps) @@ -108,8 +109,28 @@ test_that("Can read json file with nested columns (ARROW-5503)", { data.frame(ps = ps$as_vector(), hello = hello$as_vector(), stringsAsFactors = FALSE) ) - # cannot yet test list and struct types in R api - # tib <- as.data.frame(tab1) + list_array_r <- list( + c(1, 2, 3), + c(2), + numeric(), + NULL, + 5, + c(5, 6) + ) + list_array <- tab1$column(0)$data() + expect_identical( + list_array$as_vector(), + list_array_r + ) + + tib <- as.data.frame(tab1) + expect_identical( + tib, + tibble::tibble( + arr = list_array_r, + nuf = data.frame(ps = ps$as_vector(), hello = hello$as_vector(), stringsAsFactors = FALSE) + ) + ) unlink(tf) })