Skip to content

Commit c164685

Browse files
ARROW-5625: [R] convert Array of struct type to data frame columns
struct arrays become data frame columns, i.e. ``` r library(arrow, warn.conflicts = FALSE) library(tibble) tf <- tempfile() writeLines(' { "hello": 3.5, "world": false, "yo": "thing", "nuf": {} } { "hello": 3.25, "world": null, "nuf": null } { "hello": 3.125, "world": null, "yo": "\u5fcd", "nuf": { "ps": 78.0, "house": "Gryffindor"} } { "hello": 0.0, "world": true, "yo": null, "nuf": { "ps": 90.0, "house": "Slytherin" } } ', tf) tab1 <- read_json_arrow(tf, as_tibble = FALSE) array <- tab1$column(3)$data()$chunk(0) array$field(0) #> arrow::Array #> [ #> null, #> null, #> 78, #> 90 #> ] array$as_vector() #> ps house #> 1 NA <NA> #> 2 NA <NA> #> 3 78 Gryffindor #> 4 90 Slytherin as.data.frame(tab1) #> # A tibble: 4 x 4 #> hello world yo nuf$ps $house #> <dbl> <lgl> <chr> <dbl> <chr> #> 1 3.5 FALSE thing NA <NA> #> 2 3.25 NA <NA> NA <NA> #> 3 3.12 NA 忍 78 Gryffindor #> 4 0 TRUE <NA> 90 Slytherin ``` <sup>Created on 2019-06-17 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0.9000)</sup> Author: Romain Francois <romain@rstudio.com> Closes #4593 from romainfrancois/ARROW-5625/struct_arrays and squashes the following commits: b1f087e <Romain Francois> expand on unit test and added comment 17fd51e <Romain Francois> effectively use arrays 059c244 <Romain Francois> lint 394a3d7 <Romain Francois> Converter_Struct::Ingest.*() a76f267 <Romain Francois> Converter_Struct::Allocate() e425c5b <Romain Francois> More StrucrtArray methods bd71f38 <Romain Francois> + methods GetFieldIndex() and GetFieldByName() to arrow::StructType
1 parent a30bb7b commit c164685

File tree

12 files changed

+254
-22
lines changed

12 files changed

+254
-22
lines changed

r/R/ChunkedArray.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
`arrow::ChunkedArray` <- R6Class("arrow::ChunkedArray", inherit = `arrow::Object`,
3333
public = list(
3434
length = function() ChunkedArray__length(self),
35-
chunk = function(i) shared_ptr(`arrow::Array`, ChunkedArray__chunk(self, i)),
35+
chunk = function(i) `arrow::Array`$dispatch(ChunkedArray__chunk(self, i)),
3636
as_vector = function() ChunkedArray__as_vector(self),
3737
Slice = function(offset, length = NULL){
3838
if (is.null(length)) {
@@ -50,7 +50,7 @@
5050
active = list(
5151
null_count = function() ChunkedArray__null_count(self),
5252
num_chunks = function() ChunkedArray__num_chunks(self),
53-
chunks = function() map(ChunkedArray__chunks(self), shared_ptr, class = `arrow::Array`),
53+
chunks = function() map(ChunkedArray__chunks(self), ~ `arrow::Array`$dispatch(.x)),
5454
type = function() `arrow::DataType`$dispatch(ChunkedArray__type(self))
5555
)
5656
)

r/R/Struct.R

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,16 @@
1818
#' @include R6.R
1919

2020
`arrow::StructType` <- R6Class("arrow::StructType",
21-
inherit = `arrow::NestedType`
21+
inherit = `arrow::NestedType`,
22+
public = list(
23+
GetFieldByName = function(name) shared_ptr(`arrow::Field`, StructType__GetFieldByName(self, name)),
24+
GetFieldIndex = function(name) StructType__GetFieldIndex(self, name)
25+
)
2226
)
2327

2428
#' @rdname DataType
2529
#' @export
2630
struct <- function(...){
27-
shared_ptr(`arrow::StructType`, struct_(.fields(list(...))))
31+
xp <- struct_(.fields(list(...)))
32+
shared_ptr(`arrow::StructType`, xp)
2833
}

r/R/array.R

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,27 @@
103103
)
104104
)
105105

106+
`arrow::DictionaryArray` <- R6Class("arrow::DictionaryArray", inherit = `arrow::Array`,
107+
public = list(
108+
indices = function() `arrow::Array`$dispatch(DictionaryArray__indices(self)),
109+
dictionary = function() `arrow::Array`$dispatch(DictionaryArray__dictionary(self))
110+
)
111+
)
112+
113+
`arrow::StructArray` <- R6Class("arrow::StructArray", inherit = `arrow::Array`,
114+
public = list(
115+
field = function(i) `arrow::Array`$dispatch(StructArray__field(self, i)),
116+
GetFieldByName = function(name) `arrow::Array`$dispatch(StructArray__GetFieldByName(self, name)),
117+
Flatten = function() map(StructArray__Flatten(self), ~ `arrow::Array`$dispatch(.x))
118+
)
119+
)
120+
106121
`arrow::Array`$dispatch <- function(xp){
107122
a <- shared_ptr(`arrow::Array`, xp)
108123
if(a$type_id() == Type$DICTIONARY){
109124
a <- shared_ptr(`arrow::DictionaryArray`, xp)
125+
} else if (a$type_id() == Type$STRUCT) {
126+
a <- shared_ptr(`arrow::StructArray`, xp)
110127
}
111128
a
112129
}
@@ -126,11 +143,3 @@
126143
array <- function(x, type = NULL){
127144
`arrow::Array`$dispatch(Array__from_vector(x, type))
128145
}
129-
130-
`arrow::DictionaryArray` <- R6Class("arrow::DictionaryArray", inherit = `arrow::Array`,
131-
public = list(
132-
indices = function() `arrow::Array`$dispatch(DictionaryArray__indices(self)),
133-
dictionary = function() `arrow::Array`$dispatch(DictionaryArray__dictionary(self))
134-
)
135-
)
136-

r/R/arrowExports.R

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/array.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,25 @@ std::shared_ptr<arrow::Array> DictionaryArray__dictionary(
119119
return array->dictionary();
120120
}
121121

122+
// [[arrow::export]]
123+
std::shared_ptr<arrow::Array> StructArray__field(
124+
const std::shared_ptr<arrow::StructArray>& array, int i) {
125+
return array->field(i);
126+
}
127+
128+
// [[arrow::export]]
129+
std::shared_ptr<arrow::Array> StructArray__GetFieldByName(
130+
const std::shared_ptr<arrow::StructArray>& array, const std::string& name) {
131+
return array->GetFieldByName(name);
132+
}
133+
134+
// [[arrow::export]]
135+
arrow::ArrayVector StructArray__Flatten(
136+
const std::shared_ptr<arrow::StructArray>& array) {
137+
int nf = array->num_fields();
138+
arrow::ArrayVector out(nf);
139+
STOP_IF_NOT_OK(array->Flatten(arrow::default_memory_pool(), &out));
140+
return out;
141+
}
142+
122143
#endif

r/src/array__to_vector.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,65 @@ class Converter_Dictionary : public Converter {
345345
}
346346
};
347347

348+
class Converter_Struct : public Converter {
349+
public:
350+
explicit Converter_Struct(const ArrayVector& arrays) : Converter(arrays), converters() {
351+
auto first_array =
352+
internal::checked_cast<arrow::StructArray*>(Converter::arrays_[0].get());
353+
int nf = first_array->num_fields();
354+
for (int i = 0; i < nf; i++) {
355+
converters.push_back(Converter::Make({first_array->field(i)}));
356+
}
357+
}
358+
359+
SEXP Allocate(R_xlen_t n) const {
360+
// allocate a data frame column to host each array
361+
auto first_array =
362+
internal::checked_cast<arrow::StructArray*>(Converter::arrays_[0].get());
363+
auto type = first_array->struct_type();
364+
int nf = first_array->num_fields();
365+
Rcpp::List out(nf);
366+
Rcpp::CharacterVector colnames(nf);
367+
for (int i = 0; i < nf; i++) {
368+
out[i] = converters[i]->Allocate(n);
369+
colnames[i] = type->child(i)->name();
370+
}
371+
IntegerVector rn(2);
372+
rn[0] = NA_INTEGER;
373+
rn[1] = -n;
374+
Rf_setAttrib(out, symbols::row_names, rn);
375+
Rf_setAttrib(out, R_NamesSymbol, colnames);
376+
Rf_setAttrib(out, R_ClassSymbol, Rf_mkString("data.frame"));
377+
return out;
378+
}
379+
380+
Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
381+
int nf = converters.size();
382+
for (int i = 0; i < nf; i++) {
383+
STOP_IF_NOT_OK(converters[i]->Ingest_all_nulls(VECTOR_ELT(data, i), start, n));
384+
}
385+
return Status::OK();
386+
}
387+
388+
Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
389+
R_xlen_t start, R_xlen_t n) const {
390+
auto struct_array = internal::checked_cast<arrow::StructArray*>(array.get());
391+
int nf = converters.size();
392+
// Flatten() deals with merging of nulls
393+
ArrayVector arrays(nf);
394+
STOP_IF_NOT_OK(struct_array->Flatten(default_memory_pool(), &arrays));
395+
for (int i = 0; i < nf; i++) {
396+
STOP_IF_NOT_OK(
397+
converters[i]->Ingest_some_nulls(VECTOR_ELT(data, i), arrays[i], start, n));
398+
}
399+
400+
return Status::OK();
401+
}
402+
403+
private:
404+
std::vector<std::shared_ptr<Converter>> converters;
405+
};
406+
348407
double ms_to_seconds(int64_t ms) { return static_cast<double>(ms / 1000); }
349408

350409
class Converter_Date64 : public Converter {
@@ -599,6 +658,9 @@ std::shared_ptr<Converter> Converter::Make(const ArrayVector& arrays) {
599658
case Type::DECIMAL:
600659
return std::make_shared<arrow::r::Converter_Decimal>(arrays);
601660

661+
case Type::STRUCT:
662+
return std::make_shared<arrow::r::Converter_Struct>(arrays);
663+
602664
default:
603665
break;
604666
}

r/src/arrowExports.cpp

Lines changed: 84 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/arrow_types.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ struct symbols {
3131
static SEXP xp;
3232
static SEXP dot_Internal;
3333
static SEXP inspect;
34+
static SEXP row_names;
3435
};
3536
} // namespace r
3637
} // namespace arrow
@@ -172,9 +173,9 @@ inline std::shared_ptr<T> extract(SEXP x) {
172173
#include <arrow/ipc/feather.h>
173174
#include <arrow/ipc/reader.h>
174175
#include <arrow/ipc/writer.h>
176+
#include <arrow/json/reader.h>
175177
#include <arrow/type.h>
176178
#include <arrow/util/compression.h>
177-
#include <arrow/json/reader.h>
178179

179180
RCPP_EXPOSED_ENUM_NODECL(arrow::Type::type)
180181
RCPP_EXPOSED_ENUM_NODECL(arrow::DateUnit)

r/src/datatype.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,4 +269,16 @@ bool DictionaryType__ordered(const std::shared_ptr<arrow::DictionaryType>& type)
269269
return type->ordered();
270270
}
271271

272+
// [[arrow::export]]
273+
std::shared_ptr<arrow::Field> StructType__GetFieldByName(
274+
const std::shared_ptr<arrow::StructType>& type, const std::string& name) {
275+
return type->GetFieldByName(name);
276+
}
277+
278+
// [[arrow::export]]
279+
int StructType__GetFieldIndex(const std::shared_ptr<arrow::StructType>& type,
280+
const std::string& name) {
281+
return type->GetFieldIndex(name);
282+
}
283+
272284
#endif

r/src/symbols.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ SEXP symbols::units = Rf_install("units");
2323
SEXP symbols::xp = Rf_install(".:xp:.");
2424
SEXP symbols::dot_Internal = Rf_install(".Internal");
2525
SEXP symbols::inspect = Rf_install("inspect");
26+
SEXP symbols::row_names = Rf_install("row.names");
2627

2728
void inspect(SEXP obj) {
2829
Rcpp::Shield<SEXP> call_inspect(Rf_lang2(symbols::inspect, obj));

0 commit comments

Comments
 (0)