-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-5718: [R] auto splice data frames in record_batch() and table() #4704
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5eabd20
0a4892c
61902ab
28bc470
79a7041
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -148,6 +148,9 @@ std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__InputStream__Schema( | |
| return batch; | ||
| } | ||
|
|
||
| namespace arrow { | ||
| namespace r { | ||
|
|
||
| arrow::Status check_consistent_array_size( | ||
| const std::vector<std::shared_ptr<arrow::Array>>& arrays, int64_t* num_rows) { | ||
| if (arrays.size()) { | ||
|
|
@@ -163,30 +166,69 @@ arrow::Status check_consistent_array_size( | |
| return arrow::Status::OK(); | ||
| } | ||
|
|
||
| // Counts how many fields (columns) `lst` expands to once unnamed data frames | ||
| // are auto spliced: an element with a non-empty name counts as one field, an | ||
| // unnamed element must inherit from "data.frame" and contributes XLENGTH(x) | ||
| // fields (one per column). | ||
| // | ||
| // lst: R list holding the arguments. | ||
| // out: receives the total; only written when the function succeeds. | ||
| // Returns Status::OK() on success, or Status::RError when an unnamed element | ||
| // is not a data frame. | ||
| Status count_fields(SEXP lst, int* out) { | ||
|   int res = 0; | ||
|   const R_xlen_t n = XLENGTH(lst); | ||
|   SEXP names = Rf_getAttrib(lst, R_NamesSymbol); | ||
|   // Rf_getAttrib() yields R_NilValue when `lst` carries no names attribute; | ||
|   // STRING_ELT() must not be applied to it, so in that case every element is | ||
|   // treated as unnamed. | ||
|   const bool has_names = !Rf_isNull(names); | ||
|   for (R_xlen_t i = 0; i < n; i++) { | ||
|     if (has_names && LENGTH(STRING_ELT(names, i)) > 0) { | ||
|       ++res; | ||
|       continue; | ||
|     } | ||
|     SEXP x = VECTOR_ELT(lst, i); | ||
|     if (!Rf_inherits(x, "data.frame")) { | ||
|       return Status::RError( | ||
|           "only data frames are allowed as unnamed arguments to be auto spliced"); | ||
|     } | ||
|     // explicit narrowing: the running total is reported through an int | ||
|     res += static_cast<int>(XLENGTH(x)); | ||
|   } | ||
|   *out = res; | ||
|   return Status::OK(); | ||
| } | ||
|
|
||
| } // namespace r | ||
| } // namespace arrow | ||
|
|
||
| std::shared_ptr<arrow::RecordBatch> RecordBatch__from_arrays__known_schema( | ||
| const std::shared_ptr<arrow::Schema>& schema, SEXP lst) { | ||
| R_xlen_t n_arrays = XLENGTH(lst); | ||
| if (schema->num_fields() != n_arrays) { | ||
| int num_fields; | ||
| STOP_IF_NOT_OK(arrow::r::count_fields(lst, &num_fields)); | ||
|
|
||
| if (schema->num_fields() != num_fields) { | ||
| Rcpp::stop("incompatible. schema has %d fields, and %d arrays are supplied", | ||
| schema->num_fields(), n_arrays); | ||
| schema->num_fields(), num_fields); | ||
| } | ||
|
|
||
| // convert lst to a vector of arrow::Array | ||
| std::vector<std::shared_ptr<arrow::Array>> arrays(n_arrays); | ||
| std::vector<std::shared_ptr<arrow::Array>> arrays(num_fields); | ||
| SEXP names = Rf_getAttrib(lst, R_NamesSymbol); | ||
| bool has_names = !Rf_isNull(names); | ||
|
|
||
| for (R_xlen_t i = 0; i < n_arrays; i++) { | ||
| if (has_names && schema->field(i)->name() != CHAR(STRING_ELT(names, i))) { | ||
| Rcpp::stop("field at index %d has name '%s' != '%s'", i + 1, | ||
| schema->field(i)->name(), CHAR(STRING_ELT(names, i))); | ||
| auto fill_array = [&arrays, &schema](int j, SEXP x, SEXP name) { | ||
| if (schema->field(j)->name() != CHAR(name)) { | ||
| Rcpp::stop("field at index %d has name '%s' != '%s'", j + 1, | ||
| schema->field(j)->name(), CHAR(name)); | ||
| } | ||
| arrays[j] = arrow::r::Array__from_vector(x, schema->field(j)->type(), false); | ||
| }; | ||
|
|
||
| for (R_xlen_t i = 0, j = 0; j < num_fields; i++) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you simplify the loop conditions? Make it iterate only over the elements of `lst`. The loop is now invariant on `j` (`out_idx` in the example), which makes it easier to follow and review/refactor.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bonus point, get rid of all of this and make a C++ iterator over |
||
| SEXP name_i = STRING_ELT(names, i); | ||
| SEXP x_i = VECTOR_ELT(lst, i); | ||
|
|
||
| if (LENGTH(name_i) == 0) { | ||
| SEXP names_x_i = Rf_getAttrib(x_i, R_NamesSymbol); | ||
| for (R_xlen_t k = 0; k < XLENGTH(x_i); k++, j++) { | ||
| fill_array(j, VECTOR_ELT(x_i, k), STRING_ELT(names_x_i, k)); | ||
| } | ||
| } else { | ||
| fill_array(j, x_i, name_i); | ||
| j++; | ||
| } | ||
| arrays[i] = | ||
| arrow::r::Array__from_vector(VECTOR_ELT(lst, i), schema->field(i)->type(), false); | ||
| } | ||
|
|
||
| int64_t num_rows = 0; | ||
| STOP_IF_NOT_OK(check_consistent_array_size(arrays, &num_rows)); | ||
| STOP_IF_NOT_OK(arrow::r::check_consistent_array_size(arrays, &num_rows)); | ||
| return arrow::RecordBatch::Make(schema, num_rows, arrays); | ||
| } | ||
|
|
||
|
|
@@ -197,38 +239,45 @@ std::shared_ptr<arrow::RecordBatch> RecordBatch__from_arrays(SEXP schema_sxp, SE | |
| arrow::r::extract<arrow::Schema>(schema_sxp), lst); | ||
| } | ||
|
|
||
| R_xlen_t n_arrays = XLENGTH(lst); | ||
| int num_fields; | ||
| STOP_IF_NOT_OK(arrow::r::count_fields(lst, &num_fields)); | ||
|
|
||
| // convert lst to a vector of arrow::Array | ||
| std::vector<std::shared_ptr<arrow::Array>> arrays(n_arrays); | ||
| for (R_xlen_t i = 0; i < n_arrays; i++) { | ||
| arrays[i] = Array__from_vector(VECTOR_ELT(lst, i), R_NilValue); | ||
| std::vector<std::shared_ptr<arrow::Array>> arrays(num_fields); | ||
| std::vector<std::string> arrays_names(num_fields); | ||
| SEXP names = Rf_getAttrib(lst, R_NamesSymbol); | ||
|
|
||
| auto fill_array = [&arrays, &arrays_names](int j, SEXP x, SEXP name) { | ||
| arrays[j] = Array__from_vector(x, R_NilValue); | ||
| arrays_names[j] = CHAR(name); | ||
| }; | ||
|
|
||
| for (R_xlen_t i = 0, j = 0; j < num_fields; i++) { | ||
| SEXP name_i = STRING_ELT(names, i); | ||
| SEXP x_i = VECTOR_ELT(lst, i); | ||
| if (LENGTH(name_i) == 0) { | ||
| SEXP names_x_i = Rf_getAttrib(x_i, R_NamesSymbol); | ||
| for (R_xlen_t k = 0; k < XLENGTH(x_i); k++, j++) { | ||
| fill_array(j, VECTOR_ELT(x_i, k), STRING_ELT(names_x_i, k)); | ||
| } | ||
| } else { | ||
| fill_array(j, x_i, name_i); | ||
| j++; | ||
| } | ||
| } | ||
|
|
||
| // generate schema from the types that have been inferred | ||
| std::shared_ptr<arrow::Schema> schema; | ||
| if (Rf_inherits(schema_sxp, "arrow::Schema")) { | ||
| schema = arrow::r::extract<arrow::Schema>(schema_sxp); | ||
| } else { | ||
| Rcpp::CharacterVector names(Rf_getAttrib(lst, R_NamesSymbol)); | ||
| std::vector<std::shared_ptr<arrow::Field>> fields(n_arrays); | ||
| for (R_xlen_t i = 0; i < n_arrays; i++) { | ||
| fields[i] = | ||
| std::make_shared<arrow::Field>(std::string(names[i]), arrays[i]->type()); | ||
| } | ||
| schema = std::make_shared<arrow::Schema>(std::move(fields)); | ||
| } | ||
|
|
||
| Rcpp::CharacterVector names(Rf_getAttrib(lst, R_NamesSymbol)); | ||
| std::vector<std::shared_ptr<arrow::Field>> fields(n_arrays); | ||
| for (R_xlen_t i = 0; i < n_arrays; i++) { | ||
| fields[i] = std::make_shared<arrow::Field>(std::string(names[i]), arrays[i]->type()); | ||
| std::vector<std::shared_ptr<arrow::Field>> fields(num_fields); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems like |
||
| for (R_xlen_t i = 0; i < num_fields; i++) { | ||
| fields[i] = std::make_shared<arrow::Field>(arrays_names[i], arrays[i]->type()); | ||
| } | ||
| schema = std::make_shared<arrow::Schema>(std::move(fields)); | ||
|
|
||
| // check all sizes are the same | ||
| int64_t num_rows = 0; | ||
| STOP_IF_NOT_OK(check_consistent_array_size(arrays, &num_rows)); | ||
| STOP_IF_NOT_OK(arrow::r::check_consistent_array_size(arrays, &num_rows)); | ||
|
|
||
| return arrow::RecordBatch::Make(schema, num_rows, arrays); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -117,54 +117,84 @@ std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst, SEXP schema_sxp) { | |
| return tab; | ||
| } | ||
|
|
||
| R_xlen_t n = XLENGTH(lst); | ||
| std::vector<std::shared_ptr<arrow::Column>> columns(n); | ||
| int num_fields; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm curious why the record_batch and table code isn't more shared
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It could be. Record batches only have to handle arrays, whereas tables have to handle arrays, chunked arrays, and columns. But I agree that the structure of the code is similar.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should refactor the flatten-for loop into a small function if they're all the same. |
||
| STOP_IF_NOT_OK(arrow::r::count_fields(lst, &num_fields)); | ||
|
|
||
| std::vector<std::shared_ptr<arrow::Column>> columns(num_fields); | ||
| std::shared_ptr<arrow::Schema> schema; | ||
|
|
||
| if (Rf_isNull(schema_sxp)) { | ||
| // infer the schema from the ... | ||
| std::vector<std::shared_ptr<arrow::Field>> fields(n); | ||
| Rcpp::CharacterVector names(Rf_getAttrib(lst, R_NamesSymbol)); | ||
| std::vector<std::shared_ptr<arrow::Field>> fields(num_fields); | ||
| SEXP names = Rf_getAttrib(lst, R_NamesSymbol); | ||
|
|
||
| for (R_xlen_t i = 0; i < n; i++) { | ||
| SEXP x = VECTOR_ELT(lst, i); | ||
| auto fill_one_column = [&columns, &fields](int j, SEXP x, SEXP name) { | ||
| if (Rf_inherits(x, "arrow::Column")) { | ||
| columns[i] = arrow::r::extract<arrow::Column>(x); | ||
| fields[i] = columns[i]->field(); | ||
| columns[j] = arrow::r::extract<arrow::Column>(x); | ||
| fields[j] = columns[j]->field(); | ||
| } else if (Rf_inherits(x, "arrow::ChunkedArray")) { | ||
| auto chunked_array = arrow::r::extract<arrow::ChunkedArray>(x); | ||
| fields[i] = | ||
| std::make_shared<arrow::Field>(std::string(names[i]), chunked_array->type()); | ||
| columns[i] = std::make_shared<arrow::Column>(fields[i], chunked_array); | ||
| fields[j] = std::make_shared<arrow::Field>(CHAR(name), chunked_array->type()); | ||
| columns[j] = std::make_shared<arrow::Column>(fields[j], chunked_array); | ||
| } else if (Rf_inherits(x, "arrow::Array")) { | ||
| auto array = arrow::r::extract<arrow::Array>(x); | ||
| fields[i] = std::make_shared<arrow::Field>(std::string(names[i]), array->type()); | ||
| columns[i] = std::make_shared<arrow::Column>(fields[i], array); | ||
| fields[j] = std::make_shared<arrow::Field>(CHAR(name), array->type()); | ||
| columns[j] = std::make_shared<arrow::Column>(fields[j], array); | ||
| } else { | ||
| auto array = Array__from_vector(x, R_NilValue); | ||
| fields[i] = std::make_shared<arrow::Field>(std::string(names[i]), array->type()); | ||
| columns[i] = std::make_shared<arrow::Column>(fields[i], array); | ||
| fields[j] = std::make_shared<arrow::Field>(CHAR(name), array->type()); | ||
| columns[j] = std::make_shared<arrow::Column>(fields[j], array); | ||
| } | ||
| }; | ||
|
|
||
| for (R_xlen_t i = 0, j = 0; j < num_fields; i++) { | ||
| SEXP name_i = STRING_ELT(names, i); | ||
| SEXP x_i = VECTOR_ELT(lst, i); | ||
|
|
||
| if (LENGTH(name_i) == 0) { | ||
| SEXP names_x_i = Rf_getAttrib(x_i, R_NamesSymbol); | ||
| for (R_xlen_t k = 0; k < XLENGTH(x_i); k++, j++) { | ||
| fill_one_column(j, VECTOR_ELT(x_i, k), STRING_ELT(names_x_i, k)); | ||
| } | ||
| } else { | ||
| fill_one_column(j, x_i, name_i); | ||
| j++; | ||
| } | ||
| } | ||
|
|
||
| schema = std::make_shared<arrow::Schema>(std::move(fields)); | ||
| } else { | ||
| // use the schema that is given | ||
| schema = arrow::r::extract<arrow::Schema>(schema_sxp); | ||
|
|
||
| for (R_xlen_t i = 0; i < n; i++) { | ||
| SEXP x = VECTOR_ELT(lst, i); | ||
| auto fill_one_column = [&columns, &schema](int j, SEXP x) { | ||
| if (Rf_inherits(x, "arrow::Column")) { | ||
| columns[i] = arrow::r::extract<arrow::Column>(x); | ||
| columns[j] = arrow::r::extract<arrow::Column>(x); | ||
| } else if (Rf_inherits(x, "arrow::ChunkedArray")) { | ||
| auto chunked_array = arrow::r::extract<arrow::ChunkedArray>(x); | ||
| columns[i] = std::make_shared<arrow::Column>(schema->field(i), chunked_array); | ||
| columns[j] = std::make_shared<arrow::Column>(schema->field(j), chunked_array); | ||
| } else if (Rf_inherits(x, "arrow::Array")) { | ||
| auto array = arrow::r::extract<arrow::Array>(x); | ||
| columns[i] = std::make_shared<arrow::Column>(schema->field(i), array); | ||
| columns[j] = std::make_shared<arrow::Column>(schema->field(j), array); | ||
| } else { | ||
| auto type = schema->field(i)->type(); | ||
| auto type = schema->field(j)->type(); | ||
| auto array = arrow::r::Array__from_vector(x, type, false); | ||
| columns[i] = std::make_shared<arrow::Column>(schema->field(i), array); | ||
| columns[j] = std::make_shared<arrow::Column>(schema->field(j), array); | ||
| } | ||
| }; | ||
|
|
||
| SEXP names = Rf_getAttrib(lst, R_NamesSymbol); | ||
| for (R_xlen_t i = 0, j = 0; j < num_fields; i++) { | ||
| SEXP name_i = STRING_ELT(names, i); | ||
| SEXP x_i = VECTOR_ELT(lst, i); | ||
|
|
||
| if (LENGTH(name_i) == 0) { | ||
| for (R_xlen_t k = 0; k < XLENGTH(x_i); k++, j++) { | ||
| fill_one_column(j, VECTOR_ELT(x_i, k)); | ||
| } | ||
| } else { | ||
| fill_one_column(j, x_i); | ||
| j++; | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right but why? IIUC this has to do with how the cpp code distinguishes things to autosplice, switching behavior on
nchar(name). That's a subtlety I'll probably forget by Monday, so it seems worth explaining.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Otherwise we'd have to treat the NULL case differently internally, as we did before; maybe I could do that instead.