From df6c1c6681df34f7c89bbc592847e1a9798a2932 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Feb 2022 15:29:01 -0400 Subject: [PATCH 01/66] initial working extension type --- r/DESCRIPTION | 1 + r/R/arrowExports.R | 16 +++ r/R/extension.R | 102 +++++++++++++++++++ r/src/arrowExports.cpp | 68 +++++++++++++ r/src/datatype.cpp | 2 + r/src/extension.cpp | 157 ++++++++++++++++++++++++++++++ r/tests/testthat/test-extension.R | 8 ++ 7 files changed, 354 insertions(+) create mode 100644 r/R/extension.R create mode 100644 r/src/extension.cpp create mode 100644 r/tests/testthat/test-extension.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 36a55c05b26..a5fb1ee9a4e 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -108,6 +108,7 @@ Collate: 'table.R' 'dplyr.R' 'duckdb.R' + 'extension.R' 'feather.R' 'field.R' 'filesystem.R' diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 5ef6312196d..254565e0b53 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1068,6 +1068,22 @@ compute___expr__type_id <- function(x, schema) { .Call(`_arrow_compute___expr__type_id`, x, schema) } +ExtensionType__initialize <- function(storage_type, extension_name, extension_metadata, r6_type_generator, r6_array_generator) { + .Call(`_arrow_ExtensionType__initialize`, storage_type, extension_name, extension_metadata, r6_type_generator, r6_array_generator) +} + +ExtensionType__extension_name <- function(type) { + .Call(`_arrow_ExtensionType__extension_name`, type) +} + +ExtensionType__Serialize <- function(type) { + .Call(`_arrow_ExtensionType__Serialize`, type) +} + +ExtensionType__storage_type <- function(type) { + .Call(`_arrow_ExtensionType__storage_type`, type) +} + ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level) { invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level)) } diff --git a/r/R/extension.R b/r/R/extension.R new file mode 100644 index 00000000000..6447bf6b257 --- /dev/null +++ b/r/R/extension.R @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ExtensionArray <- R6Class("ExtensionArray", + inherit = Array, + active = list( + type = function() { + # C++ call + } + ) +) + +ExtensionType <- R6Class("ExtensionType", + inherit = DataType, + public = list( + storage_type = function() { + ExtensionType__storage_type(self) + }, + + storage_id = function() { + self$storage_type()$id + }, + + extension_name = function() { + ExtensionType__extension_name(self) + }, + + ToString = function() { + self$extension_name() + }, + + Serialize = function() { + ExtensionType__Serialize(self) + }, + + .initialize = function(storage_type, extension_name, extension_metadata) { + abort("Not implemented") + } + ) +) + +MakeExtensionType <- function(storage_type, + extension_name, extension_metadata, + type_class, array_class = ExtensionArray) { + assert_is(type_class, "R6ClassGenerator") + assert_is(array_class, "R6ClassGenerator") + + type <- ExtensionType__initialize( + storage_type, + extension_name, + extension_metadata, + type_class, + array_class + ) + + type$.initialize(storage_type, extension_name, extension_metadata) + type +} + + +SimpleExtensionType <- R6Class("SimpleExtensionType", + inherit = ExtensionType, + public = list( + ToString = function() { + paste0(self$extension_name(), " <'", self.metadata(), "'>") + }, + + metadata = function() { + metadata_chr + }, + + .initialize = function(storage_type, extension_name, extension_metadata) { + private$metadata_chr <- rawToChar(extension_metadata) + } + ), + private = list( + metadata_chr = NULL + ) +) + +SimpleExtensionType$create <- function(storage_type, metadata = "") { + MakeExtensionType( + storage_type, + "arrow_r.simple_extension", + charToRaw(metadata), + type_class = SimpleExtensionType + ) +} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 0a29ed0872d..455fc19625e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4167,6 +4167,70 @@ extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){ } #endif +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +cpp11::sexp ExtensionType__initialize(const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_type_generator, cpp11::environment r6_array_generator); +extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_type_generator_sexp, SEXP r6_array_generator_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type storage_type(storage_type_sexp); + arrow::r::Input::type extension_name(extension_name_sexp); + arrow::r::Input::type extension_metadata(extension_metadata_sexp); + arrow::r::Input::type r6_type_generator(r6_type_generator_sexp); + arrow::r::Input::type r6_array_generator(r6_array_generator_sexp); + return cpp11::as_sexp(ExtensionType__initialize(storage_type, extension_name, extension_metadata, r6_type_generator, r6_array_generator)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_type_generator_sexp, SEXP r6_array_generator_sexp){ + Rf_error("Cannot call ExtensionType__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +std::string ExtensionType__extension_name(const std::shared_ptr& type); +extern "C" SEXP _arrow_ExtensionType__extension_name(SEXP type_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type type(type_sexp); + return cpp11::as_sexp(ExtensionType__extension_name(type)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExtensionType__extension_name(SEXP type_sexp){ + Rf_error("Cannot call ExtensionType__extension_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +cpp11::raws ExtensionType__Serialize(const std::shared_ptr& type); +extern "C" SEXP _arrow_ExtensionType__Serialize(SEXP type_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type type(type_sexp); + return cpp11::as_sexp(ExtensionType__Serialize(type)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExtensionType__Serialize(SEXP type_sexp){ + Rf_error("Cannot call ExtensionType__Serialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExtensionType__storage_type(const std::shared_ptr& type); +extern "C" SEXP _arrow_ExtensionType__storage_type(SEXP type_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type type(type_sexp); + return cpp11::as_sexp(ExtensionType__storage_type(type)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExtensionType__storage_type(SEXP type_sexp){ + Rf_error("Cannot call ExtensionType__storage_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // feather.cpp #if defined(ARROW_R_WITH_ARROW) void ipc___WriteFeather__Table(const std::shared_ptr& stream, const std::shared_ptr& table, int version, int chunk_size, arrow::Compression::type compression, int compression_level); @@ -8011,6 +8075,10 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, + { "_arrow_ExtensionType__initialize", (DL_FUNC) &_arrow_ExtensionType__initialize, 5}, + { "_arrow_ExtensionType__extension_name", (DL_FUNC) &_arrow_ExtensionType__extension_name, 1}, + { "_arrow_ExtensionType__Serialize", (DL_FUNC) &_arrow_ExtensionType__Serialize, 1}, + { "_arrow_ExtensionType__storage_type", (DL_FUNC) &_arrow_ExtensionType__storage_type, 1}, { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index fd083f66d41..68b6c8fada5 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -101,6 +101,8 @@ const char* r6_class_name::get( return "StructType"; case Type::DICTIONARY: return "DictionaryType"; + case Type::EXTENSION: + return "ExtensionType"; default: break; diff --git a/r/src/extension.cpp b/r/src/extension.cpp new file mode 100644 index 00000000000..66dbe686ae2 --- /dev/null +++ b/r/src/extension.cpp @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "./arrow_types.h" + +#if defined(ARROW_R_WITH_ARROW) + +#include +#include +#include + + +class RExtensionType: public arrow::ExtensionType { +public: + RExtensionType(const std::shared_ptr storage_type, + std::string extension_name, std::string extension_metadata, + cpp11::environment r6_type_generator, + cpp11::environment r6_array_generator) + : arrow::ExtensionType(storage_type), + extension_name_(extension_name), + extension_metadata_(extension_metadata), + r6_type_generator_(r6_type_generator), + r6_array_generator_(r6_array_generator) {} + + std::string extension_name() const { return extension_name_; } + bool ExtensionEquals(const arrow::ExtensionType &other) const; + std::shared_ptr MakeArray(std::shared_ptr data) const; + arrow::Result> Deserialize( + std::shared_ptr storage_type, + const std::string &serialized_data) const; + std::string Serialize() const { return extension_metadata_; } + + std::shared_ptr Clone() const; + +private: + std::string extension_name_; + std::string extension_metadata_; + cpp11::environment r6_type_generator_; + cpp11::environment r6_array_generator_; +}; + +class RExtensionArray: public arrow::ExtensionArray { +public: + + RExtensionArray(const std::shared_ptr &data, + const std::shared_ptr &type) + : arrow::ExtensionArray(data), + r_extension_type_(type) {} + +private: + const std::shared_ptr r_extension_type_; +}; + + +bool RExtensionType::ExtensionEquals(const arrow::ExtensionType &other) const { + if (other.extension_name() != extension_name()) { + return false; + } + + auto other_r = dynamic_cast(&other); + if (other_r == nullptr) { + return false; + } + + return other_r->extension_metadata_ == extension_metadata_; +} + +std::shared_ptr RExtensionType::MakeArray(std::shared_ptr data) const { + return std::make_shared(data, Clone()); +} + +arrow::Result> RExtensionType::Deserialize( + std::shared_ptr storage_type, + const std::string &serialized_data) const { + try { + cpp11::function make_extension_type(cpp11::package("arrow")["MakeExtensionType"]); + cpp11::sexp storage_type_r6 = cpp11::to_r6(storage_type); + cpp11::writable::raws serialized_data_raw(serialized_data); + + cpp11::sexp result = make_extension_type( + storage_type_r6, + extension_name(), + serialized_data_raw, + r6_type_generator_, + r6_array_generator_ + ); + + auto ptr = arrow::r::r6_to_pointer*>(result); + return *ptr; + } catch(std::exception& e) { + return arrow::Status::UnknownError(e.what()); + } +} + +std::shared_ptr RExtensionType::Clone() const { + return std::make_shared( + storage_type(), + extension_name_, + extension_metadata_, + r6_type_generator_, + r6_array_generator_ + ); +} + +// [[arrow::export]] +cpp11::sexp ExtensionType__initialize( + const std::shared_ptr& storage_type, + std::string extension_name, + cpp11::raws extension_metadata, + cpp11::environment r6_type_generator, + cpp11::environment r6_array_generator +) { + cpp11::function constructor(r6_type_generator["new"]); + std::string metadata_string(extension_metadata.begin(), extension_metadata.end()); + auto shared_ptr_ptr = new std::shared_ptr( + new RExtensionType( + storage_type, + extension_name, + metadata_string, + r6_type_generator, + r6_array_generator)); + auto external_ptr = cpp11::external_pointer>(shared_ptr_ptr); + return constructor(external_ptr); +} + +// [[arrow::export]] +std::string ExtensionType__extension_name(const std::shared_ptr& type) { + return type->extension_name(); +} + +// [[arrow::export]] +cpp11::raws ExtensionType__Serialize(const std::shared_ptr& type) { + std::string serialized_string = type->Serialize(); + cpp11::writable::raws bytes(serialized_string.begin(), serialized_string.end()); + return bytes; +} + +// [[arrow::export]] +std::shared_ptr ExtensionType__storage_type(const std::shared_ptr& type) { + return type->storage_type(); +} + +#endif diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R new file mode 100644 index 00000000000..19bd2ad2741 --- /dev/null +++ b/r/tests/testthat/test-extension.R @@ -0,0 +1,8 @@ + +test_that("extension types can be created", { + type <- SimpleExtensionType$create(int32(), "some custom metadata") + expect_identical(type$extension_name(), "arrow_r.simple_extension") + expect_true(type$storage_type() == int32()) + expect_identical(type$storage_id(), int32()$id) + expect_identical(type$Serialize(), charToRaw("some custom metadata")) +}) From d7ea064502f5c50436c45ac0e6becdce56b923f2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Feb 2022 15:51:34 -0400 Subject: [PATCH 02/66] simplify implementation --- r/R/extension.R | 47 ++++++++----------------------- r/tests/testthat/test-extension.R | 9 +++++- 2 files changed, 20 insertions(+), 36 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 6447bf6b257..5777238a1d8 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -40,22 +40,29 @@ ExtensionType <- R6Class("ExtensionType", }, ToString = function() { - self$extension_name() + metadata_utf8 <- rawToChar(self$Serialize()) + Encoding(metadata_utf8) <- "UTF-8" + paste0(class(self)[1], " <", metadata_utf8, ">") }, Serialize = function() { ExtensionType__Serialize(self) }, - .initialize = function(storage_type, extension_name, extension_metadata) { - abort("Not implemented") + .Deserialize = function(storage_type, extension_name, extension_metadata) { + # Do nothing by default but allow other classes to override this method + # to populate R6 class members. } ) ) + MakeExtensionType <- function(storage_type, extension_name, extension_metadata, - type_class, array_class = ExtensionArray) { + type_class = ExtensionType, + array_class = ExtensionArray) { + assert_that(is.string(extension_name), is.raw(extension_metadata)) + assert_is(storage_type, "DataType") assert_is(type_class, "R6ClassGenerator") assert_is(array_class, "R6ClassGenerator") @@ -67,36 +74,6 @@ MakeExtensionType <- function(storage_type, array_class ) - type$.initialize(storage_type, extension_name, extension_metadata) + type$.Deserialize(storage_type, extension_name, extension_metadata) type } - - -SimpleExtensionType <- R6Class("SimpleExtensionType", - inherit = ExtensionType, - public = list( - ToString = function() { - paste0(self$extension_name(), " <'", self.metadata(), "'>") - }, - - metadata = function() { - metadata_chr - }, - - .initialize = function(storage_type, extension_name, extension_metadata) { - private$metadata_chr <- rawToChar(extension_metadata) - } - ), - private = list( - metadata_chr = NULL - ) -) - -SimpleExtensionType$create <- function(storage_type, metadata = "") { - MakeExtensionType( - storage_type, - "arrow_r.simple_extension", - charToRaw(metadata), - type_class = SimpleExtensionType - ) -} diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 19bd2ad2741..81882d23ae0 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -1,8 +1,15 @@ test_that("extension types can be created", { - type <- SimpleExtensionType$create(int32(), "some custom metadata") + type <- MakeExtensionType( + int32(), + "arrow_r.simple_extension", + charToRaw("some custom metadata"), + ) + + expect_r6_class(type, "ExtensionType") expect_identical(type$extension_name(), "arrow_r.simple_extension") expect_true(type$storage_type() == int32()) expect_identical(type$storage_id(), int32()$id) expect_identical(type$Serialize(), charToRaw("some custom metadata")) + expect_identical(type$ToString(), "ExtensionType ") }) From 7d69c714ff66cce22b72f9e9c659788862be0890 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Feb 2022 15:55:51 -0400 Subject: [PATCH 03/66] test subclasses --- r/tests/testthat/test-extension.R | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 81882d23ae0..6685db516c6 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -13,3 +13,31 @@ test_that("extension types can be created", { expect_identical(type$Serialize(), charToRaw("some custom metadata")) expect_identical(type$ToString(), "ExtensionType ") }) + +test_that("extension type subclasses work", { + SomeExtensionTypeSubclass <- R6Class( + "SomeExtensionTypeSubclass", inherit = ExtensionType, + public = list( + some_custom_method = function() { + private$some_custom_field + }, + + .Deserialize = function(storage_type, extension_name, extension_metadata) { + private$some_custom_field <- head(extension_metadata, 5) + } + ), + private = list( + some_custom_field = NULL + ) + ) + + type <- MakeExtensionType( + int32(), + "some_extension_subclass", + charToRaw("some custom metadata"), + type_class = SomeExtensionTypeSubclass + ) + + expect_r6_class(type, "SomeExtensionTypeSubclass") + expect_identical(type$some_custom_method(), charToRaw("some ")) +}) From 20557763b681a4475074002e58f08b037dae467a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Feb 2022 16:33:26 -0400 Subject: [PATCH 04/66] with some array infrastructure --- r/R/arrowExports.R | 16 ++++++++ r/R/extension.R | 10 +++++ r/src/arrowExports.cpp | 67 +++++++++++++++++++++++++++++++ r/src/extension.cpp | 28 +++++++++++-- r/tests/testthat/test-extension.R | 21 ++++++++++ 5 files changed, 139 insertions(+), 3 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 254565e0b53..fc4ea5a1ef2 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1084,6 +1084,22 @@ ExtensionType__storage_type <- function(type) { .Call(`_arrow_ExtensionType__storage_type`, type) } +ExtensionType__MakeArray <- function(type, data) { + .Call(`_arrow_ExtensionType__MakeArray`, type, data) +} + +ExtensionArray__storage <- function(array) { + .Call(`_arrow_ExtensionArray__storage`, array) +} + +RegisterRExtensionType <- function(type) { + invisible(.Call(`_arrow_RegisterRExtensionType`, type)) +} + +UnregisterRExtensionType <- function(type_name) { + invisible(.Call(`_arrow_UnregisterRExtensionType`, type_name)) +} + ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level) { invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level)) } diff --git a/r/R/extension.R b/r/R/extension.R index 5777238a1d8..f0444bc36b8 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -17,6 +17,11 @@ ExtensionArray <- R6Class("ExtensionArray", inherit = Array, + public = list( + storage = function() { + ExtensionArray__storage(self) + } + ), active = list( type = function() { # C++ call @@ -49,6 +54,11 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__Serialize(self) }, + MakeArray = function(data) { + assert_is(data, "ArrayData") + ExtensionType__MakeArray(self, data) + }, + .Deserialize = function(storage_type, extension_name, extension_metadata) { # Do nothing by default but allow other classes to override this method # to populate R6 class members. diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 455fc19625e..095fc47635e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4231,6 +4231,69 @@ extern "C" SEXP _arrow_ExtensionType__storage_type(SEXP type_sexp){ } #endif +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExtensionType__MakeArray(const std::shared_ptr& type, const std::shared_ptr& data); +extern "C" SEXP _arrow_ExtensionType__MakeArray(SEXP type_sexp, SEXP data_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type type(type_sexp); + arrow::r::Input&>::type data(data_sexp); + return cpp11::as_sexp(ExtensionType__MakeArray(type, data)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExtensionType__MakeArray(SEXP type_sexp, SEXP data_sexp){ + Rf_error("Cannot call ExtensionType__MakeArray(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ExtensionArray__storage(const std::shared_ptr& array); +extern "C" SEXP _arrow_ExtensionArray__storage(SEXP array_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type array(array_sexp); + return cpp11::as_sexp(ExtensionArray__storage(array)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExtensionArray__storage(SEXP array_sexp){ + Rf_error("Cannot call ExtensionArray__storage(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +void RegisterRExtensionType(const std::shared_ptr& type); +extern "C" SEXP _arrow_RegisterRExtensionType(SEXP type_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type type(type_sexp); + RegisterRExtensionType(type); + return R_NilValue; +END_CPP11 +} +#else +extern "C" SEXP _arrow_RegisterRExtensionType(SEXP type_sexp){ + Rf_error("Cannot call RegisterRExtensionType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +void UnregisterRExtensionType(std::string type_name); +extern "C" SEXP _arrow_UnregisterRExtensionType(SEXP type_name_sexp){ +BEGIN_CPP11 + arrow::r::Input::type type_name(type_name_sexp); + UnregisterRExtensionType(type_name); + return R_NilValue; +END_CPP11 +} +#else +extern "C" SEXP _arrow_UnregisterRExtensionType(SEXP type_name_sexp){ + Rf_error("Cannot call UnregisterRExtensionType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // feather.cpp #if defined(ARROW_R_WITH_ARROW) void ipc___WriteFeather__Table(const std::shared_ptr& stream, const std::shared_ptr& table, int version, int chunk_size, arrow::Compression::type compression, int compression_level); @@ -8079,6 +8142,10 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExtensionType__extension_name", (DL_FUNC) &_arrow_ExtensionType__extension_name, 1}, { "_arrow_ExtensionType__Serialize", (DL_FUNC) &_arrow_ExtensionType__Serialize, 1}, { "_arrow_ExtensionType__storage_type", (DL_FUNC) &_arrow_ExtensionType__storage_type, 1}, + { "_arrow_ExtensionType__MakeArray", (DL_FUNC) &_arrow_ExtensionType__MakeArray, 2}, + { "_arrow_ExtensionArray__storage", (DL_FUNC) &_arrow_ExtensionArray__storage, 1}, + { "_arrow_RegisterRExtensionType", (DL_FUNC) &_arrow_RegisterRExtensionType, 1}, + { "_arrow_UnregisterRExtensionType", (DL_FUNC) &_arrow_UnregisterRExtensionType, 1}, { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, diff --git a/r/src/extension.cpp b/r/src/extension.cpp index 66dbe686ae2..e7a022236cc 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -71,15 +71,15 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType &other) const { return false; } - auto other_r = dynamic_cast(&other); - if (other_r == nullptr) { + if (other.Serialize() != Serialize()) { return false; } - return other_r->extension_metadata_ == extension_metadata_; + return true; } std::shared_ptr RExtensionType::MakeArray(std::shared_ptr data) const { + data->type = Clone(); return std::make_shared(data, Clone()); } @@ -154,4 +154,26 @@ std::shared_ptr ExtensionType__storage_type(const std::shared_p return type->storage_type(); } +// [[arrow::export]] +std::shared_ptr ExtensionType__MakeArray(const std::shared_ptr& type, + const std::shared_ptr& data) { + return type->MakeArray(data); +} + +// [[arrow::export]] +std::shared_ptr ExtensionArray__storage(const std::shared_ptr& array) { + return array->storage(); +} + +// [[arrow::export]] +void RegisterRExtensionType(const std::shared_ptr& type) { + auto ext_type = std::dynamic_pointer_cast(type); + StopIfNotOk(arrow::RegisterExtensionType(ext_type)); +} + +// [[arrow::export]] +void UnregisterRExtensionType(std::string type_name) { + StopIfNotOk(arrow::UnregisterExtensionType(type_name)); +} + #endif diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 6685db516c6..fb433d946b6 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -12,6 +12,14 @@ test_that("extension types can be created", { expect_identical(type$storage_id(), int32()$id) expect_identical(type$Serialize(), charToRaw("some custom metadata")) expect_identical(type$ToString(), "ExtensionType ") + + storage <- Array$create(1:10) + array <- type$MakeArray(storage$data()) + expect_r6_class(array, "ExtensionArray") + expect_r6_class(array$type, "ExtensionType") + + expect_true(array$type == type) + expect_true(array$storage() == storage) }) test_that("extension type subclasses work", { @@ -31,6 +39,10 @@ test_that("extension type subclasses work", { ) ) + SomeExtensionArraySubclass <- R6Class( + "SomeExtensionArraySubclass", inherit = ExtensionArray + ) + type <- MakeExtensionType( int32(), "some_extension_subclass", @@ -40,4 +52,13 @@ test_that("extension type subclasses work", { expect_r6_class(type, "SomeExtensionTypeSubclass") expect_identical(type$some_custom_method(), charToRaw("some ")) + + + +}) + +test_that("extension types can be registered", { + }) + + From eec9407a863f946a718232d6c82d04f286805388 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Feb 2022 17:45:48 -0400 Subject: [PATCH 05/66] with registration of custom subclasses --- r/R/arrowExports.R | 8 ++--- r/R/extension.R | 52 ++++++++++++++++++++++++------- r/src/array.cpp | 2 ++ r/src/arrowExports.cpp | 24 +++++++------- r/src/extension.cpp | 22 +++---------- r/tests/testthat/test-extension.R | 15 +++++---- 6 files changed, 73 insertions(+), 50 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index fc4ea5a1ef2..ac09d65c067 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1092,12 +1092,12 @@ ExtensionArray__storage <- function(array) { .Call(`_arrow_ExtensionArray__storage`, array) } -RegisterRExtensionType <- function(type) { - invisible(.Call(`_arrow_RegisterRExtensionType`, type)) +arrow__RegisterRExtensionType <- function(type) { + invisible(.Call(`_arrow_arrow__RegisterRExtensionType`, type)) } -UnregisterRExtensionType <- function(type_name) { - invisible(.Call(`_arrow_UnregisterRExtensionType`, type_name)) +arrow__UnregisterRExtensionType <- function(type_name) { + invisible(.Call(`_arrow_arrow__UnregisterRExtensionType`, type_name)) } ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level) { diff --git a/r/R/extension.R b/r/R/extension.R index f0444bc36b8..c6fe958195b 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -18,14 +18,13 @@ ExtensionArray <- R6Class("ExtensionArray", inherit = Array, public = list( + initialize = function(xp) { + super$initialize(xp) + }, + storage = function() { ExtensionArray__storage(self) } - ), - active = list( - type = function() { - # C++ call - } ) ) @@ -44,12 +43,6 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__extension_name(self) }, - ToString = function() { - metadata_utf8 <- rawToChar(self$Serialize()) - Encoding(metadata_utf8) <- "UTF-8" - paste0(class(self)[1], " <", metadata_utf8, ">") - }, - Serialize = function() { ExtensionType__Serialize(self) }, @@ -59,6 +52,12 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__MakeArray(self, data) }, + ToString = function() { + metadata_utf8 <- rawToChar(self$Serialize()) + Encoding(metadata_utf8) <- "UTF-8" + paste0(class(self)[1], " <", metadata_utf8, ">") + }, + .Deserialize = function(storage_type, extension_name, extension_metadata) { # Do nothing by default but allow other classes to override this method # to populate R6 class members. @@ -66,6 +65,19 @@ ExtensionType <- R6Class("ExtensionType", ) ) +ExtensionType$.default_new <- ExtensionType$new +ExtensionType$new <- function(xp) { + superclass <- ExtensionType$.default_new(xp) + registered_type <- extension_type_registry[[superclass$extension_name()]] + if (is.null(registered_type)) { + return(superclass) + } + + type <- registered_type$clone() + type[[".:xp:."]] <- xp + type +} + MakeExtensionType <- function(storage_type, extension_name, extension_metadata, @@ -87,3 +99,21 @@ MakeExtensionType <- function(storage_type, type$.Deserialize(storage_type, extension_name, extension_metadata) type } + +RegisterExtensionType <- function(type) { + assert_is(type, "ExtensionType") + arrow__RegisterRExtensionType(type) + extension_type_registry[[type$extension_name()]] <- type + invisible(type) +} + +UnregisterExtensionType <- function(extension_name) { + arrow__UnregisterRExtensionType(extension_name) + result <- extension_type_registry[[extension_name]] + if (!is.null(result)) { + rm(list = extension_name, envir = extension_type_registry) + } + invisible(result) +} + +extension_type_registry <- new.env(parent = emptyenv()) diff --git a/r/src/array.cpp b/r/src/array.cpp index 8fcc96e0d42..16490bbaeca 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -41,6 +41,8 @@ const char* r6_class_name::get(const std::shared_ptr return "FixedSizeListArray"; case arrow::Type::MAP: return "MapArray"; + case arrow::Type::EXTENSION: + return "ExtensionArray"; default: return "Array"; diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 095fc47635e..e46834a5d46 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4264,33 +4264,33 @@ extern "C" SEXP _arrow_ExtensionArray__storage(SEXP array_sexp){ // extension.cpp #if defined(ARROW_R_WITH_ARROW) -void RegisterRExtensionType(const std::shared_ptr& type); -extern "C" SEXP _arrow_RegisterRExtensionType(SEXP type_sexp){ +void arrow__RegisterRExtensionType(const std::shared_ptr& type); +extern "C" SEXP _arrow_arrow__RegisterRExtensionType(SEXP type_sexp){ BEGIN_CPP11 arrow::r::Input&>::type type(type_sexp); - RegisterRExtensionType(type); + arrow__RegisterRExtensionType(type); return R_NilValue; END_CPP11 } #else -extern "C" SEXP _arrow_RegisterRExtensionType(SEXP type_sexp){ - Rf_error("Cannot call RegisterRExtensionType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +extern "C" SEXP _arrow_arrow__RegisterRExtensionType(SEXP type_sexp){ + Rf_error("Cannot call arrow__RegisterRExtensionType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif // extension.cpp #if defined(ARROW_R_WITH_ARROW) -void UnregisterRExtensionType(std::string type_name); -extern "C" SEXP _arrow_UnregisterRExtensionType(SEXP type_name_sexp){ +void arrow__UnregisterRExtensionType(std::string type_name); +extern "C" SEXP _arrow_arrow__UnregisterRExtensionType(SEXP type_name_sexp){ BEGIN_CPP11 arrow::r::Input::type type_name(type_name_sexp); - UnregisterRExtensionType(type_name); + arrow__UnregisterRExtensionType(type_name); return R_NilValue; END_CPP11 } #else -extern "C" SEXP _arrow_UnregisterRExtensionType(SEXP type_name_sexp){ - Rf_error("Cannot call UnregisterRExtensionType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +extern "C" SEXP _arrow_arrow__UnregisterRExtensionType(SEXP type_name_sexp){ + Rf_error("Cannot call arrow__UnregisterRExtensionType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -8144,8 +8144,8 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExtensionType__storage_type", (DL_FUNC) &_arrow_ExtensionType__storage_type, 1}, { "_arrow_ExtensionType__MakeArray", (DL_FUNC) &_arrow_ExtensionType__MakeArray, 2}, { "_arrow_ExtensionArray__storage", (DL_FUNC) &_arrow_ExtensionArray__storage, 1}, - { "_arrow_RegisterRExtensionType", (DL_FUNC) &_arrow_RegisterRExtensionType, 1}, - { "_arrow_UnregisterRExtensionType", (DL_FUNC) &_arrow_UnregisterRExtensionType, 1}, + { "_arrow_arrow__RegisterRExtensionType", (DL_FUNC) &_arrow_arrow__RegisterRExtensionType, 1}, + { "_arrow_arrow__UnregisterRExtensionType", (DL_FUNC) &_arrow_arrow__UnregisterRExtensionType, 1}, { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, diff --git a/r/src/extension.cpp b/r/src/extension.cpp index e7a022236cc..e3c61b30447 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -53,19 +53,6 @@ class RExtensionType: public arrow::ExtensionType { cpp11::environment r6_array_generator_; }; -class RExtensionArray: public arrow::ExtensionArray { -public: - - RExtensionArray(const std::shared_ptr &data, - const std::shared_ptr &type) - : arrow::ExtensionArray(data), - r_extension_type_(type) {} - -private: - const std::shared_ptr r_extension_type_; -}; - - bool RExtensionType::ExtensionEquals(const arrow::ExtensionType &other) const { if (other.extension_name() != extension_name()) { return false; @@ -79,8 +66,9 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType &other) const { } std::shared_ptr RExtensionType::MakeArray(std::shared_ptr data) const { - data->type = Clone(); - return std::make_shared(data, Clone()); + std::shared_ptr new_data = data->Copy(); + new_data->type = Clone(); + return std::make_shared(new_data); } arrow::Result> RExtensionType::Deserialize( @@ -166,13 +154,13 @@ std::shared_ptr ExtensionArray__storage(const std::shared_ptr& type) { +void arrow__RegisterRExtensionType(const std::shared_ptr& type) { auto ext_type = std::dynamic_pointer_cast(type); StopIfNotOk(arrow::RegisterExtensionType(ext_type)); } // [[arrow::export]] -void UnregisterRExtensionType(std::string type_name) { +void arrow__UnregisterRExtensionType(std::string type_name) { StopIfNotOk(arrow::UnregisterExtensionType(type_name)); } diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index fb433d946b6..faa38598012 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -19,7 +19,7 @@ test_that("extension types can be created", { expect_r6_class(array$type, "ExtensionType") expect_true(array$type == type) - expect_true(array$storage() == storage) + expect_true(all(array$storage() == storage)) }) test_that("extension type subclasses work", { @@ -53,12 +53,15 @@ test_that("extension type subclasses work", { expect_r6_class(type, "SomeExtensionTypeSubclass") expect_identical(type$some_custom_method(), charToRaw("some ")) + RegisterExtensionType(type) + ptr_type <- allocate_arrow_schema() + type$export_to_c(ptr_type) + type2 <- DataType$import_from_c(ptr_type) + delete_arrow_schema(ptr_type) -}) - -test_that("extension types can be registered", { + expect_identical(type2$extension_name(), "some_extension_subclass") + expect_identical(type2$some_custom_method(), type$some_custom_method()) + expect_identical(UnregisterExtensionType("some_extension_subclass"), type) }) - - From 6abc0dbde205b73ccae3772df40e2193f955dbae Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Feb 2022 18:15:58 -0400 Subject: [PATCH 06/66] get array and type subclasses working better --- r/R/extension.R | 49 ++++++++++++++++++++++++++----- r/tests/testthat/test-extension.R | 14 +++++++-- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index c6fe958195b..3c8ecb8c8c8 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -18,19 +18,45 @@ ExtensionArray <- R6Class("ExtensionArray", inherit = Array, public = list( - initialize = function(xp) { - super$initialize(xp) - }, - storage = function() { ExtensionArray__storage(self) } ) ) +ExtensionArray$.default_new <- ExtensionArray$new +ExtensionArray$new <- function(xp) { + superclass <- ExtensionArray$.default_new(xp) + registered_type <- extension_type_registry[[superclass$type$extension_name()]] + if (is.null(registered_type)) { + return(superclass) + } + + class <- registered_type$.__enclos_env__$private$array_class + if (inherits(superclass, class$classname)) { + return(superclass) + } + + class$new(xp) +} + ExtensionType <- R6Class("ExtensionType", inherit = DataType, public = list( + initialize = function(xp) { + super$initialize(xp) + self$.Deserialize( + self$storage_type(), + self$extension_name(), + self$Serialize() + ) + }, + + .set_r6_constructors = function(type_class, array_class) { + private$type_class <- type_class + private$array_class <- array_class + }, + storage_type = function() { ExtensionType__storage_type(self) }, @@ -52,6 +78,11 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__MakeArray(self, data) }, + WrapArray = function(array) { + assert_is(array, "Array") + self$MakeArray(array$data()) + }, + ToString = function() { metadata_utf8 <- rawToChar(self$Serialize()) Encoding(metadata_utf8) <- "UTF-8" @@ -62,6 +93,11 @@ ExtensionType <- R6Class("ExtensionType", # Do nothing by default but allow other classes to override this method # to populate R6 class members. } + ), + + private = list( + type_class = NULL, + array_class = NULL ) ) @@ -73,9 +109,7 @@ ExtensionType$new <- function(xp) { return(superclass) } - type <- registered_type$clone() - type[[".:xp:."]] <- xp - type + registered_type$.__enclos_env__$private$type_class$new(xp) } @@ -96,6 +130,7 @@ MakeExtensionType <- function(storage_type, array_class ) + type$.set_r6_constructors(type_class, array_class) type$.Deserialize(storage_type, extension_name, extension_metadata) type } diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index faa38598012..3542bef0856 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -40,14 +40,20 @@ test_that("extension type subclasses work", { ) SomeExtensionArraySubclass <- R6Class( - "SomeExtensionArraySubclass", inherit = ExtensionArray + "SomeExtensionArraySubclass", inherit = ExtensionArray, + public = list( + some_custom_method = function() { + self$type$some_custom_method() + } + ) ) type <- MakeExtensionType( int32(), "some_extension_subclass", charToRaw("some custom metadata"), - type_class = SomeExtensionTypeSubclass + type_class = SomeExtensionTypeSubclass, + array_class = SomeExtensionArraySubclass ) expect_r6_class(type, "SomeExtensionTypeSubclass") @@ -63,5 +69,9 @@ test_that("extension type subclasses work", { expect_identical(type2$extension_name(), "some_extension_subclass") expect_identical(type2$some_custom_method(), type$some_custom_method()) + array <- type$WrapArray(Array$create(1:10)) + expect_r6_class(array, "SomeExtensionArraySubclass") + expect_identical(array$some_custom_method(), type$some_custom_method()) + expect_identical(UnregisterExtensionType("some_extension_subclass"), type) }) From be1992cf2d147d8f584a0c940e6c6d656bf178ce Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Feb 2022 18:16:32 -0400 Subject: [PATCH 07/66] clang-format --- r/src/extension.cpp | 88 ++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/r/src/extension.cpp b/r/src/extension.cpp index e3c61b30447..b229f60f484 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -19,41 +19,40 @@ #if defined(ARROW_R_WITH_ARROW) -#include -#include #include +#include +#include - -class RExtensionType: public arrow::ExtensionType { -public: +class RExtensionType : public arrow::ExtensionType { + public: RExtensionType(const std::shared_ptr storage_type, std::string extension_name, std::string extension_metadata, cpp11::environment r6_type_generator, cpp11::environment r6_array_generator) - : arrow::ExtensionType(storage_type), - extension_name_(extension_name), - extension_metadata_(extension_metadata), - r6_type_generator_(r6_type_generator), - r6_array_generator_(r6_array_generator) {} + : arrow::ExtensionType(storage_type), + extension_name_(extension_name), + extension_metadata_(extension_metadata), + r6_type_generator_(r6_type_generator), + r6_array_generator_(r6_array_generator) {} std::string extension_name() const { return extension_name_; } - bool ExtensionEquals(const arrow::ExtensionType &other) const; + bool ExtensionEquals(const arrow::ExtensionType& other) const; std::shared_ptr MakeArray(std::shared_ptr data) const; arrow::Result> Deserialize( std::shared_ptr storage_type, - const std::string &serialized_data) const; + const std::string& serialized_data) const; std::string Serialize() const { return extension_metadata_; } std::shared_ptr Clone() const; -private: + private: std::string extension_name_; std::string extension_metadata_; cpp11::environment r6_type_generator_; cpp11::environment r6_array_generator_; }; -bool RExtensionType::ExtensionEquals(const arrow::ExtensionType &other) const { +bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { if (other.extension_name() != extension_name()) { return false; } @@ -65,7 +64,8 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType &other) const { return true; } -std::shared_ptr RExtensionType::MakeArray(std::shared_ptr data) const { +std::shared_ptr RExtensionType::MakeArray( + std::shared_ptr data) const { std::shared_ptr new_data = data->Copy(); new_data->type = Clone(); return std::make_shared(new_data); @@ -73,60 +73,47 @@ std::shared_ptr RExtensionType::MakeArray(std::shared_ptr> RExtensionType::Deserialize( std::shared_ptr storage_type, - const std::string &serialized_data) const { + const std::string& serialized_data) const { try { cpp11::function make_extension_type(cpp11::package("arrow")["MakeExtensionType"]); cpp11::sexp storage_type_r6 = cpp11::to_r6(storage_type); cpp11::writable::raws serialized_data_raw(serialized_data); - cpp11::sexp result = make_extension_type( - storage_type_r6, - extension_name(), - serialized_data_raw, - r6_type_generator_, - r6_array_generator_ - ); + cpp11::sexp result = + make_extension_type(storage_type_r6, extension_name(), serialized_data_raw, + r6_type_generator_, r6_array_generator_); auto ptr = arrow::r::r6_to_pointer*>(result); return *ptr; - } catch(std::exception& e) { + } catch (std::exception& e) { return arrow::Status::UnknownError(e.what()); } } std::shared_ptr RExtensionType::Clone() const { - return std::make_shared( - storage_type(), - extension_name_, - extension_metadata_, - r6_type_generator_, - r6_array_generator_ - ); + return std::make_shared(storage_type(), extension_name_, + extension_metadata_, r6_type_generator_, + r6_array_generator_); } // [[arrow::export]] cpp11::sexp ExtensionType__initialize( - const std::shared_ptr& storage_type, - std::string extension_name, - cpp11::raws extension_metadata, - cpp11::environment r6_type_generator, - cpp11::environment r6_array_generator -) { + const std::shared_ptr& storage_type, std::string extension_name, + cpp11::raws extension_metadata, cpp11::environment r6_type_generator, + cpp11::environment r6_array_generator) { cpp11::function constructor(r6_type_generator["new"]); std::string metadata_string(extension_metadata.begin(), extension_metadata.end()); auto shared_ptr_ptr = new std::shared_ptr( - new RExtensionType( - storage_type, - extension_name, - metadata_string, - r6_type_generator, - r6_array_generator)); - auto external_ptr = cpp11::external_pointer>(shared_ptr_ptr); + new RExtensionType(storage_type, extension_name, metadata_string, r6_type_generator, + r6_array_generator)); + auto external_ptr = + cpp11::external_pointer>(shared_ptr_ptr); return constructor(external_ptr); } // [[arrow::export]] -std::string ExtensionType__extension_name(const std::shared_ptr& type) { +std::string ExtensionType__extension_name( + const std::shared_ptr& type) { return type->extension_name(); } @@ -138,18 +125,21 @@ cpp11::raws ExtensionType__Serialize(const std::shared_ptr } // [[arrow::export]] -std::shared_ptr ExtensionType__storage_type(const std::shared_ptr& type) { +std::shared_ptr ExtensionType__storage_type( + const std::shared_ptr& type) { return type->storage_type(); } // [[arrow::export]] -std::shared_ptr ExtensionType__MakeArray(const std::shared_ptr& type, - const std::shared_ptr& data) { +std::shared_ptr ExtensionType__MakeArray( + const std::shared_ptr& type, + const std::shared_ptr& data) { return type->MakeArray(data); } // [[arrow::export]] -std::shared_ptr ExtensionArray__storage(const std::shared_ptr& array) { +std::shared_ptr ExtensionArray__storage( + const std::shared_ptr& array) { return array->storage(); } From 45af0322aefa61662ab3302820f2bee5f14781f5 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Feb 2022 21:47:12 -0400 Subject: [PATCH 08/66] license --- r/tests/testthat/test-extension.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 3542bef0856..55ef3e89201 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. test_that("extension types can be created", { type <- MakeExtensionType( From 9184a6058c8ab1b92f956e5bd8b89650880d4764 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 14 Mar 2022 14:10:57 -0300 Subject: [PATCH 09/66] Update r/R/extension.R Co-authored-by: Jonathan Keane --- r/R/extension.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/extension.R b/r/R/extension.R index 3c8ecb8c8c8..46e1c753d68 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -114,7 +114,8 @@ ExtensionType$new <- function(xp) { MakeExtensionType <- function(storage_type, - extension_name, extension_metadata, + extension_name, + extension_metadata, type_class = ExtensionType, array_class = ExtensionArray) { assert_that(is.string(extension_name), is.raw(extension_metadata)) From 88145dadebbd819ccc020aaa0ce72463eda236e5 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 14 Mar 2022 15:24:21 -0300 Subject: [PATCH 10/66] make sure ExtensionEquals can be overridden from R --- r/R/extension.R | 9 +++++- r/src/extension.cpp | 45 +++++++++++++++++++++------- r/tests/testthat/test-extension.R | 49 +++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 12 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 46e1c753d68..af18cb88cdb 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -92,6 +92,13 @@ ExtensionType <- R6Class("ExtensionType", .Deserialize = function(storage_type, extension_name, extension_metadata) { # Do nothing by default but allow other classes to override this method # to populate R6 class members. + }, + + .ExtensionEquals = function(other) { + # note that this must not call to C++ (because C++ might call here) + inherits(other, "ExtensionType") && + identical(other$extension_name(), self$extension_name()) && + identical(other$Serialize(), self$Serialize()) } ), @@ -114,7 +121,7 @@ ExtensionType$new <- function(xp) { MakeExtensionType <- function(storage_type, - extension_name, + extension_name, extension_metadata, type_class = ExtensionType, array_class = ExtensionArray) { diff --git a/r/src/extension.cpp b/r/src/extension.cpp index b229f60f484..dadc7eaec32 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -36,15 +36,21 @@ class RExtensionType : public arrow::ExtensionType { r6_array_generator_(r6_array_generator) {} std::string extension_name() const { return extension_name_; } + bool ExtensionEquals(const arrow::ExtensionType& other) const; + std::shared_ptr MakeArray(std::shared_ptr data) const; + arrow::Result> Deserialize( std::shared_ptr storage_type, const std::string& serialized_data) const; + std::string Serialize() const { return extension_metadata_; } std::shared_ptr Clone() const; + cpp11::environment to_r6() const; + private: std::string extension_name_; std::string extension_metadata_; @@ -53,15 +59,27 @@ class RExtensionType : public arrow::ExtensionType { }; bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { + // Avoid materializing the R6 type if at all possible, since this is slow + // and in some cases not possible due to threading if (other.extension_name() != extension_name()) { return false; } - if (other.Serialize() != Serialize()) { - return false; + if (other.Serialize() == Serialize()) { + return true; } - return true; + // With any ambiguity, we need to materialize the R6 type and call its + // ExtensionEquals method. + cpp11::environment instance = to_r6(); + cpp11::function instance_ExtensionEquals(instance[".ExtensionEquals"]); + + std::shared_ptr other_shared = + ValueOrStop(other.Deserialize(other.storage_type(), other.Serialize())); + cpp11::sexp other_r6 = cpp11::to_r6(other_shared, "ExtensionType"); + + cpp11::logicals result(instance_ExtensionEquals(other_r6)); + return cpp11::as_cpp(result); } std::shared_ptr RExtensionType::MakeArray( @@ -75,14 +93,7 @@ arrow::Result> RExtensionType::Deserialize( std::shared_ptr storage_type, const std::string& serialized_data) const { try { - cpp11::function make_extension_type(cpp11::package("arrow")["MakeExtensionType"]); - cpp11::sexp storage_type_r6 = cpp11::to_r6(storage_type); - cpp11::writable::raws serialized_data_raw(serialized_data); - - cpp11::sexp result = - make_extension_type(storage_type_r6, extension_name(), serialized_data_raw, - r6_type_generator_, r6_array_generator_); - + cpp11::environment result = to_r6(); auto ptr = arrow::r::r6_to_pointer*>(result); return *ptr; } catch (std::exception& e) { @@ -96,6 +107,18 @@ std::shared_ptr RExtensionType::Clone() const { r6_array_generator_); } +cpp11::environment RExtensionType::to_r6() const { + cpp11::function make_extension_type(cpp11::package("arrow")["MakeExtensionType"]); + cpp11::sexp storage_type_r6 = cpp11::to_r6(storage_type()); + cpp11::writable::raws serialized_data_raw(Serialize()); + + cpp11::sexp result = + make_extension_type(storage_type_r6, extension_name(), serialized_data_raw, + r6_type_generator_, r6_array_generator_); + + return result; +} + // [[arrow::export]] cpp11::sexp ExtensionType__initialize( const std::shared_ptr& storage_type, std::string extension_name, diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 55ef3e89201..d9e60715dc8 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -91,3 +91,52 @@ test_that("extension type subclasses work", { expect_identical(UnregisterExtensionType("some_extension_subclass"), type) }) + +test_that("extension subclasses can override the ExtensionEquals method", { + SomeExtensionTypeSubclass <- R6Class( + "SomeExtensionTypeSubclass", inherit = ExtensionType, + public = list( + field_values = NULL, + + .Deserialize = function(storage_type, extension_name, extension_metadata) { + self$field_values <- unserialize(extension_metadata) + }, + + .ExtensionEquals = function(other) { + if (!inherits(other, "SomeExtensionTypeSubclass")) { + return(FALSE) + } + + setequal(names(other$field_values), names(self$field_values)) && + identical( + other$field_values[names(self$field_values)], + self$field_values + ) + } + ) + ) + + type <- MakeExtensionType( + int32(), + "some_extension_subclass", + serialize(list(field1 = "value1", field2 = "value2"), NULL), + type_class = SomeExtensionTypeSubclass + ) + + RegisterExtensionType(type) + + expect_true(type$.ExtensionEquals(type)) + expect_true(type$Equals(type)) + + type2 <- MakeExtensionType( + int32(), + "some_extension_subclass", + serialize(list(field2 = "value2", field1 = "value1"), NULL), + type_class = SomeExtensionTypeSubclass + ) + + expect_true(type$.ExtensionEquals(type2)) + expect_true(type$Equals(type2)) + + UnregisterExtensionType("some_extension_subclass") +}) From f3afad78356e207661e671e84be4e8df3caff1bc Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 14 Mar 2022 17:00:58 -0300 Subject: [PATCH 11/66] start on vctrs extension type + tests --- r/R/arrow-package.R | 4 ++ r/R/extension.R | 89 +++++++++++++++++++++++++++++++ r/tests/testthat/test-extension.R | 31 +++++++++++ 3 files changed, 124 insertions(+) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 2fab03d08c3..d688b89b801 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -80,6 +80,9 @@ } } + # register extension types that we use internally + ReRegisterExtensionType(vctrs_extension_type(vctrs::unspecified())) + invisible() } @@ -108,6 +111,7 @@ } } + #' Is the C++ Arrow library available? #' #' You won't generally need to call these function, but they're made available diff --git a/r/R/extension.R b/r/R/extension.R index af18cb88cdb..701056504f9 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. + ExtensionArray <- R6Class("ExtensionArray", inherit = Array, public = list( @@ -150,6 +151,24 @@ RegisterExtensionType <- function(type) { invisible(type) } +ReRegisterExtensionType <- function(type) { + extension_name <- type$extension_name() + result <- extension_type_registry[[extension_name]] + if (!is.null(result)) { + UnregisterExtensionType(extension_name) + } + + tryCatch( + RegisterExtensionType(type), + error = function(e) { + UnregisterExtensionType(extension_name) + RegisterExtensionType(type) + } + ) + + invisible(result) +} + UnregisterExtensionType <- function(extension_name) { arrow__UnregisterRExtensionType(extension_name) result <- extension_type_registry[[extension_name]] @@ -160,3 +179,73 @@ UnregisterExtensionType <- function(extension_name) { } extension_type_registry <- new.env(parent = emptyenv()) + + +VctrsExtensionType <- R6Class("VctrsExtensionType", + inherit = ExtensionType, + public = list( + ptype = function() { + private$.ptype + }, + + ToString = function() { + tf <- tempfile() + on.exit(unlink(tf)) + sink(tf) + print(self$ptype()) + sink(NULL) + paste0(readLines(tf), collapse = "\n") + }, + + .Deserialize = function(storage_type, extension_name, extension_metadata) { + message("Deserialize called") + private$.ptype <- unserialize(extension_metadata) + message(sprintf("...with ptype class %s", paste0(class(private$.ptype), collapse = " / "))) + }, + + .ExtensionEquals = function(other) { + if (!inherits(other, "VctrsExtensionType")) { + return(FALSE) + } + + identical(self$ptype(), other$ptype()) + } + ), + private = list( + .ptype = NULL + ) +) + +VctrsExtensionArray <- R6Class("VctrsExtensionArray", + inherit = ExtensionArray, + public = list( + to_vector = function() { + vctrs::vec_restore(super$to_vector(), self$type$ptype()) + } + ) +) + +VctrsExtensionArray$create <- function(x, ptype = vctrs::vec_ptype(x), + type = NULL) { + if (inherits(x, "VctrsExtensionArray")) { + return(x) + } + + vctrs::vec_assert(x) + type <- vctrs_extension_type(ptype) + storage <- Array$create(vctrs::vec_data(x), type = type$storage_type()) + type$WrapArray(storage) +} + + +vctrs_extension_type <- function(ptype) { + ptype <- vctrs::vec_ptype(ptype) + + MakeExtensionType( + storage_type = type(vctrs::vec_data(ptype)), + extension_name = "arrow.r.vctrs", + extension_metadata = serialize(ptype, NULL), + type_class = VctrsExtensionType, + array_class = VctrsExtensionArray + ) +} diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index d9e60715dc8..bf81aefec18 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -140,3 +140,34 @@ test_that("extension subclasses can override the ExtensionEquals method", { UnregisterExtensionType("some_extension_subclass") }) + +test_that("vctrs extension type works", { + custom_vctr <- vctrs::new_vctr( + 1:4, + attr_key = "attr_val", + class = "arrow_custom_test" + ) + + type <- vctrs_extension_type(custom_vctr) + expect_r6_class(type, "VctrsExtensionType") + expect_identical(type$ptype(), vctrs::vec_ptype(custom_vctr)) + expect_true(type$Equals(type)) + expect_match(type$ToString(), "arrow_custom_test") + + array_in <- VctrsExtensionArray$create(custom_vctr) + expect_true(array_in$type$Equals(type)) + expect_identical(VctrsExtensionArray$create(array_in), array_in) + + tf <- tempfile() + on.exit(unlink(tf)) + write_feather(arrow_table(col = array_in), tf) + table_out <- read_feather(tf, as_data_frame = FALSE) + array_out <- table_out$col + + expect_true(table_out$col$type$Equals(type)) + expect_identical( + table_out$col$as_vector(), + custom_vctr + ) +}) + From e50744f170d7afb5f46327a8cce7a7e99baa97d8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 15 Mar 2022 12:31:14 -0300 Subject: [PATCH 12/66] improvement, but test still failing --- r/R/extension.R | 4 ++-- r/tests/testthat/test-extension.R | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 701056504f9..d7d50b5dce5 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -219,8 +219,8 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", VctrsExtensionArray <- R6Class("VctrsExtensionArray", inherit = ExtensionArray, public = list( - to_vector = function() { - vctrs::vec_restore(super$to_vector(), self$type$ptype()) + as_vector = function() { + vctrs::vec_restore(self$storage()$as_vector(), self$type$ptype()) } ) ) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index bf81aefec18..7dbe78a7de8 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -162,11 +162,14 @@ test_that("vctrs extension type works", { on.exit(unlink(tf)) write_feather(arrow_table(col = array_in), tf) table_out <- read_feather(tf, as_data_frame = FALSE) - array_out <- table_out$col + array_out <- table_out$col$chunk(0) - expect_true(table_out$col$type$Equals(type)) + expect_r6_class(array_out$type, "VctrsExtensionType") + expect_r6_class(array_out, "VctrsExtensionArray") + + expect_true(array_out$type$Equals(type)) expect_identical( - table_out$col$as_vector(), + array_out$as_vector(), custom_vctr ) }) From f92496ed5be6273136e1732faa6e145f6a2cd3d3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 15 Mar 2022 13:17:07 -0300 Subject: [PATCH 13/66] fix Deserialize from C++ --- r/R/extension.R | 2 -- r/src/extension.cpp | 14 ++++++++------ r/tests/testthat/test-extension.R | 24 ++++++++++++++++++------ 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index d7d50b5dce5..45d0a6e489d 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -198,9 +198,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", }, .Deserialize = function(storage_type, extension_name, extension_metadata) { - message("Deserialize called") private$.ptype <- unserialize(extension_metadata) - message(sprintf("...with ptype class %s", paste0(class(private$.ptype), collapse = " / "))) }, .ExtensionEquals = function(other) { diff --git a/r/src/extension.cpp b/r/src/extension.cpp index dadc7eaec32..a38c12d1c41 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -49,7 +49,8 @@ class RExtensionType : public arrow::ExtensionType { std::shared_ptr Clone() const; - cpp11::environment to_r6() const; + cpp11::environment to_r6(std::shared_ptr storage_type, + const std::string& serialized_data) const; private: std::string extension_name_; @@ -71,7 +72,7 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { // With any ambiguity, we need to materialize the R6 type and call its // ExtensionEquals method. - cpp11::environment instance = to_r6(); + cpp11::environment instance = to_r6(storage_type(), Serialize()); cpp11::function instance_ExtensionEquals(instance[".ExtensionEquals"]); std::shared_ptr other_shared = @@ -93,7 +94,7 @@ arrow::Result> RExtensionType::Deserialize( std::shared_ptr storage_type, const std::string& serialized_data) const { try { - cpp11::environment result = to_r6(); + cpp11::environment result = to_r6(storage_type, serialized_data); auto ptr = arrow::r::r6_to_pointer*>(result); return *ptr; } catch (std::exception& e) { @@ -107,10 +108,11 @@ std::shared_ptr RExtensionType::Clone() const { r6_array_generator_); } -cpp11::environment RExtensionType::to_r6() const { +cpp11::environment RExtensionType::to_r6(std::shared_ptr storage_type, + const std::string& serialized_data) const { cpp11::function make_extension_type(cpp11::package("arrow")["MakeExtensionType"]); - cpp11::sexp storage_type_r6 = cpp11::to_r6(storage_type()); - cpp11::writable::raws serialized_data_raw(Serialize()); + cpp11::sexp storage_type_r6 = cpp11::to_r6(storage_type); + cpp11::writable::raws serialized_data_raw(serialized_data); cpp11::sexp result = make_extension_type(storage_type_r6, extension_name(), serialized_data_raw, diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 7dbe78a7de8..a97bfe21f1a 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -77,17 +77,29 @@ test_that("extension type subclasses work", { RegisterExtensionType(type) + # create a new type instance with storage/metadata not identical + # to the registered type + type2 <- MakeExtensionType( + float64(), + "some_extension_subclass", + charToRaw("some other custom metadata"), + type_class = SomeExtensionTypeSubclass, + array_class = SomeExtensionArraySubclass + ) + ptr_type <- allocate_arrow_schema() - type$export_to_c(ptr_type) - type2 <- DataType$import_from_c(ptr_type) + type2$export_to_c(ptr_type) + type3 <- DataType$import_from_c(ptr_type) delete_arrow_schema(ptr_type) - expect_identical(type2$extension_name(), "some_extension_subclass") - expect_identical(type2$some_custom_method(), type$some_custom_method()) + expect_identical(type3$extension_name(), "some_extension_subclass") + expect_identical(type3$some_custom_method(), type2$some_custom_method()) + expect_identical(type3$Serialize(), type2$Serialize()) + expect_true(type3$storage_type() == type2$storage_type()) - array <- type$WrapArray(Array$create(1:10)) + array <- type3$WrapArray(Array$create(1:10)) expect_r6_class(array, "SomeExtensionArraySubclass") - expect_identical(array$some_custom_method(), type$some_custom_method()) + expect_identical(array$some_custom_method(), type3$some_custom_method()) expect_identical(UnregisterExtensionType("some_extension_subclass"), type) }) From d0ec080cfa5b571a1f4b2517680e0fcd3bf6cf42 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 15 Mar 2022 13:41:42 -0300 Subject: [PATCH 14/66] failing test for chunked array out --- r/tests/testthat/test-extension.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index a97bfe21f1a..41547d0714e 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -184,5 +184,12 @@ test_that("vctrs extension type works", { array_out$as_vector(), custom_vctr ) + + chunked_array_out <- table_out$col + expect_true(chunked_array_out$type$Equals(type)) + expect_identical( + chunked_array_out$as_vector(), + custom_vctr + ) }) From aef86075041bc7c701f79c9d293de2ade3d86d96 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 15 Mar 2022 14:34:28 -0300 Subject: [PATCH 15/66] shuffle responsibility between ExtensionArray, ChunkedArray, and ExtensionType --- r/R/arrowExports.R | 4 +- r/R/chunked-array.R | 8 ++- r/R/extension.R | 91 +++++++++++++++++-------------- r/src/arrowExports.cpp | 11 ++-- r/src/extension.cpp | 18 ++---- r/tests/testthat/test-extension.R | 24 ++------ 6 files changed, 77 insertions(+), 79 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index ac09d65c067..0027d7bf178 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1068,8 +1068,8 @@ compute___expr__type_id <- function(x, schema) { .Call(`_arrow_compute___expr__type_id`, x, schema) } -ExtensionType__initialize <- function(storage_type, extension_name, extension_metadata, r6_type_generator, r6_array_generator) { - .Call(`_arrow_ExtensionType__initialize`, storage_type, extension_name, extension_metadata, r6_type_generator, r6_array_generator) +ExtensionType__initialize <- function(storage_type, extension_name, extension_metadata, r6_type_generator) { + .Call(`_arrow_ExtensionType__initialize`, storage_type, extension_name, extension_metadata, r6_type_generator) } ExtensionType__extension_name <- function(type) { diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 95a05aba5b2..76fa8488113 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -85,7 +85,13 @@ ChunkedArray <- R6Class("ChunkedArray", type_id = function() ChunkedArray__type(self)$id, nbytes = function() ChunkedArray__ReferencedBufferSize(self), chunk = function(i) Array$create(ChunkedArray__chunk(self, i)), - as_vector = function() ChunkedArray__as_vector(self, option_use_threads()), + as_vector = function() { + if (inherits(self$type, "ExtensionType")) { + self$type$.chunked_array_as_vector(self) + } else { + ChunkedArray__as_vector(self, option_use_threads()) + } + }, Slice = function(offset, length = NULL) { if (is.null(length)) { ChunkedArray__Slice1(self, offset) diff --git a/r/R/extension.R b/r/R/extension.R index 45d0a6e489d..7dd80fcbc99 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -21,24 +21,22 @@ ExtensionArray <- R6Class("ExtensionArray", public = list( storage = function() { ExtensionArray__storage(self) + }, + + as_vector = function() { + self$type$.array_as_vector(self) } ) ) -ExtensionArray$.default_new <- ExtensionArray$new -ExtensionArray$new <- function(xp) { - superclass <- ExtensionArray$.default_new(xp) - registered_type <- extension_type_registry[[superclass$type$extension_name()]] - if (is.null(registered_type)) { - return(superclass) - } - - class <- registered_type$.__enclos_env__$private$array_class - if (inherits(superclass, class$classname)) { - return(superclass) +ExtensionArray$create <- function(x, type) { + assert_is(type, "ExtensionType") + if (inheritx(x, "ExtensionArray") && type$Equals(x$type)) { + return(x) } - class$new(xp) + storage <- Array$create(x, type = type$storage_type()) + type$WrapArray(storage) } ExtensionType <- R6Class("ExtensionType", @@ -53,9 +51,8 @@ ExtensionType <- R6Class("ExtensionType", ) }, - .set_r6_constructors = function(type_class, array_class) { + .set_r6_constructors = function(type_class) { private$type_class <- type_class - private$array_class <- array_class }, storage_type = function() { @@ -100,12 +97,23 @@ ExtensionType <- R6Class("ExtensionType", inherits(other, "ExtensionType") && identical(other$extension_name(), self$extension_name()) && identical(other$Serialize(), self$Serialize()) + }, + + .chunked_array_as_vector = function(chunked_array) { + storage_arrays <- lapply( + seq_len(chunked_array$num_chunks) - 1L, + function(i) chunked_array$chunk(i)$storage() + ) + storage <- chunked_array(!!! storage_arrays, type = self$storage_type()) + storage$as_vector() + }, + + .array_as_vector = function(extension_array) { + extension_array$storage()$as_vector() } ), - private = list( - type_class = NULL, - array_class = NULL + type_class = NULL ) ) @@ -124,22 +132,19 @@ ExtensionType$new <- function(xp) { MakeExtensionType <- function(storage_type, extension_name, extension_metadata, - type_class = ExtensionType, - array_class = ExtensionArray) { + type_class = ExtensionType) { assert_that(is.string(extension_name), is.raw(extension_metadata)) assert_is(storage_type, "DataType") assert_is(type_class, "R6ClassGenerator") - assert_is(array_class, "R6ClassGenerator") type <- ExtensionType__initialize( storage_type, extension_name, extension_metadata, - type_class, - array_class + type_class ) - type$.set_r6_constructors(type_class, array_class) + type$.set_r6_constructors(type_class) type$.Deserialize(storage_type, extension_name, extension_metadata) type } @@ -207,6 +212,20 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", } identical(self$ptype(), other$ptype()) + }, + + .chunked_array_as_vector = function(chunked_array) { + vctrs::vec_restore( + super$.chunked_array_as_vector(chunked_array), + self$ptype() + ) + }, + + .array_as_vector = function(extension_array) { + vctrs::vec_restore( + super$.array_as_vector(extension_array), + self$ptype() + ) } ), private = list( @@ -214,36 +233,28 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", ) ) -VctrsExtensionArray <- R6Class("VctrsExtensionArray", - inherit = ExtensionArray, - public = list( - as_vector = function() { - vctrs::vec_restore(self$storage()$as_vector(), self$type$ptype()) - } - ) -) -VctrsExtensionArray$create <- function(x, ptype = vctrs::vec_ptype(x), - type = NULL) { - if (inherits(x, "VctrsExtensionArray")) { +vctrs_extension_array <- function(x, ptype = vctrs::vec_ptype(x), + storage_type = NULL) { + if (inherits(x, "ExtensionArray") && inherits(x$type, "VctrsExtensionType")) { return(x) } vctrs::vec_assert(x) - type <- vctrs_extension_type(ptype) - storage <- Array$create(vctrs::vec_data(x), type = type$storage_type()) + storage <- Array$create(vctrs::vec_data(x), type = storage_type) + type <- vctrs_extension_type(ptype, storage$type) type$WrapArray(storage) } -vctrs_extension_type <- function(ptype) { +vctrs_extension_type <- function(ptype, + storage_type = type(vctrs::vec_data(ptype))) { ptype <- vctrs::vec_ptype(ptype) MakeExtensionType( - storage_type = type(vctrs::vec_data(ptype)), + storage_type = storage_type, extension_name = "arrow.r.vctrs", extension_metadata = serialize(ptype, NULL), - type_class = VctrsExtensionType, - array_class = VctrsExtensionArray + type_class = VctrsExtensionType ) } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index e46834a5d46..7cc27ccb8d8 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4169,19 +4169,18 @@ extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){ // extension.cpp #if defined(ARROW_R_WITH_ARROW) -cpp11::sexp ExtensionType__initialize(const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_type_generator, cpp11::environment r6_array_generator); -extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_type_generator_sexp, SEXP r6_array_generator_sexp){ +cpp11::sexp ExtensionType__initialize(const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_type_generator); +extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_type_generator_sexp){ BEGIN_CPP11 arrow::r::Input&>::type storage_type(storage_type_sexp); arrow::r::Input::type extension_name(extension_name_sexp); arrow::r::Input::type extension_metadata(extension_metadata_sexp); arrow::r::Input::type r6_type_generator(r6_type_generator_sexp); - arrow::r::Input::type r6_array_generator(r6_array_generator_sexp); - return cpp11::as_sexp(ExtensionType__initialize(storage_type, extension_name, extension_metadata, r6_type_generator, r6_array_generator)); + return cpp11::as_sexp(ExtensionType__initialize(storage_type, extension_name, extension_metadata, r6_type_generator)); END_CPP11 } #else -extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_type_generator_sexp, SEXP r6_array_generator_sexp){ +extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_type_generator_sexp){ Rf_error("Cannot call ExtensionType__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -8138,7 +8137,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, - { "_arrow_ExtensionType__initialize", (DL_FUNC) &_arrow_ExtensionType__initialize, 5}, + { "_arrow_ExtensionType__initialize", (DL_FUNC) &_arrow_ExtensionType__initialize, 4}, { "_arrow_ExtensionType__extension_name", (DL_FUNC) &_arrow_ExtensionType__extension_name, 1}, { "_arrow_ExtensionType__Serialize", (DL_FUNC) &_arrow_ExtensionType__Serialize, 1}, { "_arrow_ExtensionType__storage_type", (DL_FUNC) &_arrow_ExtensionType__storage_type, 1}, diff --git a/r/src/extension.cpp b/r/src/extension.cpp index a38c12d1c41..2d3a5a5ad66 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -27,13 +27,11 @@ class RExtensionType : public arrow::ExtensionType { public: RExtensionType(const std::shared_ptr storage_type, std::string extension_name, std::string extension_metadata, - cpp11::environment r6_type_generator, - cpp11::environment r6_array_generator) + cpp11::environment r6_type_generator) : arrow::ExtensionType(storage_type), extension_name_(extension_name), extension_metadata_(extension_metadata), - r6_type_generator_(r6_type_generator), - r6_array_generator_(r6_array_generator) {} + r6_type_generator_(r6_type_generator) {} std::string extension_name() const { return extension_name_; } @@ -56,7 +54,6 @@ class RExtensionType : public arrow::ExtensionType { std::string extension_name_; std::string extension_metadata_; cpp11::environment r6_type_generator_; - cpp11::environment r6_array_generator_; }; bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { @@ -104,8 +101,7 @@ arrow::Result> RExtensionType::Deserialize( std::shared_ptr RExtensionType::Clone() const { return std::make_shared(storage_type(), extension_name_, - extension_metadata_, r6_type_generator_, - r6_array_generator_); + extension_metadata_, r6_type_generator_); } cpp11::environment RExtensionType::to_r6(std::shared_ptr storage_type, @@ -116,7 +112,7 @@ cpp11::environment RExtensionType::to_r6(std::shared_ptr storag cpp11::sexp result = make_extension_type(storage_type_r6, extension_name(), serialized_data_raw, - r6_type_generator_, r6_array_generator_); + r6_type_generator_); return result; } @@ -124,13 +120,11 @@ cpp11::environment RExtensionType::to_r6(std::shared_ptr storag // [[arrow::export]] cpp11::sexp ExtensionType__initialize( const std::shared_ptr& storage_type, std::string extension_name, - cpp11::raws extension_metadata, cpp11::environment r6_type_generator, - cpp11::environment r6_array_generator) { + cpp11::raws extension_metadata, cpp11::environment r6_type_generator) { cpp11::function constructor(r6_type_generator["new"]); std::string metadata_string(extension_metadata.begin(), extension_metadata.end()); auto shared_ptr_ptr = new std::shared_ptr( - new RExtensionType(storage_type, extension_name, metadata_string, r6_type_generator, - r6_array_generator)); + new RExtensionType(storage_type, extension_name, metadata_string, r6_type_generator)); auto external_ptr = cpp11::external_pointer>(shared_ptr_ptr); return constructor(external_ptr); diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 41547d0714e..1660ddce434 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -55,21 +55,11 @@ test_that("extension type subclasses work", { ) ) - SomeExtensionArraySubclass <- R6Class( - "SomeExtensionArraySubclass", inherit = ExtensionArray, - public = list( - some_custom_method = function() { - self$type$some_custom_method() - } - ) - ) - type <- MakeExtensionType( int32(), "some_extension_subclass", charToRaw("some custom metadata"), - type_class = SomeExtensionTypeSubclass, - array_class = SomeExtensionArraySubclass + type_class = SomeExtensionTypeSubclass ) expect_r6_class(type, "SomeExtensionTypeSubclass") @@ -83,8 +73,7 @@ test_that("extension type subclasses work", { float64(), "some_extension_subclass", charToRaw("some other custom metadata"), - type_class = SomeExtensionTypeSubclass, - array_class = SomeExtensionArraySubclass + type_class = SomeExtensionTypeSubclass ) ptr_type <- allocate_arrow_schema() @@ -98,8 +87,7 @@ test_that("extension type subclasses work", { expect_true(type3$storage_type() == type2$storage_type()) array <- type3$WrapArray(Array$create(1:10)) - expect_r6_class(array, "SomeExtensionArraySubclass") - expect_identical(array$some_custom_method(), type3$some_custom_method()) + expect_r6_class(array, "ExtensionArray") expect_identical(UnregisterExtensionType("some_extension_subclass"), type) }) @@ -166,9 +154,9 @@ test_that("vctrs extension type works", { expect_true(type$Equals(type)) expect_match(type$ToString(), "arrow_custom_test") - array_in <- VctrsExtensionArray$create(custom_vctr) + array_in <- vctrs_extension_array(custom_vctr) expect_true(array_in$type$Equals(type)) - expect_identical(VctrsExtensionArray$create(array_in), array_in) + expect_identical(vctrs_extension_array(array_in), array_in) tf <- tempfile() on.exit(unlink(tf)) @@ -177,7 +165,7 @@ test_that("vctrs extension type works", { array_out <- table_out$col$chunk(0) expect_r6_class(array_out$type, "VctrsExtensionType") - expect_r6_class(array_out, "VctrsExtensionArray") + expect_r6_class(array_out, "ExtensionArray") expect_true(array_out$type$Equals(type)) expect_identical( From b91ab8f319a2493b78115a86066b61c65d99d740 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 15 Mar 2022 15:35:27 -0300 Subject: [PATCH 16/66] test roundtripping through Table, RecordBatch, and ChunkedArray --- r/R/arrow-tabular.R | 34 +++++++++++ r/R/record-batch.R | 5 +- r/R/table.R | 5 +- r/tests/testthat/test-extension.R | 93 +++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 2 deletions(-) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index 43110ccf24e..6bd3feb992d 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -98,6 +98,40 @@ ArrowTabular <- R6Class("ArrowTabular", ) ) +tabular_as_data_frame_common <- function(x, base) { + x_cols <- names(x) + col_is_extension <- vapply( + x_cols, + function(col) inherits(x$schema[[col]]$type, "ExtensionType"), + logical(1) + ) + + if (!any(col_is_extension)) { + return(base(x, option_use_threads())) + } + + extension_cols <- x_cols[col_is_extension] + + if (all(col_is_extension)) { + tibble_no_extension_types <- NULL + } else { + tibble_no_extension_types <- base( + x[setdiff(x_cols, extension_cols)], + option_use_threads() + ) + } + + extension_vectors <- lapply( + extension_cols, + function(col) x[[col]]$as_vector() + ) + + names(extension_vectors) <- extension_cols + + all_vectors <- c(extension_vectors, tibble_no_extension_types)[x_cols] + tibble::new_tibble(all_vectors, nrow = nrow(x)) +} + #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { df <- x$to_data_frame() diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 24bd61535e7..1410760125c 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -114,7 +114,10 @@ RecordBatch <- R6Class("RecordBatch", # Take, Filter, and SortIndices are methods on ArrowTabular serialize = function() ipc___SerializeRecordBatch__Raw(self), to_data_frame = function() { - RecordBatch__to_dataframe(self, use_threads = option_use_threads()) + tabular_as_data_frame_common( + self, + RecordBatch__to_dataframe + ) }, cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { assert_is(target_schema, "Schema") diff --git a/r/R/table.R b/r/R/table.R index 07750786ee2..c4103650c1a 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -98,7 +98,10 @@ Table <- R6Class("Table", field = function(i) Table__field(self, i), serialize = function(output_stream, ...) write_table(self, output_stream, ...), to_data_frame = function() { - Table__to_dataframe(self, use_threads = option_use_threads()) + tabular_as_data_frame_common( + self, + Table__to_dataframe + ) }, cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { assert_is(target_schema, "Schema") diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 1660ddce434..790e11c4353 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -181,3 +181,96 @@ test_that("vctrs extension type works", { ) }) +test_that("chunked arrays can roundtrip extension types", { + custom_vctr1 <- vctrs::new_vctr(1:4, class = "arrow_custom_test") + custom_vctr2 <- vctrs::new_vctr(5:8, class = "arrow_custom_test") + custom_array1 <- vctrs_extension_array(custom_vctr1) + custom_array2 <- vctrs_extension_array(custom_vctr2) + + custom_chunked <- chunked_array(custom_array1, custom_array2) + expect_r6_class(custom_chunked$type, "VctrsExtensionType") + expect_identical( + custom_chunked$as_vector(), + vctrs::new_vctr(1:8, class = "arrow_custom_test") + ) +}) + +test_that("RecordBatch can roundtrip extension types", { + custom_vctr <- vctrs::new_vctr(1:8, class = "arrow_custom_test") + custom_array <- vctrs_extension_array(custom_vctr) + normal_vctr <- letters[1:8] + + custom_record_batch <- record_batch(custom = custom_array) + expect_identical( + custom_record_batch$to_data_frame(), + tibble::tibble( + custom = custom_vctr + ) + ) + + mixed_record_batch <- record_batch( + custom = custom_array, + normal = normal_vctr + ) + expect_identical( + mixed_record_batch$to_data_frame(), + tibble::tibble( + custom = custom_vctr, + normal = normal_vctr + ) + ) + + # check both column orders, since column order should stay in the same + # order whether the colunns are are extension types or not + mixed_record_batch2 <- record_batch( + normal = normal_vctr, + custom = custom_array + ) + expect_identical( + mixed_record_batch2$to_data_frame(), + tibble::tibble( + normal = normal_vctr, + custom = custom_vctr + ) + ) +}) + +test_that("Table can roundtrip extension types", { + custom_vctr <- vctrs::new_vctr(1:8, class = "arrow_custom_test") + custom_array <- vctrs_extension_array(custom_vctr) + normal_vctr <- letters[1:8] + + custom_table <- arrow_table(custom = custom_array) + expect_identical( + custom_table$to_data_frame(), + tibble::tibble( + custom = custom_vctr + ) + ) + + mixed_table <- arrow_table( + custom = custom_array, + normal = normal_vctr + ) + expect_identical( + mixed_table$to_data_frame(), + tibble::tibble( + custom = custom_vctr, + normal = normal_vctr + ) + ) + + # check both column orders, since column order should stay in the same + # order whether the colunns are are extension types or not + mixed_table2 <- arrow_table( + normal = normal_vctr, + custom = custom_array + ) + expect_identical( + mixed_table2$to_data_frame(), + tibble::tibble( + normal = normal_vctr, + custom = custom_vctr + ) + ) +}) From 4012ac1f938e8e50405dcc62264f056f7eb65e60 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 09:23:18 -0300 Subject: [PATCH 17/66] clang-format --- r/src/extension.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/r/src/extension.cpp b/r/src/extension.cpp index 2d3a5a5ad66..577f4c832f3 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -110,9 +110,8 @@ cpp11::environment RExtensionType::to_r6(std::shared_ptr storag cpp11::sexp storage_type_r6 = cpp11::to_r6(storage_type); cpp11::writable::raws serialized_data_raw(serialized_data); - cpp11::sexp result = - make_extension_type(storage_type_r6, extension_name(), serialized_data_raw, - r6_type_generator_); + cpp11::sexp result = make_extension_type(storage_type_r6, extension_name(), + serialized_data_raw, r6_type_generator_); return result; } @@ -123,8 +122,8 @@ cpp11::sexp ExtensionType__initialize( cpp11::raws extension_metadata, cpp11::environment r6_type_generator) { cpp11::function constructor(r6_type_generator["new"]); std::string metadata_string(extension_metadata.begin(), extension_metadata.end()); - auto shared_ptr_ptr = new std::shared_ptr( - new RExtensionType(storage_type, extension_name, metadata_string, r6_type_generator)); + auto shared_ptr_ptr = new std::shared_ptr(new RExtensionType( + storage_type, extension_name, metadata_string, r6_type_generator)); auto external_ptr = cpp11::external_pointer>(shared_ptr_ptr); return constructor(external_ptr); From 536715ef4010a9b347523a8ecbe820f815249daa Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 09:26:06 -0300 Subject: [PATCH 18/66] only register extension type if arrow is available --- r/R/arrow-package.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index d688b89b801..cd1ef575c3d 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -80,8 +80,10 @@ } } - # register extension types that we use internally - ReRegisterExtensionType(vctrs_extension_type(vctrs::unspecified())) + if (arrow_available()) { + # register extension types that we use internally + ReRegisterExtensionType(vctrs_extension_type(vctrs::unspecified())) + } invisible() } From 79dd3a02e98ba8801766e3cc91452187f39880bf Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 10:05:38 -0300 Subject: [PATCH 19/66] start to document, simplify circular logic in methods --- r/R/extension.R | 21 ++++++++++++++++----- r/src/extension.cpp | 6 ++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 7dd80fcbc99..23637076b8e 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -117,15 +117,26 @@ ExtensionType <- R6Class("ExtensionType", ) ) + +# ExtensionType$new() is what gets used by the generated wrapper code to +# create an R6 object when a shared_ptr is returned to R and +# that object has type_id() EXTENSION_TYPE. Rather than add complexity +# to the wrapper code, we modify ExtensionType$new() to do what we need +# it to do here (which is to return an instance of a custom R6 +# type whose .Deserialize method is called to populate custom fields). ExtensionType$.default_new <- ExtensionType$new ExtensionType$new <- function(xp) { - superclass <- ExtensionType$.default_new(xp) - registered_type <- extension_type_registry[[superclass$extension_name()]] - if (is.null(registered_type)) { - return(superclass) + super <- ExtensionType$.default_new(xp) + registered_type_instance <- extension_type_registry[[super$extension_name()]] + if (is.null(registered_type_instance)) { + return(super) } - registered_type$.__enclos_env__$private$type_class$new(xp) + instance <- registered_type_instance$clone() + instance$.__enclos_env__$super <- super + instance$initialize(xp) + + instance } diff --git a/r/src/extension.cpp b/r/src/extension.cpp index 577f4c832f3..618fc35be06 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -23,8 +23,14 @@ #include #include +// A wrapper around arrow::ExtensionType that allows R to register extension +// types whose Deserialize, ExtensionEquals, and Sersialize methods are +// in practice handled at the R level. class RExtensionType : public arrow::ExtensionType { public: + + // An instance of RExtensionType already has a copy of its seraialized + // extension metadata when constructed. RExtensionType(const std::shared_ptr storage_type, std::string extension_name, std::string extension_metadata, cpp11::environment r6_type_generator) From 3560bed9e29f24f1386d4fedcbb91208bed0c6e1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 11:44:06 -0300 Subject: [PATCH 20/66] more documentation and simplification --- r/R/arrowExports.R | 8 ++- r/R/extension.R | 112 ++++++++++++++++++++++------------------- r/src/arrowExports.cpp | 26 ++++++++-- r/src/extension.cpp | 24 ++++++--- 4 files changed, 104 insertions(+), 66 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 0027d7bf178..7bf77f1e66c 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1068,8 +1068,8 @@ compute___expr__type_id <- function(x, schema) { .Call(`_arrow_compute___expr__type_id`, x, schema) } -ExtensionType__initialize <- function(storage_type, extension_name, extension_metadata, r6_type_generator) { - .Call(`_arrow_ExtensionType__initialize`, storage_type, extension_name, extension_metadata, r6_type_generator) +ExtensionType__initialize <- function(storage_type, extension_name, extension_metadata, r6_class) { + .Call(`_arrow_ExtensionType__initialize`, storage_type, extension_name, extension_metadata, r6_class) } ExtensionType__extension_name <- function(type) { @@ -1088,6 +1088,10 @@ ExtensionType__MakeArray <- function(type, data) { .Call(`_arrow_ExtensionType__MakeArray`, type, data) } +ExtensionType__r6_class <- function(type) { + .Call(`_arrow_ExtensionType__r6_class`, type) +} + ExtensionArray__storage <- function(array) { .Call(`_arrow_ExtensionArray__storage`, array) } diff --git a/r/R/extension.R b/r/R/extension.R index 23637076b8e..e4c057f523f 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -42,6 +42,10 @@ ExtensionArray$create <- function(x, type) { ExtensionType <- R6Class("ExtensionType", inherit = DataType, public = list( + + # In addition to the initialization that occurs for all + # ArrowObject instances, we call .Deserialize(), which can + # be overridden to populate custom fields initialize = function(xp) { super$initialize(xp) self$.Deserialize( @@ -51,8 +55,13 @@ ExtensionType <- R6Class("ExtensionType", ) }, - .set_r6_constructors = function(type_class) { - private$type_class <- type_class + # Because of how C++ shared_ptr<> objects are converted to R objects, + # the initial object that is instantiated will be of this class + # (ExtensionType), but the R6Class object that was registered is + # available from C++. We need this in order to produce the correct + # R6 subclass when a shared_ptr is returned to R. + r6_class = function() { + ExtensionType__r6_class(self) }, storage_type = function() { @@ -82,9 +91,31 @@ ExtensionType <- R6Class("ExtensionType", }, ToString = function() { - metadata_utf8 <- rawToChar(self$Serialize()) - Encoding(metadata_utf8) <- "UTF-8" - paste0(class(self)[1], " <", metadata_utf8, ">") + # metadata is probably valid UTF-8 (e.g., JSON), but might not be + # and it's confusing to error when printing the object. This herustic + # isn't perfect (but subclasses should override this method anyway) + metadata_raw <- self$Serialize() + + if (as.raw(0x00) %in% metadata_raw) { + if (length(metadata_raw) > 20) { + sprintf( + "<%s %s...>", + class(self)[1], + paste(format(utils::head(metadata_raw, 20)), collapse = " ") + ) + } else { + sprintf( + "<%s %s>", + class(self)[1], + paste(format(metadata_raw), collapse = " ") + ) + } + + } else { + metadata_utf8 <- rawToChar(self$Serialize()) + Encoding(metadata_utf8) <- "UTF-8" + paste0(class(self)[1], " <", metadata_utf8, ">") + } }, .Deserialize = function(storage_type, extension_name, extension_metadata) { @@ -93,7 +124,6 @@ ExtensionType <- R6Class("ExtensionType", }, .ExtensionEquals = function(other) { - # note that this must not call to C++ (because C++ might call here) inherits(other, "ExtensionType") && identical(other$extension_name(), self$extension_name()) && identical(other$Serialize(), self$Serialize()) @@ -111,9 +141,6 @@ ExtensionType <- R6Class("ExtensionType", .array_as_vector = function(extension_array) { extension_array$storage()$as_vector() } - ), - private = list( - type_class = NULL ) ) @@ -129,14 +156,10 @@ ExtensionType$new <- function(xp) { super <- ExtensionType$.default_new(xp) registered_type_instance <- extension_type_registry[[super$extension_name()]] if (is.null(registered_type_instance)) { - return(super) + super + } else { + super$r6_class()$new(xp) } - - instance <- registered_type_instance$clone() - instance$.__enclos_env__$super <- super - instance$initialize(xp) - - instance } @@ -148,41 +171,29 @@ MakeExtensionType <- function(storage_type, assert_is(storage_type, "DataType") assert_is(type_class, "R6ClassGenerator") - type <- ExtensionType__initialize( + ExtensionType__initialize( storage_type, extension_name, extension_metadata, type_class ) - - type$.set_r6_constructors(type_class) - type$.Deserialize(storage_type, extension_name, extension_metadata) - type } -RegisterExtensionType <- function(type) { - assert_is(type, "ExtensionType") - arrow__RegisterRExtensionType(type) - extension_type_registry[[type$extension_name()]] <- type - invisible(type) +RegisterExtensionType <- function(extension_type) { + assert_is(extension_type, "ExtensionType") + arrow__RegisterRExtensionType(extension_type) + extension_type_registry[[extension_type$extension_name()]] <- extension_type + invisible(extension_type) } -ReRegisterExtensionType <- function(type) { - extension_name <- type$extension_name() - result <- extension_type_registry[[extension_name]] - if (!is.null(result)) { - UnregisterExtensionType(extension_name) - } - +ReRegisterExtensionType <- function(extension_type) { tryCatch( - RegisterExtensionType(type), + RegisterExtensionType(extension_type), error = function(e) { - UnregisterExtensionType(extension_name) - RegisterExtensionType(type) + UnregisterExtensionType(extension_type$extension_name()) + RegisterExtensionType(extension_type) } ) - - invisible(result) } UnregisterExtensionType <- function(extension_name) { @@ -245,19 +256,6 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", ) -vctrs_extension_array <- function(x, ptype = vctrs::vec_ptype(x), - storage_type = NULL) { - if (inherits(x, "ExtensionArray") && inherits(x$type, "VctrsExtensionType")) { - return(x) - } - - vctrs::vec_assert(x) - storage <- Array$create(vctrs::vec_data(x), type = storage_type) - type <- vctrs_extension_type(ptype, storage$type) - type$WrapArray(storage) -} - - vctrs_extension_type <- function(ptype, storage_type = type(vctrs::vec_data(ptype))) { ptype <- vctrs::vec_ptype(ptype) @@ -269,3 +267,15 @@ vctrs_extension_type <- function(ptype, type_class = VctrsExtensionType ) } + +vctrs_extension_array <- function(x, ptype = vctrs::vec_ptype(x), + storage_type = NULL) { + if (inherits(x, "ExtensionArray") && inherits(x$type, "VctrsExtensionType")) { + return(x) + } + + vctrs::vec_assert(x) + storage <- Array$create(vctrs::vec_data(x), type = storage_type) + type <- vctrs_extension_type(ptype, storage$type) + type$WrapArray(storage) +} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 7cc27ccb8d8..bb9a23305e6 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4169,18 +4169,18 @@ extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){ // extension.cpp #if defined(ARROW_R_WITH_ARROW) -cpp11::sexp ExtensionType__initialize(const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_type_generator); -extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_type_generator_sexp){ +cpp11::sexp ExtensionType__initialize(const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_class); +extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_class_sexp){ BEGIN_CPP11 arrow::r::Input&>::type storage_type(storage_type_sexp); arrow::r::Input::type extension_name(extension_name_sexp); arrow::r::Input::type extension_metadata(extension_metadata_sexp); - arrow::r::Input::type r6_type_generator(r6_type_generator_sexp); - return cpp11::as_sexp(ExtensionType__initialize(storage_type, extension_name, extension_metadata, r6_type_generator)); + arrow::r::Input::type r6_class(r6_class_sexp); + return cpp11::as_sexp(ExtensionType__initialize(storage_type, extension_name, extension_metadata, r6_class)); END_CPP11 } #else -extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_type_generator_sexp){ +extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_class_sexp){ Rf_error("Cannot call ExtensionType__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -4246,6 +4246,21 @@ extern "C" SEXP _arrow_ExtensionType__MakeArray(SEXP type_sexp, SEXP data_sexp){ } #endif +// extension.cpp +#if defined(ARROW_R_WITH_ARROW) +cpp11::environment ExtensionType__r6_class(const std::shared_ptr& type); +extern "C" SEXP _arrow_ExtensionType__r6_class(SEXP type_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type type(type_sexp); + return cpp11::as_sexp(ExtensionType__r6_class(type)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_ExtensionType__r6_class(SEXP type_sexp){ + Rf_error("Cannot call ExtensionType__r6_class(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // extension.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr ExtensionArray__storage(const std::shared_ptr& array); @@ -8142,6 +8157,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExtensionType__Serialize", (DL_FUNC) &_arrow_ExtensionType__Serialize, 1}, { "_arrow_ExtensionType__storage_type", (DL_FUNC) &_arrow_ExtensionType__storage_type, 1}, { "_arrow_ExtensionType__MakeArray", (DL_FUNC) &_arrow_ExtensionType__MakeArray, 2}, + { "_arrow_ExtensionType__r6_class", (DL_FUNC) &_arrow_ExtensionType__r6_class, 1}, { "_arrow_ExtensionArray__storage", (DL_FUNC) &_arrow_ExtensionArray__storage, 1}, { "_arrow_arrow__RegisterRExtensionType", (DL_FUNC) &_arrow_arrow__RegisterRExtensionType, 1}, { "_arrow_arrow__UnregisterRExtensionType", (DL_FUNC) &_arrow_arrow__UnregisterRExtensionType, 1}, diff --git a/r/src/extension.cpp b/r/src/extension.cpp index 618fc35be06..61a0adb9fde 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -33,11 +33,11 @@ class RExtensionType : public arrow::ExtensionType { // extension metadata when constructed. RExtensionType(const std::shared_ptr storage_type, std::string extension_name, std::string extension_metadata, - cpp11::environment r6_type_generator) + cpp11::environment r6_class) : arrow::ExtensionType(storage_type), extension_name_(extension_name), extension_metadata_(extension_metadata), - r6_type_generator_(r6_type_generator) {} + r6_class_(r6_class) {} std::string extension_name() const { return extension_name_; } @@ -53,13 +53,15 @@ class RExtensionType : public arrow::ExtensionType { std::shared_ptr Clone() const; + cpp11::environment R6Class() { return r6_class_; } + cpp11::environment to_r6(std::shared_ptr storage_type, const std::string& serialized_data) const; private: std::string extension_name_; std::string extension_metadata_; - cpp11::environment r6_type_generator_; + cpp11::environment r6_class_; }; bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { @@ -107,7 +109,7 @@ arrow::Result> RExtensionType::Deserialize( std::shared_ptr RExtensionType::Clone() const { return std::make_shared(storage_type(), extension_name_, - extension_metadata_, r6_type_generator_); + extension_metadata_, r6_class_); } cpp11::environment RExtensionType::to_r6(std::shared_ptr storage_type, @@ -117,7 +119,7 @@ cpp11::environment RExtensionType::to_r6(std::shared_ptr storag cpp11::writable::raws serialized_data_raw(serialized_data); cpp11::sexp result = make_extension_type(storage_type_r6, extension_name(), - serialized_data_raw, r6_type_generator_); + serialized_data_raw, r6_class_); return result; } @@ -125,11 +127,11 @@ cpp11::environment RExtensionType::to_r6(std::shared_ptr storag // [[arrow::export]] cpp11::sexp ExtensionType__initialize( const std::shared_ptr& storage_type, std::string extension_name, - cpp11::raws extension_metadata, cpp11::environment r6_type_generator) { - cpp11::function constructor(r6_type_generator["new"]); + cpp11::raws extension_metadata, cpp11::environment r6_class) { + cpp11::function constructor(r6_class["new"]); std::string metadata_string(extension_metadata.begin(), extension_metadata.end()); auto shared_ptr_ptr = new std::shared_ptr(new RExtensionType( - storage_type, extension_name, metadata_string, r6_type_generator)); + storage_type, extension_name, metadata_string, r6_class)); auto external_ptr = cpp11::external_pointer>(shared_ptr_ptr); return constructor(external_ptr); @@ -161,6 +163,12 @@ std::shared_ptr ExtensionType__MakeArray( return type->MakeArray(data); } +// [[arrow::export]] +cpp11::environment ExtensionType__r6_class(const std::shared_ptr& type) { + auto r_type = arrow::internal::checked_pointer_cast(type); + return r_type->R6Class(); +} + // [[arrow::export]] std::shared_ptr ExtensionArray__storage( const std::shared_ptr& array) { From 1d5db0f937b59781e23ed88738d7ab35ebc07eab Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 12:09:13 -0300 Subject: [PATCH 21/66] simplify C++--R6 interaction --- r/src/extension.cpp | 65 ++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/r/src/extension.cpp b/r/src/extension.cpp index 61a0adb9fde..3720ef0c066 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -24,13 +24,11 @@ #include // A wrapper around arrow::ExtensionType that allows R to register extension -// types whose Deserialize, ExtensionEquals, and Sersialize methods are -// in practice handled at the R level. +// types whose Deserialize, ExtensionEquals, and Serialize methods are +// in meanintfully handled at the R level. At the C++ level, the type is +// already serialized to minimize calls to R from C++. class RExtensionType : public arrow::ExtensionType { public: - - // An instance of RExtensionType already has a copy of its seraialized - // extension metadata when constructed. RExtensionType(const std::shared_ptr storage_type, std::string extension_name, std::string extension_metadata, cpp11::environment r6_class) @@ -51,12 +49,16 @@ class RExtensionType : public arrow::ExtensionType { std::string Serialize() const { return extension_metadata_; } - std::shared_ptr Clone() const; + std::unique_ptr Clone() const; + + cpp11::environment r6_class() { return r6_class_; } - cpp11::environment R6Class() { return r6_class_; } + cpp11::environment r6_instance(std::shared_ptr storage_type, + const std::string& serialized_data) const; - cpp11::environment to_r6(std::shared_ptr storage_type, - const std::string& serialized_data) const; + cpp11::environment r6_instance() const { + return r6_instance(storage_type(), Serialize()); + } private: std::string extension_name_; @@ -77,7 +79,7 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { // With any ambiguity, we need to materialize the R6 type and call its // ExtensionEquals method. - cpp11::environment instance = to_r6(storage_type(), Serialize()); + cpp11::environment instance = r6_instance(); cpp11::function instance_ExtensionEquals(instance[".ExtensionEquals"]); std::shared_ptr other_shared = @@ -91,37 +93,38 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { std::shared_ptr RExtensionType::MakeArray( std::shared_ptr data) const { std::shared_ptr new_data = data->Copy(); - new_data->type = Clone(); + std::unique_ptr cloned = Clone(); + new_data->type = std::shared_ptr(cloned.release()); return std::make_shared(new_data); } arrow::Result> RExtensionType::Deserialize( std::shared_ptr storage_type, const std::string& serialized_data) const { - try { - cpp11::environment result = to_r6(storage_type, serialized_data); - auto ptr = arrow::r::r6_to_pointer*>(result); - return *ptr; - } catch (std::exception& e) { - return arrow::Status::UnknownError(e.what()); - } + std::unique_ptr cloned = Clone(); + cloned->storage_type_ = storage_type; + cloned->extension_metadata_ = serialized_data; + return std::shared_ptr(cloned.release()); } -std::shared_ptr RExtensionType::Clone() const { - return std::make_shared(storage_type(), extension_name_, - extension_metadata_, r6_class_); +std::unique_ptr RExtensionType::Clone() const { + RExtensionType* ptr = new RExtensionType(storage_type(), extension_name_, + extension_metadata_, r6_class_); + return std::unique_ptr(ptr); } -cpp11::environment RExtensionType::to_r6(std::shared_ptr storage_type, +cpp11::environment RExtensionType::r6_instance(std::shared_ptr storage_type, const std::string& serialized_data) const { - cpp11::function make_extension_type(cpp11::package("arrow")["MakeExtensionType"]); - cpp11::sexp storage_type_r6 = cpp11::to_r6(storage_type); - cpp11::writable::raws serialized_data_raw(serialized_data); - - cpp11::sexp result = make_extension_type(storage_type_r6, extension_name(), - serialized_data_raw, r6_class_); - - return result; + // This is a version of to_r6<>() that is a more direct route to creating the object. + // This is done to avoid circular calls, since to_r6<>() has to go through + // ExtensionType$new(), which then calls back to C++ to get r6_class_ to then + // return the correct subclass. + std::unique_ptr cloned = Clone(); + cpp11::external_pointer> xp( + new std::shared_ptr(cloned.release())); + + cpp11::function r6_class_new(r6_class_["new"]); + return r6_class_new(xp); } // [[arrow::export]] @@ -166,7 +169,7 @@ std::shared_ptr ExtensionType__MakeArray( // [[arrow::export]] cpp11::environment ExtensionType__r6_class(const std::shared_ptr& type) { auto r_type = arrow::internal::checked_pointer_cast(type); - return r_type->R6Class(); + return r_type->r6_class(); } // [[arrow::export]] From 99eda35309fe151e55006a4b8f3c968a3dc452d3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 13:04:56 -0300 Subject: [PATCH 22/66] more clarifying and simplifying the C++--R6 interaction --- r/src/arrowExports.cpp | 2 +- r/src/extension.cpp | 39 +++++++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index bb9a23305e6..aa3208e8d0c 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4169,7 +4169,7 @@ extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){ // extension.cpp #if defined(ARROW_R_WITH_ARROW) -cpp11::sexp ExtensionType__initialize(const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_class); +cpp11::environment ExtensionType__initialize(const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_class); extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_class_sexp){ BEGIN_CPP11 arrow::r::Input&>::type storage_type(storage_type_sexp); diff --git a/r/src/extension.cpp b/r/src/extension.cpp index 3720ef0c066..84a329dee7b 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -67,7 +67,7 @@ class RExtensionType : public arrow::ExtensionType { }; bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { - // Avoid materializing the R6 type if at all possible, since this is slow + // Avoid materializing the R6 instance if at all possible, since this is slow // and in some cases not possible due to threading if (other.extension_name() != extension_name()) { return false; @@ -77,7 +77,7 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { return true; } - // With any ambiguity, we need to materialize the R6 type and call its + // With any ambiguity, we need to materialize the R6 instance and call its // ExtensionEquals method. cpp11::environment instance = r6_instance(); cpp11::function instance_ExtensionEquals(instance[".ExtensionEquals"]); @@ -104,40 +104,45 @@ arrow::Result> RExtensionType::Deserialize( std::unique_ptr cloned = Clone(); cloned->storage_type_ = storage_type; cloned->extension_metadata_ = serialized_data; + + // Create an ephemeral R6 instance here, which will call the R6 instance's + // .Deserialize() method, possibly erroring when the metadata is invalid + // or the deserialized values are invalid. It might be possible to avoid + // this call but when there is an error it will be confusing, since it will + // only occur when the result surfaces to R (which might be much later). + cloned->r6_instance(); + return std::shared_ptr(cloned.release()); } std::unique_ptr RExtensionType::Clone() const { - RExtensionType* ptr = new RExtensionType(storage_type(), extension_name_, - extension_metadata_, r6_class_); + RExtensionType* ptr = + new RExtensionType(storage_type(), extension_name_, extension_metadata_, r6_class_); return std::unique_ptr(ptr); } -cpp11::environment RExtensionType::r6_instance(std::shared_ptr storage_type, - const std::string& serialized_data) const { +cpp11::environment RExtensionType::r6_instance( + std::shared_ptr storage_type, + const std::string& serialized_data) const { // This is a version of to_r6<>() that is a more direct route to creating the object. // This is done to avoid circular calls, since to_r6<>() has to go through // ExtensionType$new(), which then calls back to C++ to get r6_class_ to then // return the correct subclass. std::unique_ptr cloned = Clone(); cpp11::external_pointer> xp( - new std::shared_ptr(cloned.release())); + new std::shared_ptr(cloned.release())); cpp11::function r6_class_new(r6_class_["new"]); return r6_class_new(xp); } // [[arrow::export]] -cpp11::sexp ExtensionType__initialize( +cpp11::environment ExtensionType__initialize( const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_class) { - cpp11::function constructor(r6_class["new"]); std::string metadata_string(extension_metadata.begin(), extension_metadata.end()); - auto shared_ptr_ptr = new std::shared_ptr(new RExtensionType( - storage_type, extension_name, metadata_string, r6_class)); - auto external_ptr = - cpp11::external_pointer>(shared_ptr_ptr); - return constructor(external_ptr); + RExtensionType cpp_type(storage_type, extension_name, metadata_string, r6_class); + return cpp_type.r6_instance(); } // [[arrow::export]] @@ -167,8 +172,10 @@ std::shared_ptr ExtensionType__MakeArray( } // [[arrow::export]] -cpp11::environment ExtensionType__r6_class(const std::shared_ptr& type) { - auto r_type = arrow::internal::checked_pointer_cast(type); +cpp11::environment ExtensionType__r6_class( + const std::shared_ptr& type) { + auto r_type = + arrow::internal::checked_pointer_cast(type); return r_type->r6_class(); } From 10ed09bcc75d285dadd48289aad053327d09eae8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 13:13:19 -0300 Subject: [PATCH 23/66] more documenting --- r/R/extension.R | 6 ++++-- r/src/extension.cpp | 9 +++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index e4c057f523f..59cc160b789 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -90,7 +90,9 @@ ExtensionType <- R6Class("ExtensionType", self$MakeArray(array$data()) }, - ToString = function() { + # ExtensionType subclasses can reimplement the following methods: + + .ToString = function() { # metadata is probably valid UTF-8 (e.g., JSON), but might not be # and it's confusing to error when printing the object. This herustic # isn't perfect (but subclasses should override this method anyway) @@ -215,7 +217,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", private$.ptype }, - ToString = function() { + .ToString = function() { tf <- tempfile() on.exit(unlink(tf)) sink(tf) diff --git a/r/src/extension.cpp b/r/src/extension.cpp index 84a329dee7b..68bb91e22ae 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -49,6 +49,8 @@ class RExtensionType : public arrow::ExtensionType { std::string Serialize() const { return extension_metadata_; } + std::string ToString() const; + std::unique_ptr Clone() const; cpp11::environment r6_class() { return r6_class_; } @@ -115,6 +117,13 @@ arrow::Result> RExtensionType::Deserialize( return std::shared_ptr(cloned.release()); } +std::string RExtensionType::ToString() const { + cpp11::environment instance = r6_instance(); + cpp11::function instance_ToString(instance[".ToString"]); + cpp11::sexp result = instance_ToString(); + return cpp11::as_cpp(result); +} + std::unique_ptr RExtensionType::Clone() const { RExtensionType* ptr = new RExtensionType(storage_type(), extension_name_, extension_metadata_, r6_class_); From 3e4df799d5254fe5f710bf550d6ffb51d43310ff Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 13:53:41 -0300 Subject: [PATCH 24/66] more documentation and renaming methods --- r/NAMESPACE | 1 + r/R/arrow-package.R | 2 +- r/R/extension.R | 145 ++++++++++++++++++++++-------- r/man/ExtensionArray.Rd | 23 +++++ r/man/ExtensionType.Rd | 52 +++++++++++ r/tests/testthat/test-extension.R | 18 ++-- 6 files changed, 193 insertions(+), 48 deletions(-) create mode 100644 r/man/ExtensionArray.Rd create mode 100644 r/man/ExtensionType.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 7cb89b0a53a..f1916f0e020 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -134,6 +134,7 @@ export(DictionaryArray) export(DirectoryPartitioning) export(DirectoryPartitioningFactory) export(Expression) +export(ExtensionType) export(FeatherReader) export(Field) export(FileFormat) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index cd1ef575c3d..4ee5db8fd9f 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -82,7 +82,7 @@ if (arrow_available()) { # register extension types that we use internally - ReRegisterExtensionType(vctrs_extension_type(vctrs::unspecified())) + reregister_extension_type(vctrs_extension_type(vctrs::unspecified())) } invisible() diff --git a/r/R/extension.R b/r/R/extension.R index 59cc160b789..b82118b0c36 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -15,7 +15,26 @@ # specific language governing permissions and limitations # under the License. - +#' @include arrow-package.R +#' @title class arrow::ExtensionArray +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' The `ExtensionArray` class inherits from `Array`, but also provides +#' access to the underlying storage of the extension. +#' +#' - `$storage()`: Returns the underlying [Array] used to store +#' values. +#' +#' The `ExtensionArray` is not intended to be subclassed for extension +#' types. +#' +#' @rdname ExtensionArray +#' @name ExtensionArray ExtensionArray <- R6Class("ExtensionArray", inherit = Array, public = list( @@ -39,6 +58,55 @@ ExtensionArray$create <- function(x, type) { type$WrapArray(storage) } +#' @include arrow-package.R +#' @title class arrow::ExtensionType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' The `ExtensionType` class inherits from `DataType`, but also defines +#' extra methods specific to extension types: +#' +#' - `$storage_type()`: Returns the underlying [DataType] used to store +#' values. +#' - `$storage_id()`: Returns the [Type] identifier corresponding to the +#' `$storage_type()`. +#' - `$extension_name()`: Returns the extension name. +#' - `$Serialize()`: Returns the serialized version of the extension metadata +#' as a [raw()] vector. +#' - `$WrapArray(array)`: Wraps a storage [Array] into an [ExtensionArray] +#' with this extension type. +#' +#' In addition, subclasses may override the following methos to customize +#' the behaviour of extension classes. +#' +#' - `$.Deserialize(storage_type, extension_name, extension_metadata)` +#' This method is called when a new [ExtensionType] +#' is initialized and is responsible for parsing and validating +#' the serialized `extension_metadata` (a [raw()] vector) +#' such that its contents can be inspected by fields and/or methods +#' of the R6 ExtensionType subclass. Implementations must also check the +#' `storage_type` to make sure it is compatible with the extension type. +#' - `$.array_as_vector(extension_array)`: Convert an [Array] to an R +#' vector. This method is called by [as.vector()] on [ExtensionArray] +#' objects or when a [RecordBatch] containing an [ExtensionArray] is +#' converted to a [data.frame()]. The default method returns the converted +#' storage array. +#' - `$.chunked_array_as_vector(chunked_array)`: Convert a [ChunkedArray] +#' to an R vector. This method is called by [as.vector()] on a [ChunkedArray] +#' whose type matches this extension type or when a [Table] containing +#' such a column is converted to a [data.frame()]. The default method +#' returns the converted version of the equivalent storage arrays +#' as a [ChunkedArray]. +#' - `$.ToString()` Return a string representation that will be printed +#' to the console when this type or an Array of this type is printed. +#' +#' @rdname ExtensionType +#' @name ExtensionType +#' @export ExtensionType <- R6Class("ExtensionType", inherit = DataType, public = list( @@ -90,7 +158,29 @@ ExtensionType <- R6Class("ExtensionType", self$MakeArray(array$data()) }, - # ExtensionType subclasses can reimplement the following methods: + .Deserialize = function(storage_type, extension_name, extension_metadata) { + # Do nothing by default but allow other classes to override this method + # to populate R6 class members. + }, + + .ExtensionEquals = function(other) { + inherits(other, "ExtensionType") && + identical(other$extension_name(), self$extension_name()) && + identical(other$Serialize(), self$Serialize()) + }, + + .array_as_vector = function(extension_array) { + extension_array$storage()$as_vector() + }, + + .chunked_array_as_vector = function(chunked_array) { + storage_arrays <- lapply( + seq_len(chunked_array$num_chunks) - 1L, + function(i) chunked_array$chunk(i)$storage() + ) + storage <- chunked_array(!!! storage_arrays, type = self$storage_type()) + storage$as_vector() + }, .ToString = function() { # metadata is probably valid UTF-8 (e.g., JSON), but might not be @@ -118,35 +208,10 @@ ExtensionType <- R6Class("ExtensionType", Encoding(metadata_utf8) <- "UTF-8" paste0(class(self)[1], " <", metadata_utf8, ">") } - }, - - .Deserialize = function(storage_type, extension_name, extension_metadata) { - # Do nothing by default but allow other classes to override this method - # to populate R6 class members. - }, - - .ExtensionEquals = function(other) { - inherits(other, "ExtensionType") && - identical(other$extension_name(), self$extension_name()) && - identical(other$Serialize(), self$Serialize()) - }, - - .chunked_array_as_vector = function(chunked_array) { - storage_arrays <- lapply( - seq_len(chunked_array$num_chunks) - 1L, - function(i) chunked_array$chunk(i)$storage() - ) - storage <- chunked_array(!!! storage_arrays, type = self$storage_type()) - storage$as_vector() - }, - - .array_as_vector = function(extension_array) { - extension_array$storage()$as_vector() } ) ) - # ExtensionType$new() is what gets used by the generated wrapper code to # create an R6 object when a shared_ptr is returned to R and # that object has type_id() EXTENSION_TYPE. Rather than add complexity @@ -165,10 +230,10 @@ ExtensionType$new <- function(xp) { } -MakeExtensionType <- function(storage_type, - extension_name, - extension_metadata, - type_class = ExtensionType) { +new_extension_type <- function(storage_type, + extension_name, + extension_metadata, + type_class = ExtensionType) { assert_that(is.string(extension_name), is.raw(extension_metadata)) assert_is(storage_type, "DataType") assert_is(type_class, "R6ClassGenerator") @@ -181,24 +246,28 @@ MakeExtensionType <- function(storage_type, ) } -RegisterExtensionType <- function(extension_type) { +new_extension_array <- function(storage_array, extension_type) { + ExtensionArray$create(storage_array, extension_type) +} + +register_extension_type <- function(extension_type) { assert_is(extension_type, "ExtensionType") arrow__RegisterRExtensionType(extension_type) extension_type_registry[[extension_type$extension_name()]] <- extension_type invisible(extension_type) } -ReRegisterExtensionType <- function(extension_type) { +reregister_extension_type <- function(extension_type) { tryCatch( - RegisterExtensionType(extension_type), + register_extension_type(extension_type), error = function(e) { - UnregisterExtensionType(extension_type$extension_name()) - RegisterExtensionType(extension_type) + unregister_extension_type(extension_type$extension_name()) + register_extension_type(extension_type) } ) } -UnregisterExtensionType <- function(extension_name) { +unregister_extension_type <- function(extension_name) { arrow__UnregisterRExtensionType(extension_name) result <- extension_type_registry[[extension_name]] if (!is.null(result)) { @@ -262,7 +331,7 @@ vctrs_extension_type <- function(ptype, storage_type = type(vctrs::vec_data(ptype))) { ptype <- vctrs::vec_ptype(ptype) - MakeExtensionType( + new_extension_type( storage_type = storage_type, extension_name = "arrow.r.vctrs", extension_metadata = serialize(ptype, NULL), diff --git a/r/man/ExtensionArray.Rd b/r/man/ExtensionArray.Rd new file mode 100644 index 00000000000..84a63c9bb94 --- /dev/null +++ b/r/man/ExtensionArray.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extension.R +\docType{class} +\name{ExtensionArray} +\alias{ExtensionArray} +\title{class arrow::ExtensionArray} +\description{ +class arrow::ExtensionArray +} +\section{Methods}{ + + +The \code{ExtensionArray} class inherits from \code{Array}, but also provides +access to the underlying storage of the extension. +\itemize{ +\item \verb{$storage()}: Returns the underlying \link{Array} used to store +values. +} + +The \code{ExtensionArray} is not intended to be subclassed for extension +types. +} + diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd new file mode 100644 index 00000000000..a11ec812b43 --- /dev/null +++ b/r/man/ExtensionType.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extension.R +\docType{class} +\name{ExtensionType} +\alias{ExtensionType} +\title{class arrow::ExtensionType} +\description{ +class arrow::ExtensionType +} +\section{Methods}{ + + +The \code{ExtensionType} class inherits from \code{DataType}, but also defines +extra methods specific to extension types: +\itemize{ +\item \verb{$storage_type()}: Returns the underlying \link{DataType} used to store +values. +\item \verb{$storage_id()}: Returns the \link{Type} identifier corresponding to the +\verb{$storage_type()}. +\item \verb{$extension_name()}: Returns the extension name. +\item \verb{$Serialize()}: Returns the serialized version of the extension metadata +as a \code{\link[=raw]{raw()}} vector. +\item \verb{$WrapArray(array)}: Wraps a storage \link{Array} into an \link{ExtensionArray} +with this extension type. +} + +In addition, subclasses may override the following methos to customize +the behaviour of extension classes. +\itemize{ +\item \verb{$.Deserialize(storage_type, extension_name, extension_metadata)} +This method is called when a new \link{ExtensionType} +is initialized and is responsible for parsing and validating +the serialized \code{extension_metadata} (a \code{\link[=raw]{raw()}} vector) +such that its contents can be inspected by fields and/or methods +of the R6 ExtensionType subclass. Implementations must also check the +\code{storage_type} to make sure it is compatible with the extension type. +\item \verb{$.array_as_vector(extension_array)}: Convert an \link{Array} to an R +vector. This method is called by \code{\link[=as.vector]{as.vector()}} on \link{ExtensionArray} +objects or when a \link{RecordBatch} containing an \link{ExtensionArray} is +converted to a \code{\link[=data.frame]{data.frame()}}. The default method returns the converted +storage array. +\item \verb{$.chunked_array_as_vector(chunked_array)}: Convert a \link{ChunkedArray} +to an R vector. This method is called by \code{\link[=as.vector]{as.vector()}} on a \link{ChunkedArray} +whose type matches this extension type or when a \link{Table} containing +such a column is converted to a \code{\link[=data.frame]{data.frame()}}. The default method +returns the converted version of the equivalent storage arrays +as a \link{ChunkedArray}. +\item \verb{$.ToString()} Return a string representation that will be printed +to the console when this type or an Array of this type is printed. +} +} + diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 790e11c4353..9b5453e67af 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -16,7 +16,7 @@ # under the License. test_that("extension types can be created", { - type <- MakeExtensionType( + type <- new_extension_type( int32(), "arrow_r.simple_extension", charToRaw("some custom metadata"), @@ -55,7 +55,7 @@ test_that("extension type subclasses work", { ) ) - type <- MakeExtensionType( + type <- new_extension_type( int32(), "some_extension_subclass", charToRaw("some custom metadata"), @@ -65,11 +65,11 @@ test_that("extension type subclasses work", { expect_r6_class(type, "SomeExtensionTypeSubclass") expect_identical(type$some_custom_method(), charToRaw("some ")) - RegisterExtensionType(type) + register_extension_type(type) # create a new type instance with storage/metadata not identical # to the registered type - type2 <- MakeExtensionType( + type2 <- new_extension_type( float64(), "some_extension_subclass", charToRaw("some other custom metadata"), @@ -89,7 +89,7 @@ test_that("extension type subclasses work", { array <- type3$WrapArray(Array$create(1:10)) expect_r6_class(array, "ExtensionArray") - expect_identical(UnregisterExtensionType("some_extension_subclass"), type) + expect_identical(unregister_extension_type("some_extension_subclass"), type) }) test_that("extension subclasses can override the ExtensionEquals method", { @@ -116,19 +116,19 @@ test_that("extension subclasses can override the ExtensionEquals method", { ) ) - type <- MakeExtensionType( + type <- new_extension_type( int32(), "some_extension_subclass", serialize(list(field1 = "value1", field2 = "value2"), NULL), type_class = SomeExtensionTypeSubclass ) - RegisterExtensionType(type) + register_extension_type(type) expect_true(type$.ExtensionEquals(type)) expect_true(type$Equals(type)) - type2 <- MakeExtensionType( + type2 <- new_extension_type( int32(), "some_extension_subclass", serialize(list(field2 = "value2", field1 = "value1"), NULL), @@ -138,7 +138,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { expect_true(type$.ExtensionEquals(type2)) expect_true(type$Equals(type2)) - UnregisterExtensionType("some_extension_subclass") + unregister_extension_type("some_extension_subclass") }) test_that("vctrs extension type works", { From 6731c362209169721a8298a04e38dd9c202d706e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 15:06:43 -0300 Subject: [PATCH 25/66] documentation for extension types and vctrs extension type --- r/NAMESPACE | 7 ++ r/R/extension.R | 136 ++++++++++++++++++++++++------ r/man/new_extension_type.Rd | 81 ++++++++++++++++++ r/man/vctrs_extension_array.Rd | 50 +++++++++++ r/tests/testthat/test-extension.R | 2 +- 5 files changed, 248 insertions(+), 28 deletions(-) create mode 100644 r/man/new_extension_type.Rd create mode 100644 r/man/vctrs_extension_array.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index f1916f0e020..2113d0a0375 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -268,6 +268,8 @@ export(match_arrow) export(matches) export(mmap_create) export(mmap_open) +export(new_extension_array) +export(new_extension_type) export(null) export(num_range) export(one_of) @@ -283,6 +285,8 @@ export(read_parquet) export(read_schema) export(read_tsv_arrow) export(record_batch) +export(register_extension_type) +export(reregister_extension_type) export(s3_bucket) export(schema) export(set_cpu_count) @@ -301,8 +305,11 @@ export(uint32) export(uint64) export(uint8) export(unify_schemas) +export(unregister_extension_type) export(utf8) export(value_counts) +export(vctrs_extension_array) +export(vctrs_extension_type) export(write_arrow) export(write_csv_arrow) export(write_dataset) diff --git a/r/R/extension.R b/r/R/extension.R index b82118b0c36..9f0376c41c8 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -221,15 +221,62 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType$.default_new <- ExtensionType$new ExtensionType$new <- function(xp) { super <- ExtensionType$.default_new(xp) - registered_type_instance <- extension_type_registry[[super$extension_name()]] - if (is.null(registered_type_instance)) { + r6_class <- super$r6_class() + if (identical(r6_class$classname, "ExtensionType")) { super } else { - super$r6_class()$new(xp) + r6_class$new(xp) } } - +#' Extension types +#' +#' Extension arrays are wrappers around regular Arrow [Array] objects +#' that provide some customized behaviour and/or storage. A common use-case +#' for extension types is to define a customized conversion between an +#' an Arrow [Array] and an R object when the default conversion is slow +#' or looses metadata important to the interpretation of values in the array. +#' For most types, the built-in +#' [vctrs extension type][vctrs_extension_type] is probably sufficient. +#' +#' These functions create, register, and unregister [ExtensionType] +#' and [ExtensionArray] objects. To use an extension type you will have to: +#' +#' - Define an [R6::R6Class] that inherits from [ExtensionType] and reimplement +#' one or more methods (e.g., `.Deserialize()`). +#' - Register a dummy instance of your extension type created using +#' [new_extension_type()] using [register_extension_type()]. +#' +#' If defining an extension type in an R package, you will probably want to +#' use [reregister_extension_type()] in that package's [.onLoad()] hook +#' since your package will probably get reloaded in the same R session +#' during its development and [register_extension_type()] will error if +#' called twice for the same `extension_name`. +#' +#' @param storage_type The [data type][data-type] of the underlying storage +#' array. +#' @param storage_array An [Array] object of the underlying storage. +#' @param extension_type An [ExtensionType] instance. +#' @param extension_name The extension name. This should be namespaced using +#' "dot" syntax (i.e., "some_package.some_type"). The namespace "arrow" +#' is reserved for extension types defined by the Apache Arrow libraries. +#' @param extension_metadata A [raw()] vector containing the serialized +#' version of the type. +#' @param type_class An [R6::R6Class] whose `$new()` class method will be +#' used to construct a new instance of the type. +#' +#' @return +#' - `new_extension_type()` returns an [ExtensionType] instance according +#' to the `type_class` specified. +#' - `new_extension_array()` returns an [ExtensionArray] whose `$type` +#' corresponds to `extension_type`. +#' - `register_extension_type()` and `reregister_extension_type()` return +#' `extension_type`, invisibly. +#' - `unregister_extension_type()` returns the previously registered +#' `extension_type` (invisibly) or `NULL` if no type was previously +#' registered. +#' @export +#' new_extension_type <- function(storage_type, extension_name, extension_metadata, @@ -246,17 +293,21 @@ new_extension_type <- function(storage_type, ) } +#' @rdname new_extension_type +#' @export new_extension_array <- function(storage_array, extension_type) { ExtensionArray$create(storage_array, extension_type) } +#' @rdname new_extension_type +#' @export register_extension_type <- function(extension_type) { assert_is(extension_type, "ExtensionType") arrow__RegisterRExtensionType(extension_type) - extension_type_registry[[extension_type$extension_name()]] <- extension_type - invisible(extension_type) } +#' @rdname new_extension_type +#' @export reregister_extension_type <- function(extension_type) { tryCatch( register_extension_type(extension_type), @@ -267,17 +318,12 @@ reregister_extension_type <- function(extension_type) { ) } +#' @rdname new_extension_type +#' @export unregister_extension_type <- function(extension_name) { arrow__UnregisterRExtensionType(extension_name) - result <- extension_type_registry[[extension_name]] - if (!is.null(result)) { - rm(list = extension_name, envir = extension_type_registry) - } - invisible(result) } -extension_type_registry <- new.env(parent = emptyenv()) - VctrsExtensionType <- R6Class("VctrsExtensionType", inherit = ExtensionType, @@ -288,10 +334,12 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", .ToString = function() { tf <- tempfile() - on.exit(unlink(tf)) sink(tf) + on.exit({ + sink(NULL) + unlink(tf) + }) print(self$ptype()) - sink(NULL) paste0(readLines(tf), collapse = "\n") }, @@ -327,18 +375,38 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", ) -vctrs_extension_type <- function(ptype, - storage_type = type(vctrs::vec_data(ptype))) { - ptype <- vctrs::vec_ptype(ptype) - - new_extension_type( - storage_type = storage_type, - extension_name = "arrow.r.vctrs", - extension_metadata = serialize(ptype, NULL), - type_class = VctrsExtensionType - ) -} - +#' Extension type for generic vectors +#' +#' Most common R vector types are converted automatically to a suitable +#' Arrow [data type][data-type] without the need for an extension type. For +#' vector types whose conversion is not suitably handled by default, you can +#' create a [vctrs_extension_array()], which passes [vctrs::vec_data()] to +#' `Array$create()` and calls [vctrs::vec_restore()] when the [Array] is +#' converted back into an R vector. +#' +#' @param x A vctr (i.e., [vctrs::vec_is()] returns `TRUE`). +#' @param ptype A [vctrs::vec_ptype()], which is usually a zero-length +#' version of the object with the appropriate attributes set. This value +#' will be serialized using [serialize()], so it should not refer to any +#' R object that can't be saved/reloaded. +#' @inheritParams new_extension_type +#' +#' @return +#' - `vctrs_extension_array()` returns an [ExtensionArray] instance with a +#' `vctrs_extension_type()`. +#' - `vctrs_extension_type()` returns an [ExtensionType] instance for the +#' extension name "arrow.r.vctrs". +#' @export +#' +#' @examplesIf arrow_available() +#' (array <- vctrs_extension_array(as.POSIXlt("2022-01-02 03:45", tz = "UTC"))) +#' array$type +#' as.vector(array) +#' +#' temp_feather <- tempfile() +#' write_feather(arrow_table(col = array), temp_feather) +#' read_feather(temp_feather) +#' unlink(temp_feather) vctrs_extension_array <- function(x, ptype = vctrs::vec_ptype(x), storage_type = NULL) { if (inherits(x, "ExtensionArray") && inherits(x$type, "VctrsExtensionType")) { @@ -350,3 +418,17 @@ vctrs_extension_array <- function(x, ptype = vctrs::vec_ptype(x), type <- vctrs_extension_type(ptype, storage$type) type$WrapArray(storage) } + +#' @rdname vctrs_extension_array +#' @export +vctrs_extension_type <- function(ptype, + storage_type = type(vctrs::vec_data(ptype))) { + ptype <- vctrs::vec_ptype(ptype) + + new_extension_type( + storage_type = storage_type, + extension_name = "arrow.r.vctrs", + extension_metadata = serialize(ptype, NULL), + type_class = VctrsExtensionType + ) +} diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd new file mode 100644 index 00000000000..fb11415269a --- /dev/null +++ b/r/man/new_extension_type.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extension.R +\name{new_extension_type} +\alias{new_extension_type} +\alias{new_extension_array} +\alias{register_extension_type} +\alias{reregister_extension_type} +\alias{unregister_extension_type} +\title{Extension types} +\usage{ +new_extension_type( + storage_type, + extension_name, + extension_metadata, + type_class = ExtensionType +) + +new_extension_array(storage_array, extension_type) + +register_extension_type(extension_type) + +reregister_extension_type(extension_type) + +unregister_extension_type(extension_name) +} +\arguments{ +\item{storage_type}{The \link[=data-type]{data type} of the underlying storage +array.} + +\item{extension_name}{The extension name. This should be namespaced using +"dot" syntax (i.e., "some_package.some_type"). The namespace "arrow" +is reserved for extension types defined by the Apache Arrow libraries.} + +\item{extension_metadata}{A \code{\link[=raw]{raw()}} vector containing the serialized +version of the type.} + +\item{type_class}{An \link[R6:R6Class]{R6::R6Class} whose \verb{$new()} class method will be +used to construct a new instance of the type.} + +\item{storage_array}{An \link{Array} object of the underlying storage.} + +\item{extension_type}{An \link{ExtensionType} instance.} +} +\value{ +\itemize{ +\item \code{new_extension_type()} returns an \link{ExtensionType} instance according +to the \code{type_class} specified. +\item \code{new_extension_array()} returns an \link{ExtensionArray} whose \verb{$type} +corresponds to \code{extension_type}. +\item \code{register_extension_type()} and \code{reregister_extension_type()} return +\code{extension_type}, invisibly. +\item \code{unregister_extension_type()} returns the previously registered +\code{extension_type} (invisibly) or \code{NULL} if no type was previously +registered. +} +} +\description{ +Extension arrays are wrappers around regular Arrow \link{Array} objects +that provide some customized behaviour and/or storage. A common use-case +for extension types is to define a customized conversion between an +an Arrow \link{Array} and an R object when the default conversion is slow +or looses metadata important to the interpretation of values in the array. +For most types, the built-in +\link[=vctrs_extension_type]{vctrs extension type} is probably sufficient. +} +\details{ +These functions create, register, and unregister \link{ExtensionType} +and \link{ExtensionArray} objects. To use an extension type you will have to: +\itemize{ +\item Define an \link[R6:R6Class]{R6::R6Class} that inherits from \link{ExtensionType} and reimplement +one or more methods (e.g., \code{.Deserialize()}). +\item Register a dummy instance of your extension type created using +\code{\link[=new_extension_type]{new_extension_type()}} using \code{\link[=register_extension_type]{register_extension_type()}}. +} + +If defining an extension type in an R package, you will probably want to +use \code{\link[=reregister_extension_type]{reregister_extension_type()}} in that package's \code{\link[=.onLoad]{.onLoad()}} hook +since your package will probably get reloaded in the same R session +during its development and \code{\link[=register_extension_type]{register_extension_type()}} will error if +called twice for the same \code{extension_name}. +} diff --git a/r/man/vctrs_extension_array.Rd b/r/man/vctrs_extension_array.Rd new file mode 100644 index 00000000000..19c3431bcf7 --- /dev/null +++ b/r/man/vctrs_extension_array.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extension.R +\name{vctrs_extension_array} +\alias{vctrs_extension_array} +\alias{vctrs_extension_type} +\title{Extension type for generic vectors} +\usage{ +vctrs_extension_array(x, ptype = vctrs::vec_ptype(x), storage_type = NULL) + +vctrs_extension_type(ptype, storage_type = type(vctrs::vec_data(ptype))) +} +\arguments{ +\item{x}{A vctr (i.e., \code{\link[vctrs:vec_assert]{vctrs::vec_is()}} returns \code{TRUE}).} + +\item{ptype}{A \code{\link[vctrs:vec_ptype]{vctrs::vec_ptype()}}, which is usually a zero-length +version of the object with the appropriate attributes set. This value +will be serialized using \code{\link[=serialize]{serialize()}}, so it should not refer to any +R object that can't be saved/reloaded.} + +\item{storage_type}{The \link[=data-type]{data type} of the underlying storage +array.} +} +\value{ +\itemize{ +\item \code{vctrs_extension_array()} returns an \link{ExtensionArray} instance with a +\code{vctrs_extension_type()}. +\item \code{vctrs_extension_type()} returns an \link{ExtensionType} instance for the +extension name "arrow.r.vctrs". +} +} +\description{ +Most common R vector types are converted automatically to a suitable +Arrow \link[=data-type]{data type} without the need for an extension type. For +vector types whose conversion is not suitably handled by default, you can +create a \code{\link[=vctrs_extension_array]{vctrs_extension_array()}}, which passes \code{\link[vctrs:vec_data]{vctrs::vec_data()}} to +\code{Array$create()} and calls \code{\link[vctrs:vec_proxy]{vctrs::vec_restore()}} when the \link{Array} is +converted back into an R vector. +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +(array <- vctrs_extension_array(as.POSIXlt("2022-01-02 03:45", tz = "UTC"))) +array$type +as.vector(array) + +temp_feather <- tempfile() +write_feather(arrow_table(col = array), temp_feather) +read_feather(temp_feather) +unlink(temp_feather) +\dontshow{\}) # examplesIf} +} diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 9b5453e67af..2fbd582f345 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -89,7 +89,7 @@ test_that("extension type subclasses work", { array <- type3$WrapArray(Array$create(1:10)) expect_r6_class(array, "ExtensionArray") - expect_identical(unregister_extension_type("some_extension_subclass"), type) + unregister_extension_type("some_extension_subclass") }) test_that("extension subclasses can override the ExtensionEquals method", { From 054211922a24398f99159ea9b24e7724bf27439c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 15:17:36 -0300 Subject: [PATCH 26/66] make sure new functions are in the pkgdown index --- r/_pkgdown.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index fcb7b2016ac..a50eec85481 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -144,6 +144,7 @@ reference: - buffer - read_message - concat_arrays + - ExtensionArray - title: Arrow data types and schema contents: - Schema @@ -156,6 +157,9 @@ reference: - DataType - DictionaryType - FixedWidthType + - new_extension_type + - vctrs_extension_type + - ExtensionType - title: Flight contents: - load_flight_server From 64c57311c3e9fc02eb8db27a2a7ab124c1cbd616 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 16 Mar 2022 16:30:24 -0300 Subject: [PATCH 27/66] add missing pkgdown reference section --- r/_pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index a50eec85481..c3810cdf099 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -145,6 +145,7 @@ reference: - read_message - concat_arrays - ExtensionArray + - vctrs_extension_array - title: Arrow data types and schema contents: - Schema From ca24c76ea72d7c0cd495b2d059b742e17ae17ffb Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 17 Mar 2022 09:12:43 -0300 Subject: [PATCH 28/66] correct the docs --- r/R/extension.R | 22 +++++++++++++--------- r/man/new_extension_type.Rd | 19 ++++++++++++------- r/man/vctrs_extension_array.Rd | 2 +- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 9f0376c41c8..30cff305658 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -244,14 +244,22 @@ ExtensionType$new <- function(xp) { #' #' - Define an [R6::R6Class] that inherits from [ExtensionType] and reimplement #' one or more methods (e.g., `.Deserialize()`). +#' - Make a type constructor function (e.g., `my_extension_type()`) that calls +#' [new_extension_type()] to create an R6 instance that can be used as a +#' [data type][data-type] elsewhere in the package. +#' - Make an array constructor function (e.g., `my_extension_array()`) that +#' calls [new_extension_array()] to create an [Array] instance of your +#' extension type. #' - Register a dummy instance of your extension type created using -#' [new_extension_type()] using [register_extension_type()]. +#' you constructor function using [register_extension_type()]. #' #' If defining an extension type in an R package, you will probably want to #' use [reregister_extension_type()] in that package's [.onLoad()] hook #' since your package will probably get reloaded in the same R session #' during its development and [register_extension_type()] will error if -#' called twice for the same `extension_name`. +#' called twice for the same `extension_name`. For an example of an +#' extension type that uses most of these features, see +#' [vctrs_extension_type()]. #' #' @param storage_type The [data type][data-type] of the underlying storage #' array. @@ -270,13 +278,9 @@ ExtensionType$new <- function(xp) { #' to the `type_class` specified. #' - `new_extension_array()` returns an [ExtensionArray] whose `$type` #' corresponds to `extension_type`. -#' - `register_extension_type()` and `reregister_extension_type()` return -#' `extension_type`, invisibly. -#' - `unregister_extension_type()` returns the previously registered -#' `extension_type` (invisibly) or `NULL` if no type was previously -#' registered. +#' - `register_extension_type()`, `unregister_extension_type()` +#' and `reregister_extension_type()` return `NULL`, invisibly. #' @export -#' new_extension_type <- function(storage_type, extension_name, extension_metadata, @@ -375,7 +379,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", ) -#' Extension type for generic vectors +#' Extension type for generic typed vectors #' #' Most common R vector types are converted automatically to a suitable #' Arrow [data type][data-type] without the need for an extension type. For diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index fb11415269a..70ed4d2a2d3 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -47,11 +47,8 @@ used to construct a new instance of the type.} to the \code{type_class} specified. \item \code{new_extension_array()} returns an \link{ExtensionArray} whose \verb{$type} corresponds to \code{extension_type}. -\item \code{register_extension_type()} and \code{reregister_extension_type()} return -\code{extension_type}, invisibly. -\item \code{unregister_extension_type()} returns the previously registered -\code{extension_type} (invisibly) or \code{NULL} if no type was previously -registered. +\item \code{register_extension_type()}, \code{unregister_extension_type()} +and \code{reregister_extension_type()} return \code{NULL}, invisibly. } } \description{ @@ -69,13 +66,21 @@ and \link{ExtensionArray} objects. To use an extension type you will have to: \itemize{ \item Define an \link[R6:R6Class]{R6::R6Class} that inherits from \link{ExtensionType} and reimplement one or more methods (e.g., \code{.Deserialize()}). +\item Make a type constructor function (e.g., \code{my_extension_type()}) that calls +\code{\link[=new_extension_type]{new_extension_type()}} to create an R6 instance that can be used as a +\link[=data-type]{data type} elsewhere in the package. +\item Make an array constructor function (e.g., \code{my_extension_array()}) that +calls \code{\link[=new_extension_array]{new_extension_array()}} to create an \link{Array} instance of your +extension type. \item Register a dummy instance of your extension type created using -\code{\link[=new_extension_type]{new_extension_type()}} using \code{\link[=register_extension_type]{register_extension_type()}}. +you constructor function using \code{\link[=register_extension_type]{register_extension_type()}}. } If defining an extension type in an R package, you will probably want to use \code{\link[=reregister_extension_type]{reregister_extension_type()}} in that package's \code{\link[=.onLoad]{.onLoad()}} hook since your package will probably get reloaded in the same R session during its development and \code{\link[=register_extension_type]{register_extension_type()}} will error if -called twice for the same \code{extension_name}. +called twice for the same \code{extension_name}. For an example of an +extension type that uses most of these features, see +\code{\link[=vctrs_extension_type]{vctrs_extension_type()}}. } diff --git a/r/man/vctrs_extension_array.Rd b/r/man/vctrs_extension_array.Rd index 19c3431bcf7..b80ce48dc2a 100644 --- a/r/man/vctrs_extension_array.Rd +++ b/r/man/vctrs_extension_array.Rd @@ -3,7 +3,7 @@ \name{vctrs_extension_array} \alias{vctrs_extension_array} \alias{vctrs_extension_type} -\title{Extension type for generic vectors} +\title{Extension type for generic typed vectors} \usage{ vctrs_extension_array(x, ptype = vctrs::vec_ptype(x), storage_type = NULL) From f2013590874774724746d57f14d8efeeb9b26b03 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 17 Mar 2022 09:30:52 -0300 Subject: [PATCH 29/66] add setfaulting test for dataset --- r/tests/testthat/test-extension.R | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 2fbd582f345..4536f113c82 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -274,3 +274,36 @@ test_that("Table can roundtrip extension types", { ) ) }) + +test_that("Dataset/arrow_dplyr_query can roundtrip extension types", { + tf <- tempfile() + on.exit(unlink(tf, recursive = TRUE)) + + df <- expand.grid( + number = 1:10, + letter = letters, + stringsAsFactors = FALSE, + KEEP.OUT.ATTRS = FALSE + ) %>% + tibble::as_tibble() + + df$extension <- vctrs::new_vctr(df$letter, class = "arrow_custom_vctr") + + table <- arrow_table( + number = df$number, + letter = df$letter, + extension = vctrs_extension_array(df$extension) + ) + + table %>% + dplyr::group_by(number) %>% + write_dataset(tf) + + expect_identical( + open_dataset(tf) %>% + dplyr::select(number, letter, extension) %>% + dplyr::collect() %>% + dplyr::arrange(number, letter), + df + ) +}) From 59cdcf29b91ee8267f9b11387f919d227e7c3493 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 17 Mar 2022 10:12:50 -0300 Subject: [PATCH 30/66] with non-segfaulting dataset read! --- r/R/extension.R | 1 - r/src/extension.cpp | 68 ++++++++++++++++++++++++------- r/tests/testthat/test-extension.R | 12 +++--- 3 files changed, 58 insertions(+), 23 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 30cff305658..0ac66bae5c0 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -328,7 +328,6 @@ unregister_extension_type <- function(extension_name) { arrow__UnregisterRExtensionType(extension_name) } - VctrsExtensionType <- R6Class("VctrsExtensionType", inherit = ExtensionType, public = list( diff --git a/r/src/extension.cpp b/r/src/extension.cpp index 68bb91e22ae..e9642a18491 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -19,6 +19,8 @@ #if defined(ARROW_R_WITH_ARROW) +#include + #include #include #include @@ -27,15 +29,22 @@ // types whose Deserialize, ExtensionEquals, and Serialize methods are // in meanintfully handled at the R level. At the C++ level, the type is // already serialized to minimize calls to R from C++. +// +// Using a std::shared_ptr<> to wrap a cpp11::sexp type is unusual, but we +// need it here to avoid calling the copy constructor from another thread, +// since this might call into the R API. If we don't do this, we get crashes +// when reading a multi-file Dataset. class RExtensionType : public arrow::ExtensionType { public: RExtensionType(const std::shared_ptr storage_type, std::string extension_name, std::string extension_metadata, - cpp11::environment r6_class) + std::shared_ptr r6_class, + std::thread::id creation_thread) : arrow::ExtensionType(storage_type), extension_name_(extension_name), extension_metadata_(extension_metadata), - r6_class_(r6_class) {} + r6_class_(r6_class), + creation_thread_(creation_thread) {} std::string extension_name() const { return extension_name_; } @@ -53,7 +62,7 @@ class RExtensionType : public arrow::ExtensionType { std::unique_ptr Clone() const; - cpp11::environment r6_class() { return r6_class_; } + cpp11::environment r6_class() const { return *r6_class_; } cpp11::environment r6_instance(std::shared_ptr storage_type, const std::string& serialized_data) const; @@ -65,7 +74,19 @@ class RExtensionType : public arrow::ExtensionType { private: std::string extension_name_; std::string extension_metadata_; - cpp11::environment r6_class_; + std::string cached_to_string_; + std::shared_ptr r6_class_; + std::thread::id creation_thread_; + + arrow::Status assert_r_thread() const { + if (std::this_thread::get_id() == creation_thread_) { + return arrow::Status::OK(); + } else { + return arrow::Status::ExecutionError("RExtensionType <", extension_name_, + "> attempted to call into R ", + "from a non-R thread"); + } + } }; bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { @@ -80,7 +101,12 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { } // With any ambiguity, we need to materialize the R6 instance and call its - // ExtensionEquals method. + // ExtensionEquals method. We can't do this on the non-R thread. + arrow::Status is_r_thread = assert_r_thread(); + if (!assert_r_thread().ok()) { + throw std::runtime_error(is_r_thread.message()); + } + cpp11::environment instance = r6_instance(); cpp11::function instance_ExtensionEquals(instance[".ExtensionEquals"]); @@ -107,17 +133,27 @@ arrow::Result> RExtensionType::Deserialize( cloned->storage_type_ = storage_type; cloned->extension_metadata_ = serialized_data; - // Create an ephemeral R6 instance here, which will call the R6 instance's - // .Deserialize() method, possibly erroring when the metadata is invalid - // or the deserialized values are invalid. It might be possible to avoid - // this call but when there is an error it will be confusing, since it will - // only occur when the result surfaces to R (which might be much later). - cloned->r6_instance(); + // We probably should create an ephemeral R6 instance here, which will call + // the R6 instance's .Deserialize() method, possibly erroring when the metadata is + // invalid or the deserialized values are invalid. When there is an error it will be + // confusing, since it will only occur when the result surfaces to R + // (which might be much later). Unfortunately, the Deserialize() method gets + // called from other threads frequently (e.g., when reading a multi-file Dataset), + // and we get crashes if we try this. As a compromise, we call this method when we can + // to maximize the likelihood an error is surfaced. + if (assert_r_thread().ok()) { + cloned->r6_instance(); + } return std::shared_ptr(cloned.release()); } std::string RExtensionType::ToString() const { + // In case this gets called from another thread + if (!assert_r_thread().ok()) { + return ExtensionType::ToString(); + } + cpp11::environment instance = r6_instance(); cpp11::function instance_ToString(instance[".ToString"]); cpp11::sexp result = instance_ToString(); @@ -125,8 +161,8 @@ std::string RExtensionType::ToString() const { } std::unique_ptr RExtensionType::Clone() const { - RExtensionType* ptr = - new RExtensionType(storage_type(), extension_name_, extension_metadata_, r6_class_); + RExtensionType* ptr = new RExtensionType( + storage_type(), extension_name_, extension_metadata_, r6_class_, creation_thread_); return std::unique_ptr(ptr); } @@ -141,7 +177,7 @@ cpp11::environment RExtensionType::r6_instance( cpp11::external_pointer> xp( new std::shared_ptr(cloned.release())); - cpp11::function r6_class_new(r6_class_["new"]); + cpp11::function r6_class_new(r6_class()["new"]); return r6_class_new(xp); } @@ -150,7 +186,9 @@ cpp11::environment ExtensionType__initialize( const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_class) { std::string metadata_string(extension_metadata.begin(), extension_metadata.end()); - RExtensionType cpp_type(storage_type, extension_name, metadata_string, r6_class); + auto r6_class_shared = std::make_shared(r6_class); + RExtensionType cpp_type(storage_type, extension_name, metadata_string, r6_class_shared, + std::this_thread::get_id()); return cpp_type.r6_instance(); } diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 4536f113c82..94a8212feb6 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -299,11 +299,9 @@ test_that("Dataset/arrow_dplyr_query can roundtrip extension types", { dplyr::group_by(number) %>% write_dataset(tf) - expect_identical( - open_dataset(tf) %>% - dplyr::select(number, letter, extension) %>% - dplyr::collect() %>% - dplyr::arrange(number, letter), - df - ) + roundtripped <- open_dataset(tf) %>% + dplyr::select(number, letter, extension) %>% + dplyr::collect() + + expect_identical(unclass(roundtripped$extension), roundtripped$letter) }) From ebebdb4a2a04169e423a075402665723f71d6d5a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 17 Mar 2022 11:15:15 -0300 Subject: [PATCH 31/66] properly fence Dataset test --- r/tests/testthat/test-extension.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 94a8212feb6..81e1f14194c 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -276,6 +276,8 @@ test_that("Table can roundtrip extension types", { }) test_that("Dataset/arrow_dplyr_query can roundtrip extension types", { + skip_if_not_available("dataset") + tf <- tempfile() on.exit(unlink(tf, recursive = TRUE)) From 68a78e1c13dd88b5a2bac7ea7ec65e94bde0b71d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 21 Mar 2022 09:26:21 -0300 Subject: [PATCH 32/66] Update r/R/arrow-package.R Co-authored-by: Jonathan Keane --- r/R/arrow-package.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 4ee5db8fd9f..427fda0165b 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -113,7 +113,6 @@ } } - #' Is the C++ Arrow library available? #' #' You won't generally need to call these function, but they're made available From e4ec12eaed9202a492f0cb64633b4b9cfda26ffd Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 21 Mar 2022 09:26:59 -0300 Subject: [PATCH 33/66] Update r/R/arrow-tabular.R Co-authored-by: Jonathan Keane --- r/R/arrow-tabular.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index 6bd3feb992d..f31b4c3ea94 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -106,6 +106,7 @@ tabular_as_data_frame_common <- function(x, base) { logical(1) ) + # If no columns are ExtensionTypes, we use our standard constructor if (!any(col_is_extension)) { return(base(x, option_use_threads())) } From 151ead33f959dc6fab0458d6fb3357f043754e8b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 21 Mar 2022 16:42:03 -0300 Subject: [PATCH 34/66] Update r/R/extension.R Co-authored-by: Jonathan Keane --- r/R/extension.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/extension.R b/r/R/extension.R index 0ac66bae5c0..b43e3d0fe55 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -50,7 +50,7 @@ ExtensionArray <- R6Class("ExtensionArray", ExtensionArray$create <- function(x, type) { assert_is(type, "ExtensionType") - if (inheritx(x, "ExtensionArray") && type$Equals(x$type)) { + if (inherits(x, "ExtensionArray") && type$Equals(x$type)) { return(x) } From 7eeb63797239c20ba1a9f84b43c894f70b554daf Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 09:12:50 -0300 Subject: [PATCH 35/66] actually use new_extension_array()! --- r/R/extension.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/extension.R b/r/R/extension.R index b43e3d0fe55..efdf526c6fb 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -419,7 +419,7 @@ vctrs_extension_array <- function(x, ptype = vctrs::vec_ptype(x), vctrs::vec_assert(x) storage <- Array$create(vctrs::vec_data(x), type = storage_type) type <- vctrs_extension_type(ptype, storage$type) - type$WrapArray(storage) + new_extension_array(storage, type) } #' @rdname vctrs_extension_array From 8cc73c2e7c39c8f0161b9f879e18b040a4010ca5 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 09:35:15 -0300 Subject: [PATCH 36/66] fix roxygen include --- r/R/extension.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/extension.R b/r/R/extension.R index efdf526c6fb..2d0df7094f1 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -16,6 +16,8 @@ # under the License. #' @include arrow-package.R + + #' @title class arrow::ExtensionArray #' #' @usage NULL @@ -58,7 +60,6 @@ ExtensionArray$create <- function(x, type) { type$WrapArray(storage) } -#' @include arrow-package.R #' @title class arrow::ExtensionType #' #' @usage NULL From 9895d1cbedecda0cf4bddbc98a7295a66539f8b9 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 10:05:51 -0300 Subject: [PATCH 37/66] typo in C++ comment --- r/src/extension.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/extension.cpp b/r/src/extension.cpp index e9642a18491..657f30b3b0f 100644 --- a/r/src/extension.cpp +++ b/r/src/extension.cpp @@ -27,7 +27,7 @@ // A wrapper around arrow::ExtensionType that allows R to register extension // types whose Deserialize, ExtensionEquals, and Serialize methods are -// in meanintfully handled at the R level. At the C++ level, the type is +// in meaningfully handled at the R level. At the C++ level, the type is // already serialized to minimize calls to R from C++. // // Using a std::shared_ptr<> to wrap a cpp11::sexp type is unusual, but we From 1fdf6382be8effc652b99bb26d09d297e0b4802a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 10:42:45 -0300 Subject: [PATCH 38/66] better default method for .chunked_array_as_vector() --- r/R/extension.R | 26 +++++++++++++++++--------- r/man/ExtensionType.Rd | 4 ++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 2d0df7094f1..90aa28dd4c6 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -100,8 +100,8 @@ ExtensionArray$create <- function(x, type) { #' to an R vector. This method is called by [as.vector()] on a [ChunkedArray] #' whose type matches this extension type or when a [Table] containing #' such a column is converted to a [data.frame()]. The default method -#' returns the converted version of the equivalent storage arrays -#' as a [ChunkedArray]. +#' converts each array using `$.array_as_vector()` and concatenates them +#' using [vctrs::vec_c()]. #' - `$.ToString()` Return a string representation that will be printed #' to the console when this type or an Array of this type is printed. #' @@ -175,12 +175,15 @@ ExtensionType <- R6Class("ExtensionType", }, .chunked_array_as_vector = function(chunked_array) { - storage_arrays <- lapply( + # Converting one array at a time so that users don't have to remember + # to implement two methods. Converting all the storage arrays to + # a ChunkedArray is probably faster (VctrsExtensionType does this). + storage_vectors <- lapply( seq_len(chunked_array$num_chunks) - 1L, - function(i) chunked_array$chunk(i)$storage() + function(i) chunked_array$chunk(i)$as_vector() ) - storage <- chunked_array(!!! storage_arrays, type = self$storage_type()) - storage$as_vector() + + vctrs::vec_c(!!! storage_vectors) }, .ToString = function() { @@ -360,10 +363,15 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", }, .chunked_array_as_vector = function(chunked_array) { - vctrs::vec_restore( - super$.chunked_array_as_vector(chunked_array), - self$ptype() + # rather than convert one array at a time, use more Arrow + # machinery to convert the whole ChunkedArray at once + storage_arrays <- lapply( + seq_len(chunked_array$num_chunks) - 1L, + function(i) chunked_array$chunk(i)$storage() ) + storage <- chunked_array(!!! storage_arrays, type = self$storage_type()) + + vctrs::vec_restore(storage$as_vector(), self$ptype()) }, .array_as_vector = function(extension_array) { diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index a11ec812b43..3b39a436f16 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -43,8 +43,8 @@ storage array. to an R vector. This method is called by \code{\link[=as.vector]{as.vector()}} on a \link{ChunkedArray} whose type matches this extension type or when a \link{Table} containing such a column is converted to a \code{\link[=data.frame]{data.frame()}}. The default method -returns the converted version of the equivalent storage arrays -as a \link{ChunkedArray}. +converts each array using \verb{$.array_as_vector()} and concatenates them +using \code{\link[vctrs:vec_c]{vctrs::vec_c()}}. \item \verb{$.ToString()} Return a string representation that will be printed to the console when this type or an Array of this type is printed. } From 25a998092405a4d20f5c72aea9069112cd719f96 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 11:30:30 -0300 Subject: [PATCH 39/66] add example, export ExtensionArray, support ExtensionType$create() and ExtensionArray$create() for consistency. --- r/NAMESPACE | 1 + r/R/extension.R | 85 ++++++++++++++++++++++++++++++++----- r/man/new_extension_type.Rd | 45 +++++++++++++++++++- 3 files changed, 120 insertions(+), 11 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 2113d0a0375..f8aece152c0 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -134,6 +134,7 @@ export(DictionaryArray) export(DirectoryPartitioning) export(DirectoryPartitioningFactory) export(Expression) +export(ExtensionArray) export(ExtensionType) export(FeatherReader) export(Field) diff --git a/r/R/extension.R b/r/R/extension.R index 90aa28dd4c6..eed6788dbaf 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -37,6 +37,7 @@ #' #' @rdname ExtensionArray #' @name ExtensionArray +#' @export ExtensionArray <- R6Class("ExtensionArray", inherit = Array, public = list( @@ -145,6 +146,18 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__extension_name(self) }, + # For argument naming consistency + extension_metadata = function() { + self$Serialize() + }, + + # To make sure this conversion is done properly + extension_metadata_utf8 = function() { + metadata_utf8 <- rawToChar(self$Serialize()) + Encoding(metadata_utf8) <- "UTF-8" + metadata_utf8 + }, + Serialize = function() { ExtensionType__Serialize(self) }, @@ -177,7 +190,8 @@ ExtensionType <- R6Class("ExtensionType", .chunked_array_as_vector = function(chunked_array) { # Converting one array at a time so that users don't have to remember # to implement two methods. Converting all the storage arrays to - # a ChunkedArray is probably faster (VctrsExtensionType does this). + # a ChunkedArray and then converting is probably faster + # (VctrsExtensionType does this). storage_vectors <- lapply( seq_len(chunked_array$num_chunks) - 1L, function(i) chunked_array$chunk(i)$as_vector() @@ -208,9 +222,7 @@ ExtensionType <- R6Class("ExtensionType", } } else { - metadata_utf8 <- rawToChar(self$Serialize()) - Encoding(metadata_utf8) <- "UTF-8" - paste0(class(self)[1], " <", metadata_utf8, ">") + paste0(class(self)[1], " <", self$extension_metadata_utf8(), ">") } } ) @@ -233,6 +245,22 @@ ExtensionType$new <- function(xp) { } } +ExtensionType$create <- function(storage_type, + extension_name, + extension_metadata = raw(), + type_class = ExtensionType) { + assert_that(is.string(extension_name), is.raw(extension_metadata)) + assert_is(storage_type, "DataType") + assert_is(type_class, "R6ClassGenerator") + + ExtensionType__initialize( + storage_type, + extension_name, + extension_metadata, + type_class + ) +} + #' Extension types #' #' Extension arrays are wrappers around regular Arrow [Array] objects @@ -285,15 +313,52 @@ ExtensionType$new <- function(xp) { #' - `register_extension_type()`, `unregister_extension_type()` #' and `reregister_extension_type()` return `NULL`, invisibly. #' @export +#' +#' @examplesIf arrow_available() +#' # Create the R6 type whose methods control how Array objects are +#' # converted to R objects, how equality between types is computed, +#' # and how types are printed. +#' NamedString <- R6::R6Class( +#' "NamedString", +#' inherit = ExtensionType, +#' public = list( +#' .array_as_vector = function(extension_array) { +#' storage <- super$.array_as_vector(extension_array) +#' names(storage$values) <- storage$names +#' storage$values +#' } +#' ) +#' ) +#' +#' # Create a helper type constructor that calls new_extension_type() +#' named_string <- function() { +#' new_extension_type( +#' storage_type = struct(values = string(), names = string()), +#' extension_name = "arrow.example.named_string", +#' type_class = NamedString +#' ) +#' } +#' +#' # Create a helper array constructor that calls new_extension_array() +#' named_string_array <- function(values, name = names(values)) { +#' new_extension_array( +#' Array$create(data.frame(values = unname(values), names = name)), +#' named_string() +#' ) +#' } +#' +#' # Register the extension type so that Arrow knows what to do when +#' # it encounters this extension type +#' reregister_extension_type(named_string()) +#' +#' # Create Array objects and use them! +#' (array <- named_string_array(c(name1 = "value1", name2 = "value2"))) +#' as.vector(array) new_extension_type <- function(storage_type, extension_name, - extension_metadata, + extension_metadata = raw(), type_class = ExtensionType) { - assert_that(is.string(extension_name), is.raw(extension_metadata)) - assert_is(storage_type, "DataType") - assert_is(type_class, "R6ClassGenerator") - - ExtensionType__initialize( + ExtensionType$create( storage_type, extension_name, extension_metadata, diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 70ed4d2a2d3..5aa9d61ab7b 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -11,7 +11,7 @@ new_extension_type( storage_type, extension_name, - extension_metadata, + extension_metadata = raw(), type_class = ExtensionType ) @@ -84,3 +84,46 @@ called twice for the same \code{extension_name}. For an example of an extension type that uses most of these features, see \code{\link[=vctrs_extension_type]{vctrs_extension_type()}}. } +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Create the R6 type whose methods control how Array objects are +# converted to R objects, how equality between types is computed, +# and how types are printed. +NamedString <- R6::R6Class( + "NamedString", + inherit = ExtensionType, + public = list( + .array_as_vector = function(extension_array) { + storage <- super$.array_as_vector(extension_array) + names(storage$values) <- storage$names + storage$values + } + ) +) + +# Create a helper type constructor that calls new_extension_type() +named_string <- function() { + new_extension_type( + storage_type = struct(values = string(), names = string()), + extension_name = "arrow.example.named_string", + type_class = NamedString + ) +} + +# Create a helper array constructor that calls new_extension_array() +named_string_array <- function(values, name = names(values)) { + new_extension_array( + Array$create(data.frame(values = unname(values), names = name)), + named_string() + ) +} + +# Register the extension type so that Arrow knows what to do when +# it encounters this extension type +reregister_extension_type(named_string()) + +# Create Array objects and use them! +(array <- named_string_array(c(name1 = "value1", name2 = "value2"))) +as.vector(array) +\dontshow{\}) # examplesIf} +} From 7d3e87712861e86f22cf8c5ed59c13220e045030 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 13:32:55 -0300 Subject: [PATCH 40/66] allow strings for extension_metadata (but make sure they're UTF-8 encoded before conversion to raw) --- r/R/extension.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/r/R/extension.R b/r/R/extension.R index eed6788dbaf..4b361be2c94 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -249,6 +249,10 @@ ExtensionType$create <- function(storage_type, extension_name, extension_metadata = raw(), type_class = ExtensionType) { + if (is.string(extension_metadata)) { + extension_metadata <- charToRaw(enc2utf8(extension_metadata)) + } + assert_that(is.string(extension_name), is.raw(extension_metadata)) assert_is(storage_type, "DataType") assert_is(type_class, "R6ClassGenerator") From 184f8e807ccb3e713db25215c029dc9c002fc2fb Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 13:43:21 -0300 Subject: [PATCH 41/66] remove confusing methods --- r/R/extension.R | 31 +++++++++++-------------------- r/man/ExtensionType.Rd | 4 ++-- r/man/new_extension_type.Rd | 5 +++-- r/tests/testthat/test-extension.R | 6 +++--- 4 files changed, 19 insertions(+), 27 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 4b361be2c94..30e946420e6 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -77,8 +77,8 @@ ExtensionArray$create <- function(x, type) { #' - `$storage_id()`: Returns the [Type] identifier corresponding to the #' `$storage_type()`. #' - `$extension_name()`: Returns the extension name. -#' - `$Serialize()`: Returns the serialized version of the extension metadata -#' as a [raw()] vector. +#' - `$extension_metadata()`: Returns the serialized version of the extension +#' metadata as a [raw()] vector. #' - `$WrapArray(array)`: Wraps a storage [Array] into an [ExtensionArray] #' with this extension type. #' @@ -121,7 +121,7 @@ ExtensionType <- R6Class("ExtensionType", self$.Deserialize( self$storage_type(), self$extension_name(), - self$Serialize() + self$extension_metadata() ) }, @@ -146,30 +146,20 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__extension_name(self) }, - # For argument naming consistency extension_metadata = function() { - self$Serialize() + ExtensionType__Serialize(self) }, # To make sure this conversion is done properly extension_metadata_utf8 = function() { - metadata_utf8 <- rawToChar(self$Serialize()) + metadata_utf8 <- rawToChar(self$extension_metadata()) Encoding(metadata_utf8) <- "UTF-8" metadata_utf8 }, - Serialize = function() { - ExtensionType__Serialize(self) - }, - - MakeArray = function(data) { - assert_is(data, "ArrayData") - ExtensionType__MakeArray(self, data) - }, - WrapArray = function(array) { assert_is(array, "Array") - self$MakeArray(array$data()) + ExtensionType__MakeArray(self, array$data()) }, .Deserialize = function(storage_type, extension_name, extension_metadata) { @@ -180,7 +170,7 @@ ExtensionType <- R6Class("ExtensionType", .ExtensionEquals = function(other) { inherits(other, "ExtensionType") && identical(other$extension_name(), self$extension_name()) && - identical(other$Serialize(), self$Serialize()) + identical(other$extension_metadata(), self$extension_metadata()) }, .array_as_vector = function(extension_array) { @@ -204,7 +194,7 @@ ExtensionType <- R6Class("ExtensionType", # metadata is probably valid UTF-8 (e.g., JSON), but might not be # and it's confusing to error when printing the object. This herustic # isn't perfect (but subclasses should override this method anyway) - metadata_raw <- self$Serialize() + metadata_raw <- self$extension_metadata() if (as.raw(0x00) %in% metadata_raw) { if (length(metadata_raw) > 20) { @@ -304,8 +294,9 @@ ExtensionType$create <- function(storage_type, #' @param extension_name The extension name. This should be namespaced using #' "dot" syntax (i.e., "some_package.some_type"). The namespace "arrow" #' is reserved for extension types defined by the Apache Arrow libraries. -#' @param extension_metadata A [raw()] vector containing the serialized -#' version of the type. +#' @param extension_metadata A [raw()] or [character()] vector containing the +#' serialized version of the type. Chatacter vectors must be length 1 and +#' are converted to UTF-8 before converting to [raw()]. #' @param type_class An [R6::R6Class] whose `$new()` class method will be #' used to construct a new instance of the type. #' diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index 3b39a436f16..1711ad00127 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -18,8 +18,8 @@ values. \item \verb{$storage_id()}: Returns the \link{Type} identifier corresponding to the \verb{$storage_type()}. \item \verb{$extension_name()}: Returns the extension name. -\item \verb{$Serialize()}: Returns the serialized version of the extension metadata -as a \code{\link[=raw]{raw()}} vector. +\item \verb{$extension_metadata()}: Returns the serialized version of the extension +metadata as a \code{\link[=raw]{raw()}} vector. \item \verb{$WrapArray(array)}: Wraps a storage \link{Array} into an \link{ExtensionArray} with this extension type. } diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 5aa9d61ab7b..fe8f6a62423 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -31,8 +31,9 @@ array.} "dot" syntax (i.e., "some_package.some_type"). The namespace "arrow" is reserved for extension types defined by the Apache Arrow libraries.} -\item{extension_metadata}{A \code{\link[=raw]{raw()}} vector containing the serialized -version of the type.} +\item{extension_metadata}{A \code{\link[=raw]{raw()}} or \code{\link[=character]{character()}} vector containing the +serialized version of the type. Chatacter vectors must be length 1 and +are converted to UTF-8 before converting to \code{\link[=raw]{raw()}}.} \item{type_class}{An \link[R6:R6Class]{R6::R6Class} whose \verb{$new()} class method will be used to construct a new instance of the type.} diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 81e1f14194c..78a1fa67b8d 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -26,11 +26,11 @@ test_that("extension types can be created", { expect_identical(type$extension_name(), "arrow_r.simple_extension") expect_true(type$storage_type() == int32()) expect_identical(type$storage_id(), int32()$id) - expect_identical(type$Serialize(), charToRaw("some custom metadata")) + expect_identical(type$extension_metadata(), charToRaw("some custom metadata")) expect_identical(type$ToString(), "ExtensionType ") storage <- Array$create(1:10) - array <- type$MakeArray(storage$data()) + array <- type$WrapArray(storage) expect_r6_class(array, "ExtensionArray") expect_r6_class(array$type, "ExtensionType") @@ -83,7 +83,7 @@ test_that("extension type subclasses work", { expect_identical(type3$extension_name(), "some_extension_subclass") expect_identical(type3$some_custom_method(), type2$some_custom_method()) - expect_identical(type3$Serialize(), type2$Serialize()) + expect_identical(type3$extension_metadata(), type2$extension_metadata()) expect_true(type3$storage_type() == type2$storage_type()) array <- type3$WrapArray(Array$create(1:10)) From 30172b65a3696699a955180f9b98d5baaae42c79 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 13:50:05 -0300 Subject: [PATCH 42/66] actually test UTF-8 conversion and printing of non-utf-8 metadata --- r/tests/testthat/test-extension.R | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 78a1fa67b8d..f4772537c1b 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -92,6 +92,31 @@ test_that("extension type subclasses work", { unregister_extension_type("some_extension_subclass") }) +test_that("extension types can use UTF-8 for metadata", { + type <- new_extension_type( + int32(), + "arrow.test.simple_extension", + "\U0001f4a9\U0001f4a9\U0001f4a9\U0001f4a9" + ) + + expect_identical( + type$extension_metadata_utf8(), + "\U0001f4a9\U0001f4a9\U0001f4a9\U0001f4a9" + ) + + expect_match(type$ToString(), "\U0001f4a9{4}") +}) + +test_that("extension types can be printed that don't use UTF-8 for metadata", { + type <- new_extension_type( + int32(), + "arrow.test.simple_extension", + as.raw(0:5) + ) + + expect_match(type$ToString(), "00 01 02 03 04 05") +}) + test_that("extension subclasses can override the ExtensionEquals method", { SomeExtensionTypeSubclass <- R6Class( "SomeExtensionTypeSubclass", inherit = ExtensionType, From df25ad1a53c5de8bd70e4d4cb5914357defeba25 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 14:02:41 -0300 Subject: [PATCH 43/66] better example --- r/R/extension.R | 61 ++++++++++++++++++++++++++++--------- r/man/new_extension_type.Rd | 61 ++++++++++++++++++++++++++++--------- 2 files changed, 94 insertions(+), 28 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 30e946420e6..5a7696ac3ba 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -313,41 +313,74 @@ ExtensionType$create <- function(storage_type, #' # Create the R6 type whose methods control how Array objects are #' # converted to R objects, how equality between types is computed, #' # and how types are printed. -#' NamedString <- R6::R6Class( -#' "NamedString", +#' QuantizedType <- R6::R6Class( +#' "QuantizedType", #' inherit = ExtensionType, #' public = list( +#' # methods to access the custom metadata fields +#' center = function() private$.center, +#' scale = function() private$.scale, +#' +#' # called when an Array of this type is converted to an R vector #' .array_as_vector = function(extension_array) { -#' storage <- super$.array_as_vector(extension_array) -#' names(storage$values) <- storage$names -#' storage$values +#' unquantized_arrow <- +#' (extension_array$storage()$cast(float64()) / private$.scale) + +#' private$.center +#' +#' as.vector(unquantized_arrow) +#' }, +#' +#' # populate the custom metadata fields from the serialized metadata +#' .Deserialize = function(storage_type, extension_name, extension_metadata) { +#' vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) +#' private$.center <- vals[1] +#' private$.scale <- vals[2] #' } +#' ), +#' +#' private = list( +#' .center = NULL, +#' .scale = NULL #' ) #' ) #' #' # Create a helper type constructor that calls new_extension_type() -#' named_string <- function() { +#' quantized <- function(center = 0, scale = 1, storage_type = int32()) { #' new_extension_type( -#' storage_type = struct(values = string(), names = string()), -#' extension_name = "arrow.example.named_string", -#' type_class = NamedString +#' storage_type = storage_type, +#' extension_name = "arrow.example.quantized", +#' extension_metadata = paste(center, scale, sep = ";"), +#' type_class = QuantizedType #' ) #' } #' #' # Create a helper array constructor that calls new_extension_array() -#' named_string_array <- function(values, name = names(values)) { +#' quantized_array <- function(x, center = 0, scale = 1, +#' storage_type = int32()) { +#' type <- quantized(center, scale, storage_type) #' new_extension_array( -#' Array$create(data.frame(values = unname(values), names = name)), -#' named_string() +#' Array$create((x - center) * scale, type = storage_type), +#' type #' ) #' } #' #' # Register the extension type so that Arrow knows what to do when #' # it encounters this extension type -#' reregister_extension_type(named_string()) +#' reregister_extension_type(quantized()) #' #' # Create Array objects and use them! -#' (array <- named_string_array(c(name1 = "value1", name2 = "value2"))) +#' (vals <- runif(5, min = 19, max = 21)) +#' +#' (array <- quantized_array( +#' vals, +#' center = 20, +#' scale = 2 ^ 15 - 1, +#' storage_type = int16()) +#' ) +#' +#' array$type$center() +#' array$type$scale() +#' #' as.vector(array) new_extension_type <- function(storage_type, extension_name, diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index fe8f6a62423..7aaa4b907f2 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -90,41 +90,74 @@ extension type that uses most of these features, see # Create the R6 type whose methods control how Array objects are # converted to R objects, how equality between types is computed, # and how types are printed. -NamedString <- R6::R6Class( - "NamedString", +QuantizedType <- R6::R6Class( + "QuantizedType", inherit = ExtensionType, public = list( + # methods to access the custom metadata fields + center = function() private$.center, + scale = function() private$.scale, + + # called when an Array of this type is converted to an R vector .array_as_vector = function(extension_array) { - storage <- super$.array_as_vector(extension_array) - names(storage$values) <- storage$names - storage$values + unquantized_arrow <- + (extension_array$storage()$cast(float64()) / private$.scale) + + private$.center + + as.vector(unquantized_arrow) + }, + + # populate the custom metadata fields from the serialized metadata + .Deserialize = function(storage_type, extension_name, extension_metadata) { + vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) + private$.center <- vals[1] + private$.scale <- vals[2] } + ), + + private = list( + .center = NULL, + .scale = NULL ) ) # Create a helper type constructor that calls new_extension_type() -named_string <- function() { +quantized <- function(center = 0, scale = 1, storage_type = int32()) { new_extension_type( - storage_type = struct(values = string(), names = string()), - extension_name = "arrow.example.named_string", - type_class = NamedString + storage_type = storage_type, + extension_name = "arrow.example.quantized", + extension_metadata = paste(center, scale, sep = ";"), + type_class = QuantizedType ) } # Create a helper array constructor that calls new_extension_array() -named_string_array <- function(values, name = names(values)) { +quantized_array <- function(x, center = 0, scale = 1, + storage_type = int32()) { + type <- quantized(center, scale, storage_type) new_extension_array( - Array$create(data.frame(values = unname(values), names = name)), - named_string() + Array$create((x - center) * scale, type = storage_type), + type ) } # Register the extension type so that Arrow knows what to do when # it encounters this extension type -reregister_extension_type(named_string()) +reregister_extension_type(quantized()) # Create Array objects and use them! -(array <- named_string_array(c(name1 = "value1", name2 = "value2"))) +(vals <- runif(5, min = 19, max = 21)) + +(array <- quantized_array( + vals, + center = 20, + scale = 2 ^ 15 - 1, + storage_type = int16()) +) + +array$type$center() +array$type$scale() + as.vector(array) \dontshow{\}) # examplesIf} } From 26fb3636032d23a3d4dce03a2b37e47a75705f7e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 22 Mar 2022 14:06:40 -0300 Subject: [PATCH 44/66] Less redundant .Deserialize() method --- r/R/extension.R | 16 ++++++---------- r/man/ExtensionType.Rd | 2 +- r/man/new_extension_type.Rd | 2 +- r/tests/testthat/test-extension.R | 8 ++++---- 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 5a7696ac3ba..1c828e29dab 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -85,7 +85,7 @@ ExtensionArray$create <- function(x, type) { #' In addition, subclasses may override the following methos to customize #' the behaviour of extension classes. #' -#' - `$.Deserialize(storage_type, extension_name, extension_metadata)` +#' - `$.Deserialize()` #' This method is called when a new [ExtensionType] #' is initialized and is responsible for parsing and validating #' the serialized `extension_metadata` (a [raw()] vector) @@ -118,11 +118,7 @@ ExtensionType <- R6Class("ExtensionType", # be overridden to populate custom fields initialize = function(xp) { super$initialize(xp) - self$.Deserialize( - self$storage_type(), - self$extension_name(), - self$extension_metadata() - ) + self$.Deserialize() }, # Because of how C++ shared_ptr<> objects are converted to R objects, @@ -162,7 +158,7 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__MakeArray(self, array$data()) }, - .Deserialize = function(storage_type, extension_name, extension_metadata) { + .Deserialize = function() { # Do nothing by default but allow other classes to override this method # to populate R6 class members. }, @@ -331,7 +327,7 @@ ExtensionType$create <- function(storage_type, #' }, #' #' # populate the custom metadata fields from the serialized metadata -#' .Deserialize = function(storage_type, extension_name, extension_metadata) { +#' .Deserialize = function() { #' vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) #' private$.center <- vals[1] #' private$.scale <- vals[2] @@ -443,8 +439,8 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", paste0(readLines(tf), collapse = "\n") }, - .Deserialize = function(storage_type, extension_name, extension_metadata) { - private$.ptype <- unserialize(extension_metadata) + .Deserialize = function() { + private$.ptype <- unserialize(self$extension_metadata()) }, .ExtensionEquals = function(other) { diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index 1711ad00127..e42b8112f1d 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -27,7 +27,7 @@ with this extension type. In addition, subclasses may override the following methos to customize the behaviour of extension classes. \itemize{ -\item \verb{$.Deserialize(storage_type, extension_name, extension_metadata)} +\item \verb{$.Deserialize()} This method is called when a new \link{ExtensionType} is initialized and is responsible for parsing and validating the serialized \code{extension_metadata} (a \code{\link[=raw]{raw()}} vector) diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 7aaa4b907f2..267186f80b1 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -108,7 +108,7 @@ QuantizedType <- R6::R6Class( }, # populate the custom metadata fields from the serialized metadata - .Deserialize = function(storage_type, extension_name, extension_metadata) { + .Deserialize = function() { vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) private$.center <- vals[1] private$.scale <- vals[2] diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index f4772537c1b..29ec62ffd7b 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -46,8 +46,8 @@ test_that("extension type subclasses work", { private$some_custom_field }, - .Deserialize = function(storage_type, extension_name, extension_metadata) { - private$some_custom_field <- head(extension_metadata, 5) + .Deserialize = function() { + private$some_custom_field <- head(self$extension_metadata(), 5) } ), private = list( @@ -123,8 +123,8 @@ test_that("extension subclasses can override the ExtensionEquals method", { public = list( field_values = NULL, - .Deserialize = function(storage_type, extension_name, extension_metadata) { - self$field_values <- unserialize(extension_metadata) + .Deserialize = function() { + self$field_values <- unserialize(self$extension_metadata()) }, .ExtensionEquals = function(other) { From 685e00add7ce8349870096633bf72de9c83e9b8b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 13:56:40 -0300 Subject: [PATCH 45/66] try to fix windows test --- r/tests/testthat/test-extension.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 29ec62ffd7b..cb8c8158291 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -104,7 +104,7 @@ test_that("extension types can use UTF-8 for metadata", { "\U0001f4a9\U0001f4a9\U0001f4a9\U0001f4a9" ) - expect_match(type$ToString(), "\U0001f4a9{4}") + expect_match(type$ToString(), "\U0001f4a9", fixed = TRUE) }) test_that("extension types can be printed that don't use UTF-8 for metadata", { From 66772d3f50a7a0734fe99116947e2e042710f25e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 14:30:49 -0300 Subject: [PATCH 46/66] move extension type definition to extension.h so that the Convert method can be used elsewhere --- r/src/arrowExports.cpp | 18 ++--- r/src/{extension.cpp => extension-impl.cpp} | 78 ++++----------------- r/src/extension.h | 76 ++++++++++++++++++++ 3 files changed, 100 insertions(+), 72 deletions(-) rename r/src/{extension.cpp => extension-impl.cpp} (73%) create mode 100644 r/src/extension.h diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index aa3208e8d0c..c4271a19aaf 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4167,7 +4167,7 @@ extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){ } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) cpp11::environment ExtensionType__initialize(const std::shared_ptr& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_class); extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_class_sexp){ @@ -4185,7 +4185,7 @@ extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP ex } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) std::string ExtensionType__extension_name(const std::shared_ptr& type); extern "C" SEXP _arrow_ExtensionType__extension_name(SEXP type_sexp){ @@ -4200,7 +4200,7 @@ extern "C" SEXP _arrow_ExtensionType__extension_name(SEXP type_sexp){ } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) cpp11::raws ExtensionType__Serialize(const std::shared_ptr& type); extern "C" SEXP _arrow_ExtensionType__Serialize(SEXP type_sexp){ @@ -4215,7 +4215,7 @@ extern "C" SEXP _arrow_ExtensionType__Serialize(SEXP type_sexp){ } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr ExtensionType__storage_type(const std::shared_ptr& type); extern "C" SEXP _arrow_ExtensionType__storage_type(SEXP type_sexp){ @@ -4230,7 +4230,7 @@ extern "C" SEXP _arrow_ExtensionType__storage_type(SEXP type_sexp){ } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr ExtensionType__MakeArray(const std::shared_ptr& type, const std::shared_ptr& data); extern "C" SEXP _arrow_ExtensionType__MakeArray(SEXP type_sexp, SEXP data_sexp){ @@ -4246,7 +4246,7 @@ extern "C" SEXP _arrow_ExtensionType__MakeArray(SEXP type_sexp, SEXP data_sexp){ } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) cpp11::environment ExtensionType__r6_class(const std::shared_ptr& type); extern "C" SEXP _arrow_ExtensionType__r6_class(SEXP type_sexp){ @@ -4261,7 +4261,7 @@ extern "C" SEXP _arrow_ExtensionType__r6_class(SEXP type_sexp){ } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr ExtensionArray__storage(const std::shared_ptr& array); extern "C" SEXP _arrow_ExtensionArray__storage(SEXP array_sexp){ @@ -4276,7 +4276,7 @@ extern "C" SEXP _arrow_ExtensionArray__storage(SEXP array_sexp){ } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) void arrow__RegisterRExtensionType(const std::shared_ptr& type); extern "C" SEXP _arrow_arrow__RegisterRExtensionType(SEXP type_sexp){ @@ -4292,7 +4292,7 @@ extern "C" SEXP _arrow_arrow__RegisterRExtensionType(SEXP type_sexp){ } #endif -// extension.cpp +// extension-impl.cpp #if defined(ARROW_R_WITH_ARROW) void arrow__UnregisterRExtensionType(std::string type_name); extern "C" SEXP _arrow_arrow__UnregisterRExtensionType(SEXP type_name_sexp){ diff --git a/r/src/extension.cpp b/r/src/extension-impl.cpp similarity index 73% rename from r/src/extension.cpp rename to r/src/extension-impl.cpp index 657f30b3b0f..4b49e72a6a6 100644 --- a/r/src/extension.cpp +++ b/r/src/extension-impl.cpp @@ -25,69 +25,7 @@ #include #include -// A wrapper around arrow::ExtensionType that allows R to register extension -// types whose Deserialize, ExtensionEquals, and Serialize methods are -// in meaningfully handled at the R level. At the C++ level, the type is -// already serialized to minimize calls to R from C++. -// -// Using a std::shared_ptr<> to wrap a cpp11::sexp type is unusual, but we -// need it here to avoid calling the copy constructor from another thread, -// since this might call into the R API. If we don't do this, we get crashes -// when reading a multi-file Dataset. -class RExtensionType : public arrow::ExtensionType { - public: - RExtensionType(const std::shared_ptr storage_type, - std::string extension_name, std::string extension_metadata, - std::shared_ptr r6_class, - std::thread::id creation_thread) - : arrow::ExtensionType(storage_type), - extension_name_(extension_name), - extension_metadata_(extension_metadata), - r6_class_(r6_class), - creation_thread_(creation_thread) {} - - std::string extension_name() const { return extension_name_; } - - bool ExtensionEquals(const arrow::ExtensionType& other) const; - - std::shared_ptr MakeArray(std::shared_ptr data) const; - - arrow::Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized_data) const; - - std::string Serialize() const { return extension_metadata_; } - - std::string ToString() const; - - std::unique_ptr Clone() const; - - cpp11::environment r6_class() const { return *r6_class_; } - - cpp11::environment r6_instance(std::shared_ptr storage_type, - const std::string& serialized_data) const; - - cpp11::environment r6_instance() const { - return r6_instance(storage_type(), Serialize()); - } - - private: - std::string extension_name_; - std::string extension_metadata_; - std::string cached_to_string_; - std::shared_ptr r6_class_; - std::thread::id creation_thread_; - - arrow::Status assert_r_thread() const { - if (std::this_thread::get_id() == creation_thread_) { - return arrow::Status::OK(); - } else { - return arrow::Status::ExecutionError("RExtensionType <", extension_name_, - "> attempted to call into R ", - "from a non-R thread"); - } - } -}; +#include "extension.h" bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { // Avoid materializing the R6 instance if at all possible, since this is slow @@ -160,6 +98,20 @@ std::string RExtensionType::ToString() const { return cpp11::as_cpp(result); } +cpp11::sexp RExtensionType::Convert(const std::shared_ptr& array) { + cpp11::environment instance = r6_instance(); + cpp11::function instance_Convert(instance[".array_as_vector"]); + cpp11::sexp array_sexp = cpp11::to_r6(array, "ExtensionArray"); + return instance_Convert(array_sexp); +} + +cpp11::sexp RExtensionType::Convert(const std::shared_ptr& array) { + cpp11::environment instance = r6_instance(); + cpp11::function instance_Convert(instance[".chunked_array_as_vector"]); + cpp11::sexp array_sexp = cpp11::to_r6(array, "ChunkedArray"); + return instance_Convert(array_sexp); +} + std::unique_ptr RExtensionType::Clone() const { RExtensionType* ptr = new RExtensionType( storage_type(), extension_name_, extension_metadata_, r6_class_, creation_thread_); diff --git a/r/src/extension.h b/r/src/extension.h new file mode 100644 index 00000000000..e52c76a2847 --- /dev/null +++ b/r/src/extension.h @@ -0,0 +1,76 @@ + +#include "./arrow_types.h" + +#include + +#include +#include +#include + +// A wrapper around arrow::ExtensionType that allows R to register extension +// types whose Deserialize, ExtensionEquals, and Serialize methods are +// in meaningfully handled at the R level. At the C++ level, the type is +// already serialized to minimize calls to R from C++. +// +// Using a std::shared_ptr<> to wrap a cpp11::sexp type is unusual, but we +// need it here to avoid calling the copy constructor from another thread, +// since this might call into the R API. If we don't do this, we get crashes +// when reading a multi-file Dataset. +class RExtensionType : public arrow::ExtensionType { + public: + RExtensionType(const std::shared_ptr storage_type, + std::string extension_name, std::string extension_metadata, + std::shared_ptr r6_class, + std::thread::id creation_thread) + : arrow::ExtensionType(storage_type), + extension_name_(extension_name), + extension_metadata_(extension_metadata), + r6_class_(r6_class), + creation_thread_(creation_thread) {} + + std::string extension_name() const { return extension_name_; } + + bool ExtensionEquals(const arrow::ExtensionType& other) const; + + std::shared_ptr MakeArray(std::shared_ptr data) const; + + arrow::Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const; + + std::string Serialize() const { return extension_metadata_; } + + std::string ToString() const; + + cpp11::sexp Convert(const std::shared_ptr& array); + + cpp11::sexp Convert(const std::shared_ptr& array); + + std::unique_ptr Clone() const; + + cpp11::environment r6_class() const { return *r6_class_; } + + cpp11::environment r6_instance(std::shared_ptr storage_type, + const std::string& serialized_data) const; + + cpp11::environment r6_instance() const { + return r6_instance(storage_type(), Serialize()); + } + + private: + std::string extension_name_; + std::string extension_metadata_; + std::string cached_to_string_; + std::shared_ptr r6_class_; + std::thread::id creation_thread_; + + arrow::Status assert_r_thread() const { + if (std::this_thread::get_id() == creation_thread_) { + return arrow::Status::OK(); + } else { + return arrow::Status::ExecutionError("RExtensionType <", extension_name_, + "> attempted to call into R ", + "from a non-R thread"); + } + } +}; From f2b2e45095f72d8be0265d0ffeb1a999f082d4ae Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 15:03:34 -0300 Subject: [PATCH 47/66] use Converter for extension types --- r/R/arrow-tabular.R | 35 ----------------------------------- r/R/record-batch.R | 7 +------ r/R/table.R | 7 +------ r/src/array_to_vector.cpp | 35 ++++++++++++++++++++++++++++++++++- r/src/extension-impl.cpp | 7 ++++--- r/src/extension.h | 4 ++-- 6 files changed, 42 insertions(+), 53 deletions(-) diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index f31b4c3ea94..43110ccf24e 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -98,41 +98,6 @@ ArrowTabular <- R6Class("ArrowTabular", ) ) -tabular_as_data_frame_common <- function(x, base) { - x_cols <- names(x) - col_is_extension <- vapply( - x_cols, - function(col) inherits(x$schema[[col]]$type, "ExtensionType"), - logical(1) - ) - - # If no columns are ExtensionTypes, we use our standard constructor - if (!any(col_is_extension)) { - return(base(x, option_use_threads())) - } - - extension_cols <- x_cols[col_is_extension] - - if (all(col_is_extension)) { - tibble_no_extension_types <- NULL - } else { - tibble_no_extension_types <- base( - x[setdiff(x_cols, extension_cols)], - option_use_threads() - ) - } - - extension_vectors <- lapply( - extension_cols, - function(col) x[[col]]$as_vector() - ) - - names(extension_vectors) <- extension_cols - - all_vectors <- c(extension_vectors, tibble_no_extension_types)[x_cols] - tibble::new_tibble(all_vectors, nrow = nrow(x)) -} - #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { df <- x$to_data_frame() diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 1410760125c..382d86ed946 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -113,12 +113,7 @@ RecordBatch <- R6Class("RecordBatch", }, # Take, Filter, and SortIndices are methods on ArrowTabular serialize = function() ipc___SerializeRecordBatch__Raw(self), - to_data_frame = function() { - tabular_as_data_frame_common( - self, - RecordBatch__to_dataframe - ) - }, + to_data_frame = function() RecordBatch__to_dataframe(self, option_use_threads()), cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { assert_is(target_schema, "Schema") assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") diff --git a/r/R/table.R b/r/R/table.R index c4103650c1a..632f507492a 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -97,12 +97,7 @@ Table <- R6Class("Table", }, field = function(i) Table__field(self, i), serialize = function(output_stream, ...) write_table(self, output_stream, ...), - to_data_frame = function() { - tabular_as_data_frame_common( - self, - Table__to_dataframe - ) - }, + to_data_frame = function() Table__to_dataframe(self, option_use_threads()), cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { assert_is(target_schema, "Schema") assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 06d0a87a101..984de312ea7 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -29,6 +29,7 @@ #include #include +#include "./extension.h" #include "./r_task_group.h" namespace arrow { @@ -65,7 +66,14 @@ class Converter { // converter is passed as self to outlive the scope of Converter::Convert() SEXP ScheduleConvertTasks(RTasks& tasks, std::shared_ptr self) { - // try altrep first + // ExtensionType has to go through the ExensionType R6 instance + auto extension_type = + dynamic_cast(chunked_array_->type().get()); + if (extension_type != nullptr) { + return extension_type->Convert(chunked_array_); + } + + // try altrep before the Converter api: SEXP alt = altrep::MakeAltrepVector(chunked_array_); if (!Rf_isNull(alt)) { return alt; @@ -1154,6 +1162,28 @@ class Converter_Null : public Converter { } }; +class Converter_Extension : public Converter { + public: + explicit Converter_Extension(const std::shared_ptr& chunked_array) + : Converter(chunked_array) {} + + SEXP Allocate(R_xlen_t n) const { + Rf_error("Can't use Converter() API for ExtensionType directoy"); + } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return Status::NotImplemented("Converter API for ExtensionType"); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { + return Status::NotImplemented("Converter API for ExtensionType"); + } + + private: + std::shared_ptr storage_converter_; +}; + bool ArraysCanFitInteger(ArrayVector arrays) { bool all_can_fit = true; auto i32 = arrow::int32(); @@ -1316,6 +1346,9 @@ std::shared_ptr Converter::Make( case Type::NA: return std::make_shared(chunked_array); + case Type::EXTENSION: + return std::make_shared(chunked_array); + default: break; } diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index 4b49e72a6a6..209b6c87fce 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -25,7 +25,7 @@ #include #include -#include "extension.h" +#include "./extension.h" bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { // Avoid materializing the R6 instance if at all possible, since this is slow @@ -98,14 +98,15 @@ std::string RExtensionType::ToString() const { return cpp11::as_cpp(result); } -cpp11::sexp RExtensionType::Convert(const std::shared_ptr& array) { +cpp11::sexp RExtensionType::Convert(const std::shared_ptr& array) const { cpp11::environment instance = r6_instance(); cpp11::function instance_Convert(instance[".array_as_vector"]); cpp11::sexp array_sexp = cpp11::to_r6(array, "ExtensionArray"); return instance_Convert(array_sexp); } -cpp11::sexp RExtensionType::Convert(const std::shared_ptr& array) { +cpp11::sexp RExtensionType::Convert( + const std::shared_ptr& array) const { cpp11::environment instance = r6_instance(); cpp11::function instance_Convert(instance[".chunked_array_as_vector"]); cpp11::sexp array_sexp = cpp11::to_r6(array, "ChunkedArray"); diff --git a/r/src/extension.h b/r/src/extension.h index e52c76a2847..94705191718 100644 --- a/r/src/extension.h +++ b/r/src/extension.h @@ -42,9 +42,9 @@ class RExtensionType : public arrow::ExtensionType { std::string ToString() const; - cpp11::sexp Convert(const std::shared_ptr& array); + cpp11::sexp Convert(const std::shared_ptr& array) const; - cpp11::sexp Convert(const std::shared_ptr& array); + cpp11::sexp Convert(const std::shared_ptr& array) const; std::unique_ptr Clone() const; From 57115944841d9abb006ab94f2df031e80715f862 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 15:18:31 -0300 Subject: [PATCH 48/66] add license --- r/src/extension.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/r/src/extension.h b/r/src/extension.h index 94705191718..02236049412 100644 --- a/r/src/extension.h +++ b/r/src/extension.h @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// or more contributor license agreements. See the NOTICE file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. #include "./arrow_types.h" From 93a0326aa1220e65fd289456f509239e94572593 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 15:21:29 -0300 Subject: [PATCH 49/66] reomve unused extension type method --- r/src/extension-impl.cpp | 7 ------- r/src/extension.h | 2 -- 2 files changed, 9 deletions(-) diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index 209b6c87fce..1697a092048 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -98,13 +98,6 @@ std::string RExtensionType::ToString() const { return cpp11::as_cpp(result); } -cpp11::sexp RExtensionType::Convert(const std::shared_ptr& array) const { - cpp11::environment instance = r6_instance(); - cpp11::function instance_Convert(instance[".array_as_vector"]); - cpp11::sexp array_sexp = cpp11::to_r6(array, "ExtensionArray"); - return instance_Convert(array_sexp); -} - cpp11::sexp RExtensionType::Convert( const std::shared_ptr& array) const { cpp11::environment instance = r6_instance(); diff --git a/r/src/extension.h b/r/src/extension.h index 02236049412..f0aa96bc2be 100644 --- a/r/src/extension.h +++ b/r/src/extension.h @@ -58,8 +58,6 @@ class RExtensionType : public arrow::ExtensionType { std::string ToString() const; - cpp11::sexp Convert(const std::shared_ptr& array) const; - cpp11::sexp Convert(const std::shared_ptr& array) const; std::unique_ptr Clone() const; From 6ae66f6097c4c210600027d71839df223ed0469d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 15:24:38 -0300 Subject: [PATCH 50/66] .ToString -> ToString() --- r/R/extension.R | 6 +++--- r/src/extension-impl.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 1c828e29dab..db27bb31b4e 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -103,7 +103,7 @@ ExtensionArray$create <- function(x, type) { #' such a column is converted to a [data.frame()]. The default method #' converts each array using `$.array_as_vector()` and concatenates them #' using [vctrs::vec_c()]. -#' - `$.ToString()` Return a string representation that will be printed +#' - `$ToString()` Return a string representation that will be printed #' to the console when this type or an Array of this type is printed. #' #' @rdname ExtensionType @@ -186,7 +186,7 @@ ExtensionType <- R6Class("ExtensionType", vctrs::vec_c(!!! storage_vectors) }, - .ToString = function() { + ToString = function() { # metadata is probably valid UTF-8 (e.g., JSON), but might not be # and it's confusing to error when printing the object. This herustic # isn't perfect (but subclasses should override this method anyway) @@ -428,7 +428,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", private$.ptype }, - .ToString = function() { + ToString = function() { tf <- tempfile() sink(tf) on.exit({ diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index 1697a092048..38648f67530 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -93,7 +93,7 @@ std::string RExtensionType::ToString() const { } cpp11::environment instance = r6_instance(); - cpp11::function instance_ToString(instance[".ToString"]); + cpp11::function instance_ToString(instance["ToString"]); cpp11::sexp result = instance_ToString(); return cpp11::as_cpp(result); } From b03988ac83e9a415b731fa636ac2870a00118cbd Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 15:28:45 -0300 Subject: [PATCH 51/66] extension_metadata -> Serialize --- r/R/extension.R | 18 +++++++++--------- r/man/ExtensionType.Rd | 4 ++-- r/man/new_extension_type.Rd | 2 +- r/tests/testthat/test-extension.R | 10 +++++----- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index db27bb31b4e..5bc7792be75 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -77,7 +77,7 @@ ExtensionArray$create <- function(x, type) { #' - `$storage_id()`: Returns the [Type] identifier corresponding to the #' `$storage_type()`. #' - `$extension_name()`: Returns the extension name. -#' - `$extension_metadata()`: Returns the serialized version of the extension +#' - `$Serialize()`: Returns the serialized version of the extension #' metadata as a [raw()] vector. #' - `$WrapArray(array)`: Wraps a storage [Array] into an [ExtensionArray] #' with this extension type. @@ -142,13 +142,13 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__extension_name(self) }, - extension_metadata = function() { + Serialize = function() { ExtensionType__Serialize(self) }, # To make sure this conversion is done properly - extension_metadata_utf8 = function() { - metadata_utf8 <- rawToChar(self$extension_metadata()) + SerializeUTF8 = function() { + metadata_utf8 <- rawToChar(self$Serialize()) Encoding(metadata_utf8) <- "UTF-8" metadata_utf8 }, @@ -166,7 +166,7 @@ ExtensionType <- R6Class("ExtensionType", .ExtensionEquals = function(other) { inherits(other, "ExtensionType") && identical(other$extension_name(), self$extension_name()) && - identical(other$extension_metadata(), self$extension_metadata()) + identical(other$Serialize(), self$Serialize()) }, .array_as_vector = function(extension_array) { @@ -190,7 +190,7 @@ ExtensionType <- R6Class("ExtensionType", # metadata is probably valid UTF-8 (e.g., JSON), but might not be # and it's confusing to error when printing the object. This herustic # isn't perfect (but subclasses should override this method anyway) - metadata_raw <- self$extension_metadata() + metadata_raw <- self$Serialize() if (as.raw(0x00) %in% metadata_raw) { if (length(metadata_raw) > 20) { @@ -208,7 +208,7 @@ ExtensionType <- R6Class("ExtensionType", } } else { - paste0(class(self)[1], " <", self$extension_metadata_utf8(), ">") + paste0(class(self)[1], " <", self$SerializeUTF8(), ">") } } ) @@ -328,7 +328,7 @@ ExtensionType$create <- function(storage_type, #' #' # populate the custom metadata fields from the serialized metadata #' .Deserialize = function() { -#' vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) +#' vals <- as.numeric(strsplit(self$SerializeUTF8(), ";")[[1]]) #' private$.center <- vals[1] #' private$.scale <- vals[2] #' } @@ -440,7 +440,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", }, .Deserialize = function() { - private$.ptype <- unserialize(self$extension_metadata()) + private$.ptype <- unserialize(self$Serialize()) }, .ExtensionEquals = function(other) { diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index e42b8112f1d..b7cc02a3fbb 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -18,7 +18,7 @@ values. \item \verb{$storage_id()}: Returns the \link{Type} identifier corresponding to the \verb{$storage_type()}. \item \verb{$extension_name()}: Returns the extension name. -\item \verb{$extension_metadata()}: Returns the serialized version of the extension +\item \verb{$Serialize()}: Returns the serialized version of the extension metadata as a \code{\link[=raw]{raw()}} vector. \item \verb{$WrapArray(array)}: Wraps a storage \link{Array} into an \link{ExtensionArray} with this extension type. @@ -45,7 +45,7 @@ whose type matches this extension type or when a \link{Table} containing such a column is converted to a \code{\link[=data.frame]{data.frame()}}. The default method converts each array using \verb{$.array_as_vector()} and concatenates them using \code{\link[vctrs:vec_c]{vctrs::vec_c()}}. -\item \verb{$.ToString()} Return a string representation that will be printed +\item \verb{$ToString()} Return a string representation that will be printed to the console when this type or an Array of this type is printed. } } diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 267186f80b1..64ca2748d75 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -109,7 +109,7 @@ QuantizedType <- R6::R6Class( # populate the custom metadata fields from the serialized metadata .Deserialize = function() { - vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) + vals <- as.numeric(strsplit(self$SerializeUTF8(), ";")[[1]]) private$.center <- vals[1] private$.scale <- vals[2] } diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index cb8c8158291..9e6f2098bc0 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -26,7 +26,7 @@ test_that("extension types can be created", { expect_identical(type$extension_name(), "arrow_r.simple_extension") expect_true(type$storage_type() == int32()) expect_identical(type$storage_id(), int32()$id) - expect_identical(type$extension_metadata(), charToRaw("some custom metadata")) + expect_identical(type$Serialize(), charToRaw("some custom metadata")) expect_identical(type$ToString(), "ExtensionType ") storage <- Array$create(1:10) @@ -47,7 +47,7 @@ test_that("extension type subclasses work", { }, .Deserialize = function() { - private$some_custom_field <- head(self$extension_metadata(), 5) + private$some_custom_field <- head(self$Serialize(), 5) } ), private = list( @@ -83,7 +83,7 @@ test_that("extension type subclasses work", { expect_identical(type3$extension_name(), "some_extension_subclass") expect_identical(type3$some_custom_method(), type2$some_custom_method()) - expect_identical(type3$extension_metadata(), type2$extension_metadata()) + expect_identical(type3$Serialize(), type2$Serialize()) expect_true(type3$storage_type() == type2$storage_type()) array <- type3$WrapArray(Array$create(1:10)) @@ -100,7 +100,7 @@ test_that("extension types can use UTF-8 for metadata", { ) expect_identical( - type$extension_metadata_utf8(), + type$SerializeUTF8(), "\U0001f4a9\U0001f4a9\U0001f4a9\U0001f4a9" ) @@ -124,7 +124,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { field_values = NULL, .Deserialize = function() { - self$field_values <- unserialize(self$extension_metadata()) + self$field_values <- unserialize(self$Serialize()) }, .ExtensionEquals = function(other) { From 743f65739bfece1f9989357674def4e2abc47df7 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 15:32:17 -0300 Subject: [PATCH 52/66] .Deserialize -> Deserialize --- r/R/extension.R | 14 +++++++------- r/man/ExtensionType.Rd | 2 +- r/man/new_extension_type.Rd | 4 ++-- r/src/extension-impl.cpp | 2 +- r/tests/testthat/test-extension.R | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 5bc7792be75..51a35a6ef96 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -85,7 +85,7 @@ ExtensionArray$create <- function(x, type) { #' In addition, subclasses may override the following methos to customize #' the behaviour of extension classes. #' -#' - `$.Deserialize()` +#' - `$Deserialize()` #' This method is called when a new [ExtensionType] #' is initialized and is responsible for parsing and validating #' the serialized `extension_metadata` (a [raw()] vector) @@ -114,11 +114,11 @@ ExtensionType <- R6Class("ExtensionType", public = list( # In addition to the initialization that occurs for all - # ArrowObject instances, we call .Deserialize(), which can + # ArrowObject instances, we call Deserialize(), which can # be overridden to populate custom fields initialize = function(xp) { super$initialize(xp) - self$.Deserialize() + self$Deserialize() }, # Because of how C++ shared_ptr<> objects are converted to R objects, @@ -158,7 +158,7 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__MakeArray(self, array$data()) }, - .Deserialize = function() { + Deserialize = function() { # Do nothing by default but allow other classes to override this method # to populate R6 class members. }, @@ -265,7 +265,7 @@ ExtensionType$create <- function(storage_type, #' and [ExtensionArray] objects. To use an extension type you will have to: #' #' - Define an [R6::R6Class] that inherits from [ExtensionType] and reimplement -#' one or more methods (e.g., `.Deserialize()`). +#' one or more methods (e.g., `Deserialize()`). #' - Make a type constructor function (e.g., `my_extension_type()`) that calls #' [new_extension_type()] to create an R6 instance that can be used as a #' [data type][data-type] elsewhere in the package. @@ -327,7 +327,7 @@ ExtensionType$create <- function(storage_type, #' }, #' #' # populate the custom metadata fields from the serialized metadata -#' .Deserialize = function() { +#' Deserialize = function() { #' vals <- as.numeric(strsplit(self$SerializeUTF8(), ";")[[1]]) #' private$.center <- vals[1] #' private$.scale <- vals[2] @@ -439,7 +439,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", paste0(readLines(tf), collapse = "\n") }, - .Deserialize = function() { + Deserialize = function() { private$.ptype <- unserialize(self$Serialize()) }, diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index b7cc02a3fbb..fe24ad91978 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -27,7 +27,7 @@ with this extension type. In addition, subclasses may override the following methos to customize the behaviour of extension classes. \itemize{ -\item \verb{$.Deserialize()} +\item \verb{$Deserialize()} This method is called when a new \link{ExtensionType} is initialized and is responsible for parsing and validating the serialized \code{extension_metadata} (a \code{\link[=raw]{raw()}} vector) diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 64ca2748d75..3ff8eb7808a 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -66,7 +66,7 @@ These functions create, register, and unregister \link{ExtensionType} and \link{ExtensionArray} objects. To use an extension type you will have to: \itemize{ \item Define an \link[R6:R6Class]{R6::R6Class} that inherits from \link{ExtensionType} and reimplement -one or more methods (e.g., \code{.Deserialize()}). +one or more methods (e.g., \code{Deserialize()}). \item Make a type constructor function (e.g., \code{my_extension_type()}) that calls \code{\link[=new_extension_type]{new_extension_type()}} to create an R6 instance that can be used as a \link[=data-type]{data type} elsewhere in the package. @@ -108,7 +108,7 @@ QuantizedType <- R6::R6Class( }, # populate the custom metadata fields from the serialized metadata - .Deserialize = function() { + Deserialize = function() { vals <- as.numeric(strsplit(self$SerializeUTF8(), ";")[[1]]) private$.center <- vals[1] private$.scale <- vals[2] diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index 38648f67530..7f70862965f 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -72,7 +72,7 @@ arrow::Result> RExtensionType::Deserialize( cloned->extension_metadata_ = serialized_data; // We probably should create an ephemeral R6 instance here, which will call - // the R6 instance's .Deserialize() method, possibly erroring when the metadata is + // the R6 instance's Deserialize() method, possibly erroring when the metadata is // invalid or the deserialized values are invalid. When there is an error it will be // confusing, since it will only occur when the result surfaces to R // (which might be much later). Unfortunately, the Deserialize() method gets diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 9e6f2098bc0..98d1521b33c 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -46,7 +46,7 @@ test_that("extension type subclasses work", { private$some_custom_field }, - .Deserialize = function() { + Deserialize = function() { private$some_custom_field <- head(self$Serialize(), 5) } ), @@ -123,7 +123,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { public = list( field_values = NULL, - .Deserialize = function() { + Deserialize = function() { self$field_values <- unserialize(self$Serialize()) }, From 396b38ec298a34498e1517227413d5a6c948f1c7 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 15:35:17 -0300 Subject: [PATCH 53/66] .ExtensionEquals -> ExtensionEquals --- r/R/extension.R | 4 ++-- r/src/extension-impl.cpp | 2 +- r/tests/testthat/test-extension.R | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 51a35a6ef96..03ce31ce37e 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -163,7 +163,7 @@ ExtensionType <- R6Class("ExtensionType", # to populate R6 class members. }, - .ExtensionEquals = function(other) { + ExtensionEquals = function(other) { inherits(other, "ExtensionType") && identical(other$extension_name(), self$extension_name()) && identical(other$Serialize(), self$Serialize()) @@ -443,7 +443,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", private$.ptype <- unserialize(self$Serialize()) }, - .ExtensionEquals = function(other) { + ExtensionEquals = function(other) { if (!inherits(other, "VctrsExtensionType")) { return(FALSE) } diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index 7f70862965f..4876307a582 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -46,7 +46,7 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { } cpp11::environment instance = r6_instance(); - cpp11::function instance_ExtensionEquals(instance[".ExtensionEquals"]); + cpp11::function instance_ExtensionEquals(instance["ExtensionEquals"]); std::shared_ptr other_shared = ValueOrStop(other.Deserialize(other.storage_type(), other.Serialize())); diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 98d1521b33c..e6eb8285651 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -127,7 +127,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { self$field_values <- unserialize(self$Serialize()) }, - .ExtensionEquals = function(other) { + ExtensionEquals = function(other) { if (!inherits(other, "SomeExtensionTypeSubclass")) { return(FALSE) } @@ -150,7 +150,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { register_extension_type(type) - expect_true(type$.ExtensionEquals(type)) + expect_true(type$ExtensionEquals(type)) expect_true(type$Equals(type)) type2 <- new_extension_type( @@ -160,7 +160,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { type_class = SomeExtensionTypeSubclass ) - expect_true(type$.ExtensionEquals(type2)) + expect_true(type$ExtensionEquals(type2)) expect_true(type$Equals(type2)) unregister_extension_type("some_extension_subclass") From 9aa909cadb11ad67e26a838a547e1e9fb5a84cc1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 16:01:10 -0300 Subject: [PATCH 54/66] consolidate .array_as_vector() and .chunked_array_as_vector() to as_vector() --- r/R/chunked-array.R | 2 +- r/R/extension.R | 90 ++++++++++++++++++------------------- r/man/ExtensionType.Rd | 20 +++------ r/man/new_extension_type.Rd | 2 +- r/src/extension-impl.cpp | 2 +- 5 files changed, 53 insertions(+), 63 deletions(-) diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 76fa8488113..6e140b98123 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -87,7 +87,7 @@ ChunkedArray <- R6Class("ChunkedArray", chunk = function(i) Array$create(ChunkedArray__chunk(self, i)), as_vector = function() { if (inherits(self$type, "ExtensionType")) { - self$type$.chunked_array_as_vector(self) + self$type$as_vector(self) } else { ChunkedArray__as_vector(self, option_use_threads()) } diff --git a/r/R/extension.R b/r/R/extension.R index 03ce31ce37e..bb1aff70a34 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -46,7 +46,7 @@ ExtensionArray <- R6Class("ExtensionArray", }, as_vector = function() { - self$type$.array_as_vector(self) + self$type$as_vector(self) } ) ) @@ -85,24 +85,18 @@ ExtensionArray$create <- function(x, type) { #' In addition, subclasses may override the following methos to customize #' the behaviour of extension classes. #' -#' - `$Deserialize()` -#' This method is called when a new [ExtensionType] +#' - `$Deserialize()`: This method is called when a new [ExtensionType] #' is initialized and is responsible for parsing and validating -#' the serialized `extension_metadata` (a [raw()] vector) +#' the serialized extension_metadata (a [raw()] vector) #' such that its contents can be inspected by fields and/or methods #' of the R6 ExtensionType subclass. Implementations must also check the #' `storage_type` to make sure it is compatible with the extension type. -#' - `$.array_as_vector(extension_array)`: Convert an [Array] to an R +#' - `$as_vector(extension_array)`: Convert an [Array] or [ChunkedArray] to an R #' vector. This method is called by [as.vector()] on [ExtensionArray] -#' objects or when a [RecordBatch] containing an [ExtensionArray] is -#' converted to a [data.frame()]. The default method returns the converted -#' storage array. -#' - `$.chunked_array_as_vector(chunked_array)`: Convert a [ChunkedArray] -#' to an R vector. This method is called by [as.vector()] on a [ChunkedArray] -#' whose type matches this extension type or when a [Table] containing -#' such a column is converted to a [data.frame()]. The default method -#' converts each array using `$.array_as_vector()` and concatenates them -#' using [vctrs::vec_c()]. +#' objects, when a [RecordBatch] containing an [ExtensionArray] is +#' converted to a [data.frame()], or when a [ChunkedArray] (e.g., a column +#' in a [Table]) is converted to an R vector. The default method returns the +#' converted storage array. #' - `$ToString()` Return a string representation that will be printed #' to the console when this type or an Array of this type is printed. #' @@ -169,21 +163,21 @@ ExtensionType <- R6Class("ExtensionType", identical(other$Serialize(), self$Serialize()) }, - .array_as_vector = function(extension_array) { - extension_array$storage()$as_vector() - }, - - .chunked_array_as_vector = function(chunked_array) { - # Converting one array at a time so that users don't have to remember - # to implement two methods. Converting all the storage arrays to - # a ChunkedArray and then converting is probably faster - # (VctrsExtensionType does this). - storage_vectors <- lapply( - seq_len(chunked_array$num_chunks) - 1L, - function(i) chunked_array$chunk(i)$as_vector() - ) - - vctrs::vec_c(!!! storage_vectors) + as_vector = function(extension_array) { + if (inherits(extension_array, "ChunkedArray")) { + # Converting one array at a time so that users don't have to remember + # to implement two methods. Converting all the storage arrays to + # a ChunkedArray and then converting is probably faster + # (VctrsExtensionType does this). + storage_vectors <- lapply( + seq_len(extension_array$num_chunks) - 1L, + function(i) self$as_vector(extension_array$chunk(i)$storage()) + ) + + vctrs::vec_c(!!! storage_vectors) + } else if (inherits(extension_array, "ExtensionArray")) { + extension_array$storage()$as_vector() + } }, ToString = function() { @@ -318,7 +312,7 @@ ExtensionType$create <- function(storage_type, #' scale = function() private$.scale, #' #' # called when an Array of this type is converted to an R vector -#' .array_as_vector = function(extension_array) { +#' as_vector = function(extension_array) { #' unquantized_arrow <- #' (extension_array$storage()$cast(float64()) / private$.scale) + #' private$.center @@ -451,23 +445,25 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", identical(self$ptype(), other$ptype()) }, - .chunked_array_as_vector = function(chunked_array) { - # rather than convert one array at a time, use more Arrow - # machinery to convert the whole ChunkedArray at once - storage_arrays <- lapply( - seq_len(chunked_array$num_chunks) - 1L, - function(i) chunked_array$chunk(i)$storage() - ) - storage <- chunked_array(!!! storage_arrays, type = self$storage_type()) - - vctrs::vec_restore(storage$as_vector(), self$ptype()) - }, - - .array_as_vector = function(extension_array) { - vctrs::vec_restore( - super$.array_as_vector(extension_array), - self$ptype() - ) + as_vector = function(extension_array) { + if (inherits(extension_array, "ChunkedArray")) { + # rather than convert one array at a time, use more Arrow + # machinery to convert the whole ChunkedArray at once + storage_arrays <- lapply( + seq_len(extension_array$num_chunks) - 1L, + function(i) extension_array$chunk(i)$storage() + ) + storage <- chunked_array(!!! storage_arrays, type = self$storage_type()) + + vctrs::vec_restore(storage$as_vector(), self$ptype()) + } else if (inherits(extension_array, "Array")) { + vctrs::vec_restore( + super$as_vector(extension_array), + self$ptype() + ) + } else { + super$as_vector(extension_array) + } } ), private = list( diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index fe24ad91978..52e8043b924 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -27,24 +27,18 @@ with this extension type. In addition, subclasses may override the following methos to customize the behaviour of extension classes. \itemize{ -\item \verb{$Deserialize()} -This method is called when a new \link{ExtensionType} +\item \verb{$Deserialize()}: This method is called when a new \link{ExtensionType} is initialized and is responsible for parsing and validating -the serialized \code{extension_metadata} (a \code{\link[=raw]{raw()}} vector) +the serialized extension_metadata (a \code{\link[=raw]{raw()}} vector) such that its contents can be inspected by fields and/or methods of the R6 ExtensionType subclass. Implementations must also check the \code{storage_type} to make sure it is compatible with the extension type. -\item \verb{$.array_as_vector(extension_array)}: Convert an \link{Array} to an R +\item \verb{$as_vector(extension_array)}: Convert an \link{Array} or \link{ChunkedArray} to an R vector. This method is called by \code{\link[=as.vector]{as.vector()}} on \link{ExtensionArray} -objects or when a \link{RecordBatch} containing an \link{ExtensionArray} is -converted to a \code{\link[=data.frame]{data.frame()}}. The default method returns the converted -storage array. -\item \verb{$.chunked_array_as_vector(chunked_array)}: Convert a \link{ChunkedArray} -to an R vector. This method is called by \code{\link[=as.vector]{as.vector()}} on a \link{ChunkedArray} -whose type matches this extension type or when a \link{Table} containing -such a column is converted to a \code{\link[=data.frame]{data.frame()}}. The default method -converts each array using \verb{$.array_as_vector()} and concatenates them -using \code{\link[vctrs:vec_c]{vctrs::vec_c()}}. +objects, when a \link{RecordBatch} containing an \link{ExtensionArray} is +converted to a \code{\link[=data.frame]{data.frame()}}, or when a \link{ChunkedArray} (e.g., a column +in a \link{Table}) is converted to an R vector. The default method returns the +converted storage array. \item \verb{$ToString()} Return a string representation that will be printed to the console when this type or an Array of this type is printed. } diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 3ff8eb7808a..8f8229d972f 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -99,7 +99,7 @@ QuantizedType <- R6::R6Class( scale = function() private$.scale, # called when an Array of this type is converted to an R vector - .array_as_vector = function(extension_array) { + as_vector = function(extension_array) { unquantized_arrow <- (extension_array$storage()$cast(float64()) / private$.scale) + private$.center diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index 4876307a582..4a8c3464050 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -101,7 +101,7 @@ std::string RExtensionType::ToString() const { cpp11::sexp RExtensionType::Convert( const std::shared_ptr& array) const { cpp11::environment instance = r6_instance(); - cpp11::function instance_Convert(instance[".chunked_array_as_vector"]); + cpp11::function instance_Convert(instance["as_vector"]); cpp11::sexp array_sexp = cpp11::to_r6(array, "ChunkedArray"); return instance_Convert(array_sexp); } From 68245a63e2f87ccb0b3e650604f47ebe57c06759 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 16:12:47 -0300 Subject: [PATCH 55/66] better testing of array conversion --- r/R/extension.R | 24 ++++++++++++++++++------ r/tests/testthat/_snaps/extension.md | 5 +++++ r/tests/testthat/test-extension.R | 7 +++++++ 3 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 r/tests/testthat/_snaps/extension.md diff --git a/r/R/extension.R b/r/R/extension.R index bb1aff70a34..52c6aaf67dc 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -171,12 +171,20 @@ ExtensionType <- R6Class("ExtensionType", # (VctrsExtensionType does this). storage_vectors <- lapply( seq_len(extension_array$num_chunks) - 1L, - function(i) self$as_vector(extension_array$chunk(i)$storage()) + function(i) self$as_vector(extension_array$chunk(i)) ) vctrs::vec_c(!!! storage_vectors) } else if (inherits(extension_array, "ExtensionArray")) { extension_array$storage()$as_vector() + } else { + classes <- paste(class(extension_array), collapse = " / ") + abort( + c( + "`extension_array` must be a ChunkedArray or ExtensionArray", + i = glue::glue("Got object of type {classes}") + ) + ) } }, @@ -313,11 +321,15 @@ ExtensionType$create <- function(storage_type, #' #' # called when an Array of this type is converted to an R vector #' as_vector = function(extension_array) { -#' unquantized_arrow <- -#' (extension_array$storage()$cast(float64()) / private$.scale) + -#' private$.center -#' -#' as.vector(unquantized_arrow) +#' if (inherits(extension_array, "ExtensionArray")) +#' unquantized_arrow <- +#' (extension_array$storage()$cast(float64()) / private$.scale) + +#' private$.center +#' +#' as.vector(unquantized_arrow) +#' } else { +#' super$as_vector(extension_array) +#' } #' }, #' #' # populate the custom metadata fields from the serialized metadata diff --git a/r/tests/testthat/_snaps/extension.md b/r/tests/testthat/_snaps/extension.md new file mode 100644 index 00000000000..214d5eb18ed --- /dev/null +++ b/r/tests/testthat/_snaps/extension.md @@ -0,0 +1,5 @@ +# extension types can be created + + `extension_array` must be a ChunkedArray or ExtensionArray + i Got object of type character + diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index e6eb8285651..e563fd2b7a9 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -36,6 +36,13 @@ test_that("extension types can be created", { expect_true(array$type == type) expect_true(all(array$storage() == storage)) + + expect_identical(array$as_vector(), 1:10) + expect_identical(chunked_array(array)$as_vector(), 1:10) + + expect_snapshot_error( + type$as_vector("not an extension array or chunked array") + ) }) test_that("extension type subclasses work", { From ef459eed292fff69f527e8749f235dc7f634f649 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 16:18:28 -0300 Subject: [PATCH 56/66] test with vctrs extension type --- r/tests/testthat/_snaps/extension.md | 5 +++++ r/tests/testthat/test-extension.R | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/r/tests/testthat/_snaps/extension.md b/r/tests/testthat/_snaps/extension.md index 214d5eb18ed..4335958b8ae 100644 --- a/r/tests/testthat/_snaps/extension.md +++ b/r/tests/testthat/_snaps/extension.md @@ -3,3 +3,8 @@ `extension_array` must be a ChunkedArray or ExtensionArray i Got object of type character +# vctrs extension type works + + `extension_array` must be a ChunkedArray or ExtensionArray + i Got object of type character + diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index e563fd2b7a9..0b02d46f2e4 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -211,6 +211,10 @@ test_that("vctrs extension type works", { chunked_array_out$as_vector(), custom_vctr ) + + expect_snapshot_error( + type$as_vector("not an extension array or chunked array") + ) }) test_that("chunked arrays can roundtrip extension types", { From 2d7fa629d2bfe7ab2c1b8d742ba8879ac28b83e2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 16:20:29 -0300 Subject: [PATCH 57/66] undo changes to Table/RecordBatch --- r/R/record-batch.R | 4 +++- r/R/table.R | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 382d86ed946..24bd61535e7 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -113,7 +113,9 @@ RecordBatch <- R6Class("RecordBatch", }, # Take, Filter, and SortIndices are methods on ArrowTabular serialize = function() ipc___SerializeRecordBatch__Raw(self), - to_data_frame = function() RecordBatch__to_dataframe(self, option_use_threads()), + to_data_frame = function() { + RecordBatch__to_dataframe(self, use_threads = option_use_threads()) + }, cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { assert_is(target_schema, "Schema") assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") diff --git a/r/R/table.R b/r/R/table.R index 632f507492a..07750786ee2 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -97,7 +97,9 @@ Table <- R6Class("Table", }, field = function(i) Table__field(self, i), serialize = function(output_stream, ...) write_table(self, output_stream, ...), - to_data_frame = function() Table__to_dataframe(self, option_use_threads()), + to_data_frame = function() { + Table__to_dataframe(self, use_threads = option_use_threads()) + }, cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { assert_is(target_schema, "Schema") assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") From a0d658bfe05e126892d17f7a64133a8fb47449b6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 28 Mar 2022 16:23:18 -0300 Subject: [PATCH 58/66] fix typos, remove unused field in Converter_Extension --- r/src/array_to_vector.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 984de312ea7..177c654cac5 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -1168,7 +1168,7 @@ class Converter_Extension : public Converter { : Converter(chunked_array) {} SEXP Allocate(R_xlen_t n) const { - Rf_error("Can't use Converter() API for ExtensionType directoy"); + Rf_error("Can't use Converter() API for ExtensionType directly"); } Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { @@ -1179,9 +1179,6 @@ class Converter_Extension : public Converter { R_xlen_t start, R_xlen_t n, size_t chunk_index) const { return Status::NotImplemented("Converter API for ExtensionType"); } - - private: - std::shared_ptr storage_converter_; }; bool ArraysCanFitInteger(ArrayVector arrays) { From 1d71a0314a711fa728a0c873847c48499058c36e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 29 Mar 2022 09:21:07 -0300 Subject: [PATCH 59/66] move extension type logic to the extension type converter --- r/src/array_to_vector.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 177c654cac5..b89738d6c65 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -66,14 +66,7 @@ class Converter { // converter is passed as self to outlive the scope of Converter::Convert() SEXP ScheduleConvertTasks(RTasks& tasks, std::shared_ptr self) { - // ExtensionType has to go through the ExensionType R6 instance - auto extension_type = - dynamic_cast(chunked_array_->type().get()); - if (extension_type != nullptr) { - return extension_type->Convert(chunked_array_); - } - - // try altrep before the Converter api: + // try altrep first SEXP alt = altrep::MakeAltrepVector(chunked_array_); if (!Rf_isNull(alt)) { return alt; @@ -1162,22 +1155,32 @@ class Converter_Null : public Converter { } }; +// Unlike other types, conversion of ExtensionType (chunked) arrays occurs at +// R level via the ExtensionType (or subclass) R6 instance. We do this via Allocate, +// since it is called once per ChunkedArray. class Converter_Extension : public Converter { public: explicit Converter_Extension(const std::shared_ptr& chunked_array) : Converter(chunked_array) {} SEXP Allocate(R_xlen_t n) const { - Rf_error("Can't use Converter() API for ExtensionType directly"); + auto extension_type = + dynamic_cast(chunked_array_->type().get()); + if (extension_type == nullptr) { + Rf_error("Converter_Extension can't be used with a non-R extension type"); + } + + return extension_type->Convert(chunked_array_); } + // At this point we have already done the conversion Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { - return Status::NotImplemented("Converter API for ExtensionType"); + return Status::OK(); } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n, size_t chunk_index) const { - return Status::NotImplemented("Converter API for ExtensionType"); + return Status::OK(); } }; From 374a6ff1e7cdc500fde06f957bab87c8916955a4 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 29 Mar 2022 09:23:19 -0300 Subject: [PATCH 60/66] Use old ChunedArray$as_vector() method that used the converter API --- r/R/chunked-array.R | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 6e140b98123..95a05aba5b2 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -85,13 +85,7 @@ ChunkedArray <- R6Class("ChunkedArray", type_id = function() ChunkedArray__type(self)$id, nbytes = function() ChunkedArray__ReferencedBufferSize(self), chunk = function(i) Array$create(ChunkedArray__chunk(self, i)), - as_vector = function() { - if (inherits(self$type, "ExtensionType")) { - self$type$as_vector(self) - } else { - ChunkedArray__as_vector(self, option_use_threads()) - } - }, + as_vector = function() ChunkedArray__as_vector(self, option_use_threads()), Slice = function(offset, length = NULL) { if (is.null(length)) { ChunkedArray__Slice1(self, offset) From b45d18197d8a6d61cdb295de7cfcd7c2ec0161aa Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 30 Mar 2022 09:41:45 -0300 Subject: [PATCH 61/66] update comments to remind about the future SafeCallIntoR. --- r/src/extension-impl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index 4a8c3464050..ad16c35d98b 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -28,8 +28,7 @@ #include "./extension.h" bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { - // Avoid materializing the R6 instance if at all possible, since this is slow - // and in some cases not possible due to threading + // Avoid materializing the R6 instance if at all possible if (other.extension_name() != extension_name()) { return false; } @@ -40,6 +39,7 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { // With any ambiguity, we need to materialize the R6 instance and call its // ExtensionEquals method. We can't do this on the non-R thread. + // After ARROW-15841, we can use SafeCallIntoR. arrow::Status is_r_thread = assert_r_thread(); if (!assert_r_thread().ok()) { throw std::runtime_error(is_r_thread.message()); @@ -77,8 +77,7 @@ arrow::Result> RExtensionType::Deserialize( // confusing, since it will only occur when the result surfaces to R // (which might be much later). Unfortunately, the Deserialize() method gets // called from other threads frequently (e.g., when reading a multi-file Dataset), - // and we get crashes if we try this. As a compromise, we call this method when we can - // to maximize the likelihood an error is surfaced. + // and we get crashes if we try this. After ARROW-15841, we can use SafeCallIntoR. if (assert_r_thread().ok()) { cloned->r6_instance(); } @@ -88,6 +87,7 @@ arrow::Result> RExtensionType::Deserialize( std::string RExtensionType::ToString() const { // In case this gets called from another thread + // After ARROW-15841, we can use SafeCallIntoR. if (!assert_r_thread().ok()) { return ExtensionType::ToString(); } From 4130110b2d1f0e32e10a91150c824fc64de854e7 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 30 Mar 2022 09:46:55 -0300 Subject: [PATCH 62/66] Serialize -> extension_metadata --- r/R/extension.R | 22 ++++++++++++---------- r/man/ExtensionType.Rd | 4 +++- r/man/new_extension_type.Rd | 16 ++++++++++------ r/tests/testthat/test-extension.R | 10 +++++----- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 52c6aaf67dc..f51aa3d8fd7 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -77,8 +77,10 @@ ExtensionArray$create <- function(x, type) { #' - `$storage_id()`: Returns the [Type] identifier corresponding to the #' `$storage_type()`. #' - `$extension_name()`: Returns the extension name. -#' - `$Serialize()`: Returns the serialized version of the extension +#' - `$extension_metadata()`: Returns the serialized version of the extension #' metadata as a [raw()] vector. +#' - `$extension_metadata_utf8()`: Returns the serialized version of the +#' extension metadata as a UTF-8 encoded string. #' - `$WrapArray(array)`: Wraps a storage [Array] into an [ExtensionArray] #' with this extension type. #' @@ -136,13 +138,13 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__extension_name(self) }, - Serialize = function() { + extension_metadata = function() { ExtensionType__Serialize(self) }, # To make sure this conversion is done properly - SerializeUTF8 = function() { - metadata_utf8 <- rawToChar(self$Serialize()) + extension_metadata_utf8 = function() { + metadata_utf8 <- rawToChar(self$extension_metadata()) Encoding(metadata_utf8) <- "UTF-8" metadata_utf8 }, @@ -160,7 +162,7 @@ ExtensionType <- R6Class("ExtensionType", ExtensionEquals = function(other) { inherits(other, "ExtensionType") && identical(other$extension_name(), self$extension_name()) && - identical(other$Serialize(), self$Serialize()) + identical(other$extension_metadata(), self$extension_metadata()) }, as_vector = function(extension_array) { @@ -192,7 +194,7 @@ ExtensionType <- R6Class("ExtensionType", # metadata is probably valid UTF-8 (e.g., JSON), but might not be # and it's confusing to error when printing the object. This herustic # isn't perfect (but subclasses should override this method anyway) - metadata_raw <- self$Serialize() + metadata_raw <- self$extension_metadata() if (as.raw(0x00) %in% metadata_raw) { if (length(metadata_raw) > 20) { @@ -210,7 +212,7 @@ ExtensionType <- R6Class("ExtensionType", } } else { - paste0(class(self)[1], " <", self$SerializeUTF8(), ">") + paste0(class(self)[1], " <", self$extension_metadata_utf8(), ">") } } ) @@ -321,7 +323,7 @@ ExtensionType$create <- function(storage_type, #' #' # called when an Array of this type is converted to an R vector #' as_vector = function(extension_array) { -#' if (inherits(extension_array, "ExtensionArray")) +#' if (inherits(extension_array, "ExtensionArray")) { #' unquantized_arrow <- #' (extension_array$storage()$cast(float64()) / private$.scale) + #' private$.center @@ -334,7 +336,7 @@ ExtensionType$create <- function(storage_type, #' #' # populate the custom metadata fields from the serialized metadata #' Deserialize = function() { -#' vals <- as.numeric(strsplit(self$SerializeUTF8(), ";")[[1]]) +#' vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) #' private$.center <- vals[1] #' private$.scale <- vals[2] #' } @@ -446,7 +448,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", }, Deserialize = function() { - private$.ptype <- unserialize(self$Serialize()) + private$.ptype <- unserialize(self$extension_metadata()) }, ExtensionEquals = function(other) { diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index 52e8043b924..fd51edb3308 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -18,8 +18,10 @@ values. \item \verb{$storage_id()}: Returns the \link{Type} identifier corresponding to the \verb{$storage_type()}. \item \verb{$extension_name()}: Returns the extension name. -\item \verb{$Serialize()}: Returns the serialized version of the extension +\item \verb{$extension_metadata()}: Returns the serialized version of the extension metadata as a \code{\link[=raw]{raw()}} vector. +\item \verb{$extension_metadata_utf8()}: Returns the serialized version of the +extension metadata as a UTF-8 encoded string. \item \verb{$WrapArray(array)}: Wraps a storage \link{Array} into an \link{ExtensionArray} with this extension type. } diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 8f8229d972f..56b8dd3aec9 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -100,16 +100,20 @@ QuantizedType <- R6::R6Class( # called when an Array of this type is converted to an R vector as_vector = function(extension_array) { - unquantized_arrow <- - (extension_array$storage()$cast(float64()) / private$.scale) + - private$.center - - as.vector(unquantized_arrow) + if (inherits(extension_array, "ExtensionArray")) { + unquantized_arrow <- + (extension_array$storage()$cast(float64()) / private$.scale) + + private$.center + + as.vector(unquantized_arrow) + } else { + super$as_vector(extension_array) + } }, # populate the custom metadata fields from the serialized metadata Deserialize = function() { - vals <- as.numeric(strsplit(self$SerializeUTF8(), ";")[[1]]) + vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) private$.center <- vals[1] private$.scale <- vals[2] } diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 0b02d46f2e4..221abb79557 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -26,7 +26,7 @@ test_that("extension types can be created", { expect_identical(type$extension_name(), "arrow_r.simple_extension") expect_true(type$storage_type() == int32()) expect_identical(type$storage_id(), int32()$id) - expect_identical(type$Serialize(), charToRaw("some custom metadata")) + expect_identical(type$extension_metadata(), charToRaw("some custom metadata")) expect_identical(type$ToString(), "ExtensionType ") storage <- Array$create(1:10) @@ -54,7 +54,7 @@ test_that("extension type subclasses work", { }, Deserialize = function() { - private$some_custom_field <- head(self$Serialize(), 5) + private$some_custom_field <- head(self$extension_metadata(), 5) } ), private = list( @@ -90,7 +90,7 @@ test_that("extension type subclasses work", { expect_identical(type3$extension_name(), "some_extension_subclass") expect_identical(type3$some_custom_method(), type2$some_custom_method()) - expect_identical(type3$Serialize(), type2$Serialize()) + expect_identical(type3$extension_metadata(), type2$extension_metadata()) expect_true(type3$storage_type() == type2$storage_type()) array <- type3$WrapArray(Array$create(1:10)) @@ -107,7 +107,7 @@ test_that("extension types can use UTF-8 for metadata", { ) expect_identical( - type$SerializeUTF8(), + type$extension_metadata_utf8(), "\U0001f4a9\U0001f4a9\U0001f4a9\U0001f4a9" ) @@ -131,7 +131,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { field_values = NULL, Deserialize = function() { - self$field_values <- unserialize(self$Serialize()) + self$field_values <- unserialize(self$extension_metadata()) }, ExtensionEquals = function(other) { From c105112bcad747cf52ba26ae8cdc3a3c5d9d17e0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 30 Mar 2022 10:48:19 -0300 Subject: [PATCH 63/66] Deserialize -> populate_instance --- r/R/extension.R | 16 ++++++++-------- r/man/ExtensionType.Rd | 2 +- r/man/new_extension_type.Rd | 4 ++-- r/tests/testthat/test-extension.R | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index f51aa3d8fd7..9ef4c79ec33 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -87,7 +87,7 @@ ExtensionArray$create <- function(x, type) { #' In addition, subclasses may override the following methos to customize #' the behaviour of extension classes. #' -#' - `$Deserialize()`: This method is called when a new [ExtensionType] +#' - `$populate_instance()`: This method is called when a new [ExtensionType] #' is initialized and is responsible for parsing and validating #' the serialized extension_metadata (a [raw()] vector) #' such that its contents can be inspected by fields and/or methods @@ -110,11 +110,11 @@ ExtensionType <- R6Class("ExtensionType", public = list( # In addition to the initialization that occurs for all - # ArrowObject instances, we call Deserialize(), which can + # ArrowObject instances, we call populate_instance(), which can # be overridden to populate custom fields initialize = function(xp) { super$initialize(xp) - self$Deserialize() + self$populate_instance() }, # Because of how C++ shared_ptr<> objects are converted to R objects, @@ -154,7 +154,7 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__MakeArray(self, array$data()) }, - Deserialize = function() { + populate_instance = function() { # Do nothing by default but allow other classes to override this method # to populate R6 class members. }, @@ -223,7 +223,7 @@ ExtensionType <- R6Class("ExtensionType", # that object has type_id() EXTENSION_TYPE. Rather than add complexity # to the wrapper code, we modify ExtensionType$new() to do what we need # it to do here (which is to return an instance of a custom R6 -# type whose .Deserialize method is called to populate custom fields). +# type whose .populate_instance method is called to populate custom fields). ExtensionType$.default_new <- ExtensionType$new ExtensionType$new <- function(xp) { super <- ExtensionType$.default_new(xp) @@ -269,7 +269,7 @@ ExtensionType$create <- function(storage_type, #' and [ExtensionArray] objects. To use an extension type you will have to: #' #' - Define an [R6::R6Class] that inherits from [ExtensionType] and reimplement -#' one or more methods (e.g., `Deserialize()`). +#' one or more methods (e.g., `populate_instance()`). #' - Make a type constructor function (e.g., `my_extension_type()`) that calls #' [new_extension_type()] to create an R6 instance that can be used as a #' [data type][data-type] elsewhere in the package. @@ -335,7 +335,7 @@ ExtensionType$create <- function(storage_type, #' }, #' #' # populate the custom metadata fields from the serialized metadata -#' Deserialize = function() { +#' populate_instance = function() { #' vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) #' private$.center <- vals[1] #' private$.scale <- vals[2] @@ -447,7 +447,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", paste0(readLines(tf), collapse = "\n") }, - Deserialize = function() { + populate_instance = function() { private$.ptype <- unserialize(self$extension_metadata()) }, diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index fd51edb3308..662dcb8e4c0 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -29,7 +29,7 @@ with this extension type. In addition, subclasses may override the following methos to customize the behaviour of extension classes. \itemize{ -\item \verb{$Deserialize()}: This method is called when a new \link{ExtensionType} +\item \verb{$populate_instance()}: This method is called when a new \link{ExtensionType} is initialized and is responsible for parsing and validating the serialized extension_metadata (a \code{\link[=raw]{raw()}} vector) such that its contents can be inspected by fields and/or methods diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 56b8dd3aec9..93e12fb2221 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -66,7 +66,7 @@ These functions create, register, and unregister \link{ExtensionType} and \link{ExtensionArray} objects. To use an extension type you will have to: \itemize{ \item Define an \link[R6:R6Class]{R6::R6Class} that inherits from \link{ExtensionType} and reimplement -one or more methods (e.g., \code{Deserialize()}). +one or more methods (e.g., \code{populate_instance()}). \item Make a type constructor function (e.g., \code{my_extension_type()}) that calls \code{\link[=new_extension_type]{new_extension_type()}} to create an R6 instance that can be used as a \link[=data-type]{data type} elsewhere in the package. @@ -112,7 +112,7 @@ QuantizedType <- R6::R6Class( }, # populate the custom metadata fields from the serialized metadata - Deserialize = function() { + populate_instance = function() { vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) private$.center <- vals[1] private$.scale <- vals[2] diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 221abb79557..279a81b4933 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -53,7 +53,7 @@ test_that("extension type subclasses work", { private$some_custom_field }, - Deserialize = function() { + populate_instance = function() { private$some_custom_field <- head(self$extension_metadata(), 5) } ), @@ -130,7 +130,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { public = list( field_values = NULL, - Deserialize = function() { + populate_instance = function() { self$field_values <- unserialize(self$extension_metadata()) }, From 2e5aa01348e306ffc112e064a7ba34b4d541a239 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 30 Mar 2022 13:56:18 -0300 Subject: [PATCH 64/66] populate_instance() -> deserialize_instance() --- r/R/extension.R | 16 ++++++++-------- r/man/ExtensionType.Rd | 2 +- r/man/new_extension_type.Rd | 4 ++-- r/tests/testthat/test-extension.R | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/r/R/extension.R b/r/R/extension.R index 9ef4c79ec33..111a0e86203 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -87,7 +87,7 @@ ExtensionArray$create <- function(x, type) { #' In addition, subclasses may override the following methos to customize #' the behaviour of extension classes. #' -#' - `$populate_instance()`: This method is called when a new [ExtensionType] +#' - `$deserialize_instance()`: This method is called when a new [ExtensionType] #' is initialized and is responsible for parsing and validating #' the serialized extension_metadata (a [raw()] vector) #' such that its contents can be inspected by fields and/or methods @@ -110,11 +110,11 @@ ExtensionType <- R6Class("ExtensionType", public = list( # In addition to the initialization that occurs for all - # ArrowObject instances, we call populate_instance(), which can + # ArrowObject instances, we call deserialize_instance(), which can # be overridden to populate custom fields initialize = function(xp) { super$initialize(xp) - self$populate_instance() + self$deserialize_instance() }, # Because of how C++ shared_ptr<> objects are converted to R objects, @@ -154,7 +154,7 @@ ExtensionType <- R6Class("ExtensionType", ExtensionType__MakeArray(self, array$data()) }, - populate_instance = function() { + deserialize_instance = function() { # Do nothing by default but allow other classes to override this method # to populate R6 class members. }, @@ -223,7 +223,7 @@ ExtensionType <- R6Class("ExtensionType", # that object has type_id() EXTENSION_TYPE. Rather than add complexity # to the wrapper code, we modify ExtensionType$new() to do what we need # it to do here (which is to return an instance of a custom R6 -# type whose .populate_instance method is called to populate custom fields). +# type whose .deserialize_instance method is called to populate custom fields). ExtensionType$.default_new <- ExtensionType$new ExtensionType$new <- function(xp) { super <- ExtensionType$.default_new(xp) @@ -269,7 +269,7 @@ ExtensionType$create <- function(storage_type, #' and [ExtensionArray] objects. To use an extension type you will have to: #' #' - Define an [R6::R6Class] that inherits from [ExtensionType] and reimplement -#' one or more methods (e.g., `populate_instance()`). +#' one or more methods (e.g., `deserialize_instance()`). #' - Make a type constructor function (e.g., `my_extension_type()`) that calls #' [new_extension_type()] to create an R6 instance that can be used as a #' [data type][data-type] elsewhere in the package. @@ -335,7 +335,7 @@ ExtensionType$create <- function(storage_type, #' }, #' #' # populate the custom metadata fields from the serialized metadata -#' populate_instance = function() { +#' deserialize_instance = function() { #' vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) #' private$.center <- vals[1] #' private$.scale <- vals[2] @@ -447,7 +447,7 @@ VctrsExtensionType <- R6Class("VctrsExtensionType", paste0(readLines(tf), collapse = "\n") }, - populate_instance = function() { + deserialize_instance = function() { private$.ptype <- unserialize(self$extension_metadata()) }, diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index 662dcb8e4c0..6b05f3490d2 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -29,7 +29,7 @@ with this extension type. In addition, subclasses may override the following methos to customize the behaviour of extension classes. \itemize{ -\item \verb{$populate_instance()}: This method is called when a new \link{ExtensionType} +\item \verb{$deserialize_instance()}: This method is called when a new \link{ExtensionType} is initialized and is responsible for parsing and validating the serialized extension_metadata (a \code{\link[=raw]{raw()}} vector) such that its contents can be inspected by fields and/or methods diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 93e12fb2221..96d5c10c935 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -66,7 +66,7 @@ These functions create, register, and unregister \link{ExtensionType} and \link{ExtensionArray} objects. To use an extension type you will have to: \itemize{ \item Define an \link[R6:R6Class]{R6::R6Class} that inherits from \link{ExtensionType} and reimplement -one or more methods (e.g., \code{populate_instance()}). +one or more methods (e.g., \code{deserialize_instance()}). \item Make a type constructor function (e.g., \code{my_extension_type()}) that calls \code{\link[=new_extension_type]{new_extension_type()}} to create an R6 instance that can be used as a \link[=data-type]{data type} elsewhere in the package. @@ -112,7 +112,7 @@ QuantizedType <- R6::R6Class( }, # populate the custom metadata fields from the serialized metadata - populate_instance = function() { + deserialize_instance = function() { vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]]) private$.center <- vals[1] private$.scale <- vals[2] diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 279a81b4933..cf82b2f1f26 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -53,7 +53,7 @@ test_that("extension type subclasses work", { private$some_custom_field }, - populate_instance = function() { + deserialize_instance = function() { private$some_custom_field <- head(self$extension_metadata(), 5) } ), @@ -130,7 +130,7 @@ test_that("extension subclasses can override the ExtensionEquals method", { public = list( field_values = NULL, - populate_instance = function() { + deserialize_instance = function() { self$field_values <- unserialize(self$extension_metadata()) }, From 46386cbb656c540c1e97d1fbd665002931c77e61 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 Apr 2022 15:58:17 -0300 Subject: [PATCH 65/66] use SafeCallIntoR() --- r/src/extension-impl.cpp | 71 +++++++++++++++++++++------------------- r/src/extension.h | 19 ++--------- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index ad16c35d98b..b0cbcca7d67 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -26,6 +26,7 @@ #include #include "./extension.h" +#include "./safe-call-into-r.h" bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { // Avoid materializing the R6 instance if at all possible @@ -40,20 +41,23 @@ bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const { // With any ambiguity, we need to materialize the R6 instance and call its // ExtensionEquals method. We can't do this on the non-R thread. // After ARROW-15841, we can use SafeCallIntoR. - arrow::Status is_r_thread = assert_r_thread(); - if (!assert_r_thread().ok()) { - throw std::runtime_error(is_r_thread.message()); - } + arrow::Result result = SafeCallIntoR([&]() { + cpp11::environment instance = r6_instance(); + cpp11::function instance_ExtensionEquals(instance["ExtensionEquals"]); - cpp11::environment instance = r6_instance(); - cpp11::function instance_ExtensionEquals(instance["ExtensionEquals"]); + std::shared_ptr other_shared = + ValueOrStop(other.Deserialize(other.storage_type(), other.Serialize())); + cpp11::sexp other_r6 = cpp11::to_r6(other_shared, "ExtensionType"); - std::shared_ptr other_shared = - ValueOrStop(other.Deserialize(other.storage_type(), other.Serialize())); - cpp11::sexp other_r6 = cpp11::to_r6(other_shared, "ExtensionType"); + cpp11::logicals result(instance_ExtensionEquals(other_r6)); + return cpp11::as_cpp(result); + }); - cpp11::logicals result(instance_ExtensionEquals(other_r6)); - return cpp11::as_cpp(result); + if (!result.ok()) { + throw std::runtime_error(result.status().message()); + } + + return result.ValueUnsafe(); } std::shared_ptr RExtensionType::MakeArray( @@ -71,31 +75,33 @@ arrow::Result> RExtensionType::Deserialize( cloned->storage_type_ = storage_type; cloned->extension_metadata_ = serialized_data; - // We probably should create an ephemeral R6 instance here, which will call - // the R6 instance's Deserialize() method, possibly erroring when the metadata is - // invalid or the deserialized values are invalid. When there is an error it will be - // confusing, since it will only occur when the result surfaces to R - // (which might be much later). Unfortunately, the Deserialize() method gets - // called from other threads frequently (e.g., when reading a multi-file Dataset), - // and we get crashes if we try this. After ARROW-15841, we can use SafeCallIntoR. - if (assert_r_thread().ok()) { - cloned->r6_instance(); - } + // We create an ephemeral R6 instance here, which will call the R6 instance's + // deserialize_instance() method, possibly erroring when the metadata is + // invalid or the deserialized values are invalid. + arrow::Result result = SafeCallIntoR([&]() { + r6_instance(); + return true; + }); + ARROW_RETURN_NOT_OK(result); return std::shared_ptr(cloned.release()); } std::string RExtensionType::ToString() const { - // In case this gets called from another thread - // After ARROW-15841, we can use SafeCallIntoR. - if (!assert_r_thread().ok()) { + arrow::Result result = SafeCallIntoR([&]() { + cpp11::environment instance = r6_instance(); + cpp11::function instance_ToString(instance["ToString"]); + cpp11::sexp result = instance_ToString(); + return cpp11::as_cpp(result); + }); + + // In the event of an error (e.g., we are not on the main thread + // and we are not inside RunWithCapturedR()), just call the default method + if (!result.ok()) { return ExtensionType::ToString(); + } else { + return result.ValueUnsafe(); } - - cpp11::environment instance = r6_instance(); - cpp11::function instance_ToString(instance["ToString"]); - cpp11::sexp result = instance_ToString(); - return cpp11::as_cpp(result); } cpp11::sexp RExtensionType::Convert( @@ -107,8 +113,8 @@ cpp11::sexp RExtensionType::Convert( } std::unique_ptr RExtensionType::Clone() const { - RExtensionType* ptr = new RExtensionType( - storage_type(), extension_name_, extension_metadata_, r6_class_, creation_thread_); + RExtensionType* ptr = + new RExtensionType(storage_type(), extension_name_, extension_metadata_, r6_class_); return std::unique_ptr(ptr); } @@ -133,8 +139,7 @@ cpp11::environment ExtensionType__initialize( cpp11::raws extension_metadata, cpp11::environment r6_class) { std::string metadata_string(extension_metadata.begin(), extension_metadata.end()); auto r6_class_shared = std::make_shared(r6_class); - RExtensionType cpp_type(storage_type, extension_name, metadata_string, r6_class_shared, - std::this_thread::get_id()); + RExtensionType cpp_type(storage_type, extension_name, metadata_string, r6_class_shared); return cpp_type.r6_instance(); } diff --git a/r/src/extension.h b/r/src/extension.h index f0aa96bc2be..fbd3ad48469 100644 --- a/r/src/extension.h +++ b/r/src/extension.h @@ -17,8 +17,6 @@ #include "./arrow_types.h" -#include - #include #include #include @@ -36,13 +34,11 @@ class RExtensionType : public arrow::ExtensionType { public: RExtensionType(const std::shared_ptr storage_type, std::string extension_name, std::string extension_metadata, - std::shared_ptr r6_class, - std::thread::id creation_thread) + std::shared_ptr r6_class) : arrow::ExtensionType(storage_type), extension_name_(extension_name), extension_metadata_(extension_metadata), - r6_class_(r6_class), - creation_thread_(creation_thread) {} + r6_class_(r6_class) {} std::string extension_name() const { return extension_name_; } @@ -76,15 +72,4 @@ class RExtensionType : public arrow::ExtensionType { std::string extension_metadata_; std::string cached_to_string_; std::shared_ptr r6_class_; - std::thread::id creation_thread_; - - arrow::Status assert_r_thread() const { - if (std::this_thread::get_id() == creation_thread_) { - return arrow::Status::OK(); - } else { - return arrow::Status::ExecutionError("RExtensionType <", extension_name_, - "> attempted to call into R ", - "from a non-R thread"); - } - } }; From 5c847cb123e933e6dac252f744a490d15623f688 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 Apr 2022 16:18:56 -0300 Subject: [PATCH 66/66] punt on SafeCallIntoR for Deserialize() --- r/src/extension-impl.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index b0cbcca7d67..57c4874c973 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -75,15 +75,16 @@ arrow::Result> RExtensionType::Deserialize( cloned->storage_type_ = storage_type; cloned->extension_metadata_ = serialized_data; - // We create an ephemeral R6 instance here, which will call the R6 instance's + // We could create an ephemeral R6 instance here, which will call the R6 instance's // deserialize_instance() method, possibly erroring when the metadata is - // invalid or the deserialized values are invalid. - arrow::Result result = SafeCallIntoR([&]() { + // invalid or the deserialized values are invalid. The complexity of setting up + // an event loop from wherever this *might* be called is high and hard to + // predict. As a compromise, just create the instance when it is safe to + // do so. + if (GetMainRThread().IsMainThread()) { r6_instance(); - return true; - }); + } - ARROW_RETURN_NOT_OK(result); return std::shared_ptr(cloned.release()); }