diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 577773c42bd..9257f5787b1 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1,5 +1,13 @@ # Generated by using data-raw/codegen.R -> do not edit by hand +is_altrep_int_nonull <- function(x){ + .Call(`_arrow_is_altrep_int_nonull`, x) +} + +is_altrep_dbl_nonull <- function(x){ + .Call(`_arrow_is_altrep_dbl_nonull`, x) +} + Array__Slice1 <- function(array, offset){ .Call(`_arrow_Array__Slice1`, array, offset) } diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index 9b25cb1842c..1a49ffc80fa 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -214,6 +214,11 @@ glue::glue('\n 'extern "C" void R_init_arrow(DllInfo* dll){ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); + + #if defined(HAS_ALTREP) + arrow::r::Init_Altrep_classes(dll); + #endif + } \n') diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp new file mode 100644 index 00000000000..33e30aa3ffb --- /dev/null +++ b/r/src/altrep.cpp @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "./arrow_types.h" + +#if defined(HAS_ALTREP) + +#include +#include + +namespace arrow { +namespace r { + +template +struct ArrayNoNull { + using data_type = typename std::conditional::type; + static void DeleteArray(std::shared_ptr* ptr) { delete ptr; } + using Pointer = cpp11::external_pointer, DeleteArray>; + + // altrep object around an Array with no nulls + // data1: an external pointer to a shared pointer to the Array + // data2: not used + + static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr& array) { + // we don't need the whole r6 object, just an external pointer + // that retain the array + Pointer xp(new std::shared_ptr(array)); + + SEXP res = R_new_altrep(class_t, xp, R_NilValue); + MARK_NOT_MUTABLE(res); + + return res; + } + + static Rboolean Inspect(SEXP x, int pre, int deep, int pvec, + void (*inspect_subtree)(SEXP, int, int, int)) { + const auto& array = Get(x); + Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n", + array->type()->ToString().c_str(), array->length(), array.get()); + inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec); + return TRUE; + } + + static const std::shared_ptr& Get(SEXP vec) { + return *Pointer(R_altrep_data1(vec)); + } + + static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); } + + static const void* Dataptr_or_null(SEXP vec) { + return Get(vec)->data()->template GetValues(1); + } + + static SEXP Duplicate(SEXP vec, Rboolean) { + const auto& array = Get(vec); + auto size = array->length(); + + SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length())); + + memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type)); + + UNPROTECT(1); + return copy; + } + + static void* Dataptr(SEXP vec, Rboolean writeable) { + return const_cast(Dataptr_or_null(vec)); + } + + // by definition, there are no NA + static int No_NA(SEXP vec) { return 1; } + + static void Init(R_altrep_class_t class_t, DllInfo* dll) { + // altrep + R_set_altrep_Length_method(class_t, ArrayNoNull::Length); + R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect); + R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate); + + // altvec + R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr); + R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null); + } +}; + +struct DoubleArrayNoNull { + static R_altrep_class_t class_t; + + static void Init(DllInfo* dll) { + class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll); + ArrayNoNull::Init(class_t, dll); + R_set_altreal_No_NA_method(class_t, ArrayNoNull::No_NA); + } + + static SEXP Make(const std::shared_ptr& array) { + return ArrayNoNull::Make(class_t, array); + } +}; + +struct Int32ArrayNoNull { + static R_altrep_class_t class_t; + + static void Init(DllInfo* dll) { + class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll); + ArrayNoNull::Init(class_t, dll); + R_set_altinteger_No_NA_method(class_t, ArrayNoNull::No_NA); + } + + static SEXP Make(const std::shared_ptr& array) { + return ArrayNoNull::Make(class_t, array); + } +}; + +R_altrep_class_t Int32ArrayNoNull::class_t; +R_altrep_class_t DoubleArrayNoNull::class_t; + +void Init_Altrep_classes(DllInfo* dll) { + DoubleArrayNoNull::Init(dll); + Int32ArrayNoNull::Init(dll); +} + +SEXP MakeDoubleArrayNoNull(const std::shared_ptr& array) { + return DoubleArrayNoNull::Make(array); +} + +SEXP MakeInt32ArrayNoNull(const std::shared_ptr& array) { + return Int32ArrayNoNull::Make(array); +} + +} // namespace r +} // namespace arrow + +#endif + +// [[arrow::export]] +bool is_altrep_int_nonull(SEXP x) { +#if defined(HAS_ALTREP) + return R_altrep_inherits(x, arrow::r::Int32ArrayNoNull::class_t); +#else + return false; +#endif +} + +// [[arrow::export]] +bool is_altrep_dbl_nonull(SEXP x) { +#if defined(HAS_ALTREP) + return R_altrep_inherits(x, arrow::r::DoubleArrayNoNull::class_t); +#else + return false; +#endif +} diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index d5fae295181..a8f7191bf18 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -28,6 +28,7 @@ #include #include +#include #include namespace arrow { @@ -143,6 +144,24 @@ Status IngestSome(const std::shared_ptr& array, R_xlen_t n, // Allocate + Ingest SEXP ArrayVector__as_vector(R_xlen_t n, const std::shared_ptr& type, const ArrayVector& arrays) { +#if defined(HAS_ALTREP) + // special case when there is only one array + if (arrays.size() == 1) { + const auto& array = arrays[0]; + if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0 && + array->null_count() == 0) { + switch (type->id()) { + case arrow::Type::DOUBLE: + return arrow::r::MakeDoubleArrayNoNull(array); + case arrow::Type::INT32: + return arrow::r::MakeInt32ArrayNoNull(array); + default: + break; + } + } + } +#endif + auto converter = Converter::Make(type, arrays); SEXP data = PROTECT(converter->Allocate(n)); StopIfNotOk(converter->IngestSerial(data)); @@ -1280,6 +1299,10 @@ SEXP Array__as_vector(const std::shared_ptr& array) { // [[arrow::export]] SEXP ChunkedArray__as_vector(const std::shared_ptr& chunked_array) { + if (chunked_array->num_chunks() == 1) { + return Array__as_vector(chunked_array->chunk(0)); + } + return arrow::r::ArrayVector__as_vector(chunked_array->length(), chunked_array->type(), chunked_array->chunks()); } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 024e5c58b0e..427844a3c8e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4,6 +4,36 @@ #include "./arrow_types.h" +// altrep.cpp +#if defined(ARROW_R_WITH_ARROW) +bool is_altrep_int_nonull(SEXP x); +extern "C" SEXP _arrow_is_altrep_int_nonull(SEXP x_sexp){ +BEGIN_CPP11 + arrow::r::Input::type x(x_sexp); + return cpp11::as_sexp(is_altrep_int_nonull(x)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_is_altrep_int_nonull(SEXP x_sexp){ + Rf_error("Cannot call is_altrep_int_nonull(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// altrep.cpp +#if defined(ARROW_R_WITH_ARROW) +bool is_altrep_dbl_nonull(SEXP x); +extern "C" SEXP _arrow_is_altrep_dbl_nonull(SEXP x_sexp){ +BEGIN_CPP11 + arrow::r::Input::type x(x_sexp); + return cpp11::as_sexp(is_altrep_dbl_nonull(x)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_is_altrep_dbl_nonull(SEXP x_sexp){ + Rf_error("Cannot call is_altrep_dbl_nonull(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // array.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr Array__Slice1(const std::shared_ptr& array, R_xlen_t offset); @@ -6893,6 +6923,8 @@ static const R_CallMethodDef CallEntries[] = { { "_dataset_available", (DL_FUNC)& _dataset_available, 0 }, { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, + { "_arrow_is_altrep_int_nonull", (DL_FUNC) &_arrow_is_altrep_int_nonull, 1}, + { "_arrow_is_altrep_dbl_nonull", (DL_FUNC) &_arrow_is_altrep_dbl_nonull, 1}, { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, @@ -7334,6 +7366,11 @@ static const R_CallMethodDef CallEntries[] = { extern "C" void R_init_arrow(DllInfo* dll){ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); + + #if defined(HAS_ALTREP) + arrow::r::Init_Altrep_classes(dll); + #endif + } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 09511e32e87..68e1c8659c4 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -165,6 +165,12 @@ arrow::Status InferSchemaFromDots(SEXP lst, SEXP schema_sxp, int num_fields, arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, std::shared_ptr& schema); +#if defined(HAS_ALTREP) +void Init_Altrep_classes(DllInfo* dll); +SEXP MakeInt32ArrayNoNull(const std::shared_ptr& array); +SEXP MakeDoubleArrayNoNull(const std::shared_ptr& array); +#endif + } // namespace r } // namespace arrow diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R new file mode 100644 index 00000000000..ec1c671b12e --- /dev/null +++ b/r/tests/testthat/test-altrep.R @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("altrep") + +skip_if(getRversion() <= "3.5.0") + +test_that("altrep vectors from int32 and dbl arrays with no nulls", { + withr::local_options(list(arrow.use_altrep = TRUE)) + v_int <- Array$create(1:1000) + v_dbl <- Array$create(as.numeric(1:1000)) + c_int <- ChunkedArray$create(1:1000) + c_dbl <- ChunkedArray$create(as.numeric(1:1000)) + + expect_true(is_altrep_int_nonull(as.vector(v_int))) + expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) + expect_true(is_altrep_dbl_nonull(as.vector(v_dbl))) + expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + + expect_equal(c_int$num_chunks, 1L) + expect_true(is_altrep_int_nonull(as.vector(c_int))) + expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(1)))) + + expect_equal(c_dbl$num_chunks, 1L) + expect_true(is_altrep_dbl_nonull(as.vector(c_dbl))) + expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1)))) + + withr::local_options(list(arrow.use_altrep = NULL)) + expect_true(is_altrep_int_nonull(as.vector(v_int))) + expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) + expect_true(is_altrep_dbl_nonull(as.vector(v_dbl))) + expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + + withr::local_options(list(arrow.use_altrep = FALSE)) + expect_false(is_altrep_int_nonull(as.vector(v_int))) + expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) + expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) + expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) +}) + +test_that("altrep vectors from int32 and dbl arrays with nulls", { + withr::local_options(list(arrow.use_altrep = TRUE)) + v_int <- Array$create(c(1L, NA, 3L)) + v_dbl <- Array$create(c(1, NA, 3)) + c_int <- ChunkedArray$create(c(1L, NA, 3L)) + c_dbl <- ChunkedArray$create(c(1, NA, 3)) + + # cannot be altrep because one NA + expect_false(is_altrep_int_nonull(as.vector(v_int))) + expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) + expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) + expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + expect_false(is_altrep_int_nonull(as.vector(c_int))) + expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1)))) + expect_false(is_altrep_dbl_nonull(as.vector(c_dbl))) + expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1)))) + + # but then, no NA beyond, so can be altrep again + expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2)))) + expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2)))) + expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2)))) + expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2)))) + + # chunked array with 2 chunks cannot be altrep + c_int <- ChunkedArray$create(0L, c(1L, NA, 3L)) + c_dbl <- ChunkedArray$create(0, c(1, NA, 3)) + expect_equal(c_int$num_chunks, 2L) + expect_equal(c_dbl$num_chunks, 2L) + expect_false(is_altrep_int_nonull(as.vector(c_int))) + expect_false(is_altrep_dbl_nonull(as.vector(c_dbl))) + expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3)))) + expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3)))) +}) + +test_that("empty vectors are not altrep", { + withr::local_options(list(arrow.use_altrep = TRUE)) + v_int <- Array$create(integer()) + v_dbl <- Array$create(numeric()) + + expect_false(is_altrep_int_nonull(as.vector(v_int))) + expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) +})