diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 2237f818ee0..72a5e455858 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1,11 +1,7 @@ # Generated by using data-raw/codegen.R -> do not edit by hand -is_altrep_int_nonull <- function(x) { - .Call(`_arrow_is_altrep_int_nonull`, x) -} - -is_altrep_dbl_nonull <- function(x) { - .Call(`_arrow_is_altrep_dbl_nonull`, x) +is_altrep <- function(x) { + .Call(`_arrow_is_altrep`, x) } Array__Slice1 <- function(array, offset) { diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index bb0e92eb640..7bdd8486d39 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -216,7 +216,7 @@ glue::glue('\n R_useDynamicSymbols(dll, FALSE); #if defined(ARROW_R_WITH_ARROW) && defined(HAS_ALTREP) - arrow::r::Init_Altrep_classes(dll); + arrow::r::altrep::Init_Altrep_classes(dll); #endif } diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index b07cbe70ed3..ec68ade1ba9 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -19,9 +19,16 @@ #if defined(ARROW_R_WITH_ARROW) +#include +#include +#include + #include #if defined(HAS_ALTREP) +// defined in array_to_vector.cpp +SEXP Array__as_vector(const std::shared_ptr& array); + #if R_VERSION < R_Version(3, 6, 0) // workaround because R's not so conveniently uses `class` @@ -43,144 +50,484 @@ extern "C" { #include #endif -#include +#include "./r_task_group.h" namespace arrow { namespace r { +namespace altrep { + +template +R_xlen_t Standard_Get_region(SEXP data2, R_xlen_t i, R_xlen_t n, c_type* buf); + +template <> +R_xlen_t Standard_Get_region(SEXP data2, R_xlen_t i, R_xlen_t n, double* buf) { + return REAL_GET_REGION(data2, i, n, buf); +} + +template <> +R_xlen_t Standard_Get_region(SEXP data2, R_xlen_t i, R_xlen_t n, int* buf) { + return INTEGER_GET_REGION(data2, i, n, buf); +} + +// altrep R vector shadowing an Array. +// +// This tries as much as possible to directly use the data +// from the Array and minimize data copies. +// +// Both slots of the altrep object (data1 and data2) are used: +// +// data1: always used, stores an R external pointer to a +// shared pointer of the Array +// data2: starts as NULL, and becomes a standard R vector with the same +// data if necessary (if materialization is needed) template -struct ArrayNoNull { - using data_type = typename std::conditional::type; +struct AltrepArrayPrimitive { static void DeleteArray(std::shared_ptr* ptr) { delete ptr; } using Pointer = cpp11::external_pointer, DeleteArray>; - // altrep object around an Array with no nulls - // data1: an external pointer to a shared pointer to the Array - // data2: not used + using c_type = typename std::conditional::type; - static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr& array) { - // we don't need the whole r6 object, just an external pointer - // that retain the array - Pointer xp(new std::shared_ptr(array)); - - SEXP res = R_new_altrep(class_t, xp, R_NilValue); - MARK_NOT_MUTABLE(res); + // singleton altrep class description + static R_altrep_class_t class_t; - return res; + // the altrep R object + SEXP alt_; + + // This constructor is used to create the altrep object from + // an Array. Used by MakeAltrepArrayPrimitive() which is used + // in array_to_vector.cpp + explicit AltrepArrayPrimitive(const std::shared_ptr& array) + : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr(array)), + R_NilValue)) { + // force duplicate on modify + MARK_NOT_MUTABLE(alt_); } - static Rboolean Inspect(SEXP x, int pre, int deep, int pvec, - void (*inspect_subtree)(SEXP, int, int, int)) { - const auto& array = Get(x); - Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n", - array->type()->ToString().c_str(), array->length(), array.get()); - inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec); - return TRUE; + // This constructor is used when R calls altrep methods. + // + // For example in the Length() method below: + // + // template + // R_xlen_t Length(SEXP alt) { + // return AltrepClass(alt).Length(); + // } + explicit AltrepArrayPrimitive(SEXP alt) : alt_(alt) {} + + // the arrow::Array that is being wrapped by the altrep object + // this is only valid before data2 has been materialized + const std::shared_ptr& array() const { return *Pointer(R_altrep_data1(alt_)); } + + R_xlen_t Length() { return array()->length(); } + + // Does the data2 slot of the altrep object contain a + // standard R vector with the same data as the array + bool IsMaterialized() const { return !Rf_isNull(R_altrep_data2(alt_)); } + + // Force materialization. After calling this, the data2 slot of the altrep + // object contains a standard R vector with the same data, with + // R sentinels where the Array has nulls. + void Materialize() { + if (!IsMaterialized()) { + auto size = array()->length(); + + // create a standard R vector + SEXP copy = PROTECT(Rf_allocVector(sexp_type, size)); + + // copy the data from the array, through Get_region + Get_region(0, size, reinterpret_cast(DATAPTR(copy))); + + // store as data2, this is now considered materialized + R_set_altrep_data2(alt_, copy); + MARK_NOT_MUTABLE(copy); + + UNPROTECT(1); + } } - static const std::shared_ptr& Get(SEXP vec) { - return *Pointer(R_altrep_data1(vec)); + // Duplication is done by first materializing the vector and + // then make a lazy duplicate of data2 + SEXP Duplicate(Rboolean /* deep */) { + Materialize(); + return Rf_lazy_duplicate(R_altrep_data2(alt_)); } - static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); } + // What gets printed on .Internal(inspect()) + Rboolean Inspect(int pre, int deep, int pvec, + void (*inspect_subtree)(SEXP, int, int, int)) { + const auto& array_ = array(); + Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n", + array_->type()->ToString().c_str(), array_->null_count(), + IsMaterialized() ? "materialized" : "not materialized", array_->length(), + array_.get()); + inspect_subtree(R_altrep_data1(alt_), pre, deep + 1, pvec); + if (IsMaterialized()) { + inspect_subtree(R_altrep_data2(alt_), pre, deep + 1, pvec); + } + + return TRUE; + } - static const void* Dataptr_or_null(SEXP vec) { - return Get(vec)->data()->template GetValues(1); + // R calls this to get a pointer to the start of the vector data + // but only if this is possible without allocating (in the R sense). + // + // For this implementation we can return the data in these cases + // - data2 has been created, and so the R sentinels are in place where the array has + // nulls + // - the Array has no nulls, we can directly return the start of its data + // + // Otherwise: if the array has nulls and data2 has not been generated: give up + const void* Dataptr_or_null() { + if (IsMaterialized()) { + return DATAPTR_RO(R_altrep_data2(alt_)); + } + + const auto& array_ = array(); + if (array_->null_count() == 0) { + return reinterpret_cast(array_->data()->template GetValues(1)); + } + + return NULL; } - static SEXP Duplicate(SEXP vec, Rboolean) { - const auto& array = Get(vec); - auto size = array->length(); + // R calls this to get a pointer to the start of the data, R allocations are allowed. + // + // If the object hasn't been materialized, and the array has no + // nulls we can directly point to the array data. + // + // Otherwise, the object is materialized DATAPTR(data2) is returned. + void* Dataptr(Rboolean writeable) { + if (!IsMaterialized()) { + const auto& array_ = array(); + + if (array_->null_count() == 0) { + return reinterpret_cast( + const_cast(array_->data()->template GetValues(1))); + } + } + + // Otherwise we have to materialize and hand the pointer to data2 + // + // NOTE: this returns the DATAPTR() of data2 even in the case writeable = TRUE + // + // which is risky because C(++) clients of this object might + // modify data2, and therefore make it diverge from the data of the Array, + // but the object was marked as immutable on creation, so doing this is + // disregarding the R api. + // + // Simply stop() when `writeable = TRUE` is too strong, e.g. this fails + // identical() which calls DATAPTR() even though DATAPTR_RO() would + // be enough + Materialize(); + return DATAPTR(R_altrep_data2(alt_)); + } - SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length())); + // Does the Array have no nulls ? + int No_NA() const { return array()->null_count() != 0; } - memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type)); + int Is_sorted() const { return UNKNOWN_SORTEDNESS; } - UNPROTECT(1); - return copy; + // The value at position i + c_type Elt(R_xlen_t i) { + const auto& array_ = array(); + return array_->IsNull(i) ? cpp11::na() + : array_->data()->template GetValues(1)[i]; } - static void* Dataptr(SEXP vec, Rboolean writeable) { - return const_cast(Dataptr_or_null(vec)); + // R calls this when it wants data from position `i` to `i + n` copied into `buf` + // The returned value is the number of values that were really copied + // (this can be lower than n) + R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, c_type* buf) { + // If we have data2, we can just copy the region into buf + // using the standard Get_region for this R type + if (IsMaterialized()) { + return Standard_Get_region(R_altrep_data2(alt_), i, n, buf); + } + + // The vector was not materialized, aka we don't have data2 + // + // In that case, we copy the data from the Array, and then + // do a second pass to force the R sentinels for where the + // array has nulls + // + // This only materialize the region, into buf. Not the entire vector. + auto slice = array()->Slice(i, n); + R_xlen_t ncopy = slice->length(); + + // first copy the data buffer + memcpy(buf, slice->data()->template GetValues(1), ncopy * sizeof(c_type)); + + // then set the R NA sentinels if needed + if (slice->null_count() > 0) { + internal::BitmapReader bitmap_reader(slice->null_bitmap()->data(), slice->offset(), + ncopy); + + for (R_xlen_t j = 0; j < ncopy; j++, bitmap_reader.Next()) { + if (bitmap_reader.IsNotSet()) { + buf[j] = cpp11::na(); + } + } + } + + return ncopy; } - // by definition, there are no NA - static int No_NA(SEXP vec) { return 1; } + // This cannot keep the external pointer to an Arrow object through + // R serialization, so return the materialized + SEXP Serialized_state() { + Materialize(); + return R_altrep_data2(alt_); + } - static void Init(R_altrep_class_t class_t, DllInfo* dll) { - // altrep - R_set_altrep_Length_method(class_t, ArrayNoNull::Length); - R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect); - R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate); + static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; } - // altvec - R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr); - R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null); + SEXP Coerce(int type) { + // Just let R handle it for now + return NULL; } }; +template +R_altrep_class_t AltrepArrayPrimitive::class_t; -struct DoubleArrayNoNull { - static R_altrep_class_t class_t; +// The methods below are how R interacts with the altrep objects. +// +// They all use the same pattern: create a C++ object of the +// class parameter, and then call the method. +template +R_xlen_t Length(SEXP alt) { + return AltrepClass(alt).Length(); +} - static void Init(DllInfo* dll) { - class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll); - ArrayNoNull::Init(class_t, dll); - R_set_altreal_No_NA_method(class_t, ArrayNoNull::No_NA); - } +template +Rboolean Inspect(SEXP alt, int pre, int deep, int pvec, + void (*inspect_subtree)(SEXP, int, int, int)) { + return AltrepClass(alt).Inspect(pre, deep, pvec, inspect_subtree); +} - static SEXP Make(const std::shared_ptr& array) { - return ArrayNoNull::Make(class_t, array); - } -}; +template +const void* Dataptr_or_null(SEXP alt) { + return AltrepClass(alt).Dataptr_or_null(); +} -struct Int32ArrayNoNull { - static R_altrep_class_t class_t; +template +void* Dataptr(SEXP alt, Rboolean writeable) { + return AltrepClass(alt).Dataptr(writeable); +} + +template +SEXP Duplicate(SEXP alt, Rboolean deep) { + return AltrepClass(alt).Duplicate(deep); +} + +template +auto Elt(SEXP alt, R_xlen_t i) -> decltype(AltrepClass(alt).Elt(i)) { + return AltrepClass(alt).Elt(i); +} + +template +int No_NA(SEXP alt) { + return AltrepClass(alt).No_NA(); +} + +template +int Is_sorted(SEXP alt) { + return AltrepClass(alt).Is_sorted(); +} + +template +R_xlen_t Get_region(SEXP alt, R_xlen_t i, R_xlen_t n, typename AltrepClass::c_type* buf) { + return AltrepClass(alt).Get_region(i, n, buf); +} + +template +SEXP Serialized_state(SEXP alt) { + return AltrepClass(alt).Serialized_state(); +} + +template +SEXP Unserialize(SEXP class_, SEXP state) { + return AltrepClass::Unserialize(class_, state); +} - static void Init(DllInfo* dll) { - class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll); - ArrayNoNull::Init(class_t, dll); - R_set_altinteger_No_NA_method(class_t, ArrayNoNull::No_NA); +template +SEXP Coerce(SEXP alt, int type) { + return AltrepClass(alt).Coerce(type); +} + +static std::shared_ptr NaRmOptions( + const std::shared_ptr& array, bool na_rm) { + auto options = std::make_shared( + arrow::compute::ScalarAggregateOptions::Defaults()); + options->min_count = 0; + options->skip_nulls = na_rm; + return options; +} + +template +SEXP MinMax(SEXP alt, Rboolean narm) { + using data_type = typename std::conditional::type; + using scalar_type = + typename std::conditional::type; + + AltrepArrayPrimitive alt_(alt); + + const auto& array = alt_.array(); + bool na_rm = narm == TRUE; + auto n = array->length(); + auto null_count = array->null_count(); + if ((na_rm || n == 0) && null_count == n) { + return Rf_ScalarReal(Min ? R_PosInf : R_NegInf); } + if (!na_rm && null_count > 0) { + return cpp11::as_sexp(cpp11::na()); + } + + auto options = NaRmOptions(array, na_rm); + + const auto& minmax = + ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get())); + const auto& minmax_scalar = + internal::checked_cast(*minmax.scalar()); + + const auto& result_scalar = internal::checked_cast( + *ValueOrStop(minmax_scalar.field(Min ? "min" : "max"))); + return cpp11::as_sexp(result_scalar.value); +} + +template +SEXP Min(SEXP alt, Rboolean narm) { + return MinMax(alt, narm); +} + +template +SEXP Max(SEXP alt, Rboolean narm) { + return MinMax(alt, narm); +} + +template +static SEXP Sum(SEXP alt, Rboolean narm) { + using data_type = typename std::conditional::type; + + AltrepArrayPrimitive alt_(alt); + + const auto& array = alt_.array(); + bool na_rm = narm == TRUE; + auto null_count = array->null_count(); - static SEXP Make(const std::shared_ptr& array) { - return ArrayNoNull::Make(class_t, array); + if (!na_rm && null_count > 0) { + return cpp11::as_sexp(cpp11::na()); } -}; + auto options = NaRmOptions(array, na_rm); + + const auto& sum = + ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get())); + + if (sexp_type == INTSXP) { + // When calling the "sum" function on an int32 array, we get an Int64 scalar + // in case of overflow, make it a double like R + int64_t value = internal::checked_cast(*sum.scalar()).value; + if (value <= INT32_MIN || value > INT32_MAX) { + return Rf_ScalarReal(static_cast(value)); + } else { + return Rf_ScalarInteger(static_cast(value)); + } + } else { + return Rf_ScalarReal( + internal::checked_cast(*sum.scalar()).value); + } +} -R_altrep_class_t Int32ArrayNoNull::class_t; -R_altrep_class_t DoubleArrayNoNull::class_t; +// initialize altrep, altvec, altreal, and altinteger methods +template +void InitAltrepMethods(R_altrep_class_t class_t, DllInfo* dll) { + R_set_altrep_Length_method(class_t, Length); + R_set_altrep_Inspect_method(class_t, Inspect); + R_set_altrep_Duplicate_method(class_t, Duplicate); + R_set_altrep_Serialized_state_method(class_t, Serialized_state); + R_set_altrep_Unserialize_method(class_t, Unserialize); + R_set_altrep_Coerce_method(class_t, Coerce); +} -void Init_Altrep_classes(DllInfo* dll) { - DoubleArrayNoNull::Init(dll); - Int32ArrayNoNull::Init(dll); +template +void InitAltvecMethods(R_altrep_class_t class_t, DllInfo* dll) { + R_set_altvec_Dataptr_method(class_t, Dataptr); + R_set_altvec_Dataptr_or_null_method(class_t, Dataptr_or_null); +} + +template +void InitAltRealMethods(R_altrep_class_t class_t, DllInfo* dll) { + R_set_altreal_No_NA_method(class_t, No_NA); + R_set_altreal_Is_sorted_method(class_t, Is_sorted); + + R_set_altreal_Sum_method(class_t, Sum); + R_set_altreal_Min_method(class_t, Min); + R_set_altreal_Max_method(class_t, Max); + + R_set_altreal_Elt_method(class_t, Elt); + R_set_altreal_Get_region_method(class_t, Get_region); } -SEXP MakeDoubleArrayNoNull(const std::shared_ptr& array) { - return DoubleArrayNoNull::Make(array); +template +void InitAltIntegerMethods(R_altrep_class_t class_t, DllInfo* dll) { + R_set_altinteger_No_NA_method(class_t, No_NA); + R_set_altinteger_Is_sorted_method(class_t, Is_sorted); + + R_set_altinteger_Sum_method(class_t, Sum); + R_set_altinteger_Min_method(class_t, Min); + R_set_altinteger_Max_method(class_t, Max); + + R_set_altinteger_Elt_method(class_t, Elt); + R_set_altinteger_Get_region_method(class_t, Get_region); } -SEXP MakeInt32ArrayNoNull(const std::shared_ptr& array) { - return Int32ArrayNoNull::Make(array); +template +void InitAltRealClass(DllInfo* dll, const char* name) { + AltrepClass::class_t = R_make_altreal_class(name, "arrow", dll); + InitAltrepMethods(AltrepClass::class_t, dll); + InitAltvecMethods(AltrepClass::class_t, dll); + InitAltRealMethods(AltrepClass::class_t, dll); } -} // namespace r -} // namespace arrow +template +void InitAltIntegerClass(DllInfo* dll, const char* name) { + AltrepClass::class_t = R_make_altinteger_class(name, "arrow", dll); + InitAltrepMethods(AltrepClass::class_t, dll); + InitAltvecMethods(AltrepClass::class_t, dll); + InitAltIntegerMethods(AltrepClass::class_t, dll); +} -#endif +// initialize the altrep classes +void Init_Altrep_classes(DllInfo* dll) { + InitAltRealClass>(dll, "array_dbl_vector"); + InitAltIntegerClass>(dll, "array_int_vector"); +} -// [[arrow::export]] -bool is_altrep_int_nonull(SEXP x) { -#if defined(HAS_ALTREP) - return R_altrep_inherits(x, arrow::r::Int32ArrayNoNull::class_t); -#else - return false; -#endif +// return an altrep R vector that shadows the array if possible +SEXP MakeAltrepArrayPrimitive(const std::shared_ptr& array) { + switch (array->type()->id()) { + case arrow::Type::DOUBLE: + return altrep::AltrepArrayPrimitive(array).alt_; + + case arrow::Type::INT32: + return altrep::AltrepArrayPrimitive(array).alt_; + + default: + break; + } + + return R_NilValue; } +} // namespace altrep +} // namespace r +} // namespace arrow + +#endif // HAS_ALTREP + // [[arrow::export]] -bool is_altrep_dbl_nonull(SEXP x) { +bool is_altrep(SEXP x) { #if defined(HAS_ALTREP) - return R_altrep_inherits(x, arrow::r::DoubleArrayNoNull::class_t); + return ALTREP(x); #else return false; #endif diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index d5a5425966f..edbd13963d5 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -69,15 +69,13 @@ class Converter { // special case when there is only one array if (chunked_array_->num_chunks() == 1) { const auto& array = chunked_array_->chunk(0); - if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0 && - array->null_count() == 0) { - switch (array->type()->id()) { - case arrow::Type::DOUBLE: - return arrow::r::MakeDoubleArrayNoNull(array); - case arrow::Type::INT32: - return arrow::r::MakeInt32ArrayNoNull(array); - default: - break; + // using altrep if + // - the arrow.use_altrep is set to TRUE or unset (implicit TRUE) + // - the array has at least one element + if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0) { + SEXP alt = altrep::MakeAltrepArrayPrimitive(array); + if (!Rf_isNull(alt)) { + return alt; } } } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 5ef39215c73..d99abf2605d 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -6,31 +6,16 @@ // altrep.cpp #if defined(ARROW_R_WITH_ARROW) -bool is_altrep_int_nonull(SEXP x); -extern "C" SEXP _arrow_is_altrep_int_nonull(SEXP x_sexp){ +bool is_altrep(SEXP x); +extern "C" SEXP _arrow_is_altrep(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input::type x(x_sexp); - return cpp11::as_sexp(is_altrep_int_nonull(x)); + return cpp11::as_sexp(is_altrep(x)); END_CPP11 } #else -extern "C" SEXP _arrow_is_altrep_int_nonull(SEXP x_sexp){ - Rf_error("Cannot call is_altrep_int_nonull(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); -} -#endif - -// altrep.cpp -#if defined(ARROW_R_WITH_ARROW) -bool is_altrep_dbl_nonull(SEXP x); -extern "C" SEXP _arrow_is_altrep_dbl_nonull(SEXP x_sexp){ -BEGIN_CPP11 - arrow::r::Input::type x(x_sexp); - return cpp11::as_sexp(is_altrep_dbl_nonull(x)); -END_CPP11 -} -#else -extern "C" SEXP _arrow_is_altrep_dbl_nonull(SEXP x_sexp){ - Rf_error("Cannot call is_altrep_dbl_nonull(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +extern "C" SEXP _arrow_is_altrep(SEXP x_sexp){ + Rf_error("Cannot call is_altrep(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -7040,8 +7025,7 @@ static const R_CallMethodDef CallEntries[] = { { "_dataset_available", (DL_FUNC)& _dataset_available, 0 }, { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, - { "_arrow_is_altrep_int_nonull", (DL_FUNC) &_arrow_is_altrep_int_nonull, 1}, - { "_arrow_is_altrep_dbl_nonull", (DL_FUNC) &_arrow_is_altrep_dbl_nonull, 1}, + { "_arrow_is_altrep", (DL_FUNC) &_arrow_is_altrep, 1}, { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, @@ -7492,7 +7476,7 @@ extern "C" void R_init_arrow(DllInfo* dll){ R_useDynamicSymbols(dll, FALSE); #if defined(ARROW_R_WITH_ARROW) && defined(HAS_ALTREP) - arrow::r::Init_Altrep_classes(dll); + arrow::r::altrep::Init_Altrep_classes(dll); #endif } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 4ecb99174b5..9419d956877 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -101,6 +101,7 @@ auto ValueOrStop(R&& result) -> decltype(std::forward(result).ValueOrDie()) { } namespace r { +class RTasks; std::shared_ptr InferArrowType(SEXP x); std::shared_ptr vec_to_arrow__reuse_memory(SEXP x); @@ -174,9 +175,13 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, std::shared_ptr& schema); #if defined(HAS_ALTREP) + +namespace altrep { + void Init_Altrep_classes(DllInfo* dll); -SEXP MakeInt32ArrayNoNull(const std::shared_ptr& array); -SEXP MakeDoubleArrayNoNull(const std::shared_ptr& array); +SEXP MakeAltrepArrayPrimitive(const std::shared_ptr& array); + +} // namespace altrep #endif } // namespace r diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R index 42784b61442..8cb989b1d4c 100644 --- a/r/tests/testthat/test-altrep.R +++ b/r/tests/testthat/test-altrep.R @@ -26,30 +26,30 @@ test_that("altrep vectors from int32 and dbl arrays with no nulls", { c_int <- ChunkedArray$create(1:1000) c_dbl <- ChunkedArray$create(as.numeric(1:1000)) - expect_true(is_altrep_int_nonull(as.vector(v_int))) - expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + expect_true(is_altrep(as.vector(v_int))) + expect_true(is_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_altrep(as.vector(v_dbl))) + expect_true(is_altrep(as.vector(v_dbl$Slice(1)))) expect_equal(c_int$num_chunks, 1L) - expect_true(is_altrep_int_nonull(as.vector(c_int))) - expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(1)))) + expect_true(is_altrep(as.vector(c_int))) + expect_true(is_altrep(as.vector(c_int$Slice(1)))) expect_equal(c_dbl$num_chunks, 1L) - expect_true(is_altrep_dbl_nonull(as.vector(c_dbl))) - expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1)))) + expect_true(is_altrep(as.vector(c_dbl))) + expect_true(is_altrep(as.vector(c_dbl$Slice(1)))) withr::local_options(list(arrow.use_altrep = NULL)) - expect_true(is_altrep_int_nonull(as.vector(v_int))) - expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + expect_true(is_altrep(as.vector(v_int))) + expect_true(is_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_altrep(as.vector(v_dbl))) + expect_true(is_altrep(as.vector(v_dbl$Slice(1)))) withr::local_options(list(arrow.use_altrep = FALSE)) - expect_false(is_altrep_int_nonull(as.vector(v_int))) - expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + expect_false(is_altrep(as.vector(v_int))) + expect_false(is_altrep(as.vector(v_int$Slice(1)))) + expect_false(is_altrep(as.vector(v_dbl))) + expect_false(is_altrep(as.vector(v_dbl$Slice(1)))) }) test_that("altrep vectors from int32 and dbl arrays with nulls", { @@ -59,31 +59,30 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", { c_int <- ChunkedArray$create(c(1L, NA, 3L)) c_dbl <- ChunkedArray$create(c(1, NA, 3)) - # cannot be altrep because one NA - expect_false(is_altrep_int_nonull(as.vector(v_int))) - expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) - expect_false(is_altrep_int_nonull(as.vector(c_int))) - expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1)))) - expect_false(is_altrep_dbl_nonull(as.vector(c_dbl))) - expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1)))) - - # but then, no NA beyond, so can be altrep again - expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2)))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2)))) - expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2)))) - expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2)))) + expect_true(is_altrep(as.vector(v_int))) + expect_true(is_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_altrep(as.vector(v_dbl))) + expect_true(is_altrep(as.vector(v_dbl$Slice(1)))) + expect_true(is_altrep(as.vector(c_int))) + expect_true(is_altrep(as.vector(c_int$Slice(1)))) + expect_true(is_altrep(as.vector(c_dbl))) + expect_true(is_altrep(as.vector(c_dbl$Slice(1)))) + + expect_true(is_altrep(as.vector(v_int$Slice(2)))) + expect_true(is_altrep(as.vector(v_dbl$Slice(2)))) + expect_true(is_altrep(as.vector(c_int$Slice(2)))) + expect_true(is_altrep(as.vector(c_dbl$Slice(2)))) # chunked array with 2 chunks cannot be altrep c_int <- ChunkedArray$create(0L, c(1L, NA, 3L)) c_dbl <- ChunkedArray$create(0, c(1, NA, 3)) expect_equal(c_int$num_chunks, 2L) expect_equal(c_dbl$num_chunks, 2L) - expect_false(is_altrep_int_nonull(as.vector(c_int))) - expect_false(is_altrep_dbl_nonull(as.vector(c_dbl))) - expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3)))) - expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3)))) + + expect_false(is_altrep(as.vector(c_int))) + expect_false(is_altrep(as.vector(c_dbl))) + expect_true(is_altrep(as.vector(c_int$Slice(3)))) + expect_true(is_altrep(as.vector(c_dbl$Slice(3)))) }) test_that("empty vectors are not altrep", { @@ -91,8 +90,8 @@ test_that("empty vectors are not altrep", { v_int <- Array$create(integer()) v_dbl <- Array$create(numeric()) - expect_false(is_altrep_int_nonull(as.vector(v_int))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) + expect_false(is_altrep(as.vector(v_int))) + expect_false(is_altrep(as.vector(v_dbl))) }) test_that("as.data.frame(, ) can create altrep vectors", { @@ -100,11 +99,109 @@ test_that("as.data.frame(
, ) can create altrep vectors", { table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3)) df_table <- as.data.frame(table) - expect_true(is_altrep_int_nonull(df_table$int)) - expect_true(is_altrep_dbl_nonull(df_table$dbl)) + expect_true(is_altrep(df_table$int)) + expect_true(is_altrep(df_table$dbl)) batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3)) df_batch <- as.data.frame(batch) - expect_true(is_altrep_int_nonull(df_batch$int)) - expect_true(is_altrep_dbl_nonull(df_batch$dbl)) + expect_true(is_altrep(df_batch$int)) + expect_true(is_altrep(df_batch$dbl)) +}) + +expect_altrep_rountrip <- function(x, fn, ...) { + alt <- Array$create(x)$as_vector() + + expect_true(is_altrep(alt)) + expect_identical(fn(x, ...), fn(alt, ...)) + expect_true(is_altrep(alt)) +} + +test_that("altrep min/max/sum identical to R versions for double", { + x <- c(1, 2, 3) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- c(1, 2, NA_real_) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- rep(NA_real_, 3) + expect_warning( + expect_altrep_rountrip(x, min, na.rm = TRUE), + "no non-missing arguments to min" + ) + expect_warning( + expect_altrep_rountrip(x, max, na.rm = TRUE), + "no non-missing arguments to max" + ) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) +}) + +test_that("altrep min/max/sum identical to R versions for int", { + x <- c(1L, 2L, 3L) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- c(1L, 2L, NA_integer_) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- rep(NA_integer_, 3) + expect_warning( + expect_altrep_rountrip(x, min, na.rm = TRUE), + "no non-missing arguments to min" + ) + expect_warning( + expect_altrep_rountrip(x, max, na.rm = TRUE), + "no non-missing arguments to max" + ) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + # sum(x) is INT_MIN -> convert to double. + x <- as.integer(c(-2^31 + 1L, -1L)) + expect_altrep_rountrip(x, sum) +}) + +test_that("altrep vectors handle serialization", { + ints <- c(1L, 2L, NA_integer_) + dbls <- c(1, 2, NA_real_) + + expect_identical(ints, unserialize(serialize(Array$create(ints)$as_vector(), NULL))) + expect_identical(dbls, unserialize(serialize(Array$create(dbls)$as_vector(), NULL))) +}) + +test_that("altrep vectors handle coercion", { + ints <- c(1L, 2L, NA_integer_) + dbls <- c(1, 2, NA_real_) + + expect_identical(ints, as.integer(Array$create(dbls)$as_vector())) + expect_identical(dbls, as.numeric(Array$create(ints)$as_vector())) })