diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 90a459e19cb..a4adf58997c 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -115,7 +115,7 @@ const std::shared_ptr& GetChunkedArray(SEXP alt) { // materialization is needed. // data2: starts as NULL, and becomes a standard R vector with the same // data if necessary: if materialization is needed, e.g. if we need -// to access its data pointer, with DATAPTR(). +// to access its data pointer, with INTEGER(), REAL(), etc. template struct AltrepVectorBase { // store the Array as an external pointer in data1, mark as immutable @@ -220,7 +220,14 @@ struct AltrepVectorPrimitive : public AltrepVectorBase(DATAPTR(copy))); + if constexpr (std::is_same_v) { + Get_region(alt, 0, size, REAL(copy)); + } else if constexpr (std::is_same_v) { + Get_region(alt, 0, size, INTEGER(copy)); + } else { + static_assert(std::is_same_v || std::is_same_v, + "ALTREP not implemented for this c_type"); + } // store as data2, this is now considered materialized SetRepresentation(alt, copy); @@ -269,13 +276,27 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { + return REAL(Materialize(alt)); + } else if constexpr (std::is_same_v) { + return INTEGER(Materialize(alt)); + } else { + static_assert(std::is_same_v || std::is_same_v, + "ALTREP not implemented for this c_type"); + } } // The value at position i static c_type Elt(SEXP alt, R_xlen_t i) { if (IsMaterialized(alt)) { - return reinterpret_cast(DATAPTR(Representation(alt)))[i]; + if constexpr (std::is_same_v) { + return REAL(Representation(alt))[i]; + } else if constexpr (std::is_same_v) { + return INTEGER(Representation(alt))[i]; + } else { + static_assert(std::is_same_v || std::is_same_v, + "ALTREP not implemented for this c_type"); + } } auto altrep_data = @@ -531,7 +552,7 @@ struct AltrepFactor : public AltrepVectorBase { SEXP copy = PROTECT(Rf_allocVector(INTSXP, size)); // copy the data from the array, through Get_region - Get_region(alt, 0, size, reinterpret_cast(DATAPTR(copy))); + Get_region(alt, 0, size, INTEGER(copy)); // store as data2, this is now considered materialized SetRepresentation(alt, copy); @@ -552,7 +573,7 @@ struct AltrepFactor : public AltrepVectorBase { return nullptr; } - static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); } + static void* Dataptr(SEXP alt, Rboolean writeable) { return INTEGER(Materialize(alt)); } static SEXP Duplicate(SEXP alt, Rboolean /* deep */) { // the representation integer vector @@ -892,7 +913,9 @@ struct AltrepVectorString : public AltrepVectorBase> { return s; } - static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); } + static void* Dataptr(SEXP alt, Rboolean writeable) { + return const_cast(DATAPTR_RO(Materialize(alt))); + } static SEXP Materialize(SEXP alt) { if (Base::IsMaterialized(alt)) { @@ -931,7 +954,9 @@ struct AltrepVectorString : public AltrepVectorBase> { } static const void* Dataptr_or_null(SEXP alt) { - if (Base::IsMaterialized(alt)) return DATAPTR(Representation(alt)); + if (Base::IsMaterialized(alt)) { + return DATAPTR_RO(alt); + } // otherwise give up return nullptr; @@ -1267,21 +1292,14 @@ sexp test_arrow_altrep_copy_by_dataptr(sexp x) { if (TYPEOF(x) == INTSXP) { cpp11::writable::integers out(Rf_xlength(x)); - int* ptr = reinterpret_cast(DATAPTR(x)); + int* ptr = INTEGER(x); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } return out; } else if (TYPEOF(x) == REALSXP) { cpp11::writable::doubles out(Rf_xlength(x)); - double* ptr = reinterpret_cast(DATAPTR(x)); - for (R_xlen_t i = 0; i < n; i++) { - out[i] = ptr[i]; - } - return out; - } else if (TYPEOF(x) == STRSXP) { - cpp11::writable::strings out(Rf_xlength(x)); - SEXP* ptr = reinterpret_cast(DATAPTR(x)); + double* ptr = REAL(x); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 1595a3c8fb5..d1882e56daf 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -173,7 +173,7 @@ template class RBuffer : public MutableBuffer { public: explicit RBuffer(RVector vec) - : MutableBuffer(reinterpret_cast(DATAPTR(vec)), + : MutableBuffer(reinterpret_cast(getDataPointer(vec)), vec.size() * sizeof(typename RVector::value_type), arrow::CPUDevice::memory_manager(gc_memory_pool())), vec_(vec) {} @@ -181,6 +181,24 @@ class RBuffer : public MutableBuffer { private: // vec_ holds the memory RVector vec_; + + static void* getDataPointer(RVector& vec) { + if (TYPEOF(vec) == LGLSXP) { + return LOGICAL(vec); + } else if (TYPEOF(vec) == INTSXP) { + return INTEGER(vec); + } else if (TYPEOF(vec) == REALSXP) { + return REAL(vec); + } else if (TYPEOF(vec) == CPLXSXP) { + return COMPLEX(vec); + } else if (TYPEOF(vec) == STRSXP) { + // We don't want to expose the string data here, so we error + cpp11::stop("Operation not supported for string vectors."); + } else { + // raw + return RAW(vec); + } + } }; std::shared_ptr InferArrowTypeFromFactor(SEXP); diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 3292c44520f..981bd911fba 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1214,11 +1214,11 @@ bool can_reuse_memory(SEXP x, const std::shared_ptr& type) { // because MakeSimpleArray below will force materialization switch (type->id()) { case Type::INT32: - return TYPEOF(x) == INTSXP && !OBJECT(x); + return TYPEOF(x) == INTSXP && !Rf_isObject(x); case Type::DOUBLE: - return TYPEOF(x) == REALSXP && !OBJECT(x); + return TYPEOF(x) == REALSXP && !Rf_isObject(x); case Type::INT8: - return TYPEOF(x) == RAWSXP && !OBJECT(x); + return TYPEOF(x) == RAWSXP && !Rf_isObject(x); case Type::INT64: return TYPEOF(x) == REALSXP && Rf_inherits(x, "integer64"); default: @@ -1412,17 +1412,17 @@ bool vector_from_r_memory(SEXP x, const std::shared_ptr& type, switch (type->id()) { case Type::INT32: - return TYPEOF(x) == INTSXP && !OBJECT(x) && + return TYPEOF(x) == INTSXP && !Rf_isObject(x) && vector_from_r_memory_impl(x, type, columns, j, tasks); case Type::DOUBLE: - return TYPEOF(x) == REALSXP && !OBJECT(x) && + return TYPEOF(x) == REALSXP && !Rf_isObject(x) && vector_from_r_memory_impl(x, type, columns, j, tasks); case Type::UINT8: - return TYPEOF(x) == RAWSXP && !OBJECT(x) && + return TYPEOF(x) == RAWSXP && !Rf_isObject(x) && vector_from_r_memory_impl(x, type, columns, j, tasks); diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R index 50bd40988e5..d1e90b6b59a 100644 --- a/r/tests/testthat/test-altrep.R +++ b/r/tests/testthat/test-altrep.R @@ -170,7 +170,7 @@ test_that("element access methods for int32 ALTREP with nulls", { expect_identical(test_arrow_altrep_copy_by_region(altrep, 123), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # because there are no nulls, DATAPTR() does not materialize + # because there are nulls, DATAPTR() does materialize expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) expect_true(test_arrow_altrep_is_materialized(altrep)) @@ -193,7 +193,7 @@ test_that("element access methods for double ALTREP with nulls", { expect_identical(test_arrow_altrep_copy_by_region(altrep, 123), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # because there are no nulls, DATAPTR() does not materialize + # because there are nulls, DATAPTR() does materialize expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) expect_true(test_arrow_altrep_is_materialized(altrep)) @@ -244,14 +244,13 @@ test_that("element access methods for character ALTREP", { expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # DATAPTR() should always materialize for strings - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) + # match() calls DATAPTR() internally which materializes the vector + match(altrep, c("1", "40", "999")) expect_true(test_arrow_altrep_is_materialized(altrep)) # test element access after materialization expect_true(test_arrow_altrep_is_materialized(altrep)) expect_identical(test_arrow_altrep_copy_by_element(altrep), original) - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) }) test_that("element access methods for character ALTREP from large_utf8()", { @@ -265,14 +264,13 @@ test_that("element access methods for character ALTREP from large_utf8()", { expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # DATAPTR() should always materialize for strings - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) + # match() calls DATAPTR() internally which materializes the vector + match(altrep, c("1", "40", "999")) expect_true(test_arrow_altrep_is_materialized(altrep)) # test element access after materialization expect_true(test_arrow_altrep_is_materialized(altrep)) expect_identical(test_arrow_altrep_copy_by_element(altrep), original) - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) }) test_that("empty vectors are not altrep", {