Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 35 additions & 17 deletions r/src/altrep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ const std::shared_ptr<ChunkedArray>& GetChunkedArray(SEXP alt) {
// materialization is needed.
// data2: starts as NULL, and becomes a standard R vector with the same
// data if necessary: if materialization is needed, e.g. if we need
// to access its data pointer, with DATAPTR().
// to access its data pointer, with INTEGER(), REAL(), etc.
template <typename Impl>
struct AltrepVectorBase {
// store the Array as an external pointer in data1, mark as immutable
Expand Down Expand Up @@ -220,7 +220,14 @@ struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sex
SEXP copy = PROTECT(Rf_allocVector(sexp_type, size));

// copy the data from the array, through Get_region
Get_region(alt, 0, size, reinterpret_cast<c_type*>(DATAPTR(copy)));
if constexpr (std::is_same_v<c_type, double>) {
Get_region(alt, 0, size, REAL(copy));
} else if constexpr (std::is_same_v<c_type, int>) {
Get_region(alt, 0, size, INTEGER(copy));
} else {
static_assert(std::is_same_v<c_type, double> || std::is_same_v<c_type, int>,
"ALTREP not implemented for this c_type");
}

// store as data2, this is now considered materialized
SetRepresentation(alt, copy);
Expand Down Expand Up @@ -269,13 +276,27 @@ struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sex
}

// Otherwise we have to materialize and hand the pointer to data2
return DATAPTR(Materialize(alt));
if constexpr (std::is_same_v<c_type, double>) {
return REAL(Materialize(alt));
} else if constexpr (std::is_same_v<c_type, int>) {
return INTEGER(Materialize(alt));
} else {
static_assert(std::is_same_v<c_type, double> || std::is_same_v<c_type, int>,
"ALTREP not implemented for this c_type");
}
}

// The value at position i
static c_type Elt(SEXP alt, R_xlen_t i) {
if (IsMaterialized(alt)) {
return reinterpret_cast<c_type*>(DATAPTR(Representation(alt)))[i];
if constexpr (std::is_same_v<c_type, double>) {
return REAL(Representation(alt))[i];
} else if constexpr (std::is_same_v<c_type, int>) {
return INTEGER(Representation(alt))[i];
} else {
static_assert(std::is_same_v<c_type, double> || std::is_same_v<c_type, int>,
"ALTREP not implemented for this c_type");
}
}

auto altrep_data =
Expand Down Expand Up @@ -531,7 +552,7 @@ struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));

// copy the data from the array, through Get_region
Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));
Get_region(alt, 0, size, INTEGER(copy));

// store as data2, this is now considered materialized
SetRepresentation(alt, copy);
Expand All @@ -552,7 +573,7 @@ struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
return nullptr;
}

static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); }
static void* Dataptr(SEXP alt, Rboolean writeable) { return INTEGER(Materialize(alt)); }

static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
// the representation integer vector
Expand Down Expand Up @@ -892,7 +913,9 @@ struct AltrepVectorString : public AltrepVectorBase<AltrepVectorString<Type>> {
return s;
}

static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); }
static void* Dataptr(SEXP alt, Rboolean writeable) {
return const_cast<void*>(DATAPTR_RO(Materialize(alt)));
}

static SEXP Materialize(SEXP alt) {
if (Base::IsMaterialized(alt)) {
Expand Down Expand Up @@ -931,7 +954,9 @@ struct AltrepVectorString : public AltrepVectorBase<AltrepVectorString<Type>> {
}

static const void* Dataptr_or_null(SEXP alt) {
if (Base::IsMaterialized(alt)) return DATAPTR(Representation(alt));
if (Base::IsMaterialized(alt)) {
return DATAPTR_RO(alt);
}

// otherwise give up
return nullptr;
Expand Down Expand Up @@ -1267,21 +1292,14 @@ sexp test_arrow_altrep_copy_by_dataptr(sexp x) {

if (TYPEOF(x) == INTSXP) {
cpp11::writable::integers out(Rf_xlength(x));
int* ptr = reinterpret_cast<int*>(DATAPTR(x));
int* ptr = INTEGER(x);
for (R_xlen_t i = 0; i < n; i++) {
out[i] = ptr[i];
}
return out;
} else if (TYPEOF(x) == REALSXP) {
cpp11::writable::doubles out(Rf_xlength(x));
double* ptr = reinterpret_cast<double*>(DATAPTR(x));
for (R_xlen_t i = 0; i < n; i++) {
out[i] = ptr[i];
}
return out;
} else if (TYPEOF(x) == STRSXP) {
cpp11::writable::strings out(Rf_xlength(x));
SEXP* ptr = reinterpret_cast<SEXP*>(DATAPTR(x));
double* ptr = REAL(x);
for (R_xlen_t i = 0; i < n; i++) {
out[i] = ptr[i];
}
Expand Down
20 changes: 19 additions & 1 deletion r/src/arrow_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,32 @@ template <typename RVector>
class RBuffer : public MutableBuffer {
public:
explicit RBuffer(RVector vec)
: MutableBuffer(reinterpret_cast<uint8_t*>(DATAPTR(vec)),
: MutableBuffer(reinterpret_cast<uint8_t*>(getDataPointer(vec)),
vec.size() * sizeof(typename RVector::value_type),
arrow::CPUDevice::memory_manager(gc_memory_pool())),
vec_(vec) {}

private:
// vec_ holds the memory
RVector vec_;

static void* getDataPointer(RVector& vec) {
if (TYPEOF(vec) == LGLSXP) {
return LOGICAL(vec);
} else if (TYPEOF(vec) == INTSXP) {
return INTEGER(vec);
} else if (TYPEOF(vec) == REALSXP) {
return REAL(vec);
} else if (TYPEOF(vec) == CPLXSXP) {
return COMPLEX(vec);
} else if (TYPEOF(vec) == STRSXP) {
// We don't want to expose the string data here, so we error
cpp11::stop("Operation not supported for string vectors.");
} else {
// raw
return RAW(vec);
}
}
};

std::shared_ptr<arrow::DataType> InferArrowTypeFromFactor(SEXP);
Expand Down
12 changes: 6 additions & 6 deletions r/src/r_to_arrow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1214,11 +1214,11 @@ bool can_reuse_memory(SEXP x, const std::shared_ptr<arrow::DataType>& type) {
// because MakeSimpleArray below will force materialization
switch (type->id()) {
case Type::INT32:
return TYPEOF(x) == INTSXP && !OBJECT(x);
return TYPEOF(x) == INTSXP && !Rf_isObject(x);
case Type::DOUBLE:
return TYPEOF(x) == REALSXP && !OBJECT(x);
return TYPEOF(x) == REALSXP && !Rf_isObject(x);
case Type::INT8:
return TYPEOF(x) == RAWSXP && !OBJECT(x);
return TYPEOF(x) == RAWSXP && !Rf_isObject(x);
case Type::INT64:
return TYPEOF(x) == REALSXP && Rf_inherits(x, "integer64");
default:
Expand Down Expand Up @@ -1412,17 +1412,17 @@ bool vector_from_r_memory(SEXP x, const std::shared_ptr<DataType>& type,

switch (type->id()) {
case Type::INT32:
return TYPEOF(x) == INTSXP && !OBJECT(x) &&
return TYPEOF(x) == INTSXP && !Rf_isObject(x) &&
vector_from_r_memory_impl<cpp11::integers, Int32Type>(x, type, columns, j,
tasks);

case Type::DOUBLE:
return TYPEOF(x) == REALSXP && !OBJECT(x) &&
return TYPEOF(x) == REALSXP && !Rf_isObject(x) &&
vector_from_r_memory_impl<cpp11::doubles, DoubleType>(x, type, columns, j,
tasks);

case Type::UINT8:
return TYPEOF(x) == RAWSXP && !OBJECT(x) &&
return TYPEOF(x) == RAWSXP && !Rf_isObject(x) &&
vector_from_r_memory_impl<cpp11::raws, UInt8Type>(x, type, columns, j,
tasks);

Expand Down
14 changes: 6 additions & 8 deletions r/tests/testthat/test-altrep.R
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ test_that("element access methods for int32 ALTREP with nulls", {
expect_identical(test_arrow_altrep_copy_by_region(altrep, 123), original)
expect_false(test_arrow_altrep_is_materialized(altrep))

# because there are no nulls, DATAPTR() does not materialize
# because there are nulls, DATAPTR() does materialize
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment change and the one below are not behavior changes with this PR, but I think the comments were simply wrong (either old or copy pasted). I've tried to correct them to be accurate descriptions of what's going on (but see "does not materialize" and then two lines later expect_true(test_arrow_altrep_is_materialized(altrep)) is at odds with each other

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're correct here that my earlier comment was simply wrong 🙂

expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original)
expect_true(test_arrow_altrep_is_materialized(altrep))

Expand All @@ -193,7 +193,7 @@ test_that("element access methods for double ALTREP with nulls", {
expect_identical(test_arrow_altrep_copy_by_region(altrep, 123), original)
expect_false(test_arrow_altrep_is_materialized(altrep))

# because there are no nulls, DATAPTR() does not materialize
# because there are nulls, DATAPTR() does materialize
expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original)
expect_true(test_arrow_altrep_is_materialized(altrep))

Expand Down Expand Up @@ -244,14 +244,13 @@ test_that("element access methods for character ALTREP", {
expect_identical(test_arrow_altrep_copy_by_element(altrep), original)
expect_false(test_arrow_altrep_is_materialized(altrep))

# DATAPTR() should always materialize for strings
expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original)
# match() calls DATAPTR() internally which materializes the vector
match(altrep, c("1", "40", "999"))
expect_true(test_arrow_altrep_is_materialized(altrep))

# test element access after materialization
expect_true(test_arrow_altrep_is_materialized(altrep))
expect_identical(test_arrow_altrep_copy_by_element(altrep), original)
expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original)
})

test_that("element access methods for character ALTREP from large_utf8()", {
Expand All @@ -265,14 +264,13 @@ test_that("element access methods for character ALTREP from large_utf8()", {
expect_identical(test_arrow_altrep_copy_by_element(altrep), original)
expect_false(test_arrow_altrep_is_materialized(altrep))

# DATAPTR() should always materialize for strings
expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original)
# match() calls DATAPTR() internally which materializes the vector
match(altrep, c("1", "40", "999"))
expect_true(test_arrow_altrep_is_materialized(altrep))

# test element access after materialization
expect_true(test_arrow_altrep_is_materialized(altrep))
expect_identical(test_arrow_altrep_copy_by_element(altrep), original)
expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original)
})

test_that("empty vectors are not altrep", {
Expand Down
Loading