From 26de2b8d6f5a7cf580d27d508328a9070c7d2a0b Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 26 Mar 2025 21:39:40 -0500 Subject: [PATCH 01/20] No more object --- r/src/r_to_arrow.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 3292c44520f..981bd911fba 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1214,11 +1214,11 @@ bool can_reuse_memory(SEXP x, const std::shared_ptr& type) { // because MakeSimpleArray below will force materialization switch (type->id()) { case Type::INT32: - return TYPEOF(x) == INTSXP && !OBJECT(x); + return TYPEOF(x) == INTSXP && !Rf_isObject(x); case Type::DOUBLE: - return TYPEOF(x) == REALSXP && !OBJECT(x); + return TYPEOF(x) == REALSXP && !Rf_isObject(x); case Type::INT8: - return TYPEOF(x) == RAWSXP && !OBJECT(x); + return TYPEOF(x) == RAWSXP && !Rf_isObject(x); case Type::INT64: return TYPEOF(x) == REALSXP && Rf_inherits(x, "integer64"); default: @@ -1412,17 +1412,17 @@ bool vector_from_r_memory(SEXP x, const std::shared_ptr& type, switch (type->id()) { case Type::INT32: - return TYPEOF(x) == INTSXP && !OBJECT(x) && + return TYPEOF(x) == INTSXP && !Rf_isObject(x) && vector_from_r_memory_impl(x, type, columns, j, tasks); case Type::DOUBLE: - return TYPEOF(x) == REALSXP && !OBJECT(x) && + return TYPEOF(x) == REALSXP && !Rf_isObject(x) && vector_from_r_memory_impl(x, type, columns, j, tasks); case Type::UINT8: - return TYPEOF(x) == RAWSXP && !OBJECT(x) && + return TYPEOF(x) == RAWSXP && !Rf_isObject(x) && vector_from_r_memory_impl(x, type, columns, j, tasks); From e5755a517fa794139c4d3ede1cb17e45150dc3b8 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 27 Mar 2025 22:38:37 -0500 Subject: [PATCH 02/20] One step (with a step to the side since STRING_PTR is also non-API) --- r/src/altrep.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 90a459e19cb..82b70b31a49 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -531,7 +531,7 @@ struct AltrepFactor : public AltrepVectorBase { SEXP copy = PROTECT(Rf_allocVector(INTSXP, size)); // copy the data from the array, through Get_region - Get_region(alt, 0, size, reinterpret_cast(DATAPTR(copy))); + Get_region(alt, 0, size, reinterpret_cast(INTEGER(copy))); // store as data2, this is now considered materialized SetRepresentation(alt, copy); @@ -892,7 +892,7 @@ struct AltrepVectorString : public AltrepVectorBase> { return s; } - static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); } + static void* Dataptr(SEXP alt, Rboolean writeable) { return STRING_PTR(Materialize(alt)); } static SEXP Materialize(SEXP alt) { if (Base::IsMaterialized(alt)) { @@ -931,7 +931,7 @@ struct AltrepVectorString : public AltrepVectorBase> { } static const void* Dataptr_or_null(SEXP alt) { - if (Base::IsMaterialized(alt)) return DATAPTR(Representation(alt)); + if (Base::IsMaterialized(alt)) return STRING_PTR(Representation(alt)); // otherwise give up return nullptr; @@ -1267,21 +1267,21 @@ sexp test_arrow_altrep_copy_by_dataptr(sexp x) { if (TYPEOF(x) == INTSXP) { cpp11::writable::integers out(Rf_xlength(x)); - int* ptr = reinterpret_cast(DATAPTR(x)); + int* ptr = reinterpret_cast(INTEGER(x)); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } return out; } else if (TYPEOF(x) == REALSXP) { cpp11::writable::doubles out(Rf_xlength(x)); - double* ptr = reinterpret_cast(DATAPTR(x)); + double* ptr = reinterpret_cast(REAL(x)); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } return out; } else if (TYPEOF(x) == STRSXP) { cpp11::writable::strings out(Rf_xlength(x)); - SEXP* ptr = reinterpret_cast(DATAPTR(x)); + SEXP* ptr = reinterpret_cast(STRING_PTR(x)); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } From 19077f28531bbe69ea65982bfcda85731667dab8 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 17:55:20 -0500 Subject: [PATCH 03/20] another step forward (though still with string_ptr) --- r/src/arrow_types.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 1595a3c8fb5..410b6b87679 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -173,7 +173,7 @@ template class RBuffer : public MutableBuffer { public: explicit RBuffer(RVector vec) - : MutableBuffer(reinterpret_cast(DATAPTR(vec)), + : MutableBuffer(reinterpret_cast(getDataPointer(vec)), vec.size() * sizeof(typename RVector::value_type), arrow::CPUDevice::memory_manager(gc_memory_pool())), vec_(vec) {} @@ -181,6 +181,22 @@ class RBuffer : public MutableBuffer { private: // vec_ holds the memory RVector vec_; + + static void* getDataPointer(RVector& vec) { + if (TYPEOF(vec) == LGLSXP) { + return LOGICAL(vec); + } else if (TYPEOF(vec) == INTSXP) { + return INTEGER(vec); + } else if (TYPEOF(vec) == REALSXP) { + return REAL(vec); + } else if (TYPEOF(vec) == CPLXSXP) { + return COMPLEX(vec); + } else if (TYPEOF(vec) == STRSXP) { + return STRING_PTR(vec); + } else if (TYPEOF(vec) == RAWSXP) { + return RAW(vec); + } + } }; std::shared_ptr InferArrowTypeFromFactor(SEXP); From ce537f82205f3035033b3d0c6fed949b21d77c12 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 18:00:29 -0500 Subject: [PATCH 04/20] linting --- r/src/altrep.cpp | 4 +++- r/src/arrow_types.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 82b70b31a49..4fdd09093e1 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -892,7 +892,9 @@ struct AltrepVectorString : public AltrepVectorBase> { return s; } - static void* Dataptr(SEXP alt, Rboolean writeable) { return STRING_PTR(Materialize(alt)); } + static void* Dataptr(SEXP alt, Rboolean writeable) { + return STRING_PTR(Materialize(alt)); + } static SEXP Materialize(SEXP alt) { if (Base::IsMaterialized(alt)) { diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 410b6b87679..d75b6e360c7 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -182,7 +182,7 @@ class RBuffer : public MutableBuffer { // vec_ holds the memory RVector vec_; - static void* getDataPointer(RVector& vec) { + static void* getDataPointer(RVector& vec) { if (TYPEOF(vec) == LGLSXP) { return LOGICAL(vec); } else if (TYPEOF(vec) == INTSXP) { From 045d7dd7d30c1286a155a331c14109387ecf6340 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 18:05:21 -0500 Subject: [PATCH 05/20] add an extra branch --- r/src/arrow_types.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index d75b6e360c7..1e6771b3085 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -195,6 +195,8 @@ class RBuffer : public MutableBuffer { return STRING_PTR(vec); } else if (TYPEOF(vec) == RAWSXP) { return RAW(vec); + } else { + return RAW(vec); } } }; From fb03d266be1e6a251b10e00d101daac702a62981 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 18:10:06 -0500 Subject: [PATCH 06/20] linting --- r/src/altrep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 4fdd09093e1..e3f47c0eb79 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -892,8 +892,8 @@ struct AltrepVectorString : public AltrepVectorBase> { return s; } - static void* Dataptr(SEXP alt, Rboolean writeable) { - return STRING_PTR(Materialize(alt)); + static void* Dataptr(SEXP alt, Rboolean writeable) { + return STRING_PTR(Materialize(alt)); } static SEXP Materialize(SEXP alt) { From f808bd5d1d3e2c97e9f6be6104fbbaf8be277245 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 18:55:23 -0500 Subject: [PATCH 07/20] STRING_ELT in types --- r/src/arrow_types.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 1e6771b3085..3217bc87013 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -192,10 +192,9 @@ class RBuffer : public MutableBuffer { } else if (TYPEOF(vec) == CPLXSXP) { return COMPLEX(vec); } else if (TYPEOF(vec) == STRSXP) { - return STRING_PTR(vec); - } else if (TYPEOF(vec) == RAWSXP) { - return RAW(vec); + return STRING_ELT(vec, 0); } else { + // raw return RAW(vec); } } From 993103ccfd156b21e2a6fd477e78834e5dd95618 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 20:38:24 -0500 Subject: [PATCH 08/20] DATAPTR be gone --- r/src/altrep.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index e3f47c0eb79..4d24bb62d8e 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -220,7 +220,11 @@ struct AltrepVectorPrimitive : public AltrepVectorBase(DATAPTR(copy))); + if constexpr (std::is_same_v) { + Get_region(alt, 0, size, reinterpret_cast(REAL(copy))); + } else { + Get_region(alt, 0, size, reinterpret_cast(INTEGER(copy))); + } // store as data2, this is now considered materialized SetRepresentation(alt, copy); @@ -269,13 +273,21 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { + return REAL(Materialize(alt)); + } else { + return INTEGER(Materialize(alt)); + } } // The value at position i static c_type Elt(SEXP alt, R_xlen_t i) { if (IsMaterialized(alt)) { - return reinterpret_cast(DATAPTR(Representation(alt)))[i]; + if constexpr (std::is_same_v) { + return reinterpret_cast(REAL(Representation(alt)))[i]; + } else { + return reinterpret_cast(INTEGER(Representation(alt)))[i]; + } } auto altrep_data = @@ -552,7 +564,7 @@ struct AltrepFactor : public AltrepVectorBase { return nullptr; } - static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); } + static void* Dataptr(SEXP alt, Rboolean writeable) { return INTEGER(Materialize(alt)); } static SEXP Duplicate(SEXP alt, Rboolean /* deep */) { // the representation integer vector From e8fa19f04bc44ded8eb88ac630280ea2047126c2 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 21:25:43 -0500 Subject: [PATCH 09/20] No more string_ptr --- r/src/altrep.cpp | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 4d24bb62d8e..431db544318 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -115,7 +115,7 @@ const std::shared_ptr& GetChunkedArray(SEXP alt) { // materialization is needed. // data2: starts as NULL, and becomes a standard R vector with the same // data if necessary: if materialization is needed, e.g. if we need -// to access its data pointer, with DATAPTR(). +// to access its data pointer, with INTEGER(), REAL(), etc. template struct AltrepVectorBase { // store the Array as an external pointer in data1, mark as immutable @@ -905,7 +905,19 @@ struct AltrepVectorString : public AltrepVectorBase> { } static void* Dataptr(SEXP alt, Rboolean writeable) { - return STRING_PTR(Materialize(alt)); + SEXP materialized = Materialize(alt); + // if (!isString(materialized)) { + // error("Expected a character vector"); + // } + + R_xlen_t len = XLENGTH(materialized); + void** str_array = (void**) R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) + + for (R_xlen_t i = 0; i < len; i++) { + str_array[i] = (void*) STRING_ELT(materialized, i); + } + + return str_array; // Pointer to array of SEXP elements } static SEXP Materialize(SEXP alt) { @@ -945,7 +957,18 @@ struct AltrepVectorString : public AltrepVectorBase> { } static const void* Dataptr_or_null(SEXP alt) { - if (Base::IsMaterialized(alt)) return STRING_PTR(Representation(alt)); + if (Base::IsMaterialized(alt)) { + SEXP materialized = Materialize(alt); + + R_xlen_t len = XLENGTH(materialized); + void** str_array = (void**) R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) + + for (R_xlen_t i = 0; i < len; i++) { + str_array[i] = (void*) STRING_ELT(materialized, i); + } + + return str_array; // Pointer to array of SEXP elements + } // otherwise give up return nullptr; @@ -1295,10 +1318,13 @@ sexp test_arrow_altrep_copy_by_dataptr(sexp x) { return out; } else if (TYPEOF(x) == STRSXP) { cpp11::writable::strings out(Rf_xlength(x)); - SEXP* ptr = reinterpret_cast(STRING_PTR(x)); - for (R_xlen_t i = 0; i < n; i++) { - out[i] = ptr[i]; + R_xlen_t len = Rf_xlength(x); + + for (R_xlen_t i = 0; i < len; i++) { + SEXP str_elt = reinterpret_cast(STRING_ELT(x, i)); + out[i] = str_elt; } + return out; } else { return R_NilValue; From acbbf95d4c4e9a17d7f33ec27365ff8ab03c9938 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 21:35:51 -0500 Subject: [PATCH 10/20] linting --- r/src/altrep.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 431db544318..e631e0ab12b 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -905,19 +905,16 @@ struct AltrepVectorString : public AltrepVectorBase> { } static void* Dataptr(SEXP alt, Rboolean writeable) { - SEXP materialized = Materialize(alt); - // if (!isString(materialized)) { - // error("Expected a character vector"); - // } + SEXP materialized = Materialize(alt); - R_xlen_t len = XLENGTH(materialized); - void** str_array = (void**) R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) - - for (R_xlen_t i = 0; i < len; i++) { - str_array[i] = (void*) STRING_ELT(materialized, i); - } + R_xlen_t len = XLENGTH(materialized); + void** str_array = + (void**)R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) + for (R_xlen_t i = 0; i < len; i++) { + str_array[i] = (void*) STRING_ELT(materialized, i); + } - return str_array; // Pointer to array of SEXP elements + return str_array; // Pointer to array of SEXP elements } static SEXP Materialize(SEXP alt) { From 8123ed7505de2838b7ebba590f7f6277ce38ea97 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 21:43:31 -0500 Subject: [PATCH 11/20] moar linting --- r/src/altrep.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index e631e0ab12b..74efa3bb9ce 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -911,7 +911,7 @@ struct AltrepVectorString : public AltrepVectorBase> { void** str_array = (void**)R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) for (R_xlen_t i = 0; i < len; i++) { - str_array[i] = (void*) STRING_ELT(materialized, i); + str_array[i] = (void*)STRING_ELT(materialized, i); } return str_array; // Pointer to array of SEXP elements @@ -958,13 +958,14 @@ struct AltrepVectorString : public AltrepVectorBase> { SEXP materialized = Materialize(alt); R_xlen_t len = XLENGTH(materialized); - void** str_array = (void**) R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) + void** str_array = + (void**) R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) for (R_xlen_t i = 0; i < len; i++) { - str_array[i] = (void*) STRING_ELT(materialized, i); + str_array[i] = (void*)STRING_ELT(materialized, i); } - return str_array; // Pointer to array of SEXP elements + return str_array; // Pointer to array of SEXP elements } // otherwise give up From 4a4985eefbc80a9572958c11c80fbdc5b0978270 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 28 Mar 2025 21:53:28 -0500 Subject: [PATCH 12/20] ugh --- r/src/altrep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 74efa3bb9ce..10e205e5552 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -959,10 +959,10 @@ struct AltrepVectorString : public AltrepVectorBase> { R_xlen_t len = XLENGTH(materialized); void** str_array = - (void**) R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) + (void**)R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) for (R_xlen_t i = 0; i < len; i++) { - str_array[i] = (void*)STRING_ELT(materialized, i); + str_array[i] = (void*)STRING_ELT(materialized, i); } return str_array; // Pointer to array of SEXP elements From d75b94a628620e468737c2784e0e696a8a421dbd Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 29 Mar 2025 08:21:15 -0500 Subject: [PATCH 13/20] When materialized, can us DATAPTR_RO actually --- r/src/altrep.cpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 10e205e5552..b64f28b8377 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -955,17 +955,7 @@ struct AltrepVectorString : public AltrepVectorBase> { static const void* Dataptr_or_null(SEXP alt) { if (Base::IsMaterialized(alt)) { - SEXP materialized = Materialize(alt); - - R_xlen_t len = XLENGTH(materialized); - void** str_array = - (void**)R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) - - for (R_xlen_t i = 0; i < len; i++) { - str_array[i] = (void*)STRING_ELT(materialized, i); - } - - return str_array; // Pointer to array of SEXP elements + return DATAPTR_RO(alt); } // otherwise give up From 6ef45eb2ef8c31ad08785e196593f8918d029e53 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 29 Mar 2025 08:31:23 -0500 Subject: [PATCH 14/20] some cleanup --- r/src/altrep.cpp | 7 ++++--- r/src/arrow_types.h | 10 +++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index b64f28b8377..08e2e96a033 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -905,16 +905,17 @@ struct AltrepVectorString : public AltrepVectorBase> { } static void* Dataptr(SEXP alt, Rboolean writeable) { + // DATAPTR(Materialize(alt)) is disallowed by CRAN, so we need to createt the array of + // string and place the SEXPs in it ourselves with STRING_ELT() SEXP materialized = Materialize(alt); R_xlen_t len = XLENGTH(materialized); - void** str_array = - (void**)R_alloc(len, sizeof(void*)); // Allocate array (R's memory allocator) + void** str_array = (void**)R_alloc(len, sizeof(void*)); for (R_xlen_t i = 0; i < len; i++) { str_array[i] = (void*)STRING_ELT(materialized, i); } - return str_array; // Pointer to array of SEXP elements + return str_array; } static SEXP Materialize(SEXP alt) { diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 3217bc87013..c2f13e68573 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -192,7 +192,15 @@ class RBuffer : public MutableBuffer { } else if (TYPEOF(vec) == CPLXSXP) { return COMPLEX(vec); } else if (TYPEOF(vec) == STRSXP) { - return STRING_ELT(vec, 0); + cpp11::writable::strings out(Rf_xlength(vec)); + R_xlen_t len = Rf_xlength(vec); + + for (R_xlen_t i = 0; i < len; i++) { + SEXP str_elt = reinterpret_cast(STRING_ELT(vec, i)); + out[i] = str_elt; + } + + return out; } else { // raw return RAW(vec); From 19fe1da9aa1f8c7d53f4428cf284f7bde25615cc Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 31 Mar 2025 07:27:16 -0500 Subject: [PATCH 15/20] Clean up following review --- r/src/altrep.cpp | 5 +---- r/src/arrow_types.h | 12 +++--------- r/tests/testthat/test-altrep.R | 14 ++++++++------ 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 08e2e96a033..04abbef8425 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -1307,13 +1307,10 @@ sexp test_arrow_altrep_copy_by_dataptr(sexp x) { return out; } else if (TYPEOF(x) == STRSXP) { cpp11::writable::strings out(Rf_xlength(x)); - R_xlen_t len = Rf_xlength(x); - - for (R_xlen_t i = 0; i < len; i++) { + for (R_xlen_t i = 0; i < n; i++) { SEXP str_elt = reinterpret_cast(STRING_ELT(x, i)); out[i] = str_elt; } - return out; } else { return R_NilValue; diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index c2f13e68573..4ac98cc37ed 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -192,15 +192,9 @@ class RBuffer : public MutableBuffer { } else if (TYPEOF(vec) == CPLXSXP) { return COMPLEX(vec); } else if (TYPEOF(vec) == STRSXP) { - cpp11::writable::strings out(Rf_xlength(vec)); - R_xlen_t len = Rf_xlength(vec); - - for (R_xlen_t i = 0; i < len; i++) { - SEXP str_elt = reinterpret_cast(STRING_ELT(vec, i)); - out[i] = str_elt; - } - - return out; + // Technically this returns the address to the first element of the string vector + // which is similar to what we would get with DATAPTR() before it was removed + return STRING_ELT(vec, 0); } else { // raw return RAW(vec); diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R index 50bd40988e5..94aebda7e31 100644 --- a/r/tests/testthat/test-altrep.R +++ b/r/tests/testthat/test-altrep.R @@ -170,7 +170,7 @@ test_that("element access methods for int32 ALTREP with nulls", { expect_identical(test_arrow_altrep_copy_by_region(altrep, 123), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # because there are no nulls, DATAPTR() does not materialize + # because there are nulls, DATAPTR() does materialize expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) expect_true(test_arrow_altrep_is_materialized(altrep)) @@ -193,7 +193,7 @@ test_that("element access methods for double ALTREP with nulls", { expect_identical(test_arrow_altrep_copy_by_region(altrep, 123), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # because there are no nulls, DATAPTR() does not materialize + # because there are nulls, DATAPTR() does materialize expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) expect_true(test_arrow_altrep_is_materialized(altrep)) @@ -244,11 +244,12 @@ test_that("element access methods for character ALTREP", { expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # DATAPTR() should always materialize for strings + # DATAPTR() does not materialize expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) - expect_true(test_arrow_altrep_is_materialized(altrep)) + expect_false(test_arrow_altrep_is_materialized(altrep)) # test element access after materialization + test_arrow_altrep_force_materialize(altrep) expect_true(test_arrow_altrep_is_materialized(altrep)) expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) @@ -265,11 +266,12 @@ test_that("element access methods for character ALTREP from large_utf8()", { expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # DATAPTR() should always materialize for strings + # DATAPTR() sdoes not materialize expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) - expect_true(test_arrow_altrep_is_materialized(altrep)) + expect_false(test_arrow_altrep_is_materialized(altrep)) # test element access after materialization + test_arrow_altrep_force_materialize(altrep) expect_true(test_arrow_altrep_is_materialized(altrep)) expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) From 7351ce91d2f21ab8c3f219b09259de2d6e35415a Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 31 Mar 2025 12:51:03 -0500 Subject: [PATCH 16/20] Update r/src/altrep.cpp Co-authored-by: Dewey Dunnington --- r/src/altrep.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 04abbef8425..3aa71f2c004 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -275,8 +275,10 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { return REAL(Materialize(alt)); - } else { + } else if constexpr (std::is_same_v) { return INTEGER(Materialize(alt)); + } else { + static_assert(false, "ALTREP not implemented for c_type"); } } From 7a85dcf26ad01f790b3ec4935d145b53f6a8b0ff Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 31 Mar 2025 13:09:05 -0500 Subject: [PATCH 17/20] PR comments --- r/src/altrep.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 3aa71f2c004..97cca2af7d7 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -221,9 +221,9 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { - Get_region(alt, 0, size, reinterpret_cast(REAL(copy))); + Get_region(alt, 0, size, REAL(copy)); } else { - Get_region(alt, 0, size, reinterpret_cast(INTEGER(copy))); + Get_region(alt, 0, size, INTEGER(copy)); } // store as data2, this is now considered materialized @@ -907,17 +907,7 @@ struct AltrepVectorString : public AltrepVectorBase> { } static void* Dataptr(SEXP alt, Rboolean writeable) { - // DATAPTR(Materialize(alt)) is disallowed by CRAN, so we need to createt the array of - // string and place the SEXPs in it ourselves with STRING_ELT() - SEXP materialized = Materialize(alt); - - R_xlen_t len = XLENGTH(materialized); - void** str_array = (void**)R_alloc(len, sizeof(void*)); - for (R_xlen_t i = 0; i < len; i++) { - str_array[i] = (void*)STRING_ELT(materialized, i); - } - - return str_array; + return const_cast(DATAPTR_RO(Materialize(alt))); } static SEXP Materialize(SEXP alt) { From ba726626e8b530d299b9e0777cea65ffd615691e Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 5 Apr 2025 09:28:24 -0500 Subject: [PATCH 18/20] PR comments --- r/src/altrep.cpp | 19 ++++++++----------- r/src/arrow_types.h | 5 ++--- r/tests/testthat/test-altrep.R | 16 ++++++---------- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 97cca2af7d7..9903bb7a6e1 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -222,8 +222,10 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { Get_region(alt, 0, size, REAL(copy)); - } else { + } else if constexpr (std::is_same_v) { Get_region(alt, 0, size, INTEGER(copy)); + } else { + static_assert(false, "ALTREP not implemented for c_type"); } // store as data2, this is now considered materialized @@ -287,8 +289,10 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { return reinterpret_cast(REAL(Representation(alt)))[i]; - } else { + } else if constexpr (std::is_same_v) { return reinterpret_cast(INTEGER(Representation(alt)))[i]; + } else { + static_assert(false, "ALTREP not implemented for c_type"); } } @@ -1285,25 +1289,18 @@ sexp test_arrow_altrep_copy_by_dataptr(sexp x) { if (TYPEOF(x) == INTSXP) { cpp11::writable::integers out(Rf_xlength(x)); - int* ptr = reinterpret_cast(INTEGER(x)); + int* ptr = INTEGER(x); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } return out; } else if (TYPEOF(x) == REALSXP) { cpp11::writable::doubles out(Rf_xlength(x)); - double* ptr = reinterpret_cast(REAL(x)); + double* ptr = REAL(x); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } return out; - } else if (TYPEOF(x) == STRSXP) { - cpp11::writable::strings out(Rf_xlength(x)); - for (R_xlen_t i = 0; i < n; i++) { - SEXP str_elt = reinterpret_cast(STRING_ELT(x, i)); - out[i] = str_elt; - } - return out; } else { return R_NilValue; } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 4ac98cc37ed..d1882e56daf 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -192,9 +192,8 @@ class RBuffer : public MutableBuffer { } else if (TYPEOF(vec) == CPLXSXP) { return COMPLEX(vec); } else if (TYPEOF(vec) == STRSXP) { - // Technically this returns the address to the first element of the string vector - // which is similar to what we would get with DATAPTR() before it was removed - return STRING_ELT(vec, 0); + // We don't want to expose the string data here, so we error + cpp11::stop("Operation not supported for string vectors."); } else { // raw return RAW(vec); diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R index 94aebda7e31..d1e90b6b59a 100644 --- a/r/tests/testthat/test-altrep.R +++ b/r/tests/testthat/test-altrep.R @@ -244,15 +244,13 @@ test_that("element access methods for character ALTREP", { expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # DATAPTR() does not materialize - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) - expect_false(test_arrow_altrep_is_materialized(altrep)) + # match() calls DATAPTR() internally which materializes the vector + match(altrep, c("1", "40", "999")) + expect_true(test_arrow_altrep_is_materialized(altrep)) # test element access after materialization - test_arrow_altrep_force_materialize(altrep) expect_true(test_arrow_altrep_is_materialized(altrep)) expect_identical(test_arrow_altrep_copy_by_element(altrep), original) - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) }) test_that("element access methods for character ALTREP from large_utf8()", { @@ -266,15 +264,13 @@ test_that("element access methods for character ALTREP from large_utf8()", { expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # DATAPTR() sdoes not materialize - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) - expect_false(test_arrow_altrep_is_materialized(altrep)) + # match() calls DATAPTR() internally which materializes the vector + match(altrep, c("1", "40", "999")) + expect_true(test_arrow_altrep_is_materialized(altrep)) # test element access after materialization - test_arrow_altrep_force_materialize(altrep) expect_true(test_arrow_altrep_is_materialized(altrep)) expect_identical(test_arrow_altrep_copy_by_element(altrep), original) - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) }) test_that("empty vectors are not altrep", { From a4b457dd112eeeb760aff5525923f4a51df9c378 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 5 Apr 2025 09:38:07 -0500 Subject: [PATCH 19/20] More specific static assertions --- r/src/altrep.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 9903bb7a6e1..e71266b5969 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -225,7 +225,8 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { Get_region(alt, 0, size, INTEGER(copy)); } else { - static_assert(false, "ALTREP not implemented for c_type"); + static_assert(std::is_same_v || std::is_same_v, + "ALTREP not implemented for this c_type"); } // store as data2, this is now considered materialized @@ -280,7 +281,8 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { return INTEGER(Materialize(alt)); } else { - static_assert(false, "ALTREP not implemented for c_type"); + static_assert(std::is_same_v || std::is_same_v, + "ALTREP not implemented for this c_type"); } } @@ -292,7 +294,8 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { return reinterpret_cast(INTEGER(Representation(alt)))[i]; } else { - static_assert(false, "ALTREP not implemented for c_type"); + static_assert(std::is_same_v || std::is_same_v, + "ALTREP not implemented for this c_type"); } } From a15918a4ca7f0b05e0c474a151746be180d37388 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 5 Apr 2025 15:02:06 -0500 Subject: [PATCH 20/20] Remove a few more extraneous reinterpret_casts --- r/src/altrep.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index e71266b5969..a4adf58997c 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -290,9 +290,9 @@ struct AltrepVectorPrimitive : public AltrepVectorBase) { - return reinterpret_cast(REAL(Representation(alt)))[i]; + return REAL(Representation(alt))[i]; } else if constexpr (std::is_same_v) { - return reinterpret_cast(INTEGER(Representation(alt)))[i]; + return INTEGER(Representation(alt))[i]; } else { static_assert(std::is_same_v || std::is_same_v, "ALTREP not implemented for this c_type"); @@ -552,7 +552,7 @@ struct AltrepFactor : public AltrepVectorBase { SEXP copy = PROTECT(Rf_allocVector(INTSXP, size)); // copy the data from the array, through Get_region - Get_region(alt, 0, size, reinterpret_cast(INTEGER(copy))); + Get_region(alt, 0, size, INTEGER(copy)); // store as data2, this is now considered materialized SetRepresentation(alt, copy);