From f3ceb1fd5ae0969b751aa6d5e4ad0e386c0c10e8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 May 2024 16:15:33 -0300 Subject: [PATCH 01/20] port vctr --- r/R/vctr.R | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 r/R/vctr.R diff --git a/r/R/vctr.R b/r/R/vctr.R new file mode 100644 index 000000000..05a031b9d --- /dev/null +++ b/r/R/vctr.R @@ -0,0 +1,219 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Experimental Arrow encoded arrays as R vectors +#' +#' @param x An object that works with [as_nanoarrow_array_stream()]. Most +#' spatial objects in R already work with this method. +#' @param ... Passed to [as_nanoarrow_array_stream()] +#' @param schema An optional `schema` +#' +#' @return A vctr of class 'nanoarrow_vctr' +#' @export +#' +#' @examples +#' as_nanoarrow_vctr(1:5) +#' +as_nanoarrow_vctr <- function(x, ..., schema = NULL) { + if (inherits(x, "nanoarrow_vctr") && is.null(schema)) { + return(x) + } + + stream <- as_nanoarrow_array_stream(x, ..., schema = schema) + chunks <- collect_array_stream(stream, validate = FALSE) + new_nanoarrow_vctr(chunks, stream$get_schema()) +} + +new_nanoarrow_vctr <- function(chunks, schema, indices = NULL) { + offsets <- .Call(nanoarrow_c_vctr_chunk_offsets, chunks) + if (is.null(indices)) { + indices <- seq_len(offsets[length(offsets)]) + } + + structure( + indices, + schema = schema, + chunks = chunks, + offsets = offsets, + class = c("nanoarrow_vctr", "wk_vctr") + ) +} + +#' @export +`[.nanoarrow_vctr` <- function(x, i) { + attrs <- attributes(x) + x <- NextMethod() + + if (is.null(vctr_as_slice(x))) { + stop( + "Can't subset nanoarrow_vctr with non-slice (e.g., only i:j indexing is supported)" + ) + } + + attributes(x) <- attrs + x +} + +#' @export +`[<-.nanoarrow_vctr` <- function(x, i, value) { + stop("subset assignment for nanoarrow_vctr is not supported") +} + +#' @export +`[[<-.nanoarrow_vctr` <- function(x, i, value) { + stop("subset assignment for nanoarrow_vctr is not supported") +} + +#' @export +format.nanoarrow_vctr <- function(x, ...) { + # Technically we can do better here + format(convert_array_stream(x), ...) +} + +# Because RStudio's viewer uses this, we want to use the potentially abbreviated +# WKT from the format method +#' @export +as.character.nanoarrow_vctr <- function(x, ...) { + format(x, ...) +} + +#' @export +infer_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) { + attr(x, "schema", exact = TRUE) +} + +# Because zero-length vctrs are R's way of communicating "type", implement +# as_nanoarrow_schema() here so that it works in places that expect a type +#' @importFrom nanoarrow as_nanoarrow_schema +#' @export +as_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) { + attr(x, "schema", exact = TRUE) +} + +#' @export +as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { + as_nanoarrow_array_stream.nanoarrow_vctr(x, ..., schema = schema) +} + +#' @importFrom nanoarrow as_nanoarrow_array_stream +#' @export +as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { + if (!is.null(schema)) { + stream <- as_nanoarrow_array_stream(x, schema = NULL) + return(as_nanoarrow_array_stream(stream, schema = schema)) + } + + slice <- vctr_as_slice(x) + if (is.null(slice)) { + stop("Can't resolve non-slice nanoarrow_vctr to nanoarrow_array_stream") + } + + x_schema <- attr(x, "schema", exact = TRUE) + + # Zero-size slice can be an array stream with zero batches + if (slice[2] == 0) { + return(basic_array_stream(list(), schema = x_schema)) + } + + # Full slice doesn't need slicing logic + offsets <- attr(x, "offsets", exact = TRUE) + batches <- attr(x, "chunks", exact = TRUE) + if (slice[1] == 1 && slice[2] == max(offsets)) { + return( + basic_array_stream( + batches, + schema = x_schema, + validate = FALSE + ) + ) + } + + # Calculate first and last slice information + first_index <- slice[1] - 1L + end_index <- first_index + slice[2] + last_index <- end_index - 1L + first_chunk_index <- vctr_resolve_chunk(first_index, offsets) + last_chunk_index <- vctr_resolve_chunk(last_index, offsets) + + first_chunk_offset <- first_index - offsets[first_chunk_index + 1L] + first_chunk_length <- offsets[first_chunk_index + 2L] - first_index + last_chunk_offset <- 0L + last_chunk_length <- end_index - offsets[last_chunk_index + 1L] + + # Calculate first and last slices + if (first_chunk_index == last_chunk_index) { + batch <- vctr_array_slice( + batches[[first_chunk_index + 1L]], + first_chunk_offset, + last_chunk_length - first_chunk_offset + ) + + return( + basic_array_stream( + list(batch), + schema = x_schema, + validate = FALSE + ) + ) + } + + batch1 <- vctr_array_slice( + batches[[first_chunk_index + 1L]], + first_chunk_offset, + first_chunk_length + ) + + batchn <- vctr_array_slice( + batches[[last_chunk_index + 1L]], + last_chunk_offset, + last_chunk_length + ) + + seq_mid <- seq_len(last_chunk_index - first_chunk_index - 1) + batch_mid <- batches[first_chunk_index + seq_mid] + + basic_array_stream( + c( + list(batch1), + batch_mid, + list(batchn) + ), + schema = x_schema, + validate = FALSE + ) +} + + +# Utilities for vctr methods + +vctr_resolve_chunk <- function(x, offsets) { + .Call(nanoarrow_c_vctr_chunk_resolve, x, offsets) +} + +vctr_as_slice <- function(x) { + .Call(nanoarrow_c_vctr_as_slice, x) +} + +vctr_array_slice <- function(x, offset, length) { + new_offset <- x$offset + offset + new_length <- length + nanoarrow_array_modify( + x, + list(offset = new_offset, length = new_length), + validate = FALSE + ) +} From 8e2b36b21f1c929adb471010242812d3b073fafa Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 May 2024 16:36:36 -0300 Subject: [PATCH 02/20] port tests --- r/NAMESPACE | 9 ++ r/R/vctr.R | 10 +- r/man/as_nanoarrow_vctr.Rd | 27 ++++++ r/man/nanoarrow-package.Rd | 1 + r/src/init.c | 6 ++ r/src/vctr.c | 131 ++++++++++++++++++++++++++ r/tests/testthat/test-vctr.R | 173 +++++++++++++++++++++++++++++++++++ 7 files changed, 352 insertions(+), 5 deletions(-) create mode 100644 r/man/as_nanoarrow_vctr.Rd create mode 100644 r/src/vctr.c create mode 100644 r/tests/testthat/test-vctr.R diff --git a/r/NAMESPACE b/r/NAMESPACE index d868f0b2c..0d50f39c3 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -6,12 +6,16 @@ S3method("$",nanoarrow_buffer) S3method("$",nanoarrow_schema) S3method("$<-",nanoarrow_array) S3method("$<-",nanoarrow_schema) +S3method("[",nanoarrow_vctr) +S3method("[<-",nanoarrow_vctr) S3method("[[",nanoarrow_array) S3method("[[",nanoarrow_array_stream) S3method("[[",nanoarrow_buffer) S3method("[[",nanoarrow_schema) S3method("[[<-",nanoarrow_array) S3method("[[<-",nanoarrow_schema) +S3method("[[<-",nanoarrow_vctr) +S3method(as.character,nanoarrow_vctr) S3method(as.data.frame,nanoarrow_array) S3method(as.data.frame,nanoarrow_array_stream) S3method(as.raw,nanoarrow_buffer) @@ -48,12 +52,14 @@ S3method(as_nanoarrow_array_stream,data.frame) S3method(as_nanoarrow_array_stream,default) S3method(as_nanoarrow_array_stream,nanoarrow_array) S3method(as_nanoarrow_array_stream,nanoarrow_array_stream) +S3method(as_nanoarrow_array_stream,nanoarrow_vctr) S3method(as_nanoarrow_buffer,default) S3method(as_nanoarrow_buffer,nanoarrow_buffer) S3method(as_nanoarrow_schema,DataType) S3method(as_nanoarrow_schema,Field) S3method(as_nanoarrow_schema,Schema) S3method(as_nanoarrow_schema,nanoarrow_schema) +S3method(as_nanoarrow_schema,nanoarrow_vctr) S3method(convert_array,default) S3method(convert_array,double) S3method(convert_array,factor) @@ -64,6 +70,7 @@ S3method(format,nanoarrow_array) S3method(format,nanoarrow_array_stream) S3method(format,nanoarrow_buffer) S3method(format,nanoarrow_schema) +S3method(format,nanoarrow_vctr) S3method(infer_nanoarrow_ptype_extension,default) S3method(infer_nanoarrow_ptype_extension,nanoarrow_extension_spec_vctrs) S3method(infer_nanoarrow_schema,Array) @@ -93,6 +100,7 @@ S3method(infer_nanoarrow_schema,list) S3method(infer_nanoarrow_schema,logical) S3method(infer_nanoarrow_schema,nanoarrow_array) S3method(infer_nanoarrow_schema,nanoarrow_array_stream) +S3method(infer_nanoarrow_schema,nanoarrow_vctr) S3method(infer_nanoarrow_schema,raw) S3method(infer_nanoarrow_schema,vctrs_list_of) S3method(infer_nanoarrow_schema,vctrs_unspecified) @@ -121,6 +129,7 @@ export(as_nanoarrow_array_extension) export(as_nanoarrow_array_stream) export(as_nanoarrow_buffer) export(as_nanoarrow_schema) +export(as_nanoarrow_vctr) export(basic_array_stream) export(collect_array_stream) export(convert_array) diff --git a/r/R/vctr.R b/r/R/vctr.R index 05a031b9d..186947074 100644 --- a/r/R/vctr.R +++ b/r/R/vctr.R @@ -26,7 +26,8 @@ #' @export #' #' @examples -#' as_nanoarrow_vctr(1:5) +#' array <- as_nanoarrow_array(1:5) +#' as_nanoarrow_vctr(array) #' as_nanoarrow_vctr <- function(x, ..., schema = NULL) { if (inherits(x, "nanoarrow_vctr") && is.null(schema)) { @@ -81,11 +82,12 @@ new_nanoarrow_vctr <- function(chunks, schema, indices = NULL) { #' @export format.nanoarrow_vctr <- function(x, ...) { # Technically we can do better here - format(convert_array_stream(x), ...) + stream <- as_nanoarrow_array_stream(x) + format(convert_array_stream(stream), ...) } # Because RStudio's viewer uses this, we want to use the potentially abbreviated -# WKT from the format method +# format string. #' @export as.character.nanoarrow_vctr <- function(x, ...) { format(x, ...) @@ -98,7 +100,6 @@ infer_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) { # Because zero-length vctrs are R's way of communicating "type", implement # as_nanoarrow_schema() here so that it works in places that expect a type -#' @importFrom nanoarrow as_nanoarrow_schema #' @export as_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) { attr(x, "schema", exact = TRUE) @@ -109,7 +110,6 @@ as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { as_nanoarrow_array_stream.nanoarrow_vctr(x, ..., schema = schema) } -#' @importFrom nanoarrow as_nanoarrow_array_stream #' @export as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { if (!is.null(schema)) { diff --git a/r/man/as_nanoarrow_vctr.Rd b/r/man/as_nanoarrow_vctr.Rd new file mode 100644 index 000000000..8c6d8070a --- /dev/null +++ b/r/man/as_nanoarrow_vctr.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vctr.R +\name{as_nanoarrow_vctr} +\alias{as_nanoarrow_vctr} +\title{Experimental Arrow encoded arrays as R vectors} +\usage{ +as_nanoarrow_vctr(x, ..., schema = NULL) +} +\arguments{ +\item{x}{An object that works with \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}. Most +spatial objects in R already work with this method.} + +\item{...}{Passed to \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}} + +\item{schema}{An optional \code{schema}} +} +\value{ +A vctr of class 'nanoarrow_vctr' +} +\description{ +Experimental Arrow encoded arrays as R vectors +} +\examples{ +array <- as_nanoarrow_array(1:5) +as_nanoarrow_vctr(array) + +} diff --git a/r/man/nanoarrow-package.Rd b/r/man/nanoarrow-package.Rd index 5a2fc6ec5..106c080a0 100644 --- a/r/man/nanoarrow-package.Rd +++ b/r/man/nanoarrow-package.Rd @@ -11,6 +11,7 @@ Provides an 'R' interface to the 'nanoarrow' 'C' library and the 'Apache Arrow' \seealso{ Useful links: \itemize{ + \item \url{https://arrow.apache.org/nanoarrow/latest/r/} \item \url{https://github.com/apache/arrow-nanoarrow} \item Report bugs at \url{https://github.com/apache/arrow-nanoarrow/issues} } diff --git a/r/src/init.c b/r/src/init.c index 913ea77af..69c943911 100644 --- a/r/src/init.c +++ b/r/src/init.c @@ -91,6 +91,9 @@ extern SEXP nanoarrow_c_schema_set_dictionary(SEXP schema_mut_xptr, SEXP diction extern SEXP nanoarrow_c_preserved_count(void); extern SEXP nanoarrow_c_preserved_empty(void); extern SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj); +extern SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list); +extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp); +extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp); extern SEXP nanoarrow_c_version(void); extern SEXP nanoarrow_c_version_runtime(void); @@ -165,6 +168,9 @@ static const R_CallMethodDef CallEntries[] = { {"nanoarrow_c_preserved_empty", (DL_FUNC)&nanoarrow_c_preserved_empty, 0}, {"nanoarrow_c_preserve_and_release_on_other_thread", (DL_FUNC)&nanoarrow_c_preserve_and_release_on_other_thread, 1}, + {"nanoarrow_c_vctr_chunk_offsets", (DL_FUNC)&nanoarrow_c_vctr_chunk_offsets, 1}, + {"nanoarrow_c_vctr_chunk_resolve", (DL_FUNC)&nanoarrow_c_vctr_chunk_resolve, 2}, + {"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1}, {"nanoarrow_c_version", (DL_FUNC)&nanoarrow_c_version, 0}, {"nanoarrow_c_version_runtime", (DL_FUNC)&nanoarrow_c_version_runtime, 0}, {NULL, NULL, 0}}; diff --git a/r/src/vctr.c b/r/src/vctr.c new file mode 100644 index 000000000..73d8db514 --- /dev/null +++ b/r/src/vctr.c @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#define R_NO_REMAP +#include +#include + +#include "nanoarrow.h" + +SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list) { + int num_chunks = Rf_length(array_list); + SEXP offsets_sexp = PROTECT(Rf_allocVector(INTSXP, num_chunks + 1)); + int* offsets = INTEGER(offsets_sexp); + offsets[0] = 0; + int64_t cumulative_offset = 0; + + struct ArrowArray* array; + for (int i = 0; i < num_chunks; i++) { + array = (struct ArrowArray*)R_ExternalPtrAddr(VECTOR_ELT(array_list, i)); + cumulative_offset += array->length; + if (cumulative_offset > INT_MAX) { + Rf_error("Can't build nanoarrow_vctr with length > INT_MAX"); // # nocov + } + + offsets[i + 1] = cumulative_offset; + } + + UNPROTECT(1); + return offsets_sexp; +} + +static int resolve_chunk(int* sorted_offsets, int index, int start_offset_i, + int end_offset_i) { + if (start_offset_i >= (end_offset_i - 1)) { + return start_offset_i; + } + + int mid_offset_i = start_offset_i + (end_offset_i - start_offset_i) / 2; + int mid_index = sorted_offsets[mid_offset_i]; + if (index < mid_index) { + return resolve_chunk(sorted_offsets, index, start_offset_i, mid_offset_i); + } else { + return resolve_chunk(sorted_offsets, index, mid_offset_i, end_offset_i); + } +} + +SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp) { + int* offsets = INTEGER(offsets_sexp); + int end_offset_i = Rf_length(offsets_sexp) - 1; + int last_offset = offsets[end_offset_i]; + + int n = Rf_length(indices_sexp); + SEXP chunk_indices_sexp = PROTECT(Rf_allocVector(INTSXP, n)); + int* chunk_indices = INTEGER(chunk_indices_sexp); + + int buf[1024]; + for (int i = 0; i < n; i++) { + if (i % 1024 == 0) { + INTEGER_GET_REGION(indices_sexp, i, 1024, buf); + } + int index0 = buf[i % 1024]; + + if (index0 < 0 || index0 > last_offset) { + chunk_indices[i] = NA_INTEGER; + } else { + chunk_indices[i] = resolve_chunk(offsets, index0, 0, end_offset_i); + } + } + + UNPROTECT(1); + return chunk_indices_sexp; +} + +SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp) { + if (TYPEOF(indices_sexp) != INTSXP) { + return R_NilValue; + } + SEXP slice_sexp = PROTECT(Rf_allocVector(INTSXP, 2)); + int* slice = INTEGER(slice_sexp); + + int n = Rf_length(indices_sexp); + slice[1] = n; + + if (n == 1) { + slice[0] = INTEGER_ELT(indices_sexp, 0); + UNPROTECT(1); + return slice_sexp; + } else if (n == 0) { + slice[0] = NA_INTEGER; + UNPROTECT(1); + return slice_sexp; + } + + int buf[1024]; + INTEGER_GET_REGION(indices_sexp, 0, 1024, buf); + slice[0] = buf[0]; + + int last_value = buf[0]; + int this_value = 0; + + for (int i = 1; i < n; i++) { + if (i % 1024 == 0) { + INTEGER_GET_REGION(indices_sexp, i, 1024, buf); + } + + this_value = buf[i % 1024]; + if ((this_value - last_value) != 1) { + UNPROTECT(1); + return R_NilValue; + } + + last_value = this_value; + } + + UNPROTECT(1); + return slice_sexp; +} diff --git a/r/tests/testthat/test-vctr.R b/r/tests/testthat/test-vctr.R new file mode 100644 index 000000000..62a94df14 --- /dev/null +++ b/r/tests/testthat/test-vctr.R @@ -0,0 +1,173 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("as_nanoarrow_vctr() works for basic input", { + array <- as_nanoarrow_array(c("one", "two")) + vctr <- as_nanoarrow_vctr(array) + expect_identical(as.integer(unclass(vctr)), 1:2) + expect_identical(as_nanoarrow_vctr(vctr), vctr) + + expect_identical(infer_nanoarrow_schema(vctr)$format, "u") + expect_identical(as_nanoarrow_schema(vctr)$format, "u") +}) + +test_that("format() works for nanoarrow_vctr", { + array <- as_nanoarrow_array(c("one", "two")) + vctr <- as_nanoarrow_vctr(array) + expect_identical(format(vctr),format(c("one", "two"))) +}) + +test_that("nanoarrow_vctr to stream generates an empty stream for empty slice", { + vctr <- new_nanoarrow_vctr(list(), na_string()) + stream <- as_nanoarrow_array_stream(vctr) + schema_out <- stream$get_schema() + expect_identical(schema_out$format, "u") + expect_identical(collect_array_stream(stream), list()) +}) + +test_that("nanoarrow_vctr to stream generates identical stream for identity slice", { + array <- as_nanoarrow_array("one") + vctr <- new_nanoarrow_vctr(list(array), infer_nanoarrow_schema(array)) + + stream <- as_nanoarrow_array_stream(vctr) + schema_out <- stream$get_schema() + expect_identical(schema_out$format, "u") + + collected <- collect_array_stream(stream) + expect_length(collected, 1) + expect_identical( + convert_buffer(array$buffers[[3]]), + "one" + ) +}) + +test_that("nanoarrow_vctr to stream works for arbitrary slices", { + array1 <- as_nanoarrow_array(c("one", "two", "three")) + array2 <- as_nanoarrow_array(c("four", "five", "six", "seven")) + vctr <- new_nanoarrow_vctr(list(array1, array2), infer_nanoarrow_schema(array1)) + + chunks16 <- collect_array_stream( + as_nanoarrow_array_stream(vctr[1:6]) + ) + expect_length(chunks16, 2) + expect_identical(chunks16[[1]]$offset, 0L) + expect_identical(chunks16[[1]]$length, 3L) + expect_identical(chunks16[[2]]$offset, 0L) + expect_identical(chunks16[[2]]$length, 3L) + + chunks34 <- collect_array_stream( + as_nanoarrow_array_stream(vctr[3:4]) + ) + expect_length(chunks34, 2) + expect_identical(chunks34[[1]]$offset, 2L) + expect_identical(chunks34[[1]]$length, 1L) + expect_identical(chunks34[[2]]$offset, 0L) + expect_identical(chunks34[[2]]$length, 1L) + + chunks13 <- collect_array_stream( + as_nanoarrow_array_stream(vctr[1:3]) + ) + expect_length(chunks13, 1) + expect_identical(chunks13[[1]]$offset, 0L) + expect_identical(chunks13[[1]]$length, 3L) + + chunks46 <- collect_array_stream( + as_nanoarrow_array_stream(vctr[4:6]) + ) + expect_length(chunks46, 1) + expect_identical(chunks46[[1]]$offset, 0L) + expect_identical(chunks46[[1]]$length, 3L) + + chunks56 <- collect_array_stream( + as_nanoarrow_array_stream(vctr[5:6]) + ) + expect_length(chunks56, 1) + expect_identical(chunks56[[1]]$offset, 1L) + expect_identical(chunks56[[1]]$length, 2L) + + chunks57 <- collect_array_stream( + as_nanoarrow_array_stream(vctr[5:7]) + ) + expect_length(chunks57, 1) + expect_identical(chunks57[[1]]$offset, 1L) + expect_identical(chunks57[[1]]$length, 3L) +}) + +test_that("Errors occur for unsupported subset operations", { + array <- as_nanoarrow_array("one") + vctr <- as_nanoarrow_vctr(array) + expect_error( + vctr[5:1], + "Can't subset nanoarrow_vctr with non-slice" + ) + + expect_error( + vctr[1] <- "something", + "subset assignment for nanoarrow_vctr is not supported" + ) + + expect_error( + vctr[[1]] <- "something", + "subset assignment for nanoarrow_vctr is not supported" + ) +}) + +test_that("slice detector works", { + expect_identical( + vctr_as_slice(logical()), + NULL + ) + + expect_identical( + vctr_as_slice(2:1), + NULL + ) + + expect_identical( + vctr_as_slice(integer()), + c(NA_integer_, 0L) + ) + + expect_identical( + vctr_as_slice(2L), + c(2L, 1L) + ) + + expect_identical( + vctr_as_slice(1:10), + c(1L, 10L) + ) + + expect_identical( + vctr_as_slice(10:2048), + c(10L, (2048L - 10L + 1L)) + ) +}) + +test_that("chunk resolver works", { + chunk_offset1 <- 0:10 + + expect_identical( + vctr_resolve_chunk(c(-1L, 11L), chunk_offset1), + c(NA_integer_, NA_integer_) + ) + + expect_identical( + vctr_resolve_chunk(9:0, chunk_offset1), + 9:0 + ) +}) From f34ab9792ed2ce2bdc5b5df9c70ac64cae791e93 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 May 2024 16:42:28 -0300 Subject: [PATCH 03/20] add 32-bit chunk resolver --- src/nanoarrow/buffer_inline.h | 21 +++++++++++++++++++++ src/nanoarrow/utils_test.cc | 12 ++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/nanoarrow/buffer_inline.h b/src/nanoarrow/buffer_inline.h index 54a00a92f..e68de5873 100644 --- a/src/nanoarrow/buffer_inline.h +++ b/src/nanoarrow/buffer_inline.h @@ -50,6 +50,27 @@ static inline int64_t ArrowResolveChunk64(int64_t index, const int64_t* offsets, return lo; } +static inline int64_t ArrowResolveChunk32(int32_t index, const int32_t* offsets, + int32_t lo, int32_t hi) { + // Similar to std::upper_bound(), but slightly different as our offsets + // array always starts with 0. + int32_t n = hi - lo; + // First iteration does not need to check for n > 1 + // (lo < hi is guaranteed by the precondition). + NANOARROW_DCHECK(n > 1); + do { + const int32_t m = n >> 1; + const int32_t mid = lo + m; + if (index >= offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } while (n > 1); + return lo; +} + static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { int64_t doubled_capacity = current_capacity * 2; if (doubled_capacity > new_capacity) { diff --git a/src/nanoarrow/utils_test.cc b/src/nanoarrow/utils_test.cc index d73d18a38..e2eef993e 100644 --- a/src/nanoarrow/utils_test.cc +++ b/src/nanoarrow/utils_test.cc @@ -555,6 +555,18 @@ TEST(UtilsTest, ArrowResolveChunk64Test) { EXPECT_EQ(ArrowResolveChunk64(5, offsets, 0, n_offsets), 2); } +TEST(UtilsTest, ArrowResolveChunk32Test) { + int32_t offsets[] = {0, 2, 3, 6}; + int32_t n_offsets = 4; + + EXPECT_EQ(ArrowResolveChunk32(0, offsets, 0, n_offsets), 0); + EXPECT_EQ(ArrowResolveChunk32(1, offsets, 0, n_offsets), 0); + EXPECT_EQ(ArrowResolveChunk32(2, offsets, 0, n_offsets), 1); + EXPECT_EQ(ArrowResolveChunk32(3, offsets, 0, n_offsets), 2); + EXPECT_EQ(ArrowResolveChunk32(4, offsets, 0, n_offsets), 2); + EXPECT_EQ(ArrowResolveChunk32(5, offsets, 0, n_offsets), 2); +} + TEST(MaybeTest, ConstructionAndConversion) { using nanoarrow::NA; using nanoarrow::internal::Maybe; From 09d66bb5ce18c5469919e146fc24c946e926cb59 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 May 2024 16:48:29 -0300 Subject: [PATCH 04/20] user vctr converter --- r/src/vctr.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/r/src/vctr.c b/r/src/vctr.c index 73d8db514..33caa3266 100644 --- a/r/src/vctr.c +++ b/r/src/vctr.c @@ -43,25 +43,10 @@ SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list) { return offsets_sexp; } -static int resolve_chunk(int* sorted_offsets, int index, int start_offset_i, - int end_offset_i) { - if (start_offset_i >= (end_offset_i - 1)) { - return start_offset_i; - } - - int mid_offset_i = start_offset_i + (end_offset_i - start_offset_i) / 2; - int mid_index = sorted_offsets[mid_offset_i]; - if (index < mid_index) { - return resolve_chunk(sorted_offsets, index, start_offset_i, mid_offset_i); - } else { - return resolve_chunk(sorted_offsets, index, mid_offset_i, end_offset_i); - } -} - SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp) { int* offsets = INTEGER(offsets_sexp); - int end_offset_i = Rf_length(offsets_sexp) - 1; - int last_offset = offsets[end_offset_i]; + int n_offsets = Rf_length(offsets_sexp); + int last_offset = offsets[n_offsets - 1]; int n = Rf_length(indices_sexp); SEXP chunk_indices_sexp = PROTECT(Rf_allocVector(INTSXP, n)); @@ -77,7 +62,7 @@ SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp) { if (index0 < 0 || index0 > last_offset) { chunk_indices[i] = NA_INTEGER; } else { - chunk_indices[i] = resolve_chunk(offsets, index0, 0, end_offset_i); + chunk_indices[i] = ArrowResolveChunk32(index0, offsets, 0, n_offsets); } } From 0a31f2da1b18aefc8f6f836c521fc525d0be99e3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 May 2024 17:26:35 -0300 Subject: [PATCH 05/20] remove wk --- r/NAMESPACE | 6 ++++ r/R/convert-array.R | 6 ++++ r/R/vctr.R | 68 +++++++++++++++++++++++++++++++++++- r/man/as_nanoarrow_vctr.Rd | 3 ++ r/tests/testthat/test-vctr.R | 36 +++++++++++++++++++ 5 files changed, 118 insertions(+), 1 deletion(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 0d50f39c3..aacb4fcd0 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -18,6 +18,7 @@ S3method("[[<-",nanoarrow_vctr) S3method(as.character,nanoarrow_vctr) S3method(as.data.frame,nanoarrow_array) S3method(as.data.frame,nanoarrow_array_stream) +S3method(as.data.frame,nanoarrow_vctr) S3method(as.raw,nanoarrow_buffer) S3method(as.vector,nanoarrow_array) S3method(as.vector,nanoarrow_array_stream) @@ -60,9 +61,11 @@ S3method(as_nanoarrow_schema,Field) S3method(as_nanoarrow_schema,Schema) S3method(as_nanoarrow_schema,nanoarrow_schema) S3method(as_nanoarrow_schema,nanoarrow_vctr) +S3method(c,nanoarrow_vctr) S3method(convert_array,default) S3method(convert_array,double) S3method(convert_array,factor) +S3method(convert_array,nanoarrow_vctr) S3method(convert_array,vctrs_partial_frame) S3method(convert_array_extension,default) S3method(convert_array_extension,nanoarrow_extension_spec_vctrs) @@ -116,6 +119,7 @@ S3method(print,nanoarrow_array) S3method(print,nanoarrow_array_stream) S3method(print,nanoarrow_buffer) S3method(print,nanoarrow_schema) +S3method(print,nanoarrow_vctr) S3method(read_nanoarrow,character) S3method(read_nanoarrow,connection) S3method(read_nanoarrow,raw) @@ -123,6 +127,7 @@ S3method(str,nanoarrow_array) S3method(str,nanoarrow_array_stream) S3method(str,nanoarrow_buffer) S3method(str,nanoarrow_schema) +S3method(str,nanoarrow_vctr) export(array_stream_set_finalizer) export(as_nanoarrow_array) export(as_nanoarrow_array_extension) @@ -200,6 +205,7 @@ export(nanoarrow_pointer_release) export(nanoarrow_pointer_set_protected) export(nanoarrow_schema_modify) export(nanoarrow_schema_parse) +export(nanoarrow_vctr) export(nanoarrow_version) export(read_nanoarrow) export(register_nanoarrow_extension) diff --git a/r/R/convert-array.R b/r/R/convert-array.R index 6de82d175..1db8c625d 100644 --- a/r/R/convert-array.R +++ b/r/R/convert-array.R @@ -139,6 +139,12 @@ convert_fallback_other <- function(array, offset, length, to) { convert_array(array, to, .from_c = TRUE) } +#' @export +convert_array.nanoarrow_vctr <- function(array, to, ...) { + schema <- infer_nanoarrow_schema(to) + as_nanoarrow_vctr(stream, schema = schema) +} + #' @export convert_array.double <- function(array, to, ...) { # Handle conversion from decimal128 via arrow diff --git a/r/R/vctr.R b/r/R/vctr.R index 186947074..6f932f8ef 100644 --- a/r/R/vctr.R +++ b/r/R/vctr.R @@ -39,6 +39,16 @@ as_nanoarrow_vctr <- function(x, ..., schema = NULL) { new_nanoarrow_vctr(chunks, stream$get_schema()) } +#' @rdname as_nanoarrow_vctr +#' @export +nanoarrow_vctr <- function(schema = NULL) { + if (is.null(schema)) { + new_nanoarrow_vctr(list(), NULL) + } else { + new_nanoarrow_vctr(list(), as_nanoarrow_schema(schema)) + } +} + new_nanoarrow_vctr <- function(chunks, schema, indices = NULL) { offsets <- .Call(nanoarrow_c_vctr_chunk_offsets, chunks) if (is.null(indices)) { @@ -50,7 +60,7 @@ new_nanoarrow_vctr <- function(chunks, schema, indices = NULL) { schema = schema, chunks = chunks, offsets = offsets, - class = c("nanoarrow_vctr", "wk_vctr") + class = "nanoarrow_vctr" ) } @@ -197,6 +207,62 @@ as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { ) } +#' @export +c.nanoarrow_vctr <- function(...) { + stop("c() not implemented for nanoarrow_vctr()") +} + +# Ensures that nanoarrow_vctr can fit in a data.frame +#' @export +as.data.frame.nanoarrow_vctr <- function(x, ..., optional = FALSE) { + if (!optional) { + stop(sprintf("cannot coerce object of tyoe '%s' to data.frame", class(x)[1])) + } else { + new_data_frame(list(x)) + } +} + +#' @export +print.nanoarrow_vctr <- function(x, ...) { + schema <- attr(x, "schema", exact = TRUE) + if (is.null(schema)) { + cat("\n") + return(invisible(x)) + } + + formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE) + cat(sprintf("\n", formatted, length(x))) + + n_values <- min(length(x), 20) + more_values <- length(x) - n_values + stream <- as_nanoarrow_array_stream(utils::head(x, n_values)) + converted_head <- convert_array_stream(stream) + + print(converted_head) + if (more_values > 0) { + cat(sprintf("...and %d more values\n", more_values)) + } + + invisible(x) +} + +#' @export +str.nanoarrow_vctr <- function(object, ...) { + schema <- attr(object, "schema", exact = TRUE) + if (is.null(schema)) { + cat("\n") + return(invisible(object)) + } + + formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE) + cat(sprintf("\n", formatted, length(object))) + + for (chunk in attr(object, "chunks")) { + str(chunk, ...) + } + + invisible(object) +} # Utilities for vctr methods diff --git a/r/man/as_nanoarrow_vctr.Rd b/r/man/as_nanoarrow_vctr.Rd index 8c6d8070a..4b7700279 100644 --- a/r/man/as_nanoarrow_vctr.Rd +++ b/r/man/as_nanoarrow_vctr.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/vctr.R \name{as_nanoarrow_vctr} \alias{as_nanoarrow_vctr} +\alias{nanoarrow_vctr} \title{Experimental Arrow encoded arrays as R vectors} \usage{ as_nanoarrow_vctr(x, ..., schema = NULL) + +nanoarrow_vctr(schema = NULL) } \arguments{ \item{x}{An object that works with \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}. Most diff --git a/r/tests/testthat/test-vctr.R b/r/tests/testthat/test-vctr.R index 62a94df14..743483872 100644 --- a/r/tests/testthat/test-vctr.R +++ b/r/tests/testthat/test-vctr.R @@ -25,6 +25,42 @@ test_that("as_nanoarrow_vctr() works for basic input", { expect_identical(as_nanoarrow_schema(vctr)$format, "u") }) +test_that("nanorrow_vctr() creates an empty sentinel", { + vctr <- nanoarrow_vctr() + expect_identical( + expect_output( + print(vctr), + "" + ), + vctr + ) + + expect_identical( + expect_output( + str(vctr), + "" + ), + vctr + ) + + vctr <- nanoarrow_vctr(na_int32()) + expect_identical( + expect_output( + print(vctr), + "^ Date: Thu, 9 May 2024 17:27:29 -0300 Subject: [PATCH 06/20] fix --- r/R/convert-array.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/convert-array.R b/r/R/convert-array.R index 1db8c625d..d1a706962 100644 --- a/r/R/convert-array.R +++ b/r/R/convert-array.R @@ -142,7 +142,7 @@ convert_fallback_other <- function(array, offset, length, to) { #' @export convert_array.nanoarrow_vctr <- function(array, to, ...) { schema <- infer_nanoarrow_schema(to) - as_nanoarrow_vctr(stream, schema = schema) + as_nanoarrow_vctr(array, schema = schema) } #' @export From 1bde33361b76d840dac0c068ac4caddf9880e301 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 May 2024 21:32:59 -0300 Subject: [PATCH 07/20] fix format --- r/R/vctr.R | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/r/R/vctr.R b/r/R/vctr.R index 6f932f8ef..787dd3935 100644 --- a/r/R/vctr.R +++ b/r/R/vctr.R @@ -91,9 +91,27 @@ new_nanoarrow_vctr <- function(chunks, schema, indices = NULL) { #' @export format.nanoarrow_vctr <- function(x, ...) { - # Technically we can do better here + if (length(x) == 0) { + return(character()) + } + stream <- as_nanoarrow_array_stream(x) - format(convert_array_stream(stream), ...) + converted <- convert_array_stream(stream) + + # This needs to be a character() with the same length as x to work with + # RStudio's viewer. Data frames need special handling in this case. + size_stable_format(converted) +} + +size_stable_format <- function(x, ...) { + if (inherits(x, "data.frame")) { + cols <- lapply(x, size_stable_format, ...) + cols <- Map(paste, names(x), cols, sep = ": ") + rows <- do.call(paste, c(cols, list(sep = ", "))) + paste0("{", rows, "}") + } else { + format(x, ...) + } } # Because RStudio's viewer uses this, we want to use the potentially abbreviated @@ -218,7 +236,7 @@ as.data.frame.nanoarrow_vctr <- function(x, ..., optional = FALSE) { if (!optional) { stop(sprintf("cannot coerce object of tyoe '%s' to data.frame", class(x)[1])) } else { - new_data_frame(list(x)) + new_data_frame(list(x), nrow = length(x)) } } From b977f722c7221dfc8b11e2dd6e6de9c93b86d567 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 May 2024 22:52:23 -0300 Subject: [PATCH 08/20] test coverage --- r/NAMESPACE | 1 - r/R/vctr.R | 32 ++++++++++-------- r/tests/testthat/test-vctr.R | 64 ++++++++++++++++++++++++++---------- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index aacb4fcd0..aa6e71a08 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -15,7 +15,6 @@ S3method("[[",nanoarrow_schema) S3method("[[<-",nanoarrow_array) S3method("[[<-",nanoarrow_schema) S3method("[[<-",nanoarrow_vctr) -S3method(as.character,nanoarrow_vctr) S3method(as.data.frame,nanoarrow_array) S3method(as.data.frame,nanoarrow_array_stream) S3method(as.data.frame,nanoarrow_vctr) diff --git a/r/R/vctr.R b/r/R/vctr.R index 787dd3935..215c826e0 100644 --- a/r/R/vctr.R +++ b/r/R/vctr.R @@ -114,13 +114,6 @@ size_stable_format <- function(x, ...) { } } -# Because RStudio's viewer uses this, we want to use the potentially abbreviated -# format string. -#' @export -as.character.nanoarrow_vctr <- function(x, ...) { - format(x, ...) -} - #' @export infer_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) { attr(x, "schema", exact = TRUE) @@ -141,10 +134,14 @@ as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { #' @export as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { if (!is.null(schema)) { + # If a schema is passed, first resolve the stream as is and then use + # as_nanoarrow_array_stream() to either cast (when this is supported) + # or error. stream <- as_nanoarrow_array_stream(x, schema = NULL) return(as_nanoarrow_array_stream(stream, schema = schema)) } + # Resolve the indices as c(1-based start, length) slice <- vctr_as_slice(x) if (is.null(slice)) { stop("Can't resolve non-slice nanoarrow_vctr to nanoarrow_array_stream") @@ -227,6 +224,13 @@ as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { #' @export c.nanoarrow_vctr <- function(...) { + dots <- list(...) + + # This one we can do safely + if (length(dots) == 1) { + return(dots[[1]]) + } + stop("c() not implemented for nanoarrow_vctr()") } @@ -244,7 +248,7 @@ as.data.frame.nanoarrow_vctr <- function(x, ..., optional = FALSE) { print.nanoarrow_vctr <- function(x, ...) { schema <- attr(x, "schema", exact = TRUE) if (is.null(schema)) { - cat("\n") + cat(">\n") return(invisible(x)) } @@ -257,8 +261,10 @@ print.nanoarrow_vctr <- function(x, ...) { converted_head <- convert_array_stream(stream) print(converted_head) - if (more_values > 0) { + if (more_values >= 2) { cat(sprintf("...and %d more values\n", more_values)) + } else if (more_values >= 1) { + cat(sprintf("...and %d more value\n", more_values)) } invisible(x) @@ -268,16 +274,16 @@ print.nanoarrow_vctr <- function(x, ...) { str.nanoarrow_vctr <- function(object, ...) { schema <- attr(object, "schema", exact = TRUE) if (is.null(schema)) { - cat("\n") + cat(">\n") return(invisible(object)) } formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE) cat(sprintf("\n", formatted, length(object))) - for (chunk in attr(object, "chunks")) { - str(chunk, ...) - } + # Prints out the C data interface dump of each chunk with the chunk + # index above. + str(attr(object, "chunks")) invisible(object) } diff --git a/r/tests/testthat/test-vctr.R b/r/tests/testthat/test-vctr.R index 743483872..95bbd14ff 100644 --- a/r/tests/testthat/test-vctr.R +++ b/r/tests/testthat/test-vctr.R @@ -25,46 +25,74 @@ test_that("as_nanoarrow_vctr() works for basic input", { expect_identical(as_nanoarrow_schema(vctr)$format, "u") }) -test_that("nanorrow_vctr() creates an empty sentinel", { +test_that("print() and str() work on empty nanoarrow_vctr", { vctr <- nanoarrow_vctr() expect_identical( - expect_output( - print(vctr), - "" - ), + expect_output(print(vctr),">"), vctr ) expect_identical( - expect_output( - str(vctr), - "" - ), + expect_output(str(vctr), ">"), vctr ) vctr <- nanoarrow_vctr(na_int32()) expect_identical( - expect_output( - print(vctr), - "^ Date: Thu, 9 May 2024 23:09:28 -0300 Subject: [PATCH 09/20] test convert_array() to nanoarrow_vctr --- r/R/convert-array.R | 2 +- r/tests/testthat/test-convert-array.R | 35 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/r/R/convert-array.R b/r/R/convert-array.R index d1a706962..5ae65443e 100644 --- a/r/R/convert-array.R +++ b/r/R/convert-array.R @@ -141,7 +141,7 @@ convert_fallback_other <- function(array, offset, length, to) { #' @export convert_array.nanoarrow_vctr <- function(array, to, ...) { - schema <- infer_nanoarrow_schema(to) + schema <- attr(to, "schema", exact = TRUE) as_nanoarrow_vctr(array, schema = schema) } diff --git a/r/tests/testthat/test-convert-array.R b/r/tests/testthat/test-convert-array.R index 5731ce108..00bb0e85c 100644 --- a/r/tests/testthat/test-convert-array.R +++ b/r/tests/testthat/test-convert-array.R @@ -160,6 +160,41 @@ test_that("convert to vector works for tibble", { ) }) +test_that("convert to vector works for nanoarrow_vctr()", { + array <- as_nanoarrow_array(c("one", "two", "three")) + + # Check implicit/inferred nanoarrow_vctr() schema + vctr <- convert_array(array, nanoarrow_vctr()) + expect_s3_class(vctr, "nanoarrow_vctr") + expect_length(vctr, 3) + schema <- infer_nanoarrow_schema(vctr) + expect_identical(schema$format, "u") + + # Check with explicit schema of the correct type + vctr <- convert_array(array, nanoarrow_vctr(na_string())) + expect_s3_class(vctr, "nanoarrow_vctr") + expect_length(vctr, 3) + schema <- infer_nanoarrow_schema(vctr) + expect_identical(schema$format, "u") + + # Check nested conversion from a data.frame + df <- data.frame(x = c("one", "two", "three")) + array <- as_nanoarrow_array(df) + + vctr <- convert_array(array, nanoarrow_vctr()) + expect_s3_class(vctr, "nanoarrow_vctr") + expect_length(vctr, 3) + schema <- infer_nanoarrow_schema(vctr) + expect_identical(schema$format, "+s") + + + vctr <- convert_array(array, nanoarrow_vctr(na_struct(list(x = na_string())))) + expect_s3_class(vctr, "nanoarrow_vctr") + expect_length(vctr, 3) + schema <- infer_nanoarrow_schema(vctr) + expect_identical(schema$format, "+s") +}) + test_that("convert to vector works for struct-style vectors", { array <- as_nanoarrow_array(as.POSIXlt("2021-01-01", tz = "America/Halifax")) expect_identical( From 575726408cadd2e73456a5dca71b4c0f50a34952 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 00:11:10 -0300 Subject: [PATCH 10/20] give a go --- r/src/materialize.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/r/src/materialize.c b/r/src/materialize.c index c2bca993b..5e261df60 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -19,8 +19,9 @@ #include #include +#include "array.h" #include "nanoarrow.h" - +#include "nanoarrow/r.h" #include "util.h" // Needed for the list_of materializer @@ -270,6 +271,33 @@ static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) { } } +static int nanoarrow_materialize_nanoarrow_vctr(struct RConverter* converter, + SEXP converter_xptr) { + // This is a case where the callee needs ownership, which we can do via a + // shallow copy. + SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + + // TODO: Check that this SEXP has a lifecycle that is going to work with this + SEXP array_xptr = VECTOR_ELT(converter_shelter, 2); + + SEXP array_out_xptr = PROTECT(nanoarrow_array_owning_xptr()); + struct ArrowArray* out_array = nanoarrow_output_array_from_xptr(array_xptr); + array_export(array_xptr, out_array); + + // Append the chunk to the pairlist + SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); + SEXP chunks_tail = PROTECT(Rf_getAttrib(converter->dst.vec_sexp, chunks_tail_sym)); + + SEXP next_sexp = PROTECT(Rf_cons(array_out_xptr, R_NilValue)); + SETCDR(chunks_tail, next_sexp); + UNPROTECT(1); + + Rf_setAttrib(converter->dst.vec_sexp, chunks_tail_sym, next_sexp); + UNPROTECT(3); + + return NANOARROW_OK; +} + static int nanoarrow_materialize_other(struct RConverter* converter, SEXP converter_xptr) { // Ensure that we have a ptype SEXP to send in the call back to R From ce5943812831e722f94f33a7d22a057cb18d5142 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 10:59:04 -0300 Subject: [PATCH 11/20] ensure proper ownership in conversion --- r/src/convert_array_stream.c | 81 ++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/r/src/convert_array_stream.c b/r/src/convert_array_stream.c index b5315f388..4c5a16f52 100644 --- a/r/src/convert_array_stream.c +++ b/r/src/convert_array_stream.c @@ -26,6 +26,46 @@ #include "convert.h" #include "schema.h" +static int convert_next(SEXP converter_xptr, struct ArrowArrayStream* stream, + SEXP schema_xptr, int64_t* n_batches) { + SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr()); + struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr); + + // Fetch the next array + int result = ArrowArrayStreamGetNext(stream, array, NULL); + if (result != NANOARROW_OK) { + Rf_error("ArrowArrayStream::get_next(): %s", ArrowArrayStreamGetLastError(stream)); + } + + // Check if the stream is finished + if (array->release == NULL) { + UNPROTECT(1); + return 0; + } + + // Bump the batch counter + (*n_batches)++; + + // Set the schema of the allocated array and pass it to the converter + R_SetExternalPtrTag(array_xptr, schema_xptr); + if (nanoarrow_converter_set_array(converter_xptr, array_xptr) != NANOARROW_OK) { + nanoarrow_converter_stop(converter_xptr); + } + + // After set_array, the converter is responsible for the array_xptr + UNPROTECT(1); + + // Materialize the array into the converter + int64_t n_materialized = + nanoarrow_converter_materialize_n(converter_xptr, array->length); + if (n_materialized != array->length) { + Rf_error("Expected to materialize %ld values in batch %ld but materialized %ld", + (long)array->length, (long)(*n_batches), (long)n_materialized); + } + + return 1; +} + SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp, SEXP size_sexp, SEXP n_sexp) { struct ArrowArrayStream* array_stream = @@ -58,49 +98,18 @@ SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp, nanoarrow_converter_stop(converter_xptr); } - SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr()); - struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr); - int64_t n_batches = 0; - int64_t n_materialized = 0; - if (n > 0) { - result = ArrowArrayStreamGetNext(array_stream, array, NULL); - n_batches++; - if (result != NANOARROW_OK) { - Rf_error("ArrowArrayStream::get_next(): %s", - ArrowArrayStreamGetLastError(array_stream)); + do { + if (n_batches >= n) { + break; } - - while (array->release != NULL) { - if (nanoarrow_converter_set_array(converter_xptr, array_xptr) != NANOARROW_OK) { - nanoarrow_converter_stop(converter_xptr); - } - - n_materialized = nanoarrow_converter_materialize_n(converter_xptr, array->length); - if (n_materialized != array->length) { - Rf_error("Expected to materialize %ld values in batch %ld but materialized %ld", - (long)array->length, (long)n_batches, (long)n_materialized); - } - - if (n_batches >= n) { - break; - } - - array->release(array); - result = ArrowArrayStreamGetNext(array_stream, array, NULL); - n_batches++; - if (result != NANOARROW_OK) { - Rf_error("ArrowArrayStream::get_next(): %s", - ArrowArrayStreamGetLastError(array_stream)); - } - } - } + } while (convert_next(converter_xptr, array_stream, schema_xptr, &n_batches)); if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } SEXP result_sexp = PROTECT(nanoarrow_converter_release_result(converter_xptr)); - UNPROTECT(4); + UNPROTECT(3); return result_sexp; } From f5cc835a2c4e56aa399d48fd5d8562687e119ec8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 11:26:59 -0300 Subject: [PATCH 12/20] maybe wire up conversion --- r/src/materialize.c | 67 +++++++++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/r/src/materialize.c b/r/src/materialize.c index 5e261df60..330ad88c3 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -109,6 +109,10 @@ int nanoarrow_ptype_is_data_frame(SEXP ptype) { (Rf_xlength(ptype) > 0 && has_attrib_safe(ptype, R_NamesSymbol))); } +int nanoarrow_ptype_is_nanoarrow_vctr(SEXP ptype) { + return Rf_inherits(ptype, "nanoarrow_vctr"); +} + SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) { SEXP result; @@ -123,7 +127,27 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) { } } - if (nanoarrow_ptype_is_data_frame(ptype)) { + if (nanoarrow_ptype_is_nanoarrow_vctr(ptype)) { + // The object we return here is one that will accumulate chunks and + // be finalized with a value (rather than being strictly copied into + // after every new chunk is seen). + result = PROTECT(Rf_allocVector(INTSXP, len)); + Rf_copyMostAttrib(ptype, result); + + // For the purposes of building the list of chunks, chunks is a pairlist + // (it will be converted to a regular list when this converter is finalized) + // Technically the first value here won't be used (this simplifies the + // appending). + SEXP chunks_list = PROTECT(Rf_list1(R_NilValue)); + + // To start, the chunks list and the end of the chunks list are the same node + SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); + SEXP chunks_sym = PROTECT(Rf_install("chunks")); + Rf_setAttrib(result, chunks_sym, chunks_list); + Rf_setAttrib(result, chunks_tail_sym, chunks_list); + + UNPROTECT(4); + } else if (nanoarrow_ptype_is_data_frame(ptype)) { R_xlen_t num_cols = Rf_xlength(ptype); result = PROTECT(Rf_allocVector(VECSXP, num_cols)); for (R_xlen_t i = 0; i < num_cols; i++) { @@ -208,6 +232,10 @@ static void fill_vec_with_nulls(SEXP x, R_xlen_t offset, R_xlen_t len) { } static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) { + if (nanoarrow_ptype_is_nanoarrow_vctr(dst)) { + Rf_error("Can't copy_vec_into() to nanoarrow_vctr"); + } + if (nanoarrow_ptype_is_data_frame(dst)) { if (!nanoarrow_ptype_is_data_frame(x)) { Rf_error("Expected record-style vctr result but got non-record-style result"); @@ -276,22 +304,26 @@ static int nanoarrow_materialize_nanoarrow_vctr(struct RConverter* converter, // This is a case where the callee needs ownership, which we can do via a // shallow copy. SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); - - // TODO: Check that this SEXP has a lifecycle that is going to work with this SEXP array_xptr = VECTOR_ELT(converter_shelter, 2); SEXP array_out_xptr = PROTECT(nanoarrow_array_owning_xptr()); struct ArrowArray* out_array = nanoarrow_output_array_from_xptr(array_xptr); array_export(array_xptr, out_array); - // Append the chunk to the pairlist + // Get the cached copy of the pairlist node at the end of the current + // chunks list. SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); SEXP chunks_tail = PROTECT(Rf_getAttrib(converter->dst.vec_sexp, chunks_tail_sym)); + // Create a length-1 pairlist node containing the chunk SEXP next_sexp = PROTECT(Rf_cons(array_out_xptr, R_NilValue)); + + // Append it to the end of the current pairlist SETCDR(chunks_tail, next_sexp); UNPROTECT(1); + // Update the cached copy of the pairlist node at the end of the current + // chunks list. Rf_setAttrib(converter->dst.vec_sexp, chunks_tail_sym, next_sexp); UNPROTECT(3); @@ -308,20 +340,19 @@ static int nanoarrow_materialize_other(struct RConverter* converter, UNPROTECT(1); } - // A unique situation where we don't want owning external pointers because we know - // these are protected for the duration of our call into R and because we don't want - // the underlying array to be released and invalidate the converter. The R code in - // convert_fallback_other() takes care of ensuring an independent copy with the correct - // offset/length. - SEXP schema_xptr = PROTECT(R_MakeExternalPtr( - (struct ArrowSchema*)converter->schema_view.schema, R_NilValue, R_NilValue)); - Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); - // We do need to set the protected member of the array external pointer to signal that - // it is not an independent array (i.e., force a shallow copy). - SEXP array_xptr = PROTECT(R_MakeExternalPtr( - (struct ArrowArray*)converter->array_view.array, schema_xptr, converter_xptr)); - Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array); + // Special-case the nanoarrow_vctr conversion + if (Rf_inherits(converter->dst.vec_sexp, "nanoarrow_vctr")) { + return nanoarrow_materialize_nanoarrow_vctr(converter, converter_xptr); + } + + // We've ensured proper ownership of array_xptr and ensured that its + // schema is set, so we can pass these safely to the R-level + // convert_fallback_other. + SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + SEXP array_xptr = VECTOR_ELT(converter_shelter, 2); + // The R code in convert_fallback_other() takes care of ensuring an independent copy + // with the correct offset/length if it is necessary to update them. SEXP offset_sexp = PROTECT( Rf_ScalarReal((double)(converter->src.array_view->offset + converter->src.offset))); SEXP length_sexp = PROTECT(Rf_ScalarReal((double)converter->src.length)); @@ -335,7 +366,7 @@ static int nanoarrow_materialize_other(struct RConverter* converter, copy_vec_into(result_src, converter->dst.vec_sexp, converter->dst.offset, converter->dst.length); - UNPROTECT(7); + UNPROTECT(5); return NANOARROW_OK; } From 120a6d602b361300faeab282edad1b625eb048b1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 12:16:41 -0300 Subject: [PATCH 13/20] maybe integrate the finalizer --- r/R/vctr.R | 22 ++++++++++------------ r/src/convert.c | 12 ++++++++++-- r/src/materialize.c | 41 +++++++++++++++++++++++++++++++++++++++++ r/src/materialize.h | 4 ++++ 4 files changed, 65 insertions(+), 14 deletions(-) diff --git a/r/R/vctr.R b/r/R/vctr.R index 215c826e0..659fc74f0 100644 --- a/r/R/vctr.R +++ b/r/R/vctr.R @@ -41,26 +41,24 @@ as_nanoarrow_vctr <- function(x, ..., schema = NULL) { #' @rdname as_nanoarrow_vctr #' @export -nanoarrow_vctr <- function(schema = NULL) { +nanoarrow_vctr <- function(schema = NULL, subclass = character()) { if (is.null(schema)) { - new_nanoarrow_vctr(list(), NULL) + new_nanoarrow_vctr(list(), NULL, subclass) } else { - new_nanoarrow_vctr(list(), as_nanoarrow_schema(schema)) + new_nanoarrow_vctr(list(), as_nanoarrow_schema(schema), subclass) } } -new_nanoarrow_vctr <- function(chunks, schema, indices = NULL) { +new_nanoarrow_vctr <- function(chunks, schema, subclass = character()) { offsets <- .Call(nanoarrow_c_vctr_chunk_offsets, chunks) - if (is.null(indices)) { - indices <- seq_len(offsets[length(offsets)]) - } + indices <- seq_len(offsets[length(offsets)]) structure( indices, schema = schema, chunks = chunks, offsets = offsets, - class = "nanoarrow_vctr" + class = union(subclass, "nanoarrow_vctr") ) } @@ -248,12 +246,12 @@ as.data.frame.nanoarrow_vctr <- function(x, ..., optional = FALSE) { print.nanoarrow_vctr <- function(x, ...) { schema <- attr(x, "schema", exact = TRUE) if (is.null(schema)) { - cat(">\n") + cat(sprintf("<%s >\n", class(x)[1])) return(invisible(x)) } formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE) - cat(sprintf("\n", formatted, length(x))) + cat(sprintf("<%s %s[%d]>\n", class(x)[1], formatted, length(x))) n_values <- min(length(x), 20) more_values <- length(x) - n_values @@ -274,12 +272,12 @@ print.nanoarrow_vctr <- function(x, ...) { str.nanoarrow_vctr <- function(object, ...) { schema <- attr(object, "schema", exact = TRUE) if (is.null(schema)) { - cat(">\n") + cat(sprintf("<%s >\n", class(object)[1])) return(invisible(object)) } formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE) - cat(sprintf("\n", formatted, length(object))) + cat(sprintf("<%s %s[%d]>\n", class(object)[1], formatted, length(object))) # Prints out the C data interface dump of each chunk with the chunk # index above. diff --git a/r/src/convert.c b/r/src/convert.c index 2148421a3..739134355 100644 --- a/r/src/convert.c +++ b/r/src/convert.c @@ -437,17 +437,25 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) { SEXP nanoarrow_converter_release_result(SEXP converter_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + // PROTECT()ing here because we are about to release the object from the // shelter of the converter and return it SEXP result = PROTECT(VECTOR_ELT(converter_shelter, 4)); SET_VECTOR_ELT(converter_shelter, 4, R_NilValue); + + // Perform any finalization on the vector before it is returned to R + SEXP final_result = + PROTECT(nanoarrow_materialize_finalize_result(converter_xptr, result)); + + // Reset the converter state converter->dst.vec_sexp = R_NilValue; converter->dst.offset = 0; converter->dst.length = 0; converter->size = 0; converter->capacity = 0; - UNPROTECT(1); - return result; + + UNPROTECT(2); + return final_result; } void nanoarrow_converter_stop(SEXP converter_xptr) { diff --git a/r/src/materialize.c b/r/src/materialize.c index 330ad88c3..cf9163337 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -299,6 +299,47 @@ static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) { } } +SEXP nanoarrow_materialize_finalize_result(SEXP converter_xptr, SEXP result) { + if (Rf_inherits(result, "nanoarrow_vctr")) { + // Get the schema for this converter. Technically this will overwrite + // a schema that was provided explicitly; however, we currently do not + // handle that case. + SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + SEXP schema_xptr = VECTOR_ELT(converter_shelter, 1); + + // We no longer need to keep track of chunks_tail + SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); + Rf_setAttrib(result, chunks_tail_sym, R_NilValue); + + // We also want to pass on the class of the ptype we recieved + SEXP subclass_sexp = Rf_getAttrib(result, R_ClassSymbol); + + // We no longer need the first element of the pairlist, which was + // intentionally set to R_NilValue. + SEXP chunks_sym = PROTECT(Rf_install("chunks")); + SEXP chunks_pairlist0 = Rf_getAttrib(result, chunks_sym); + + // If there were zero chunks, there will be no "first" node + SEXP chunks_list; + if (CDR(chunks_pairlist0) == R_NilValue) { + chunks_list = PROTECT(Rf_allocVector(VECSXP, 0)); + } else { + chunks_list = PROTECT(Rf_PairToVectorList(CDR(chunks_pairlist0))); + } + + // Set up the call to new_nanoarrow_vctr + SEXP new_nanoarrow_vctr_sym = PROTECT(Rf_install("new_nanoarrow_vctr")); + SEXP new_nanoarrow_vctr_call = PROTECT( + Rf_lang4(new_nanoarrow_vctr_sym, chunks_list, schema_xptr, subclass_sexp)); + SEXP final_result = PROTECT(Rf_eval(new_nanoarrow_vctr_call, nanoarrow_ns_pkg)); + + UNPROTECT(8); + return final_result; + } else { + return result; + } +} + static int nanoarrow_materialize_nanoarrow_vctr(struct RConverter* converter, SEXP converter_xptr) { // This is a case where the callee needs ownership, which we can do via a diff --git a/r/src/materialize.h b/r/src/materialize.h index a8f36cc90..7b1373e09 100644 --- a/r/src/materialize.h +++ b/r/src/materialize.h @@ -42,4 +42,8 @@ int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr); SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len); SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len); +// Finalize an object before returning to R. Currently only used for +// nanoarrow_vctr conversion. +SEXP nanoarrow_materialize_finalize_result(SEXP converter_xptr, SEXP result); + #endif From 4629efa912b76e60fbff2a2377f067c19b72441b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 12:37:26 -0300 Subject: [PATCH 14/20] working other than stack imablances --- r/src/materialize.c | 6 ++++-- r/tests/testthat/test-convert-array.R | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/r/src/materialize.c b/r/src/materialize.c index cf9163337..580c4520a 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -333,7 +333,7 @@ SEXP nanoarrow_materialize_finalize_result(SEXP converter_xptr, SEXP result) { Rf_lang4(new_nanoarrow_vctr_sym, chunks_list, schema_xptr, subclass_sexp)); SEXP final_result = PROTECT(Rf_eval(new_nanoarrow_vctr_call, nanoarrow_ns_pkg)); - UNPROTECT(8); + UNPROTECT(6); return final_result; } else { return result; @@ -345,11 +345,13 @@ static int nanoarrow_materialize_nanoarrow_vctr(struct RConverter* converter, // This is a case where the callee needs ownership, which we can do via a // shallow copy. SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + SEXP schema_xptr = VECTOR_ELT(converter_shelter, 1); SEXP array_xptr = VECTOR_ELT(converter_shelter, 2); SEXP array_out_xptr = PROTECT(nanoarrow_array_owning_xptr()); - struct ArrowArray* out_array = nanoarrow_output_array_from_xptr(array_xptr); + struct ArrowArray* out_array = nanoarrow_output_array_from_xptr(array_out_xptr); array_export(array_xptr, out_array); + R_SetExternalPtrTag(array_out_xptr, schema_xptr); // Get the cached copy of the pairlist node at the end of the current // chunks list. diff --git a/r/tests/testthat/test-convert-array.R b/r/tests/testthat/test-convert-array.R index 00bb0e85c..91ebb9d68 100644 --- a/r/tests/testthat/test-convert-array.R +++ b/r/tests/testthat/test-convert-array.R @@ -195,6 +195,27 @@ test_that("convert to vector works for nanoarrow_vctr()", { expect_identical(schema$format, "+s") }) +test_that("batched convert to vector works for nanoarrow_vctr()", { + empty_stream <- basic_array_stream(list(), schema = na_string()) + empty_vctr <- convert_array_stream(empty_stream, nanoarrow_vctr()) + expect_length(empty_vctr, 0) + expect_identical(infer_nanoarrow_schema(empty_vctr)$format, "u") + + stream1 <- basic_array_stream(list(c("one", "two", "three"))) + vctr1 <- convert_array_stream(stream1, nanoarrow_vctr()) + expect_length(vctr1, 3) + + stream2 <- basic_array_stream( + list(c("one", "two", "three"), c("four", "five", "six", "seven")) + ) + vctr2 <- convert_array_stream(stream2, nanoarrow_vctr()) + expect_length(vctr2, 7) + expect_identical( + convert_array_stream(as_nanoarrow_array_stream(vctr2)), + c("one", "two", "three", "four", "five", "six", "seven") + ) +}) + test_that("convert to vector works for struct-style vectors", { array <- as_nanoarrow_array(as.POSIXlt("2021-01-01", tz = "America/Halifax")) expect_identical( From 06438fc9cf9040761804eb9f8295b7a5d8003821 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 12:43:29 -0300 Subject: [PATCH 15/20] fix protection count --- r/src/materialize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/materialize.c b/r/src/materialize.c index 580c4520a..22a3d1ace 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -146,7 +146,7 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) { Rf_setAttrib(result, chunks_sym, chunks_list); Rf_setAttrib(result, chunks_tail_sym, chunks_list); - UNPROTECT(4); + UNPROTECT(3); } else if (nanoarrow_ptype_is_data_frame(ptype)) { R_xlen_t num_cols = Rf_xlength(ptype); result = PROTECT(Rf_allocVector(VECSXP, num_cols)); From 7da7ce0d9893fd1470bc6a3d5cf234993555a3de Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 13:26:18 -0300 Subject: [PATCH 16/20] test subclasses --- r/R/convert-array.R | 6 ++++- r/R/vctr.R | 24 ++++++++++++----- r/man/as_nanoarrow_vctr.Rd | 10 +++++--- r/tests/testthat/test-convert-array.R | 16 ++++++++++++ r/tests/testthat/test-extension.R | 37 +++++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 11 deletions(-) diff --git a/r/R/convert-array.R b/r/R/convert-array.R index 5ae65443e..a8dbbb101 100644 --- a/r/R/convert-array.R +++ b/r/R/convert-array.R @@ -142,7 +142,11 @@ convert_fallback_other <- function(array, offset, length, to) { #' @export convert_array.nanoarrow_vctr <- function(array, to, ...) { schema <- attr(to, "schema", exact = TRUE) - as_nanoarrow_vctr(array, schema = schema) + if (is.null(schema)) { + schema <- infer_nanoarrow_schema(array) + } + + new_nanoarrow_vctr(list(array), schema, class(to)) } #' @export diff --git a/r/R/vctr.R b/r/R/vctr.R index 659fc74f0..8035bf7ee 100644 --- a/r/R/vctr.R +++ b/r/R/vctr.R @@ -17,8 +17,9 @@ #' Experimental Arrow encoded arrays as R vectors #' -#' @param x An object that works with [as_nanoarrow_array_stream()]. Most -#' spatial objects in R already work with this method. +#' @param x An object that works with [as_nanoarrow_array_stream()]. +#' @param subclass An optional subclass of nanoarrow_vctr to prepend to the +#' final class name. #' @param ... Passed to [as_nanoarrow_array_stream()] #' @param schema An optional `schema` #' @@ -29,14 +30,14 @@ #' array <- as_nanoarrow_array(1:5) #' as_nanoarrow_vctr(array) #' -as_nanoarrow_vctr <- function(x, ..., schema = NULL) { +as_nanoarrow_vctr <- function(x, ..., schema = NULL, subclass = character()) { if (inherits(x, "nanoarrow_vctr") && is.null(schema)) { return(x) } stream <- as_nanoarrow_array_stream(x, ..., schema = schema) chunks <- collect_array_stream(stream, validate = FALSE) - new_nanoarrow_vctr(chunks, stream$get_schema()) + new_nanoarrow_vctr(chunks, stream$get_schema(), subclass) } #' @rdname as_nanoarrow_vctr @@ -102,7 +103,12 @@ format.nanoarrow_vctr <- function(x, ...) { } size_stable_format <- function(x, ...) { - if (inherits(x, "data.frame")) { + if (inherits(x, "nanoarrow_vctr")) { + # Extension types could have a default convert that gives a nanoarrow_vctr. + # If this is the case, they should be returning a subclass with a format + # method that ensures we don't get here. + rep(sprintf("<%s[%d]>", class(x)[1], seq_along(x))) + } else if (inherits(x, "data.frame")) { cols <- lapply(x, size_stable_format, ...) cols <- Map(paste, names(x), cols, sep = ": ") rows <- do.call(paste, c(cols, list(sep = ", "))) @@ -258,7 +264,13 @@ print.nanoarrow_vctr <- function(x, ...) { stream <- as_nanoarrow_array_stream(utils::head(x, n_values)) converted_head <- convert_array_stream(stream) - print(converted_head) + if (inherits(converted_head, "nanoarrow_vctr")) { + converted_head <- format(converted_head) + print(converted_head, quote = FALSE) + } else { + print(converted_head) + } + if (more_values >= 2) { cat(sprintf("...and %d more values\n", more_values)) } else if (more_values >= 1) { diff --git a/r/man/as_nanoarrow_vctr.Rd b/r/man/as_nanoarrow_vctr.Rd index 4b7700279..036d67eb7 100644 --- a/r/man/as_nanoarrow_vctr.Rd +++ b/r/man/as_nanoarrow_vctr.Rd @@ -5,17 +5,19 @@ \alias{nanoarrow_vctr} \title{Experimental Arrow encoded arrays as R vectors} \usage{ -as_nanoarrow_vctr(x, ..., schema = NULL) +as_nanoarrow_vctr(x, ..., schema = NULL, subclass = character()) -nanoarrow_vctr(schema = NULL) +nanoarrow_vctr(schema = NULL, subclass = character()) } \arguments{ -\item{x}{An object that works with \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}. Most -spatial objects in R already work with this method.} +\item{x}{An object that works with \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}.} \item{...}{Passed to \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}} \item{schema}{An optional \code{schema}} + +\item{subclass}{An optional subclass of nanoarrow_vctr to prepend to the +final class name.} } \value{ A vctr of class 'nanoarrow_vctr' diff --git a/r/tests/testthat/test-convert-array.R b/r/tests/testthat/test-convert-array.R index 91ebb9d68..9d2237dbd 100644 --- a/r/tests/testthat/test-convert-array.R +++ b/r/tests/testthat/test-convert-array.R @@ -216,6 +216,22 @@ test_that("batched convert to vector works for nanoarrow_vctr()", { ) }) +test_that("batched convert to vector works for nanoarrow_vctr() keeps subclass", { + vctr_ptype <- nanoarrow_vctr(subclass = "some_subclass") + + empty_stream <- basic_array_stream(list(), schema = na_string()) + empty_vctr <- convert_array_stream(empty_stream, vctr_ptype) + expect_s3_class(empty_vctr, "some_subclass") + + stream1 <- basic_array_stream(list(c(""))) + vctr1 <- convert_array_stream(stream1, vctr_ptype) + expect_s3_class(vctr1, "some_subclass") + + stream2 <- basic_array_stream(list(c(""), c(""))) + vctr2 <- convert_array_stream(stream2, vctr_ptype) + expect_s3_class(vctr2, "some_subclass") +}) + test_that("convert to vector works for struct-style vectors", { array <- as_nanoarrow_array(as.POSIXlt("2021-01-01", tz = "America/Halifax")) expect_identical( diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index ceb5717bb..5935394c5 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -114,3 +114,40 @@ test_that("as_nanoarrow_array() dispatches on registered extension spec", { "some_ext" ) }) + +test_that("extensions can infer a schema of a nanoarrow_vctr() subclass", { + register_nanoarrow_extension( + "some_ext", + nanoarrow_extension_spec(subclass = "vctr_spec_class") + ) + on.exit(unregister_nanoarrow_extension("some_ext")) + + infer_nanoarrow_ptype_extension.vctr_spec_class <- function(spec, x, ...) { + nanoarrow_vctr(subclass = "some_vctr_subclass") + } + + s3_register( + "nanoarrow::infer_nanoarrow_ptype_extension", + "vctr_spec_class", + infer_nanoarrow_ptype_extension.vctr_spec_class + ) + + expect_identical( + infer_nanoarrow_ptype(na_extension(na_string(), "some_ext")), + nanoarrow_vctr(subclass = "some_vctr_subclass") + ) + + ext_array <- nanoarrow_extension_array(c("one", "two", "three"), "some_ext") + vctr <- convert_array(ext_array) + expect_s3_class(vctr, "some_vctr_subclass") + + # Ensure that registering a default conversion that returns a nanoarrow_vctr + # does not result in infinite recursion when printing or formatting it. + # An extension that does this should provide these methods for the subclass + # they return. + expect_length(format(vctr), length(vctr)) + expect_output( + expect_identical(print(vctr), vctr), + "some_vctr_subclass" + ) +}) From 2d383bf4c4078c7b18d6911b0e9ce72799604971 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 13:43:54 -0300 Subject: [PATCH 17/20] docs + spelling --- r/R/vctr.R | 24 ++++++++++++++++++++++++ r/man/as_nanoarrow_vctr.Rd | 26 +++++++++++++++++++++++++- r/src/materialize.c | 2 +- 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/r/R/vctr.R b/r/R/vctr.R index 8035bf7ee..33e12c041 100644 --- a/r/R/vctr.R +++ b/r/R/vctr.R @@ -17,6 +17,30 @@ #' Experimental Arrow encoded arrays as R vectors #' +#' This experimental vctr class allows zero or more Arrow arrays to +#' present as an R vector without converting them. This is useful for arrays +#' with types that do not have a non-lossy R equivalent, and helps provide an +#' intermediary object type where the default conversion is prohibitively +#' expensive (e.g., a nested list of data frames). These objects will not +#' survive many vctr transformations; however, they can be sliced without +#' copying the underlying arrays. +#' +#' The nanoarrow_vctr is currently implemented similarly to `factor()`: its +#' storage type is an `integer()` that is a sequence along the total length +#' of the vctr and there are attributes that are required to resolve these +#' indices to an array + offset. Sequences typically have a very compact +#' representation in recent version of R such that this has a cheap storage +#' footprint even for large arrays. The attributes are currently: +#' +#' - `schema`: The [nanoarrow_schema][as_nanoarrow_schema] shared by each chunk. +#' - `chunks`: A `list()` of `nanoarrow_array`. +#' - `offsets`: An `integer()` vector beginning with `0` and followed by the +#' cumulative length of each chunk. This allows the chunk index + offset +#' to be resolved from a logical index with `log(n)` complexity. +#' +#' This implementation is preliminary and may change; however, the result of +#' `as_nanoarrow_array_stream(some_vctr[begin:end])` should remain stable. +#' #' @param x An object that works with [as_nanoarrow_array_stream()]. #' @param subclass An optional subclass of nanoarrow_vctr to prepend to the #' final class name. diff --git a/r/man/as_nanoarrow_vctr.Rd b/r/man/as_nanoarrow_vctr.Rd index 036d67eb7..d3647edcf 100644 --- a/r/man/as_nanoarrow_vctr.Rd +++ b/r/man/as_nanoarrow_vctr.Rd @@ -23,7 +23,31 @@ final class name.} A vctr of class 'nanoarrow_vctr' } \description{ -Experimental Arrow encoded arrays as R vectors +This experimental vctr class allows zero or more Arrow arrays to +present as an R vector without converting them. This is useful for arrays +with types that do not have a non-lossy R equivalent, and helps provide an +intermediary object type where the default conversion is prohibitively +expensive (e.g., a nested list of data frames). These objects will not +survive many vctr transformations; however, they can be sliced without +copying the underlying arrays. +} +\details{ +The nanoarrow_vctr is currently implemented similarly to \code{factor()}: its +storage type is an \code{integer()} that is a sequence along the total length +of the vctr and there are attributes that are required to resolve these +indices to an array + offset. Sequences typically have a very compact +representation in recent version of R such that this has a cheap storage +footprint even for large arrays. The attributes are currently: +\itemize{ +\item \code{schema}: The \link[=as_nanoarrow_schema]{nanoarrow_schema} shared by each chunk. +\item \code{chunks}: A \code{list()} of \code{nanoarrow_array}. +\item \code{offsets}: An \code{integer()} vector beginning with \code{0} and followed by the +cumulative length of each chunk. This allows the chunk index + offset +to be resolved from a logical index with \code{log(n)} complexity. +} + +This implementation is preliminary and may change; however, the result of +\code{as_nanoarrow_array_stream(some_vctr[begin:end])} should remain stable. } \examples{ array <- as_nanoarrow_array(1:5) diff --git a/r/src/materialize.c b/r/src/materialize.c index 22a3d1ace..20bd62c0d 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -311,7 +311,7 @@ SEXP nanoarrow_materialize_finalize_result(SEXP converter_xptr, SEXP result) { SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); Rf_setAttrib(result, chunks_tail_sym, R_NilValue); - // We also want to pass on the class of the ptype we recieved + // We also want to pass on the class of the ptype we received SEXP subclass_sexp = Rf_getAttrib(result, R_ClassSymbol); // We no longer need the first element of the pairlist, which was From 8eb77e09630be98c961c1e547c010fcb8dda29e4 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 14:59:27 -0300 Subject: [PATCH 18/20] fix for nested data frame --- r/src/convert.c | 14 +++----------- r/src/materialize.c | 34 ++++++++++++++++++++++++++++------ r/src/materialize.h | 2 +- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/r/src/convert.c b/r/src/convert.c index 739134355..e232d67d6 100644 --- a/r/src/convert.c +++ b/r/src/convert.c @@ -414,11 +414,7 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) { SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SEXP current_result = VECTOR_ELT(converter_shelter, 4); - // Materialize never called (e.g., empty stream) - if (current_result == R_NilValue) { - NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0)); - current_result = VECTOR_ELT(converter_shelter, 4); - } + NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(converter_xptr)); // Check result size. A future implementation could also shrink the length // or reallocate a shorter vector. @@ -443,10 +439,6 @@ SEXP nanoarrow_converter_release_result(SEXP converter_xptr) { SEXP result = PROTECT(VECTOR_ELT(converter_shelter, 4)); SET_VECTOR_ELT(converter_shelter, 4, R_NilValue); - // Perform any finalization on the vector before it is returned to R - SEXP final_result = - PROTECT(nanoarrow_materialize_finalize_result(converter_xptr, result)); - // Reset the converter state converter->dst.vec_sexp = R_NilValue; converter->dst.offset = 0; @@ -454,8 +446,8 @@ SEXP nanoarrow_converter_release_result(SEXP converter_xptr) { converter->size = 0; converter->capacity = 0; - UNPROTECT(2); - return final_result; + UNPROTECT(1); + return result; } void nanoarrow_converter_stop(SEXP converter_xptr) { diff --git a/r/src/materialize.c b/r/src/materialize.c index 20bd62c0d..31533f8b3 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -299,12 +299,21 @@ static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) { } } -SEXP nanoarrow_materialize_finalize_result(SEXP converter_xptr, SEXP result) { - if (Rf_inherits(result, "nanoarrow_vctr")) { +int nanoarrow_materialize_finalize_result(SEXP converter_xptr) { + SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + SEXP result = VECTOR_ELT(converter_shelter, 4); + + // Materialize never called (e.g., empty stream) + if (result == R_NilValue) { + NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0)); + result = VECTOR_ELT(converter_shelter, 4); + } + + if (nanoarrow_ptype_is_nanoarrow_vctr(result)) { // Get the schema for this converter. Technically this will overwrite // a schema that was provided explicitly; however, we currently do not // handle that case. - SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + SEXP schema_xptr = VECTOR_ELT(converter_shelter, 1); // We no longer need to keep track of chunks_tail @@ -333,11 +342,24 @@ SEXP nanoarrow_materialize_finalize_result(SEXP converter_xptr, SEXP result) { Rf_lang4(new_nanoarrow_vctr_sym, chunks_list, schema_xptr, subclass_sexp)); SEXP final_result = PROTECT(Rf_eval(new_nanoarrow_vctr_call, nanoarrow_ns_pkg)); + SET_VECTOR_ELT(converter_shelter, 4, final_result); UNPROTECT(6); - return final_result; - } else { - return result; + } else if (nanoarrow_ptype_is_data_frame(result)) { + // For each child, finalize the result and then reassign it + SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3); + for (R_xlen_t i = 0; i < Rf_xlength(child_converter_xptrs); i++) { + SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i); + NANOARROW_RETURN_NOT_OK( + nanoarrow_materialize_finalize_result(child_converter_xptr)); + + SEXP child_result = + PROTECT(nanoarrow_converter_release_result(child_converter_xptr)); + SET_VECTOR_ELT(result, i, child_result); + UNPROTECT(1); + } } + + return NANOARROW_OK; } static int nanoarrow_materialize_nanoarrow_vctr(struct RConverter* converter, diff --git a/r/src/materialize.h b/r/src/materialize.h index 7b1373e09..c3b2c5cbd 100644 --- a/r/src/materialize.h +++ b/r/src/materialize.h @@ -44,6 +44,6 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len); // Finalize an object before returning to R. Currently only used for // nanoarrow_vctr conversion. -SEXP nanoarrow_materialize_finalize_result(SEXP converter_xptr, SEXP result); +int nanoarrow_materialize_finalize_result(SEXP converter_xptr); #endif From 8805697f890f17686fb15eb0dfeff185296e5661 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 15:30:55 -0300 Subject: [PATCH 19/20] test nested cases --- r/src/materialize.c | 5 +++ r/tests/testthat/test-convert-array.R | 44 +++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/r/src/materialize.c b/r/src/materialize.c index 31533f8b3..45d6950de 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -375,6 +375,11 @@ static int nanoarrow_materialize_nanoarrow_vctr(struct RConverter* converter, array_export(array_xptr, out_array); R_SetExternalPtrTag(array_out_xptr, schema_xptr); + // Update the offset/length in case a slice is being requested from the + // converter. + out_array->offset += converter->src.offset; + out_array->length = converter->src.length; + // Get the cached copy of the pairlist node at the end of the current // chunks list. SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); diff --git a/r/tests/testthat/test-convert-array.R b/r/tests/testthat/test-convert-array.R index 9d2237dbd..3c1d137fb 100644 --- a/r/tests/testthat/test-convert-array.R +++ b/r/tests/testthat/test-convert-array.R @@ -216,6 +216,50 @@ test_that("batched convert to vector works for nanoarrow_vctr()", { ) }) +test_that("convert to vector works for data.frame(nanoarrow_vctr())", { + array <- as_nanoarrow_array(data.frame(x = 1:5)) + df_vctr <- convert_array(array, data.frame(x = nanoarrow_vctr())) + expect_s3_class(df_vctr$x, "nanoarrow_vctr") + expect_identical( + convert_array_stream(as_nanoarrow_array_stream(df_vctr$x)), + 1:5 + ) +}) + +test_that("convert to vector works for list_of(nanoarrow_vctr())", { + skip_if_not_installed("arrow") + skip_if_not_installed("vctrs") + + array <- as_nanoarrow_array( + list(1:5, 6:10, NULL, 11:13), + schema = na_list(na_int32()) + ) + + list_vctr <- convert_array(array, vctrs::list_of(nanoarrow_vctr())) + + # Each item in the list should be a vctr with one chunk that is a slice + # of the original array + expect_s3_class(list_vctr[[1]], "nanoarrow_vctr") + vctr_array <- attr(list_vctr[[1]], "chunks")[[1]] + expect_identical(vctr_array$offset, 0L) + expect_identical(vctr_array$length, 5L) + expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:5) + + expect_s3_class(list_vctr[[2]], "nanoarrow_vctr") + vctr_array <- attr(list_vctr[[2]], "chunks")[[1]] + expect_identical(vctr_array$offset, 5L) + expect_identical(vctr_array$length, 5L) + expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:10) + + expect_null(list_vctr[[3]]) + + expect_s3_class(list_vctr[[4]], "nanoarrow_vctr") + vctr_array <- attr(list_vctr[[4]], "chunks")[[1]] + expect_identical(vctr_array$offset, 10L) + expect_identical(vctr_array$length, 3L) + expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:13) +}) + test_that("batched convert to vector works for nanoarrow_vctr() keeps subclass", { vctr_ptype <- nanoarrow_vctr(subclass = "some_subclass") From 8596a0abc27f05ea2a7b79ed41118c29e9cd2c21 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 15 May 2024 14:06:23 -0300 Subject: [PATCH 20/20] final edits --- r/R/vctr.R | 2 +- r/man/as_nanoarrow_vctr.Rd | 2 +- r/src/vctr.c | 7 ++++++- r/tests/testthat/test-convert-array.R | 3 +-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/r/R/vctr.R b/r/R/vctr.R index 33e12c041..4d1c443b6 100644 --- a/r/R/vctr.R +++ b/r/R/vctr.R @@ -29,7 +29,7 @@ #' storage type is an `integer()` that is a sequence along the total length #' of the vctr and there are attributes that are required to resolve these #' indices to an array + offset. Sequences typically have a very compact -#' representation in recent version of R such that this has a cheap storage +#' representation in recent versions of R such that this has a cheap storage #' footprint even for large arrays. The attributes are currently: #' #' - `schema`: The [nanoarrow_schema][as_nanoarrow_schema] shared by each chunk. diff --git a/r/man/as_nanoarrow_vctr.Rd b/r/man/as_nanoarrow_vctr.Rd index d3647edcf..1b6d14936 100644 --- a/r/man/as_nanoarrow_vctr.Rd +++ b/r/man/as_nanoarrow_vctr.Rd @@ -36,7 +36,7 @@ The nanoarrow_vctr is currently implemented similarly to \code{factor()}: its storage type is an \code{integer()} that is a sequence along the total length of the vctr and there are attributes that are required to resolve these indices to an array + offset. Sequences typically have a very compact -representation in recent version of R such that this has a cheap storage +representation in recent versions of R such that this has a cheap storage footprint even for large arrays. The attributes are currently: \itemize{ \item \code{schema}: The \link[=as_nanoarrow_schema]{nanoarrow_schema} shared by each chunk. diff --git a/r/src/vctr.c b/r/src/vctr.c index 33caa3266..e03fdfeec 100644 --- a/r/src/vctr.c +++ b/r/src/vctr.c @@ -20,6 +20,7 @@ #include #include "nanoarrow.h" +#include "nanoarrow/r.h" SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list) { int num_chunks = Rf_length(array_list); @@ -30,7 +31,7 @@ SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list) { struct ArrowArray* array; for (int i = 0; i < num_chunks; i++) { - array = (struct ArrowArray*)R_ExternalPtrAddr(VECTOR_ELT(array_list, i)); + array = nanoarrow_array_from_xptr(VECTOR_ELT(array_list, i)); cumulative_offset += array->length; if (cumulative_offset > INT_MAX) { Rf_error("Can't build nanoarrow_vctr with length > INT_MAX"); // # nocov @@ -90,6 +91,10 @@ SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp) { return slice_sexp; } + // It may be possible to check for the R ALTREP sequence type, + // which would eliminate the need for the below check for + // sequential values. + int buf[1024]; INTEGER_GET_REGION(indices_sexp, 0, 1024, buf); slice[0] = buf[0]; diff --git a/r/tests/testthat/test-convert-array.R b/r/tests/testthat/test-convert-array.R index 3c1d137fb..17d192fdd 100644 --- a/r/tests/testthat/test-convert-array.R +++ b/r/tests/testthat/test-convert-array.R @@ -177,7 +177,7 @@ test_that("convert to vector works for nanoarrow_vctr()", { schema <- infer_nanoarrow_schema(vctr) expect_identical(schema$format, "u") - # Check nested conversion from a data.frame + # Check conversion of a struct array df <- data.frame(x = c("one", "two", "three")) array <- as_nanoarrow_array(df) @@ -187,7 +187,6 @@ test_that("convert to vector works for nanoarrow_vctr()", { schema <- infer_nanoarrow_schema(vctr) expect_identical(schema$format, "+s") - vctr <- convert_array(array, nanoarrow_vctr(na_struct(list(x = na_string())))) expect_s3_class(vctr, "nanoarrow_vctr") expect_length(vctr, 3)