diff --git a/r/NAMESPACE b/r/NAMESPACE index d868f0b2c..aa6e71a08 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -6,14 +6,18 @@ S3method("$",nanoarrow_buffer) S3method("$",nanoarrow_schema) S3method("$<-",nanoarrow_array) S3method("$<-",nanoarrow_schema) +S3method("[",nanoarrow_vctr) +S3method("[<-",nanoarrow_vctr) S3method("[[",nanoarrow_array) S3method("[[",nanoarrow_array_stream) S3method("[[",nanoarrow_buffer) S3method("[[",nanoarrow_schema) S3method("[[<-",nanoarrow_array) S3method("[[<-",nanoarrow_schema) +S3method("[[<-",nanoarrow_vctr) S3method(as.data.frame,nanoarrow_array) S3method(as.data.frame,nanoarrow_array_stream) +S3method(as.data.frame,nanoarrow_vctr) S3method(as.raw,nanoarrow_buffer) S3method(as.vector,nanoarrow_array) S3method(as.vector,nanoarrow_array_stream) @@ -48,15 +52,19 @@ S3method(as_nanoarrow_array_stream,data.frame) S3method(as_nanoarrow_array_stream,default) S3method(as_nanoarrow_array_stream,nanoarrow_array) S3method(as_nanoarrow_array_stream,nanoarrow_array_stream) +S3method(as_nanoarrow_array_stream,nanoarrow_vctr) S3method(as_nanoarrow_buffer,default) S3method(as_nanoarrow_buffer,nanoarrow_buffer) S3method(as_nanoarrow_schema,DataType) S3method(as_nanoarrow_schema,Field) S3method(as_nanoarrow_schema,Schema) S3method(as_nanoarrow_schema,nanoarrow_schema) +S3method(as_nanoarrow_schema,nanoarrow_vctr) +S3method(c,nanoarrow_vctr) S3method(convert_array,default) S3method(convert_array,double) S3method(convert_array,factor) +S3method(convert_array,nanoarrow_vctr) S3method(convert_array,vctrs_partial_frame) S3method(convert_array_extension,default) S3method(convert_array_extension,nanoarrow_extension_spec_vctrs) @@ -64,6 +72,7 @@ S3method(format,nanoarrow_array) S3method(format,nanoarrow_array_stream) S3method(format,nanoarrow_buffer) S3method(format,nanoarrow_schema) +S3method(format,nanoarrow_vctr) S3method(infer_nanoarrow_ptype_extension,default) S3method(infer_nanoarrow_ptype_extension,nanoarrow_extension_spec_vctrs) 
S3method(infer_nanoarrow_schema,Array) @@ -93,6 +102,7 @@ S3method(infer_nanoarrow_schema,list) S3method(infer_nanoarrow_schema,logical) S3method(infer_nanoarrow_schema,nanoarrow_array) S3method(infer_nanoarrow_schema,nanoarrow_array_stream) +S3method(infer_nanoarrow_schema,nanoarrow_vctr) S3method(infer_nanoarrow_schema,raw) S3method(infer_nanoarrow_schema,vctrs_list_of) S3method(infer_nanoarrow_schema,vctrs_unspecified) @@ -108,6 +118,7 @@ S3method(print,nanoarrow_array) S3method(print,nanoarrow_array_stream) S3method(print,nanoarrow_buffer) S3method(print,nanoarrow_schema) +S3method(print,nanoarrow_vctr) S3method(read_nanoarrow,character) S3method(read_nanoarrow,connection) S3method(read_nanoarrow,raw) @@ -115,12 +126,14 @@ S3method(str,nanoarrow_array) S3method(str,nanoarrow_array_stream) S3method(str,nanoarrow_buffer) S3method(str,nanoarrow_schema) +S3method(str,nanoarrow_vctr) export(array_stream_set_finalizer) export(as_nanoarrow_array) export(as_nanoarrow_array_extension) export(as_nanoarrow_array_stream) export(as_nanoarrow_buffer) export(as_nanoarrow_schema) +export(as_nanoarrow_vctr) export(basic_array_stream) export(collect_array_stream) export(convert_array) @@ -191,6 +204,7 @@ export(nanoarrow_pointer_release) export(nanoarrow_pointer_set_protected) export(nanoarrow_schema_modify) export(nanoarrow_schema_parse) +export(nanoarrow_vctr) export(nanoarrow_version) export(read_nanoarrow) export(register_nanoarrow_extension) diff --git a/r/R/convert-array.R b/r/R/convert-array.R index 6de82d175..a8dbbb101 100644 --- a/r/R/convert-array.R +++ b/r/R/convert-array.R @@ -139,6 +139,16 @@ convert_fallback_other <- function(array, offset, length, to) { convert_array(array, to, .from_c = TRUE) } +#' @export +convert_array.nanoarrow_vctr <- function(array, to, ...) 
{ + schema <- attr(to, "schema", exact = TRUE) + if (is.null(schema)) { + schema <- infer_nanoarrow_schema(array) + } + + new_nanoarrow_vctr(list(array), schema, class(to)) +} + #' @export convert_array.double <- function(array, to, ...) { # Handle conversion from decimal128 via arrow diff --git a/r/R/vctr.R b/r/R/vctr.R new file mode 100644 index 000000000..4d1c443b6 --- /dev/null +++ b/r/R/vctr.R @@ -0,0 +1,343 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Experimental Arrow encoded arrays as R vectors +#' +#' This experimental vctr class allows zero or more Arrow arrays to +#' present as an R vector without converting them. This is useful for arrays +#' with types that do not have a non-lossy R equivalent, and helps provide an +#' intermediary object type where the default conversion is prohibitively +#' expensive (e.g., a nested list of data frames). These objects will not +#' survive many vctr transformations; however, they can be sliced without +#' copying the underlying arrays. 
+#' +#' The nanoarrow_vctr is currently implemented similarly to `factor()`: its +#' storage type is an `integer()` that is a sequence along the total length +#' of the vctr and there are attributes that are required to resolve these +#' indices to an array + offset. Sequences typically have a very compact +#' representation in recent versions of R such that this has a cheap storage +#' footprint even for large arrays. The attributes are currently: +#' +#' - `schema`: The [nanoarrow_schema][as_nanoarrow_schema] shared by each chunk. +#' - `chunks`: A `list()` of `nanoarrow_array`. +#' - `offsets`: An `integer()` vector beginning with `0` and followed by the +#' cumulative length of each chunk. This allows the chunk index + offset +#' to be resolved from a logical index with `log(n)` complexity. +#' +#' This implementation is preliminary and may change; however, the result of +#' `as_nanoarrow_array_stream(some_vctr[begin:end])` should remain stable. +#' +#' @param x An object that works with [as_nanoarrow_array_stream()]. +#' @param subclass An optional subclass of nanoarrow_vctr to prepend to the +#' final class name. +#' @param ... 
Passed to [as_nanoarrow_array_stream()] +#' @param schema An optional `schema` +#' +#' @return A vctr of class 'nanoarrow_vctr' +#' @export +#' +#' @examples +#' array <- as_nanoarrow_array(1:5) +#' as_nanoarrow_vctr(array) +#' +as_nanoarrow_vctr <- function(x, ..., schema = NULL, subclass = character()) { + if (inherits(x, "nanoarrow_vctr") && is.null(schema)) { + return(x) + } + + stream <- as_nanoarrow_array_stream(x, ..., schema = schema) + chunks <- collect_array_stream(stream, validate = FALSE) + new_nanoarrow_vctr(chunks, stream$get_schema(), subclass) +} + +#' @rdname as_nanoarrow_vctr +#' @export +nanoarrow_vctr <- function(schema = NULL, subclass = character()) { + if (is.null(schema)) { + new_nanoarrow_vctr(list(), NULL, subclass) + } else { + new_nanoarrow_vctr(list(), as_nanoarrow_schema(schema), subclass) + } +} + +new_nanoarrow_vctr <- function(chunks, schema, subclass = character()) { + offsets <- .Call(nanoarrow_c_vctr_chunk_offsets, chunks) + indices <- seq_len(offsets[length(offsets)]) + + structure( + indices, + schema = schema, + chunks = chunks, + offsets = offsets, + class = union(subclass, "nanoarrow_vctr") + ) +} + +#' @export +`[.nanoarrow_vctr` <- function(x, i) { + attrs <- attributes(x) + x <- NextMethod() + + if (is.null(vctr_as_slice(x))) { + stop( + "Can't subset nanoarrow_vctr with non-slice (e.g., only i:j indexing is supported)" + ) + } + + attributes(x) <- attrs + x +} + +#' @export +`[<-.nanoarrow_vctr` <- function(x, i, value) { + stop("subset assignment for nanoarrow_vctr is not supported") +} + +#' @export +`[[<-.nanoarrow_vctr` <- function(x, i, value) { + stop("subset assignment for nanoarrow_vctr is not supported") +} + +#' @export +format.nanoarrow_vctr <- function(x, ...) { + if (length(x) == 0) { + return(character()) + } + + stream <- as_nanoarrow_array_stream(x) + converted <- convert_array_stream(stream) + + # This needs to be a character() with the same length as x to work with + # RStudio's viewer. 
Data frames need special handling in this case. + size_stable_format(converted) +} + +size_stable_format <- function(x, ...) { + if (inherits(x, "nanoarrow_vctr")) { + # Extension types could have a default convert that gives a nanoarrow_vctr. + # If this is the case, they should be returning a subclass with a format + # method that ensures we don't get here. + rep(sprintf("<%s[%d]>", class(x)[1], seq_along(x))) + } else if (inherits(x, "data.frame")) { + cols <- lapply(x, size_stable_format, ...) + cols <- Map(paste, names(x), cols, sep = ": ") + rows <- do.call(paste, c(cols, list(sep = ", "))) + paste0("{", rows, "}") + } else { + format(x, ...) + } +} + +#' @export +infer_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) { + attr(x, "schema", exact = TRUE) +} + +# Because zero-length vctrs are R's way of communicating "type", implement +# as_nanoarrow_schema() here so that it works in places that expect a type +#' @export +as_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) { + attr(x, "schema", exact = TRUE) +} + +#' @export +as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { + as_nanoarrow_array_stream.nanoarrow_vctr(x, ..., schema = schema) +} + +#' @export +as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) { + if (!is.null(schema)) { + # If a schema is passed, first resolve the stream as is and then use + # as_nanoarrow_array_stream() to either cast (when this is supported) + # or error. 
+ stream <- as_nanoarrow_array_stream(x, schema = NULL) + return(as_nanoarrow_array_stream(stream, schema = schema)) + } + + # Resolve the indices as c(1-based start, length) + slice <- vctr_as_slice(x) + if (is.null(slice)) { + stop("Can't resolve non-slice nanoarrow_vctr to nanoarrow_array_stream") + } + + x_schema <- attr(x, "schema", exact = TRUE) + + # Zero-size slice can be an array stream with zero batches + if (slice[2] == 0) { + return(basic_array_stream(list(), schema = x_schema)) + } + + # Full slice doesn't need slicing logic + offsets <- attr(x, "offsets", exact = TRUE) + batches <- attr(x, "chunks", exact = TRUE) + if (slice[1] == 1 && slice[2] == max(offsets)) { + return( + basic_array_stream( + batches, + schema = x_schema, + validate = FALSE + ) + ) + } + + # Calculate first and last slice information + first_index <- slice[1] - 1L + end_index <- first_index + slice[2] + last_index <- end_index - 1L + first_chunk_index <- vctr_resolve_chunk(first_index, offsets) + last_chunk_index <- vctr_resolve_chunk(last_index, offsets) + + first_chunk_offset <- first_index - offsets[first_chunk_index + 1L] + first_chunk_length <- offsets[first_chunk_index + 2L] - first_index + last_chunk_offset <- 0L + last_chunk_length <- end_index - offsets[last_chunk_index + 1L] + + # Calculate first and last slices + if (first_chunk_index == last_chunk_index) { + batch <- vctr_array_slice( + batches[[first_chunk_index + 1L]], + first_chunk_offset, + last_chunk_length - first_chunk_offset + ) + + return( + basic_array_stream( + list(batch), + schema = x_schema, + validate = FALSE + ) + ) + } + + batch1 <- vctr_array_slice( + batches[[first_chunk_index + 1L]], + first_chunk_offset, + first_chunk_length + ) + + batchn <- vctr_array_slice( + batches[[last_chunk_index + 1L]], + last_chunk_offset, + last_chunk_length + ) + + seq_mid <- seq_len(last_chunk_index - first_chunk_index - 1) + batch_mid <- batches[first_chunk_index + seq_mid] + + basic_array_stream( + c( + list(batch1), 
+ batch_mid, + list(batchn) + ), + schema = x_schema, + validate = FALSE + ) +} + +#' @export +c.nanoarrow_vctr <- function(...) { + dots <- list(...) + + # This one we can do safely + if (length(dots) == 1) { + return(dots[[1]]) + } + + stop("c() not implemented for nanoarrow_vctr()") +} + +# Ensures that nanoarrow_vctr can fit in a data.frame +#' @export +as.data.frame.nanoarrow_vctr <- function(x, ..., optional = FALSE) { + if (!optional) { + stop(sprintf("cannot coerce object of type '%s' to data.frame", class(x)[1])) + } else { + new_data_frame(list(x), nrow = length(x)) + } +} + +#' @export +print.nanoarrow_vctr <- function(x, ...) { + schema <- attr(x, "schema", exact = TRUE) + if (is.null(schema)) { + cat(sprintf("<%s >\n", class(x)[1])) + return(invisible(x)) + } + + formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE) + cat(sprintf("<%s %s[%d]>\n", class(x)[1], formatted, length(x))) + + n_values <- min(length(x), 20) + more_values <- length(x) - n_values + stream <- as_nanoarrow_array_stream(utils::head(x, n_values)) + converted_head <- convert_array_stream(stream) + + if (inherits(converted_head, "nanoarrow_vctr")) { + converted_head <- format(converted_head) + print(converted_head, quote = FALSE) + } else { + print(converted_head) + } + + if (more_values >= 2) { + cat(sprintf("...and %d more values\n", more_values)) + } else if (more_values >= 1) { + cat(sprintf("...and %d more value\n", more_values)) + } + + invisible(x) +} + +#' @export +str.nanoarrow_vctr <- function(object, ...) { + schema <- attr(object, "schema", exact = TRUE) + if (is.null(schema)) { + cat(sprintf("<%s >\n", class(object)[1])) + return(invisible(object)) + } + + formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE) + cat(sprintf("<%s %s[%d]>\n", class(object)[1], formatted, length(object))) + + # Prints out the C data interface dump of each chunk with the chunk + # index above. 
+ str(attr(object, "chunks")) + + invisible(object) +} + +# Utilities for vctr methods + +vctr_resolve_chunk <- function(x, offsets) { + .Call(nanoarrow_c_vctr_chunk_resolve, x, offsets) +} + +vctr_as_slice <- function(x) { + .Call(nanoarrow_c_vctr_as_slice, x) +} + +vctr_array_slice <- function(x, offset, length) { + new_offset <- x$offset + offset + new_length <- length + nanoarrow_array_modify( + x, + list(offset = new_offset, length = new_length), + validate = FALSE + ) +} diff --git a/r/man/as_nanoarrow_vctr.Rd b/r/man/as_nanoarrow_vctr.Rd new file mode 100644 index 000000000..1b6d14936 --- /dev/null +++ b/r/man/as_nanoarrow_vctr.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vctr.R +\name{as_nanoarrow_vctr} +\alias{as_nanoarrow_vctr} +\alias{nanoarrow_vctr} +\title{Experimental Arrow encoded arrays as R vectors} +\usage{ +as_nanoarrow_vctr(x, ..., schema = NULL, subclass = character()) + +nanoarrow_vctr(schema = NULL, subclass = character()) +} +\arguments{ +\item{x}{An object that works with \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}.} + +\item{...}{Passed to \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}} + +\item{schema}{An optional \code{schema}} + +\item{subclass}{An optional subclass of nanoarrow_vctr to prepend to the +final class name.} +} +\value{ +A vctr of class 'nanoarrow_vctr' +} +\description{ +This experimental vctr class allows zero or more Arrow arrays to +present as an R vector without converting them. This is useful for arrays +with types that do not have a non-lossy R equivalent, and helps provide an +intermediary object type where the default conversion is prohibitively +expensive (e.g., a nested list of data frames). These objects will not +survive many vctr transformations; however, they can be sliced without +copying the underlying arrays. 
+} +\details{ +The nanoarrow_vctr is currently implemented similarly to \code{factor()}: its +storage type is an \code{integer()} that is a sequence along the total length +of the vctr and there are attributes that are required to resolve these +indices to an array + offset. Sequences typically have a very compact +representation in recent versions of R such that this has a cheap storage +footprint even for large arrays. The attributes are currently: +\itemize{ +\item \code{schema}: The \link[=as_nanoarrow_schema]{nanoarrow_schema} shared by each chunk. +\item \code{chunks}: A \code{list()} of \code{nanoarrow_array}. +\item \code{offsets}: An \code{integer()} vector beginning with \code{0} and followed by the +cumulative length of each chunk. This allows the chunk index + offset +to be resolved from a logical index with \code{log(n)} complexity. +} + +This implementation is preliminary and may change; however, the result of +\code{as_nanoarrow_array_stream(some_vctr[begin:end])} should remain stable. 
+} +\examples{ +array <- as_nanoarrow_array(1:5) +as_nanoarrow_vctr(array) + +} diff --git a/r/man/nanoarrow-package.Rd b/r/man/nanoarrow-package.Rd index 5a2fc6ec5..106c080a0 100644 --- a/r/man/nanoarrow-package.Rd +++ b/r/man/nanoarrow-package.Rd @@ -11,6 +11,7 @@ Provides an 'R' interface to the 'nanoarrow' 'C' library and the 'Apache Arrow' \seealso{ Useful links: \itemize{ + \item \url{https://arrow.apache.org/nanoarrow/latest/r/} \item \url{https://github.com/apache/arrow-nanoarrow} \item Report bugs at \url{https://github.com/apache/arrow-nanoarrow/issues} } diff --git a/r/src/convert.c b/r/src/convert.c index 2148421a3..e232d67d6 100644 --- a/r/src/convert.c +++ b/r/src/convert.c @@ -414,11 +414,7 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) { SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SEXP current_result = VECTOR_ELT(converter_shelter, 4); - // Materialize never called (e.g., empty stream) - if (current_result == R_NilValue) { - NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0)); - current_result = VECTOR_ELT(converter_shelter, 4); - } + NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(converter_xptr)); // Check result size. A future implementation could also shrink the length // or reallocate a shorter vector. 
@@ -437,15 +433,19 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) { SEXP nanoarrow_converter_release_result(SEXP converter_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + // PROTECT()ing here because we are about to release the object from the // shelter of the converter and return it SEXP result = PROTECT(VECTOR_ELT(converter_shelter, 4)); SET_VECTOR_ELT(converter_shelter, 4, R_NilValue); + + // Reset the converter state converter->dst.vec_sexp = R_NilValue; converter->dst.offset = 0; converter->dst.length = 0; converter->size = 0; converter->capacity = 0; + UNPROTECT(1); return result; } diff --git a/r/src/convert_array_stream.c b/r/src/convert_array_stream.c index b5315f388..4c5a16f52 100644 --- a/r/src/convert_array_stream.c +++ b/r/src/convert_array_stream.c @@ -26,6 +26,46 @@ #include "convert.h" #include "schema.h" +static int convert_next(SEXP converter_xptr, struct ArrowArrayStream* stream, + SEXP schema_xptr, int64_t* n_batches) { + SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr()); + struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr); + + // Fetch the next array + int result = ArrowArrayStreamGetNext(stream, array, NULL); + if (result != NANOARROW_OK) { + Rf_error("ArrowArrayStream::get_next(): %s", ArrowArrayStreamGetLastError(stream)); + } + + // Check if the stream is finished + if (array->release == NULL) { + UNPROTECT(1); + return 0; + } + + // Bump the batch counter + (*n_batches)++; + + // Set the schema of the allocated array and pass it to the converter + R_SetExternalPtrTag(array_xptr, schema_xptr); + if (nanoarrow_converter_set_array(converter_xptr, array_xptr) != NANOARROW_OK) { + nanoarrow_converter_stop(converter_xptr); + } + + // After set_array, the converter is responsible for the array_xptr + UNPROTECT(1); + + // Materialize the array into the converter + int64_t n_materialized = + 
nanoarrow_converter_materialize_n(converter_xptr, array->length); + if (n_materialized != array->length) { + Rf_error("Expected to materialize %ld values in batch %ld but materialized %ld", + (long)array->length, (long)(*n_batches), (long)n_materialized); + } + + return 1; +} + SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp, SEXP size_sexp, SEXP n_sexp) { struct ArrowArrayStream* array_stream = @@ -58,49 +98,18 @@ SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp, nanoarrow_converter_stop(converter_xptr); } - SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr()); - struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr); - int64_t n_batches = 0; - int64_t n_materialized = 0; - if (n > 0) { - result = ArrowArrayStreamGetNext(array_stream, array, NULL); - n_batches++; - if (result != NANOARROW_OK) { - Rf_error("ArrowArrayStream::get_next(): %s", - ArrowArrayStreamGetLastError(array_stream)); + do { + if (n_batches >= n) { + break; } - - while (array->release != NULL) { - if (nanoarrow_converter_set_array(converter_xptr, array_xptr) != NANOARROW_OK) { - nanoarrow_converter_stop(converter_xptr); - } - - n_materialized = nanoarrow_converter_materialize_n(converter_xptr, array->length); - if (n_materialized != array->length) { - Rf_error("Expected to materialize %ld values in batch %ld but materialized %ld", - (long)array->length, (long)n_batches, (long)n_materialized); - } - - if (n_batches >= n) { - break; - } - - array->release(array); - result = ArrowArrayStreamGetNext(array_stream, array, NULL); - n_batches++; - if (result != NANOARROW_OK) { - Rf_error("ArrowArrayStream::get_next(): %s", - ArrowArrayStreamGetLastError(array_stream)); - } - } - } + } while (convert_next(converter_xptr, array_stream, schema_xptr, &n_batches)); if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } SEXP result_sexp = 
PROTECT(nanoarrow_converter_release_result(converter_xptr)); - UNPROTECT(4); + UNPROTECT(3); return result_sexp; } diff --git a/r/src/init.c b/r/src/init.c index 913ea77af..69c943911 100644 --- a/r/src/init.c +++ b/r/src/init.c @@ -91,6 +91,9 @@ extern SEXP nanoarrow_c_schema_set_dictionary(SEXP schema_mut_xptr, SEXP diction extern SEXP nanoarrow_c_preserved_count(void); extern SEXP nanoarrow_c_preserved_empty(void); extern SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj); +extern SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list); +extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp); +extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp); extern SEXP nanoarrow_c_version(void); extern SEXP nanoarrow_c_version_runtime(void); @@ -165,6 +168,9 @@ static const R_CallMethodDef CallEntries[] = { {"nanoarrow_c_preserved_empty", (DL_FUNC)&nanoarrow_c_preserved_empty, 0}, {"nanoarrow_c_preserve_and_release_on_other_thread", (DL_FUNC)&nanoarrow_c_preserve_and_release_on_other_thread, 1}, + {"nanoarrow_c_vctr_chunk_offsets", (DL_FUNC)&nanoarrow_c_vctr_chunk_offsets, 1}, + {"nanoarrow_c_vctr_chunk_resolve", (DL_FUNC)&nanoarrow_c_vctr_chunk_resolve, 2}, + {"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1}, {"nanoarrow_c_version", (DL_FUNC)&nanoarrow_c_version, 0}, {"nanoarrow_c_version_runtime", (DL_FUNC)&nanoarrow_c_version_runtime, 0}, {NULL, NULL, 0}}; diff --git a/r/src/materialize.c b/r/src/materialize.c index c2bca993b..45d6950de 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -19,8 +19,9 @@ #include #include +#include "array.h" #include "nanoarrow.h" - +#include "nanoarrow/r.h" #include "util.h" // Needed for the list_of materializer @@ -108,6 +109,10 @@ int nanoarrow_ptype_is_data_frame(SEXP ptype) { (Rf_xlength(ptype) > 0 && has_attrib_safe(ptype, R_NamesSymbol))); } +int nanoarrow_ptype_is_nanoarrow_vctr(SEXP ptype) { + return Rf_inherits(ptype, "nanoarrow_vctr"); +} + SEXP 
nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) { SEXP result; @@ -122,7 +127,27 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) { } } - if (nanoarrow_ptype_is_data_frame(ptype)) { + if (nanoarrow_ptype_is_nanoarrow_vctr(ptype)) { + // The object we return here is one that will accumulate chunks and + // be finalized with a value (rather than being strictly copied into + // after every new chunk is seen). + result = PROTECT(Rf_allocVector(INTSXP, len)); + Rf_copyMostAttrib(ptype, result); + + // For the purposes of building the list of chunks, chunks is a pairlist + // (it will be converted to a regular list when this converter is finalized) + // Technically the first value here won't be used (this simplifies the + // appending). + SEXP chunks_list = PROTECT(Rf_list1(R_NilValue)); + + // To start, the chunks list and the end of the chunks list are the same node + SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); + SEXP chunks_sym = PROTECT(Rf_install("chunks")); + Rf_setAttrib(result, chunks_sym, chunks_list); + Rf_setAttrib(result, chunks_tail_sym, chunks_list); + + UNPROTECT(3); + } else if (nanoarrow_ptype_is_data_frame(ptype)) { R_xlen_t num_cols = Rf_xlength(ptype); result = PROTECT(Rf_allocVector(VECSXP, num_cols)); for (R_xlen_t i = 0; i < num_cols; i++) { @@ -207,6 +232,10 @@ static void fill_vec_with_nulls(SEXP x, R_xlen_t offset, R_xlen_t len) { } static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) { + if (nanoarrow_ptype_is_nanoarrow_vctr(dst)) { + Rf_error("Can't copy_vec_into() to nanoarrow_vctr"); + } + if (nanoarrow_ptype_is_data_frame(dst)) { if (!nanoarrow_ptype_is_data_frame(x)) { Rf_error("Expected record-style vctr result but got non-record-style result"); @@ -270,6 +299,107 @@ static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) { } } +int nanoarrow_materialize_finalize_result(SEXP converter_xptr) { + SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + 
SEXP result = VECTOR_ELT(converter_shelter, 4); + + // Materialize never called (e.g., empty stream) + if (result == R_NilValue) { + NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0)); + result = VECTOR_ELT(converter_shelter, 4); + } + + if (nanoarrow_ptype_is_nanoarrow_vctr(result)) { + // Get the schema for this converter. Technically this will overwrite + // a schema that was provided explicitly; however, we currently do not + // handle that case. + + SEXP schema_xptr = VECTOR_ELT(converter_shelter, 1); + + // We no longer need to keep track of chunks_tail + SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); + Rf_setAttrib(result, chunks_tail_sym, R_NilValue); + + // We also want to pass on the class of the ptype we received + SEXP subclass_sexp = Rf_getAttrib(result, R_ClassSymbol); + + // We no longer need the first element of the pairlist, which was + // intentionally set to R_NilValue. + SEXP chunks_sym = PROTECT(Rf_install("chunks")); + SEXP chunks_pairlist0 = Rf_getAttrib(result, chunks_sym); + + // If there were zero chunks, there will be no "first" node + SEXP chunks_list; + if (CDR(chunks_pairlist0) == R_NilValue) { + chunks_list = PROTECT(Rf_allocVector(VECSXP, 0)); + } else { + chunks_list = PROTECT(Rf_PairToVectorList(CDR(chunks_pairlist0))); + } + + // Set up the call to new_nanoarrow_vctr + SEXP new_nanoarrow_vctr_sym = PROTECT(Rf_install("new_nanoarrow_vctr")); + SEXP new_nanoarrow_vctr_call = PROTECT( + Rf_lang4(new_nanoarrow_vctr_sym, chunks_list, schema_xptr, subclass_sexp)); + SEXP final_result = PROTECT(Rf_eval(new_nanoarrow_vctr_call, nanoarrow_ns_pkg)); + + SET_VECTOR_ELT(converter_shelter, 4, final_result); + UNPROTECT(6); + } else if (nanoarrow_ptype_is_data_frame(result)) { + // For each child, finalize the result and then reassign it + SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3); + for (R_xlen_t i = 0; i < Rf_xlength(child_converter_xptrs); i++) { + SEXP child_converter_xptr = 
VECTOR_ELT(child_converter_xptrs, i); + NANOARROW_RETURN_NOT_OK( + nanoarrow_materialize_finalize_result(child_converter_xptr)); + + SEXP child_result = + PROTECT(nanoarrow_converter_release_result(child_converter_xptr)); + SET_VECTOR_ELT(result, i, child_result); + UNPROTECT(1); + } + } + + return NANOARROW_OK; +} + +static int nanoarrow_materialize_nanoarrow_vctr(struct RConverter* converter, + SEXP converter_xptr) { + // This is a case where the callee needs ownership, which we can do via a + // shallow copy. + SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + SEXP schema_xptr = VECTOR_ELT(converter_shelter, 1); + SEXP array_xptr = VECTOR_ELT(converter_shelter, 2); + + SEXP array_out_xptr = PROTECT(nanoarrow_array_owning_xptr()); + struct ArrowArray* out_array = nanoarrow_output_array_from_xptr(array_out_xptr); + array_export(array_xptr, out_array); + R_SetExternalPtrTag(array_out_xptr, schema_xptr); + + // Update the offset/length in case a slice is being requested from the + // converter. + out_array->offset += converter->src.offset; + out_array->length = converter->src.length; + + // Get the cached copy of the pairlist node at the end of the current + // chunks list. + SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail")); + SEXP chunks_tail = PROTECT(Rf_getAttrib(converter->dst.vec_sexp, chunks_tail_sym)); + + // Create a length-1 pairlist node containing the chunk + SEXP next_sexp = PROTECT(Rf_cons(array_out_xptr, R_NilValue)); + + // Append it to the end of the current pairlist + SETCDR(chunks_tail, next_sexp); + UNPROTECT(1); + + // Update the cached copy of the pairlist node at the end of the current + // chunks list. 
+ Rf_setAttrib(converter->dst.vec_sexp, chunks_tail_sym, next_sexp); + UNPROTECT(3); + + return NANOARROW_OK; +} + static int nanoarrow_materialize_other(struct RConverter* converter, SEXP converter_xptr) { // Ensure that we have a ptype SEXP to send in the call back to R @@ -280,20 +410,19 @@ static int nanoarrow_materialize_other(struct RConverter* converter, UNPROTECT(1); } - // A unique situation where we don't want owning external pointers because we know - // these are protected for the duration of our call into R and because we don't want - // the underlying array to be released and invalidate the converter. The R code in - // convert_fallback_other() takes care of ensuring an independent copy with the correct - // offset/length. - SEXP schema_xptr = PROTECT(R_MakeExternalPtr( - (struct ArrowSchema*)converter->schema_view.schema, R_NilValue, R_NilValue)); - Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); - // We do need to set the protected member of the array external pointer to signal that - // it is not an independent array (i.e., force a shallow copy). - SEXP array_xptr = PROTECT(R_MakeExternalPtr( - (struct ArrowArray*)converter->array_view.array, schema_xptr, converter_xptr)); - Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array); + // Special-case the nanoarrow_vctr conversion + if (Rf_inherits(converter->dst.vec_sexp, "nanoarrow_vctr")) { + return nanoarrow_materialize_nanoarrow_vctr(converter, converter_xptr); + } + + // We've ensured proper ownership of array_xptr and ensured that its + // schema is set, so we can pass these safely to the R-level + // convert_fallback_other. + SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); + SEXP array_xptr = VECTOR_ELT(converter_shelter, 2); + // The R code in convert_fallback_other() takes care of ensuring an independent copy + // with the correct offset/length if it is necessary to update them. 
SEXP offset_sexp = PROTECT( Rf_ScalarReal((double)(converter->src.array_view->offset + converter->src.offset))); SEXP length_sexp = PROTECT(Rf_ScalarReal((double)converter->src.length)); @@ -307,7 +436,7 @@ static int nanoarrow_materialize_other(struct RConverter* converter, copy_vec_into(result_src, converter->dst.vec_sexp, converter->dst.offset, converter->dst.length); - UNPROTECT(7); + UNPROTECT(5); return NANOARROW_OK; } diff --git a/r/src/materialize.h b/r/src/materialize.h index a8f36cc90..c3b2c5cbd 100644 --- a/r/src/materialize.h +++ b/r/src/materialize.h @@ -42,4 +42,8 @@ int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr); SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len); SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len); +// Finalize an object before returning to R. Currently only used for +// nanoarrow_vctr conversion. +int nanoarrow_materialize_finalize_result(SEXP converter_xptr); + #endif diff --git a/r/src/vctr.c b/r/src/vctr.c new file mode 100644 index 000000000..e03fdfeec --- /dev/null +++ b/r/src/vctr.c @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#define R_NO_REMAP +#include +#include + +#include "nanoarrow.h" +#include "nanoarrow/r.h" + +SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list) { + int num_chunks = Rf_length(array_list); + SEXP offsets_sexp = PROTECT(Rf_allocVector(INTSXP, num_chunks + 1)); + int* offsets = INTEGER(offsets_sexp); + offsets[0] = 0; + int64_t cumulative_offset = 0; + + struct ArrowArray* array; + for (int i = 0; i < num_chunks; i++) { + array = nanoarrow_array_from_xptr(VECTOR_ELT(array_list, i)); + cumulative_offset += array->length; + if (cumulative_offset > INT_MAX) { + Rf_error("Can't build nanoarrow_vctr with length > INT_MAX"); // # nocov + } + + offsets[i + 1] = cumulative_offset; + } + + UNPROTECT(1); + return offsets_sexp; +} + +SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp) { + int* offsets = INTEGER(offsets_sexp); + int n_offsets = Rf_length(offsets_sexp); + int last_offset = offsets[n_offsets - 1]; + + int n = Rf_length(indices_sexp); + SEXP chunk_indices_sexp = PROTECT(Rf_allocVector(INTSXP, n)); + int* chunk_indices = INTEGER(chunk_indices_sexp); + + int buf[1024]; + for (int i = 0; i < n; i++) { + if (i % 1024 == 0) { + INTEGER_GET_REGION(indices_sexp, i, 1024, buf); + } + int index0 = buf[i % 1024]; + + if (index0 < 0 || index0 > last_offset) { + chunk_indices[i] = NA_INTEGER; + } else { + chunk_indices[i] = ArrowResolveChunk32(index0, offsets, 0, n_offsets); + } + } + + UNPROTECT(1); + return chunk_indices_sexp; +} + +SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp) { + if (TYPEOF(indices_sexp) != INTSXP) { + return R_NilValue; + } + SEXP slice_sexp = PROTECT(Rf_allocVector(INTSXP, 2)); + int* slice = INTEGER(slice_sexp); + + int n = Rf_length(indices_sexp); + slice[1] = n; + + if (n == 1) { + slice[0] = INTEGER_ELT(indices_sexp, 0); + UNPROTECT(1); + return slice_sexp; + } else if (n == 0) { + slice[0] = NA_INTEGER; + UNPROTECT(1); + return slice_sexp; + } + + // It may be possible to check for the R ALTREP sequence type, + // 
which would eliminate the need for the below check for + // sequential values. + + int buf[1024]; + INTEGER_GET_REGION(indices_sexp, 0, 1024, buf); + slice[0] = buf[0]; + + int last_value = buf[0]; + int this_value = 0; + + for (int i = 1; i < n; i++) { + if (i % 1024 == 0) { + INTEGER_GET_REGION(indices_sexp, i, 1024, buf); + } + + this_value = buf[i % 1024]; + if ((this_value - last_value) != 1) { + UNPROTECT(1); + return R_NilValue; + } + + last_value = this_value; + } + + UNPROTECT(1); + return slice_sexp; +} diff --git a/r/tests/testthat/test-convert-array.R b/r/tests/testthat/test-convert-array.R index 5731ce108..17d192fdd 100644 --- a/r/tests/testthat/test-convert-array.R +++ b/r/tests/testthat/test-convert-array.R @@ -160,6 +160,121 @@ test_that("convert to vector works for tibble", { ) }) +test_that("convert to vector works for nanoarrow_vctr()", { + array <- as_nanoarrow_array(c("one", "two", "three")) + + # Check implicit/inferred nanoarrow_vctr() schema + vctr <- convert_array(array, nanoarrow_vctr()) + expect_s3_class(vctr, "nanoarrow_vctr") + expect_length(vctr, 3) + schema <- infer_nanoarrow_schema(vctr) + expect_identical(schema$format, "u") + + # Check with explicit schema of the correct type + vctr <- convert_array(array, nanoarrow_vctr(na_string())) + expect_s3_class(vctr, "nanoarrow_vctr") + expect_length(vctr, 3) + schema <- infer_nanoarrow_schema(vctr) + expect_identical(schema$format, "u") + + # Check conversion of a struct array + df <- data.frame(x = c("one", "two", "three")) + array <- as_nanoarrow_array(df) + + vctr <- convert_array(array, nanoarrow_vctr()) + expect_s3_class(vctr, "nanoarrow_vctr") + expect_length(vctr, 3) + schema <- infer_nanoarrow_schema(vctr) + expect_identical(schema$format, "+s") + + vctr <- convert_array(array, nanoarrow_vctr(na_struct(list(x = na_string())))) + expect_s3_class(vctr, "nanoarrow_vctr") + expect_length(vctr, 3) + schema <- infer_nanoarrow_schema(vctr) + expect_identical(schema$format, "+s") +}) + 
# Tests for batched (stream) and nested conversion to nanoarrow_vctr().
# Formatting restored from the flattened diff hunk; code tokens unchanged.

test_that("batched convert to vector works for nanoarrow_vctr()", {
  # A zero-chunk stream must still yield a valid zero-length vctr that
  # remembers the stream's schema.
  empty_stream <- basic_array_stream(list(), schema = na_string())
  empty_vctr <- convert_array_stream(empty_stream, nanoarrow_vctr())
  expect_length(empty_vctr, 0)
  expect_identical(infer_nanoarrow_schema(empty_vctr)$format, "u")

  stream1 <- basic_array_stream(list(c("one", "two", "three")))
  vctr1 <- convert_array_stream(stream1, nanoarrow_vctr())
  expect_length(vctr1, 3)

  # Multiple chunks: lengths accumulate across batches, and round-tripping
  # through a stream restores the original values in order.
  stream2 <- basic_array_stream(
    list(c("one", "two", "three"), c("four", "five", "six", "seven"))
  )
  vctr2 <- convert_array_stream(stream2, nanoarrow_vctr())
  expect_length(vctr2, 7)
  expect_identical(
    convert_array_stream(as_nanoarrow_array_stream(vctr2)),
    c("one", "two", "three", "four", "five", "six", "seven")
  )
})

test_that("convert to vector works for data.frame(nanoarrow_vctr())", {
  # A nanoarrow_vctr ptype can appear as a column in a data.frame ptype.
  array <- as_nanoarrow_array(data.frame(x = 1:5))
  df_vctr <- convert_array(array, data.frame(x = nanoarrow_vctr()))
  expect_s3_class(df_vctr$x, "nanoarrow_vctr")
  expect_identical(
    convert_array_stream(as_nanoarrow_array_stream(df_vctr$x)),
    1:5
  )
})

test_that("convert to vector works for list_of(nanoarrow_vctr())", {
  skip_if_not_installed("arrow")
  skip_if_not_installed("vctrs")

  array <- as_nanoarrow_array(
    list(1:5, 6:10, NULL, 11:13),
    schema = na_list(na_int32())
  )

  list_vctr <- convert_array(array, vctrs::list_of(nanoarrow_vctr()))

  # Each item in the list should be a vctr with one chunk that is a slice
  # of the original array
  expect_s3_class(list_vctr[[1]], "nanoarrow_vctr")
  vctr_array <- attr(list_vctr[[1]], "chunks")[[1]]
  expect_identical(vctr_array$offset, 0L)
  expect_identical(vctr_array$length, 5L)
  expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:5)

  expect_s3_class(list_vctr[[2]], "nanoarrow_vctr")
  vctr_array <- attr(list_vctr[[2]], "chunks")[[1]]
  expect_identical(vctr_array$offset, 5L)
  expect_identical(vctr_array$length, 5L)
  # NOTE(review): the chunk shares the parent array's data buffer, so reading
  # the whole buffer yields all ten values even though this slice is 6:10.
  expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:10)

  # A NULL list element converts to NULL, not to an empty vctr.
  expect_null(list_vctr[[3]])

  expect_s3_class(list_vctr[[4]], "nanoarrow_vctr")
  vctr_array <- attr(list_vctr[[4]], "chunks")[[1]]
  expect_identical(vctr_array$offset, 10L)
  expect_identical(vctr_array$length, 3L)
  expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:13)
})

test_that("batched convert to vector works for nanoarrow_vctr() keeps subclass", {
  # A subclassed vctr ptype must survive conversion for zero, one, and
  # multiple input batches.
  vctr_ptype <- nanoarrow_vctr(subclass = "some_subclass")

  empty_stream <- basic_array_stream(list(), schema = na_string())
  empty_vctr <- convert_array_stream(empty_stream, vctr_ptype)
  expect_s3_class(empty_vctr, "some_subclass")

  stream1 <- basic_array_stream(list(c("")))
  vctr1 <- convert_array_stream(stream1, vctr_ptype)
  expect_s3_class(vctr1, "some_subclass")

  stream2 <- basic_array_stream(list(c(""), c("")))
  vctr2 <- convert_array_stream(stream2, vctr_ptype)
  expect_s3_class(vctr2, "some_subclass")
})

test_that("extensions can infer a schema of a nanoarrow_vctr() subclass", {
  # Register a throwaway extension whose spec dispatches to a method defined
  # below; unregister on exit so other tests are unaffected.
  register_nanoarrow_extension(
    "some_ext",
    nanoarrow_extension_spec(subclass = "vctr_spec_class")
  )
  on.exit(unregister_nanoarrow_extension("some_ext"))

  infer_nanoarrow_ptype_extension.vctr_spec_class <- function(spec, x, ...) {
    nanoarrow_vctr(subclass = "some_vctr_subclass")
  }

  # Register the S3 method at runtime since it is defined inside the test.
  s3_register(
    "nanoarrow::infer_nanoarrow_ptype_extension",
    "vctr_spec_class",
    infer_nanoarrow_ptype_extension.vctr_spec_class
  )

  expect_identical(
    infer_nanoarrow_ptype(na_extension(na_string(), "some_ext")),
    nanoarrow_vctr(subclass = "some_vctr_subclass")
  )

  ext_array <- nanoarrow_extension_array(c("one", "two", "three"), "some_ext")
  vctr <- convert_array(ext_array)
  expect_s3_class(vctr, "some_vctr_subclass")

  # Ensure that registering a default conversion that returns a nanoarrow_vctr
  # does not result in infinite recursion when printing or formatting it.
  # An extension that does this should provide these methods for the subclass
  # they return.
  expect_length(format(vctr), length(vctr))
  expect_output(
    expect_identical(print(vctr), vctr),
    "some_vctr_subclass"
  )
})
+ +test_that("as_nanoarrow_vctr() works for basic input", { + array <- as_nanoarrow_array(c("one", "two")) + vctr <- as_nanoarrow_vctr(array) + expect_identical(as.integer(unclass(vctr)), 1:2) + expect_identical(as_nanoarrow_vctr(vctr), vctr) + + expect_identical(infer_nanoarrow_schema(vctr)$format, "u") + expect_identical(as_nanoarrow_schema(vctr)$format, "u") +}) + +test_that("print() and str() work on empty nanoarrow_vctr", { + vctr <- nanoarrow_vctr() + expect_identical( + expect_output(print(vctr),">"), + vctr + ) + + expect_identical( + expect_output(str(vctr), ">"), + vctr + ) + + vctr <- nanoarrow_vctr(na_int32()) + expect_identical( + expect_output(print(vctr), "^ 1 + // (lo < hi is guaranteed by the precondition). + NANOARROW_DCHECK(n > 1); + do { + const int32_t m = n >> 1; + const int32_t mid = lo + m; + if (index >= offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } while (n > 1); + return lo; +} + static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { int64_t doubled_capacity = current_capacity * 2; if (doubled_capacity > new_capacity) { diff --git a/src/nanoarrow/utils_test.cc b/src/nanoarrow/utils_test.cc index d73d18a38..e2eef993e 100644 --- a/src/nanoarrow/utils_test.cc +++ b/src/nanoarrow/utils_test.cc @@ -555,6 +555,18 @@ TEST(UtilsTest, ArrowResolveChunk64Test) { EXPECT_EQ(ArrowResolveChunk64(5, offsets, 0, n_offsets), 2); } +TEST(UtilsTest, ArrowResolveChunk32Test) { + int32_t offsets[] = {0, 2, 3, 6}; + int32_t n_offsets = 4; + + EXPECT_EQ(ArrowResolveChunk32(0, offsets, 0, n_offsets), 0); + EXPECT_EQ(ArrowResolveChunk32(1, offsets, 0, n_offsets), 0); + EXPECT_EQ(ArrowResolveChunk32(2, offsets, 0, n_offsets), 1); + EXPECT_EQ(ArrowResolveChunk32(3, offsets, 0, n_offsets), 2); + EXPECT_EQ(ArrowResolveChunk32(4, offsets, 0, n_offsets), 2); + EXPECT_EQ(ArrowResolveChunk32(5, offsets, 0, n_offsets), 2); +} + TEST(MaybeTest, ConstructionAndConversion) { using nanoarrow::NA; using 
nanoarrow::internal::Maybe;