diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 5468a95e0ab..74ed8f40e1e 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -54,7 +54,9 @@ Collate: 'arrow-package.R' 'type.R' 'array-data.R' + 'arrow-datum.R' 'array.R' + 'arrow-tabular.R' 'arrowExports.R' 'buffer.R' 'chunked-array.R' @@ -86,6 +88,7 @@ Collate: 'list.R' 'memory-pool.R' 'message.R' + 'metadata.R' 'parquet.R' 'python.R' 'record-batch-reader.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index fdc84aa5189..491bfe6ee6e 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -1,85 +1,57 @@ # Generated by roxygen2: do not edit by hand S3method("!=",ArrowObject) -S3method("$",RecordBatch) +S3method("$",ArrowTabular) S3method("$",Schema) S3method("$",SubTreeFileSystem) -S3method("$",Table) -S3method("$<-",Table) +S3method("$<-",ArrowTabular) S3method("==",ArrowObject) -S3method("[",Array) -S3method("[",ChunkedArray) +S3method("[",ArrowDatum) +S3method("[",ArrowTabular) S3method("[",Dataset) -S3method("[",RecordBatch) S3method("[",Schema) -S3method("[",Table) S3method("[",arrow_dplyr_query) -S3method("[[",RecordBatch) +S3method("[[",ArrowTabular) S3method("[[",Schema) -S3method("[[",Table) -S3method("[[<-",Table) -S3method("names<-",RecordBatch) -S3method("names<-",Table) -S3method(Ops,Array) -S3method(Ops,ChunkedArray) +S3method("[[<-",ArrowTabular) +S3method("names<-",ArrowTabular) +S3method(Ops,ArrowDatum) S3method(Ops,Expression) S3method(Ops,array_expression) S3method(all,equal.ArrowObject) -S3method(as.character,Array) -S3method(as.character,ChunkedArray) +S3method(as.character,ArrowDatum) S3method(as.character,FileFormat) -S3method(as.character,Scalar) -S3method(as.data.frame,RecordBatch) -S3method(as.data.frame,Table) +S3method(as.data.frame,ArrowTabular) S3method(as.data.frame,arrow_dplyr_query) -S3method(as.double,Array) -S3method(as.double,ChunkedArray) -S3method(as.double,Scalar) -S3method(as.integer,Array) -S3method(as.integer,ChunkedArray) -S3method(as.integer,Scalar) -S3method(as.list,RecordBatch) +S3method(as.double,ArrowDatum) +S3method(as.integer,ArrowDatum) +S3method(as.list,ArrowTabular) S3method(as.list,Schema) -S3method(as.list,Table) S3method(as.raw,Buffer) -S3method(as.vector,Array) -S3method(as.vector,ChunkedArray) -S3method(as.vector,Scalar) +S3method(as.vector,ArrowDatum) S3method(as.vector,array_expression) S3method(c,Dataset) +S3method(dim,ArrowTabular) S3method(dim,Dataset) -S3method(dim,RecordBatch) -S3method(dim,Table) S3method(dim,arrow_dplyr_query) -S3method(dimnames,RecordBatch) -S3method(dimnames,Table) -S3method(head,Array) -S3method(head,ChunkedArray) +S3method(dimnames,ArrowTabular) +S3method(head,ArrowDatum) +S3method(head,ArrowTabular) S3method(head,Dataset) -S3method(head,RecordBatch) -S3method(head,Table) S3method(head,arrow_dplyr_query) -S3method(is.na,Array) -S3method(is.na,ChunkedArray) +S3method(is.na,ArrowDatum) S3method(is.na,Expression) S3method(is.na,Scalar) S3method(is.na,array_expression) -S3method(is.nan,Array) -S3method(is.nan,ChunkedArray) -S3method(length,Array) -S3method(length,ChunkedArray) +S3method(is.nan,ArrowDatum) +S3method(length,ArrowDatum) S3method(length,Scalar) S3method(length,Schema) -S3method(match_arrow,Array) -S3method(match_arrow,ChunkedArray) +S3method(match_arrow,ArrowDatum) S3method(match_arrow,default) -S3method(max,Array) -S3method(max,ChunkedArray) -S3method(mean,Array) -S3method(mean,ChunkedArray) -S3method(mean,Scalar) -S3method(min,Array) -S3method(min,ChunkedArray) +S3method(max,ArrowDatum) +S3method(mean,ArrowDatum) +S3method(min,ArrowDatum) S3method(names,Dataset) S3method(names,FeatherReader) S3method(names,RecordBatch) @@ -96,22 +68,15 @@ S3method(print,arrow_r_metadata) S3method(read_message,InputStream) S3method(read_message,MessageReader) S3method(read_message,default) -S3method(row.names,RecordBatch) -S3method(row.names,Table) -S3method(sum,Array) -S3method(sum,ChunkedArray) -S3method(sum,Scalar) -S3method(tail,Array) -S3method(tail,ChunkedArray) +S3method(row.names,ArrowTabular) +S3method(sum,ArrowDatum) +S3method(tail,ArrowDatum) +S3method(tail,ArrowTabular) S3method(tail,Dataset) -S3method(tail,RecordBatch) -S3method(tail,Table) S3method(tail,arrow_dplyr_query) -S3method(type,Array) -S3method(type,ChunkedArray) +S3method(type,ArrowDatum) S3method(type,default) -S3method(unique,Array) -S3method(unique,ChunkedArray) +S3method(unique,ArrowDatum) S3method(vec_ptype_abbr,arrow_fixed_size_binary) S3method(vec_ptype_abbr,arrow_fixed_size_list) S3method(vec_ptype_abbr,arrow_large_list) diff --git a/r/R/array.R b/r/R/array.R index a8c2cbd8f29..1c13c320094 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -#' @include arrow-package.R +#' @include arrow-datum.R #' @title Arrow Arrays #' @description An `Array` is an immutable data array with some logical type @@ -83,7 +83,7 @@ #' @name array #' @export Array <- R6Class("Array", - inherit = ArrowObject, + inherit = ArrowDatum, public = list( IsNull = function(i) Array__IsNull(self, i), IsValid = function(i) Array__IsValid(self, i), @@ -128,10 +128,6 @@ Array <- R6Class("Array", assert_is(other, "Array") Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) }, - cast = function(target_type, safe = TRUE, options = cast_options(safe)) { - assert_is(options, "CastOptions") - Array$create(Array__cast(self, as_type(target_type), options)) - }, View = function(type) { Array$create(Array__View(self, as_type(type))) }, @@ -241,107 +237,6 @@ FixedSizeListArray <- R6Class("FixedSizeListArray", inherit = Array, ) ) -#' @export -length.Array <- function(x) x$length() - -#' @export -is.na.Array <- function(x) call_function("is_null", x) - -#' @export -is.nan.Array <- function(x) call_function("is_nan", x) - -#' @export -as.vector.Array <- function(x, mode) x$as_vector() - -filter_rows <- function(x, i, keep_na = TRUE, ...) { - # General purpose function for [ row subsetting with R semantics - # Based on the input for `i`, calls x$Filter, x$Slice, or x$Take - nrows <- x$num_rows %||% x$length() # Depends on whether Array or Table-like - if (inherits(i, "array_expression")) { - # Evaluate it - i <- eval_array_expression(i) - } - if (is.logical(i)) { - if (isTRUE(i)) { - # Shortcut without doing any work - x - } else { - i <- rep_len(i, nrows) # For R recycling behavior; consider vctrs::vec_recycle() - x$Filter(i, keep_na) - } - } else if (is.numeric(i)) { - if (all(i < 0)) { - # in R, negative i means "everything but i" - i <- setdiff(seq_len(nrows), -1 * i) - } - if (is.sliceable(i)) { - x$Slice(i[1] - 1, length(i)) - } else if (all(i > 0)) { - x$Take(i - 1) - } else { - stop("Cannot mix positive and negative indices", call. = FALSE) - } - } else if (is.Array(i, INTEGER_TYPES)) { - # NOTE: this doesn't do the - 1 offset - x$Take(i) - } else if (is.Array(i, "bool")) { - x$Filter(i, keep_na) - } else { - # Unsupported cases - if (is.Array(i)) { - stop("Cannot extract rows with an Array of type ", i$type$ToString(), call. = FALSE) - } - stop("Cannot extract rows with an object of class ", class(i), call.=FALSE) - } -} - -#' @export -`[.Array` <- filter_rows - -#' @importFrom utils head -#' @export -head.Array <- function(x, n = 6L, ...) { - assert_is(n, c("numeric", "integer")) - assert_that(length(n) == 1) - len <- NROW(x) - if (n < 0) { - # head(x, negative) means all but the last n rows - n <- max(len + n, 0) - } else { - n <- min(len, n) - } - if (n == len) { - return(x) - } - x$Slice(0, n) -} - -#' @importFrom utils tail -#' @export -tail.Array <- function(x, n = 6L, ...) { - assert_is(n, c("numeric", "integer")) - assert_that(length(n) == 1) - len <- NROW(x) - if (n < 0) { - # tail(x, negative) means all but the first n rows - n <- min(-n, len) - } else { - n <- max(len - n, 0) - } - if (n == 0) { - return(x) - } - x$Slice(n) -} - -is.sliceable <- function(i) { - # Determine whether `i` can be expressed as a $Slice() command - is.numeric(i) && - length(i) > 0 && - all(i > 0) && - identical(as.integer(i), i[1]:i[length(i)]) -} - is.Array <- function(x, type = NULL) { is_it <- inherits(x, c("Array", "ChunkedArray")) if (is_it && !is.null(type)) { @@ -349,12 +244,3 @@ is.Array <- function(x, type = NULL) { } is_it } - -#' @export -as.double.Array <- function(x, ...) as.double(as.vector(x), ...) - -#' @export -as.integer.Array <- function(x, ...) as.integer(as.vector(x), ...) - -#' @export -as.character.Array <- function(x, ...) as.character(as.vector(x), ...) diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R new file mode 100644 index 00000000000..f4d9ad346aa --- /dev/null +++ b/r/R/arrow-datum.R @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R + +# Base class for Array, ChunkedArray, and Scalar, for S3 method dispatch only. +# Does not exist in C++ class hierarchy +ArrowDatum <- R6Class("ArrowDatum", inherit = ArrowObject, + public = list( + cast = function(target_type, safe = TRUE, ...) { + opts <- cast_options(safe, ...) + opts$to_type <- as_type(target_type) + call_function("cast", self, options = opts) + } + ) +) + +#' @export +length.ArrowDatum <- function(x) x$length() + +#' @export +is.na.ArrowDatum <- function(x) call_function("is_null", x) + +#' @export +is.nan.ArrowDatum <- function(x) call_function("is_nan", x) + +#' @export +as.vector.ArrowDatum <- function(x, mode) x$as_vector() + +filter_rows <- function(x, i, keep_na = TRUE, ...) { + # General purpose function for [ row subsetting with R semantics + # Based on the input for `i`, calls x$Filter, x$Slice, or x$Take + nrows <- x$num_rows %||% x$length() # Depends on whether Array or Table-like + if (inherits(i, "array_expression")) { + # Evaluate it + i <- eval_array_expression(i) + } + if (is.logical(i)) { + if (isTRUE(i)) { + # Shortcut without doing any work + x + } else { + i <- rep_len(i, nrows) # For R recycling behavior; consider vctrs::vec_recycle() + x$Filter(i, keep_na) + } + } else if (is.numeric(i)) { + if (all(i < 0)) { + # in R, negative i means "everything but i" + i <- setdiff(seq_len(nrows), -1 * i) + } + if (is.sliceable(i)) { + x$Slice(i[1] - 1, length(i)) + } else if (all(i > 0)) { + x$Take(i - 1) + } else { + stop("Cannot mix positive and negative indices", call. = FALSE) + } + } else if (is.Array(i, INTEGER_TYPES)) { + # NOTE: this doesn't do the - 1 offset + x$Take(i) + } else if (is.Array(i, "bool")) { + x$Filter(i, keep_na) + } else { + # Unsupported cases + if (is.Array(i)) { + stop("Cannot extract rows with an Array of type ", i$type$ToString(), call. = FALSE) + } + stop("Cannot extract rows with an object of class ", class(i), call.=FALSE) + } +} + +#' @export +`[.ArrowDatum` <- filter_rows + +#' @importFrom utils head +#' @export +head.ArrowDatum <- function(x, n = 6L, ...) { + assert_is(n, c("numeric", "integer")) + assert_that(length(n) == 1) + len <- NROW(x) + if (n < 0) { + # head(x, negative) means all but the last n rows + n <- max(len + n, 0) + } else { + n <- min(len, n) + } + if (n == len) { + return(x) + } + x$Slice(0, n) +} + +#' @importFrom utils tail +#' @export +tail.ArrowDatum <- function(x, n = 6L, ...) { + assert_is(n, c("numeric", "integer")) + assert_that(length(n) == 1) + len <- NROW(x) + if (n < 0) { + # tail(x, negative) means all but the first n rows + n <- min(-n, len) + } else { + n <- max(len - n, 0) + } + if (n == 0) { + return(x) + } + x$Slice(n) +} + +is.sliceable <- function(i) { + # Determine whether `i` can be expressed as a $Slice() command + is.numeric(i) && + length(i) > 0 && + all(i > 0) && + identical(as.integer(i), i[1]:i[length(i)]) +} + +#' @export +as.double.ArrowDatum <- function(x, ...) as.double(as.vector(x), ...) + +#' @export +as.integer.ArrowDatum <- function(x, ...) as.integer(as.vector(x), ...) + +#' @export +as.character.ArrowDatum <- function(x, ...) as.character(as.vector(x), ...) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 540cbcd8645..fd3f8b47856 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -33,7 +33,7 @@ "group_vars", "ungroup", "mutate", "arrange", "rename", "pull" ) ) - for (cl in c("Dataset", "RecordBatch", "Table", "arrow_dplyr_query")) { + for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) { for (m in dplyr_methods) { s3_register(m, cl) } diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R new file mode 100644 index 00000000000..c0ac3df5c9a --- /dev/null +++ b/r/R/arrow-tabular.R @@ -0,0 +1,192 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-datum.R + +# Base class for RecordBatch and Table for S3 method dispatch only. +# Does not exist in C++ class hierarchy +ArrowTabular <- R6Class("ArrowTabular", inherit = ArrowObject, + public = list( + ToString = function() ToString_tabular(self), + Take = function(i) { + if (is.numeric(i)) { + i <- as.integer(i) + } + if (is.integer(i)) { + i <- Array$create(i) + } + assert_that(is.Array(i)) + call_function("take", self, i) + }, + Filter = function(i, keep_na = TRUE) { + if (is.logical(i)) { + i <- Array$create(i) + } + assert_that(is.Array(i, "bool")) + call_function("filter", self, i, options = list(keep_na = keep_na)) + } + ) +) + +#' @export +as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { + df <- x$to_data_frame() + if (!is.null(r_metadata <- x$metadata$r)) { + df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata)) + } + df +} + +#' @export +`names<-.ArrowTabular` <- function(x, value) x$RenameColumns(value) + +#' @importFrom methods as +#' @export +`[.ArrowTabular` <- function(x, i, j, ..., drop = FALSE) { + if (nargs() == 2L) { + # List-like column extraction (x[i]) + return(x[, i]) + } + if (!missing(j)) { + # Selecting columns is cheaper than filtering rows, so do it first. + # That way, if we're filtering too, we have fewer arrays to filter/slice/take + if (is_integerish(j)) { + if (all(j < 0)) { + # in R, negative j means "everything but j" + j <- setdiff(seq_len(x$num_columns), -1 * j) + } + x <- x$SelectColumns(as.integer(j) - 1L) + } else if (is.character(j)) { + x <- x$SelectColumns(match(j, names(x)) - 1L) + } + + if (drop && ncol(x) == 1L) { + x <- x$column(0) + } + } + if (!missing(i)) { + x <- filter_rows(x, i, ...) + } + x +} + +#' @export +`[[.ArrowTabular` <- function(x, i, ...) { + if (is.character(i)) { + x$GetColumnByName(i) + } else if (is.numeric(i)) { + x$column(i - 1) + } else { + stop("'i' must be character or numeric, not ", class(i), call. = FALSE) + } +} + +#' @export +`$.ArrowTabular` <- function(x, name, ...) { + assert_that(is.string(name)) + if (name %in% ls(x)) { + get(name, x) + } else { + x$GetColumnByName(name) + } +} + +#' @export +`[[<-.ArrowTabular` <- function(x, i, value) { + if (!is.character(i) & !is.numeric(i)) { + stop("'i' must be character or numeric, not ", class(i), call. = FALSE) + } + assert_that(length(i) == 1, !is.na(i)) + + if (is.null(value)) { + if (is.character(i)) { + i <- match(i, names(x)) + } + x <- x$RemoveColumn(i - 1L) + } else { + if (!is.character(i)) { + # get or create a/the column name + if (i <= x$num_columns) { + i <- names(x)[i] + } else { + i <- as.character(i) + } + } + + # auto-magic recycling on non-ArrowObjects + if (!inherits(value, "ArrowObject")) { + value <- vctrs::vec_recycle(value, x$num_rows) + } + + # construct the field + if (inherits(x, "RecordBatch") && !inherits(value, "Array")) { + value <- Array$create(value) + } else if (inherits(x, "Table") && !inherits(value, "ChunkedArray")) { + value <- ChunkedArray$create(value) + } + new_field <- field(i, value$type) + + if (i %in% names(x)) { + i <- match(i, names(x)) - 1L + x <- x$SetColumn(i, new_field, value) + } else { + i <- x$num_columns + x <- x$AddColumn(i, new_field, value) + } + } + x +} + +#' @export +`$<-.ArrowTabular` <- function(x, i, value) { + assert_that(is.string(i)) + # We need to check if `i` is in names in case it is an active binding (e.g. + # `metadata`, in which case we use assign to change the active binding instead + # of the column in the table) + if (i %in% ls(x)) { + assign(i, value, x) + } else { + x[[i]] <- value + } + x +} + +#' @export +dim.ArrowTabular <- function(x) c(x$num_rows, x$num_columns) + +#' @export +as.list.ArrowTabular <- function(x, ...) as.list(as.data.frame(x, ...)) + +#' @export +row.names.ArrowTabular <- function(x) as.character(seq_len(nrow(x))) + +#' @export +dimnames.ArrowTabular <- function(x) list(row.names(x), names(x)) + +#' @export +head.ArrowTabular <- head.ArrowDatum + +#' @export +tail.ArrowTabular <- tail.ArrowDatum + +ToString_tabular <- function(x, ...) { + # Generic to work with both RecordBatch and Table + sch <- unlist(strsplit(x$schema$ToString(), "\n")) + sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch) + dims <- sprintf("%s rows x %s columns", nrow(x), ncol(x)) + paste(c(dims, sch), collapse = "\n") +} diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index ea31eb0fdc5..1631024fb44 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -280,18 +280,6 @@ io___CompressedInputStream__Make <- function(codec, raw){ .Call(`_arrow_io___CompressedInputStream__Make`, codec, raw) } -compute___CastOptions__initialize <- function(allow_int_overflow, allow_time_truncate, allow_float_truncate){ - .Call(`_arrow_compute___CastOptions__initialize`, allow_int_overflow, allow_time_truncate, allow_float_truncate) -} - -Array__cast <- function(array, target_type, options){ - .Call(`_arrow_Array__cast`, array, target_type, options) -} - -ChunkedArray__cast <- function(chunked_array, target_type, options){ - .Call(`_arrow_ChunkedArray__cast`, chunked_array, target_type, options) -} - RecordBatch__cast <- function(batch, schema, options){ .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 3d1602a7263..d639b235f3f 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -#' @include arrow-package.R +#' @include arrow-datum.R #' @title ChunkedArray class #' @usage NULL @@ -56,7 +56,7 @@ #' @name ChunkedArray #' @seealso [Array] #' @export -ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowObject, +ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, public = list( length = function() ChunkedArray__length(self), chunk = function(i) Array$create(ChunkedArray__chunk(self, i)), @@ -83,10 +83,6 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowObject, } call_function("filter", self, i, options = list(keep_na = keep_na)) }, - cast = function(target_type, safe = TRUE, options = cast_options(safe)) { - assert_is(options, "CastOptions") - ChunkedArray__cast(self, as_type(target_type), options) - }, View = function(type) { ChunkedArray__View(self, as_type(type)) }, @@ -120,33 +116,3 @@ ChunkedArray$create <- function(..., type = NULL) { #' @rdname ChunkedArray #' @export chunked_array <- ChunkedArray$create - -#' @export -length.ChunkedArray <- function(x) x$length() - -#' @export -as.vector.ChunkedArray <- function(x, mode) x$as_vector() - -#' @export -is.na.ChunkedArray <- function(x) call_function("is_null", x) - -#' @export -is.nan.ChunkedArray <- function(x) call_function("is_nan", x) - -#' @export -`[.ChunkedArray` <- filter_rows - -#' @export -head.ChunkedArray <- head.Array - -#' @export -tail.ChunkedArray <- tail.Array - -#' @export -as.double.ChunkedArray <- as.double.Array - -#' @export -as.integer.ChunkedArray <- as.integer.Array - -#' @export -as.character.ChunkedArray <- as.character.Array diff --git a/r/R/compute.R b/r/R/compute.R index 2c067c238ab..6d6837ab448 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -35,39 +35,21 @@ call_function <- function(function_name, ..., args = list(...), options = empty_ } #' @export -sum.Array <- function(..., na.rm = FALSE) scalar_aggregate("sum", ..., na.rm = na.rm) +sum.ArrowDatum <- function(..., na.rm = FALSE) scalar_aggregate("sum", ..., na.rm = na.rm) #' @export -sum.ChunkedArray <- sum.Array +mean.ArrowDatum <- function(..., na.rm = FALSE) scalar_aggregate("mean", ..., na.rm = na.rm) #' @export -sum.Scalar <- sum.Array - -#' @export -mean.Array <- function(..., na.rm = FALSE) scalar_aggregate("mean", ..., na.rm = na.rm) - -#' @export -mean.ChunkedArray <- mean.Array - -#' @export -mean.Scalar <- mean.Array - -#' @export -min.Array <- function(..., na.rm = FALSE) { +min.ArrowDatum <- function(..., na.rm = FALSE) { scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("min") } #' @export -min.ChunkedArray <- min.Array - -#' @export -max.Array <- function(..., na.rm = FALSE) { +max.ArrowDatum <- function(..., na.rm = FALSE) { scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("max") } -#' @export -max.ChunkedArray <- max.Array - scalar_aggregate <- function(FUN, ..., na.rm = FALSE) { a <- collect_arrays_from_dots(list(...)) if (!na.rm && a$null_count > 0 && (FUN %in% c("mean", "sum"))) { @@ -99,13 +81,10 @@ collect_arrays_from_dots <- function(dots) { } #' @export -unique.Array <- function(x, incomparables = FALSE, ...) { +unique.ArrowDatum <- function(x, incomparables = FALSE, ...) { call_function("unique", x) } -#' @export -unique.ChunkedArray <- unique.Array - #' `match` for Arrow objects #' #' `base::match()` is not a generic, so we can't just define Arrow methods for @@ -123,29 +102,27 @@ match_arrow <- function(x, table, ...) UseMethod("match_arrow") match_arrow.default <- function(x, table, ...) match(x, table, ...) #' @export -match_arrow.Array <- function(x, table, ...) { +match_arrow.ArrowDatum <- function(x, table, ...) { if (!inherits(table, c("Array", "ChunkedArray"))) { table <- Array$create(table) } call_function("index_in_meta_binary", x, table) } -#' @export -match_arrow.ChunkedArray <- match_arrow.Array - -CastOptions <- R6Class("CastOptions", inherit = ArrowObject) - #' Cast options #' -#' @param safe enforce safe conversion -#' @param allow_int_overflow allow int conversion, `!safe` by default -#' @param allow_time_truncate allow time truncate, `!safe` by default -#' @param allow_float_truncate allow float truncate, `!safe` by default -#' -#' @export -cast_options <- function(safe = TRUE, - allow_int_overflow = !safe, - allow_time_truncate = !safe, - allow_float_truncate = !safe) { - compute___CastOptions__initialize(allow_int_overflow, allow_time_truncate, allow_float_truncate) +#' @param safe logical: enforce safe conversion? Default `TRUE` +#' @param ... additional cast options, such as `allow_int_overflow`, +#' `allow_time_truncate`, and `allow_float_truncate`, which are set to `!safe` +#' by default +#' @return A list +#' @export +#' @keywords internal +cast_options <- function(safe = TRUE, ...) { + opts <- list( + allow_int_overflow = !safe, + allow_time_truncate = !safe, + allow_float_truncate = !safe + ) + modifyList(opts, list(...)) } diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 875823032ca..8bc64ce089d 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -147,13 +147,13 @@ tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns) select.arrow_dplyr_query <- function(.data, ...) { column_select(arrow_dplyr_query(.data), !!!enquos(...)) } -select.Dataset <- select.Table <- select.RecordBatch <- select.arrow_dplyr_query +select.Dataset <- select.ArrowTabular <- select.arrow_dplyr_query #' @importFrom tidyselect vars_rename rename.arrow_dplyr_query <- function(.data, ...) { column_select(arrow_dplyr_query(.data), !!!enquos(...), .FUN = vars_rename) } -rename.Dataset <- rename.Table <- rename.RecordBatch <- rename.arrow_dplyr_query +rename.Dataset <- rename.ArrowTabular <- rename.arrow_dplyr_query column_select <- function(.data, ..., .FUN = vars_select) { # .FUN is either tidyselect::vars_select or tidyselect::vars_rename @@ -236,7 +236,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) { set_filters(.data, filters) } -filter.Dataset <- filter.Table <- filter.RecordBatch <- filter.arrow_dplyr_query +filter.Dataset <- filter.ArrowTabular <- filter.arrow_dplyr_query # Create a data mask for evaluating a filter expression filter_mask <- function(.data) { @@ -301,8 +301,7 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { restore_dplyr_features(tab, x) } } -collect.Table <- as.data.frame.Table -collect.RecordBatch <- as.data.frame.RecordBatch +collect.ArrowTabular <- as.data.frame.ArrowTabular collect.Dataset <- function(x, ...) dplyr::collect(arrow_dplyr_query(x), ...) ensure_group_vars <- function(x) { @@ -345,7 +344,7 @@ pull.arrow_dplyr_query <- function(.data, var = -1) { .data$selected_columns <- set_names(.data$selected_columns[var], var) dplyr::collect(.data)[[1]] } -pull.Dataset <- pull.Table <- pull.RecordBatch <- pull.arrow_dplyr_query +pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query summarise.arrow_dplyr_query <- function(.data, ...) { .data <- arrow_dplyr_query(.data) @@ -361,7 +360,7 @@ summarise.arrow_dplyr_query <- function(.data, ...) { # TODO: determine whether work can be pushed down to Arrow dplyr::summarise(dplyr::collect(.data), ...) } -summarise.Dataset <- summarise.Table <- summarise.RecordBatch <- summarise.arrow_dplyr_query +summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query group_by.arrow_dplyr_query <- function(.data, ..., .add = FALSE, add = .add) { .data <- arrow_dplyr_query(.data) @@ -374,19 +373,19 @@ group_by.arrow_dplyr_query <- function(.data, ..., .add = FALSE, add = .add) { .data$group_by_vars <- gv .data } -group_by.Dataset <- group_by.Table <- group_by.RecordBatch <- group_by.arrow_dplyr_query +group_by.Dataset <- group_by.ArrowTabular <- group_by.arrow_dplyr_query groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x)) -groups.Dataset <- groups.Table <- groups.RecordBatch <- function(x) NULL +groups.Dataset <- groups.ArrowTabular <- function(x) NULL group_vars.arrow_dplyr_query <- function(x) x$group_by_vars -group_vars.Dataset <- group_vars.Table <- group_vars.RecordBatch <- function(x) NULL +group_vars.Dataset <- group_vars.ArrowTabular <- function(x) NULL ungroup.arrow_dplyr_query <- function(x, ...) { x$group_by_vars <- character() x } -ungroup.Dataset <- ungroup.Table <- ungroup.RecordBatch <- force +ungroup.Dataset <- ungroup.ArrowTabular <- force mutate.arrow_dplyr_query <- function(.data, ...) { .data <- arrow_dplyr_query(.data) @@ -398,7 +397,7 @@ mutate.arrow_dplyr_query <- function(.data, ...) { # vector transformation functions aren't yet implemented in Arrow C++. dplyr::mutate(dplyr::collect(.data), ...) } -mutate.Dataset <- mutate.Table <- mutate.RecordBatch <- mutate.arrow_dplyr_query +mutate.Dataset <- mutate.ArrowTabular <- mutate.arrow_dplyr_query # TODO: add transmute() that does what summarise() does (select only the vars we need) arrange.arrow_dplyr_query <- function(.data, ...) { @@ -409,7 +408,7 @@ arrange.arrow_dplyr_query <- function(.data, ...) { dplyr::arrange(dplyr::collect(.data), ...) } -arrange.Dataset <- arrange.Table <- arrange.RecordBatch <- arrange.arrow_dplyr_query +arrange.Dataset <- arrange.ArrowTabular <- arrange.arrow_dplyr_query query_on_dataset <- function(x) inherits(x$.data, "Dataset") diff --git a/r/R/expression.R b/r/R/expression.R index 3c62662c67a..0198e0ebe6a 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -32,7 +32,7 @@ array_expression <- function(FUN, } #' @export -Ops.Array <- function(e1, e2) { +Ops.ArrowDatum <- function(e1, e2) { if (.Generic %in% names(.array_function_map)) { expr <- build_array_expression(.Generic, e1, e2) eval_array_expression(expr) @@ -41,9 +41,6 @@ Ops.Array <- function(e1, e2) { } } -#' @export -Ops.ChunkedArray <- Ops.Array - #' @export Ops.array_expression <- function(e1, e2) { if (.Generic == "!") { @@ -75,9 +72,10 @@ build_array_expression <- function(.Generic, e1, e2, ...) { # ^^^ form doesn't work because Ops.Array evaluates eagerly, # but we can build that up quotient <- build_array_expression("%/%", e1, e2) + base <- build_array_expression("*", quotient, e2) # this cast is to ensure that the result of this and e1 are the same # (autocasting only applies to scalars) - base <- cast_array_expression(quotient * e2, e1$type) + base <- cast_array_expression(base, e1$type) return(build_array_expression("-", e1, base)) } diff --git a/r/R/metadata.R b/r/R/metadata.R new file mode 100644 index 00000000000..d3e5e2150bb --- /dev/null +++ b/r/R/metadata.R @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @importFrom utils object.size +.serialize_arrow_r_metadata <- function(x) { + assert_is(x, "list") + + # drop problems attributes (most likely from readr) + x[["attributes"]][["problems"]] <- NULL + + out <- serialize(x, NULL, ascii = TRUE) + + # if the metadata is over 100 kB, compress + if (option_compress_metadata() && object.size(out) > 100000) { + out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE) + + # but ensure that the compression+serialization is effective. + if (object.size(out) > object.size(out_comp)) out <- out_comp + } + + rawToChar(out) +} + +.unserialize_arrow_r_metadata <- function(x) { + tryCatch({ + out <- unserialize(charToRaw(x)) + + # if this is still raw, try decompressing + if (is.raw(out)) { + out <- unserialize(memDecompress(out, type = "gzip")) + } + out + }, error = function(e) { + warning("Invalid metadata$r", call. = FALSE) + NULL + }) +} + +apply_arrow_r_metadata <- function(x, r_metadata) { + tryCatch({ + columns_metadata <- r_metadata$columns + if (is.data.frame(x)) { + if (length(names(x)) && !is.null(columns_metadata)) { + for (name in intersect(names(columns_metadata), names(x))) { + x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) + } + } + } else if(is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { + x <- map2(x, columns_metadata, function(.x, .y) { + apply_arrow_r_metadata(.x, .y) + }) + x + } + + if (!is.null(r_metadata$attributes)) { + attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes + if (inherits(x, "POSIXlt")) { + # We store POSIXlt as a StructArray, which is translated back to R + # as a data.frame, but while data frames have a row.names = c(NA, nrow(x)) + # attribute, POSIXlt does not, so since this is now no longer an object + # of class data.frame, remove the extraneous attribute + attr(x, "row.names") <- NULL + } + } + + }, error = function(e) { + warning("Invalid metadata$r", call. = FALSE) + }) + x +} + +arrow_attributes <- function(x, only_top_level = FALSE) { + att <- attributes(x) + + removed_attributes <- character() + if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) { + removed_attributes <- c("class", "row.names", "names") + } else if (inherits(x, "data.frame")) { + removed_attributes <- c("row.names", "names") + } else if (inherits(x, "factor")) { + removed_attributes <- c("class", "levels") + } else if (inherits(x, "integer64") || inherits(x, "Date")) { + removed_attributes <- c("class") + } else if (inherits(x, "POSIXct")) { + removed_attributes <- c("class", "tzone") + } else if (inherits(x, "hms") || inherits(x, "difftime")) { + removed_attributes <- c("class", "units") + } + + att <- att[setdiff(names(att), removed_attributes)] + if (isTRUE(only_top_level)) { + return(att) + } + + if (is.data.frame(x)) { + columns <- map(x, arrow_attributes) + out <- if (length(att) || !all(map_lgl(columns, is.null))) { + list(attributes = att, columns = columns) + } + return(out) + } + + columns <- NULL + if (is.list(x) && !inherits(x, "POSIXlt")) { + # for list columns, we also keep attributes of each + # element in columns + columns <- map(x, arrow_attributes) + if (all(map_lgl(columns, is.null))) { + columns <- NULL + } + } + + if (length(att) || !is.null(columns)) { + list(attributes = att, columns = columns) + } else { + NULL + } +} diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 1d3645323c9..0797859884d 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -71,7 +71,7 @@ #' - `$columns`: Returns a list of `Array`s #' @rdname RecordBatch #' @name RecordBatch -RecordBatch <- R6Class("RecordBatch", inherit = ArrowObject, +RecordBatch <- R6Class("RecordBatch", inherit = ArrowTabular, public = list( column = function(i) RecordBatch__column(self, i), column_name = function(i) RecordBatch__column_name(self, i), @@ -84,12 +84,14 @@ RecordBatch <- R6Class("RecordBatch", inherit = ArrowObject, assert_that(is.string(name)) RecordBatch__GetColumnByName(self, name) }, - SelectColumns = function(indices) { - RecordBatch__SelectColumns(self, indices) + SelectColumns = function(indices) RecordBatch__SelectColumns(self, indices), + AddColumn = function(i, new_field, value) { + stop("TODO: ARROW-10520", call. = FALSE) }, - RemoveColumn = function(i){ - RecordBatch__RemoveColumn(self, i) + SetColumn = function(i, new_field, value) { + stop("TODO: ARROW-10520", call. = FALSE) }, + RemoveColumn = function(i) RecordBatch__RemoveColumn(self, i), Slice = function(offset, length = NULL) { if (is.null(length)) { RecordBatch__Slice1(self, offset) @@ -97,34 +99,16 @@ RecordBatch <- R6Class("RecordBatch", inherit = ArrowObject, RecordBatch__Slice2(self, offset, length) } }, - Take = function(i) { - if (is.numeric(i)) { - i <- as.integer(i) - } - if (is.integer(i)) { - i <- Array$create(i) - } - assert_is(i, "Array") - call_function("take", self, i) - }, - Filter = function(i, keep_na = TRUE) { - if (is.logical(i)) { - i <- Array$create(i) - } - assert_that(is.Array(i, "bool")) - call_function("filter", self, i, options = list(keep_na = keep_na)) - }, + # Take and Filter are methods on ArrowTabular serialize = function() ipc___SerializeRecordBatch__Raw(self), - ToString = function() ToString_tabular(self), - - cast = function(target_schema, safe = TRUE, options = cast_options(safe)) { + to_data_frame = function() { + RecordBatch__to_dataframe(self, use_threads = option_use_threads()) + }, + cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { assert_is(target_schema, "Schema") - assert_is(options, "CastOptions") assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") - RecordBatch__cast(self, target_schema, options) }, - invalidate = function() { .Call(`_arrow_RecordBatch__Reset`, self) super$invalidate() @@ -205,162 +189,3 @@ record_batch <- RecordBatch$create #' @export names.RecordBatch <- function(x) x$names() - -#' @export -`names<-.RecordBatch` <- function(x, value) x$RenameColumns(value) - -#' @importFrom methods as -#' @export -`[.RecordBatch` <- function(x, i, j, ..., drop = FALSE) { - if (nargs() == 2L) { - # List-like column extraction (x[i]) - return(x[, i]) - } - if (!missing(j)) { - # Selecting columns is cheaper than filtering rows, so do it first. - # That way, if we're filtering too, we have fewer arrays to filter/slice/take - if (is_integerish(j)) { - if (all(j < 0)) { - # in R, negative j means "everything but j" - j <- setdiff(seq_len(x$num_columns), -1 * j) - } - x <- x$SelectColumns(as.integer(j) - 1L) - } else if (is.character(j)) { - x <- x$SelectColumns(match(j, names(x)) - 1L) - } - - if (drop && ncol(x) == 1L) { - x <- x$column(0) - } - } - if (!missing(i)) { - x <- filter_rows(x, i, ...) - } - x -} - -#' @export -`[[.RecordBatch` <- function(x, i, ...) { - if (is.character(i)) { - x$GetColumnByName(i) - } else if (is.numeric(i)) { - x$column(i - 1) - } else { - stop("'i' must be character or numeric, not ", class(i), call. = FALSE) - } -} - -#' @export -`$.RecordBatch` <- function(x, name, ...) { - assert_that(is.string(name)) - if (name %in% ls(x)) { - get(name, x) - } else { - x$GetColumnByName(name) - } -} - -#' @export -dim.RecordBatch <- function(x) { - c(x$num_rows, x$num_columns) -} - -#' @export -as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ...) { - df <- RecordBatch__to_dataframe(x, use_threads = option_use_threads()) - if (!is.null(r_metadata <- x$metadata$r)) { - df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata)) - } - df -} - -#' @importFrom utils object.size -.serialize_arrow_r_metadata <- function(x) { - assert_is(x, "list") - - # drop problems attributes (most likely from readr) - x[["attributes"]][["problems"]] <- NULL - - out <- serialize(x, NULL, ascii = TRUE) - - # if the metadata is over 100 kB, compress - if (option_compress_metadata() && object.size(out) > 100000) { - out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE) - - # but ensure that the compression+serialization is effective. - if (object.size(out) > object.size(out_comp)) out <- out_comp - } - - rawToChar(out) -} - -.unserialize_arrow_r_metadata <- function(x) { - tryCatch({ - out <- unserialize(charToRaw(x)) - - # if this is still raw, try decompressing - if (is.raw(out)) { - out <- unserialize(memDecompress(out, type = "gzip")) - } - out - }, error = function(e) { - warning("Invalid metadata$r", call. = FALSE) - NULL - }) -} - -apply_arrow_r_metadata <- function(x, r_metadata) { - tryCatch({ - columns_metadata <- r_metadata$columns - if (is.data.frame(x)) { - if (length(names(x)) && !is.null(columns_metadata)) { - for (name in intersect(names(columns_metadata), names(x))) { - x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) - } - } - } else if(is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { - x <- map2(x, columns_metadata, function(.x, .y) { - apply_arrow_r_metadata(.x, .y) - }) - x - } - - if (!is.null(r_metadata$attributes)) { - attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes - if (inherits(x, "POSIXlt")) { - # We store POSIXlt as a StructArray, which is translated back to R - # as a data.frame, but while data frames have a row.names = c(NA, nrow(x)) - # attribute, POSIXlt does not, so since this is now no longer an object - # of class data.frame, remove the extraneous attribute - attr(x, "row.names") <- NULL - } - } - - }, error = function(e) { - warning("Invalid metadata$r", call. = FALSE) - }) - x -} - -#' @export -as.list.RecordBatch <- function(x, ...) as.list(as.data.frame(x, ...)) - -#' @export -row.names.RecordBatch <- function(x) as.character(seq_len(nrow(x))) - -#' @export -dimnames.RecordBatch <- function(x) list(row.names(x), names(x)) - -#' @export -head.RecordBatch <- head.Array - -#' @export -tail.RecordBatch <- tail.Array - -ToString_tabular <- function(x, ...) { - # Generic to work with both RecordBatch and Table - sch <- unlist(strsplit(x$schema$ToString(), "\n")) - sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch) - dims <- sprintf("%s rows x %s columns", nrow(x), ncol(x)) - paste(c(dims, sch), collapse = "\n") -} diff --git a/r/R/scalar.R b/r/R/scalar.R index 774fe571145..22c57c961c2 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -#' @include arrow-package.R +#' @include arrow-datum.R #' @title Arrow scalars #' @usage NULL @@ -28,19 +28,10 @@ #' @rdname Scalar #' @export Scalar <- R6Class("Scalar", - inherit = ArrowObject, + inherit = ArrowDatum, # TODO: document the methods public = list( ToString = function() Scalar__ToString(self), - cast = function(target_type, safe = TRUE, ...) { - opts <- list( - to_type = as_type(target_type), - allow_int_overflow = !safe, - allow_time_truncate = !safe, - allow_float_truncate = !safe - ) - call_function("cast", self, options = modifyList(opts, list(...))) - }, as_vector = function() Scalar__as_vector(self) ), active = list( @@ -76,15 +67,3 @@ length.Scalar <- function(x) 1L #' @export is.na.Scalar <- function(x) !x$is_valid - -#' @export -as.vector.Scalar <- function(x, mode) x$as_vector() - -#' @export -as.double.Scalar <- as.double.Array - -#' @export -as.integer.Scalar <- as.integer.Array - -#' @export -as.character.Scalar <- as.character.Array diff --git a/r/R/table.R b/r/R/table.R index 730ed245dce..d2c9960e6d2 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -91,11 +91,9 @@ #' as.data.frame(tab[4:8, c("gear", "hp", "wt")]) #' } #' @export -Table <- R6Class("Table", inherit = ArrowObject, +Table <- R6Class("Table", inherit = ArrowTabular, public = list( - column = function(i) { - Table__column(self, i) - }, + column = function(i) Table__column(self, i), ColumnNames = function() Table__ColumnNames(self), RenameColumns = function(value) Table__RenameColumns(self, value), GetColumnByName = function(name) { @@ -107,21 +105,16 @@ Table <- R6Class("Table", inherit = ArrowObject, AddColumn = function(i, new_field, value) Table__AddColumn(self, i, new_field, value), SetColumn = function(i, new_field, value) Table__SetColumn(self, i, new_field, value), field = function(i) Table__field(self, i), - serialize = function(output_stream, ...) write_table(self, output_stream, ...), - ToString = function() ToString_tabular(self), - - cast = function(target_schema, safe = TRUE, options = cast_options(safe)) { + to_data_frame = function() { + Table__to_dataframe(self, use_threads = option_use_threads()) + }, + cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { assert_is(target_schema, "Schema") - assert_is(options, "CastOptions") assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") Table__cast(self, target_schema, options) }, - - SelectColumns = function(indices) { - Table__SelectColumns(self, indices) - }, - + SelectColumns = function(indices) Table__SelectColumns(self, indices), Slice = function(offset, length = NULL) { if (is.null(length)) { Table__Slice1(self, offset) @@ -129,39 +122,16 @@ Table <- R6Class("Table", inherit = ArrowObject, Table__Slice2(self, offset, length) } }, - Take = function(i) { - if (is.numeric(i)) { - i <- as.integer(i) - } - if (is.integer(i)) { - i <- Array$create(i) - } - call_function("take", self, i) - }, - Filter = function(i, keep_na = TRUE) { - if (is.logical(i)) { - i <- Array$create(i) - } - call_function("filter", self, i, options = list(keep_na = keep_na)) - }, - + # Take and Filter are methods on ArrowTabular Equals = function(other, check_metadata = FALSE, ...) { inherits(other, "Table") && Table__Equals(self, other, isTRUE(check_metadata)) }, - - Validate = function() { - Table__Validate(self) - }, - - ValidateFull = function() { - Table__ValidateFull(self) - }, - + Validate = function() Table__Validate(self), + ValidateFull = function() Table__ValidateFull(self), invalidate = function() { .Call(`_arrow_Table__Reset`, self) super$invalidate() } - ), active = list( @@ -186,54 +156,6 @@ Table <- R6Class("Table", inherit = ArrowObject, ) ) -arrow_attributes <- function(x, only_top_level = FALSE) { - att <- attributes(x) - - removed_attributes <- character() - if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) { - removed_attributes <- c("class", "row.names", "names") - } else if (inherits(x, "data.frame")) { - removed_attributes <- c("row.names", "names") - } else if (inherits(x, "factor")) { - removed_attributes <- c("class", "levels") - } else if (inherits(x, "integer64") || inherits(x, "Date")) { - removed_attributes <- c("class") - } else if (inherits(x, "POSIXct")) { - removed_attributes <- c("class", "tzone") - } else if (inherits(x, "hms") || inherits(x, "difftime")) { - removed_attributes <- c("class", "units") - } - - att <- att[setdiff(names(att), removed_attributes)] - if (isTRUE(only_top_level)) { - return(att) - } - - if (is.data.frame(x)) { - columns <- map(x, arrow_attributes) - out <- if (length(att) || !all(map_lgl(columns, is.null))) { - list(attributes = att, columns = columns) - } - return(out) - } - - columns <- NULL - if (is.list(x) && !inherits(x, "POSIXlt")) { - # for list columns, we also keep attributes of each - # element in columns - columns <- map(x, arrow_attributes) - if (all(map_lgl(columns, is.null))) { - columns <- NULL - } - } - - if (length(att) || !is.null(columns)) { - list(attributes = att, columns = columns) - } else { - NULL - } -} - Table$create <- function(..., schema = NULL) { dots <- list2(...) # making sure there are always names @@ -248,106 +170,5 @@ Table$create <- function(..., schema = NULL) { } } -#' @export -as.data.frame.Table <- function(x, row.names = NULL, optional = FALSE, ...) { - df <- Table__to_dataframe(x, use_threads = option_use_threads()) - if (!is.null(r_metadata <- x$metadata$r)) { - df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata)) - } - df -} - -#' @export -as.list.Table <- as.list.RecordBatch - -#' @export -row.names.Table <- row.names.RecordBatch - -#' @export -dimnames.Table <- dimnames.RecordBatch - -#' @export -dim.Table <- function(x) c(x$num_rows, x$num_columns) - #' @export names.Table <- function(x) x$ColumnNames() - -#' @export -`names<-.Table` <- function(x, value) x$RenameColumns(value) - -#' @export -`[.Table` <- `[.RecordBatch` - -#' @export -`[[.Table` <- `[[.RecordBatch` - -#' @export -`[[<-.Table` <- function(x, i, value) { - if (!is.character(i) & !is.numeric(i)) { - stop("'i' must be character or numeric, not ", class(i), call. = FALSE) - } else if (is.na(i)) { - # Catch if a NA_character or NA_integer is passed. These are caught elsewhere - # in cpp (i.e. _arrow_RecordBatch__column_name) - # TODO: figure out if catching in cpp like ^^^ is preferred - stop("'i' cannot be NA", call. = FALSE) - } - - if (is.null(value)) { - if (is.character(i)) { - i <- match(i, names(x)) - } - x <- x$RemoveColumn(i - 1L) - } else { - if (!is.character(i)) { - # get or create a/the column name - if (i <= x$num_columns) { - i <- names(x)[i] - } else { - i <- as.character(i) - } - } - - # auto-magic recycling on non-ArrowObjects - if (!inherits(value, "ArrowObject")) { - value <- vctrs::vec_recycle(value, x$num_rows) - } - - # construct the field - if (!inherits(value, "ChunkedArray")) { - value <- chunked_array(value) - } - new_field <- field(i, value$type) - - if (i %in% names(x)) { - i <- match(i, names(x)) - 1L - x <- x$SetColumn(i, new_field, value) - } else { - i <- x$num_columns - x <- x$AddColumn(i, new_field, value) - } - } - x -} - -#' @export -`$<-.Table` <- function(x, i, value) { - assert_that(is.string(i)) - # We need to check if `i` is in names in case it is an active binding (e.g. - # `metadata`, in which case we use assign to change the active binding instead - # of the column in the table) - if (i %in% ls(x)) { - assign(i, value, x) - } else { - x[[i]] <- value - } - x -} - -#' @export -`$.Table` <- `$.RecordBatch` - -#' @export -head.Table <- head.RecordBatch - -#' @export -tail.Table <- tail.RecordBatch diff --git a/r/R/type.R b/r/R/type.R index b21ead8e5ed..937a2db7ca1 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -66,11 +66,7 @@ type <- function(x) UseMethod("type") type.default <- function(x) Array__infer_type(x) #' @export -type.Array <- function(x) x$type - -#' @export -type.ChunkedArray <- function(x) x$type - +type.ArrowDatum <- function(x) x$type #----- metadata diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index d3da7670667..6e9bb9c9592 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -117,7 +117,6 @@ reference: - DataType - DictionaryType - FixedWidthType - - cast_options - title: Flight contents: - load_flight_server diff --git a/r/man/cast_options.Rd b/r/man/cast_options.Rd index 19dfe6503e1..40d78052c74 100644 --- a/r/man/cast_options.Rd +++ b/r/man/cast_options.Rd @@ -4,22 +4,19 @@ \alias{cast_options} \title{Cast options} \usage{ -cast_options( - safe = TRUE, - allow_int_overflow = !safe, - allow_time_truncate = !safe, - allow_float_truncate = !safe -) +cast_options(safe = TRUE, ...) } \arguments{ -\item{safe}{enforce safe conversion} +\item{safe}{logical: enforce safe conversion? Default \code{TRUE}} -\item{allow_int_overflow}{allow int conversion, \code{!safe} by default} - -\item{allow_time_truncate}{allow time truncate, \code{!safe} by default} - -\item{allow_float_truncate}{allow float truncate, \code{!safe} by default} +\item{...}{additional cast options, such as \code{allow_int_overflow}, +\code{allow_time_truncate}, and \code{allow_float_truncate}, which are set to \code{!safe} +by default} +} +\value{ +A list } \description{ Cast options } +\keyword{internal} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index c78085be831..15791addea9 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -604,52 +604,22 @@ BEGIN_CPP11 END_CPP11 } // compute.cpp -std::shared_ptr compute___CastOptions__initialize(bool allow_int_overflow, bool allow_time_truncate, bool allow_float_truncate); -extern "C" SEXP _arrow_compute___CastOptions__initialize(SEXP allow_int_overflow_sexp, SEXP allow_time_truncate_sexp, SEXP allow_float_truncate_sexp){ -BEGIN_CPP11 - arrow::r::Input::type allow_int_overflow(allow_int_overflow_sexp); - arrow::r::Input::type allow_time_truncate(allow_time_truncate_sexp); - arrow::r::Input::type allow_float_truncate(allow_float_truncate_sexp); - return cpp11::as_sexp(compute___CastOptions__initialize(allow_int_overflow, allow_time_truncate, allow_float_truncate)); -END_CPP11 -} -// compute.cpp -std::shared_ptr Array__cast(const std::shared_ptr& array, const std::shared_ptr& target_type, const std::shared_ptr& options); -extern "C" SEXP _arrow_Array__cast(SEXP array_sexp, SEXP target_type_sexp, SEXP options_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type array(array_sexp); - arrow::r::Input&>::type target_type(target_type_sexp); - arrow::r::Input&>::type options(options_sexp); - return cpp11::as_sexp(Array__cast(array, target_type, options)); -END_CPP11 -} -// compute.cpp -std::shared_ptr ChunkedArray__cast(const std::shared_ptr& chunked_array, const std::shared_ptr& target_type, const std::shared_ptr& options); -extern "C" SEXP _arrow_ChunkedArray__cast(SEXP chunked_array_sexp, SEXP target_type_sexp, SEXP options_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type chunked_array(chunked_array_sexp); - arrow::r::Input&>::type target_type(target_type_sexp); - arrow::r::Input&>::type options(options_sexp); - return cpp11::as_sexp(ChunkedArray__cast(chunked_array, target_type, options)); -END_CPP11 -} -// compute.cpp -std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, const std::shared_ptr& options); +std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, cpp11::list options); extern "C" SEXP _arrow_RecordBatch__cast(SEXP batch_sexp, SEXP schema_sexp, SEXP options_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); arrow::r::Input&>::type schema(schema_sexp); - arrow::r::Input&>::type options(options_sexp); + arrow::r::Input::type options(options_sexp); return cpp11::as_sexp(RecordBatch__cast(batch, schema, options)); END_CPP11 } // compute.cpp -std::shared_ptr Table__cast(const std::shared_ptr& table, const std::shared_ptr& schema, const std::shared_ptr& options); +std::shared_ptr Table__cast(const std::shared_ptr& table, const std::shared_ptr& schema, cpp11::list options); extern "C" SEXP _arrow_Table__cast(SEXP table_sexp, SEXP schema_sexp, SEXP options_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); arrow::r::Input&>::type schema(schema_sexp); - arrow::r::Input&>::type options(options_sexp); + arrow::r::Input::type options(options_sexp); return cpp11::as_sexp(Table__cast(table, schema, options)); END_CPP11 } @@ -3570,9 +3540,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, - { "_arrow_compute___CastOptions__initialize", (DL_FUNC) &_arrow_compute___CastOptions__initialize, 3}, - { "_arrow_Array__cast", (DL_FUNC) &_arrow_Array__cast, 3}, - { "_arrow_ChunkedArray__cast", (DL_FUNC) &_arrow_ChunkedArray__cast, 3}, { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 4497f5b59a3..2d69d8029c6 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -23,61 +23,34 @@ #include #include +std::shared_ptr make_cast_options(cpp11::list options); + arrow::compute::ExecContext* gc_context() { static arrow::compute::ExecContext context(gc_memory_pool()); return &context; } -// [[arrow::export]] -std::shared_ptr compute___CastOptions__initialize( - bool allow_int_overflow, bool allow_time_truncate, bool allow_float_truncate) { - auto options = std::make_shared(); - options->allow_int_overflow = allow_int_overflow; - options->allow_time_truncate = allow_time_truncate; - options->allow_float_truncate = allow_float_truncate; - return options; -} - -// [[arrow::export]] -std::shared_ptr Array__cast( - const std::shared_ptr& array, - const std::shared_ptr& target_type, - const std::shared_ptr& options) { - return ValueOrStop(arrow::compute::Cast(*array, target_type, *options, gc_context())); -} - -// [[arrow::export]] -std::shared_ptr ChunkedArray__cast( - const std::shared_ptr& chunked_array, - const std::shared_ptr& target_type, - const std::shared_ptr& options) { - arrow::Datum value(chunked_array); - arrow::Datum out = - ValueOrStop(arrow::compute::Cast(value, target_type, *options, gc_context())); - return out.chunked_array(); -} - // [[arrow::export]] std::shared_ptr RecordBatch__cast( const std::shared_ptr& batch, - const std::shared_ptr& schema, - const std::shared_ptr& options) { + const std::shared_ptr& schema, cpp11::list options) { + auto opts = make_cast_options(options); auto nc = batch->num_columns(); arrow::ArrayVector columns(nc); for (int i = 0; i < nc; i++) { columns[i] = ValueOrStop( - arrow::compute::Cast(*batch->column(i), schema->field(i)->type(), *options)); + arrow::compute::Cast(*batch->column(i), schema->field(i)->type(), *opts)); } return arrow::RecordBatch::Make(schema, batch->num_rows(), std::move(columns)); } // [[arrow::export]] -std::shared_ptr Table__cast( - const std::shared_ptr& table, - const std::shared_ptr& schema, - const std::shared_ptr& options) { +std::shared_ptr Table__cast(const std::shared_ptr& table, + const std::shared_ptr& schema, + cpp11::list options) { + auto opts = make_cast_options(options); auto nc = table->num_columns(); using ColumnVector = std::vector>; @@ -85,7 +58,7 @@ std::shared_ptr Table__cast( for (int i = 0; i < nc; i++) { arrow::Datum value(table->column(i)); arrow::Datum out = - ValueOrStop(arrow::compute::Cast(value, schema->field(i)->type(), *options)); + ValueOrStop(arrow::compute::Cast(value, schema->field(i)->type(), *opts)); columns[i] = out.chunked_array(); } return arrow::Table::Make(schema, std::move(columns), table->num_rows()); @@ -185,34 +158,36 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["skip_nulls"])); } - // hacky attempt to pass through to_type and other options if (func_name == "cast") { - using Options = arrow::compute::CastOptions; - auto out = std::make_shared(true); - SEXP to_type = options["to_type"]; - if (!Rf_isNull(to_type) && cpp11::as_cpp>(to_type)) { - out->to_type = cpp11::as_cpp>(to_type); - } + return make_cast_options(options); + } - SEXP allow_float_truncate = options["allow_float_truncate"]; - if (!Rf_isNull(allow_float_truncate) && cpp11::as_cpp(allow_float_truncate)) { - out->allow_float_truncate = cpp11::as_cpp(allow_float_truncate); - } + return nullptr; +} - SEXP allow_time_truncate = options["allow_time_truncate"]; - if (!Rf_isNull(allow_time_truncate) && cpp11::as_cpp(allow_time_truncate)) { - out->allow_time_truncate = cpp11::as_cpp(allow_time_truncate); - } +std::shared_ptr make_cast_options(cpp11::list options) { + using Options = arrow::compute::CastOptions; + auto out = std::make_shared(true); + SEXP to_type = options["to_type"]; + if (!Rf_isNull(to_type) && cpp11::as_cpp>(to_type)) { + out->to_type = cpp11::as_cpp>(to_type); + } - SEXP allow_int_overflow = options["allow_int_overflow"]; - if (!Rf_isNull(allow_int_overflow) && cpp11::as_cpp(allow_int_overflow)) { - out->allow_int_overflow = cpp11::as_cpp(allow_int_overflow); - } + SEXP allow_float_truncate = options["allow_float_truncate"]; + if (!Rf_isNull(allow_float_truncate) && cpp11::as_cpp(allow_float_truncate)) { + out->allow_float_truncate = cpp11::as_cpp(allow_float_truncate); + } - return out; + SEXP allow_time_truncate = options["allow_time_truncate"]; + if (!Rf_isNull(allow_time_truncate) && cpp11::as_cpp(allow_time_truncate)) { + out->allow_time_truncate = cpp11::as_cpp(allow_time_truncate); } - return nullptr; + SEXP allow_int_overflow = options["allow_int_overflow"]; + if (!Rf_isNull(allow_int_overflow) && cpp11::as_cpp(allow_int_overflow)) { + out->allow_int_overflow = cpp11::as_cpp(allow_int_overflow); + } + return out; } // [[arrow::export]] diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 3cf8de7d6c7..33e39c9289f 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -207,10 +207,11 @@ test_that("[[<- assignment", { # nonsense indexes expect_error(tab[[NA]] <- letters[10:1], "'i' must be character or numeric, not logical") - expect_error(tab[[NA_integer_]] <- letters[10:1], "'i' cannot be NA") - expect_error(tab[[NA_real_]] <- letters[10:1], "'i' cannot be NA") - expect_error(tab[[NA_character_]] <- letters[10:1], "'i' cannot be NA") expect_error(tab[[NULL]] <- letters[10:1], "'i' must be character or numeric, not NULL") + expect_error(tab[[NA_integer_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(tab[[NA_real_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(tab[[NA_character_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(tab[[c(1, 4)]] <- letters[10:1], "length(i) not equal to 1", fixed = TRUE) }) test_that("Table$Slice", { diff --git a/r/tests/testthat/test-compute-vector.R b/r/tests/testthat/test-compute-vector.R index b9097b6e1b6..4fe7fed4d1c 100644 --- a/r/tests/testthat/test-compute-vector.R +++ b/r/tests/testthat/test-compute-vector.R @@ -15,41 +15,53 @@ # specific language governing permissions and limitations # under the License. -context("compute: vector operations") - -expect_bool_function_equal <- function(array_exp, r_exp, class = "Array") { +expect_bool_function_equal <- function(array_exp, r_exp) { # Assert that the Array operation returns a boolean array # and that its contents are equal to expected - expect_is(array_exp, class) + expect_is(array_exp, "ArrowDatum") expect_type_equal(array_exp, bool()) expect_identical(as.vector(array_exp), r_exp) } -expect_array_compares <- function(r_values, compared_to, Class = Array) { - a <- Class$create(r_values) +expect_array_compares <- function(x, compared_to) { + r_values <- as.vector(x) + r_compared_to <- as.vector(compared_to) # Iterate over all comparison functions - expect_bool_function_equal(a == compared_to, r_values == compared_to, class(a)) - expect_bool_function_equal(a != compared_to, r_values != compared_to, class(a)) - expect_bool_function_equal(a > compared_to, r_values > compared_to, class(a)) - expect_bool_function_equal(a >= compared_to, r_values >= compared_to, class(a)) - expect_bool_function_equal(a < compared_to, r_values < compared_to, class(a)) - expect_bool_function_equal(a <= compared_to, r_values <= compared_to, class(a)) + expect_bool_function_equal(x == compared_to, r_values == r_compared_to) + expect_bool_function_equal(x != compared_to, r_values != r_compared_to) + expect_bool_function_equal(x > compared_to, r_values > r_compared_to) + expect_bool_function_equal(x >= compared_to, r_values >= r_compared_to) + expect_bool_function_equal(x < compared_to, r_values < r_compared_to) + expect_bool_function_equal(x <= compared_to, r_values <= r_compared_to) } -expect_chunked_array_compares <- function(...) expect_array_compares(..., Class = ChunkedArray) - test_that("compare ops with Array", { - expect_array_compares(1:5, 4L) - expect_array_compares(1:5, 4) # implicit casting - expect_array_compares(c(NA, 1:5), 4) - expect_array_compares(as.numeric(c(NA, 1:5)), 4) + a <- Array$create(1:5) + expect_array_compares(a, 4L) + expect_array_compares(a, 4) # implicit casting + expect_array_compares(a, Scalar$create(4)) + expect_array_compares(Array$create(c(NA, 1:5)), 4) + expect_array_compares(Array$create(as.numeric(c(NA, 1:5))), 4) + expect_array_compares(Array$create(c(NA, 1:5)), Array$create(rev(c(NA, 1:5)))) }) test_that("compare ops with ChunkedArray", { - expect_chunked_array_compares(1:5, 4L) - expect_chunked_array_compares(1:5, 4) # implicit casting - expect_chunked_array_compares(c(NA, 1:5), 4) - expect_chunked_array_compares(as.numeric(c(NA, 1:5)), 4) + expect_array_compares(ChunkedArray$create(1:3, 4:5), 4L) + expect_array_compares(ChunkedArray$create(1:3, 4:5), 4) # implicit casting + expect_array_compares(ChunkedArray$create(1:3, 4:5), Scalar$create(4)) + expect_array_compares(ChunkedArray$create(c(NA, 1:3), 4:5), 4) + expect_array_compares( + ChunkedArray$create(c(NA, 1:3), 4:5), + ChunkedArray$create(4:5, c(NA, 1:3)) + ) + expect_array_compares( + ChunkedArray$create(c(NA, 1:3), 4:5), + Array$create(c(NA, 1:5)) + ) + expect_array_compares( + Array$create(c(NA, 1:5)), + ChunkedArray$create(c(NA, 1:3), 4:5) + ) }) test_that("logic ops with Array", { @@ -74,18 +86,17 @@ test_that("logic ops with ChunkedArray", { truth <- expand.grid(left = c(TRUE, FALSE, NA), right = c(TRUE, FALSE, NA)) a_left <- ChunkedArray$create(truth$left) a_right <- ChunkedArray$create(truth$right) - expect_bool_function_equal(a_left & a_right, truth$left & truth$right, "ChunkedArray") - expect_bool_function_equal(a_left | a_right, truth$left | truth$right, "ChunkedArray") - expect_bool_function_equal(a_left == a_right, truth$left == truth$right, "ChunkedArray") - expect_bool_function_equal(a_left != a_right, truth$left != truth$right, "ChunkedArray") - expect_bool_function_equal(!a_left, !truth$left, "ChunkedArray") + expect_bool_function_equal(a_left & a_right, truth$left & truth$right) + expect_bool_function_equal(a_left | a_right, truth$left | truth$right) + expect_bool_function_equal(a_left == a_right, truth$left == truth$right) + expect_bool_function_equal(a_left != a_right, truth$left != truth$right) + expect_bool_function_equal(!a_left, !truth$left) # More complexity isEqualTo <- function(x, y) x == y & !is.na(x) expect_bool_function_equal( isEqualTo(a_left, a_right), - isEqualTo(truth$left, truth$right), - "ChunkedArray" + isEqualTo(truth$left, truth$right) ) })