diff --git a/r/NAMESPACE b/r/NAMESPACE index fb3ea82c4af..c75f54d946a 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -147,6 +147,7 @@ export(MemoryMappedFile) export(MessageReader) export(MessageType) export(MetadataVersion) +export(NullEncodingBehavior) export(ParquetArrowReaderProperties) export(ParquetFileFormat) export(ParquetFileReader) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 76d95a804a7..845cb3a1815 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -396,6 +396,21 @@ build_function_list <- function(FUN) { # Include mappings from R function name spellings lapply(set_names(names(.array_function_map)), wrapper), # Plus some special handling where it's not 1:1 + cast = function(x, target_type, safe = TRUE, ...) { + opts <- cast_options(safe, ...) + opts$to_type <- as_type(target_type) + FUN("cast", x, options = opts) + }, + dictionary_encode = function(x, null_encoding_behavior = c("mask", "encode")) { + null_encoding_behavior <- + NullEncodingBehavior[[toupper(match.arg(null_encoding_behavior))]] + FUN( + "dictionary_encode", + x, + options = list(null_encoding_behavior = null_encoding_behavior) + ) + }, + # as.factor() is mapped in expression.R as.character = function(x) { FUN("cast", x, options = cast_options(to_type = string())) }, diff --git a/r/R/enums.R b/r/R/enums.R index 170abf99865..ae44ccf2cad 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -134,3 +134,9 @@ MetadataVersion <- enum("MetadataVersion", QuantileInterpolation <- enum("QuantileInterpolation", LINEAR = 0L, LOWER = 1L, HIGHER = 2L, NEAREST = 3L, MIDPOINT = 4L ) + +#' @export +#' @rdname enums +NullEncodingBehavior <- enum("NullEncodingBehavior", + ENCODE = 0L, MASK = 1L +) diff --git a/r/R/expression.R b/r/R/expression.R index ed81418f41c..1974fc7f59b 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -117,6 +117,7 @@ cast_array_expression <- function(x, to_type, safe = TRUE, ...) { .unary_function_map <- list( "!" = "invert", + "as.factor" = "dictionary_encode", "is.na" = "is_null", "is.nan" = "is_nan", # nchar is defined in dplyr.R because it is more complex diff --git a/r/R/type.R b/r/R/type.R index 77e3129601c..ecb9b48a185 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -156,8 +156,8 @@ NestedType <- R6Class("NestedType", inherit = DataType) #' * `float16()` and `halffloat()` #' * `float32()` and `float()` #' * `bool()` and `boolean()` -#' * Called from `schema()` or `struct()`, `double()` also is supported as a -#' way of creating a `float64()` +#' * When called inside an `arrow` function, such as `schema()` or `cast()`, +#' `double()` also is supported as a way of creating a `float64()` #' #' `date32()` creates a datetime type with a "day" unit, like the R `Date` #' class. `date64()` has a "ms" unit. @@ -413,8 +413,8 @@ FixedSizeListType <- R6Class("FixedSizeListType", fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size) as_type <- function(type, name = "type") { + # magic so we don't have to mask base::double() if (identical(type, double())) { - # Magic so that we don't have to mask this base function type <- float64() } if (!inherits(type, "DataType")) { @@ -423,7 +423,6 @@ as_type <- function(type, name = "type") { type } - # vctrs support ----------------------------------------------------------- str_dup <- function(x, times) { paste0(rep(x, times = times), collapse = "") diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index f16e5dd5191..f113455a90d 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -135,8 +135,8 @@ A few functions have aliases: \item \code{float16()} and \code{halffloat()} \item \code{float32()} and \code{float()} \item \code{bool()} and \code{boolean()} -\item Called from \code{schema()} or \code{struct()}, \code{double()} also is supported as a -way of creating a \code{float64()} +\item When called inside an \code{arrow} function, such as \code{schema()} or \code{cast()}, +\code{double()} also is supported as a way of creating a \code{float64()} } \code{date32()} creates a datetime type with a "day" unit, like the R \code{Date} diff --git a/r/man/enums.Rd b/r/man/enums.Rd index fa3c64b8f95..b871516def8 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -14,6 +14,7 @@ \alias{ParquetVersionType} \alias{MetadataVersion} \alias{QuantileInterpolation} +\alias{NullEncodingBehavior} \title{Arrow enums} \format{ An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. @@ -37,6 +38,8 @@ An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) o An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5. An object of class \code{QuantileInterpolation} (inherits from \code{arrow-enum}) of length 5. + +An object of class \code{NullEncodingBehavior} (inherits from \code{arrow-enum}) of length 2. } \usage{ TimeUnit @@ -60,6 +63,8 @@ ParquetVersionType MetadataVersion QuantileInterpolation + +NullEncodingBehavior } \description{ Arrow enums diff --git a/r/src/compute.cpp b/r/src/compute.cpp index f33153069f2..2bc43a9660c 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -202,6 +202,17 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["skip_nulls"])); } + if (func_name == "dictionary_encode") { + using Options = arrow::compute::DictionaryEncodeOptions; + auto out = std::make_shared(Options::Defaults()); + if (!Rf_isNull(options["null_encoding_behavior"])) { + out->null_encoding_behavior = cpp11::as_cpp< + enum arrow::compute::DictionaryEncodeOptions::NullEncodingBehavior>( + options["null_encoding_behavior"]); + } + return out; + } + if (func_name == "cast") { return make_cast_options(options); } diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 7940ce8bf4b..def7886a0bf 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -358,7 +358,70 @@ test_that("relocate with selection helpers", { ) }) -test_that("explicit type conversions", { +test_that("explicit type conversions with cast()", { + num_int32 <- 12L + num_int64 <- bit64::as.integer64(10) + + int_types <- c(int8(), int16(), int32(), int64()) + uint_types <- c(uint8(), uint16(), uint32(), uint64()) + float_types <- c(float32(), float64()) + + types <- c( + int_types, + uint_types, + float_types, + double(), # not actually a type, a base R function but should be alias for float64 + string() + ) + + for (type in types) { + expect_type_equal( + { + t1 <- Table$create(x = num_int32) %>% + transmute(x = cast(x, type)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(type) + ) + expect_type_equal( + { + t1 <- Table$create(x = num_int64) %>% + transmute(x = cast(x, type)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(type) + ) + } + + # Arrow errors when truncating floats... + expect_error( + expect_type_equal( + { + t1 <- Table$create(pi = pi) %>% + transmute(three = cast(pi, int32())) %>% + compute() + t1$schema[[1]]$type + }, + int32() + ), + "truncated" + ) + + # ... unless safe = FALSE (or allow_float_truncate = TRUE) + expect_type_equal( + { + t1 <- Table$create(pi = pi) %>% + transmute(three = cast(pi, int32(), safe = FALSE)) %>% + compute() + t1$schema[[1]]$type + }, + int32() + ) +}) + +test_that("explicit type conversions with as.*()", { library(bit64) expect_dplyr_equal( input %>% @@ -421,12 +484,14 @@ test_that("explicit type conversions", { int2dbl = as.double(int), int2int = as.integer(int), int2lgl = as.logical(int), - lgl2chr = toupper(as.character(lgl)), # Arrow returns "true", "false" + lgl2chr = as.character(lgl), # Arrow returns "true", "false" here ... lgl2dbl = as.double(lgl), lgl2int = as.integer(lgl), - lgl2lgl = as.logical(lgl), + lgl2lgl = as.logical(lgl) ) %>% - collect(), + collect() %>% + # need to use toupper() *after* collect() or else skip if utf8proc not available + mutate(lgl2chr = toupper(lgl2chr)), # ... but we need "TRUE", "FALSE" tibble( dbl = c(1, 0, NA_real_), int = c(1L, 0L, NA_integer_), @@ -435,9 +500,60 @@ test_that("explicit type conversions", { ) }) -test_that("bad explicit type conversions", { +test_that("as.factor()/dictionary_encode()", { + df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")) + df2 <- tibble(x = c(5, 5, 5, NA, 2, 3, 6, 8)) + + expect_dplyr_equal( + input %>% + transmute(x = as.factor(x)) %>% + collect(), + df1 + ) + + expect_warning( + expect_dplyr_equal( + input %>% + transmute(x = as.factor(x)) %>% + collect(), + df2 + ), + "Coercing dictionary values to R character factor levels" + ) + + # dictionary values with default null encoding behavior ("mask") omits + # nulls from the dictionary values + expect_equal( + { + rb1 <- df1 %>% + record_batch() %>% + transmute(x = dictionary_encode(x)) %>% + compute() + dict <- rb1$x$dictionary() + as.vector(dict$Take(dict$SortIndices())) + }, + sort(unique(df1$x), na.last = NA) + ) + + # dictionary values with "encode" null encoding behavior includes nulls in + # the dictionary values + expect_equal( + { + rb1 <- df1 %>% + record_batch() %>% + transmute(x = dictionary_encode(x, null_encoding_behavior = "encode")) %>% + compute() + dict <- rb1$x$dictionary() + as.vector(dict$Take(dict$SortIndices())) + }, + sort(unique(df1$x), na.last = TRUE) + ) + +}) + +test_that("bad explicit type conversions with as.*()", { - # Arrow returns lowercase "true", "false" + # Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R) expect_error( expect_dplyr_equal( input %>% @@ -448,7 +564,21 @@ test_that("bad explicit type conversions", { ) ) - # Arrow fails to parse these strings as Booleans + # Arrow fails to parse these strings as numbers (instead of returning NAs with + # a warning like R does) + expect_error( + expect_warning( + expect_dplyr_equal( + input %>% + transmute(chr2num = as.numeric(chr)) %>% + collect(), + tibble(chr = c("l.O", "S.S", "")) + ) + ) + ) + + # Arrow fails to parse these strings as Booleans (instead of returning NAs + # like R does) expect_error( expect_dplyr_equal( input %>% diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index 8f2cc7c0559..56cef722556 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -68,3 +68,41 @@ test_that("DataType$Equals", { expect_failure(expect_type_equal(a, z), "int32 not equal to double") expect_false(a$Equals(32L)) }) + +test_that("Masked data type functions still work", { + skip("Work around masking of data type functions (ARROW-12322)") + + # Works when type function is masked + string <- rlang::string + expect_type_equal( + Array$create("abc", type = string()), + arrow::string() + ) + rm(string) + + # Works when with non-Arrow function that returns an Arrow type + # when the non-Arrow function has the same name as a base R function... + str <- arrow::string + expect_type_equal( + Array$create("abc", type = str()), + arrow::string() + ) + rm(str) + + # ... and when it has the same name as an Arrow function + type <- arrow::string + expect_type_equal( + Array$create("abc", type = type()), + arrow::string() + ) + rm(type) + + # Works with local variable whose value is an Arrow type + type <- arrow::string() + expect_type_equal( + Array$create("abc", type = type), + arrow::string() + ) + rm(type) + +})