Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ export(MemoryMappedFile)
export(MessageReader)
export(MessageType)
export(MetadataVersion)
export(NullEncodingBehavior)
export(ParquetArrowReaderProperties)
export(ParquetFileFormat)
export(ParquetFileReader)
Expand Down
15 changes: 15 additions & 0 deletions r/R/dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,21 @@ build_function_list <- function(FUN) {
# Include mappings from R function name spellings
lapply(set_names(names(.array_function_map)), wrapper),
# Plus some special handling where it's not 1:1
cast = function(x, target_type, safe = TRUE, ...) {
opts <- cast_options(safe, ...)
opts$to_type <- as_type(target_type)
FUN("cast", x, options = opts)
},
dictionary_encode = function(x, null_encoding_behavior = c("mask", "encode")) {
null_encoding_behavior <-
NullEncodingBehavior[[toupper(match.arg(null_encoding_behavior))]]
FUN(
"dictionary_encode",
x,
options = list(null_encoding_behavior = null_encoding_behavior)
)
},
# as.factor() is mapped in expression.R
as.character = function(x) {
FUN("cast", x, options = cast_options(to_type = string()))
},
Expand Down
6 changes: 6 additions & 0 deletions r/R/enums.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,9 @@ MetadataVersion <- enum("MetadataVersion",
QuantileInterpolation <- enum("QuantileInterpolation",
LINEAR = 0L, LOWER = 1L, HIGHER = 2L, NEAREST = 3L, MIDPOINT = 4L
)

#' @export
#' @rdname enums
NullEncodingBehavior <- enum("NullEncodingBehavior",
ENCODE = 0L, MASK = 1L
)
1 change: 1 addition & 0 deletions r/R/expression.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ cast_array_expression <- function(x, to_type, safe = TRUE, ...) {

.unary_function_map <- list(
"!" = "invert",
"as.factor" = "dictionary_encode",
"is.na" = "is_null",
"is.nan" = "is_nan",
# nchar is defined in dplyr.R because it is more complex
Expand Down
7 changes: 3 additions & 4 deletions r/R/type.R
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ NestedType <- R6Class("NestedType", inherit = DataType)
#' * `float16()` and `halffloat()`
#' * `float32()` and `float()`
#' * `bool()` and `boolean()`
#' * Called from `schema()` or `struct()`, `double()` also is supported as a
#' way of creating a `float64()`
#' * When called inside an `arrow` function, such as `schema()` or `cast()`,
#' `double()` also is supported as a way of creating a `float64()`
#'
#' `date32()` creates a datetime type with a "day" unit, like the R `Date`
#' class. `date64()` has a "ms" unit.
Expand Down Expand Up @@ -413,8 +413,8 @@ FixedSizeListType <- R6Class("FixedSizeListType",
fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size)

as_type <- function(type, name = "type") {
# magic so we don't have to mask base::double()
if (identical(type, double())) {
# Magic so that we don't have to mask this base function
type <- float64()
}
if (!inherits(type, "DataType")) {
Expand All @@ -423,7 +423,6 @@ as_type <- function(type, name = "type") {
type
}


# vctrs support -----------------------------------------------------------
str_dup <- function(x, times) {
paste0(rep(x, times = times), collapse = "")
Expand Down
4 changes: 2 additions & 2 deletions r/man/data-type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions r/man/enums.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions r/src/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,17 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
cpp11::as_cpp<bool>(options["skip_nulls"]));
}

if (func_name == "dictionary_encode") {
using Options = arrow::compute::DictionaryEncodeOptions;
auto out = std::make_shared<Options>(Options::Defaults());
if (!Rf_isNull(options["null_encoding_behavior"])) {
out->null_encoding_behavior = cpp11::as_cpp<
enum arrow::compute::DictionaryEncodeOptions::NullEncodingBehavior>(
options["null_encoding_behavior"]);
}
return out;
}

if (func_name == "cast") {
return make_cast_options(options);
}
Expand Down
144 changes: 137 additions & 7 deletions r/tests/testthat/test-dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,70 @@ test_that("relocate with selection helpers", {
)
})

test_that("explicit type conversions", {
test_that("explicit type conversions with cast()", {
num_int32 <- 12L
num_int64 <- bit64::as.integer64(10)

int_types <- c(int8(), int16(), int32(), int64())
uint_types <- c(uint8(), uint16(), uint32(), uint64())
float_types <- c(float32(), float64())

types <- c(
int_types,
uint_types,
float_types,
double(), # not actually a type, a base R function but should be alias for float64
string()
)

for (type in types) {
expect_type_equal(
{
t1 <- Table$create(x = num_int32) %>%
transmute(x = cast(x, type)) %>%
compute()
t1$schema[[1]]$type
},
as_type(type)
)
expect_type_equal(
{
t1 <- Table$create(x = num_int64) %>%
transmute(x = cast(x, type)) %>%
compute()
t1$schema[[1]]$type
},
as_type(type)
)
}

# Arrow errors when truncating floats...
expect_error(
expect_type_equal(
{
t1 <- Table$create(pi = pi) %>%
transmute(three = cast(pi, int32())) %>%
compute()
t1$schema[[1]]$type
},
int32()
),
"truncated"
)

# ... unless safe = FALSE (or allow_float_truncate = TRUE)
expect_type_equal(
{
t1 <- Table$create(pi = pi) %>%
transmute(three = cast(pi, int32(), safe = FALSE)) %>%
compute()
t1$schema[[1]]$type
},
int32()
)
})

test_that("explicit type conversions with as.*()", {
library(bit64)
expect_dplyr_equal(
input %>%
Expand Down Expand Up @@ -421,12 +484,14 @@ test_that("explicit type conversions", {
int2dbl = as.double(int),
int2int = as.integer(int),
int2lgl = as.logical(int),
lgl2chr = toupper(as.character(lgl)), # Arrow returns "true", "false"
lgl2chr = as.character(lgl), # Arrow returns "true", "false" here ...
lgl2dbl = as.double(lgl),
lgl2int = as.integer(lgl),
lgl2lgl = as.logical(lgl),
lgl2lgl = as.logical(lgl)
) %>%
collect(),
collect() %>%
# need to use toupper() *after* collect() or else skip if utf8proc not available
mutate(lgl2chr = toupper(lgl2chr)), # ... but we need "TRUE", "FALSE"
tibble(
dbl = c(1, 0, NA_real_),
int = c(1L, 0L, NA_integer_),
Expand All @@ -435,9 +500,60 @@ test_that("explicit type conversions", {
)
})

test_that("bad explicit type conversions", {
test_that("as.factor()/dictionary_encode()", {
df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B"))
df2 <- tibble(x = c(5, 5, 5, NA, 2, 3, 6, 8))

expect_dplyr_equal(
input %>%
transmute(x = as.factor(x)) %>%
collect(),
df1
)

expect_warning(
expect_dplyr_equal(
input %>%
transmute(x = as.factor(x)) %>%
collect(),
df2
),
"Coercing dictionary values to R character factor levels"
)

# dictionary values with default null encoding behavior ("mask") omits
# nulls from the dictionary values
expect_equal(
{
rb1 <- df1 %>%
record_batch() %>%
transmute(x = dictionary_encode(x)) %>%
compute()
dict <- rb1$x$dictionary()
as.vector(dict$Take(dict$SortIndices()))
},
sort(unique(df1$x), na.last = NA)
)

# dictionary values with "encode" null encoding behavior includes nulls in
# the dictionary values
expect_equal(
{
rb1 <- df1 %>%
record_batch() %>%
transmute(x = dictionary_encode(x, null_encoding_behavior = "encode")) %>%
compute()
dict <- rb1$x$dictionary()
as.vector(dict$Take(dict$SortIndices()))
},
sort(unique(df1$x), na.last = TRUE)
)

})

test_that("bad explicit type conversions with as.*()", {

# Arrow returns lowercase "true", "false"
# Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R)
expect_error(
expect_dplyr_equal(
input %>%
Expand All @@ -448,7 +564,21 @@ test_that("bad explicit type conversions", {
)
)

# Arrow fails to parse these strings as Booleans
# Arrow fails to parse these strings as numbers (instead of returning NAs with
# a warning like R does)
expect_error(
expect_warning(
expect_dplyr_equal(
input %>%
transmute(chr2num = as.numeric(chr)) %>%
collect(),
tibble(chr = c("l.O", "S.S", ""))
)
)
)

# Arrow fails to parse these strings as Booleans (instead of returning NAs
# like R does)
expect_error(
expect_dplyr_equal(
input %>%
Expand Down
38 changes: 38 additions & 0 deletions r/tests/testthat/test-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,41 @@ test_that("DataType$Equals", {
expect_failure(expect_type_equal(a, z), "int32 not equal to double")
expect_false(a$Equals(32L))
})

test_that("Masked data type functions still work", {
skip("Work around masking of data type functions (ARROW-12322)")

# Works when type function is masked
string <- rlang::string
expect_type_equal(
Array$create("abc", type = string()),
arrow::string()
)
rm(string)

# Works when with non-Arrow function that returns an Arrow type
# when the non-Arrow function has the same name as a base R function...
str <- arrow::string
expect_type_equal(
Array$create("abc", type = str()),
arrow::string()
)
rm(str)

# ... and when it has the same name as an Arrow function
type <- arrow::string
expect_type_equal(
Array$create("abc", type = type()),
arrow::string()
)
rm(type)

# Works with local variable whose value is an Arrow type
type <- arrow::string()
expect_type_equal(
Array$create("abc", type = type),
arrow::string()
)
rm(type)

})