From e68a8e8be01ae298d128e5bae25f5a031c58f13a Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 10:04:11 -0400 Subject: [PATCH 01/23] Fix and improve tests --- r/tests/testthat/test-dplyr.R | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 7940ce8bf4b..aab10d81d88 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -421,12 +421,13 @@ test_that("explicit type conversions", { int2dbl = as.double(int), int2int = as.integer(int), int2lgl = as.logical(int), - lgl2chr = toupper(as.character(lgl)), # Arrow returns "true", "false" + lgl2chr = as.character(lgl), # Arrow returns "true", "false" here ... lgl2dbl = as.double(lgl), lgl2int = as.integer(lgl), - lgl2lgl = as.logical(lgl), + lgl2lgl = as.logical(lgl) ) %>% - collect(), + collect() %>% + mutate(lgl2chr = toupper(lgl2chr)), # ... but we need "TRUE", "FALSE" tibble( dbl = c(1, 0, NA_real_), int = c(1L, 0L, NA_integer_), @@ -437,7 +438,7 @@ test_that("explicit type conversions", { test_that("bad explicit type conversions", { - # Arrow returns lowercase "true", "false" + # Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R) expect_error( expect_dplyr_equal( input %>% @@ -448,7 +449,19 @@ test_that("bad explicit type conversions", { ) ) - # Arrow fails to parse these strings as Booleans + # Arrow fails to parse these strings as numbers (instead of returning NAs with + # a warning like R does) + expect_error( + expect_dplyr_equal( + input %>% + transmute(chr2num = as.numeric(chr)) %>% + collect(), + tibble(chr = c("l.O", "S.S", "")) + ) + ) + + # Arrow fails to parse these strings as Booleans (instead of returning NAs + # like R does) expect_error( expect_dplyr_equal( input %>% From eb22995ba2ff193d89b7d20eaa02defac1ea5146 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 10:05:59 -0400 Subject: [PATCH 02/23] Defuse type in as_type() --- r/R/array.R | 4 ++-- r/R/arrow-datum.R | 2 +- r/R/chunked-array.R | 4 ++-- r/R/field.R | 2 +- r/R/type.R | 13 +++++++++---- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/r/R/array.R b/r/R/array.R index 1d63c5735a7..eb372fb4ff1 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -144,7 +144,7 @@ Array <- R6Class("Array", Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) }, View = function(type) { - Array$create(Array__View(self, as_type(type))) + Array$create(Array__View(self, as_type(enexpr(type)))) }, Validate = function() Array__Validate(self) ), @@ -156,7 +156,7 @@ Array <- R6Class("Array", ) Array$create <- function(x, type = NULL) { if (!is.null(type)) { - type <- as_type(type) + type <- as_type(enexpr(type)) } if (inherits(x, "Scalar")) { out <- x$as_array() diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index dd43307c9cc..a88de90b7e1 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -23,7 +23,7 @@ ArrowDatum <- R6Class("ArrowDatum", inherit = ArrowObject, public = list( cast = function(target_type, safe = TRUE, ...) { opts <- cast_options(safe, ...) - opts$to_type <- as_type(target_type) + opts$to_type <- as_type(enexpr(target_type)) call_function("cast", self, options = opts) } ) diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index a7f9c8f790c..b67e041f6c9 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -98,7 +98,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) }, View = function(type) { - ChunkedArray__View(self, as_type(type)) + ChunkedArray__View(self, as_type(enexpr(type))) }, Validate = function() { ChunkedArray__Validate(self) @@ -120,7 +120,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ChunkedArray$create <- function(..., type = NULL) { if (!is.null(type)) { - type <- as_type(type) + type <- as_type(enexpr(type)) } ChunkedArray__from_list(list2(...), type) } diff --git a/r/R/field.R b/r/R/field.R index 33549d344c5..49c6cd09c0a 100644 --- a/r/R/field.R +++ b/r/R/field.R @@ -55,7 +55,7 @@ Field <- R6Class("Field", inherit = ArrowObject, ) Field$create <- function(name, type, metadata) { assert_that(inherits(name, "character"), length(name) == 1L) - type <- as_type(type, name) + type <- as_type(enexpr(type), name) assert_that(missing(metadata), msg = "metadata= is currently ignored") Field__initialize(enc2utf8(name), type, TRUE) } diff --git a/r/R/type.R b/r/R/type.R index 77e3129601c..b2864ce6b4e 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -413,10 +413,15 @@ FixedSizeListType <- R6Class("FixedSizeListType", fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size) as_type <- function(type, name = "type") { - if (identical(type, double())) { - # Magic so that we don't have to mask this base function - type <- float64() - } + # quotation magic so we don't have to mask base::double() and to work around + # rlang possibly masking string(), etc. + double <- float64 + if (!is.call(type)) type <- enexpr(type) + type <- eval(type) + + # more magic just in case double() was evaluated before we could defuse it + if (identical(type, double())) type <- float64() + if (!inherits(type, "DataType")) { stop(name, " must be a DataType, not ", class(type), call. = FALSE) } From 756b7d3db1bac32313d40618d41efc2344cfe5c9 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 10:06:33 -0400 Subject: [PATCH 03/23] Implement cast() for dplyr --- r/R/dplyr.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 76d95a804a7..e501fed56b8 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -396,6 +396,11 @@ build_function_list <- function(FUN) { # Include mappings from R function name spellings lapply(set_names(names(.array_function_map)), wrapper), # Plus some special handling where it's not 1:1 + cast = function(x, target_type, safe = TRUE, ...) { + opts <- cast_options(safe, ...) + opts$to_type <- as_type(enexpr(target_type)) + FUN("cast", x, options = opts) + }, as.character = function(x) { FUN("cast", x, options = cast_options(to_type = string())) }, From add1b32e2952e3968e9ff5aa939660c7f38f8b3b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 11:52:18 -0400 Subject: [PATCH 04/23] Use eval(substitute()) instead of defusing --- r/R/array.R | 5 +++-- r/R/arrow-datum.R | 2 +- r/R/chunked-array.R | 5 +++-- r/R/dplyr.R | 2 +- r/R/field.R | 2 +- r/R/type.R | 10 ++++------ 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/r/R/array.R b/r/R/array.R index eb372fb4ff1..b8de6c854e7 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -144,7 +144,7 @@ Array <- R6Class("Array", Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) }, View = function(type) { - Array$create(Array__View(self, as_type(enexpr(type)))) + Array$create(Array__View(self, as_type(eval(substitute(type))))) }, Validate = function() Array__Validate(self) ), @@ -155,8 +155,9 @@ Array <- R6Class("Array", ) ) Array$create <- function(x, type = NULL) { + type <- eval(substitute(type)) if (!is.null(type)) { - type <- as_type(enexpr(type)) + type <- as_type(type) } if (inherits(x, "Scalar")) { out <- x$as_array() diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index a88de90b7e1..7560023531f 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -23,7 +23,7 @@ ArrowDatum <- R6Class("ArrowDatum", inherit = ArrowObject, public = list( cast = function(target_type, safe = TRUE, ...) { opts <- cast_options(safe, ...) - opts$to_type <- as_type(enexpr(target_type)) + opts$to_type <- as_type(eval(substitute(target_type))) call_function("cast", self, options = opts) } ) diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index b67e041f6c9..0455fe64d78 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -98,7 +98,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) }, View = function(type) { - ChunkedArray__View(self, as_type(enexpr(type))) + ChunkedArray__View(self, as_type(eval(substitute(type)))) }, Validate = function() { ChunkedArray__Validate(self) @@ -119,8 +119,9 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) ChunkedArray$create <- function(..., type = NULL) { + type <- eval(substitute(type)) if (!is.null(type)) { - type <- as_type(enexpr(type)) + type <- as_type(type) } ChunkedArray__from_list(list2(...), type) } diff --git a/r/R/dplyr.R b/r/R/dplyr.R index e501fed56b8..1bdde0c96c9 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -398,7 +398,7 @@ build_function_list <- function(FUN) { # Plus some special handling where it's not 1:1 cast = function(x, target_type, safe = TRUE, ...) { opts <- cast_options(safe, ...) - opts$to_type <- as_type(enexpr(target_type)) + opts$to_type <- as_type(eval(substitute(target_type))) FUN("cast", x, options = opts) }, as.character = function(x) { diff --git a/r/R/field.R b/r/R/field.R index 49c6cd09c0a..b8645537eca 100644 --- a/r/R/field.R +++ b/r/R/field.R @@ -55,7 +55,7 @@ Field <- R6Class("Field", inherit = ArrowObject, ) Field$create <- function(name, type, metadata) { assert_that(inherits(name, "character"), length(name) == 1L) - type <- as_type(enexpr(type), name) + type <- as_type(eval(substitute(type)), name) assert_that(missing(metadata), msg = "metadata= is currently ignored") Field__initialize(enc2utf8(name), type, TRUE) } diff --git a/r/R/type.R b/r/R/type.R index b2864ce6b4e..5d488cb3a1e 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -413,13 +413,11 @@ FixedSizeListType <- R6Class("FixedSizeListType", fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size) as_type <- function(type, name = "type") { - # quotation magic so we don't have to mask base::double() and to work around - # rlang possibly masking string(), etc. - double <- float64 - if (!is.call(type)) type <- enexpr(type) - type <- eval(type) + # work around other packages possibly masking the Arrow data type functions, + # for example rlang masking string() + type <- eval(substitute(type)) - # more magic just in case double() was evaluated before we could defuse it + # magic so we don't have to mask base::double() if (identical(type, double())) type <- float64() if (!inherits(type, "DataType")) { From ff25124033072394c65635331823060ae2af4a3e Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 11:53:13 -0400 Subject: [PATCH 05/23] Test fix for masked type functions --- r/tests/testthat/test-Array.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 35ae357f703..5145db53413 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -381,6 +381,21 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an Array$create(5, type = "not a type"), "type must be a DataType, not character" ) + + # Works when type function is masked + string <- rlang::string + expect_type_equal( + Array$create("x", type = string()) + ) + rm(string) + + # Works when with non-Arrow function that returns an Arrow type + str <- arrow::string + expect_type_equal( + Array$create("x", type = str()) + ) + rm(str) + }) test_that("Array$create() aborts on overflow", { From 8d50d782dcb76950d390cd77c6df951d11b9f3bf Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 13:36:36 -0400 Subject: [PATCH 06/23] Better method for defusing masked type functions --- r/R/array.R | 5 +++-- r/R/arrow-datum.R | 3 ++- r/R/chunked-array.R | 5 +++-- r/R/dplyr.R | 3 ++- r/R/field.R | 3 ++- r/R/scalar.R | 1 + r/R/type.R | 11 ++++++++++- 7 files changed, 23 insertions(+), 8 deletions(-) diff --git a/r/R/array.R b/r/R/array.R index b8de6c854e7..ad84b6ef135 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -144,7 +144,8 @@ Array <- R6Class("Array", Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) }, View = function(type) { - Array$create(Array__View(self, as_type(eval(substitute(type))))) + type <- unmask_type_fun(enexpr(type)) %||% type + Array$create(Array__View(self, as_type(type))) }, Validate = function() Array__Validate(self) ), @@ -155,7 +156,7 @@ Array <- R6Class("Array", ) ) Array$create <- function(x, type = NULL) { - type <- eval(substitute(type)) + type <- unmask_type_fun(enexpr(type)) %||% type if (!is.null(type)) { type <- as_type(type) } diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index 7560023531f..f152e3262e5 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -22,8 +22,9 @@ ArrowDatum <- R6Class("ArrowDatum", inherit = ArrowObject, public = list( cast = function(target_type, safe = TRUE, ...) { + target_type <- unmask_type_fun(enexpr(target_type)) %||% target_type opts <- cast_options(safe, ...) - opts$to_type <- as_type(eval(substitute(target_type))) + opts$to_type <- as_type(target_type) call_function("cast", self, options = opts) } ) diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 0455fe64d78..0d6be8c925d 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -98,7 +98,8 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) }, View = function(type) { - ChunkedArray__View(self, as_type(eval(substitute(type)))) + type <- unmask_type_fun(enexpr(type)) %||% type + ChunkedArray__View(self, as_type(type)) }, Validate = function() { ChunkedArray__Validate(self) @@ -119,7 +120,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) ChunkedArray$create <- function(..., type = NULL) { - type <- eval(substitute(type)) + type <- unmask_type_fun(enexpr(type)) %||% type if (!is.null(type)) { type <- as_type(type) } diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 1bdde0c96c9..86683940116 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -397,8 +397,9 @@ build_function_list <- function(FUN) { lapply(set_names(names(.array_function_map)), wrapper), # Plus some special handling where it's not 1:1 cast = function(x, target_type, safe = TRUE, ...) { + target_type <- unmask_type_fun(enexpr(target_type)) %||% target_type opts <- cast_options(safe, ...) - opts$to_type <- as_type(eval(substitute(target_type))) + opts$to_type <- as_type(target_type) FUN("cast", x, options = opts) }, as.character = function(x) { diff --git a/r/R/field.R b/r/R/field.R index b8645537eca..bb17998a1e6 100644 --- a/r/R/field.R +++ b/r/R/field.R @@ -55,7 +55,8 @@ Field <- R6Class("Field", inherit = ArrowObject, ) Field$create <- function(name, type, metadata) { assert_that(inherits(name, "character"), length(name) == 1L) - type <- as_type(eval(substitute(type)), name) + type <- unmask_type_fun(enexpr(type)) %||% type + type <- as_type(type, name) assert_that(missing(metadata), msg = "metadata= is currently ignored") Field__initialize(enc2utf8(name), type, TRUE) } diff --git a/r/R/scalar.R b/r/R/scalar.R index cbda5964a2c..02fc701c26b 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -48,6 +48,7 @@ Scalar <- R6Class("Scalar", ) ) Scalar$create <- function(x, type = NULL) { + type <- unmask_type_fun(enexpr(type)) %||% type if (is.null(x)) { x <- vctrs::unspecified(1) } else if (length(x) != 1 && !is.data.frame(x)) { diff --git a/r/R/type.R b/r/R/type.R index 5d488cb3a1e..06dd3fd3388 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -415,7 +415,7 @@ fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_siz as_type <- function(type, name = "type") { # work around other packages possibly masking the Arrow data type functions, # for example rlang masking string() - type <- eval(substitute(type)) + type <- unmask_type_fun(enexpr(type)) %||% type # magic so we don't have to mask base::double() if (identical(type, double())) type <- float64() @@ -426,6 +426,15 @@ as_type <- function(type, name = "type") { type } +unmask_type_fun <- function(expr) { + # if `expr` is an unevaluated function call, try to evaulate it in the arrow + # package environment, and if that fails, return NULL + if (is.call(expr)) { + try(return(eval(expr)), silent = TRUE) + } + NULL +} + # vctrs support ----------------------------------------------------------- str_dup <- function(x, times) { From 20826b2874991c6ed14a9db0423f0d9a98d2a4a9 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 13:58:13 -0400 Subject: [PATCH 07/23] Add tests --- r/tests/testthat/test-dplyr.R | 67 +++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index aab10d81d88..ef80cf058de 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -358,7 +358,70 @@ test_that("relocate with selection helpers", { ) }) -test_that("explicit type conversions", { +test_that("explicit type conversions with cast()", { + num_int32 <- 12L + num_int64 <- bit64::as.integer64(10) + + int_types <- c(int8(), int16(), int32(), int64()) + uint_types <- c(uint8(), uint16(), uint32(), uint64()) + float_types <- c(float32(), float64()) + + types <- c( + int_types, + uint_types, + float_types, + double(), # not actually a type, a base R function but should be alias for float64 + string() + ) + + for (type in types) { + expect_type_equal( + { + t1 <- Table$create(x = num_int32) %>% + transmute(x = cast(x, type)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(type) + ) + expect_type_equal( + { + t1 <- Table$create(x = num_int64) %>% + transmute(x = cast(x, type)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(type) + ) + } + + # Arrow errors when truncating floats... + expect_error( + expect_type_equal( + { + t1 <- Table$create(pi = pi) %>% + transmute(three = cast(pi, int32())) %>% + compute() + t1$schema[[1]]$type + }, + as_type(int32()) + ), + "truncated" + ) + + # ... unless safe = FALSE (or allow_float_truncate = TRUE) + expect_type_equal( + { + t1 <- Table$create(pi = pi) %>% + transmute(three = cast(pi, int32(), safe = FALSE)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(int32()) + ) +}) + +test_that("explicit type conversions with as.*()", { library(bit64) expect_dplyr_equal( input %>% @@ -436,7 +499,7 @@ test_that("explicit type conversions", { ) }) -test_that("bad explicit type conversions", { +test_that("bad explicit type conversions with as.*()", { # Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R) expect_error( From d6887a101d1343c7a0c89811366a2ecb52b12b1c Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 13:58:31 -0400 Subject: [PATCH 08/23] Fix tests --- r/tests/testthat/test-Array.R | 6 ++++-- r/tests/testthat/test-dplyr.R | 12 +++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 5145db53413..a439b46d24f 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -385,14 +385,16 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an # Works when type function is masked string <- rlang::string expect_type_equal( - Array$create("x", type = string()) + Array$create("x", type = string()), + as_type(arrow::string()) ) rm(string) # Works when with non-Arrow function that returns an Arrow type str <- arrow::string expect_type_equal( - Array$create("x", type = str()) + Array$create("x", type = str()), + as_type(arrow::string()) ) rm(str) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index ef80cf058de..828fc188972 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -515,11 +515,13 @@ test_that("bad explicit type conversions with as.*()", { # Arrow fails to parse these strings as numbers (instead of returning NAs with # a warning like R does) expect_error( - expect_dplyr_equal( - input %>% - transmute(chr2num = as.numeric(chr)) %>% - collect(), - tibble(chr = c("l.O", "S.S", "")) + expect_warning( + expect_dplyr_equal( + input %>% + transmute(chr2num = as.numeric(chr)) %>% + collect(), + tibble(chr = c("l.O", "S.S", "")) + ) ) ) From a680180f79d677810dcc6a21b79f258bc7bc59c3 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 13:58:51 -0400 Subject: [PATCH 09/23] Update data type funs docs --- r/R/type.R | 4 ++-- r/man/data-type.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/type.R b/r/R/type.R index 06dd3fd3388..79d7f880d81 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -156,8 +156,8 @@ NestedType <- R6Class("NestedType", inherit = DataType) #' * `float16()` and `halffloat()` #' * `float32()` and `float()` #' * `bool()` and `boolean()` -#' * Called from `schema()` or `struct()`, `double()` also is supported as a -#' way of creating a `float64()` +#' * Called from `schema()` or `struct()` or from `cast()` in a dplyr verb, +#' `double()` also is supported as a way of creating a `float64()` #' #' `date32()` creates a datetime type with a "day" unit, like the R `Date` #' class. `date64()` has a "ms" unit. diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index f16e5dd5191..142c2271a81 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -135,8 +135,8 @@ A few functions have aliases: \item \code{float16()} and \code{halffloat()} \item \code{float32()} and \code{float()} \item \code{bool()} and \code{boolean()} -\item Called from \code{schema()} or \code{struct()}, \code{double()} also is supported as a -way of creating a \code{float64()} +\item Called from \code{schema()} or \code{struct()} or from \code{cast()} in a dplyr verb, +\code{double()} also is supported as a way of creating a \code{float64()} } \code{date32()} creates a datetime type with a "day" unit, like the R \code{Date} From 8d096ecee2808d1bf0682f44cfd90ab7779e6152 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 14:53:14 -0400 Subject: [PATCH 10/23] Fix try catch eval --- r/R/type.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/type.R b/r/R/type.R index 79d7f880d81..55f3a5c8cdc 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -427,10 +427,10 @@ as_type <- function(type, name = "type") { } unmask_type_fun <- function(expr) { - # if `expr` is an unevaluated function call, try to evaulate it in the arrow + # if `expr` is an unevaluated function call, try to evaluate it in the arrow # package environment, and if that fails, return NULL if (is.call(expr)) { - try(return(eval(expr)), silent = TRUE) + return(tryCatch(eval(expr), error = function(e) NULL)) } NULL } From bca42563c0479ff67cfb03aa8e2b5a284ce7b6e9 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 17:09:55 -0400 Subject: [PATCH 11/23] Implement as.factor() and dictionary_encode() for dplyr --- r/NAMESPACE | 1 + r/R/dplyr.R | 10 ++++++++++ r/R/enums.R | 6 ++++++ r/R/expression.R | 1 + r/man/enums.Rd | 5 +++++ r/src/compute.cpp | 11 +++++++++++ 6 files changed, 34 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index fb3ea82c4af..c75f54d946a 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -147,6 +147,7 @@ export(MemoryMappedFile) export(MessageReader) export(MessageType) export(MetadataVersion) +export(NullEncodingBehavior) export(ParquetArrowReaderProperties) export(ParquetFileFormat) export(ParquetFileReader) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 86683940116..f595f4cbac8 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -402,6 +402,16 @@ build_function_list <- function(FUN) { opts$to_type <- as_type(target_type) FUN("cast", x, options = opts) }, + dictionary_encode = function(x, null_encoding_behavior = c("mask", "encode")) { + null_encoding_behavior <- + NullEncodingBehavior[[toupper(match.arg(null_encoding_behavior))]] + FUN( + "dictionary_encode", + x, + options = list(null_encoding_behavior = null_encoding_behavior) + ) + }, + # as.factor() is mapped in expression.R as.character = function(x) { FUN("cast", x, options = cast_options(to_type = string())) }, diff --git a/r/R/enums.R b/r/R/enums.R index 170abf99865..ae44ccf2cad 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -134,3 +134,9 @@ MetadataVersion <- enum("MetadataVersion", QuantileInterpolation <- enum("QuantileInterpolation", LINEAR = 0L, LOWER = 1L, HIGHER = 2L, NEAREST = 3L, MIDPOINT = 4L ) + +#' @export +#' @rdname enums +NullEncodingBehavior <- enum("NullEncodingBehavior", + ENCODE = 0L, MASK = 1L +) diff --git a/r/R/expression.R b/r/R/expression.R index ed81418f41c..1974fc7f59b 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -117,6 +117,7 @@ cast_array_expression <- function(x, to_type, safe = TRUE, ...) { .unary_function_map <- list( "!" = "invert", + "as.factor" = "dictionary_encode", "is.na" = "is_null", "is.nan" = "is_nan", # nchar is defined in dplyr.R because it is more complex diff --git a/r/man/enums.Rd b/r/man/enums.Rd index fa3c64b8f95..b871516def8 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -14,6 +14,7 @@ \alias{ParquetVersionType} \alias{MetadataVersion} \alias{QuantileInterpolation} +\alias{NullEncodingBehavior} \title{Arrow enums} \format{ An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. @@ -37,6 +38,8 @@ An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) o An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5. An object of class \code{QuantileInterpolation} (inherits from \code{arrow-enum}) of length 5. + +An object of class \code{NullEncodingBehavior} (inherits from \code{arrow-enum}) of length 2. } \usage{ TimeUnit @@ -60,6 +63,8 @@ ParquetVersionType MetadataVersion QuantileInterpolation + +NullEncodingBehavior } \description{ Arrow enums diff --git a/r/src/compute.cpp b/r/src/compute.cpp index f33153069f2..2bc43a9660c 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -202,6 +202,17 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["skip_nulls"])); } + if (func_name == "dictionary_encode") { + using Options = arrow::compute::DictionaryEncodeOptions; + auto out = std::make_shared(Options::Defaults()); + if (!Rf_isNull(options["null_encoding_behavior"])) { + out->null_encoding_behavior = cpp11::as_cpp< + enum arrow::compute::DictionaryEncodeOptions::NullEncodingBehavior>( + options["null_encoding_behavior"]); + } + return out; + } + if (func_name == "cast") { return make_cast_options(options); } From 95272bc268a5742ecf3a79828172f0de75801bc7 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 17:10:06 -0400 Subject: [PATCH 12/23] Add and improve tests --- r/tests/testthat/test-Array.R | 4 +-- r/tests/testthat/test-dplyr.R | 52 +++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index a439b46d24f..c62ffd9749c 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -385,7 +385,7 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an # Works when type function is masked string <- rlang::string expect_type_equal( - Array$create("x", type = string()), + Array$create("abc", type = string()), as_type(arrow::string()) ) rm(string) @@ -393,7 +393,7 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an # Works when with non-Arrow function that returns an Arrow type str <- arrow::string expect_type_equal( - Array$create("x", type = str()), + Array$create("abc", type = str()), as_type(arrow::string()) ) rm(str) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 828fc188972..d66cb1ad02e 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -490,6 +490,7 @@ test_that("explicit type conversions with as.*()", { lgl2lgl = as.logical(lgl) ) %>% collect() %>% + # need to use toupper() *after* collect() or else skip if utf8proc not available mutate(lgl2chr = toupper(lgl2chr)), # ... but we need "TRUE", "FALSE" tibble( dbl = c(1, 0, NA_real_), @@ -499,6 +500,57 @@ test_that("explicit type conversions with as.*()", { ) }) +test_that("as.factor()/dictionary_encode()", { + df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")) + df2 <- tibble(x = c(5, 5, 5, NA, 2, 3, 6, 8)) + + expect_dplyr_equal( + input %>% + transmute(x = as.factor(x)) %>% + collect(), + df1 + ) + + expect_warning( + expect_dplyr_equal( + input %>% + transmute(x = as.factor(x)) %>% + collect(), + df2 + ), + "Coercing dictionary values to R character factor levels" + ) + + # dictionary values with default null encoding behavior ("mask") omits + # nulls from the dictionary values + expect_equal( + { + rb1 <- df1 %>% + record_batch() %>% + transmute(x = dictionary_encode(x)) %>% + compute() + dict <- rb1$x$dictionary() + as.vector(dict$Take(dict$SortIndices())) + }, + sort(unique(df1$x), na.last = NA) + ) + + # dictionary values with "encode" null encoding behavior includes nulls in + # the dictionary values + expect_equal( + { + rb1 <- df1 %>% + record_batch() %>% + transmute(x = dictionary_encode(x, null_encoding_behavior = "encode")) %>% + compute() + dict <- rb1$x$dictionary() + as.vector(dict$Take(dict$SortIndices())) + }, + sort(unique(df1$x), na.last = TRUE) + ) + +}) + test_that("bad explicit type conversions with as.*()", { # Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R) From b0e11bc04492a3078c22ab0a96dd27ae9b350943 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 8 Apr 2021 17:11:36 -0400 Subject: [PATCH 13/23] Force CI From f67d7e336e5af98fd2d44a4be1098f101585e652 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 13:11:51 -0400 Subject: [PATCH 14/23] Docs fix Co-authored-by: Neal Richardson --- r/R/type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/type.R b/r/R/type.R index 55f3a5c8cdc..5f739815691 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -156,7 +156,7 @@ NestedType <- R6Class("NestedType", inherit = DataType) #' * `float16()` and `halffloat()` #' * `float32()` and `float()` #' * `bool()` and `boolean()` -#' * Called from `schema()` or `struct()` or from `cast()` in a dplyr verb, +#' * When called inside an `arrow` function, such as `schema()` or `cast()`, #' `double()` also is supported as a way of creating a `float64()` #' #' `date32()` creates a datetime type with a "day" unit, like the R `Date` From aa8085af604b5d369b64285d41e46817ea19edfa Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 13:12:19 -0400 Subject: [PATCH 15/23] Use parens Co-authored-by: Neal Richardson --- r/R/type.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/R/type.R b/r/R/type.R index 5f739815691..47cad0d63fa 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -418,7 +418,9 @@ as_type <- function(type, name = "type") { type <- unmask_type_fun(enexpr(type)) %||% type # magic so we don't have to mask base::double() - if (identical(type, double())) type <- float64() + if (identical(type, double())) { + type <- float64() + } if (!inherits(type, "DataType")) { stop(name, " must be a DataType, not ", class(type), call. = FALSE) From 8de46739e75918747ba322aa0bdb25b275338700 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 13:12:58 -0400 Subject: [PATCH 16/23] Build docs --- r/man/data-type.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index 142c2271a81..f113455a90d 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -135,7 +135,7 @@ A few functions have aliases: \item \code{float16()} and \code{halffloat()} \item \code{float32()} and \code{float()} \item \code{bool()} and \code{boolean()} -\item Called from \code{schema()} or \code{struct()} or from \code{cast()} in a dplyr verb, +\item When called inside an \code{arrow} function, such as \code{schema()} or \code{cast()}, \code{double()} also is supported as a way of creating a \code{float64()} } From afc24fca7f2c2f4c54f3b39f5aeeb5eb65dd5fc1 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 13:14:45 -0400 Subject: [PATCH 17/23] Remove as_type() around int32() Co-authored-by: Neal Richardson --- r/tests/testthat/test-dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index d66cb1ad02e..37c289ec593 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -404,7 +404,7 @@ test_that("explicit type conversions with cast()", { compute() t1$schema[[1]]$type }, - as_type(int32()) + int32() ), "truncated" ) From e4b3be79cf6a594101bd70d7fce4e0bf74bf4162 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 13:18:44 -0400 Subject: [PATCH 18/23] Remove other superfluous as_type() wrapping --- r/tests/testthat/test-Array.R | 4 ++-- r/tests/testthat/test-dplyr.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index c62ffd9749c..be14ce1afa8 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -386,7 +386,7 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an string <- rlang::string expect_type_equal( Array$create("abc", type = string()), - as_type(arrow::string()) + arrow::string() ) rm(string) @@ -394,7 +394,7 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an str <- arrow::string expect_type_equal( Array$create("abc", type = str()), - as_type(arrow::string()) + arrow::string() ) rm(str) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 37c289ec593..def7886a0bf 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -417,7 +417,7 @@ test_that("explicit type conversions with cast()", { compute() t1$schema[[1]]$type }, - as_type(int32()) + int32() ) }) From 347cc0019159da1211398c52d498b36fb5603f7b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 13:33:04 -0400 Subject: [PATCH 19/23] Improve unmask_type_fun() --- r/R/type.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/r/R/type.R b/r/R/type.R index 47cad0d63fa..ff462e33ece 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -429,10 +429,12 @@ as_type <- function(type, name = "type") { } unmask_type_fun <- function(expr) { - # if `expr` is an unevaluated function call, try to evaluate it in the arrow - # package environment, and if that fails, return NULL + # if `expr` is an unevaluated expression, try to evaluate it here in the arrow + # package environment. If that fails or returns something that is not a + # DataType, then return NULL. if (is.call(expr)) { - return(tryCatch(eval(expr), error = function(e) NULL)) + type <- tryCatch(eval(expr), error = function(e) NULL) + if (inherits(type, "DataType")) return(type) } NULL } From a381a7b69712aca2376185e30070be3cc46c522f Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 16:03:04 -0400 Subject: [PATCH 20/23] Encapsulate NSE logic --- r/NAMESPACE | 1 + r/R/array.R | 5 ++--- r/R/arrow-datum.R | 3 +-- r/R/arrow-package.R | 2 +- r/R/chunked-array.R | 5 ++--- r/R/dplyr.R | 3 +-- r/R/field.R | 3 +-- r/R/scalar.R | 3 +-- r/R/type.R | 11 ++++++++--- r/tests/testthat/test-Array.R | 17 +++++++++++++++++ 10 files changed, 35 insertions(+), 18 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index c75f54d946a..1e7e74a7292 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -330,6 +330,7 @@ importFrom(utils,head) importFrom(utils,install.packages) importFrom(utils,modifyList) importFrom(utils,object.size) +importFrom(utils,packageName) importFrom(utils,packageVersion) importFrom(utils,tail) importFrom(vctrs,s3_register) diff --git a/r/R/array.R b/r/R/array.R index ad84b6ef135..af41f13107c 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -144,8 +144,7 @@ Array <- R6Class("Array", Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) }, View = function(type) { - type <- unmask_type_fun(enexpr(type)) %||% type - Array$create(Array__View(self, as_type(type))) + Array$create(Array__View(self, as_type(enexpr(type)))) }, Validate = function() Array__Validate(self) ), @@ -156,7 +155,7 @@ Array <- R6Class("Array", ) ) Array$create <- function(x, type = NULL) { - type <- unmask_type_fun(enexpr(type)) %||% type + type <- enexpr(type) # this needs to be before the null check if (!is.null(type)) { type <- as_type(type) } diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index f152e3262e5..a88de90b7e1 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -22,9 +22,8 @@ ArrowDatum <- R6Class("ArrowDatum", inherit = ArrowObject, public = list( cast = function(target_type, safe = TRUE, ...) { - target_type <- unmask_type_fun(enexpr(target_type)) %||% target_type opts <- cast_options(safe, ...) - opts$to_type <- as_type(target_type) + opts$to_type <- as_type(enexpr(target_type)) call_function("cast", self, options = opts) } ) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index c716ff4a544..c8feda6c22c 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -107,7 +107,7 @@ option_use_threads <- function() { #' statistics from Arrow's memory allocator, and also Arrow's run-time #' information. #' @export -#' @importFrom utils packageVersion +#' @importFrom utils packageName packageVersion arrow_info <- function() { opts <- options() out <- list( diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 0d6be8c925d..d7e79161931 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -98,8 +98,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) }, View = function(type) { - type <- unmask_type_fun(enexpr(type)) %||% type - ChunkedArray__View(self, as_type(type)) + ChunkedArray__View(self, as_type(enexpr(type))) }, Validate = function() { ChunkedArray__Validate(self) @@ -120,7 +119,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) ChunkedArray$create <- function(..., type = NULL) { - type <- unmask_type_fun(enexpr(type)) %||% type + type <- enexpr(type) # this needs to be before the null check if (!is.null(type)) { type <- as_type(type) } diff --git a/r/R/dplyr.R b/r/R/dplyr.R index f595f4cbac8..d66571f5773 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -397,9 +397,8 @@ build_function_list <- function(FUN) { lapply(set_names(names(.array_function_map)), wrapper), # Plus some special handling where it's not 1:1 cast = function(x, target_type, safe = TRUE, ...) { - target_type <- unmask_type_fun(enexpr(target_type)) %||% target_type opts <- cast_options(safe, ...) - opts$to_type <- as_type(target_type) + opts$to_type <- as_type(enexpr(target_type)) FUN("cast", x, options = opts) }, dictionary_encode = function(x, null_encoding_behavior = c("mask", "encode")) { diff --git a/r/R/field.R b/r/R/field.R index bb17998a1e6..49c6cd09c0a 100644 --- a/r/R/field.R +++ b/r/R/field.R @@ -55,8 +55,7 @@ Field <- R6Class("Field", inherit = ArrowObject, ) Field$create <- function(name, type, metadata) { assert_that(inherits(name, "character"), length(name) == 1L) - type <- unmask_type_fun(enexpr(type)) %||% type - type <- as_type(type, name) + type <- as_type(enexpr(type), name) assert_that(missing(metadata), msg = "metadata= is currently ignored") Field__initialize(enc2utf8(name), type, TRUE) } diff --git a/r/R/scalar.R b/r/R/scalar.R index 02fc701c26b..b85891d8d50 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -48,14 +48,13 @@ Scalar <- R6Class("Scalar", ) ) Scalar$create <- function(x, type = NULL) { - type <- unmask_type_fun(enexpr(type)) %||% type if (is.null(x)) { x <- vctrs::unspecified(1) } else if (length(x) != 1 && !is.data.frame(x)) { # Wrap in a list type x <- list(x) } - Array__GetScalar(Array$create(x, type = type), 0) + Array__GetScalar(Array$create(x, type = enexpr(type)), 0) } #' @rdname array diff --git a/r/R/type.R b/r/R/type.R index ff462e33ece..ad3414e3b58 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -415,7 +415,7 @@ fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_siz as_type <- function(type, name = "type") { # work around other packages possibly masking the Arrow data type functions, # for example rlang masking string() - type <- unmask_type_fun(enexpr(type)) %||% type + type <- unmask_type_fun(type) # magic so we don't have to mask base::double() if (identical(type, double())) { @@ -431,12 +431,17 @@ as_type <- function(type, name = "type") { unmask_type_fun <- function(expr) { # if `expr` is an unevaluated expression, try to evaluate it here in the arrow # package environment. If that fails or returns something that is not a - # DataType, then return NULL. + # DataType, then evaluate it in the calling environment outside of the arrow + # package. if (is.call(expr)) { type <- tryCatch(eval(expr), error = function(e) NULL) if (inherits(type, "DataType")) return(type) } - NULL + i <- 2L + while (identical(packageName(parent.frame(i)), "arrow")) { + i <- i + 1L + } + eval.parent(expr, n = i) } diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index be14ce1afa8..9926d8e1f52 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -391,6 +391,7 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an rm(string) # Works when with non-Arrow function that returns an Arrow type + # when the non-Arrow function has the same name as a base R function... str <- arrow::string expect_type_equal( Array$create("abc", type = str()), @@ -398,6 +399,22 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an ) rm(str) + # ... and when it has the same name as an Arrow function + type <- arrow::string + expect_type_equal( + Array$create("abc", type = type()), + arrow::string() + ) + rm(type) + + # Works with local variable whose value is an Arrow type + type <- arrow::string() + expect_type_equal( + Array$create("abc", type = type), + arrow::string() + ) + rm(type) + }) test_that("Array$create() aborts on overflow", { From 98dfe29d7d43137a2b0faec0f5fe3fa699d3f664 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 16:12:11 -0400 Subject: [PATCH 21/23] Allow as_type to return NULL --- r/R/type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/type.R b/r/R/type.R index ad3414e3b58..7acededacd7 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -422,7 +422,7 @@ as_type <- function(type, name = "type") { type <- float64() } - if (!inherits(type, "DataType")) { + if (!is.null(type) && !inherits(type, "DataType")) { stop(name, " must be a DataType, not ", class(type), call. = FALSE) } type From f93659a4c2e825536547508f987036aa427f5145 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 18:42:34 -0400 Subject: [PATCH 22/23] Remove masking prevention code --- r/NAMESPACE | 1 - r/R/array.R | 3 +-- r/R/arrow-datum.R | 2 +- r/R/arrow-package.R | 2 +- r/R/chunked-array.R | 3 +-- r/R/dplyr.R | 2 +- r/R/field.R | 2 +- r/R/scalar.R | 2 +- r/R/type.R | 24 +----------------------- r/tests/testthat/test-Array.R | 34 ---------------------------------- 10 files changed, 8 insertions(+), 67 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 1e7e74a7292..c75f54d946a 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -330,7 +330,6 @@ importFrom(utils,head) importFrom(utils,install.packages) importFrom(utils,modifyList) importFrom(utils,object.size) -importFrom(utils,packageName) importFrom(utils,packageVersion) importFrom(utils,tail) importFrom(vctrs,s3_register) diff --git a/r/R/array.R b/r/R/array.R index af41f13107c..1d63c5735a7 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -144,7 +144,7 @@ Array <- R6Class("Array", Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) }, View = function(type) { - Array$create(Array__View(self, as_type(enexpr(type)))) + Array$create(Array__View(self, as_type(type))) }, Validate = function() Array__Validate(self) ), @@ -155,7 +155,6 @@ Array <- R6Class("Array", ) ) Array$create <- function(x, type = NULL) { - type <- enexpr(type) # this needs to be before the null check if (!is.null(type)) { type <- as_type(type) } diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index a88de90b7e1..dd43307c9cc 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -23,7 +23,7 @@ ArrowDatum <- R6Class("ArrowDatum", inherit = ArrowObject, public = list( cast = function(target_type, safe = TRUE, ...) { opts <- cast_options(safe, ...) - opts$to_type <- as_type(enexpr(target_type)) + opts$to_type <- as_type(target_type) call_function("cast", self, options = opts) } ) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index c8feda6c22c..c716ff4a544 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -107,7 +107,7 @@ option_use_threads <- function() { #' statistics from Arrow's memory allocator, and also Arrow's run-time #' information. #' @export -#' @importFrom utils packageName packageVersion +#' @importFrom utils packageVersion arrow_info <- function() { opts <- options() out <- list( diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index d7e79161931..a7f9c8f790c 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -98,7 +98,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) }, View = function(type) { - ChunkedArray__View(self, as_type(enexpr(type))) + ChunkedArray__View(self, as_type(type)) }, Validate = function() { ChunkedArray__Validate(self) @@ -119,7 +119,6 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, ) ChunkedArray$create <- function(..., type = NULL) { - type <- enexpr(type) # this needs to be before the null check if (!is.null(type)) { type <- as_type(type) } diff --git a/r/R/dplyr.R b/r/R/dplyr.R index d66571f5773..845cb3a1815 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -398,7 +398,7 @@ build_function_list <- function(FUN) { # Plus some special handling where it's not 1:1 cast = function(x, target_type, safe = TRUE, ...) { opts <- cast_options(safe, ...) - opts$to_type <- as_type(enexpr(target_type)) + opts$to_type <- as_type(target_type) FUN("cast", x, options = opts) }, dictionary_encode = function(x, null_encoding_behavior = c("mask", "encode")) { diff --git a/r/R/field.R b/r/R/field.R index 49c6cd09c0a..33549d344c5 100644 --- a/r/R/field.R +++ b/r/R/field.R @@ -55,7 +55,7 @@ Field <- R6Class("Field", inherit = ArrowObject, ) Field$create <- function(name, type, metadata) { assert_that(inherits(name, "character"), length(name) == 1L) - type <- as_type(enexpr(type), name) + type <- as_type(type, name) assert_that(missing(metadata), msg = "metadata= is currently ignored") Field__initialize(enc2utf8(name), type, TRUE) } diff --git a/r/R/scalar.R b/r/R/scalar.R index b85891d8d50..cbda5964a2c 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -54,7 +54,7 @@ Scalar$create <- function(x, type = NULL) { # Wrap in a list type x <- list(x) } - Array__GetScalar(Array$create(x, type = enexpr(type)), 0) + Array__GetScalar(Array$create(x, type = type), 0) } #' @rdname array diff --git a/r/R/type.R b/r/R/type.R index 7acededacd7..ecb9b48a185 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -413,38 +413,16 @@ FixedSizeListType <- R6Class("FixedSizeListType", fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size) as_type <- function(type, name = "type") { - # work around other packages possibly masking the Arrow data type functions, - # for example rlang masking string() - type <- unmask_type_fun(type) - # magic so we don't have to mask base::double() if (identical(type, double())) { type <- float64() } - - if (!is.null(type) && !inherits(type, "DataType")) { + if (!inherits(type, "DataType")) { stop(name, " must be a DataType, not ", class(type), call. = FALSE) } type } -unmask_type_fun <- function(expr) { - # if `expr` is an unevaluated expression, try to evaluate it here in the arrow - # package environment. If that fails or returns something that is not a - # DataType, then evaluate it in the calling environment outside of the arrow - # package. - if (is.call(expr)) { - type <- tryCatch(eval(expr), error = function(e) NULL) - if (inherits(type, "DataType")) return(type) - } - i <- 2L - while (identical(packageName(parent.frame(i)), "arrow")) { - i <- i + 1L - } - eval.parent(expr, n = i) -} - - # vctrs support ----------------------------------------------------------- str_dup <- function(x, times) { paste0(rep(x, times = times), collapse = "") diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 9926d8e1f52..35ae357f703 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -381,40 +381,6 @@ test_that("Array$create() supports the type= argument. conversion from INTSXP an Array$create(5, type = "not a type"), "type must be a DataType, not character" ) - - # Works when type function is masked - string <- rlang::string - expect_type_equal( - Array$create("abc", type = string()), - arrow::string() - ) - rm(string) - - # Works when with non-Arrow function that returns an Arrow type - # when the non-Arrow function has the same name as a base R function... - str <- arrow::string - expect_type_equal( - Array$create("abc", type = str()), - arrow::string() - ) - rm(str) - - # ... and when it has the same name as an Arrow function - type <- arrow::string - expect_type_equal( - Array$create("abc", type = type()), - arrow::string() - ) - rm(type) - - # Works with local variable whose value is an Arrow type - type <- arrow::string() - expect_type_equal( - Array$create("abc", type = type), - arrow::string() - ) - rm(type) - }) test_that("Array$create() aborts on overflow", { From 8e3f80fc17c8859de58a1157bd9ae9f7b013bb17 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 9 Apr 2021 18:42:49 -0400 Subject: [PATCH 23/23] Skip masking prevention tests --- r/tests/testthat/test-type.R | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index 8f2cc7c0559..56cef722556 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -68,3 +68,41 @@ test_that("DataType$Equals", { expect_failure(expect_type_equal(a, z), "int32 not equal to double") expect_false(a$Equals(32L)) }) + +test_that("Masked data type functions still work", { + skip("Work around masking of data type functions (ARROW-12322)") + + # Works when type function is masked + string <- rlang::string + expect_type_equal( + Array$create("abc", type = string()), + arrow::string() + ) + rm(string) + + # Works when with non-Arrow function that returns an Arrow type + # when the non-Arrow function has the same name as a base R function... + str <- arrow::string + expect_type_equal( + Array$create("abc", type = str()), + arrow::string() + ) + rm(str) + + # ... and when it has the same name as an Arrow function + type <- arrow::string + expect_type_equal( + Array$create("abc", type = type()), + arrow::string() + ) + rm(type) + + # Works with local variable whose value is an Arrow type + type <- arrow::string() + expect_type_equal( + Array$create("abc", type = type), + arrow::string() + ) + rm(type) + +})