From 2855efd196d5ce52ca1412a60c0e766e18250859 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 14 May 2021 15:02:07 -0400 Subject: [PATCH 01/22] Add type_id() R6 methods for ChunkedArray, Scalar --- r/R/chunked-array.R | 1 + r/R/scalar.R | 1 + 2 files changed, 2 insertions(+) diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index d03db3047fd..61093e203e7 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -80,6 +80,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowDatum, public = list( length = function() ChunkedArray__length(self), + type_id = function() ChunkedArray__type(self)$id, chunk = function(i) Array$create(ChunkedArray__chunk(self, i)), as_vector = function() ChunkedArray__as_vector(self), Slice = function(offset, length = NULL){ diff --git a/r/R/scalar.R b/r/R/scalar.R index 9865315ee56..40e9c65ce71 100644 --- a/r/R/scalar.R +++ b/r/R/scalar.R @@ -56,6 +56,7 @@ Scalar <- R6Class("Scalar", # TODO: document the methods public = list( ToString = function() Scalar__ToString(self), + type_id = function() Scalar__type(self)$id, as_vector = function() Scalar__as_vector(self), as_array = function() MakeArrayFromScalar(self), Equals = function(other, ...) { From 87b8e722304fa0932e8150739aeddbe792c9d1f3 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 14 May 2021 15:06:32 -0400 Subject: [PATCH 02/22] Store schema in Expression R6 obj and use bind() to bind to a schema --- r/R/dplyr.R | 6 ++++-- r/R/expression.R | 9 ++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 56be8cff1db..c22ef8e5602 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -76,9 +76,11 @@ print.arrow_dplyr_query <- function(x, ...) { # Just a field_ref, so look up in the schema schm$GetFieldByName(name)$type$ToString() } else { - # Expression, so get its type and append the expression + # Expression, so get its type and append the expression. + # Need to bind the expression to the schema before finding its type. + expr$bind(schm) paste0( - expr$type(schm)$ToString(), + expr$type()$ToString(), " (", expr$ToString(), ")" ) } diff --git a/r/R/expression.R b/r/R/expression.R index 99d98b6af0a..fb65e78a916 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -75,7 +75,14 @@ Expression <- R6Class("Expression", inherit = ArrowObject, public = list( ToString = function() compute___expr__ToString(self), - type = function(schema) compute___expr__type(self, schema), + schema = NULL, + bind = function(schema) self$schema <- schema, + type = function() { + if (is.null(self$schema)) { + stop("Must bind() expression to a schema before returning its type", call. = FALSE) + } + compute___expr__type(self, self$schema) + }, cast = function(to_type, safe = TRUE, ...) { opts <- list( to_type = to_type, From 86d0ab74bbd7b4fb25e64fc003d6fb3789a7907c Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 14 May 2021 15:07:14 -0400 Subject: [PATCH 03/22] Add type_id() R6 method for Expression --- r/R/expression.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/expression.R b/r/R/expression.R index fb65e78a916..c0436a6060a 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -83,6 +83,7 @@ Expression <- R6Class("Expression", inherit = ArrowObject, } compute___expr__type(self, self$schema) }, + type_id = function() self$type()$id, cast = function(to_type, safe = TRUE, ...) { opts <- list( to_type = to_type, From 7f80a21145930cfea7208365afb359dab6aad2e9 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 14 May 2021 16:07:46 -0400 Subject: [PATCH 04/22] Add is.() functions --- r/R/dplyr-eval.R | 3 +++ r/R/dplyr-functions.R | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 2d19bd4cb90..5ed1818d3f9 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -86,6 +86,9 @@ arrow_mask <- function(.data) { f_env[[f]] <- fail } + # Bind the schema to the expressions + map(.data$selected_columns, ~.$bind(.data$.data$schema)) + # Add the column references and make the mask out <- new_data_mask( new_environment(.data$selected_columns, parent = f_env), diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 7e0eadfdcea..adfcf1cccf6 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -121,6 +121,39 @@ nse_funcs$as.numeric <- function(x) { Expression$create("cast", x, options = cast_options(to_type = float64())) } +# is.* type functions +nse_funcs$is.character <- function(x) { + x$type_id() %in% Type[c("STRING", "LARGE_STRING")] +} + +nse_funcs$is.numeric <- function(x) { + x$type_id() %in% c(2:12, 23:24) +} + +nse_funcs$is.double <- function(x) { + x$type_id() %in% Type["DOUBLE"] +} + +nse_funcs$is.integer <- function(x) { + x$type_id() %in% c(2:9) +} + +nse_funcs$is.integer64 <- function(x) { + x$type_id() %in% Type[c("UINT64", "INT64")] +} + +nse_funcs$is.logical <- function(x) { + x$type_id() %in% Type["BOOL"] +} + +nse_funcs$is.factor <- function(x) { + x$type_id() == Type["DICTIONARY"] +} + +nse_funcs$is.list <- function(x) { + x$type_id() %in% Type[c("LIST", "FIXED_SIZE_LIST", "LARGE_LIST")] +} + # String functions nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) { if (allowNA) { From 967485a2344633f104e684f3fc68a2594f01fb7f Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 14 May 2021 16:29:03 -0400 Subject: [PATCH 05/22] Add is.() tests --- r/tests/testthat/test-dplyr.R | 79 +++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index bf5f06b038c..612c0e86e36 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -524,6 +524,85 @@ test_that("is.finite(), is.infinite(), is.nan()", { ) }) +test_that("type checks with as.*()", { + library(bit64) + expect_dplyr_equal( + input %>% + transmute( + chr_is_chr = is.character(chr), + chr_is_dbl = is.double(chr), + chr_is_fct = is.factor(chr), + chr_is_int = is.integer(chr), + chr_is_i64 = is.integer64(chr), + chr_is_lst = is.list(chr), + chr_is_lgl = is.logical(chr), + chr_is_num = is.numeric(chr), + dbl_is_chr = is.character(dbl), + dbl_is_dbl = is.double(dbl), + dbl_is_fct = is.factor(dbl), + dbl_is_int = is.integer(dbl), + dbl_is_i64 = is.integer64(dbl), + dbl_is_lst = is.list(dbl), + dbl_is_lgl = is.logical(dbl), + dbl_is_num = is.numeric(dbl), + fct_is_chr = is.character(fct), + fct_is_dbl = is.double(fct), + fct_is_fct = is.factor(fct), + fct_is_int = is.integer(fct), + fct_is_i64 = is.integer64(fct), + fct_is_lst = is.list(fct), + fct_is_lgl = is.logical(fct), + fct_is_num = is.numeric(fct), + int_is_chr = is.character(int), + int_is_dbl = is.double(int), + int_is_fct = is.factor(int), + int_is_int = is.integer(int), + int_is_i64 = is.integer64(int), + int_is_lst = is.list(int), + int_is_lgl = is.logical(int), + int_is_num = is.numeric(int), + lgl_is_chr = is.character(lgl), + lgl_is_dbl = is.double(lgl), + lgl_is_fct = is.factor(lgl), + lgl_is_int = is.integer(lgl), + lgl_is_i64 = is.integer64(lgl), + lgl_is_lst = is.list(lgl), + lgl_is_lgl = is.logical(lgl), + lgl_is_num = is.numeric(lgl) + ) %>% + collect(), + tbl + ) + expect_dplyr_equal( + input %>% + transmute( + i64_is_chr = is.character(i64), + # TODO: investigate why this is not matching when testthat runs it + #i64_is_dbl = is.double(i64), + i64_is_fct = is.factor(i64), + # we want Arrow to return TRUE, but bit64 returns FALSE + #i64_is_int = is.integer(i64), + i64_is_i64 = is.integer64(i64), + i64_is_lst = is.list(i64), + i64_is_lgl = is.logical(i64), + i64_is_num = is.numeric(i64), + lst_is_chr = is.character(lst), + lst_is_dbl = is.double(lst), + lst_is_fct = is.factor(lst), + lst_is_int = is.integer(lst), + lst_is_i64 = is.integer64(lst), + lst_is_lst = is.list(lst), + lst_is_lgl = is.logical(lst), + lst_is_num = is.numeric(lst) + ) %>% + collect(), + tibble( + i64 = as.integer64(1:3), + lst = list(c("a", "b"), c("d", "e"), c("f", "g")) + ) + ) +}) + test_that("as.factor()/dictionary_encode()", { skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression {x=dictionary_encode(x, {NON-REPRESENTABLE OPTIONS})}") df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")) From d2db003192248722edd7e3b880e34e7b948cc505 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 14 May 2021 16:30:21 -0400 Subject: [PATCH 06/22] Add support for tidyselect where() helper in relocate() --- r/NAMESPACE | 1 + r/R/arrow-package.R | 2 +- r/R/dplyr-select.R | 15 ++++++++++----- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 1510ad89d26..991e384723e 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -300,6 +300,7 @@ importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,abort) importFrom(rlang,as_label) +importFrom(rlang,caller_env) importFrom(rlang,dots_n) importFrom(rlang,enexpr) importFrom(rlang,enexprs) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index e557f869325..c263d20f8df 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -19,7 +19,7 @@ #' @importFrom R6 R6Class #' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep #' @importFrom assertthat assert_that is.string -#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos eval_tidy new_data_mask syms env new_environment env_bind as_label set_names exec is_bare_character quo_get_expr quo_set_expr .data seq2 is_quosure enexpr enexprs expr +#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos eval_tidy new_data_mask syms env new_environment env_bind as_label set_names exec is_bare_character quo_get_expr quo_set_expr .data seq2 is_quosure enexpr enexprs expr caller_env #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index 3730fe63fec..2cbf680c777 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -59,11 +59,16 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL # The code in this function is adapted from the code in dplyr::relocate.data.frame # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829 - check_select_helpers(c(enexprs(...), enexpr(.before), enexpr(.after))) .data <- arrow_dplyr_query(.data) - to_move <- eval_select(expr(c(...)), .data$selected_columns) + # Bind the schema to the expressions + map(.data$selected_columns, ~.$bind(.data$.data$schema)) + + # Create a mask for evaluating expressions in tidyselect helpers + mask <- new_environment(.cache$functions, parent = caller_env()) + + to_move <- eval_select(substitute(c(...)), .data$selected_columns, mask) .before <- enquo(.before) .after <- enquo(.after) @@ -73,12 +78,12 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL if (has_before && has_after) { abort("Must supply only one of `.before` and `.after`.") } else if (has_before) { - where <- min(unname(eval_select(.before, .data$selected_columns))) + where <- min(unname(eval_select(quo_get_expr(.before), .data$selected_columns, mask))) if (!where %in% to_move) { to_move <- c(to_move, where) } } else if (has_after) { - where <- max(unname(eval_select(.after, .data$selected_columns))) + where <- max(unname(eval_select(quo_get_expr(.after), .data$selected_columns, mask))) if (!where %in% to_move) { to_move <- c(where, to_move) } @@ -117,4 +122,4 @@ check_select_helpers <- function(exprs) { call. = FALSE ) } -} \ No newline at end of file +} From b68ef06b9bc5a71b1a770e23da492a7f1270b735 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 14 May 2021 16:31:13 -0400 Subject: [PATCH 07/22] Update tests --- r/tests/testthat/test-dplyr.R | 19 ++++++++++--------- r/tests/testthat/test-expression.R | 13 ++++++++++--- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 612c0e86e36..a309a6aa14e 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -343,21 +343,22 @@ test_that("relocate", { }) test_that("relocate with selection helpers", { + df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a") expect_dplyr_equal( input %>% relocate(any_of(c("a", "e", "i", "o", "u"))) %>% collect(), df ) - expect_error( - df %>% Table$create() %>% relocate(where(is.character)), - "Unsupported selection helper" + expect_dplyr_equal( + input %>% relocate(where(is.character)) %>% collect(), + df ) - expect_error( - df %>% Table$create() %>% relocate(a, b, c, .after = where(is.character)), - "Unsupported selection helper" + expect_dplyr_equal( + input %>% relocate(a, b, c, .after = where(is.character)) %>% collect(), + df ) - expect_error( - df %>% Table$create() %>% relocate(d, e, f, .before = where(is.numeric)), - "Unsupported selection helper" + expect_dplyr_equal( + input %>% relocate(d, e, f, .before = where(is.numeric)) %>% collect(), + df ) }) diff --git a/r/tests/testthat/test-expression.R b/r/tests/testthat/test-expression.R index d0459fde5b5..141c16e30cc 100644 --- a/r/tests/testthat/test-expression.R +++ b/r/tests/testthat/test-expression.R @@ -49,13 +49,20 @@ test_that("C++ expressions", { fixed = TRUE ) expect_type_equal( - f$type(schema(f = float64())), + { + f$bind(schema(f = float64())) + f$type() + }, float64() ) expect_type_equal( - (f > 4)$type(schema(f = float64())), + { + f_gt_4 <- f > 4 + f_gt_4$bind(schema(f = float64())) + f_gt_4$type() + }, bool() ) # Interprets that as a list type expect_r6_class(f == c(1L, 2L), "Expression") -}) \ No newline at end of file +}) From b8016fd2fe8fbd63f939b640141b0ad88a5c6d0b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 11:42:48 -0400 Subject: [PATCH 08/22] Make as.integer() return false for UINT64 Co-authored-by: Neal Richardson --- r/R/dplyr-functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index adfcf1cccf6..d08ee928a49 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -139,7 +139,7 @@ nse_funcs$is.integer <- function(x) { } nse_funcs$is.integer64 <- function(x) { - x$type_id() %in% Type[c("UINT64", "INT64")] + x$type_id() %in% Type["INT64"] } nse_funcs$is.logical <- function(x) { From 55d7dd6059f69bb4091befc2e87b64548c718aee Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 18:02:32 -0400 Subject: [PATCH 09/22] Enumerate all types --- r/R/dplyr-functions.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index d08ee928a49..45b7b2b5c22 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -127,7 +127,9 @@ nse_funcs$is.character <- function(x) { } nse_funcs$is.numeric <- function(x) { - x$type_id() %in% c(2:12, 23:24) + x$type_id() %in% Type[c("UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", + "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE", + "DECIMAL", "DECIMAL256")] } nse_funcs$is.double <- function(x) { @@ -135,7 +137,8 @@ nse_funcs$is.double <- function(x) { } nse_funcs$is.integer <- function(x) { - x$type_id() %in% c(2:9) + x$type_id() %in% Type[c("UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", + "UINT64", "INT64")] } nse_funcs$is.integer64 <- function(x) { From eea39e25cbbb28e828e886922103fc58dfdfafd8 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 18:36:05 -0400 Subject: [PATCH 10/22] Tweak Expression$type(), $type_id() --- r/R/arrowExports.R | 4 ++++ r/R/dplyr-eval.R | 4 ++-- r/R/dplyr-select.R | 4 ++-- r/R/dplyr.R | 3 +-- r/R/expression.R | 14 +++++++------- r/src/arrowExports.cpp | 17 +++++++++++++++++ r/src/expression.cpp | 8 ++++++++ r/tests/testthat/test-expression.R | 11 ++--------- 8 files changed, 43 insertions(+), 22 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 038467fcad0..55a28529f85 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -812,6 +812,10 @@ compute___expr__type <- function(x, schema){ .Call(`_arrow_compute___expr__type`, x, schema) } +compute___expr__type_id <- function(x, schema){ + .Call(`_arrow_compute___expr__type_id`, x, schema) +} + ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level){ invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level)) } diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 5ed1818d3f9..f7c5b5f9dae 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -86,8 +86,8 @@ arrow_mask <- function(.data) { f_env[[f]] <- fail } - # Bind the schema to the expressions - map(.data$selected_columns, ~.$bind(.data$.data$schema)) + # Assign the schema to the expressions + map(.data$selected_columns, ~.$schema <- .data$.data$schema) # Add the column references and make the mask out <- new_data_mask( diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index 2cbf680c777..deaa2c99c8d 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -62,8 +62,8 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL .data <- arrow_dplyr_query(.data) - # Bind the schema to the expressions - map(.data$selected_columns, ~.$bind(.data$.data$schema)) + # Assign the schema to the expressions + map(.data$selected_columns, ~.$schema <- .data$.data$schema) # Create a mask for evaluating expressions in tidyselect helpers mask <- new_environment(.cache$functions, parent = caller_env()) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index c22ef8e5602..d9c916ba90a 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -78,9 +78,8 @@ print.arrow_dplyr_query <- function(x, ...) { } else { # Expression, so get its type and append the expression. # Need to bind the expression to the schema before finding its type. - expr$bind(schm) paste0( - expr$type()$ToString(), + expr$type(schm)$ToString(), " (", expr$ToString(), ")" ) } diff --git a/r/R/expression.R b/r/R/expression.R index c0436a6060a..417a12eeb81 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -76,14 +76,14 @@ Expression <- R6Class("Expression", inherit = ArrowObject, public = list( ToString = function() compute___expr__ToString(self), schema = NULL, - bind = function(schema) self$schema <- schema, - type = function() { - if (is.null(self$schema)) { - stop("Must bind() expression to a schema before returning its type", call. = FALSE) - } - compute___expr__type(self, self$schema) + type = function(schema = self$schema) { + assert_that(!is.null(schema)) + compute___expr__type(self, schema) + }, + type_id = function(schema = self$schema) { + assert_that(!is.null(schema)) + compute___expr__type_id(self, schema) }, - type_id = function() self$type()$id, cast = function(to_type, safe = TRUE, ...) { opts <- list( to_type = to_type, diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 0a9b8394e4b..b7ca5e9414c 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3145,6 +3145,22 @@ extern "C" SEXP _arrow_compute___expr__type(SEXP x_sexp, SEXP schema_sexp){ } #endif +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +arrow::Type::type compute___expr__type_id(const std::shared_ptr& x, const std::shared_ptr& schema); +extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type x(x_sexp); + arrow::r::Input&>::type schema(schema_sexp); + return cpp11::as_sexp(compute___expr__type_id(x, schema)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){ + Rf_error("Cannot call compute___expr__type_id(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // feather.cpp #if defined(ARROW_R_WITH_ARROW) void ipc___WriteFeather__Table(const std::shared_ptr& stream, const std::shared_ptr& table, int version, int chunk_size, arrow::Compression::type compression, int compression_level); @@ -7074,6 +7090,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, + { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, diff --git a/r/src/expression.cpp b/r/src/expression.cpp index d8745ade479..e2be97b0b36 100644 --- a/r/src/expression.cpp +++ b/r/src/expression.cpp @@ -76,4 +76,12 @@ std::shared_ptr compute___expr__type( return bound.type(); } +// [[arrow::export]] +arrow::Type::type compute___expr__type_id( + const std::shared_ptr& x, + const std::shared_ptr& schema) { + auto bound = ValueOrStop(x->Bind(*schema)); + return bound.type()->id(); +} + #endif diff --git a/r/tests/testthat/test-expression.R b/r/tests/testthat/test-expression.R index 141c16e30cc..49babf30d5c 100644 --- a/r/tests/testthat/test-expression.R +++ b/r/tests/testthat/test-expression.R @@ -49,18 +49,11 @@ test_that("C++ expressions", { fixed = TRUE ) expect_type_equal( - { - f$bind(schema(f = float64())) - f$type() - }, + f$type(schema(f = float64())), float64() ) expect_type_equal( - { - f_gt_4 <- f > 4 - f_gt_4$bind(schema(f = float64())) - f_gt_4$type() - }, + (f > 4)$type(schema(f = float64())), bool() ) # Interprets that as a list type From fdee3e873bca0ad2b9c4eb4868f4c6bdd3cb4aec Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 18:44:27 -0400 Subject: [PATCH 11/22] Fix map() error --- r/R/dplyr-eval.R | 2 +- r/R/dplyr-select.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index f7c5b5f9dae..de68d2f2c4d 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -87,7 +87,7 @@ arrow_mask <- function(.data) { } # Assign the schema to the expressions - map(.data$selected_columns, ~.$schema <- .data$.data$schema) + map(.data$selected_columns, ~(.$schema <- .data$.data$schema)) # Add the column references and make the mask out <- new_data_mask( diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index deaa2c99c8d..686965a4197 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -63,7 +63,7 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL .data <- arrow_dplyr_query(.data) # Assign the schema to the expressions - map(.data$selected_columns, ~.$schema <- .data$.data$schema) + map(.data$selected_columns, ~(.$schema <- .data$.data$schema)) # Create a mask for evaluating expressions in tidyselect helpers mask <- new_environment(.cache$functions, parent = caller_env()) From 466543fb3ddeb2fcfc5c150f4381c52b1e418b2f Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 21:22:25 -0400 Subject: [PATCH 12/22] Lint and fix comment --- r/R/dplyr.R | 3 +-- r/src/expression.cpp | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index d9c916ba90a..56be8cff1db 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -76,8 +76,7 @@ print.arrow_dplyr_query <- function(x, ...) { # Just a field_ref, so look up in the schema schm$GetFieldByName(name)$type$ToString() } else { - # Expression, so get its type and append the expression. - # Need to bind the expression to the schema before finding its type. + # Expression, so get its type and append the expression paste0( expr$type(schm)$ToString(), " (", expr$ToString(), ")" diff --git a/r/src/expression.cpp b/r/src/expression.cpp index e2be97b0b36..4b671cb99dd 100644 --- a/r/src/expression.cpp +++ b/r/src/expression.cpp @@ -77,9 +77,8 @@ std::shared_ptr compute___expr__type( } // [[arrow::export]] -arrow::Type::type compute___expr__type_id( - const std::shared_ptr& x, - const std::shared_ptr& schema) { +arrow::Type::type compute___expr__type_id(const std::shared_ptr& x, + const std::shared_ptr& schema) { auto bound = ValueOrStop(x->Bind(*schema)); return bound.type()->id(); } From 66872498844024a3f8560edc80805ec5a2e23dc5 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 21:25:39 -0400 Subject: [PATCH 13/22] Use == instead of %% where possible --- r/R/dplyr-functions.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 45b7b2b5c22..774816e26b8 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -133,7 +133,7 @@ nse_funcs$is.numeric <- function(x) { } nse_funcs$is.double <- function(x) { - x$type_id() %in% Type["DOUBLE"] + x$type_id() == Type["DOUBLE"] } nse_funcs$is.integer <- function(x) { @@ -142,11 +142,11 @@ nse_funcs$is.integer <- function(x) { } nse_funcs$is.integer64 <- function(x) { - x$type_id() %in% Type["INT64"] + x$type_id() == Type["INT64"] } nse_funcs$is.logical <- function(x) { - x$type_id() %in% Type["BOOL"] + x$type_id() == Type["BOOL"] } nse_funcs$is.factor <- function(x) { From e05538830901eaf2f63fa4707872c5766839f0d5 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 23:27:29 -0400 Subject: [PATCH 14/22] Implement more type checking functions --- r/R/dplyr-functions.R | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 774816e26b8..124f63b0fdb 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -57,6 +57,10 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { Expression$create("cast", x, options = opts) } +nse_funcs$is <- function(x, type) { + x$type() == as_type(type) +} + nse_funcs$dictionary_encode <- function(x, null_encoding_behavior = c("mask", "encode")) { behavior <- toupper(match.arg(null_encoding_behavior)) @@ -125,38 +129,51 @@ nse_funcs$as.numeric <- function(x) { nse_funcs$is.character <- function(x) { x$type_id() %in% Type[c("STRING", "LARGE_STRING")] } - nse_funcs$is.numeric <- function(x) { x$type_id() %in% Type[c("UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE", "DECIMAL", "DECIMAL256")] } - nse_funcs$is.double <- function(x) { x$type_id() == Type["DOUBLE"] } - nse_funcs$is.integer <- function(x) { x$type_id() %in% Type[c("UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64")] } - nse_funcs$is.integer64 <- function(x) { x$type_id() == Type["INT64"] } - nse_funcs$is.logical <- function(x) { x$type_id() == Type["BOOL"] } - nse_funcs$is.factor <- function(x) { x$type_id() == Type["DICTIONARY"] } - nse_funcs$is.list <- function(x) { x$type_id() %in% Type[c("LIST", "FIXED_SIZE_LIST", "LARGE_LIST")] } +# rlang::is_* type functions +nse_funcs$is_character <- function(x, n = NULL) { + nse_funcs$is.character(x) && (is.null(n) || length(x) == n) +} +nse_funcs$is_double <- function(x, n = NULL, finite = NULL) { + nse_funcs$is.double(x) && + (is.null(n) || length(x) == n) && + (is.null(finite) || + (finite && as.vector(!any(is_in(x, c(NA_real_, Inf,-Inf, NaN)))))) +} +nse_funcs$is_integer <- function(x, n = NULL) { + nse_funcs$is.integer(x) && (is.null(n) || length(x) == n) +} +nse_funcs$is_list <- function(x, n = NULL) { + nse_funcs$is.list(x) && (is.null(n) || length(x) == n) +} +nse_funcs$is_logical <- function(x, n = NULL) { + nse_funcs$is.logical(x) && (is.null(n) || length(x) == n) +} + # String functions nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) { if (allowNA) { From 4a53165fd433d090b458e81487d3298d33463944 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 23:27:49 -0400 Subject: [PATCH 15/22] Add more tests --- r/tests/testthat/test-dplyr.R | 55 ++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index a309a6aa14e..4c06fe5c148 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -525,7 +525,29 @@ test_that("is.finite(), is.infinite(), is.nan()", { ) }) -test_that("type checks with as.*()", { +test_that("type checks with is()", { + expect_equal( + Table$create( + i32 = Array$create(1, int32()), + f64 = Array$create(1.1, float64()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + i32_is_i32 = is(i32, int32()), + i32_is_i64 = is(i32, float64()), + i32_is_str = is(i32, arrow::string()), + f64_is_i32 = is(f64, int32()), + f64_is_i64 = is(f64, float64()), + f64_is_str = is(f64, arrow::string()), + str_is_i32 = is(str, int32()), + str_is_i64 = is(str, float64()), + str_is_str = is(str, arrow::string()) + ) %>% + collect() %>% t() %>% as.vector(), + c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE) + ) +}) + +test_that("type checks with is.*()", { library(bit64) expect_dplyr_equal( input %>% @@ -604,6 +626,37 @@ test_that("type checks with as.*()", { ) }) +test_that("type checks with is_*()", { + library(rlang) + expect_dplyr_equal( + input %>% + transmute( + chr_is_chr = is_character(chr), + chr_is_dbl = is_double(chr), + chr_is_int = is_integer(chr), + chr_is_lst = is_list(chr), + chr_is_lgl = is_logical(chr), + dbl_is_chr = is_character(dbl), + dbl_is_dbl = is_double(dbl), + dbl_is_int = is_integer(dbl), + dbl_is_lst = is_list(dbl), + dbl_is_lgl = is_logical(dbl), + int_is_chr = is_character(int), + int_is_dbl = is_double(int), + int_is_int = is_integer(int), + int_is_lst = is_list(int), + int_is_lgl = is_logical(int), + lgl_is_chr = is_character(lgl), + lgl_is_dbl = is_double(lgl), + lgl_is_int = is_integer(lgl), + lgl_is_lst = is_list(lgl), + lgl_is_lgl = is_logical(lgl) + ) %>% + collect(), + tbl + ) +}) + test_that("as.factor()/dictionary_encode()", { skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression {x=dictionary_encode(x, {NON-REPRESENTABLE OPTIONS})}") df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")) From 5d6830c591070d07589d8f591235dfaa63e153f5 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 23:30:23 -0400 Subject: [PATCH 16/22] Rename is() args for consistency with methods::is() --- r/R/dplyr-functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 124f63b0fdb..0874885c4b2 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -57,8 +57,8 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { Expression$create("cast", x, options = opts) } -nse_funcs$is <- function(x, type) { - x$type() == as_type(type) +nse_funcs$is <- function(object, class2) { + object$type() == as_type(class2) } nse_funcs$dictionary_encode <- function(x, From bb57bf6d958bb46bd0bc4d555f0642bda93c6f22 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 20 May 2021 23:58:36 -0400 Subject: [PATCH 17/22] Improve is() implementation and tests --- r/R/dplyr-functions.R | 6 +++++- r/tests/testthat/test-dplyr.R | 28 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 0874885c4b2..780f9d1b274 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -58,7 +58,11 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { } nse_funcs$is <- function(object, class2) { - object$type() == as_type(class2) + if (is.character(class2)) { + object$type()$ToString() == class2 + } else { + object$type() == as_type(class2) + } } nse_funcs$dictionary_encode <- function(x, diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 4c06fe5c148..54edb68db3e 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -529,20 +529,48 @@ test_that("type checks with is()", { expect_equal( Table$create( i32 = Array$create(1, int32()), + dec = Array$create(pi)$cast(decimal(3, 2)), f64 = Array$create(1.1, float64()), str = Array$create("a", arrow::string()) ) %>% transmute( i32_is_i32 = is(i32, int32()), + i32_is_dec = is(i32, decimal(3, 2)), i32_is_i64 = is(i32, float64()), i32_is_str = is(i32, arrow::string()), + dec_is_i32 = is(dec, int32()), + dec_is_dec = is(dec, decimal(3, 2)), + dec_is_i64 = is(dec, float64()), + dec_is_str = is(dec, arrow::string()), f64_is_i32 = is(f64, int32()), + f64_is_dec = is(f64, decimal(3, 2)), f64_is_i64 = is(f64, float64()), f64_is_str = is(f64, arrow::string()), str_is_i32 = is(str, int32()), + str_is_dec = is(str, decimal(3, 2)), str_is_i64 = is(str, float64()), str_is_str = is(str, arrow::string()) ) %>% collect() %>% t() %>% as.vector(), + c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, + FALSE, FALSE, FALSE, FALSE, TRUE) + ) + expect_equal( + Table$create( + i32 = Array$create(1, int32()), + f64 = Array$create(1.1, float64()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + i32_is_i32 = is(i32, "int32"), + i32_is_i64 = is(i32, "double"), + i32_is_str = is(i32, "string"), + f64_is_i32 = is(f64, "int32"), + f64_is_i64 = is(f64, "double"), + f64_is_str = is(f64, "string"), + str_is_i32 = is(str, "int32"), + str_is_i64 = is(str, "double"), + str_is_str = is(str, "string") + ) %>% + collect() %>% t() %>% as.vector(), c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE) ) }) From 69685e39d22e728be1dab04f11ab3186208a6f46 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 21 May 2021 10:05:51 -0400 Subject: [PATCH 18/22] Improve is() implementation and tests --- r/R/dplyr-functions.R | 9 +++++ r/tests/testthat/test-dplyr.R | 70 ++++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 14 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 780f9d1b274..c8a7710d113 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -58,7 +58,16 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { } nse_funcs$is <- function(object, class2) { + assert_that(is.string(class2) || inherits(class2, "DataType")) if (is.character(class2)) { + class2 <- switch(class2, + "utf8" = "string", + "float16" = "halffloat", + "float32" = "float", + "boolean" = "bool", + "float64" = "double", + class2 + ) object$type()$ToString() == class2 } else { object$type() == as_type(class2) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 54edb68db3e..d326baaa9e6 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -526,6 +526,7 @@ test_that("is.finite(), is.infinite(), is.nan()", { }) test_that("type checks with is()", { + # with class2=DataType expect_equal( Table$create( i32 = Array$create(1, int32()), @@ -554,25 +555,66 @@ test_that("type checks with is()", { c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE) ) + # with class2=string expect_equal( Table$create( - i32 = Array$create(1, int32()), - f64 = Array$create(1.1, float64()), - str = Array$create("a", arrow::string()) - ) %>% transmute( - i32_is_i32 = is(i32, "int32"), - i32_is_i64 = is(i32, "double"), - i32_is_str = is(i32, "string"), - f64_is_i32 = is(f64, "int32"), - f64_is_i64 = is(f64, "double"), - f64_is_str = is(f64, "string"), - str_is_i32 = is(str, "int32"), - str_is_i64 = is(str, "double"), - str_is_str = is(str, "string") - ) %>% + i32 = Array$create(1, int32()), + f64 = Array$create(1.1, float64()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + i32_is_i32 = is(i32, "int32"), + i32_is_i64 = is(i32, "double"), + i32_is_str = is(i32, "string"), + f64_is_i32 = is(f64, "int32"), + f64_is_i64 = is(f64, "double"), + f64_is_str = is(f64, "string"), + str_is_i32 = is(str, "int32"), + str_is_i64 = is(str, "double"), + str_is_str = is(str, "string") + ) %>% collect() %>% t() %>% as.vector(), c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE) ) + # with class2=string alias + expect_equal( + Table$create( + f16 = Array$create(NA_real_, halffloat()), + f32 = Array$create(1.1, float()), + f64 = Array$create(2.2, float64()), + lgl = Array$create(TRUE, bool()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + f16_is_f16 = is(f16, "float16"), + f16_is_f32 = is(f16, "float32"), + f16_is_f64 = is(f16, "float64"), + f16_is_lgl = is(f16, "boolean"), + f16_is_str = is(f16, "utf8"), + f32_is_f16 = is(f32, "float16"), + f32_is_f32 = is(f32, "float32"), + f32_is_f64 = is(f32, "float64"), + f32_is_lgl = is(f32, "boolean"), + f32_is_str = is(f32, "utf8"), + f64_is_f16 = is(f64, "float16"), + f64_is_f32 = is(f64, "float32"), + f64_is_f64 = is(f64, "float64"), + f64_is_lgl = is(f64, "boolean"), + f64_is_str = is(f64, "utf8"), + lgl_is_f16 = is(lgl, "float16"), + lgl_is_f32 = is(lgl, "float32"), + lgl_is_f64 = is(lgl, "float64"), + lgl_is_lgl = is(lgl, "boolean"), + lgl_is_str = is(lgl, "utf8"), + str_is_f16 = is(str, "float16"), + str_is_f32 = is(str, "float32"), + str_is_f64 = is(str, "float64"), + str_is_lgl = is(str, "boolean"), + str_is_str = is(str, "utf8") + ) %>% + collect() %>% t() %>% as.vector(), + c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, + FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, + FALSE, FALSE, TRUE) + ) }) test_that("type checks with is.*()", { From 65c54c665f3e060b9e30eee729eb76a6b3edd7e0 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Wed, 26 May 2021 15:25:47 -0400 Subject: [PATCH 19/22] Break out type alias string canonicalization into function --- r/R/dplyr-functions.R | 17 +++++------------ r/R/type.R | 18 ++++++++++++++++++ r/tests/testthat/test-type.R | 8 ++++++++ 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index c8a7710d113..77478f01175 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -58,19 +58,12 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { } nse_funcs$is <- function(object, class2) { - assert_that(is.string(class2) || inherits(class2, "DataType")) - if (is.character(class2)) { - class2 <- switch(class2, - "utf8" = "string", - "float16" = "halffloat", - "float32" = "float", - "boolean" = "bool", - "float64" = "double", - class2 - ) - object$type()$ToString() == class2 - } else { + if (is.string(class2)) { + object$type()$ToString() == canonical_type_str(class2) + } else if (inherits(class2, "DataType")) { object$type() == as_type(class2) + } else { + stop("Second argument to is() is not a string or DataType", call. = FALSE) } } diff --git a/r/R/type.R b/r/R/type.R index 04b0a378e10..c3c52fa7c9d 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -428,6 +428,24 @@ as_type <- function(type, name = "type") { type } +canonical_type_str <- function(type_str) { + # canonicalizes data type strings, converting aliases to match the strings + # returned by DataType$ToString() + # TODO: handle string representations of parameterized data types (such as + # "decimal128(3,1)") and format them to exactly match what is returned + # by DataType$ToString() + # TODO: error on unrecognized data type strings + assert_that(is.string(type_str)) + switch(type_str, + "utf8" = "string", + "float16" = "halffloat", + "float32" = "float", + "boolean" = "bool", + "float64" = "double", + type_str + ) +} + # vctrs support ----------------------------------------------------------- str_dup <- function(x, times) { paste0(rep(x, times = times), collapse = "") diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index 56cef722556..83896bd3e48 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -106,3 +106,11 @@ test_that("Masked data type functions still work", { rm(type) }) + +test_that("Type alias strings are correctly canonicalized", { + expect_equal(canonical_type_str("utf8"), utf8()$ToString()) + expect_equal(canonical_type_str("float16"), float16()$ToString()) + expect_equal(canonical_type_str("float32"), float32()$ToString()) + expect_equal(canonical_type_str("boolean"), boolean()$ToString()) + expect_equal(canonical_type_str("float64"), float64()$ToString()) +}) From 269bb24428c90971b0bd0de530d85c21229ec997 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Thu, 27 May 2021 23:05:46 -0400 Subject: [PATCH 20/22] Assert n, finite are NULL in is_*() functions --- r/R/dplyr-functions.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 77478f01175..ec26fffa985 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -162,22 +162,24 @@ nse_funcs$is.list <- function(x) { # rlang::is_* type functions nse_funcs$is_character <- function(x, n = NULL) { - nse_funcs$is.character(x) && (is.null(n) || length(x) == n) + assert_that(is.null(n)) + nse_funcs$is.character(x) } nse_funcs$is_double <- function(x, n = NULL, finite = NULL) { - nse_funcs$is.double(x) && - (is.null(n) || length(x) == n) && - (is.null(finite) || - (finite && as.vector(!any(is_in(x, c(NA_real_, Inf,-Inf, NaN)))))) + assert_that(is.null(n) && is.null(finite)) + nse_funcs$is.double(x) } nse_funcs$is_integer <- function(x, n = NULL) { - nse_funcs$is.integer(x) && (is.null(n) || length(x) == n) + assert_that(is.null(n)) + nse_funcs$is.integer(x) } nse_funcs$is_list <- function(x, n = NULL) { - nse_funcs$is.list(x) && (is.null(n) || length(x) == n) + assert_that(is.null(n)) + nse_funcs$is.list(x) } nse_funcs$is_logical <- function(x, n = NULL) { - nse_funcs$is.logical(x) && (is.null(n) || length(x) == n) + assert_that(is.null(n)) + nse_funcs$is.logical(x) } # String functions From 58ef25b4832b06bfae43d4f509c268c8a0a28e4b Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 28 May 2021 14:15:41 -0400 Subject: [PATCH 21/22] Improve canonical_type_str() --- r/R/type.R | 55 +++++++++++++++---- r/tests/testthat/test-type.R | 103 ++++++++++++++++++++++++++++++++++- 2 files changed, 143 insertions(+), 15 deletions(-) diff --git a/r/R/type.R b/r/R/type.R index c3c52fa7c9d..a22323c4ba1 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -429,20 +429,51 @@ as_type <- function(type, name = "type") { } canonical_type_str <- function(type_str) { - # canonicalizes data type strings, converting aliases to match the strings - # returned by DataType$ToString() - # TODO: handle string representations of parameterized data types (such as - # "decimal128(3,1)") and format them to exactly match what is returned - # by DataType$ToString() - # TODO: error on unrecognized data type strings + # canonicalizes data type strings, converting data type function names and + # aliases to match the strings returned by DataType$ToString() assert_that(is.string(type_str)) + if (grepl("[([<]", type_str)) { + stop("Cannot interpret string representations of data types that have parameters", call. = FALSE) + } switch(type_str, - "utf8" = "string", - "float16" = "halffloat", - "float32" = "float", - "boolean" = "bool", - "float64" = "double", - type_str + int8 = "int8", + int16 = "int16", + int32 = "int32", + int64 = "int64", + uint8 = "uint8", + uint16 = "uint16", + uint32 = "uint32", + uint64 = "uint64", + float16 = "halffloat", + halffloat = "halffloat", + float32 = "float", + float = "float", + float64 = "double", + double = "double", + boolean = "bool", + bool = "bool", + utf8 = "string", + large_utf8 = "large_string", + large_string = "large_string", + binary = "binary", + large_binary = "large_binary", + fixed_size_binary = "fixed_size_binary", + string = "string", + date32 = "date32", + date64 = "date64", + time32 = "time32", + time64 = "time64", + null = "null", + timestamp = "timestamp", + decimal = "decimal128", + struct = "struct", + list_of = "list", + list = "list", + large_list_of = "large_list", + large_list = "large_list", + fixed_size_list_of = "fixed_size_list", + fixed_size_list = "fixed_size_list", + stop("Unrecognized string representation of data type", call. = FALSE) ) } diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index 83896bd3e48..a3118be2a2c 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -107,10 +107,107 @@ test_that("Masked data type functions still work", { }) -test_that("Type alias strings are correctly canonicalized", { - expect_equal(canonical_type_str("utf8"), utf8()$ToString()) +test_that("Type strings are correctly canonicalized", { + # data types without arguments + expect_equal(canonical_type_str("int8"), int8()$ToString()) + expect_equal(canonical_type_str("int16"), int16()$ToString()) + expect_equal(canonical_type_str("int32"), int32()$ToString()) + expect_equal(canonical_type_str("int64"), int64()$ToString()) + expect_equal(canonical_type_str("uint8"), uint8()$ToString()) + expect_equal(canonical_type_str("uint16"), uint16()$ToString()) + expect_equal(canonical_type_str("uint32"), uint32()$ToString()) + expect_equal(canonical_type_str("uint64"), uint64()$ToString()) expect_equal(canonical_type_str("float16"), float16()$ToString()) + expect_equal(canonical_type_str("halffloat"), halffloat()$ToString()) expect_equal(canonical_type_str("float32"), float32()$ToString()) - expect_equal(canonical_type_str("boolean"), boolean()$ToString()) + expect_equal(canonical_type_str("float"), float()$ToString()) expect_equal(canonical_type_str("float64"), float64()$ToString()) + expect_equal(canonical_type_str("double"), float64()$ToString()) + expect_equal(canonical_type_str("boolean"), boolean()$ToString()) + expect_equal(canonical_type_str("bool"), bool()$ToString()) + expect_equal(canonical_type_str("utf8"), utf8()$ToString()) + expect_equal(canonical_type_str("large_utf8"), large_utf8()$ToString()) + expect_equal(canonical_type_str("large_string"), large_utf8()$ToString()) + expect_equal(canonical_type_str("binary"), binary()$ToString()) + expect_equal(canonical_type_str("large_binary"), large_binary()$ToString()) + expect_equal(canonical_type_str("string"), arrow::string()$ToString()) + expect_equal(canonical_type_str("null"), null()$ToString()) + + # data types with arguments + expect_equal( + canonical_type_str("fixed_size_binary"), + sub("^([^([<]+).*$", "\\1", fixed_size_binary(42)$ToString()) + ) + expect_equal( + canonical_type_str("date32"), + sub("^([^([<]+).*$", "\\1", date32()$ToString()) + ) + expect_equal( + canonical_type_str("date64"), + sub("^([^([<]+).*$", "\\1", date64()$ToString()) + ) + expect_equal( + canonical_type_str("time32"), + sub("^([^([<]+).*$", "\\1", time32()$ToString()) + ) + expect_equal( + canonical_type_str("time64"), + sub("^([^([<]+).*$", "\\1", time64()$ToString()) + ) + expect_equal( + canonical_type_str("timestamp"), + sub("^([^([<]+).*$", "\\1", timestamp()$ToString()) + ) + expect_equal( + canonical_type_str("decimal"), + sub("^([^([<]+).*$", "\\1", decimal(3,2)$ToString()) + ) + expect_equal( + canonical_type_str("struct"), + sub("^([^([<]+).*$", "\\1", struct(foo = int32())$ToString()) + ) + expect_equal( + canonical_type_str("list_of"), + sub("^([^([<]+).*$", "\\1", list_of(int32())$ToString()) + ) + expect_equal( + canonical_type_str("list"), + sub("^([^([<]+).*$", "\\1", list_of(int32())$ToString()) + ) + expect_equal( + canonical_type_str("large_list_of"), + sub("^([^([<]+).*$", "\\1", large_list_of(int32())$ToString()) + ) + expect_equal( + canonical_type_str("large_list"), + sub("^([^([<]+).*$", "\\1", large_list_of(int32())$ToString()) + ) + expect_equal( + canonical_type_str("fixed_size_list_of"), + sub("^([^([<]+).*$", "\\1", fixed_size_list_of(int32(), 42)$ToString()) + ) + expect_equal( + canonical_type_str("fixed_size_list"), + sub("^([^([<]+).*$", "\\1", fixed_size_list_of(int32(), 42)$ToString()) + ) + + # unsupported data types + expect_error( + canonical_type_str("decimal128(3, 2)"), + "parameters" + ) + expect_error( + canonical_type_str("list"), + "parameters" + ) + expect_error( + canonical_type_str("time32[s]"), + "parameters" + ) + + # unrecognized data types + expect_error( + canonical_type_str("foo"), + "Unrecognized" + ) }) From bf01efc3f77c011e50df31a6d7d3a9de49e36032 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 28 May 2021 14:15:55 -0400 Subject: [PATCH 22/22] Improve is() --- r/R/dplyr-functions.R | 16 +++++++- r/tests/testthat/test-dplyr.R | 74 ++++++++++++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index ec26fffa985..fadd216a30c 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -59,7 +59,21 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { nse_funcs$is <- function(object, class2) { if (is.string(class2)) { - object$type()$ToString() == canonical_type_str(class2) + switch(class2, + # for R data types, pass off to is.*() functions + character = nse_funcs$is.character(object), + numeric = nse_funcs$is.numeric(object), + integer = nse_funcs$is.integer(object), + integer64 = nse_funcs$is.integer64(object), + logical = nse_funcs$is.logical(object), + factor = nse_funcs$is.factor(object), + list = nse_funcs$is.list(object), + # for Arrow data types, compare class2 with object$type()$ToString(), + # but first strip off any parameters to only compare the top-level data + # type, and canonicalize class2 + sub("^([^([<]+).*$", "\\1", object$type()$ToString()) == + canonical_type_str(class2) + ) } else if (inherits(class2, "DataType")) { object$type() == as_type(class2) } else { diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index d326baaa9e6..378640e8308 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -525,7 +525,7 @@ test_that("is.finite(), is.infinite(), is.nan()", { ) }) -test_that("type checks with is()", { +test_that("type checks with is() giving Arrow types", { # with class2=DataType expect_equal( Table$create( @@ -617,6 +617,78 @@ test_that("type checks with is()", { ) }) +test_that("type checks with is() giving R types", { + library(bit64) + expect_dplyr_equal( + input %>% + transmute( + chr_is_chr = is(chr, "character"), + chr_is_fct = is(chr, "factor"), + chr_is_int = is(chr, "integer"), + chr_is_i64 = is(chr, "integer64"), + chr_is_lst = is(chr, "list"), + chr_is_lgl = is(chr, "logical"), + chr_is_num = is(chr, "numeric"), + dbl_is_chr = is(dbl, "character"), + dbl_is_fct = is(dbl, "factor"), + dbl_is_int = is(dbl, "integer"), + dbl_is_i64 = is(dbl, "integer64"), + dbl_is_lst = is(dbl, "list"), + dbl_is_lgl = is(dbl, "logical"), + dbl_is_num = is(dbl, "numeric"), + fct_is_chr = is(fct, "character"), + fct_is_fct = is(fct, "factor"), + fct_is_int = is(fct, "integer"), + fct_is_i64 = is(fct, "integer64"), + fct_is_lst = is(fct, "list"), + fct_is_lgl = is(fct, "logical"), + fct_is_num = is(fct, "numeric"), + int_is_chr = is(int, "character"), + int_is_fct = is(int, "factor"), + int_is_int = is(int, "integer"), + int_is_i64 = is(int, "integer64"), + int_is_lst = is(int, "list"), + int_is_lgl = is(int, "logical"), + int_is_num = is(int, "numeric"), + lgl_is_chr = is(lgl, "character"), + lgl_is_fct = is(lgl, "factor"), + lgl_is_int = is(lgl, "integer"), + lgl_is_i64 = is(lgl, "integer64"), + lgl_is_lst = is(lgl, "list"), + lgl_is_lgl = is(lgl, "logical"), + lgl_is_num = is(lgl, "numeric") + ) %>% + collect(), + tbl + ) + expect_dplyr_equal( + input %>% + transmute( + i64_is_chr = is(i64, "character"), + i64_is_fct = is(i64, "factor"), + # we want Arrow to return TRUE, but bit64 returns FALSE + #i64_is_int = is(i64, "integer"), + i64_is_i64 = is(i64, "integer64"), + i64_is_lst = is(i64, "list"), + i64_is_lgl = is(i64, "logical"), + # we want Arrow to return TRUE, but bit64 returns FALSE + #i64_is_num = is(i64, "numeric"), + lst_is_chr = is(lst, "character"), + lst_is_fct = is(lst, "factor"), + lst_is_int = is(lst, "integer"), + lst_is_i64 = is(lst, "integer64"), + lst_is_lst = is(lst, "list"), + lst_is_lgl = is(lst, "logical"), + lst_is_num = is(lst, "numeric") + ) %>% + collect(), + tibble( + i64 = as.integer64(1:3), + lst = list(c("a", "b"), c("d", "e"), c("f", "g")) + ) + ) +}) + test_that("type checks with is.*()", { library(bit64) expect_dplyr_equal(