From c080750ddb102c1f54db4e59c1e8ef3e9b9fdfc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 30 Jun 2022 16:38:24 +0100 Subject: [PATCH 1/6] sort of git stash while I re-clone the repo --- r/R/dplyr-eval.R | 52 +++++++++++--------- r/R/dplyr-funcs-datetime.R | 2 +- r/R/dplyr-funcs.R | 19 ++++--- r/R/dplyr-mutate.R | 17 ++++++- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- r/tests/testthat/test-dplyr-funcs.R | 4 +- 6 files changed, 62 insertions(+), 34 deletions(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index a8fb7c43300..f97f4c42cf4 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -22,30 +22,36 @@ arrow_eval <- function(expr, mask) { # This yields an Expression as long as the `exprs` are implemented in Arrow. # Otherwise, it returns a try-error - tryCatch(eval_tidy(expr, mask), error = function(e) { - # Look for the cases where bad input was given, i.e. this would fail - # in regular dplyr anyway, and let those raise those as errors; - # else, for things not supported in Arrow return a "try-error", - # which we'll handle differently - msg <- conditionMessage(e) - if (getOption("arrow.debug", FALSE)) print(msg) - patterns <- .cache$i18ized_error_pattern - if (is.null(patterns)) { - patterns <- i18ize_error_messages() - # Memoize it - .cache$i18ized_error_pattern <- patterns - } - if (grepl(patterns, msg)) { - stop(e) - } + tryCatch( + eval_tidy(expr, mask), + error = function(e) { + # Look for the cases where bad input was given, i.e. this would fail + # in regular dplyr anyway, and let those raise those as errors; + # else, for things not supported in Arrow return a "try-error", + # which we'll handle differently + msg <- conditionMessage(e) + if (getOption("arrow.debug", FALSE)) print(msg) + patterns <- .cache$i18ized_error_pattern + if (is.null(patterns)) { + patterns <- i18ize_error_messages() + # Memoize it + .cache$i18ized_error_pattern <- patterns + } + if (grepl(patterns, msg)) { + stop(e) + } - out <- structure(msg, class = "try-error", condition = e) - if (grepl("not supported.*Arrow", msg) || getOption("arrow.debug", FALSE)) { - # One of ours. Mark it so that consumers can handle it differently - class(out) <- c("arrow-try-error", class(out)) - } - invisible(out) - }) + out <- structure(msg, class = "try-error", condition = e) + if (grepl("not supported.*Arrow", msg) || getOption("arrow.debug", FALSE)) { + # One of ours. Mark it so that consumers can handle it differently + class(out) <- c("arrow-try-error", class(out)) + } + expr_text <- rlang::quo_get_expr(expr)[[1]] %>% rlang::expr_text() + if (!expr_text %in% names(mask$.top_env)) { + class(out) <- c("arrow-binding-error", class(out)) + } + invisible(out) + }) } handle_arrow_not_supported <- function(err, lab) { diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 8ecb80b6b45..52264a9c886 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -172,7 +172,7 @@ register_bindings_datetime_components <- function() { (call_binding("yday", x) - 1) %/% 7 + 1 }) - register_binding("month", function(x, + register_binding("lubridate::month", function(x, label = FALSE, abbr = TRUE, locale = Sys.getlocale("LC_TIME")) { diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 95c1f69f4fb..9a637dd005e 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -58,15 +58,13 @@ NULL #' @keywords internal #' register_binding <- function(fun_name, fun, registry = nse_funcs) { - name <- gsub("^.*?::", "", fun_name) - namespace <- gsub("::.*$", "", fun_name) - - previous_fun <- if (name %in% names(registry)) registry[[name]] else NULL + previous_fun <- if (fun_name %in% names(registry)) registry[[fun_name]] else NULL + # setting fun to NULL removes existing functions from the registry if (is.null(fun) && !is.null(previous_fun)) { - rm(list = name, envir = registry, inherits = FALSE) + rm(list = c(fun_name), envir = registry, inherits = FALSE) } else { - registry[[name]] <- fun + registry[[fun_name]] <- fun } invisible(previous_fun) @@ -107,6 +105,7 @@ create_binding_cache <- function() { register_bindings_math() register_bindings_string() register_bindings_type() + register_bindings_utils() # We only create the cache for nse_funcs and not agg_funcs .cache$functions <- c(as.list(nse_funcs), arrow_funcs) @@ -116,3 +115,11 @@ create_binding_cache <- function() { nse_funcs <- new.env(parent = emptyenv()) agg_funcs <- new.env(parent = emptyenv()) .cache <- new.env(parent = emptyenv()) + +register_bindings_utils <- function() { + register_binding("::", function(lhs, rhs) { + lhs_name <- as.character(substitute(lhs)) + rhs_name <- as.character(substitute(rhs)) + nse_funcs[[paste0(lhs_name, "::", rhs_name)]] + }) +} diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 653c1e6f25a..b256e7dad23 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -54,12 +54,27 @@ mutate.arrow_dplyr_query <- function(.data, # (which overwrites the previous name) new_var <- names(exprs)[i] results[[new_var]] <- arrow_eval(exprs[[i]], mask) - if (inherits(results[[new_var]], "try-error")) { + if (inherits(results[[new_var]], "arrow-binding-error")) { + + expr <- rlang::quo_get_expr(exprs[[i]]) + new_expr <- grep( + paste0("::", rlang::expr_text(expr[[1]])), + names(mask$.top_env), + value = TRUE + ) %>% + rlang::parse_expr() + + exprs[[i]][[2]][[1]] <- new_expr + + results[[new_var]] <- arrow_eval(exprs[[i]], mask) + + } else if (inherits(results[[new_var]], "try-error")) { msg <- handle_arrow_not_supported( results[[new_var]], format_expr(exprs[[i]]) ) return(abandon_ship(call, .data, msg)) + } else if (!inherits(results[[new_var]], "Expression") && !is.null(results[[new_var]])) { # We need some wrapping to handle literal values diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index dc9e5609eaf..012af518115 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -417,7 +417,7 @@ test_that("extract quarter from timestamp", { test_that("extract month from timestamp", { compare_dplyr_binding( .input %>% - mutate(x = month(datetime)) %>% + mutate(x = lubridate::month(datetime)) %>% collect(), test_df ) diff --git a/r/tests/testthat/test-dplyr-funcs.R b/r/tests/testthat/test-dplyr-funcs.R index d96b4b2cf87..3c09849b984 100644 --- a/r/tests/testthat/test-dplyr-funcs.R +++ b/r/tests/testthat/test-dplyr-funcs.R @@ -26,8 +26,8 @@ test_that("register_binding() works", { expect_false("some_fun" %in% names(fake_registry)) expect_silent(expect_null(register_binding("some_fun", NULL, fake_registry))) - expect_null(register_binding("some_pkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) + expect_null(register_binding("some.pkg::some_fun", fun1, fake_registry)) + expect_identical(fake_registry$`some.pkg::some_fun`, fun1) }) test_that("register_binding_agg() works", { From 207b9fe50b6054fbfae32a63df1d254c42ac5692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 5 Jul 2022 12:13:19 +0100 Subject: [PATCH 2/6] use namespaced `strptime` as the first test --- r/R/dplyr-funcs-datetime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 52264a9c886..dc71dee75ad 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -28,7 +28,7 @@ register_bindings_datetime <- function() { } register_bindings_datetime_utility <- function() { - register_binding("strptime", function(x, + register_binding("base::strptime", function(x, format = "%Y-%m-%d %H:%M:%S", tz = "", unit = "ms") { @@ -172,7 +172,7 @@ register_bindings_datetime_components <- function() { (call_binding("yday", x) - 1) %/% 7 + 1 }) - register_binding("lubridate::month", function(x, + register_binding("month", function(x, label = FALSE, abbr = TRUE, locale = Sys.getlocale("LC_TIME")) { From af5894ec293905847224de9fd40d3950c4988100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 5 Jul 2022 12:14:11 +0100 Subject: [PATCH 3/6] comment --- r/R/dplyr-eval.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index f97f4c42cf4..c95cefb608d 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -46,6 +46,8 @@ arrow_eval <- function(expr, mask) { # One of ours. Mark it so that consumers can handle it differently class(out) <- c("arrow-try-error", class(out)) } + # if the expression text (i.e. the name of the function) is not in the + # names of the top expr_text <- rlang::quo_get_expr(expr)[[1]] %>% rlang::expr_text() if (!expr_text %in% names(mask$.top_env)) { class(out) <- c("arrow-binding-error", class(out)) From 957103812271dd68b3d093fe2988e1205cc38955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 5 Jul 2022 12:14:28 +0100 Subject: [PATCH 4/6] test with `base::strptime` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 012af518115..0299441cf2d 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -81,7 +81,7 @@ test_that("strptime", { t_string %>% arrow_table() %>% mutate( - x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "Pacific/Marquesas") + x = base::strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "Pacific/Marquesas") ) %>% collect(), t_stamp_with_pm_tz @@ -417,7 +417,7 @@ test_that("extract quarter from timestamp", { test_that("extract month from timestamp", { compare_dplyr_binding( .input %>% - mutate(x = lubridate::month(datetime)) %>% + mutate(x = month(datetime)) %>% collect(), test_df ) From 1ef487a269e487e9af803b4a4fde61becd5542de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 5 Jul 2022 12:59:53 +0100 Subject: [PATCH 5/6] comment --- r/R/dplyr-eval.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index c95cefb608d..2d2fd972a48 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -47,7 +47,7 @@ arrow_eval <- function(expr, mask) { class(out) <- c("arrow-try-error", class(out)) } # if the expression text (i.e. the name of the function) is not in the - # names of the top + # names of the top environment (which is the function registry) expr_text <- rlang::quo_get_expr(expr)[[1]] %>% rlang::expr_text() if (!expr_text %in% names(mask$.top_env)) { class(out) <- c("arrow-binding-error", class(out)) From fe1a8b26918cd6ca3b5f5e0efb2ca9dd56fe15a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 6 Jul 2022 16:59:49 +0100 Subject: [PATCH 6/6] add the `::` fix --- r/R/dplyr-funcs.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 9a637dd005e..16453abce23 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -120,6 +120,13 @@ register_bindings_utils <- function() { register_binding("::", function(lhs, rhs) { lhs_name <- as.character(substitute(lhs)) rhs_name <- as.character(substitute(rhs)) - nse_funcs[[paste0(lhs_name, "::", rhs_name)]] + + binding_name <- paste0(lhs_name, "::", rhs_name) + + if (!is.null(nse_funcs[[binding_name]])) { + nse_funcs[[binding_name]] + } else { + asNamespace(lhs_name)[[rhs_name]] + } }) }