From 8d50ac6ceab93ab661ea0f81f88b948cd534c6bf Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 2 Aug 2022 23:56:07 +0100 Subject: [PATCH 01/77] Add function which pulls out expressions in across() into separate quosures --- r/R/dplyr-mutate.R | 54 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 653c1e6f25a..ae0fa5f83bc 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -24,7 +24,9 @@ mutate.arrow_dplyr_query <- function(.data, .before = NULL, .after = NULL) { call <- match.call() - exprs <- ensure_named_exprs(quos(...)) + + expression_list <- unfold_across(.data, quos(...)) + exprs <- ensure_named_exprs(expression_list) .keep <- match.arg(.keep) .before <- enquo(.before) @@ -151,3 +153,53 @@ ensure_named_exprs <- function(exprs) { names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr) exprs } + +# Take the input quos and unfold any instances of across() +# into individual quosures +unfold_across <- function(.data, quos_in){ + + quos_out <- list() + # Check for any expressions starting with across + for (quo_i in seq_along(quos_in)) { + + quo_in <- quos_in[quo_i] + quo_expr <- quo_get_expr(quo_in[[1]]) + + if (rlang::is_call(quo_expr, "across")) { + new_quos <- list() + + cols <- names(select(.data, !!quo_expr[[2]])) + funcs <- quo_expr[[3]] + + for (col in cols) { + for (i in seq_along(funcs)) { + func <- funcs[[i]] + # column name is either the name of the item or index + col_suffix <- names(funcs)[[i]] + if (col_suffix == "") { + col_suffix <- i + } + + # if we've supplied multiple functions using list() ignore tihs element + if (!rlang::is_symbol(func, "list")) { + # get the expression + new_quo <- list(quo(!!call2(func, sym(col)))) + # give the expression a name + names(new_quo) <- paste0(col, "_", col_suffix) + # append to temporary list of new quosures + new_quos <- append(new_quos, new_quo) + } + + } + } + + # append the new expressions generated + quos_out <- append(quos_out, new_quos) + } else { + quos_out <- append(quos_out, quo_in) + } + } + + quos_out + +} From 666baa5e871e433480dfa1f6857bfc41cc794088 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:01:53 +0100 Subject: [PATCH 02/77] Add test --- r/tests/testthat/test-dplyr-mutate.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 66e3b4edf0d..403c6fe519d 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -589,3 +589,15 @@ test_that("mutate() and transmute() with namespaced functions", { tbl ) }) + +test_that("Can use across() within mutate()", { + compare_dplyr_binding( + .input %>% + mutate( + a = base::round(dbl) + base::log(int) + ) %>% + collect(), + tbl + ) + +}) From 6074aac29123f6245a695d8068051d6eac917a81 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:14:55 +0100 Subject: [PATCH 03/77] Update test to (correctly) fail --- r/tests/testthat/test-dplyr-mutate.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 403c6fe519d..b8dba69d868 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -591,13 +591,18 @@ test_that("mutate() and transmute() with namespaced functions", { }) test_that("Can use across() within mutate()", { - compare_dplyr_binding( - .input %>% + + # gives the right error with window functions + expect_warning( + arrow_table(tbl) %>% mutate( - a = base::round(dbl) + base::log(int) + x = int + 2, + across(c("int", "dbl"), list(mean = mean, sd = sd, round)), + exp(dbl2) ) %>% collect(), - tbl + "window functions not currently supported in Arrow; pulling data into R", + fixed = TRUE ) }) From ba98591e431e5ece930208ce4a3838fd1afee127 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:15:07 +0100 Subject: [PATCH 04/77] Move suffix code inside block --- r/R/dplyr-mutate.R | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index ae0fa5f83bc..82d0d5b943a 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -174,14 +174,13 @@ unfold_across <- function(.data, quos_in){ for (col in cols) { for (i in seq_along(funcs)) { func <- funcs[[i]] - # column name is either the name of the item or index - col_suffix <- names(funcs)[[i]] - if (col_suffix == "") { - col_suffix <- i - } - - # if we've supplied multiple functions using list() ignore tihs element + # if we've supplied multiple functions using list() ignore this element if (!rlang::is_symbol(func, "list")) { + # column name is either the name of the item or index + col_suffix <- names(funcs)[[i]] + if (col_suffix == "") { + col_suffix <- i - 1 + } # get the expression new_quo <- list(quo(!!call2(func, sym(col)))) # give the expression a name @@ -189,10 +188,8 @@ unfold_across <- function(.data, quos_in){ # append to temporary list of new quosures new_quos <- append(new_quos, new_quo) } - } } - # append the new expressions generated quos_out <- append(quos_out, new_quos) } else { From 097eb8a457c0b364578c8f8c055c599cbdeb445f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:36:05 +0100 Subject: [PATCH 05/77] Fix so it passes in the case of a single function --- r/R/dplyr-mutate.R | 16 ++++++++++++---- r/tests/testthat/test-dplyr-mutate.R | 7 +++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 82d0d5b943a..a3cba1e9a34 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -169,22 +169,30 @@ unfold_across <- function(.data, quos_in){ new_quos <- list() cols <- names(select(.data, !!quo_expr[[2]])) - funcs <- quo_expr[[3]] + funcs <- as.list(quo_expr[[3]]) for (col in cols) { for (i in seq_along(funcs)) { func <- funcs[[i]] - # if we've supplied multiple functions using list() ignore this element - if (!rlang::is_symbol(func, "list")) { + + # work out the name of the new column + if (length(funcs) == 1) { + new_colname <- col + } else { # column name is either the name of the item or index col_suffix <- names(funcs)[[i]] if (col_suffix == "") { col_suffix <- i - 1 } + new_colname <- paste0(col, "_", col_suffix) + } + + # if we've supplied multiple functions using list() ignore this element + if (!rlang::is_symbol(func, "list")) { # get the expression new_quo <- list(quo(!!call2(func, sym(col)))) # give the expression a name - names(new_quo) <- paste0(col, "_", col_suffix) + names(new_quo) <- new_colname # append to temporary list of new quosures new_quos <- append(new_quos, new_quo) } diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index b8dba69d868..ae9aa71d5a3 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -592,6 +592,13 @@ test_that("mutate() and transmute() with namespaced functions", { test_that("Can use across() within mutate()", { + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), round)) %>% + collect(), + tbl + ) + # gives the right error with window functions expect_warning( arrow_table(tbl) %>% From 5dc9a5143d3037ce31fa01608cc012b546e33018 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:39:15 +0100 Subject: [PATCH 06/77] Add more tests --- r/tests/testthat/test-dplyr-mutate.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index ae9aa71d5a3..e769205c422 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -599,6 +599,30 @@ test_that("Can use across() within mutate()", { tbl ) + compare_dplyr_binding( + .input %>% + mutate(across(c(1, 2), round)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, round)) %>% + collect(), + tbl + ) + + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(where(is.double))) %>% + collect(), + tbl + ), + "Unsupported selection helper" + ) + # gives the right error with window functions expect_warning( arrow_table(tbl) %>% From d5fd7331752476a40ae6ac8ee32a8ae6e3b8b5b3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:40:59 +0100 Subject: [PATCH 07/77] Add another comment --- r/R/dplyr-mutate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index a3cba1e9a34..5f64771831c 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -180,6 +180,7 @@ unfold_across <- function(.data, quos_in){ new_colname <- col } else { # column name is either the name of the item or index + # i - 1 because we exclude list col_suffix <- names(funcs)[[i]] if (col_suffix == "") { col_suffix <- i - 1 @@ -206,5 +207,4 @@ unfold_across <- function(.data, quos_in){ } quos_out - } From 2c882c3a8f7e30d89e916508f3d765aafb8a9312 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:43:01 +0100 Subject: [PATCH 08/77] Run styler --- r/R/dplyr-mutate.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 5f64771831c..9406f727ada 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -156,16 +156,14 @@ ensure_named_exprs <- function(exprs) { # Take the input quos and unfold any instances of across() # into individual quosures -unfold_across <- function(.data, quos_in){ - +unfold_across <- function(.data, quos_in) { quos_out <- list() # Check for any expressions starting with across for (quo_i in seq_along(quos_in)) { - quo_in <- quos_in[quo_i] quo_expr <- quo_get_expr(quo_in[[1]]) - if (rlang::is_call(quo_expr, "across")) { + if (is_call(quo_expr, "across")) { new_quos <- list() cols <- names(select(.data, !!quo_expr[[2]])) @@ -189,7 +187,7 @@ unfold_across <- function(.data, quos_in){ } # if we've supplied multiple functions using list() ignore this element - if (!rlang::is_symbol(func, "list")) { + if (!is_symbol(func, "list")) { # get the expression new_quo <- list(quo(!!call2(func, sym(col)))) # give the expression a name From bb730d6bac4be5217cb9bee080a312a8096d736b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:43:10 +0100 Subject: [PATCH 09/77] Import rlang functions --- r/R/arrow-package.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index f3e0b817d5f..52722e0db74 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -23,7 +23,7 @@ #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure -#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match +#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal From 8bc2f6b53dddac1d56bcaaf36ad26da3c64713e1 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:51:17 +0100 Subject: [PATCH 10/77] Run document() to import more rlang functions --- r/NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index c4c18ba16d7..d54071b4a28 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -425,6 +425,7 @@ importFrom(rlang,eval_tidy) importFrom(rlang,exec) importFrom(rlang,expr) importFrom(rlang,is_bare_character) +importFrom(rlang,is_call) importFrom(rlang,is_character) importFrom(rlang,is_empty) importFrom(rlang,is_false) @@ -432,6 +433,7 @@ importFrom(rlang,is_integerish) importFrom(rlang,is_interactive) importFrom(rlang,is_list) importFrom(rlang,is_quosure) +importFrom(rlang,is_symbol) importFrom(rlang,list2) importFrom(rlang,new_data_mask) importFrom(rlang,new_environment) From 77340e107c1b30a5521037e48ba3e8e8bbb28082 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:56:23 +0100 Subject: [PATCH 11/77] Unskip test --- r/tests/testthat/test-dplyr-mutate.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index e769205c422..b9d1844773f 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -279,16 +279,16 @@ test_that("dplyr::mutate's examples", { # Examples we don't support should succeed # but warn that they're pulling data into R to do so - # across and autosplicing: ARROW-11699 compare_dplyr_binding( .input %>% select(name, homeworld, species) %>% mutate(across(!name, as.factor)) %>% collect(), starwars, - warning = "Expression across.*not supported in Arrow" ) + + # group_by then mutate compare_dplyr_binding( .input %>% From 1963832cc1f319bf28702bbf15067ba6820f01e2 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 00:59:35 +0100 Subject: [PATCH 12/77] Pfft, whitespace --- r/tests/testthat/test-dplyr-mutate.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index b9d1844773f..6f840234cab 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -287,8 +287,6 @@ test_that("dplyr::mutate's examples", { starwars, ) - - # group_by then mutate compare_dplyr_binding( .input %>% From c6f5e82a05de1e12849a2f3ed9f1ce2d75dcb26a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 09:35:29 +0100 Subject: [PATCH 13/77] Update to work with both mutate and summarise --- r/R/dplyr-mutate.R | 7 ++++--- r/R/dplyr-summarize.R | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 9406f727ada..f294bd3fb51 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -167,16 +167,17 @@ unfold_across <- function(.data, quos_in) { new_quos <- list() cols <- names(select(.data, !!quo_expr[[2]])) - funcs <- as.list(quo_expr[[3]]) + + funcs <- quo_expr[[3]] for (col in cols) { for (i in seq_along(funcs)) { - func <- funcs[[i]] - # work out the name of the new column if (length(funcs) == 1) { + func <- funcs new_colname <- col } else { + func <- funcs[[i]] # column name is either the name of the item or index # i - 1 because we exclude list col_suffix <- names(funcs)[[i]] diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 92587f6c685..d00db8982dc 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -182,7 +182,8 @@ agg_funcs[["::"]] <- function(lhs, rhs) { summarise.arrow_dplyr_query <- function(.data, ...) { call <- match.call() .data <- as_adq(.data) - exprs <- quos(...) + exprs <- unfold_across(.data, quos(...)) + # Only retain the columns we need to do our aggregations vars_to_keep <- unique(c( unlist(lapply(exprs, all.vars)), # vars referenced in summarise @@ -198,7 +199,7 @@ summarise.arrow_dplyr_query <- function(.data, ...) { .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data))) # Try stuff, if successful return() - out <- try(do_arrow_summarize(.data, ...), silent = TRUE) + out <- try(do_arrow_summarize(.data, !!!exprs), silent = TRUE) if (inherits(out, "try-error")) { return(abandon_ship(call, .data, format(out))) } else { From b196573c88f2bfb6455678c0248ffb105bc12c1d Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 09:35:48 +0100 Subject: [PATCH 14/77] Add across tests for summarise --- r/tests/testthat/test-dplyr-summarize.R | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index f799fcbf384..8aa7475fa5c 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -1089,3 +1089,54 @@ test_that("summarise() supports namespacing", { tbl ) }) + +test_that("Can use across() within summarise()", { + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarise(across(c(int, dbl), list(mean = mean, sd = sd))) %>% + collect(), + tbl + ) + + # we don't support purrr-style formulas yet here + # but do we support them elsewhere in Arrow? + # TODO: check this + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarise(across(c(int, dbl), ~ mean(.x, na.rm = TRUE))) %>% + collect(), + tbl + ) + + # this fails as .names isn't used here + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarise(across(starts_with("dbl"), mean, .names = "mean_{.col}")) %>% + collect(), + tbl + ) + + # this fails as .names isn't used here + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarise(across(c(int, dbl), list(mean = mean, sd = sd), .names = "{.col}.{.fn}")) %>% + collect(), + tbl + ) + + # this doesn't work yet either, also because .names + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarise(across(starts_with("dbl"), list(mean, sd), .names = "{.col}.fn.{.fn}")) %>% + collect(), + tbl + ) + +}) + From 1288a406b45387d152f394b4c9d16987624fa370 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 10:19:16 +0100 Subject: [PATCH 15/77] Add comment --- r/R/dplyr-mutate.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index f294bd3fb51..acc317c0813 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -166,6 +166,8 @@ unfold_across <- function(.data, quos_in) { if (is_call(quo_expr, "across")) { new_quos <- list() + # use select to get the column names so we can take advantage of + # tidyselect cols <- names(select(.data, !!quo_expr[[2]])) funcs <- quo_expr[[3]] From 692e41ba897bf0266772274eff974850d6ebbbf3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 11:49:57 +0100 Subject: [PATCH 16/77] Comment out failing summarise tests and add some more really simple summarise tests in --- r/tests/testthat/test-dplyr-summarize.R | 64 +++++++++++++++---------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 8aa7475fa5c..3f950901e50 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -1095,48 +1095,64 @@ test_that("Can use across() within summarise()", { compare_dplyr_binding( .input %>% group_by(some_grouping) %>% - summarise(across(c(int, dbl), list(mean = mean, sd = sd))) %>% - collect(), - tbl - ) - - # we don't support purrr-style formulas yet here - # but do we support them elsewhere in Arrow? - # TODO: check this - compare_dplyr_binding( - .input %>% - group_by(some_grouping) %>% - summarise(across(c(int, dbl), ~ mean(.x, na.rm = TRUE))) %>% + summarise(across(c(int, dbl), mean)) %>% collect(), tbl ) - # this fails as .names isn't used here compare_dplyr_binding( .input %>% group_by(some_grouping) %>% - summarise(across(starts_with("dbl"), mean, .names = "mean_{.col}")) %>% + summarise(across(c(int, dbl), list(mean = mean, sd))) %>% collect(), tbl ) - # this fails as .names isn't used here compare_dplyr_binding( .input %>% group_by(some_grouping) %>% - summarise(across(c(int, dbl), list(mean = mean, sd = sd), .names = "{.col}.{.fn}")) %>% + summarise(across(c(int, dbl), list(mean = mean, sd = sd))) %>% collect(), tbl ) - # this doesn't work yet either, also because .names - compare_dplyr_binding( - .input %>% - group_by(some_grouping) %>% - summarise(across(starts_with("dbl"), list(mean, sd), .names = "{.col}.fn.{.fn}")) %>% - collect(), - tbl - ) + # we don't support purrr-style formulas yet here + # but do we support them elsewhere in Arrow? + # TODO: check this + # compare_dplyr_binding( + # .input %>% + # group_by(some_grouping) %>% + # summarise(across(c(int, dbl), ~ mean(.x, na.rm = TRUE))) %>% + # collect(), + # tbl + # ) + + # # this fails as .names isn't used here + # compare_dplyr_binding( + # .input %>% + # group_by(some_grouping) %>% + # summarise(across(starts_with("dbl"), mean, .names = "mean_{.col}")) %>% + # collect(), + # tbl + # ) + # + # # this fails as .names isn't used here + # compare_dplyr_binding( + # .input %>% + # group_by(some_grouping) %>% + # summarise(across(c(int, dbl), list(mean = mean, sd = sd), .names = "{.col}.{.fn}")) %>% + # collect(), + # tbl + # ) + # + # # this doesn't work yet either, also because .names + # compare_dplyr_binding( + # .input %>% + # group_by(some_grouping) %>% + # summarise(across(starts_with("dbl"), list(mean, sd), .names = "{.col}.fn.{.fn}")) %>% + # collect(), + # tbl + # ) }) From 5d7af315660d69c5435d7b7d8be6c369b9764e80 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 13:11:51 +0100 Subject: [PATCH 17/77] Abstract out column names logic --- r/R/dplyr-mutate.R | 51 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index acc317c0813..04d7952d2de 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -169,34 +169,35 @@ unfold_across <- function(.data, quos_in) { # use select to get the column names so we can take advantage of # tidyselect cols <- names(select(.data, !!quo_expr[[2]])) - funcs <- quo_expr[[3]] + # TODO: refactor to take in the .names argument to across() + col_names <- get_across_names(cols, funcs) + for (col in cols) { for (i in seq_along(funcs)) { + # work out the name of the new column if (length(funcs) == 1) { func <- funcs - new_colname <- col + name_ref <- i + } else { func <- funcs[[i]] - # column name is either the name of the item or index - # i - 1 because we exclude list - col_suffix <- names(funcs)[[i]] - if (col_suffix == "") { - col_suffix <- i - 1 - } - new_colname <- paste0(col, "_", col_suffix) + name_ref <- i - 1 } # if we've supplied multiple functions using list() ignore this element if (!is_symbol(func, "list")) { + # get the expression new_quo <- list(quo(!!call2(func, sym(col)))) # give the expression a name - names(new_quo) <- new_colname + names(new_quo) <- col_names[[col]][[name_ref]] + # append to temporary list of new quosures new_quos <- append(new_quos, new_quo) + } } } @@ -209,3 +210,33 @@ unfold_across <- function(.data, quos_in) { quos_out } + +get_across_names <- function(cols, funcs){ + + names <- list() + + if (length(funcs) == 1) { + names <- set_names(as.list(cols), cols) + } else { + for (i in seq_along(funcs)) { + func <- funcs[[i]] + col_suffix <- names(funcs)[[i]] + if (!is_symbol(func, "list")) { + if (col_suffix == "") { + col_suffix <- i - 1 + } + new_colnames <- set_names(paste0(cols, "_", col_suffix), cols) + + # TODO: refactor this - surely it can be vectorised? + for (col_i in seq_along(new_colnames)) { + which_item <- names(new_colnames)[[col_i]] + col_name <- new_colnames[[col_i]] + + old_vals <- names[[which_item]] + names[[which_item]] <- c(old_vals, col_name) + } + } + } + } + names +} From 9ef1c57a6a26f4cb4c1db468043f1f37e87d9144 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 13:19:14 +0100 Subject: [PATCH 18/77] Add comment --- r/R/dplyr-mutate.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 04d7952d2de..84eca1d5a71 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -177,6 +177,7 @@ unfold_across <- function(.data, quos_in) { for (col in cols) { for (i in seq_along(funcs)) { + # TODO: this has a weird smell, refactor # work out the name of the new column if (length(funcs) == 1) { func <- funcs From 1f88b8bfdfd3f411a964ae332489139cea23a996 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 13:21:21 +0100 Subject: [PATCH 19/77] Pass through .groups --- r/R/dplyr-summarize.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index d00db8982dc..7d02be8e052 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -179,7 +179,8 @@ agg_funcs[["::"]] <- function(lhs, rhs) { # The following S3 methods are registered on load if dplyr is present -summarise.arrow_dplyr_query <- function(.data, ...) { +summarise.arrow_dplyr_query <- function(.data, ..., .groups = NULL) { + call <- match.call() .data <- as_adq(.data) exprs <- unfold_across(.data, quos(...)) @@ -199,7 +200,7 @@ summarise.arrow_dplyr_query <- function(.data, ...) { .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data))) # Try stuff, if successful return() - out <- try(do_arrow_summarize(.data, !!!exprs), silent = TRUE) + out <- try(do_arrow_summarize(.data, !!!exprs, .groups = .groups), silent = TRUE) if (inherits(out, "try-error")) { return(abandon_ship(call, .data, format(out))) } else { From 36706daddf703a9401fb5724d459818949b7d4dd Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 3 Aug 2022 14:08:11 +0100 Subject: [PATCH 20/77] Back out changes to summarise() --- r/R/dplyr-summarize.R | 6 +-- r/tests/testthat/test-dplyr-summarize.R | 67 ------------------------- 2 files changed, 3 insertions(+), 70 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 7d02be8e052..2f602f1f3fb 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -179,11 +179,11 @@ agg_funcs[["::"]] <- function(lhs, rhs) { # The following S3 methods are registered on load if dplyr is present -summarise.arrow_dplyr_query <- function(.data, ..., .groups = NULL) { +summarise.arrow_dplyr_query <- function(.data, ...) { call <- match.call() .data <- as_adq(.data) - exprs <- unfold_across(.data, quos(...)) + exprs <- quos(...) # Only retain the columns we need to do our aggregations vars_to_keep <- unique(c( @@ -200,7 +200,7 @@ summarise.arrow_dplyr_query <- function(.data, ..., .groups = NULL) { .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data))) # Try stuff, if successful return() - out <- try(do_arrow_summarize(.data, !!!exprs, .groups = .groups), silent = TRUE) + out <- try(do_arrow_summarize(.data, ...), silent = TRUE) if (inherits(out, "try-error")) { return(abandon_ship(call, .data, format(out))) } else { diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 3f950901e50..f799fcbf384 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -1089,70 +1089,3 @@ test_that("summarise() supports namespacing", { tbl ) }) - -test_that("Can use across() within summarise()", { - - compare_dplyr_binding( - .input %>% - group_by(some_grouping) %>% - summarise(across(c(int, dbl), mean)) %>% - collect(), - tbl - ) - - compare_dplyr_binding( - .input %>% - group_by(some_grouping) %>% - summarise(across(c(int, dbl), list(mean = mean, sd))) %>% - collect(), - tbl - ) - - compare_dplyr_binding( - .input %>% - group_by(some_grouping) %>% - summarise(across(c(int, dbl), list(mean = mean, sd = sd))) %>% - collect(), - tbl - ) - - # we don't support purrr-style formulas yet here - # but do we support them elsewhere in Arrow? - # TODO: check this - # compare_dplyr_binding( - # .input %>% - # group_by(some_grouping) %>% - # summarise(across(c(int, dbl), ~ mean(.x, na.rm = TRUE))) %>% - # collect(), - # tbl - # ) - - # # this fails as .names isn't used here - # compare_dplyr_binding( - # .input %>% - # group_by(some_grouping) %>% - # summarise(across(starts_with("dbl"), mean, .names = "mean_{.col}")) %>% - # collect(), - # tbl - # ) - # - # # this fails as .names isn't used here - # compare_dplyr_binding( - # .input %>% - # group_by(some_grouping) %>% - # summarise(across(c(int, dbl), list(mean = mean, sd = sd), .names = "{.col}.{.fn}")) %>% - # collect(), - # tbl - # ) - # - # # this doesn't work yet either, also because .names - # compare_dplyr_binding( - # .input %>% - # group_by(some_grouping) %>% - # summarise(across(starts_with("dbl"), list(mean, sd), .names = "{.col}.fn.{.fn}")) %>% - # collect(), - # tbl - # ) - -}) - From 850dcc49dbbc7353dcf69960d9c9aa99183298bc Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 4 Aug 2022 09:28:08 +0100 Subject: [PATCH 21/77] Refactor to remove a for loop --- r/R/dplyr-mutate.R | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 84eca1d5a71..023b6263d9b 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -157,6 +157,7 @@ ensure_named_exprs <- function(exprs) { # Take the input quos and unfold any instances of across() # into individual quosures unfold_across <- function(.data, quos_in) { + quos_out <- list() # Check for any expressions starting with across for (quo_i in seq_along(quos_in)) { @@ -174,15 +175,14 @@ unfold_across <- function(.data, quos_in) { # TODO: refactor to take in the .names argument to across() col_names <- get_across_names(cols, funcs) - for (col in cols) { + # TODO: refactor some of this into helper functions + # TODO: also look at using the purrr functions + for (one_col in cols) { for (i in seq_along(funcs)) { - # TODO: this has a weird smell, refactor - # work out the name of the new column if (length(funcs) == 1) { func <- funcs name_ref <- i - } else { func <- funcs[[i]] name_ref <- i - 1 @@ -192,10 +192,9 @@ unfold_across <- function(.data, quos_in) { if (!is_symbol(func, "list")) { # get the expression - new_quo <- list(quo(!!call2(func, sym(col)))) + new_quo <- list(quo(!!call2(func, sym(one_col)))) # give the expression a name - names(new_quo) <- col_names[[col]][[name_ref]] - + names(new_quo) <- col_names[[one_col]][[name_ref]] # append to temporary list of new quosures new_quos <- append(new_quos, new_quo) @@ -228,13 +227,11 @@ get_across_names <- function(cols, funcs){ } new_colnames <- set_names(paste0(cols, "_", col_suffix), cols) - # TODO: refactor this - surely it can be vectorised? - for (col_i in seq_along(new_colnames)) { - which_item <- names(new_colnames)[[col_i]] - col_name <- new_colnames[[col_i]] - - old_vals <- names[[which_item]] - names[[which_item]] <- c(old_vals, col_name) + # append the new names to the list of column names + if (is_empty(names)) { + names <- as.list(new_colnames) + } else { + names <- Map(c, names, new_colnames) } } } From 290d549d40a3ba850c56efec15c9e329403ebdfd Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sun, 7 Aug 2022 18:50:38 +0100 Subject: [PATCH 22/77] Refactor to remove for loops --- r/R/dplyr-mutate.R | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 023b6263d9b..6983f1eeff5 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -211,30 +211,20 @@ unfold_across <- function(.data, quos_in) { quos_out } +#' @return A named list, each element is a vector of column names get_across_names <- function(cols, funcs){ names <- list() - if (length(funcs) == 1) { names <- set_names(as.list(cols), cols) } else { - for (i in seq_along(funcs)) { - func <- funcs[[i]] - col_suffix <- names(funcs)[[i]] - if (!is_symbol(func, "list")) { - if (col_suffix == "") { - col_suffix <- i - 1 - } - new_colnames <- set_names(paste0(cols, "_", col_suffix), cols) - - # append the new names to the list of column names - if (is_empty(names)) { - names <- as.list(new_colnames) - } else { - names <- Map(c, names, new_colnames) - } - } - } + extracted_funcs <- funcs[map_lgl(funcs, ~!is_symbol(.x, "list"))] + func_names <- names(extracted_funcs) + func_indices <- seq_along(extracted_funcs) + # if the function is unnamed (an empty character), use the index instead + suffixes <- map2_chr(func_names, func_indices, max) + out_cols <- map(cols, ~paste(.x, suffixes, sep="_")) + names <- set_names(out_cols, cols) } names } From ad6033b9b4eabc192d6957c81890a653246db340 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sun, 7 Aug 2022 18:54:42 +0100 Subject: [PATCH 23/77] Move comments around --- r/R/dplyr-mutate.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 6983f1eeff5..6b853d0149f 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -218,10 +218,11 @@ get_across_names <- function(cols, funcs){ if (length(funcs) == 1) { names <- set_names(as.list(cols), cols) } else { + # list() is used to specify the list of functions so remove it extracted_funcs <- funcs[map_lgl(funcs, ~!is_symbol(.x, "list"))] + # if the function is unnamed (an empty character), use the index instead func_names <- names(extracted_funcs) func_indices <- seq_along(extracted_funcs) - # if the function is unnamed (an empty character), use the index instead suffixes <- map2_chr(func_names, func_indices, max) out_cols <- map(cols, ~paste(.x, suffixes, sep="_")) names <- set_names(out_cols, cols) From f5051e1b3641ccf7721a78780b92fa2a9cf346b2 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sun, 7 Aug 2022 21:30:38 +0100 Subject: [PATCH 24/77] Refactor to calculate names and quosures together --- r/R/dplyr-mutate.R | 66 ++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 6b853d0149f..9ea98d311b4 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -167,41 +167,13 @@ unfold_across <- function(.data, quos_in) { if (is_call(quo_expr, "across")) { new_quos <- list() - # use select to get the column names so we can take advantage of - # tidyselect + # use select to get the column names so we can take advantage of tidyselect cols <- names(select(.data, !!quo_expr[[2]])) funcs <- quo_expr[[3]] # TODO: refactor to take in the .names argument to across() - col_names <- get_across_names(cols, funcs) - - # TODO: refactor some of this into helper functions - # TODO: also look at using the purrr functions - for (one_col in cols) { - for (i in seq_along(funcs)) { - - if (length(funcs) == 1) { - func <- funcs - name_ref <- i - } else { - func <- funcs[[i]] - name_ref <- i - 1 - } - - # if we've supplied multiple functions using list() ignore this element - if (!is_symbol(func, "list")) { - - # get the expression - new_quo <- list(quo(!!call2(func, sym(one_col)))) - # give the expression a name - names(new_quo) <- col_names[[one_col]][[name_ref]] - # append to temporary list of new quosures - new_quos <- append(new_quos, new_quo) - - } - } - } - # append the new expressions generated + new_quos <- get_across_names(cols, funcs) + quos_out <- append(quos_out, new_quos) } else { quos_out <- append(quos_out, quo_in) @@ -214,18 +186,38 @@ unfold_across <- function(.data, quos_in) { #' @return A named list, each element is a vector of column names get_across_names <- function(cols, funcs){ - names <- list() + new_quosures <- list() + if (length(funcs) == 1) { - names <- set_names(as.list(cols), cols) + # work out the quosures from the call + col_syms <- syms(cols) + new_quosures <- map(col_syms, ~quo(!!call2(funcs, .x))) + # if only 1 function, we overwrite the old columns + new_quosures <- set_names(new_quosures, cols) } else { # list() is used to specify the list of functions so remove it extracted_funcs <- funcs[map_lgl(funcs, ~!is_symbol(.x, "list"))] + # if the function is unnamed (an empty character), use the index instead + func_list <- as.list(extracted_funcs) func_names <- names(extracted_funcs) func_indices <- seq_along(extracted_funcs) - suffixes <- map2_chr(func_names, func_indices, max) - out_cols <- map(cols, ~paste(.x, suffixes, sep="_")) - names <- set_names(out_cols, cols) + names(func_list) <- map2_chr(func_names, func_indices, max) + + func_list_full <- rep(func_list, length(cols)) + cols <- rep(cols, each = length(func_list)) + + # get names of new quosures + new_quo_names <- map2_chr(names(func_list_full), cols, ~paste(.y, .x, sep="_")) + + # get new quosures + new_quo_list <- map2(func_list_full, cols, ~quo(!!call2(.x, sym(.y)))) + + new_quosures <- set_names(new_quo_list, new_quo_names) + } - names + new_quosures } + +# write the function which gets you an individual quosire and names is +# then write/find the functions which takes the list of From 1dcb5bb9807aab9825793c97736f18b9019e78f3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sun, 7 Aug 2022 21:32:54 +0100 Subject: [PATCH 25/77] Refactor back into original function --- r/R/dplyr-mutate.R | 70 +++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 41 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 9ea98d311b4..0c12b5460d7 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -157,7 +157,6 @@ ensure_named_exprs <- function(exprs) { # Take the input quos and unfold any instances of across() # into individual quosures unfold_across <- function(.data, quos_in) { - quos_out <- list() # Check for any expressions starting with across for (quo_i in seq_along(quos_in)) { @@ -172,52 +171,41 @@ unfold_across <- function(.data, quos_in) { funcs <- quo_expr[[3]] # TODO: refactor to take in the .names argument to across() - new_quos <- get_across_names(cols, funcs) - - quos_out <- append(quos_out, new_quos) - } else { - quos_out <- append(quos_out, quo_in) - } - } - - quos_out -} - -#' @return A named list, each element is a vector of column names -get_across_names <- function(cols, funcs){ - - new_quosures <- list() + new_quos <- list() - if (length(funcs) == 1) { - # work out the quosures from the call - col_syms <- syms(cols) - new_quosures <- map(col_syms, ~quo(!!call2(funcs, .x))) - # if only 1 function, we overwrite the old columns - new_quosures <- set_names(new_quosures, cols) - } else { - # list() is used to specify the list of functions so remove it - extracted_funcs <- funcs[map_lgl(funcs, ~!is_symbol(.x, "list"))] + if (length(funcs) == 1) { + # work out the quosures from the call + col_syms <- syms(cols) + new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x))) + # if only 1 function, we overwrite the old columns + new_quos <- set_names(new_quos, cols) + } else { + # list() is used to specify the list of functions so remove it + extracted_funcs <- funcs[map_lgl(funcs, ~ !is_symbol(.x, "list"))] - # if the function is unnamed (an empty character), use the index instead - func_list <- as.list(extracted_funcs) - func_names <- names(extracted_funcs) - func_indices <- seq_along(extracted_funcs) - names(func_list) <- map2_chr(func_names, func_indices, max) + # if the function is unnamed (an empty character), use the index instead + func_list <- as.list(extracted_funcs) + func_names <- names(extracted_funcs) + func_indices <- seq_along(extracted_funcs) + names(func_list) <- map2_chr(func_names, func_indices, max) - func_list_full <- rep(func_list, length(cols)) - cols <- rep(cols, each = length(func_list)) + func_list_full <- rep(func_list, length(cols)) + cols <- rep(cols, each = length(func_list)) - # get names of new quosures - new_quo_names <- map2_chr(names(func_list_full), cols, ~paste(.y, .x, sep="_")) + # get names of new quosures + new_quo_names <- map2_chr(names(func_list_full), cols, ~ paste(.y, .x, sep = "_")) - # get new quosures - new_quo_list <- map2(func_list_full, cols, ~quo(!!call2(.x, sym(.y)))) + # get new quosures + new_quo_list <- map2(func_list_full, cols, ~ quo(!!call2(.x, sym(.y)))) - new_quosures <- set_names(new_quo_list, new_quo_names) + new_quos <- set_names(new_quo_list, new_quo_names) + } + quos_out <- append(quos_out, new_quos) + } else { + quos_out <- append(quos_out, quo_in) + } } - new_quosures -} -# write the function which gets you an individual quosire and names is -# then write/find the functions which takes the list of + quos_out +} From dffb49e397f5cc1248995ad99df1116afccb8c0f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sun, 7 Aug 2022 21:34:32 +0100 Subject: [PATCH 26/77] Add a TODO --- r/R/dplyr-mutate.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 0c12b5460d7..76ee05fe328 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -159,6 +159,7 @@ ensure_named_exprs <- function(exprs) { unfold_across <- function(.data, quos_in) { quos_out <- list() # Check for any expressions starting with across + # TODO: can we refactor this to not be a for loop? for (quo_i in seq_along(quos_in)) { quo_in <- quos_in[quo_i] quo_expr <- quo_get_expr(quo_in[[1]]) From 63bc6952209835d6b67a38faa92180c55fc065fb Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 9 Aug 2022 22:21:26 +0100 Subject: [PATCH 27/77] Update failing test and add justification in comment --- r/tests/testthat/test-dplyr-mutate.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 6f840234cab..6e103f4ea06 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -279,10 +279,11 @@ test_that("dplyr::mutate's examples", { # Examples we don't support should succeed # but warn that they're pulling data into R to do so + # test modified from version in dplyr::mutate due to ARROW-12632 compare_dplyr_binding( .input %>% - select(name, homeworld, species) %>% - mutate(across(!name, as.factor)) %>% + select(name, height, mass) %>% + mutate(across(!name, as.character)) %>% collect(), starwars, ) From 62859769cde38f7edb02b5dc24cf1e0ec499cb4e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 9 Aug 2022 22:23:17 +0100 Subject: [PATCH 28/77] Remove whitespace --- r/R/dplyr-summarize.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 2f602f1f3fb..92587f6c685 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -180,11 +180,9 @@ agg_funcs[["::"]] <- function(lhs, rhs) { # The following S3 methods are registered on load if dplyr is present summarise.arrow_dplyr_query <- function(.data, ...) { - call <- match.call() .data <- as_adq(.data) exprs <- quos(...) - # Only retain the columns we need to do our aggregations vars_to_keep <- unique(c( unlist(lapply(exprs, all.vars)), # vars referenced in summarise From 02d92d3e7ebd633b4c260f9595be4825977c1fec Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 9 Aug 2022 22:31:50 +0100 Subject: [PATCH 29/77] Add some failing tests --- r/tests/testthat/test-dplyr-mutate.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 6e103f4ea06..7fab0abca1b 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -612,6 +612,20 @@ test_that("Can use across() within mutate()", { tbl ) + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, list(round))) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, c(round))) %>% + collect(), + tbl + ) + expect_error( compare_dplyr_binding( .input %>% From de3ebd069b3f10fceed5fbe9981adf23d17705ef Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 9 Aug 2022 22:42:03 +0100 Subject: [PATCH 30/77] Add another failing test --- r/tests/testthat/test-dplyr-mutate.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 7fab0abca1b..f2046f4b685 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -598,6 +598,13 @@ test_that("Can use across() within mutate()", { tbl ) + compare_dplyr_binding( + .input %>% + mutate(.fns = round, across(c(dbl, dbl2))) %>% + collect(), + tbl + ) + compare_dplyr_binding( .input %>% mutate(across(c(1, 2), round)) %>% From 3c465b0adbfde12708b2ae700a8ed5d28b43b779 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 9 Aug 2022 22:42:48 +0100 Subject: [PATCH 31/77] Fix typo --- r/tests/testthat/test-dplyr-mutate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index f2046f4b685..bc03110c315 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -600,7 +600,7 @@ test_that("Can use across() within mutate()", { compare_dplyr_binding( .input %>% - mutate(.fns = round, across(c(dbl, dbl2))) %>% + mutate(across(.fns = round, c(dbl, dbl2))) %>% collect(), tbl ) From 5c56bfdfa82ac3a01e20689511a696de509be8cf Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 9 Aug 2022 23:05:01 +0100 Subject: [PATCH 32/77] Use match.call so we don't worry about argument order --- r/R/dplyr-mutate.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 76ee05fe328..cdc066adb7f 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -166,10 +166,16 @@ unfold_across <- function(.data, quos_in) { if (is_call(quo_expr, "across")) { new_quos <- list() + call <- match.call(dplyr::across, quo_expr) # use select to get the column names so we can take advantage of tidyselect - cols <- names(select(.data, !!quo_expr[[2]])) - funcs <- quo_expr[[3]] + cols <- names(select(.data, !!call[[".cols"]])) + funcs <- call[[".fns"]] + + # ARROW-17364: add support for .names argument + if (!is.null(call[[".names"]])) { + abort("`.names` argument to `across()` not yet supported in Arrow") + } # TODO: refactor to take in the .names argument to across() new_quos <- list() From 869081d070d933c7d0c9bbc141ddf115097b0ab7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 00:01:42 +0100 Subject: [PATCH 33/77] Raise error for deprecated argument, add tests for unsupported and deprecated argument errors, add more failing tests --- r/R/dplyr-mutate.R | 18 +++++++++----- r/tests/testthat/test-dplyr-mutate.R | 35 ++++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index cdc066adb7f..ec27d0b86ed 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -166,24 +166,30 @@ unfold_across <- function(.data, quos_in) { if (is_call(quo_expr, "across")) { new_quos <- list() - call <- match.call(dplyr::across, quo_expr) + across_call <- match.call(dplyr::across, quo_expr) # use select to get the column names so we can take advantage of tidyselect - cols <- names(select(.data, !!call[[".cols"]])) - funcs <- call[[".fns"]] + cols <- names(select(.data, !!across_call[[".cols"]])) + funcs <- across_call[[".fns"]] + + # ensure isn't single item vector or list + funcs <- funcs[!as.character(funcs) %in% c("c", "list")] # ARROW-17364: add support for .names argument - if (!is.null(call[[".names"]])) { + if (!is.null(across_call[[".names"]])) { abort("`.names` argument to `across()` not yet supported in Arrow") } - # TODO: refactor to take in the .names argument to across() + if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) { + abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") + } + new_quos <- list() if (length(funcs) == 1) { # work out the quosures from the call col_syms <- syms(cols) - new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x))) + new_quos <- map(col_syms, ~ quo(!!call2(as.character(funcs), .x))) # if only 1 function, we overwrite the old columns new_quos <- set_names(new_quos, cols) } else { diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index bc03110c315..c54f15defe3 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -598,6 +598,7 @@ test_that("Can use across() within mutate()", { tbl ) + # across() arguments not in default order compare_dplyr_binding( .input %>% mutate(across(.fns = round, c(dbl, dbl2))) %>% @@ -605,30 +606,56 @@ test_that("Can use across() within mutate()", { tbl ) + # ARROW-17364: .names argument not yet supported for across() + expect_error( + tbl %>% + arrow_table() %>% + mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% + collect(), + regexp = "`.names` argument to `across()` not yet supported in Arrow", + fixed = TRUE + ) + + # ellipses (...) are a deprecated argument + expect_error( + tbl %>% + arrow_table() %>% + mutate(across(c(dbl, dbl2), round, digits = -1)) %>% + collect(), + regexp = "`...` argument to `across()` is deprecated in dplyr and not supported in Arrow", + fixed = TRUE + ) + + # alternative ways of specifying .fns - as a list compare_dplyr_binding( .input %>% - mutate(across(c(1, 2), round)) %>% + mutate(across(1:dbl2, list(round))) %>% collect(), tbl ) + # supply .fns as a one-item vector + # Unclear as of yet whether the dplyr behaviour is a bug or not + # See: https://github.com/tidyverse/dplyr/issues/6395 compare_dplyr_binding( .input %>% - mutate(across(1:dbl2, round)) %>% + mutate(across(1:dbl2, c(round))) %>% collect(), tbl ) + # use a purrr-style lambda formula compare_dplyr_binding( .input %>% - mutate(across(1:dbl2, list(round))) %>% + mutate(across(1:dbl2, ~round(.x, digits = -1))) %>% collect(), tbl ) + # .fns = NULL, the default compare_dplyr_binding( .input %>% - mutate(across(1:dbl2, c(round))) %>% + mutate(across(1:dbl2, NULL)) %>% collect(), tbl ) From 20c73bdbab601ae600587d80e54569fc170e7baa Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 00:14:05 +0100 Subject: [PATCH 34/77] Skip failing tests as they're part of .names --- r/tests/testthat/test-dplyr-mutate.R | 30 +++++++++++++++++----------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index c54f15defe3..1d8b671cd63 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -627,21 +627,27 @@ test_that("Can use across() within mutate()", { ) # alternative ways of specifying .fns - as a list - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, list(round))) %>% - collect(), - tbl + # ARROW-17364: .names argument not yet supported for across() + # See: https://github.com/tidyverse/dplyr/issues/6395 for why this is related + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, list(round))) %>% + collect(), + tbl + ) ) # supply .fns as a one-item vector - # Unclear as of yet whether the dplyr behaviour is a bug or not - # See: https://github.com/tidyverse/dplyr/issues/6395 - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, c(round))) %>% - collect(), - tbl + # ARROW-17364: .names argument not yet supported for across() + # See: https://github.com/tidyverse/dplyr/issues/6395 for why this is related + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, c(round))) %>% + collect(), + tbl + ) ) # use a purrr-style lambda formula From 5eb31628e29384fb3395c27d92670503f7caf154 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 00:19:22 +0100 Subject: [PATCH 35/77] Remove redundant calls to as.character --- r/R/dplyr-mutate.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index ec27d0b86ed..1fd52308eeb 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -170,10 +170,10 @@ unfold_across <- function(.data, quos_in) { # use select to get the column names so we can take advantage of tidyselect cols <- names(select(.data, !!across_call[[".cols"]])) - funcs <- across_call[[".fns"]] + funcs <- as.character(across_call[[".fns"]]) # ensure isn't single item vector or list - funcs <- funcs[!as.character(funcs) %in% c("c", "list")] + funcs <- funcs[!funcs %in% c("c", "list")] # ARROW-17364: add support for .names argument if (!is.null(across_call[[".names"]])) { @@ -189,7 +189,7 @@ unfold_across <- function(.data, quos_in) { if (length(funcs) == 1) { # work out the quosures from the call col_syms <- syms(cols) - new_quos <- map(col_syms, ~ quo(!!call2(as.character(funcs), .x))) + new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x))) # if only 1 function, we overwrite the old columns new_quos <- set_names(new_quos, cols) } else { From 45fce8acb2eefeb7f3db7f92358a7ce63ac11d95 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 00:42:39 +0100 Subject: [PATCH 36/77] Reorder code and return early if .fns is NULL --- r/R/dplyr-mutate.R | 22 +++++++++++++++------- r/tests/testthat/test-dplyr-mutate.R | 16 ++++++++++------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 1fd52308eeb..1dcd1dfe01f 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -168,20 +168,28 @@ unfold_across <- function(.data, quos_in) { new_quos <- list() across_call <- match.call(dplyr::across, quo_expr) + if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) { + abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") + } + + # ARROW-17364: add support for .names argument + if (!is.null(across_call[[".names"]])) { + abort("`.names` argument to `across()` not yet supported in Arrow") + } + # use select to get the column names so we can take advantage of tidyselect cols <- names(select(.data, !!across_call[[".cols"]])) funcs <- as.character(across_call[[".fns"]]) + if (is_empty(funcs)) { + return() + } + # ensure isn't single item vector or list funcs <- funcs[!funcs %in% c("c", "list")] - # ARROW-17364: add support for .names argument - if (!is.null(across_call[[".names"]])) { - abort("`.names` argument to `across()` not yet supported in Arrow") - } - - if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) { - abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") + if (funcs[[1]] == "~") { + abort("purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow") } new_quos <- list() diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 1d8b671cd63..293b3914237 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -650,12 +650,16 @@ test_that("Can use across() within mutate()", { ) ) - # use a purrr-style lambda formula - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, ~round(.x, digits = -1))) %>% - collect(), - tbl + # ARROW-17366: purrr-style lmabda functions not yet supported + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, ~round(.x, digits = -1))) %>% + collect(), + tbl + ), + regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", + fixed = TRUE ) # .fns = NULL, the default From f40d76edda9cbd74edb887d845aa72e88997855e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 00:46:46 +0100 Subject: [PATCH 37/77] Run styler --- r/tests/testthat/test-dplyr-mutate.R | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 293b3914237..0ad5201b55a 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -590,7 +590,6 @@ test_that("mutate() and transmute() with namespaced functions", { }) test_that("Can use across() within mutate()", { - compare_dplyr_binding( .input %>% mutate(across(c(dbl, dbl2), round)) %>% @@ -612,7 +611,7 @@ test_that("Can use across() within mutate()", { arrow_table() %>% mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% collect(), - regexp = "`.names` argument to `across()` not yet supported in Arrow", + regexp = "`.names` argument to `across()` not yet supported in Arrow", fixed = TRUE ) @@ -622,7 +621,7 @@ test_that("Can use across() within mutate()", { arrow_table() %>% mutate(across(c(dbl, dbl2), round, digits = -1)) %>% collect(), - regexp = "`...` argument to `across()` is deprecated in dplyr and not supported in Arrow", + regexp = "`...` argument to `across()` is deprecated in dplyr and not supported in Arrow", fixed = TRUE ) @@ -631,10 +630,10 @@ test_that("Can use across() within mutate()", { # See: https://github.com/tidyverse/dplyr/issues/6395 for why this is related expect_error( compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, list(round))) %>% - collect(), - tbl + .input %>% + mutate(across(1:dbl2, list(round))) %>% + collect(), + tbl ) ) @@ -654,10 +653,10 @@ test_that("Can use across() within mutate()", { expect_error( compare_dplyr_binding( .input %>% - mutate(across(1:dbl2, ~round(.x, digits = -1))) %>% + mutate(across(1:dbl2, ~ round(.x, digits = -1))) %>% collect(), tbl - ), + ), regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", fixed = TRUE ) @@ -692,5 +691,4 @@ test_that("Can use across() within mutate()", { "window functions not currently supported in Arrow; pulling data into R", fixed = TRUE ) - }) From 5c720a79e0c823e28948d208d25ca1b65794df27 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 08:05:28 +0100 Subject: [PATCH 38/77] Add comments, refactor some bits out as functions --- r/R/dplyr-mutate.R | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 1dcd1dfe01f..2f3546c8a8a 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -159,7 +159,6 @@ ensure_named_exprs <- function(exprs) { unfold_across <- function(.data, quos_in) { quos_out <- list() # Check for any expressions starting with across - # TODO: can we refactor this to not be a for loop? for (quo_i in seq_along(quos_in)) { quo_in <- quos_in[quo_i] quo_expr <- quo_get_expr(quo_in[[1]]) @@ -177,10 +176,11 @@ unfold_across <- function(.data, quos_in) { abort("`.names` argument to `across()` not yet supported in Arrow") } - # use select to get the column names so we can take advantage of tidyselect + # use select() to get the column names so we can take advantage of tidyselect cols <- names(select(.data, !!across_call[[".cols"]])) funcs <- as.character(across_call[[".fns"]]) + # calling across() with .fns = NULL returns all columns unchanged if (is_empty(funcs)) { return() } @@ -204,20 +204,16 @@ unfold_across <- function(.data, quos_in) { # list() is used to specify the list of functions so remove it extracted_funcs <- funcs[map_lgl(funcs, ~ !is_symbol(.x, "list"))] - # if the function is unnamed (an empty character), use the index instead - func_list <- as.list(extracted_funcs) - func_names <- names(extracted_funcs) - func_indices <- seq_along(extracted_funcs) - names(func_list) <- map2_chr(func_names, func_indices, max) + func_list <- ensure_named_funcs(extracted_funcs) func_list_full <- rep(func_list, length(cols)) - cols <- rep(cols, each = length(func_list)) + cols_list_full <- rep(cols, each = length(func_list)) # get names of new quosures - new_quo_names <- map2_chr(names(func_list_full), cols, ~ paste(.y, .x, sep = "_")) + new_quo_names <- map2_chr(names(func_list_full), cols_list_full, ~ paste(.y, .x, sep = "_")) # get new quosures - new_quo_list <- map2(func_list_full, cols, ~ quo(!!call2(.x, sym(.y)))) + new_quo_list <- map2(func_list_full, cols_list_full, ~ quo(!!call2(.x, sym(.y)))) new_quos <- set_names(new_quo_list, new_quo_names) } @@ -230,3 +226,12 @@ unfold_across <- function(.data, quos_in) { quos_out } + +# if the function is unnamed (an empty character), use the index instead +ensure_named_funcs <- function(funcs){ + func_list <- as.list(funcs) + func_names <- names(funcs) + func_indices <- seq_along(funcs) + names(func_list) <- map2_chr(func_names, func_indices, max) + func_list +} From 32ce5502cdb04afc290ee85f10c95f9cd636358f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 08:20:03 +0100 Subject: [PATCH 39/77] Add another failing test --- r/tests/testthat/test-dplyr-mutate.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 0ad5201b55a..b1232c09bc6 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -597,6 +597,13 @@ test_that("Can use across() within mutate()", { tbl ) + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>% + collect(), + tbl + ) + # across() arguments not in default order compare_dplyr_binding( .input %>% From bccbe0acf76c86be9b6597b8b0da249376102169 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 08:57:44 +0100 Subject: [PATCH 40/77] Remove broken workaround, move comments to better location, refactor out long code into functions --- r/R/dplyr-mutate.R | 54 +++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 2f3546c8a8a..a9c21a87b97 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -185,37 +185,27 @@ unfold_across <- function(.data, quos_in) { return() } - # ensure isn't single item vector or list - funcs <- funcs[!funcs %in% c("c", "list")] - if (funcs[[1]] == "~") { - abort("purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow") + abort( + paste( + "purrr-style lambda functions as `.fns` argument to `across()`", + "not yet supported in Arrow" + ) + ) } - new_quos <- list() - + # if only 1 function, we overwrite the old columns with the new values if (length(funcs) == 1) { # work out the quosures from the call col_syms <- syms(cols) new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x))) - # if only 1 function, we overwrite the old columns new_quos <- set_names(new_quos, cols) } else { - # list() is used to specify the list of functions so remove it - extracted_funcs <- funcs[map_lgl(funcs, ~ !is_symbol(.x, "list"))] + # remove `c()` and `list()` which have been used to specify functions + extracted_funcs <- funcs[map_lgl(funcs, ~ !.x %in% c("c", "list"))] func_list <- ensure_named_funcs(extracted_funcs) - - func_list_full <- rep(func_list, length(cols)) - cols_list_full <- rep(cols, each = length(func_list)) - - # get names of new quosures - new_quo_names <- map2_chr(names(func_list_full), cols_list_full, ~ paste(.y, .x, sep = "_")) - - # get new quosures - new_quo_list <- map2(func_list_full, cols_list_full, ~ quo(!!call2(.x, sym(.y)))) - - new_quos <- set_names(new_quo_list, new_quo_names) + new_quos <- quosures_from_func_list(func_list, cols) } quos_out <- append(quos_out, new_quos) @@ -228,10 +218,30 @@ unfold_across <- function(.data, quos_in) { } # if the function is unnamed (an empty character), use the index instead -ensure_named_funcs <- function(funcs){ +ensure_named_funcs <- function(funcs) { func_list <- as.list(funcs) - func_names <- names(funcs) + func_names <- names(funcs) %||% rep("", length(funcs)) func_indices <- seq_along(funcs) names(func_list) <- map2_chr(func_names, func_indices, max) func_list } + +# given a named list of functions and column names, create a list of new quosures +quosures_from_func_list <- function(func_list, cols) { + func_list_full <- rep(func_list, length(cols)) + cols_list_full <- rep(cols, each = length(func_list)) + + # get names of new quosures + new_quo_names <- map2_chr( + names(func_list_full), cols_list_full, + ~ paste(.y, .x, sep = "_") + ) + + # get new quosures + new_quo_list <- map2( + func_list_full, cols_list_full, + ~ quo(!!call2(.x, sym(.y))) + ) + + set_names(new_quo_list, new_quo_names) +} From 721d420c4343fe75a4d1bccaf6f65c78121334db Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Aug 2022 08:58:19 +0100 Subject: [PATCH 41/77] Uncomment tests --- r/tests/testthat/test-dplyr-mutate.R | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index b1232c09bc6..03981fe98a0 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -633,27 +633,19 @@ test_that("Can use across() within mutate()", { ) # alternative ways of specifying .fns - as a list - # ARROW-17364: .names argument not yet supported for across() - # See: https://github.com/tidyverse/dplyr/issues/6395 for why this is related - expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, list(round))) %>% - collect(), - tbl - ) + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, list(round))) %>% + collect(), + tbl ) # supply .fns as a one-item vector - # ARROW-17364: .names argument not yet supported for across() - # See: https://github.com/tidyverse/dplyr/issues/6395 for why this is related - expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, c(round))) %>% - collect(), - tbl - ) + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, c(round))) %>% + collect(), + tbl ) # ARROW-17366: purrr-style lmabda functions not yet supported From 3cb51a4ed06e6f74c8b366e88957b35b750a6665 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 12 Aug 2022 15:22:56 +0100 Subject: [PATCH 42/77] Add failing test --- r/tests/testthat/test-dplyr-mutate.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 03981fe98a0..e2834c7b638 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -604,6 +604,13 @@ test_that("Can use across() within mutate()", { tbl ) + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), list("fun1" = round, "fun2" = sqrt))) %>% + collect(), + tbl + ) + # across() arguments not in default order compare_dplyr_binding( .input %>% From c34fc78d7aa7c0d18236d62c92321d153e4bacaa Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 11 Aug 2022 16:27:00 +0100 Subject: [PATCH 43/77] Update r/R/dplyr-mutate.R Co-authored-by: Dewey Dunnington --- r/R/dplyr-mutate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index a9c21a87b97..cbe0e7e3239 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -177,7 +177,7 @@ unfold_across <- function(.data, quos_in) { } # use select() to get the column names so we can take advantage of tidyselect - cols <- names(select(.data, !!across_call[[".cols"]])) + cols <- names(dplyr::select(.data, !!across_call[[".cols"]])) funcs <- as.character(across_call[[".fns"]]) # calling across() with .fns = NULL returns all columns unchanged From a4ee3871ac8059a5861499d18f0b9cb256a5380c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 11 Aug 2022 16:31:09 +0100 Subject: [PATCH 44/77] Update r/tests/testthat/test-dplyr-mutate.R Co-authored-by: Dewey Dunnington --- r/tests/testthat/test-dplyr-mutate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index e2834c7b638..0e311543aa9 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -655,7 +655,7 @@ test_that("Can use across() within mutate()", { tbl ) - # ARROW-17366: purrr-style lmabda functions not yet supported + # ARROW-17366: purrr-style lambda functions not yet supported expect_error( compare_dplyr_binding( .input %>% From b7b4fa489fcb757fcc1acd6fb8382bc1918a1133 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 12 Aug 2022 15:27:42 +0100 Subject: [PATCH 45/77] Add another failing test --- r/tests/testthat/test-dplyr-mutate.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 0e311543aa9..fa897bfd86f 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -611,6 +611,14 @@ test_that("Can use across() within mutate()", { tbl ) + expect_error( + tbl %>% + arrow_table() %>% + mutate(across(c(dbl, dbl2), list("fun1" = round(sqrt(dbl))))) %>% + collect(), + regexp = "add in a decent error message" + ) + # across() arguments not in default order compare_dplyr_binding( .input %>% From f6114d279df3981b283cf3b196a714109ef7217c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 15 Aug 2022 12:32:46 +0100 Subject: [PATCH 46/77] Use user-supplied list name for output column name --- r/R/arrow-package.R | 3 ++- r/R/dplyr-mutate.R | 22 ++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 52722e0db74..21a23175494 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -23,7 +23,8 @@ #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure -#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call +#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args +#' @importFrom rlang call_match #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index cbe0e7e3239..14bca9c4547 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -165,7 +165,7 @@ unfold_across <- function(.data, quos_in) { if (is_call(quo_expr, "across")) { new_quos <- list() - across_call <- match.call(dplyr::across, quo_expr) + across_call <- call_match(quo_expr, dplyr::across, defaults = TRUE) if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) { abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") @@ -176,16 +176,16 @@ unfold_across <- function(.data, quos_in) { abort("`.names` argument to `across()` not yet supported in Arrow") } - # use select() to get the column names so we can take advantage of tidyselect - cols <- names(dplyr::select(.data, !!across_call[[".cols"]])) - funcs <- as.character(across_call[[".fns"]]) + # use select() to get the columns so we can take advantage of tidyselect + source_cols <- names(dplyr::select(.data, !!across_call[[".cols"]])) + funcs <- across_call[[".fns"]] # calling across() with .fns = NULL returns all columns unchanged if (is_empty(funcs)) { return() } - if (funcs[[1]] == "~") { + if (!is_list(funcs) && as.character(funcs)[[1]] == "~") { abort( paste( "purrr-style lambda functions as `.fns` argument to `across()`", @@ -195,17 +195,15 @@ unfold_across <- function(.data, quos_in) { } # if only 1 function, we overwrite the old columns with the new values - if (length(funcs) == 1) { + if (length(funcs) == 1 && is.name(funcs)) { # work out the quosures from the call - col_syms <- syms(cols) + col_syms <- syms(source_cols) new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x))) - new_quos <- set_names(new_quos, cols) + new_quos <- set_names(new_quos, source_cols) } else { - # remove `c()` and `list()` which have been used to specify functions - extracted_funcs <- funcs[map_lgl(funcs, ~ !.x %in% c("c", "list"))] - + extracted_funcs <- call_args(funcs) func_list <- ensure_named_funcs(extracted_funcs) - new_quos <- quosures_from_func_list(func_list, cols) + new_quos <- quosures_from_func_list(func_list, source_cols) } quos_out <- append(quos_out, new_quos) From 7707ce0917c1c870fb0553be5c6178224fcf1500 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 15 Aug 2022 12:41:51 +0100 Subject: [PATCH 47/77] Update test for dodgy input --- r/tests/testthat/test-dplyr-mutate.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index fa897bfd86f..ec43cac1284 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -611,12 +611,18 @@ test_that("Can use across() within mutate()", { tbl ) + # this is valid is neither R nor Arrow expect_error( - tbl %>% - arrow_table() %>% - mutate(across(c(dbl, dbl2), list("fun1" = round(sqrt(dbl))))) %>% - collect(), - regexp = "add in a decent error message" + expect_warning( + compare_dplyr_binding( + .input %>% + arrow_table() %>% + mutate(across(c(dbl, dbl2), list("fun1" = round(sqrt(dbl))))) %>% + collect(), + tbl, + warning = TRUE + ) + ) ) # across() arguments not in default order From 02107f37d5177c0ec6797f94b896f4e321d533fc Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 15 Aug 2022 12:50:49 +0100 Subject: [PATCH 48/77] Run devtools::document() --- r/NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index d54071b4a28..9816e11d69b 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -413,6 +413,8 @@ importFrom(rlang,as_function) importFrom(rlang,as_label) importFrom(rlang,as_quosure) importFrom(rlang,call2) +importFrom(rlang,call_args) +importFrom(rlang,call_match) importFrom(rlang,caller_env) importFrom(rlang,dots_n) importFrom(rlang,enexpr) From f836d2a7bab32055d9ca72b1e2b30c93e8697cca Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 15 Aug 2022 12:59:58 +0100 Subject: [PATCH 49/77] Ensure new quos have correct env --- r/R/arrow-package.R | 2 +- r/R/dplyr-mutate.R | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 21a23175494..d9fe65ce5c7 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -24,7 +24,7 @@ #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args -#' @importFrom rlang call_match +#' @importFrom rlang call_match quo_set_env quo_get_env #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 14bca9c4547..5620cfefe99 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -162,6 +162,7 @@ unfold_across <- function(.data, quos_in) { for (quo_i in seq_along(quos_in)) { quo_in <- quos_in[quo_i] quo_expr <- quo_get_expr(quo_in[[1]]) + quo_env <- quo_get_env(quo_in[[1]]) if (is_call(quo_expr, "across")) { new_quos <- list() @@ -203,7 +204,7 @@ unfold_across <- function(.data, quos_in) { } else { extracted_funcs <- call_args(funcs) func_list <- ensure_named_funcs(extracted_funcs) - new_quos <- quosures_from_func_list(func_list, source_cols) + new_quos <- quosures_from_func_list(func_list, source_cols, quo_env) } quos_out <- append(quos_out, new_quos) @@ -225,7 +226,7 @@ ensure_named_funcs <- function(funcs) { } # given a named list of functions and column names, create a list of new quosures -quosures_from_func_list <- function(func_list, cols) { +quosures_from_func_list <- function(func_list, cols, quo_env) { func_list_full <- rep(func_list, length(cols)) cols_list_full <- rep(cols, each = length(func_list)) @@ -241,5 +242,6 @@ quosures_from_func_list <- function(func_list, cols) { ~ quo(!!call2(.x, sym(.y))) ) - set_names(new_quo_list, new_quo_names) + quosures <- set_names(new_quo_list, new_quo_names) + map(quosures, ~quo_set_env(.x, quo_env)) } From 84748512e26025e032d5be27978ffc58c35fabc7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 15 Aug 2022 15:01:55 +0100 Subject: [PATCH 50/77] Add explanation of failing test --- r/tests/testthat/test-dplyr-mutate.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index ec43cac1284..8d9c7185053 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -689,6 +689,7 @@ test_that("Can use across() within mutate()", { tbl ) + # ARROW-12778 - `where()` is not yet supported expect_error( compare_dplyr_binding( .input %>% From 21cb4be0e877efee75d68fdb4f26f1200debee4c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 15 Aug 2022 15:43:49 +0100 Subject: [PATCH 51/77] Add a test containing more expressions before and after across(), and one which overwrite output of one of the inputs to across --- r/tests/testthat/test-dplyr-mutate.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 8d9c7185053..a2d3420fbaf 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -597,6 +597,18 @@ test_that("Can use across() within mutate()", { tbl ) + compare_dplyr_binding( + .input %>% + mutate( + dbl2 = dbl * 2, + across(c(dbl, dbl2), round), + int2 = int * 2, + dbl = dbl + 3 + ) %>% + collect(), + tbl + ) + compare_dplyr_binding( .input %>% mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>% From aa052e46525323a3883186d3535fe11f27f33eb3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 22 Aug 2022 10:51:12 +0100 Subject: [PATCH 52/77] Rename unfold_across -> expand_across --- r/R/dplyr-mutate.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 5620cfefe99..d6f5532ec89 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -25,7 +25,7 @@ mutate.arrow_dplyr_query <- function(.data, .after = NULL) { call <- match.call() - expression_list <- unfold_across(.data, quos(...)) + expression_list <- expand_across(.data, quos(...)) exprs <- ensure_named_exprs(expression_list) .keep <- match.arg(.keep) @@ -156,7 +156,7 @@ ensure_named_exprs <- function(exprs) { # Take the input quos and unfold any instances of across() # into individual quosures -unfold_across <- function(.data, quos_in) { +expand_across <- function(.data, quos_in) { quos_out <- list() # Check for any expressions starting with across for (quo_i in seq_along(quos_in)) { From c76cd39ff2fba0b07bd7019e1c7276412758daaa Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 22 Aug 2022 10:57:09 +0100 Subject: [PATCH 53/77] Update match.call with extra cols --- r/NAMESPACE | 2 +- r/R/arrow-package.R | 2 +- r/R/dplyr-mutate.R | 10 +++++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 9816e11d69b..1118b46d99d 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -414,7 +414,6 @@ importFrom(rlang,as_label) importFrom(rlang,as_quosure) importFrom(rlang,call2) importFrom(rlang,call_args) -importFrom(rlang,call_match) importFrom(rlang,caller_env) importFrom(rlang,dots_n) importFrom(rlang,enexpr) @@ -443,6 +442,7 @@ importFrom(rlang,quo_get_env) importFrom(rlang,quo_get_expr) importFrom(rlang,quo_is_null) importFrom(rlang,quo_name) +importFrom(rlang,quo_set_env) importFrom(rlang,quo_set_expr) importFrom(rlang,quos) importFrom(rlang,seq2) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index d9fe65ce5c7..7a81c7e24f1 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -24,7 +24,7 @@ #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args -#' @importFrom rlang call_match quo_set_env quo_get_env +#' @importFrom rlang quo_set_env quo_get_env #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index d6f5532ec89..206be9685c3 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -160,13 +160,21 @@ expand_across <- function(.data, quos_in) { quos_out <- list() # Check for any expressions starting with across for (quo_i in seq_along(quos_in)) { + # do it like this to preserve naming quo_in <- quos_in[quo_i] quo_expr <- quo_get_expr(quo_in[[1]]) quo_env <- quo_get_env(quo_in[[1]]) if (is_call(quo_expr, "across")) { new_quos <- list() - across_call <- call_match(quo_expr, dplyr::across, defaults = TRUE) + + across_call <- match.call( + definition = dplyr::across, + call = quo_expr, + expand.dots = FALSE, + envir = quo_env + ) + if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) { abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") From 6ffb9a12736bfeca549d3e0a968e6129a4ce8f2f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 22 Aug 2022 12:31:12 +0100 Subject: [PATCH 54/77] Add failing test for unnamed cols --- r/tests/testthat/test-dplyr-mutate.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index a2d3420fbaf..c863f04ff6e 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -645,6 +645,14 @@ test_that("Can use across() within mutate()", { tbl ) + # across() with no columns named + compare_dplyr_binding( + .input %>% + mutate(across(.fns = round)) %>% + collect(), + tbl + ) + # ARROW-17364: .names argument not yet supported for across() expect_error( tbl %>% From b78bce876eb2296cd73d8c6759d7efb2ec3449bc Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 22 Aug 2022 12:40:18 +0100 Subject: [PATCH 55/77] Add tests and functionality for if .cols argument not supplied --- r/R/dplyr-mutate.R | 11 +++++++++-- r/tests/testthat/test-dplyr-mutate.R | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 206be9685c3..610626f3313 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -175,7 +175,6 @@ expand_across <- function(.data, quos_in) { envir = quo_env ) - if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) { abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") } @@ -185,8 +184,16 @@ expand_across <- function(.data, quos_in) { abort("`.names` argument to `across()` not yet supported in Arrow") } + if (!is.null(across_call[[".cols"]])) { + cols <- across_call[[".cols"]] + } else { + cols <- quote(everything()) + } + + cols <- as_quosure(cols, quo_env) + # use select() to get the columns so we can take advantage of tidyselect - source_cols <- names(dplyr::select(.data, !!across_call[[".cols"]])) + source_cols <- names(dplyr::select(.data, !!cols)) funcs <- across_call[[".fns"]] # calling across() with .fns = NULL returns all columns unchanged diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index c863f04ff6e..2582148d1fd 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -648,6 +648,7 @@ test_that("Can use across() within mutate()", { # across() with no columns named compare_dplyr_binding( .input %>% + select(int, dbl, dbl2) %>% mutate(across(.fns = round)) %>% collect(), tbl From 1b45ea226559ea85def682b573098da191ea7c94 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 22 Aug 2022 23:54:19 +0100 Subject: [PATCH 56/77] Uncomment names test --- r/R/dplyr-mutate.R | 132 +++++++++++++++++++-------- r/tests/testthat/test-dplyr-mutate.R | 29 ++++-- 2 files changed, 119 insertions(+), 42 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 610626f3313..3b164d343ea 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -179,11 +179,6 @@ expand_across <- function(.data, quos_in) { abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") } - # ARROW-17364: add support for .names argument - if (!is.null(across_call[[".names"]])) { - abort("`.names` argument to `across()` not yet supported in Arrow") - } - if (!is.null(across_call[[".cols"]])) { cols <- across_call[[".cols"]] } else { @@ -192,16 +187,22 @@ expand_across <- function(.data, quos_in) { cols <- as_quosure(cols, quo_env) - # use select() to get the columns so we can take advantage of tidyselect - source_cols <- names(dplyr::select(.data, !!cols)) - funcs <- across_call[[".fns"]] + setup <- across_setup( + !!cols, + fns = across_call[[".fns"]], + names = across_call[[".names"]], + .caller_env = quo_env, + mask = .data, + inline = TRUE + ) # calling across() with .fns = NULL returns all columns unchanged - if (is_empty(funcs)) { + if (is_empty(setup$fns)) { + # this needs to be updated to match dplyr's version return() } - if (!is_list(funcs) && as.character(funcs)[[1]] == "~") { + if (!is_list(setup$fns) && as.character(setup$fns)[[1]] == "~") { abort( paste( "purrr-style lambda functions as `.fns` argument to `across()`", @@ -211,15 +212,13 @@ expand_across <- function(.data, quos_in) { } # if only 1 function, we overwrite the old columns with the new values - if (length(funcs) == 1 && is.name(funcs)) { + if (length(setup$fns) == 0 && is.name(setup$fns)) { # work out the quosures from the call - col_syms <- syms(source_cols) - new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x))) - new_quos <- set_names(new_quos, source_cols) + col_syms <- syms(setup$vars) + new_quos <- map(col_syms, ~ quo(!!call2(setup$fns, .x))) + new_quos <- set_names(new_quos, setup$names) } else { - extracted_funcs <- call_args(funcs) - func_list <- ensure_named_funcs(extracted_funcs) - new_quos <- quosures_from_func_list(func_list, source_cols, quo_env) + new_quos <- quosures_from_func_list(setup, quo_env) } quos_out <- append(quos_out, new_quos) @@ -231,25 +230,10 @@ expand_across <- function(.data, quos_in) { quos_out } -# if the function is unnamed (an empty character), use the index instead -ensure_named_funcs <- function(funcs) { - func_list <- as.list(funcs) - func_names <- names(funcs) %||% rep("", length(funcs)) - func_indices <- seq_along(funcs) - names(func_list) <- map2_chr(func_names, func_indices, max) - func_list -} - # given a named list of functions and column names, create a list of new quosures -quosures_from_func_list <- function(func_list, cols, quo_env) { - func_list_full <- rep(func_list, length(cols)) - cols_list_full <- rep(cols, each = length(func_list)) - - # get names of new quosures - new_quo_names <- map2_chr( - names(func_list_full), cols_list_full, - ~ paste(.y, .x, sep = "_") - ) +quosures_from_func_list <- function(setup, quo_env) { + func_list_full <- rep(setup$fns, length(setup$vars)) + cols_list_full <- rep(setup$vars, each = length(setup$fns)) # get new quosures new_quo_list <- map2( @@ -257,6 +241,82 @@ quosures_from_func_list <- function(func_list, cols, quo_env) { ~ quo(!!call2(.x, sym(.y))) ) - quosures <- set_names(new_quo_list, new_quo_names) + quosures <- set_names(new_quo_list, setup$names) map(quosures, ~quo_set_env(.x, quo_env)) } + +across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE){ + cols <- enquo(cols) + + if (is.null(fns) && quo_is_call(cols, "~")) { + bullets <- c( + "Must supply a column selection.", + i = glue("You most likely meant: `{across_if_fn}(everything(), {as_label(cols)})`."), + i = "The first argument `.cols` selects a set of columns.", + i = "The second argument `.fns` operates on each selected columns." + ) + abort(bullets, call = call(across_if_fn)) + } + vars <- names(dplyr::select(mask, !!cols)) + + # need to work out what this block does + # if (is.null(fns)) { + # if (!is.null(names)) { + # glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") + # names <- fix_call( + # vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique"), + # call = call(across_if_fn) + # ) + # } else { + # names <- names_vars + # } + # + # value <- list(vars = vars, fns = fns, names = names) + # return(value) + # } + + # apply `.names` smart default + if (is.function(fns) || is_formula(fns) || is.name(fns)) { + names <- names %||% "{.col}" + fns <- list("1" = fns) + } else { + names <- names %||% "{.col}_{.fn}" + fns <- call_args(fns) + } + + if (!is.list(fns)) { + msg <- c("`.fns` must be NULL, a function, a formula, or a list of functions/formulas.") + abort(msg, call = call(across_if_fn)) + } + + # make sure fns has names, use number to replace unnamed + if (is.null(names(fns))) { + names_fns <- seq_along(fns) + } else { + names_fns <- names(fns) + empties <- which(names_fns == "") + if (length(empties)) { + names_fns[empties] <- empties + } + } + + glue_mask <- across_glue_mask(.caller_env, + .col = rep(vars, each = length(fns)), + .fn = rep(names_fns , length(vars)) + ) + names <- vctrs::vec_as_names(glue::glue(names, .envir = glue_mask), repair = "check_unique") + + if (!inline) { + fns <- map(fns, as_function) + } + + list(vars = vars, fns = fns, names = names) +} + +across_glue_mask <- function(.col, .fn, .caller_env) { + glue_mask <- env(.caller_env, .col = .col, .fn = .fn) + env_bind_active( + glue_mask, col = function() glue_mask$.col, fn = function() glue_mask$.fn + ) + glue_mask +} diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 2582148d1fd..ddbaf71c5e3 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -654,14 +654,31 @@ test_that("Can use across() within mutate()", { tbl ) - # ARROW-17364: .names argument not yet supported for across() - expect_error( - tbl %>% - arrow_table() %>% + + + # dynamic variable name + int = c("dbl", "dbl2") + compare_dplyr_binding( + .input %>% + select(int, dbl, dbl2) %>% + mutate(across(all_of(int), sqrt)) %>% + collect(), + tbl + ) + + # .names argument + compare_dplyr_binding( + .input %>% mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% collect(), - regexp = "`.names` argument to `across()` not yet supported in Arrow", - fixed = TRUE + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% + collect(), + tbl ) # ellipses (...) are a deprecated argument From d3a13c04f260bcc87dc076d75d9438d522b638fe Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 23 Aug 2022 08:01:32 +0100 Subject: [PATCH 57/77] Handle lambda functions and null list of functions --- r/R/dplyr-mutate.R | 34 ++++++++++++++++------------ r/tests/testthat/test-dplyr-mutate.R | 13 +++++++++-- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 3b164d343ea..cb3ca13ef92 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -260,20 +260,17 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE){ vars <- names(dplyr::select(mask, !!cols)) # need to work out what this block does - # if (is.null(fns)) { - # if (!is.null(names)) { - # glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") - # names <- fix_call( - # vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique"), - # call = call(across_if_fn) - # ) - # } else { - # names <- names_vars - # } - # - # value <- list(vars = vars, fns = fns, names = names) - # return(value) - # } + if (is.null(fns)) { + if (!is.null(names)) { + glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") + names <- vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique") + } else { + names <- vars + } + + value <- list(vars = vars, fns = fns, names = names) + return(value) + } # apply `.names` smart default if (is.function(fns) || is_formula(fns) || is.name(fns)) { @@ -284,6 +281,15 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE){ fns <- call_args(fns) } + if (any(map_lgl(fns, is_formula))) { + abort( + paste( + "purrr-style lambda functions as `.fns` argument to `across()`", + "not yet supported in Arrow" + ) + ) + } + if (!is.list(fns)) { msg <- c("`.fns` must be NULL, a function, a formula, or a list of functions/formulas.") abort(msg, call = call(across_if_fn)) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index ddbaf71c5e3..8b0f52d7a4e 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -654,8 +654,6 @@ test_that("Can use across() within mutate()", { tbl ) - - # dynamic variable name int = c("dbl", "dbl2") compare_dplyr_binding( @@ -719,6 +717,17 @@ test_that("Can use across() within mutate()", { fixed = TRUE ) + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, list(~ round(.x, digits = -1), ~sqrt(.x)))) %>% + collect(), + tbl + ), + regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", + fixed = TRUE + ) + # .fns = NULL, the default compare_dplyr_binding( .input %>% From 9f49d2fb304d7aee95a1fff39f7e7c442876dc0b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 23 Aug 2022 08:14:50 +0100 Subject: [PATCH 58/77] Move across tests to a separate file --- r/tests/testthat/test-dplyr-across.R | 171 ++++++++++++++++++++++++++ r/tests/testthat/test-dplyr-mutate.R | 172 --------------------------- 2 files changed, 171 insertions(+), 172 deletions(-) create mode 100644 r/tests/testthat/test-dplyr-across.R diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R new file mode 100644 index 00000000000..45eef676220 --- /dev/null +++ b/r/tests/testthat/test-dplyr-across.R @@ -0,0 +1,171 @@ +test_that("Can use across() within mutate()", { + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), round)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + dbl2 = dbl * 2, + across(c(dbl, dbl2), round), + int2 = int * 2, + dbl = dbl + 3 + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), list("fun1" = round, "fun2" = sqrt))) %>% + collect(), + tbl + ) + + # this is valid is neither R nor Arrow + expect_error( + expect_warning( + compare_dplyr_binding( + .input %>% + arrow_table() %>% + mutate(across(c(dbl, dbl2), list("fun1" = round(sqrt(dbl))))) %>% + collect(), + tbl, + warning = TRUE + ) + ) + ) + + # across() arguments not in default order + compare_dplyr_binding( + .input %>% + mutate(across(.fns = round, c(dbl, dbl2))) %>% + collect(), + tbl + ) + + # across() with no columns named + compare_dplyr_binding( + .input %>% + select(int, dbl, dbl2) %>% + mutate(across(.fns = round)) %>% + collect(), + tbl + ) + + # dynamic variable name + int = c("dbl", "dbl2") + compare_dplyr_binding( + .input %>% + select(int, dbl, dbl2) %>% + mutate(across(all_of(int), sqrt)) %>% + collect(), + tbl + ) + + # .names argument + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% + collect(), + tbl + ) + + # ellipses (...) are a deprecated argument + expect_error( + tbl %>% + arrow_table() %>% + mutate(across(c(dbl, dbl2), round, digits = -1)) %>% + collect(), + regexp = "`...` argument to `across()` is deprecated in dplyr and not supported in Arrow", + fixed = TRUE + ) + + # alternative ways of specifying .fns - as a list + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, list(round))) %>% + collect(), + tbl + ) + + # supply .fns as a one-item vector + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, c(round))) %>% + collect(), + tbl + ) + + # ARROW-17366: purrr-style lambda functions not yet supported + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, ~ round(.x, digits = -1))) %>% + collect(), + tbl + ), + regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", + fixed = TRUE + ) + + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, list(~ round(.x, digits = -1), ~sqrt(.x)))) %>% + collect(), + tbl + ), + regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", + fixed = TRUE + ) + + # .fns = NULL, the default + compare_dplyr_binding( + .input %>% + mutate(across(1:dbl2, NULL)) %>% + collect(), + tbl + ) + + # ARROW-12778 - `where()` is not yet supported + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(where(is.double))) %>% + collect(), + tbl + ), + "Unsupported selection helper" + ) + + # gives the right error with window functions + expect_warning( + arrow_table(tbl) %>% + mutate( + x = int + 2, + across(c("int", "dbl"), list(mean = mean, sd = sd, round)), + exp(dbl2) + ) %>% + collect(), + "window functions not currently supported in Arrow; pulling data into R", + fixed = TRUE + ) +}) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 8b0f52d7a4e..5baeb8f3a80 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -588,175 +588,3 @@ test_that("mutate() and transmute() with namespaced functions", { tbl ) }) - -test_that("Can use across() within mutate()", { - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), round)) %>% - collect(), - tbl - ) - - compare_dplyr_binding( - .input %>% - mutate( - dbl2 = dbl * 2, - across(c(dbl, dbl2), round), - int2 = int * 2, - dbl = dbl + 3 - ) %>% - collect(), - tbl - ) - - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>% - collect(), - tbl - ) - - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), list("fun1" = round, "fun2" = sqrt))) %>% - collect(), - tbl - ) - - # this is valid is neither R nor Arrow - expect_error( - expect_warning( - compare_dplyr_binding( - .input %>% - arrow_table() %>% - mutate(across(c(dbl, dbl2), list("fun1" = round(sqrt(dbl))))) %>% - collect(), - tbl, - warning = TRUE - ) - ) - ) - - # across() arguments not in default order - compare_dplyr_binding( - .input %>% - mutate(across(.fns = round, c(dbl, dbl2))) %>% - collect(), - tbl - ) - - # across() with no columns named - compare_dplyr_binding( - .input %>% - select(int, dbl, dbl2) %>% - mutate(across(.fns = round)) %>% - collect(), - tbl - ) - - # dynamic variable name - int = c("dbl", "dbl2") - compare_dplyr_binding( - .input %>% - select(int, dbl, dbl2) %>% - mutate(across(all_of(int), sqrt)) %>% - collect(), - tbl - ) - - # .names argument - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% - collect(), - tbl - ) - - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% - collect(), - tbl - ) - - # ellipses (...) are a deprecated argument - expect_error( - tbl %>% - arrow_table() %>% - mutate(across(c(dbl, dbl2), round, digits = -1)) %>% - collect(), - regexp = "`...` argument to `across()` is deprecated in dplyr and not supported in Arrow", - fixed = TRUE - ) - - # alternative ways of specifying .fns - as a list - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, list(round))) %>% - collect(), - tbl - ) - - # supply .fns as a one-item vector - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, c(round))) %>% - collect(), - tbl - ) - - # ARROW-17366: purrr-style lambda functions not yet supported - expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, ~ round(.x, digits = -1))) %>% - collect(), - tbl - ), - regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", - fixed = TRUE - ) - - expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, list(~ round(.x, digits = -1), ~sqrt(.x)))) %>% - collect(), - tbl - ), - regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", - fixed = TRUE - ) - - # .fns = NULL, the default - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, NULL)) %>% - collect(), - tbl - ) - - # ARROW-12778 - `where()` is not yet supported - expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(where(is.double))) %>% - collect(), - tbl - ), - "Unsupported selection helper" - ) - - # gives the right error with window functions - expect_warning( - arrow_table(tbl) %>% - mutate( - x = int + 2, - across(c("int", "dbl"), list(mean = mean, sd = sd, round)), - exp(dbl2) - ) %>% - collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE - ) -}) From 37d5f0fe8975d88c8f49f8db077c7a8098560486 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 23 Aug 2022 08:34:52 +0100 Subject: [PATCH 59/77] Move across to own file --- r/DESCRIPTION | 1 + r/NAMESPACE | 3 + r/R/arrow-package.R | 2 +- r/R/dplyr-across.R | 179 ++++++++++++++++++++++++++++++++++++++++++++ r/R/dplyr-mutate.R | 173 ------------------------------------------ 5 files changed, 184 insertions(+), 174 deletions(-) create mode 100644 r/R/dplyr-across.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 95c14058698..b26c3f1e848 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -91,6 +91,7 @@ Collate: 'dataset-scan.R' 'dataset-write.R' 'dictionary.R' + 'dplyr-across.R' 'dplyr-arrange.R' 'dplyr-collect.R' 'dplyr-count.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 1118b46d99d..1f547e5f672 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -422,6 +422,7 @@ importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,env) importFrom(rlang,env_bind) +importFrom(rlang,env_bind_active) importFrom(rlang,eval_tidy) importFrom(rlang,exec) importFrom(rlang,expr) @@ -430,6 +431,7 @@ importFrom(rlang,is_call) importFrom(rlang,is_character) importFrom(rlang,is_empty) importFrom(rlang,is_false) +importFrom(rlang,is_formula) importFrom(rlang,is_integerish) importFrom(rlang,is_interactive) importFrom(rlang,is_list) @@ -440,6 +442,7 @@ importFrom(rlang,new_data_mask) importFrom(rlang,new_environment) importFrom(rlang,quo_get_env) importFrom(rlang,quo_get_expr) +importFrom(rlang,quo_is_call) importFrom(rlang,quo_is_null) importFrom(rlang,quo_name) importFrom(rlang,quo_set_env) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 7a81c7e24f1..7a2a3f64f5c 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -24,7 +24,7 @@ #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args -#' @importFrom rlang quo_set_env quo_get_env +#' @importFrom rlang quo_set_env quo_get_env is_formula env_bind_active quo_is_call #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R new file mode 100644 index 00000000000..5c8f7c93f14 --- /dev/null +++ b/r/R/dplyr-across.R @@ -0,0 +1,179 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +expand_across <- function(.data, quos_in) { + quos_out <- list() + # retrieve items using their values to preserve naming of quos other than across + for (quo_i in seq_along(quos_in)) { + quo_in <- quos_in[quo_i] + quo_expr <- quo_get_expr(quo_in[[1]]) + quo_env <- quo_get_env(quo_in[[1]]) + + if (is_call(quo_expr, "across")) { + new_quos <- list() + + across_call <- match.call( + definition = dplyr::across, + call = quo_expr, + expand.dots = FALSE, + envir = quo_env + ) + + if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) { + abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") + } + + if (!is.null(across_call[[".cols"]])) { + cols <- across_call[[".cols"]] + } else { + cols <- quote(everything()) + } + + cols <- as_quosure(cols, quo_env) + + setup <- across_setup( + !!cols, + fns = across_call[[".fns"]], + names = across_call[[".names"]], + .caller_env = quo_env, + mask = .data, + inline = TRUE + ) + + # calling across() with .fns = NULL returns all columns unchanged + if (is_empty(setup$fns)) { + # this needs to be updated to match dplyr's version + return() + } + + if (!is_list(setup$fns) && as.character(setup$fns)[[1]] == "~") { + abort( + paste( + "purrr-style lambda functions as `.fns` argument to `across()`", + "not yet supported in Arrow" + ) + ) + } + + new_quos <- quosures_from_func_list(setup, quo_env) + + quos_out <- append(quos_out, new_quos) + } else { + quos_out <- append(quos_out, quo_in) + } + } + + quos_out +} + +# given a named list of functions and column names, create a list of new quosures +quosures_from_func_list <- function(setup, quo_env) { + func_list_full <- rep(setup$fns, length(setup$vars)) + cols_list_full <- rep(setup$vars, each = length(setup$fns)) + + # get new quosures + new_quo_list <- map2( + func_list_full, cols_list_full, + ~ quo(!!call2(.x, sym(.y))) + ) + + quosures <- set_names(new_quo_list, setup$names) + map(quosures, ~ quo_set_env(.x, quo_env)) +} + +across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { + cols <- enquo(cols) + + if (is.null(fns) && quo_is_call(cols, "~")) { + bullets <- c( + "Must supply a column selection.", + i = glue("You most likely meant: `{across_if_fn}(everything(), {as_label(cols)})`."), + i = "The first argument `.cols` selects a set of columns.", + i = "The second argument `.fns` operates on each selected columns." + ) + abort(bullets, call = call(across_if_fn)) + } + vars <- names(dplyr::select(mask, !!cols)) + + # need to work out what this block does + if (is.null(fns)) { + if (!is.null(names)) { + glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") + names <- vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique") + } else { + names <- vars + } + + value <- list(vars = vars, fns = fns, names = names) + return(value) + } + + # apply `.names` smart default + if (is.function(fns) || is_formula(fns) || is.name(fns)) { + names <- names %||% "{.col}" + fns <- list("1" = fns) + } else { + names <- names %||% "{.col}_{.fn}" + fns <- call_args(fns) + } + + if (any(map_lgl(fns, is_formula))) { + abort( + paste( + "purrr-style lambda functions as `.fns` argument to `across()`", + "not yet supported in Arrow" + ) + ) + } + + if (!is.list(fns)) { + msg <- c("`.fns` must be NULL, a function, a formula, or a list of functions/formulas.") + abort(msg, call = call(across_if_fn)) + } + + # make sure fns has names, use number to replace unnamed + if (is.null(names(fns))) { + names_fns <- seq_along(fns) + } else { + names_fns <- names(fns) + empties <- which(names_fns == "") + if (length(empties)) { + names_fns[empties] <- empties + } + } + + glue_mask <- across_glue_mask(.caller_env, + .col = rep(vars, each = length(fns)), + .fn = rep(names_fns, length(vars)) + ) + names <- vctrs::vec_as_names(glue::glue(names, .envir = glue_mask), repair = "check_unique") + + if (!inline) { + fns <- map(fns, as_function) + } + + list(vars = vars, fns = fns, names = names) +} + +across_glue_mask <- function(.col, .fn, .caller_env) { + glue_mask <- env(.caller_env, .col = .col, .fn = .fn) + env_bind_active( + glue_mask, + col = function() glue_mask$.col, fn = function() glue_mask$.fn + ) + glue_mask +} diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index cb3ca13ef92..ac555fafe0b 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -153,176 +153,3 @@ ensure_named_exprs <- function(exprs) { names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr) exprs } - -# Take the input quos and unfold any instances of across() -# into individual quosures -expand_across <- function(.data, quos_in) { - quos_out <- list() - # Check for any expressions starting with across - for (quo_i in seq_along(quos_in)) { - # do it like this to preserve naming - quo_in <- quos_in[quo_i] - quo_expr <- quo_get_expr(quo_in[[1]]) - quo_env <- quo_get_env(quo_in[[1]]) - - if (is_call(quo_expr, "across")) { - new_quos <- list() - - across_call <- match.call( - definition = dplyr::across, - call = quo_expr, - expand.dots = FALSE, - envir = quo_env - ) - - if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) { - abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow") - } - - if (!is.null(across_call[[".cols"]])) { - cols <- across_call[[".cols"]] - } else { - cols <- quote(everything()) - } - - cols <- as_quosure(cols, quo_env) - - setup <- across_setup( - !!cols, - fns = across_call[[".fns"]], - names = across_call[[".names"]], - .caller_env = quo_env, - mask = .data, - inline = TRUE - ) - - # calling across() with .fns = NULL returns all columns unchanged - if (is_empty(setup$fns)) { - # this needs to be updated to match dplyr's version - return() - } - - if (!is_list(setup$fns) && as.character(setup$fns)[[1]] == "~") { - abort( - paste( - "purrr-style lambda functions as `.fns` argument to `across()`", - "not yet supported in Arrow" - ) - ) - } - - # if only 1 function, we overwrite the old columns with the new values - if (length(setup$fns) == 0 && is.name(setup$fns)) { - # work out the quosures from the call - col_syms <- syms(setup$vars) - new_quos <- map(col_syms, ~ quo(!!call2(setup$fns, .x))) - new_quos <- set_names(new_quos, setup$names) - } else { - new_quos <- quosures_from_func_list(setup, quo_env) - } - - quos_out <- append(quos_out, new_quos) - } else { - quos_out <- append(quos_out, quo_in) - } - } - - quos_out -} - -# given a named list of functions and column names, create a list of new quosures -quosures_from_func_list <- function(setup, quo_env) { - func_list_full <- rep(setup$fns, length(setup$vars)) - cols_list_full <- rep(setup$vars, each = length(setup$fns)) - - # get new quosures - new_quo_list <- map2( - func_list_full, cols_list_full, - ~ quo(!!call2(.x, sym(.y))) - ) - - quosures <- set_names(new_quo_list, setup$names) - map(quosures, ~quo_set_env(.x, quo_env)) -} - -across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE){ - cols <- enquo(cols) - - if (is.null(fns) && quo_is_call(cols, "~")) { - bullets <- c( - "Must supply a column selection.", - i = glue("You most likely meant: `{across_if_fn}(everything(), {as_label(cols)})`."), - i = "The first argument `.cols` selects a set of columns.", - i = "The second argument `.fns` operates on each selected columns." - ) - abort(bullets, call = call(across_if_fn)) - } - vars <- names(dplyr::select(mask, !!cols)) - - # need to work out what this block does - if (is.null(fns)) { - if (!is.null(names)) { - glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") - names <- vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique") - } else { - names <- vars - } - - value <- list(vars = vars, fns = fns, names = names) - return(value) - } - - # apply `.names` smart default - if (is.function(fns) || is_formula(fns) || is.name(fns)) { - names <- names %||% "{.col}" - fns <- list("1" = fns) - } else { - names <- names %||% "{.col}_{.fn}" - fns <- call_args(fns) - } - - if (any(map_lgl(fns, is_formula))) { - abort( - paste( - "purrr-style lambda functions as `.fns` argument to `across()`", - "not yet supported in Arrow" - ) - ) - } - - if (!is.list(fns)) { - msg <- c("`.fns` must be NULL, a function, a formula, or a list of functions/formulas.") - abort(msg, call = call(across_if_fn)) - } - - # make sure fns has names, use number to replace unnamed - if (is.null(names(fns))) { - names_fns <- seq_along(fns) - } else { - names_fns <- names(fns) - empties <- which(names_fns == "") - if (length(empties)) { - names_fns[empties] <- empties - } - } - - glue_mask <- across_glue_mask(.caller_env, - .col = rep(vars, each = length(fns)), - .fn = rep(names_fns , length(vars)) - ) - names <- vctrs::vec_as_names(glue::glue(names, .envir = glue_mask), repair = "check_unique") - - if (!inline) { - fns <- map(fns, as_function) - } - - list(vars = vars, fns = fns, names = names) -} - -across_glue_mask <- function(.col, .fn, .caller_env) { - glue_mask <- env(.caller_env, .col = .col, .fn = .fn) - env_bind_active( - glue_mask, col = function() glue_mask$.col, fn = function() glue_mask$.fn - ) - glue_mask -} From 9abddac5dced23443e0d7e41b8a6c4265789dc48 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 23 Aug 2022 08:40:33 +0100 Subject: [PATCH 60/77] Remove redundant line --- r/R/dplyr-across.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index 5c8f7c93f14..f0371f1c5a8 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -43,10 +43,8 @@ expand_across <- function(.data, quos_in) { cols <- quote(everything()) } - cols <- as_quosure(cols, quo_env) - setup <- across_setup( - !!cols, + cols = !!as_quosure(cols, quo_env), fns = across_call[[".fns"]], names = across_call[[".names"]], .caller_env = quo_env, @@ -109,7 +107,6 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { } vars <- names(dplyr::select(mask, !!cols)) - # need to work out what this block does if (is.null(fns)) { if (!is.null(names)) { glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") @@ -117,7 +114,6 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { } else { names <- vars } - value <- list(vars = vars, fns = fns, names = names) return(value) } From 59e38d041f58c0536c6c64ffebcb3b12d0a138cf Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 23 Aug 2022 08:56:14 +0100 Subject: [PATCH 61/77] Add another test of names --- r/tests/testthat/test-dplyr-across.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index 45eef676220..520904f823d 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -83,7 +83,7 @@ test_that("Can use across() within mutate()", { compare_dplyr_binding( .input %>% - mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% + mutate(across(c(dbl, dbl2), round, .names = "round_{.col}")) %>% collect(), tbl ) From e8b065f3b9996d948c4af3cd6615759b1f6021fc Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 24 Aug 2022 09:56:37 +0100 Subject: [PATCH 62/77] Ru styler --- r/tests/testthat/test-dplyr-across.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index 520904f823d..a7ed49f58bf 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -13,7 +13,7 @@ test_that("Can use across() within mutate()", { across(c(dbl, dbl2), round), int2 = int * 2, dbl = dbl + 3 - ) %>% + ) %>% collect(), tbl ) @@ -64,7 +64,7 @@ test_that("Can use across() within mutate()", { ) # dynamic variable name - int = c("dbl", "dbl2") + int <- c("dbl", "dbl2") compare_dplyr_binding( .input %>% select(int, dbl, dbl2) %>% @@ -75,7 +75,7 @@ test_that("Can use across() within mutate()", { # .names argument compare_dplyr_binding( - .input %>% + .input %>% mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% collect(), tbl @@ -129,7 +129,7 @@ test_that("Can use across() within mutate()", { expect_error( compare_dplyr_binding( .input %>% - mutate(across(1:dbl2, list(~ round(.x, digits = -1), ~sqrt(.x)))) %>% + mutate(across(1:dbl2, list(~ round(.x, digits = -1), ~ sqrt(.x)))) %>% collect(), tbl ), From d3ddb9b3d44a4fc4aeb9cf01978c44a800b4f608 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 24 Aug 2022 17:31:29 +0100 Subject: [PATCH 63/77] Add Apache header and swap tbl for example_data --- r/tests/testthat/test-dplyr-across.R | 53 ++++++++++++++++++---------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index a7ed49f58bf..db0f7760913 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -1,9 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + test_that("Can use across() within mutate()", { compare_dplyr_binding( .input %>% mutate(across(c(dbl, dbl2), round)) %>% collect(), - tbl + example_data ) compare_dplyr_binding( @@ -15,21 +32,21 @@ test_that("Can use across() within mutate()", { dbl = dbl + 3 ) %>% collect(), - tbl + example_data ) compare_dplyr_binding( .input %>% mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>% collect(), - tbl + example_data ) compare_dplyr_binding( .input %>% mutate(across(c(dbl, dbl2), list("fun1" = round, "fun2" = sqrt))) %>% collect(), - tbl + example_data ) # this is valid is neither R nor Arrow @@ -40,7 +57,7 @@ test_that("Can use across() within mutate()", { arrow_table() %>% mutate(across(c(dbl, dbl2), list("fun1" = round(sqrt(dbl))))) %>% collect(), - tbl, + example_data, warning = TRUE ) ) @@ -51,7 +68,7 @@ test_that("Can use across() within mutate()", { .input %>% mutate(across(.fns = round, c(dbl, dbl2))) %>% collect(), - tbl + example_data ) # across() with no columns named @@ -60,7 +77,7 @@ test_that("Can use across() within mutate()", { select(int, dbl, dbl2) %>% mutate(across(.fns = round)) %>% collect(), - tbl + example_data ) # dynamic variable name @@ -70,7 +87,7 @@ test_that("Can use across() within mutate()", { select(int, dbl, dbl2) %>% mutate(across(all_of(int), sqrt)) %>% collect(), - tbl + example_data ) # .names argument @@ -78,19 +95,19 @@ test_that("Can use across() within mutate()", { .input %>% mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% collect(), - tbl + example_data ) compare_dplyr_binding( .input %>% mutate(across(c(dbl, dbl2), round, .names = "round_{.col}")) %>% collect(), - tbl + example_data ) # ellipses (...) are a deprecated argument expect_error( - tbl %>% + example_data %>% arrow_table() %>% mutate(across(c(dbl, dbl2), round, digits = -1)) %>% collect(), @@ -103,7 +120,7 @@ test_that("Can use across() within mutate()", { .input %>% mutate(across(1:dbl2, list(round))) %>% collect(), - tbl + example_data ) # supply .fns as a one-item vector @@ -111,7 +128,7 @@ test_that("Can use across() within mutate()", { .input %>% mutate(across(1:dbl2, c(round))) %>% collect(), - tbl + example_data ) # ARROW-17366: purrr-style lambda functions not yet supported @@ -120,7 +137,7 @@ test_that("Can use across() within mutate()", { .input %>% mutate(across(1:dbl2, ~ round(.x, digits = -1))) %>% collect(), - tbl + example_data ), regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", fixed = TRUE @@ -131,7 +148,7 @@ test_that("Can use across() within mutate()", { .input %>% mutate(across(1:dbl2, list(~ round(.x, digits = -1), ~ sqrt(.x)))) %>% collect(), - tbl + example_data ), regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", fixed = TRUE @@ -142,7 +159,7 @@ test_that("Can use across() within mutate()", { .input %>% mutate(across(1:dbl2, NULL)) %>% collect(), - tbl + example_data ) # ARROW-12778 - `where()` is not yet supported @@ -151,14 +168,14 @@ test_that("Can use across() within mutate()", { .input %>% mutate(across(where(is.double))) %>% collect(), - tbl + example_data ), "Unsupported selection helper" ) # gives the right error with window functions expect_warning( - arrow_table(tbl) %>% + arrow_table(example_data) %>% mutate( x = int + 2, across(c("int", "dbl"), list(mean = mean, sd = sd, round)), From 7a2603e921152ee6b27df407054c142a24b2b4b5 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 24 Aug 2022 17:32:22 +0100 Subject: [PATCH 64/77] Call dplyr in tests --- r/tests/testthat/test-dplyr-across.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index db0f7760913..eafb8808363 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +library(dplyr, warn.conflicts = FALSE) + test_that("Can use across() within mutate()", { compare_dplyr_binding( .input %>% From b8f86367928e1d8eb7302551c50eaafbbf688df3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 25 Aug 2022 09:01:12 +0100 Subject: [PATCH 65/77] Namespace glue --- r/R/dplyr-across.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index f0371f1c5a8..8c4c068392c 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -99,7 +99,7 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { if (is.null(fns) && quo_is_call(cols, "~")) { bullets <- c( "Must supply a column selection.", - i = glue("You most likely meant: `{across_if_fn}(everything(), {as_label(cols)})`."), + i = glue::glue("You most likely meant: `{across_if_fn}(everything(), {as_label(cols)})`."), i = "The first argument `.cols` selects a set of columns.", i = "The second argument `.fns` operates on each selected columns." ) @@ -110,7 +110,7 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { if (is.null(fns)) { if (!is.null(names)) { glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") - names <- vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique") + names <- vctrs::vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique") } else { names <- vars } From cad3872fa5407d9fa363663204603fa1c32d0216 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 25 Aug 2022 11:16:59 +0100 Subject: [PATCH 66/77] Remove extra code referencing unsupported features --- r/R/dplyr-across.R | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index 8c4c068392c..dcf7f533803 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -96,21 +96,12 @@ quosures_from_func_list <- function(setup, quo_env) { across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { cols <- enquo(cols) - if (is.null(fns) && quo_is_call(cols, "~")) { - bullets <- c( - "Must supply a column selection.", - i = glue::glue("You most likely meant: `{across_if_fn}(everything(), {as_label(cols)})`."), - i = "The first argument `.cols` selects a set of columns.", - i = "The second argument `.fns` operates on each selected columns." - ) - abort(bullets, call = call(across_if_fn)) - } vars <- names(dplyr::select(mask, !!cols)) if (is.null(fns)) { if (!is.null(names)) { - glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") - names <- vctrs::vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique") + glue_mask <- across_glue_mask(.caller_env, .col = vars, .fn = "1") + names <- vctrs::vec_as_names(glue::glue(names, .envir = glue_mask), repair = "check_unique") } else { names <- vars } From 0b2e4591b551eb367858fc8ac7b0dd5677889e43 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 25 Aug 2022 16:49:37 +0100 Subject: [PATCH 67/77] Remove redundant ref --- r/R/dplyr-across.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index dcf7f533803..96024eb2d66 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -129,7 +129,7 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { if (!is.list(fns)) { msg <- c("`.fns` must be NULL, a function, a formula, or a list of functions/formulas.") - abort(msg, call = call(across_if_fn)) + abort(msg) } # make sure fns has names, use number to replace unnamed From 4ffeafb0f504f382f01f64e89ee22c0515dc61e0 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 25 Aug 2022 17:58:27 +0100 Subject: [PATCH 68/77] Add glue as explicit dependency --- r/DESCRIPTION | 1 + r/NAMESPACE | 1 + r/R/arrow-package.R | 1 + 3 files changed, 3 insertions(+) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index b26c3f1e848..a728be37734 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -31,6 +31,7 @@ Biarch: true Imports: assertthat, bit64 (>= 0.9-7), + glue, methods, purrr, R6, diff --git a/r/NAMESPACE b/r/NAMESPACE index 1f547e5f672..fddcf242f28 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -390,6 +390,7 @@ importFrom(assertthat,assert_that) importFrom(assertthat,is.string) importFrom(bit64,print.integer64) importFrom(bit64,str.integer64) +importFrom(glue,glue) importFrom(methods,as) importFrom(purrr,as_mapper) importFrom(purrr,flatten) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 7a2a3f64f5c..9e562edc3e7 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -26,6 +26,7 @@ #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args #' @importFrom rlang quo_set_env quo_get_env is_formula env_bind_active quo_is_call #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select +#' @importFrom glue glue #' @useDynLib arrow, .registration = TRUE #' @keywords internal "_PACKAGE" From 9926a497b78cafef9f74c10aa6c432a9b7d20245 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 30 Aug 2022 14:07:48 +0100 Subject: [PATCH 69/77] Add test for no functions --- r/R/dplyr-across.R | 7 ------- r/tests/testthat/test-dplyr-across.R | 8 ++++++++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index 96024eb2d66..bc393b39a5f 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -16,7 +16,6 @@ # under the License. expand_across <- function(.data, quos_in) { - quos_out <- list() # retrieve items using their values to preserve naming of quos other than across for (quo_i in seq_along(quos_in)) { quo_in <- quos_in[quo_i] @@ -52,12 +51,6 @@ expand_across <- function(.data, quos_in) { inline = TRUE ) - # calling across() with .fns = NULL returns all columns unchanged - if (is_empty(setup$fns)) { - # this needs to be updated to match dplyr's version - return() - } - if (!is_list(setup$fns) && as.character(setup$fns)[[1]] == "~") { abort( paste( diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index eafb8808363..f4c7938415d 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -82,6 +82,14 @@ test_that("Can use across() within mutate()", { example_data ) + # across() with no functions + compare_dplyr_binding( + .input %>% + mutate(across(starts_with("dbl"))) %>% + collect(), + example_data + ) + # dynamic variable name int <- c("dbl", "dbl2") compare_dplyr_binding( From c777cadf15765737fe5197a10a26805a498c2d8d Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 30 Aug 2022 14:09:18 +0100 Subject: [PATCH 70/77] Add back in code deleted --- r/R/dplyr-across.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index bc393b39a5f..263388bf6fe 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -51,6 +51,12 @@ expand_across <- function(.data, quos_in) { inline = TRUE ) + # calling across() with .fns = NULL returns all columns unchanged + # this is a no-op in mutate() but may need updating for other funcs + if (is_empty(setup$fns)) { + return() + } + if (!is_list(setup$fns) && as.character(setup$fns)[[1]] == "~") { abort( paste( From 12cc5ebfdb4b490c381bb2a94bacb17984ed6b6a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 30 Aug 2022 20:56:19 +0100 Subject: [PATCH 71/77] Add new helper --- r/tests/testthat/helper-expectation.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index eb2e6b02195..ba11700ab62 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -321,3 +321,7 @@ split_vector_as_list <- function(vec) { vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)] list(vec1, vec2) } + +expect_across_equal <- function(actual, expected, tbl) { + expect_identical(expand_across(tbl, actual), as.list(expected)) +} From 8912b38053a2d575f2a4b9fc5a14abbb33a9ebd4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 30 Aug 2022 20:56:45 +0100 Subject: [PATCH 72/77] Properly handle no functions supplied --- r/R/dplyr-across.R | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index 263388bf6fe..245769735a4 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -16,6 +16,7 @@ # under the License. expand_across <- function(.data, quos_in) { + quos_out <- list() # retrieve items using their values to preserve naming of quos other than across for (quo_i in seq_along(quos_in)) { quo_in <- quos_in[quo_i] @@ -51,13 +52,7 @@ expand_across <- function(.data, quos_in) { inline = TRUE ) - # calling across() with .fns = NULL returns all columns unchanged - # this is a no-op in mutate() but may need updating for other funcs - if (is_empty(setup$fns)) { - return() - } - - if (!is_list(setup$fns) && as.character(setup$fns)[[1]] == "~") { + if (!is_list(setup$fns) && !is.null(setup$fns) && as.character(setup$fns)[[1]] == "~") { abort( paste( "purrr-style lambda functions as `.fns` argument to `across()`", @@ -66,7 +61,7 @@ expand_across <- function(.data, quos_in) { ) } - new_quos <- quosures_from_func_list(setup, quo_env) + new_quos <- quosures_from_setup(setup, quo_env) quos_out <- append(quos_out, new_quos) } else { @@ -78,15 +73,23 @@ expand_across <- function(.data, quos_in) { } # given a named list of functions and column names, create a list of new quosures -quosures_from_func_list <- function(setup, quo_env) { - func_list_full <- rep(setup$fns, length(setup$vars)) - cols_list_full <- rep(setup$vars, each = length(setup$fns)) - - # get new quosures - new_quo_list <- map2( - func_list_full, cols_list_full, - ~ quo(!!call2(.x, sym(.y))) - ) +quosures_from_setup <- function(setup, quo_env) { + if (!is.null(setup$fns)) { + func_list_full <- rep(setup$fns, length(setup$vars)) + cols_list_full <- rep(setup$vars, each = length(setup$fns)) + + # get new quosures + new_quo_list <- map2( + func_list_full, cols_list_full, + ~ quo(!!call2(.x, sym(.y))) + ) + } else { + # if there's no functions, just map to variables themselves + new_quo_list <- map( + setup$vars, + ~ quo(!!sym(.x)) + ) + } quosures <- set_names(new_quo_list, setup$names) map(quosures, ~ quo_set_env(.x, quo_env)) @@ -104,6 +107,7 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { } else { names <- vars } + value <- list(vars = vars, fns = fns, names = names) return(value) } From d94cd23f4b06b53e815b76e4f27dc7d97b70ef96 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 30 Aug 2022 20:56:57 +0100 Subject: [PATCH 73/77] Separate mutate vs expand_across tests --- r/tests/testthat/test-dplyr-across.R | 218 +++++++++++---------------- r/tests/testthat/test-dplyr-mutate.R | 54 +++++++ 2 files changed, 143 insertions(+), 129 deletions(-) diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index f4c7938415d..d55b2c2ad53 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -17,182 +17,142 @@ library(dplyr, warn.conflicts = FALSE) -test_that("Can use across() within mutate()", { - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), round)) %>% - collect(), +test_that("expand_across correctly expands quosures", { + + # single unnamed function + expect_across_equal( + quos(across(c(dbl, dbl2), round)), + quos( + dbl = round(dbl), + dbl2 = round(dbl2) + ), example_data ) - compare_dplyr_binding( - .input %>% - mutate( - dbl2 = dbl * 2, - across(c(dbl, dbl2), round), - int2 = int * 2, - dbl = dbl + 3 - ) %>% - collect(), + # multiple unnamed functions + expect_across_equal( + quos(across(c(dbl, dbl2), list(exp, sqrt))), + quos( + dbl_1 = exp(dbl), + dbl_2 = sqrt(dbl), + dbl2_1 = exp(dbl2), + dbl2_2 = sqrt(dbl2) + ), example_data ) - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>% - collect(), + # multiple named arguments + expect_across_equal( + quos(across(c(dbl, dbl2), list("fun1" = round, "fun2" = sqrt))), + quos( + dbl_fun1 = round(dbl), + dbl_fun2 = sqrt(dbl), + dbl2_fun1 = round(dbl2), + dbl2_fun2 = sqrt(dbl2) + ), example_data ) - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), list("fun1" = round, "fun2" = sqrt))) %>% - collect(), + # .names argument + expect_across_equal( + quos(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")), + quos( + dbl.1 = round(dbl), + dbl2.1 = round(dbl2) + ), example_data ) - # this is valid is neither R nor Arrow - expect_error( - expect_warning( - compare_dplyr_binding( - .input %>% - arrow_table() %>% - mutate(across(c(dbl, dbl2), list("fun1" = round(sqrt(dbl))))) %>% - collect(), - example_data, - warning = TRUE - ) - ) + expect_across_equal( + quos(across(c(dbl, dbl2), round, .names = "round_{.col}")), + quos( + round_dbl = round(dbl), + round_dbl2 = round(dbl2) + ), + example_data ) # across() arguments not in default order - compare_dplyr_binding( - .input %>% - mutate(across(.fns = round, c(dbl, dbl2))) %>% - collect(), + expect_across_equal( + quos(across(.fns = round, c(dbl, dbl2))), + quos( + dbl = round(dbl), + dbl2 = round(dbl2) + ), example_data ) # across() with no columns named - compare_dplyr_binding( - .input %>% - select(int, dbl, dbl2) %>% - mutate(across(.fns = round)) %>% - collect(), - example_data + expect_across_equal( + quos(across(.fns = round)), + quos( + int = round(int), + dbl = round(dbl), + dbl2 = round(dbl2) + ), + example_data %>% select(int, dbl, dbl2) ) - # across() with no functions - compare_dplyr_binding( - .input %>% - mutate(across(starts_with("dbl"))) %>% - collect(), + # across() with no functions returns columns unchanged + expect_across_equal( + quos(across(starts_with("dbl"))), + quos( + dbl = dbl, + dbl2 = dbl2 + ), example_data ) # dynamic variable name int <- c("dbl", "dbl2") - compare_dplyr_binding( - .input %>% - select(int, dbl, dbl2) %>% - mutate(across(all_of(int), sqrt)) %>% - collect(), - example_data - ) - - # .names argument - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>% - collect(), - example_data - ) - - compare_dplyr_binding( - .input %>% - mutate(across(c(dbl, dbl2), round, .names = "round_{.col}")) %>% - collect(), + expect_across_equal( + quos(across(all_of(int), sqrt)), + quos( + dbl = sqrt(dbl), + dbl2 = sqrt(dbl2) + ), example_data ) # ellipses (...) are a deprecated argument expect_error( - example_data %>% - arrow_table() %>% - mutate(across(c(dbl, dbl2), round, digits = -1)) %>% - collect(), + expand_across( + example_data, + quos(across(c(dbl, dbl2), round, digits = -1)) + ), regexp = "`...` argument to `across()` is deprecated in dplyr and not supported in Arrow", fixed = TRUE ) # alternative ways of specifying .fns - as a list - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, list(round))) %>% - collect(), + expect_across_equal( + quos(across(1:dbl2, list(round))), + quos( + int_1 = round(int), + dbl_1 = round(dbl), + dbl2_1 = round(dbl2) + ), example_data ) # supply .fns as a one-item vector - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, c(round))) %>% - collect(), + expect_across_equal( + quos(across(1:dbl2, c(round))), + quos( + int_1 = round(int), + dbl_1 = round(dbl), + dbl2_1 = round(dbl2) + ), example_data ) # ARROW-17366: purrr-style lambda functions not yet supported expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, ~ round(.x, digits = -1))) %>% - collect(), - example_data - ), - regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", - fixed = TRUE - ) - - expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, list(~ round(.x, digits = -1), ~ sqrt(.x)))) %>% - collect(), - example_data + expand_across( + example_data, + quos(across(1:dbl2, ~ round(.x, digits = -1))) ), regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", fixed = TRUE ) - - # .fns = NULL, the default - compare_dplyr_binding( - .input %>% - mutate(across(1:dbl2, NULL)) %>% - collect(), - example_data - ) - - # ARROW-12778 - `where()` is not yet supported - expect_error( - compare_dplyr_binding( - .input %>% - mutate(across(where(is.double))) %>% - collect(), - example_data - ), - "Unsupported selection helper" - ) - - # gives the right error with window functions - expect_warning( - arrow_table(example_data) %>% - mutate( - x = int + 2, - across(c("int", "dbl"), list(mean = mean, sd = sd, round)), - exp(dbl2) - ) %>% - collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE - ) }) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 5baeb8f3a80..f1de5c70454 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -588,3 +588,57 @@ test_that("mutate() and transmute() with namespaced functions", { tbl ) }) + +test_that("Can use across() within mutate()", { + + # expressions work in the right order + compare_dplyr_binding( + .input %>% + mutate( + dbl2 = dbl * 2, + across(c(dbl, dbl2), round), + int2 = int * 2, + dbl = dbl + 3 + ) %>% + collect(), + example_data + ) + + # this is valid is neither R nor Arrow + expect_error( + expect_warning( + compare_dplyr_binding( + .input %>% + arrow_table() %>% + mutate(across(c(dbl, dbl2), list("fun1" = round(sqrt(dbl))))) %>% + collect(), + example_data, + warning = TRUE + ) + ) + ) + + # ARROW-12778 - `where()` is not yet supported + expect_error( + compare_dplyr_binding( + .input %>% + mutate(across(where(is.double))) %>% + collect(), + example_data + ), + "Unsupported selection helper" + ) + + # gives the right error with window functions + expect_warning( + arrow_table(example_data) %>% + mutate( + x = int + 2, + across(c("int", "dbl"), list(mean = mean, sd = sd, round)), + exp(dbl2) + ) %>% + collect(), + "window functions not currently supported in Arrow; pulling data into R", + fixed = TRUE + ) +}) From e3215104a711f86aa5db11abd3da4ed0553fb3c1 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 31 Aug 2022 08:52:34 +0100 Subject: [PATCH 74/77] Reorder tests and add a load more for combos of .names and .fns not yet encountered --- r/tests/testthat/test-dplyr-across.R | 95 ++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 19 deletions(-) diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index d55b2c2ad53..b39ba52ec80 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -41,7 +41,17 @@ test_that("expand_across correctly expands quosures", { example_data ) - # multiple named arguments + # single named function + expect_across_equal( + quos(across(c(dbl, dbl2), list("fun1" = round))), + quos( + dbl_fun1 = round(dbl), + dbl2_fun1 = round(dbl2) + ), + example_data + ) + + # multiple named functions expect_across_equal( quos(across(c(dbl, dbl2), list("fun1" = round, "fun2" = sqrt))), quos( @@ -53,21 +63,24 @@ test_that("expand_across correctly expands quosures", { example_data ) - # .names argument + # mix of named and unnamed functions expect_across_equal( - quos(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")), + quos(across(c(dbl, dbl2), list(round, "fun2" = sqrt))), quos( - dbl.1 = round(dbl), - dbl2.1 = round(dbl2) + dbl_1 = round(dbl), + dbl_fun2 = sqrt(dbl), + dbl2_1 = round(dbl2), + dbl2_fun2 = sqrt(dbl2) ), example_data ) + # across() with no functions returns columns unchanged expect_across_equal( - quos(across(c(dbl, dbl2), round, .names = "round_{.col}")), + quos(across(starts_with("dbl"))), quos( - round_dbl = round(dbl), - round_dbl2 = round(dbl2) + dbl = dbl, + dbl2 = dbl2 ), example_data ) @@ -93,17 +106,7 @@ test_that("expand_across correctly expands quosures", { example_data %>% select(int, dbl, dbl2) ) - # across() with no functions returns columns unchanged - expect_across_equal( - quos(across(starts_with("dbl"))), - quos( - dbl = dbl, - dbl2 = dbl2 - ), - example_data - ) - - # dynamic variable name + # column selection via dynamic variable name int <- c("dbl", "dbl2") expect_across_equal( quos(across(all_of(int), sqrt)), @@ -155,4 +158,58 @@ test_that("expand_across correctly expands quosures", { regexp = "purrr-style lambda functions as `.fns` argument to `across()` not yet supported in Arrow", fixed = TRUE ) + + # .names argument + expect_across_equal( + quos(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")), + quos( + dbl.1 = round(dbl), + dbl2.1 = round(dbl2) + ), + example_data + ) + + # names argument with custom text + expect_across_equal( + quos(across(c(dbl, dbl2), round, .names = "round_{.col}")), + quos( + round_dbl = round(dbl), + round_dbl2 = round(dbl2) + ), + example_data + ) + + # names argument supplied but no functions + expect_across_equal( + quos(across(starts_with("dbl"), .names = "new_{.col}")), + quos( + new_dbl = dbl, + new_dbl2 = dbl2 + ), + example_data + ) + + # .names argument and functions named + expect_across_equal( + quos(across(c(dbl, dbl2), list("my_round" = round, "my_exp" = exp), .names = "{.col}.{.fn}")), + quos( + dbl.my_round = round(dbl), + dbl.my_exp = exp(dbl), + dbl2.my_round = round(dbl2), + dbl2.my_exp = exp(dbl2) + ), + example_data + ) + + # .names argument and mix of named and unnamed functions + expect_across_equal( + quos(across(c(dbl, dbl2), list(round, "my_exp" = exp), .names = "{.col}.{.fn}")), + quos( + dbl.1 = round(dbl), + dbl.my_exp = exp(dbl), + dbl2.1 = round(dbl2), + dbl2.my_exp = exp(dbl2) + ), + example_data + ) }) From bd6e2b6bbb48e3b83213e8e02099ae05360024f3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 31 Aug 2022 09:20:57 +0100 Subject: [PATCH 75/77] Add in test for dodgy names spec --- r/R/dplyr-across.R | 13 +++++++++++++ r/tests/testthat/test-dplyr-across.R | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index 245769735a4..b1d505e12ff 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -156,6 +156,19 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { fns <- map(fns, as_function) } + # ensure .names argument has resulted in + if (length(names) != (length(vars) * length(fns))) { + abort( + c( + "`.names` specification must produce (number of columns * number of functions) names.", + x = paste0( + length(vars) * length(fns), " names required (", length(vars), " columns * ", length(fns), " functions)\n ", + length(names), " name(s) produced: ", paste(names, collapse = ",") + ) + ) + ) + } + list(vars = vars, fns = fns, names = names) } diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index b39ba52ec80..8945c2a5f3b 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -212,4 +212,15 @@ test_that("expand_across correctly expands quosures", { ), example_data ) + + # dodgy .names specification + expect_error( + expand_across( + example_data, + quos(across(c(dbl, dbl2), list(round, "my_exp" = exp), .names = "zarg")) + ), + regexp = "`.names` specification must produce (number of columns * number of functions) names.", + fixed = TRUE + ) + }) From b2fb25a05c844b2664f39e7e1f8bf3a2bcfa0898 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 31 Aug 2022 09:42:11 +0100 Subject: [PATCH 76/77] Simplify across function --- r/R/dplyr-across.R | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index b1d505e12ff..01a9262b81e 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -173,10 +173,5 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE) { } across_glue_mask <- function(.col, .fn, .caller_env) { - glue_mask <- env(.caller_env, .col = .col, .fn = .fn) - env_bind_active( - glue_mask, - col = function() glue_mask$.col, fn = function() glue_mask$.fn - ) - glue_mask + env(.caller_env, .col = .col, .fn = .fn, col = .col, fn = .fn) } From 29c8fc516dd362ee4e362def934ac46253c08087 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 31 Aug 2022 09:44:57 +0100 Subject: [PATCH 77/77] Remove extraneous rlang call --- r/NAMESPACE | 1 - r/R/arrow-package.R | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index fddcf242f28..49db309b8e8 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -423,7 +423,6 @@ importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,env) importFrom(rlang,env_bind) -importFrom(rlang,env_bind_active) importFrom(rlang,eval_tidy) importFrom(rlang,exec) importFrom(rlang,expr) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 9e562edc3e7..e8aa93f9534 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -24,7 +24,7 @@ #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args -#' @importFrom rlang quo_set_env quo_get_env is_formula env_bind_active quo_is_call +#' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select #' @importFrom glue glue #' @useDynLib arrow, .registration = TRUE