From 403d9df04491edc1b0f0a6f02e357653fb6b1796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 7 Jul 2022 18:00:29 +0100 Subject: [PATCH 01/80] add a `_ToString` method for the ExecPlan --- r/src/arrowExports.cpp | 9 +++++++++ r/src/compute-exec.cpp | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 5e207d657c7..6809270d8dc 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -898,6 +898,14 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp +std::string ExecPlan_ToString(const std::shared_ptr& plan); +extern "C" SEXP _arrow_ExecPlan_ToString(SEXP plan_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type plan(plan_sexp); + return cpp11::as_sexp(ExecPlan_ToString(plan)); +END_CPP11 +} +// compute-exec.cpp #if defined(ARROW_R_WITH_DATASET) std::shared_ptr ExecNode_Scan(const std::shared_ptr& plan, const std::shared_ptr& dataset, const std::shared_ptr& filter, std::vector materialized_field_names); extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP filter_sexp, SEXP materialized_field_names_sexp){ @@ -5242,6 +5250,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 5}, { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1}, { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1}, + { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 1}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 76112b4cefd..f2a5ebafbb7 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -125,6 +125,11 @@ std::shared_ptr ExecNode_output_schema( return node->output_schema(); } +// [[arrow::export]] +std::string ExecPlan_ToString(const std::shared_ptr& plan) { + return plan->ToString(); +} + #if defined(ARROW_R_WITH_DATASET) #include From 4a1f94eedaa882f9443793470d1f900ca62f11ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 7 Jul 2022 18:00:52 +0100 Subject: [PATCH 02/80] add `ToString` method for the `ExecPlan` R6 object --- r/R/arrowExports.R | 4 ++++ r/R/query-engine.R | 1 + 2 files changed, 5 insertions(+) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 84f6ee54fc7..6de6ab5bbfa 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -416,6 +416,10 @@ ExecNode_output_schema <- function(node) { .Call(`_arrow_ExecNode_output_schema`, node) } +ExecPlan_ToString <- function(plan) { + .Call(`_arrow_ExecPlan_ToString`, plan) +} + ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { .Call(`_arrow_ExecNode_Scan`, plan, dataset, filter, materialized_field_names) } diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 513b861d414..264e111455c 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -259,6 +259,7 @@ ExecPlan <- R6Class("ExecPlan", ... ) }, + ToString = function() ExecPlan_ToString(self), Stop = function() ExecPlan_StopProducing(self) ) ) From f03ced055869a2867fce5c71e7d80227813f1b8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 7 Jul 2022 18:01:37 +0100 Subject: [PATCH 03/80] define `show_arrow_query()` and add it to `print.arrow_dplyr_query()` --- r/R/dplyr.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 8018cb5a60e..f6a8712959b 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -154,6 +154,9 @@ print.arrow_dplyr_query <- function(x, ...) { sep = "" ) } + + show_arrow_query(x) + cat("See $.data for the source Arrow object\n") invisible(x) } @@ -219,6 +222,15 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { x } +show_arrow_query <- function(.data) { + adq <- arrow:::as_adq(.data) + plan <- arrow:::ExecPlan$create() + final_node <- plan$Build(.data) + cat("* ExecPlan\n") + cat(plan$ToString()) + invisible(.data) +} + ensure_group_vars <- function(x) { if (inherits(x, "arrow_dplyr_query")) { # Before pulling data from Arrow, make sure all group vars are in the projection From fdd20be2d606422048a41cffeb61f18744c86496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 13:17:07 +0100 Subject: [PATCH 04/80] rename `show_arrow_query()` to `show_exec_plan()` + unit tests --- r/R/dplyr.R | 23 +++++++++++------ r/tests/testthat/test-dplyr-query.R | 38 +++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index f6a8712959b..346909a4691 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -155,8 +155,6 @@ print.arrow_dplyr_query <- function(x, ...) { ) } - show_arrow_query(x) - cat("See $.data for the source Arrow object\n") invisible(x) } @@ -222,13 +220,22 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { x } -show_arrow_query <- function(.data) { - adq <- arrow:::as_adq(.data) - plan <- arrow:::ExecPlan$create() - final_node <- plan$Build(.data) - cat("* ExecPlan\n") + +#' Show the details of an Arrow ExecPlan +#' +#' This is a function which gives more details about an ExecPlan. This is similar +#' to `dplyr::show_query()`. +#' +#' @param x an `arrow_dplyr_query` to print the ExecPlan for. +#' +#' @return The argument, invisibly. +#' @export +show_exec_plan <- function(x) { + adq <- as_adq(x) + plan <- ExecPlan$create() + final_node <- plan$Build(x) cat(plan$ToString()) - invisible(.data) + invisible(x) } ensure_group_vars <- function(x) { diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index d55ed07cfc8..00911f006fa 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -293,3 +293,41 @@ test_that("No duplicate field names are allowed in an arrow_dplyr_query", { ) ) }) + +test_that("show_exec_plan() and show_query()", { + expect_output( + tbl %>% + arrow_table() %>% + filter(dbl > 2, chr != "e") %>% + select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% + show_exec_plan(), + regexp = paste0( + 'ExecPlan with 3 nodes:\n2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten"', + ': add_checked(cast(int, {to_type=double, allow_int_overflow=false, ', + 'allow_time_truncate=false, allow_time_overflow=false, ', + 'allow_decimal_truncate=false, allow_float_truncate=false, ', + 'allow_invalid_utf8=false}), 10)]}\n 1:FilterNode{filter=((dbl > 2) and ', + '(chr != "e"))}\n 0:TableSourceNode{}' + ), + fixed = TRUE + ) + + expect_output( + tbl %>% + record_batch() %>% + filter(dbl > 2, chr != "e") %>% + select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% + show_exec_plan(), + regexp = paste0( + 'ExecPlan with 3 nodes:\n2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten"', + ': add_checked(cast(int, {to_type=double, allow_int_overflow=false, ', + 'allow_time_truncate=false, allow_time_overflow=false, ', + 'allow_decimal_truncate=false, allow_float_truncate=false, ', + 'allow_invalid_utf8=false}), 10)]}\n 1:FilterNode{filter=((dbl > 2) and ', + '(chr != "e"))}\n 0:TableSourceNode{}' + ), + fixed = TRUE + ) +}) From 350bea307c1b61d080dc230c44bfba80bd826595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 14:15:10 +0100 Subject: [PATCH 05/80] lint --- r/tests/testthat/test-dplyr-query.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 00911f006fa..9f5edc172ea 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -303,12 +303,12 @@ test_that("show_exec_plan() and show_query()", { mutate(int_plus_ten = int + 10) %>% show_exec_plan(), regexp = paste0( - 'ExecPlan with 3 nodes:\n2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten"', - ': add_checked(cast(int, {to_type=double, allow_int_overflow=false, ', - 'allow_time_truncate=false, allow_time_overflow=false, ', - 'allow_decimal_truncate=false, allow_float_truncate=false, ', - 'allow_invalid_utf8=false}), 10)]}\n 1:FilterNode{filter=((dbl > 2) and ', - '(chr != "e"))}\n 0:TableSourceNode{}' + "ExecPlan with 3 nodes:\n2:ProjectNode{projection=[chr, int, lgl, \"int_plus_ten\"", + ": add_checked(cast(int, {to_type=double, allow_int_overflow=false, ", + "allow_time_truncate=false, allow_time_overflow=false, ", + "allow_decimal_truncate=false, allow_float_truncate=false, ", + "allow_invalid_utf8=false}), 10)]}\n 1:FilterNode{filter=((dbl > 2) and ", + "(chr != \"e\"))}\n 0:TableSourceNode{}" ), fixed = TRUE ) @@ -321,12 +321,12 @@ test_that("show_exec_plan() and show_query()", { mutate(int_plus_ten = int + 10) %>% show_exec_plan(), regexp = paste0( - 'ExecPlan with 3 nodes:\n2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten"', - ': add_checked(cast(int, {to_type=double, allow_int_overflow=false, ', - 'allow_time_truncate=false, allow_time_overflow=false, ', - 'allow_decimal_truncate=false, allow_float_truncate=false, ', - 'allow_invalid_utf8=false}), 10)]}\n 1:FilterNode{filter=((dbl > 2) and ', - '(chr != "e"))}\n 0:TableSourceNode{}' + "ExecPlan with 3 nodes:\n2:ProjectNode{projection=[chr, int, lgl, \"int_plus_ten\"", + ": add_checked(cast(int, {to_type=double, allow_int_overflow=false, ", + "allow_time_truncate=false, allow_time_overflow=false, ", + "allow_decimal_truncate=false, allow_float_truncate=false, ", + "allow_invalid_utf8=false}), 10)]}\n 1:FilterNode{filter=((dbl > 2) and ", + "(chr != \"e\"))}\n 0:TableSourceNode{}" ), fixed = TRUE ) From c889ec003c35e34cfef8ca33f06e10b2c6b91a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 14:53:53 +0100 Subject: [PATCH 06/80] use `expect_snapshot()` instead of `expect_output()` --- r/tests/testthat/_snaps/dplyr-query.md | 22 ++++++++++++++++++++ r/tests/testthat/test-dplyr-query.R | 28 +++++--------------------- 2 files changed, 27 insertions(+), 23 deletions(-) create mode 100644 r/tests/testthat/_snaps/dplyr-query.md diff --git a/r/tests/testthat/_snaps/dplyr-query.md b/r/tests/testthat/_snaps/dplyr-query.md new file mode 100644 index 00000000000..c4c22df8c16 --- /dev/null +++ b/r/tests/testthat/_snaps/dplyr-query.md @@ -0,0 +1,22 @@ +# show_exec_plan() and show_query() + + Code + tbl %>% arrow_table() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% show_exec_plan() + Output + ExecPlan with 3 nodes: + 2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten": add_checked(cast(int, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), 10)]} + 1:FilterNode{filter=((dbl > 2) and (chr != "e"))} + 0:TableSourceNode{} + +--- + + Code + tbl %>% record_batch() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% show_exec_plan() + Output + ExecPlan with 3 nodes: + 2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten": add_checked(cast(int, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), 10)]} + 1:FilterNode{filter=((dbl > 2) and (chr != "e"))} + 0:TableSourceNode{} + diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 9f5edc172ea..49f6d2bd2ff 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -295,39 +295,21 @@ test_that("No duplicate field names are allowed in an arrow_dplyr_query", { }) test_that("show_exec_plan() and show_query()", { - expect_output( + expect_snapshot( tbl %>% arrow_table() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% mutate(int_plus_ten = int + 10) %>% - show_exec_plan(), - regexp = paste0( - "ExecPlan with 3 nodes:\n2:ProjectNode{projection=[chr, int, lgl, \"int_plus_ten\"", - ": add_checked(cast(int, {to_type=double, allow_int_overflow=false, ", - "allow_time_truncate=false, allow_time_overflow=false, ", - "allow_decimal_truncate=false, allow_float_truncate=false, ", - "allow_invalid_utf8=false}), 10)]}\n 1:FilterNode{filter=((dbl > 2) and ", - "(chr != \"e\"))}\n 0:TableSourceNode{}" - ), - fixed = TRUE + show_exec_plan() ) - expect_output( + expect_snapshot( tbl %>% record_batch() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% mutate(int_plus_ten = int + 10) %>% - show_exec_plan(), - regexp = paste0( - "ExecPlan with 3 nodes:\n2:ProjectNode{projection=[chr, int, lgl, \"int_plus_ten\"", - ": add_checked(cast(int, {to_type=double, allow_int_overflow=false, ", - "allow_time_truncate=false, allow_time_overflow=false, ", - "allow_decimal_truncate=false, allow_float_truncate=false, ", - "allow_invalid_utf8=false}), 10)]}\n 1:FilterNode{filter=((dbl > 2) and ", - "(chr != \"e\"))}\n 0:TableSourceNode{}" - ), - fixed = TRUE - ) + show_exec_plan() + ) }) From 126054001c2de61619ee859e225b2d5b9f2b3eb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 15:01:16 +0100 Subject: [PATCH 07/80] remove empty row --- r/R/dplyr.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 346909a4691..b8d5fdde6b3 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -154,7 +154,6 @@ print.arrow_dplyr_query <- function(x, ...) { sep = "" ) } - cat("See $.data for the source Arrow object\n") invisible(x) } From 1f4801d7087982b8e88446f3f5b786d25eddf9ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 15:12:20 +0100 Subject: [PATCH 08/80] update test and snapshot --- r/tests/testthat/_snaps/dplyr-query.md | 2 +- r/tests/testthat/test-dplyr-query.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/_snaps/dplyr-query.md b/r/tests/testthat/_snaps/dplyr-query.md index c4c22df8c16..fab935976bb 100644 --- a/r/tests/testthat/_snaps/dplyr-query.md +++ b/r/tests/testthat/_snaps/dplyr-query.md @@ -1,4 +1,4 @@ -# show_exec_plan() and show_query() +# show_exec_plan() Code tbl %>% arrow_table() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 49f6d2bd2ff..57328d5bd0e 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -294,7 +294,7 @@ test_that("No duplicate field names are allowed in an arrow_dplyr_query", { ) }) -test_that("show_exec_plan() and show_query()", { +test_that("show_exec_plan()", { expect_snapshot( tbl %>% arrow_table() %>% @@ -311,5 +311,5 @@ test_that("show_exec_plan() and show_query()", { select(chr, int, lgl) %>% mutate(int_plus_ten = int + 10) %>% show_exec_plan() - ) + ) }) From 254e14c1ca6ad21864186f2a9a134237e19ae19f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 15:22:16 +0100 Subject: [PATCH 09/80] document + export `show_exec_plan()` --- r/NAMESPACE | 1 + r/R/dplyr.R | 12 +++++++++--- r/man/show_exec_plan.Rd | 26 ++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 r/man/show_exec_plan.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 5762df9eb0c..6f3215e5e43 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -348,6 +348,7 @@ export(s3_bucket) export(schema) export(set_cpu_count) export(set_io_thread_count) +export(show_exec_plan) export(starts_with) export(string) export(struct) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index b8d5fdde6b3..7c2d8b071af 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -219,16 +219,22 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { x } - #' Show the details of an Arrow ExecPlan #' -#' This is a function which gives more details about an ExecPlan. This is similar -#' to `dplyr::show_query()`. +#' This is a function which gives more details about the `ExecPlan` of an +#' `arrow_dplyr_query` object. It is similar to `dplyr::show_query()`. #' #' @param x an `arrow_dplyr_query` to print the ExecPlan for. #' #' @return The argument, invisibly. #' @export +#' +#' @examples +#' mtcars %>% +#' arrow_table() %>% +#' filter(mpg > 20) %>% +#' mutate(x = gear/carb) %>% +#' show_exec_plan() show_exec_plan <- function(x) { adq <- as_adq(x) plan <- ExecPlan$create() diff --git a/r/man/show_exec_plan.Rd b/r/man/show_exec_plan.Rd new file mode 100644 index 00000000000..5c693a69bd0 --- /dev/null +++ b/r/man/show_exec_plan.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr.R +\name{show_exec_plan} +\alias{show_exec_plan} +\title{Show the details of an Arrow ExecPlan} +\usage{ +show_exec_plan(x) +} +\arguments{ +\item{x}{an \code{arrow_dplyr_query} to print the ExecPlan for.} +} +\value{ +The argument, invisibly. +} +\description{ +This is a function which gives more details about the \code{ExecPlan} of an +\code{arrow_dplyr_query} object. It is similar to \code{dplyr::show_query()}. +} +\examples{ +# +mtcars \%>\% + arrow_table() \%>\% + filter(mpg > 20) \%>\% + mutate(x = gear/carb) \%>\% + show_exec_plan() +} From 629476551357627a3c4b9f50d25532eefc716a25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 15:30:12 +0100 Subject: [PATCH 10/80] example + redocument --- r/R/dplyr.R | 1 + r/man/show_exec_plan.Rd | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 7c2d8b071af..4d8c56115b1 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -230,6 +230,7 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' @export #' #' @examples +#' library(dplyr) #' mtcars %>% #' arrow_table() %>% #' filter(mpg > 20) %>% diff --git a/r/man/show_exec_plan.Rd b/r/man/show_exec_plan.Rd index 5c693a69bd0..86ae224db33 100644 --- a/r/man/show_exec_plan.Rd +++ b/r/man/show_exec_plan.Rd @@ -17,7 +17,7 @@ This is a function which gives more details about the \code{ExecPlan} of an \code{arrow_dplyr_query} object. It is similar to \code{dplyr::show_query()}. } \examples{ -# +library(dplyr) mtcars \%>\% arrow_table() \%>\% filter(mpg > 20) \%>\% From cb5d5b77277db9f9c2a00d6849c8dfe2716672ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 16:48:09 +0100 Subject: [PATCH 11/80] add `show_exec_plan` to r/_pkgdown.yml under *Computation* --- r/_pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index c0f599fb8a5..9f51fd9f97a 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -219,6 +219,7 @@ reference: - match_arrow - value_counts - list_compute_functions + - show_exec_plan - title: Connections to other systems contents: - to_arrow From 424f12a833c0f03e8f26b8acddabd64793e91193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 11 Jul 2022 18:23:08 +0100 Subject: [PATCH 12/80] bump ci From ccd8f7635c98a2289ab50f9b9558104aae7cb82b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 12 Jul 2022 10:19:53 +0100 Subject: [PATCH 13/80] run example only when {dplyr} is available --- r/R/dplyr.R | 2 +- r/man/show_exec_plan.Rd | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 4d8c56115b1..f09fb8904d1 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -229,7 +229,7 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' @return The argument, invisibly. #' @export #' -#' @examples +#' @examplesIf arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE) #' library(dplyr) #' mtcars %>% #' arrow_table() %>% diff --git a/r/man/show_exec_plan.Rd b/r/man/show_exec_plan.Rd index 86ae224db33..85b0d26de87 100644 --- a/r/man/show_exec_plan.Rd +++ b/r/man/show_exec_plan.Rd @@ -17,10 +17,12 @@ This is a function which gives more details about the \code{ExecPlan} of an \code{arrow_dplyr_query} object. It is similar to \code{dplyr::show_query()}. } \examples{ +\dontshow{if (arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(dplyr) mtcars \%>\% arrow_table() \%>\% filter(mpg > 20) \%>\% mutate(x = gear/carb) \%>\% show_exec_plan() +\dontshow{\}) # examplesIf} } From 83e5492f79bcc280ace83e5fcc5b0521414935e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 12 Jul 2022 11:28:43 +0100 Subject: [PATCH 14/80] remove snapshots and test with `expect_output()` * add dataset test * add tests for `summarise()`, `group_by()` and `join()` --- r/tests/testthat/_snaps/dplyr-query.md | 22 ---------- r/tests/testthat/test-dataset.R | 17 +++++++ r/tests/testthat/test-dplyr-query.R | 61 ++++++++++++++++++++++++-- 3 files changed, 74 insertions(+), 26 deletions(-) delete mode 100644 r/tests/testthat/_snaps/dplyr-query.md diff --git a/r/tests/testthat/_snaps/dplyr-query.md b/r/tests/testthat/_snaps/dplyr-query.md deleted file mode 100644 index fab935976bb..00000000000 --- a/r/tests/testthat/_snaps/dplyr-query.md +++ /dev/null @@ -1,22 +0,0 @@ -# show_exec_plan() - - Code - tbl %>% arrow_table() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% show_exec_plan() - Output - ExecPlan with 3 nodes: - 2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten": add_checked(cast(int, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), 10)]} - 1:FilterNode{filter=((dbl > 2) and (chr != "e"))} - 0:TableSourceNode{} - ---- - - Code - tbl %>% record_batch() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% show_exec_plan() - Output - ExecPlan with 3 nodes: - 2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten": add_checked(cast(int, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), 10)]} - 1:FilterNode{filter=((dbl > 2) and (chr != "e"))} - 0:TableSourceNode{} - diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 8fb9f32c2ad..2d3a8f897a1 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -1215,3 +1215,20 @@ test_that("FileSystemFactoryOptions input validation", { fixed = TRUE ) }) + +test_that("show_exec_plan() with datasets", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + + expect_output( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6L & part == 1) %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + # the filter including the auto-casting of part + "FilterNode.*filter=.*int > 6.*cast.*", + "SourceNode" # the starting point + ) + ) +}) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 57328d5bd0e..45a6a56fbec 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -295,21 +295,74 @@ test_that("No duplicate field names are allowed in an arrow_dplyr_query", { }) test_that("show_exec_plan()", { - expect_snapshot( + # arrow_table and mutate + expect_output( tbl %>% arrow_table() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% mutate(int_plus_ten = int + 10) %>% - show_exec_plan() + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "chr, int, lgl, \"int_plus_ten\".*", # selected columns + "(dbl > 2).*", # the filter expressions + "chr != \"e\".*", + "TableSourceNode" # the starting point + ) ) - expect_snapshot( + # record_batch & mutate + expect_output( tbl %>% record_batch() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% mutate(int_plus_ten = int + 10) %>% - show_exec_plan() + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "chr, int, lgl, \"int_plus_ten\".*", # selected columns + "(dbl > 2).*", # the filter expressions + "chr != \"e\".*", + "TableSourceNode" # the starting point" + ) + ) + + # test with group_by and summarise + expect_output( + tbl %>% + arrow_table() %>% + group_by(lgl) %>% + summarise(avg = mean(dbl, na.rm = TRUE)) %>% + ungroup() %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "GroupByNode.*keys.*\"lgl\".*", # the group_by statement + "aggregates.*\\thash_mean.*avg.*skip_nulls=true, min_count=0.*", # the aggregation + "TableSourceNode" # the starting point + ) + ) + + # test with join + expect_output( + tbl %>% + arrow_table() %>% + left_join( + example_data %>% + arrow_table() %>% + mutate(doubled_dbl = dbl * 2) %>% + select(int, doubled_dbl), + by = "int" + ) %>% + select(int, verses, doubled_dbl) %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "HashJoinNode.*", # the join + "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate + "TableSourceNode" # the starting point + ) ) }) From b9c0c30baa7f101504c9aff82eabe85a6ab87328 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 12 Jul 2022 12:07:27 +0100 Subject: [PATCH 15/80] lint --- r/tests/testthat/test-dplyr-query.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 45a6a56fbec..c766d79045f 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -312,7 +312,7 @@ test_that("show_exec_plan()", { ) ) - # record_batch & mutate + # record_batch and mutate expect_output( tbl %>% record_batch() %>% From 07de250ac3230303313593735babc22a0d0c0674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 12 Jul 2022 12:30:50 +0100 Subject: [PATCH 16/80] bump ci From 0e3b8949da96673bc5b1c65b0adecc8a13401c70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 12 Jul 2022 16:42:44 +0100 Subject: [PATCH 17/80] docs --- r/R/dplyr.R | 8 ++++---- r/man/show_exec_plan.Rd | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index f09fb8904d1..991eb0bece1 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -219,12 +219,12 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { x } -#' Show the details of an Arrow ExecPlan +#' Show the details of an Arrow Execution Plan #' -#' This is a function which gives more details about the `ExecPlan` of an -#' `arrow_dplyr_query` object. It is similar to `dplyr::show_query()`. +#' This is a function which gives more details about the Execution Plan (`ExecPlan`) +#' of an `arrow_dplyr_query` object. It is similar to `dplyr::explain()`. #' -#' @param x an `arrow_dplyr_query` to print the ExecPlan for. +#' @param x an `arrow_dplyr_query` to print the `ExecPlan` for. #' #' @return The argument, invisibly. #' @export diff --git a/r/man/show_exec_plan.Rd b/r/man/show_exec_plan.Rd index 85b0d26de87..0a84c75c659 100644 --- a/r/man/show_exec_plan.Rd +++ b/r/man/show_exec_plan.Rd @@ -2,19 +2,19 @@ % Please edit documentation in R/dplyr.R \name{show_exec_plan} \alias{show_exec_plan} -\title{Show the details of an Arrow ExecPlan} +\title{Show the details of an Arrow Execution Plan} \usage{ show_exec_plan(x) } \arguments{ -\item{x}{an \code{arrow_dplyr_query} to print the ExecPlan for.} +\item{x}{an \code{arrow_dplyr_query} to print the \code{ExecPlan} for.} } \value{ The argument, invisibly. } \description{ -This is a function which gives more details about the \code{ExecPlan} of an -\code{arrow_dplyr_query} object. It is similar to \code{dplyr::show_query()}. +This is a function which gives more details about the Execution Plan (\code{ExecPlan}) +of an \code{arrow_dplyr_query} object. It is similar to \code{dplyr::explain()}. } \examples{ \dontshow{if (arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} From 3aab4e01b9b25f142ae0b856f1b5d8deacb6ece5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 11:17:48 +0100 Subject: [PATCH 18/80] improved `show_exec_plan()` docs --- r/R/dplyr.R | 9 +++++---- r/man/show_exec_plan.Rd | 8 +++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 991eb0bece1..b6324650aca 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -221,12 +221,14 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' Show the details of an Arrow Execution Plan #' -#' This is a function which gives more details about the Execution Plan (`ExecPlan`) -#' of an `arrow_dplyr_query` object. It is similar to `dplyr::explain()`. +#' This is a function which gives more details about the logical query plan +#' that will be executed when evaluating an `arrow_dplyr_query` object. +#' It calls the C++ `ExecPlan` object's print method. +#' Functionally, it is similar to `dplyr::explain()`. #' #' @param x an `arrow_dplyr_query` to print the `ExecPlan` for. #' -#' @return The argument, invisibly. +#' @return `x`, invisibly. #' @export #' #' @examplesIf arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE) @@ -237,7 +239,6 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' mutate(x = gear/carb) %>% #' show_exec_plan() show_exec_plan <- function(x) { - adq <- as_adq(x) plan <- ExecPlan$create() final_node <- plan$Build(x) cat(plan$ToString()) diff --git a/r/man/show_exec_plan.Rd b/r/man/show_exec_plan.Rd index 0a84c75c659..63ed4a7ec66 100644 --- a/r/man/show_exec_plan.Rd +++ b/r/man/show_exec_plan.Rd @@ -10,11 +10,13 @@ show_exec_plan(x) \item{x}{an \code{arrow_dplyr_query} to print the \code{ExecPlan} for.} } \value{ -The argument, invisibly. +\code{x}, invisibly. } \description{ -This is a function which gives more details about the Execution Plan (\code{ExecPlan}) -of an \code{arrow_dplyr_query} object. It is similar to \code{dplyr::explain()}. +This is a function which gives more details about the logical query plan +that will be executed when evaluating an \code{arrow_dplyr_query} object. +It calls the C++ \code{ExecPlan} object's print method. +Functionally, it is similar to \code{dplyr::explain()}. } \examples{ \dontshow{if (arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} From b9b4d49f9150b227c1421065b65544f334b3ced6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 11:27:19 +0100 Subject: [PATCH 19/80] use `adq` --- r/R/dplyr.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 0b3c4b86cc7..f1daaa02a5d 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -239,8 +239,9 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' mutate(x = gear/carb) %>% #' show_exec_plan() show_exec_plan <- function(x) { + adq <- as_adq(x) plan <- ExecPlan$create() - final_node <- plan$Build(x) + final_node <- plan$Build(adq) cat(plan$ToString()) invisible(x) } From 4779f08faf0ce566b77b3cd5784610cb36f633a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 13:38:22 +0100 Subject: [PATCH 20/80] add minimal test --- r/tests/testthat/test-dplyr-query.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 3ae7af401a2..6f7bf36bac3 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -435,6 +435,19 @@ test_that("query_can_stream()", { }) test_that("show_exec_plan()", { + # minimal test - this fails if we don't coerce the input to `show_exec_plan()` + # to be an `arrow_dplyr_query` + expect_output( + mtcars %>% + arrow_table() %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # this node would evaluate Expressions to produce new columns + "TableSourceNode" # the entry point + ) + ) + # arrow_table and mutate expect_output( tbl %>% @@ -448,7 +461,7 @@ test_that("show_exec_plan()", { "chr, int, lgl, \"int_plus_ten\".*", # selected columns "(dbl > 2).*", # the filter expressions "chr != \"e\".*", - "TableSourceNode" # the starting point + "TableSourceNode" # the entry point ) ) From 5730cecea0619256eea89b436aecdf7f7b9cb34b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 14:25:58 +0100 Subject: [PATCH 21/80] failing unit tests for `arrange()` and `head()` "nodes" --- r/tests/testthat/test-dplyr-query.R | 52 ++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 6f7bf36bac3..13b7e364d0f 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -459,6 +459,7 @@ test_that("show_exec_plan()", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "chr, int, lgl, \"int_plus_ten\".*", # selected columns + "FilterNode.*", # the filter node "(dbl > 2).*", # the filter expressions "chr != \"e\".*", "TableSourceNode" # the entry point @@ -478,7 +479,7 @@ test_that("show_exec_plan()", { "chr, int, lgl, \"int_plus_ten\".*", # selected columns "(dbl > 2).*", # the filter expressions "chr != \"e\".*", - "TableSourceNode" # the starting point" + "TableSourceNode" # the entry point" ) ) @@ -492,9 +493,11 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "GroupByNode.*keys.*\"lgl\".*", # the group_by statement - "aggregates.*\\thash_mean.*avg.*skip_nulls=true, min_count=0.*", # the aggregation - "TableSourceNode" # the starting point + "GroupByNode.*", # the group_by statement + "keys.*\"lgl\".*", # the key for the aggregations + "aggregates.*\\thash_mean.*avg.*skip_nulls=true, min_count=0.*", # the aggregations + "ProjectNode.*", # the output columns + "TableSourceNode" # the entry point ) ) @@ -514,8 +517,47 @@ test_that("show_exec_plan()", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "HashJoinNode.*", # the join + "ProjectNode.*", # the output columns for the second table "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate - "TableSourceNode" # the starting point + "TableSourceNode.*", # the second table + "ProjectNode.*", # output columns for the first table + "TableSourceNode" # the first table + ) + ) + + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "order_by_sink.*", # there should be something here regarding + # arrange (maybe "order_by_sink"), but it is + # missing + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point + ) + ) + + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + head(3) %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "select_k_sink.*", # there should be something here regarding + # head (maybe "select_k_sink"), but it is + # missing + "order_by_sink.*", # the arrange + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point ) ) }) From 23f5f22865f6cbd1c23b9d2cc9cacf7fab54f6e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 16:37:23 +0100 Subject: [PATCH 22/80] add `show_query.arrow-dplyr_query()` and `explain.arrow_dplyr_query()` --- r/R/arrow-package.R | 3 ++- r/R/dplyr.R | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index a2c37d0ce36..0dd65f2a111 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -41,7 +41,8 @@ "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute", "arrange", "rename", "pull", "relocate", "compute", "collapse", "distinct", "left_join", "right_join", "inner_join", "full_join", - "semi_join", "anti_join", "count", "tally", "rename_with", "union", "union_all", "glimpse" + "semi_join", "anti_join", "count", "tally", "rename_with", "union", + "union_all", "glimpse", "show_query", "explain" ) ) for (cl in c("Dataset", "ArrowTabular", "RecordBatchReader", "arrow_dplyr_query")) { diff --git a/r/R/dplyr.R b/r/R/dplyr.R index f1daaa02a5d..c0c005c612a 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -246,6 +246,18 @@ show_exec_plan <- function(x) { invisible(x) } +show_query.arrow_dplyr_query <- function(x, ...) { + show_exec_plan(x) +} + +show_query.Dataset <- show_query.ArrowTabular <- show_query.RecordBatchReader <- show_query.arrow_dplyr_query + +explain.arrow_dplyr_query <- function(x, ...) { + show_exec_plan(x) +} + +explain.Dataset <- explain.ArrowTabular <- explain.RecordBatchReader <- explain.arrow_dplyr_query + ensure_group_vars <- function(x) { if (inherits(x, "arrow_dplyr_query")) { # Before pulling data from Arrow, make sure all group vars are in the projection From 2eb3cdb7ab9cdbdb0006f7e6d2c4fa7b1cf5481e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 17:07:52 +0100 Subject: [PATCH 23/80] tests for `show_query()` & `explain()` + clarifications --- r/tests/testthat/test-dplyr-query.R | 264 ++++++++++++++++++++++++++-- 1 file changed, 246 insertions(+), 18 deletions(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 13b7e364d0f..1591b54ddac 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -434,7 +434,7 @@ test_that("query_can_stream()", { ) }) -test_that("show_exec_plan()", { +test_that("show_exec_plan(), show_query() and explain()", { # minimal test - this fails if we don't coerce the input to `show_exec_plan()` # to be an `arrow_dplyr_query` expect_output( @@ -443,8 +443,32 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # this node would evaluate Expressions to produce new columns - "TableSourceNode" # the entry point + "ProjectNode.*", # output columns + "TableSourceNode" # entry point + ) + ) + + # minimal test - show_query() + expect_output( + mtcars %>% + arrow_table() %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output new columns + "TableSourceNode" # entry point + ) + ) + + # minimal test - explain() + expect_output( + mtcars %>% + arrow_table() %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "TableSourceNode" # entry point ) ) @@ -459,10 +483,46 @@ test_that("show_exec_plan()", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "chr, int, lgl, \"int_plus_ten\".*", # selected columns - "FilterNode.*", # the filter node - "(dbl > 2).*", # the filter expressions + "FilterNode.*", # filter node + "(dbl > 2).*", # filter expressions "chr != \"e\".*", - "TableSourceNode" # the entry point + "TableSourceNode" # entry point + ) + ) + + # arrow_table and mutate - show_query() + expect_output( + tbl %>% + arrow_table() %>% + filter(dbl > 2, chr != "e") %>% + select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "chr, int, lgl, \"int_plus_ten\".*", # selected columns + "FilterNode.*", # filter node + "(dbl > 2).*", # filter expressions + "chr != \"e\".*", + "TableSourceNode" # entry point + ) + ) + + # arrow_table and mutate - explain() + expect_output( + tbl %>% + arrow_table() %>% + filter(dbl > 2, chr != "e") %>% + select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "chr, int, lgl, \"int_plus_ten\".*", # selected columns + "FilterNode.*", # filter node + "(dbl > 2).*", # filter expressions + "chr != \"e\".*", + "TableSourceNode" # entry point ) ) @@ -493,10 +553,49 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns "GroupByNode.*", # the group_by statement - "keys.*\"lgl\".*", # the key for the aggregations - "aggregates.*\\thash_mean.*avg.*skip_nulls=true, min_count=0.*", # the aggregations - "ProjectNode.*", # the output columns + "keys=.*lgl.*", # the key for the aggregations + "aggregates=.*hash_mean.*avg.*", # the aggregations + "ProjectNode.*", # the input columns + "TableSourceNode" # the entry point + ) + ) + + # test with group_by and summarise - show_query() + expect_output( + tbl %>% + arrow_table() %>% + group_by(lgl) %>% + summarise(avg = mean(dbl, na.rm = TRUE)) %>% + ungroup() %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "GroupByNode.*", # the group_by statement + "keys=.*lgl.*", # the key for the aggregations + "aggregates=.*hash_mean.*avg.*", # the aggregations + "ProjectNode.*", # the input columns + "TableSourceNode" # the entry point + ) + ) + + # test with group_by and summarise - explain() + expect_output( + tbl %>% + arrow_table() %>% + group_by(lgl) %>% + summarise(avg = mean(dbl, na.rm = TRUE)) %>% + ungroup() %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "GroupByNode.*", # the group_by statement + "keys=.*lgl.*", # the key for the aggregations + "aggregates=.*hash_mean.*avg.*", # the aggregations + "ProjectNode.*", # the input columns "TableSourceNode" # the entry point ) ) @@ -516,15 +615,67 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns "HashJoinNode.*", # the join - "ProjectNode.*", # the output columns for the second table + "ProjectNode.*", # input columns for the second table "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate "TableSourceNode.*", # the second table - "ProjectNode.*", # output columns for the first table + "ProjectNode.*", # input columns for the first table "TableSourceNode" # the first table ) ) + # test with join - show_query() + expect_output( + tbl %>% + arrow_table() %>% + left_join( + example_data %>% + arrow_table() %>% + mutate(doubled_dbl = dbl * 2) %>% + select(int, doubled_dbl), + by = "int" + ) %>% + select(int, verses, doubled_dbl) %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "HashJoinNode.*", # the join + "ProjectNode.*", # input columns for the second table + "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate + "TableSourceNode.*", # the second table + "ProjectNode.*", # input columns for the first table + "TableSourceNode" # the first table + ) + ) + + # test with join - explain() + expect_output( + tbl %>% + arrow_table() %>% + left_join( + example_data %>% + arrow_table() %>% + mutate(doubled_dbl = dbl * 2) %>% + select(int, doubled_dbl), + by = "int" + ) %>% + select(int, verses, doubled_dbl) %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "HashJoinNode.*", # the join + "ProjectNode.*", # input columns for the second table + "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate + "TableSourceNode.*", # the second table + "ProjectNode.*", # input columns for the first table + "TableSourceNode" # the first table + ) + ) + + skip("WIP - arrange and head are not currently captured by C++ ExecPlan print method") expect_output( mtcars %>% arrow_table() %>% @@ -534,9 +685,43 @@ test_that("show_exec_plan()", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns - "order_by_sink.*", # there should be something here regarding - # arrange (maybe "order_by_sink"), but it is - # missing + "order_by.*", # there should be something in the output + # regarding arrange and its corresponding + # ExecNode ("order_by"), but it is missing + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point + ) + ) + + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "order_by.*", # there should be something in the output + # regarding arrange and its corresponding + # ExecNode ("order_by"), but it is missing + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point + ) + ) + + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "order_by.*", # there should be something in the output + # regarding arrange and its corresponding + # ExecNode ("order_by"), but it is missing "FilterNode.*", # the filter node "TableSourceNode.*" # the entry point ) @@ -552,10 +737,53 @@ test_that("show_exec_plan()", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns - "select_k_sink.*", # there should be something here regarding - # head (maybe "select_k_sink"), but it is - # missing - "order_by_sink.*", # the arrange + "select_k.*", # there should be something in the output + # regarding head and its corresponding + # ExecNode ("select_k"), but it is missing + "order_by.*", # there should be something in the output + # regarding arrange and its corresponding + # ExecNode ("order_by"), but it is missing + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point + ) + ) + + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + head(3) %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "select_k.*", # there should be something in the output + # regarding head and its corresponding + # ExecNode ("select_k"), but it is missing + "order_by.*", # there should be something in the output + # regarding arrange and its corresponding + # ExecNode ("order_by"), but it is missing + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point + ) + ) + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + head(3) %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "select_k.*", # there should be something in the output + # regarding head and its corresponding + # ExecNode ("select_k"), but it is missing + "order_by.*", # there should be something in the output + # regarding arrange and its corresponding + # ExecNode ("order_by"), but it is missing "FilterNode.*", # the filter node "TableSourceNode.*" # the entry point ) From 2b334957cbfae3b14ced944b38fff54a68f27b1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 19:49:51 +0100 Subject: [PATCH 24/80] move dataset tests to test-dataset-dplyr.R + unskip --- r/tests/testthat/test-dataset-dplyr.R | 94 +++++++++++++++++++++++++++ r/tests/testthat/test-dataset.R | 17 ----- r/tests/testthat/test-dplyr-query.R | 1 - 3 files changed, 94 insertions(+), 18 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index fecda56c6c2..08227dd0e63 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -340,3 +340,97 @@ test_that("dplyr method not implemented messages", { fixed = TRUE ) }) + +test_that("show_exec_plan(), show_query() and explain() with datasets", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + + # minimal test + expect_output( + ds %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "SourceNode" # entry point + ) + ) + + # filter and select + expect_output( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6L & part == 1) %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "FilterNode.*", # filter node + "int > 6.*cast.*", # filtering expressions + auto-casting of part + "SourceNode" # entry point + ) + ) + + # group_by and summarise + expect_output( + ds %>% + group_by(part) %>% + summarise(avg = mean(int)) %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "GroupByNode.*", # group by node + "keys=.*part.*", # key for aggregations + "aggregates=.*hash_mean.*", # aggregations + "ProjectNode.*", # input columns + "SourceNode" # entry point + ) + ) + + # arrange and head + expect_output( + ds %>% + filter(lgl) %>% + arrange(chr) %>% + # head() %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "order_by.*", # there should be something in the output + # regarding arrange, maybe "order_by", but + # it is missing + "FilterNode*", # filter node + "filter=lgl.*", # filtering expression + "SourceNode" # entry point + ) + ) + + expect_output( + ds %>% + filter(lgl) %>% + arrange(chr) %>% + head() %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "order_by.*", # there should be something in the output + # regarding arrange, maybe "order_by", but + # it is missing + "FilterNode*", # filter node + "filter=lgl.*", # filtering expression + "SourceNode" # entry point + ) + ) + + ds %>% + filter(lgl) %>% + arrange(chr) %>% + head() %>% + # group_by(part) %>% + # summarise(avg = mean(int)) %>% + show_exec_plan() + collect() + +}) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index ad13c7affef..5c826d6dffc 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -1226,20 +1226,3 @@ test_that("FileSystemFactoryOptions input validation", { fixed = TRUE ) }) - -test_that("show_exec_plan() with datasets", { - ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) - - expect_output( - ds %>% - select(string = chr, integer = int, part) %>% - filter(integer > 6L & part == 1) %>% - show_exec_plan(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - # the filter including the auto-casting of part - "FilterNode.*filter=.*int > 6.*cast.*", - "SourceNode" # the starting point - ) - ) -}) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 1591b54ddac..83184f742a8 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -675,7 +675,6 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - skip("WIP - arrange and head are not currently captured by C++ ExecPlan print method") expect_output( mtcars %>% arrow_table() %>% From 4ff30659c27b9553f4b7367b0bf0c1e860136cee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 19:58:21 +0100 Subject: [PATCH 25/80] clean-up --- r/tests/testthat/test-dataset-dplyr.R | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 08227dd0e63..9c82a74b23b 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -392,12 +392,11 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { ds %>% filter(lgl) %>% arrange(chr) %>% - # head() %>% show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns - "order_by.*", # there should be something in the output + "order_by.*", # there should be something in the output # regarding arrange, maybe "order_by", but # it is missing "FilterNode*", # filter node @@ -415,22 +414,15 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns - "order_by.*", # there should be something in the output - # regarding arrange, maybe "order_by", but - # it is missing + "select_k.*", # there should be something in the output + # regarding head and its corresponding + # ExecNode ("select_k"), but it is missing + "order_by.*", # there should be something in the output + # regarding arrange, maybe "order_by", but + # it is missing "FilterNode*", # filter node "filter=lgl.*", # filtering expression "SourceNode" # entry point ) ) - - ds %>% - filter(lgl) %>% - arrange(chr) %>% - head() %>% - # group_by(part) %>% - # summarise(avg = mean(int)) %>% - show_exec_plan() - collect() - }) From d686f9d4c4866cda93c1fa5896cdd520959e46be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 13 Jul 2022 19:59:25 +0100 Subject: [PATCH 26/80] clean-up --- r/tests/testthat/test-dataset-dplyr.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 9c82a74b23b..81c77c6330a 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -397,8 +397,8 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns "order_by.*", # there should be something in the output - # regarding arrange, maybe "order_by", but - # it is missing + # regarding arrange and its corresponding + # ExecNode ("order_by"), but it is missing "FilterNode*", # filter node "filter=lgl.*", # filtering expression "SourceNode" # entry point From d452a408859feb504899a14e77a6e2dd03733fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 11:34:23 +0100 Subject: [PATCH 27/80] define `ExecPlan_ToStringWithSink` C++ function --- r/R/arrowExports.R | 4 ++++ r/src/arrowExports.cpp | 12 ++++++++++++ r/src/compute-exec.cpp | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 6de6ab5bbfa..8d56173784b 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -420,6 +420,10 @@ ExecPlan_ToString <- function(plan) { .Call(`_arrow_ExecPlan_ToString`, plan) } +ExecPlan_ToStringWithSink <- function(plan, final_node, sort_options, head) { + .Call(`_arrow_ExecPlan_ToStringWithSink`, plan, final_node, sort_options, head) +} + ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { .Call(`_arrow_ExecNode_Scan`, plan, dataset, filter, materialized_field_names) } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 6809270d8dc..ffae6999746 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -906,6 +906,17 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp +std::string ExecPlan_ToStringWithSink(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, int64_t head); +extern "C" SEXP _arrow_ExecPlan_ToStringWithSink(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP head_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type plan(plan_sexp); + arrow::r::Input&>::type final_node(final_node_sexp); + arrow::r::Input::type sort_options(sort_options_sexp); + arrow::r::Input::type head(head_sexp); + return cpp11::as_sexp(ExecPlan_ToStringWithSink(plan, final_node, sort_options, head)); +END_CPP11 +} +// compute-exec.cpp #if defined(ARROW_R_WITH_DATASET) std::shared_ptr ExecNode_Scan(const std::shared_ptr& plan, const std::shared_ptr& dataset, const std::shared_ptr& filter, std::vector materialized_field_names); extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP filter_sexp, SEXP materialized_field_names_sexp){ @@ -5251,6 +5262,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1}, { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1}, { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 1}, + { "_arrow_ExecPlan_ToStringWithSink", (DL_FUNC) &_arrow_ExecPlan_ToStringWithSink, 4}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index f2a5ebafbb7..ddeddd14916 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -130,6 +130,41 @@ std::string ExecPlan_ToString(const std::shared_ptr& plan) { return plan->ToString(); } +// [[arrow::export]] +std::string ExecPlan_ToStringWithSink( + const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, int64_t head = -1) { + // For now, don't require R to construct SinkNodes. + // Instead, just pass the node we should collect as an argument. + arrow::AsyncGenerator> sink_gen; + + // Sorting uses a different sink node; there is no general sort yet + if (sort_options.size() > 0) { + if (head >= 0) { + // Use the SelectK node to take only what we need + MakeExecNodeOrStop( + "select_k_sink", plan.get(), {final_node.get()}, + compute::SelectKSinkNodeOptions{ + arrow::compute::SelectKOptions( + head, std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)) + ->sort_keys), + &sink_gen}); + } else { + MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, + compute::OrderBySinkNodeOptions{ + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); + } + } else { + MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, + compute::SinkNodeOptions{&sink_gen}); + } + return plan->ToString(); +} + #if defined(ARROW_R_WITH_DATASET) #include From f4a1ba4330b9069c36f67b0f28b4341e117d0978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 11:35:31 +0100 Subject: [PATCH 28/80] define the `ToStringWithSink()` R6 ExecPlan method and use it in `show_exec_plan()` --- r/R/dplyr.R | 2 +- r/R/query-engine.R | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index c0c005c612a..136cb021e84 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -242,7 +242,7 @@ show_exec_plan <- function(x) { adq <- as_adq(x) plan <- ExecPlan$create() final_node <- plan$Build(adq) - cat(plan$ToString()) + cat(plan$ToStringWithSink(final_node)) invisible(x) } diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 9b6a053f5d3..776961debb6 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -260,6 +260,34 @@ ExecPlan <- R6Class("ExecPlan", ) }, ToString = function() ExecPlan_ToString(self), + ToStringWithSink = function(node) { + # final_node <- self$Build(.data) + # ExecPlan_ToStringWithSink(self, final_node) + assert_is(node, "ExecNode") + + # Sorting and head/tail (if sorted) are handled in the SinkNode, + # created in ExecPlan_run + sorting <- node$extras$sort %||% list() + select_k <- node$extras$head %||% -1L + has_sorting <- length(sorting) > 0 + if (has_sorting) { + if (!is.null(node$extras$tail)) { + # Reverse the sort order and take the top K, then after we'll reverse + # the resulting rows so that it is ordered as expected + sorting$orders <- !sorting$orders + select_k <- node$extras$tail + } + sorting$orders <- as.integer(sorting$orders) + } + + out <- ExecPlan_ToStringWithSink( + self, + node, + sorting, + select_k + ) + out + }, Stop = function() ExecPlan_StopProducing(self) ) ) From ac7d756e4f8ec50598b6896a805ae2bec41b49dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 11:47:04 +0100 Subject: [PATCH 29/80] updated unit tests --- r/tests/testthat/test-dplyr-query.R | 72 +++++++++++------------------ 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 83184f742a8..06ee3311e0e 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -682,13 +682,11 @@ test_that("show_exec_plan(), show_query() and explain()", { arrange(desc(wt)) %>% show_exec_plan(), regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "order_by.*", # there should be something in the output - # regarding arrange and its corresponding - # ExecNode ("order_by"), but it is missing - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node + "ProjectNode.*", # output columns + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point ) ) @@ -699,13 +697,11 @@ test_that("show_exec_plan(), show_query() and explain()", { arrange(desc(wt)) %>% show_query(), regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "order_by.*", # there should be something in the output - # regarding arrange and its corresponding - # ExecNode ("order_by"), but it is missing - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node + "ProjectNode.*", # output columns + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point ) ) @@ -716,13 +712,11 @@ test_that("show_exec_plan(), show_query() and explain()", { arrange(desc(wt)) %>% explain(), regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "order_by.*", # there should be something in the output - # regarding arrange and its corresponding - # ExecNode ("order_by"), but it is missing - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node + "ProjectNode.*", # output columns + "FilterNode.*", # the filter node + "TableSourceNode.*" # the entry point ) ) @@ -733,17 +727,13 @@ test_that("show_exec_plan(), show_query() and explain()", { arrange(desc(wt)) %>% head(3) %>% show_exec_plan(), + # for some reason the FilterNode disappears when head/tail are involved + + # we do not have additional information regarding the SinkNode regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # "ProjectNode.*", # output columns - "select_k.*", # there should be something in the output - # regarding head and its corresponding - # ExecNode ("select_k"), but it is missing - "order_by.*", # there should be something in the output - # regarding arrange and its corresponding - # ExecNode ("order_by"), but it is missing - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "SourceNode.*" # the entry point ) ) @@ -754,17 +744,13 @@ test_that("show_exec_plan(), show_query() and explain()", { arrange(desc(wt)) %>% head(3) %>% show_query(), + # for some reason the FilterNode disappears when head/tail are involved + + # we do not have additional information regarding the SinkNode regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # "ProjectNode.*", # output columns - "select_k.*", # there should be something in the output - # regarding head and its corresponding - # ExecNode ("select_k"), but it is missing - "order_by.*", # there should be something in the output - # regarding arrange and its corresponding - # ExecNode ("order_by"), but it is missing - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "SourceNode.*" # the entry point ) ) expect_output( @@ -774,17 +760,13 @@ test_that("show_exec_plan(), show_query() and explain()", { arrange(desc(wt)) %>% head(3) %>% explain(), + # for some reason the FilterNode disappears when head/tail are involved + + # we do not have additional information regarding the SinkNode regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # "ProjectNode.*", # output columns - "select_k.*", # there should be something in the output - # regarding head and its corresponding - # ExecNode ("select_k"), but it is missing - "order_by.*", # there should be something in the output - # regarding arrange and its corresponding - # ExecNode ("order_by"), but it is missing - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "SourceNode.*" # the entry point ) ) }) From 761b66d174a3bc9517b72fbbfd627ce83265e516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 11:53:46 +0100 Subject: [PATCH 30/80] comments --- r/R/query-engine.R | 5 +++-- r/tests/testthat/test-dplyr-query.R | 13 ++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 776961debb6..dbeb3f9e99b 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -260,9 +260,10 @@ ExecPlan <- R6Class("ExecPlan", ) }, ToString = function() ExecPlan_ToString(self), + # SinkNodes (involved in arrange and/or head/tail operations) are created in + # ExecPlan_run, so we take a similar approach to expose them before calling + # the print method ToStringWithSink = function(node) { - # final_node <- self$Build(.data) - # ExecPlan_ToStringWithSink(self, final_node) assert_is(node, "ExecNode") # Sorting and head/tail (if sorted) are handled in the SinkNode, diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 06ee3311e0e..a074ba6291f 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -728,7 +728,8 @@ test_that("show_exec_plan(), show_query() and explain()", { head(3) %>% show_exec_plan(), # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + # we do not have additional information regarding the SinkNode + + # the entry point is now a SourceNode and not a TableSourceNode regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "SinkNode.*", # @@ -745,12 +746,13 @@ test_that("show_exec_plan(), show_query() and explain()", { head(3) %>% show_query(), # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + # we do not have additional information regarding the SinkNode + + # the entry point is now a SourceNode and not a TableSourceNode regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "SinkNode.*", # "ProjectNode.*", # output columns - "SourceNode.*" # the entry point + "SourceNode.*" # the entry point ) ) expect_output( @@ -761,12 +763,13 @@ test_that("show_exec_plan(), show_query() and explain()", { head(3) %>% explain(), # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + # we do not have additional information regarding the SinkNode + + # the entry point is now a SourceNode and not a TableSourceNode regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "SinkNode.*", # "ProjectNode.*", # output columns - "SourceNode.*" # the entry point + "SourceNode.*" # the entry point ) ) }) From a36f826b97069e75e611e08d61297f212bb8767b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 11:56:28 +0100 Subject: [PATCH 31/80] clang style --- r/src/compute-exec.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index ddeddd14916..287f94f6c79 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -133,8 +133,8 @@ std::string ExecPlan_ToString(const std::shared_ptr& plan) { // [[arrow::export]] std::string ExecPlan_ToStringWithSink( const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, int64_t head = -1) { + const std::shared_ptr& final_node, cpp11::list sort_options, + int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; From d887d42227fe6c763bec916b0a7a4f0c83b13b66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 12:06:27 +0100 Subject: [PATCH 32/80] comment --- r/R/query-engine.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index dbeb3f9e99b..342f0c63e11 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -261,8 +261,8 @@ ExecPlan <- R6Class("ExecPlan", }, ToString = function() ExecPlan_ToString(self), # SinkNodes (involved in arrange and/or head/tail operations) are created in - # ExecPlan_run, so we take a similar approach to expose them before calling - # the print method + # ExecPlan_run and are not captured by the regulat print method. We take a + # similar approach to expose them before calling the print method. ToStringWithSink = function(node) { assert_is(node, "ExecNode") From effc96344480cd43b7ee997c73b4b1260c96d73d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 12:24:06 +0100 Subject: [PATCH 33/80] clang style --- r/src/compute-exec.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 287f94f6c79..5382235b22a 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -144,19 +144,19 @@ std::string ExecPlan_ToStringWithSink( if (head >= 0) { // Use the SelectK node to take only what we need MakeExecNodeOrStop( - "select_k_sink", plan.get(), {final_node.get()}, - compute::SelectKSinkNodeOptions{ - arrow::compute::SelectKOptions( - head, std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)) - ->sort_keys), - &sink_gen}); + "select_k_sink", plan.get(), {final_node.get()}, + compute::SelectKSinkNodeOptions{ + arrow::compute::SelectKOptions( + head, std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)) + ->sort_keys), + &sink_gen}); } else { MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, compute::OrderBySinkNodeOptions{ - *std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)), - &sink_gen}); + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); } } else { MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, From 4571df4b133e498af22bb8b7419e76ed039cd2e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 12:29:07 +0100 Subject: [PATCH 34/80] clang style --- r/src/compute-exec.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 5382235b22a..b34ed393282 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -146,10 +146,10 @@ std::string ExecPlan_ToStringWithSink( MakeExecNodeOrStop( "select_k_sink", plan.get(), {final_node.get()}, compute::SelectKSinkNodeOptions{ - arrow::compute::SelectKOptions( - head, std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)) - ->sort_keys), + arrow::compute::SelectKOptions( + head, std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)) + ->sort_keys), &sink_gen}); } else { MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, From d1e4ce7ac3ec352a7ba8f17baf0c1910be29c72e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 12:45:09 +0100 Subject: [PATCH 35/80] clang style --- r/src/compute-exec.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index b34ed393282..2362f460d0a 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -148,9 +148,9 @@ std::string ExecPlan_ToStringWithSink( compute::SelectKSinkNodeOptions{ arrow::compute::SelectKOptions( head, std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)) - ->sort_keys), - &sink_gen}); + make_compute_options("sort_indices", sort_options)) + ->sort_keys), + &sink_gen}); } else { MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, compute::OrderBySinkNodeOptions{ From e2220fe164059df5ba130cd26374c26ac59d9e44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 12:57:00 +0100 Subject: [PATCH 36/80] update dataset tests --- r/tests/testthat/test-dataset-dplyr.R | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 81c77c6330a..b0bd61653fb 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -394,14 +394,12 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { arrange(chr) %>% show_exec_plan(), regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "order_by.*", # there should be something in the output - # regarding arrange and its corresponding - # ExecNode ("order_by"), but it is missing - "FilterNode*", # filter node - "filter=lgl.*", # filtering expression - "SourceNode" # entry point + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*chr.*ASC.*", # arrange goes via the OrderBy sink node + "ProjectNode.*", # output columns + "FilterNode.*", # filter node + "filter=lgl.*", # filtering expression + "SourceNode" # entry point ) ) @@ -411,17 +409,12 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { arrange(chr) %>% head() %>% show_exec_plan(), + # for some reason the FilterNode disappears when head/tail are involved + + # we do not have additional information regarding the SinkNode regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # "ProjectNode.*", # output columns - "select_k.*", # there should be something in the output - # regarding head and its corresponding - # ExecNode ("select_k"), but it is missing - "order_by.*", # there should be something in the output - # regarding arrange, maybe "order_by", but - # it is missing - "FilterNode*", # filter node - "filter=lgl.*", # filtering expression "SourceNode" # entry point ) ) From 3c96fb7a93df7118399ae90791cd6421a58fe433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 14:03:30 +0100 Subject: [PATCH 37/80] comment --- r/tests/testthat/test-dplyr-query.R | 56 ++++++++++++++--------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index a074ba6291f..a2096b6c7ee 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -486,7 +486,7 @@ test_that("show_exec_plan(), show_query() and explain()", { "FilterNode.*", # filter node "(dbl > 2).*", # filter expressions "chr != \"e\".*", - "TableSourceNode" # entry point + "TableSourceNode" # entry point ) ) @@ -504,7 +504,7 @@ test_that("show_exec_plan(), show_query() and explain()", { "FilterNode.*", # filter node "(dbl > 2).*", # filter expressions "chr != \"e\".*", - "TableSourceNode" # entry point + "TableSourceNode" # entry point ) ) @@ -522,7 +522,7 @@ test_that("show_exec_plan(), show_query() and explain()", { "FilterNode.*", # filter node "(dbl > 2).*", # filter expressions "chr != \"e\".*", - "TableSourceNode" # entry point + "TableSourceNode" # entry point ) ) @@ -539,7 +539,7 @@ test_that("show_exec_plan(), show_query() and explain()", { "chr, int, lgl, \"int_plus_ten\".*", # selected columns "(dbl > 2).*", # the filter expressions "chr != \"e\".*", - "TableSourceNode" # the entry point" + "TableSourceNode" # the entry point" ) ) @@ -592,11 +592,11 @@ test_that("show_exec_plan(), show_query() and explain()", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns - "GroupByNode.*", # the group_by statement - "keys=.*lgl.*", # the key for the aggregations - "aggregates=.*hash_mean.*avg.*", # the aggregations - "ProjectNode.*", # the input columns - "TableSourceNode" # the entry point + "GroupByNode.*", # group_by statement + "keys=.*lgl.*", # key for the aggregations + "aggregates=.*hash_mean.*avg.*", # aggregations + "ProjectNode.*", # input columns + "TableSourceNode" # entry point ) ) @@ -618,10 +618,10 @@ test_that("show_exec_plan(), show_query() and explain()", { "ProjectNode.*", # output columns "HashJoinNode.*", # the join "ProjectNode.*", # input columns for the second table - "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate - "TableSourceNode.*", # the second table + "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # mutate + "TableSourceNode.*", # second table "ProjectNode.*", # input columns for the first table - "TableSourceNode" # the first table + "TableSourceNode" # first table ) ) @@ -641,12 +641,12 @@ test_that("show_exec_plan(), show_query() and explain()", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns - "HashJoinNode.*", # the join + "HashJoinNode.*", # join "ProjectNode.*", # input columns for the second table "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate - "TableSourceNode.*", # the second table + "TableSourceNode.*", # second table "ProjectNode.*", # input columns for the first table - "TableSourceNode" # the first table + "TableSourceNode" # first table ) ) @@ -666,12 +666,12 @@ test_that("show_exec_plan(), show_query() and explain()", { regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns - "HashJoinNode.*", # the join + "HashJoinNode.*", # join "ProjectNode.*", # input columns for the second table - "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate - "TableSourceNode.*", # the second table + "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # mutate + "TableSourceNode.*", # second table "ProjectNode.*", # input columns for the first table - "TableSourceNode" # the first table + "TableSourceNode" # first table ) ) @@ -685,8 +685,8 @@ test_that("show_exec_plan(), show_query() and explain()", { "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node "ProjectNode.*", # output columns - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "FilterNode.*", # filter node + "TableSourceNode.*" # entry point ) ) @@ -700,8 +700,8 @@ test_that("show_exec_plan(), show_query() and explain()", { "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node "ProjectNode.*", # output columns - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "FilterNode.*", # filter node + "TableSourceNode.*" # entry point ) ) @@ -715,8 +715,8 @@ test_that("show_exec_plan(), show_query() and explain()", { "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node "ProjectNode.*", # output columns - "FilterNode.*", # the filter node - "TableSourceNode.*" # the entry point + "FilterNode.*", # filter node + "TableSourceNode.*" # entry point ) ) @@ -734,7 +734,7 @@ test_that("show_exec_plan(), show_query() and explain()", { "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "SinkNode.*", # "ProjectNode.*", # output columns - "SourceNode.*" # the entry point + "SourceNode.*" # entry point ) ) @@ -752,7 +752,7 @@ test_that("show_exec_plan(), show_query() and explain()", { "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "SinkNode.*", # "ProjectNode.*", # output columns - "SourceNode.*" # the entry point + "SourceNode.*" # entry point ) ) expect_output( @@ -769,7 +769,7 @@ test_that("show_exec_plan(), show_query() and explain()", { "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "SinkNode.*", # "ProjectNode.*", # output columns - "SourceNode.*" # the entry point + "SourceNode.*" # entry point ) ) }) From 068dc6d0ee4db197568a4b56526b266e978fcb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 15:03:29 +0100 Subject: [PATCH 38/80] bump ci From fb2a5a00fc24d16da8e0a596256e0c18b02f5289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 18 Jul 2022 17:31:50 +0100 Subject: [PATCH 39/80] don't run the cyclocomp linter on the ExecPlan R6 definition --- r/R/query-engine.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 342f0c63e11..63904821389 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# nolint start: cyclocomp_linter, ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, public = list( @@ -292,6 +293,7 @@ ExecPlan <- R6Class("ExecPlan", Stop = function() ExecPlan_StopProducing(self) ) ) +# nolint end. ExecPlan$create <- function(use_threads = option_use_threads()) { ExecPlan_create(use_threads) } From c14a497435e313d84b735ac5dd358ced45214503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 09:10:02 +0100 Subject: [PATCH 40/80] typo --- r/R/query-engine.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 63904821389..94bffa41e94 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -262,7 +262,7 @@ ExecPlan <- R6Class("ExecPlan", }, ToString = function() ExecPlan_ToString(self), # SinkNodes (involved in arrange and/or head/tail operations) are created in - # ExecPlan_run and are not captured by the regulat print method. We take a + # ExecPlan_run and are not captured by the regular print method. We take a # similar approach to expose them before calling the print method. ToStringWithSink = function(node) { assert_is(node, "ExecNode") From aac9edc1b94d16e4ea42b79b32fe269fa4efefc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 09:10:36 +0100 Subject: [PATCH 41/80] `&&` --- r/R/dplyr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 136cb021e84..d4b8ca03c20 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -231,7 +231,7 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' @return `x`, invisibly. #' @export #' -#' @examplesIf arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE) +#' @examplesIf arrow_with_dataset() && requireNamespace("dplyr", quietly = TRUE) #' library(dplyr) #' mtcars %>% #' arrow_table() %>% From 3faecc650b528c97d22449b682adda878054545e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 09:15:30 +0100 Subject: [PATCH 42/80] rename `ToStrinWithSink` to `ToString` and keep only this method --- r/R/arrowExports.R | 8 ++------ r/R/dplyr.R | 2 +- r/R/query-engine.R | 5 ++--- r/src/arrowExports.cpp | 17 ++++------------- r/src/compute-exec.cpp | 7 +------ 5 files changed, 10 insertions(+), 29 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 8d56173784b..136bd70988c 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -416,12 +416,8 @@ ExecNode_output_schema <- function(node) { .Call(`_arrow_ExecNode_output_schema`, node) } -ExecPlan_ToString <- function(plan) { - .Call(`_arrow_ExecPlan_ToString`, plan) -} - -ExecPlan_ToStringWithSink <- function(plan, final_node, sort_options, head) { - .Call(`_arrow_ExecPlan_ToStringWithSink`, plan, final_node, sort_options, head) +ExecPlan_ToString <- function(plan, final_node, sort_options, head) { + .Call(`_arrow_ExecPlan_ToString`, plan, final_node, sort_options, head) } ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { diff --git a/r/R/dplyr.R b/r/R/dplyr.R index d4b8ca03c20..2ad07c3b869 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -242,7 +242,7 @@ show_exec_plan <- function(x) { adq <- as_adq(x) plan <- ExecPlan$create() final_node <- plan$Build(adq) - cat(plan$ToStringWithSink(final_node)) + cat(plan$ToString(final_node)) invisible(x) } diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 94bffa41e94..dc2f532ca98 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -260,11 +260,10 @@ ExecPlan <- R6Class("ExecPlan", ... ) }, - ToString = function() ExecPlan_ToString(self), # SinkNodes (involved in arrange and/or head/tail operations) are created in # ExecPlan_run and are not captured by the regular print method. We take a # similar approach to expose them before calling the print method. - ToStringWithSink = function(node) { + ToString = function(node) { assert_is(node, "ExecNode") # Sorting and head/tail (if sorted) are handled in the SinkNode, @@ -282,7 +281,7 @@ ExecPlan <- R6Class("ExecPlan", sorting$orders <- as.integer(sorting$orders) } - out <- ExecPlan_ToStringWithSink( + out <- ExecPlan_ToString( self, node, sorting, diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index ffae6999746..ef82afd0291 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -898,22 +898,14 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp -std::string ExecPlan_ToString(const std::shared_ptr& plan); -extern "C" SEXP _arrow_ExecPlan_ToString(SEXP plan_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type plan(plan_sexp); - return cpp11::as_sexp(ExecPlan_ToString(plan)); -END_CPP11 -} -// compute-exec.cpp -std::string ExecPlan_ToStringWithSink(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, int64_t head); -extern "C" SEXP _arrow_ExecPlan_ToStringWithSink(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP head_sexp){ +std::string ExecPlan_ToString(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, int64_t head); +extern "C" SEXP _arrow_ExecPlan_ToString(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP head_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); arrow::r::Input&>::type final_node(final_node_sexp); arrow::r::Input::type sort_options(sort_options_sexp); arrow::r::Input::type head(head_sexp); - return cpp11::as_sexp(ExecPlan_ToStringWithSink(plan, final_node, sort_options, head)); + return cpp11::as_sexp(ExecPlan_ToString(plan, final_node, sort_options, head)); END_CPP11 } // compute-exec.cpp @@ -5261,8 +5253,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 5}, { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1}, { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1}, - { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 1}, - { "_arrow_ExecPlan_ToStringWithSink", (DL_FUNC) &_arrow_ExecPlan_ToStringWithSink, 4}, + { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 4}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 2362f460d0a..56ca0307d5e 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -126,12 +126,7 @@ std::shared_ptr ExecNode_output_schema( } // [[arrow::export]] -std::string ExecPlan_ToString(const std::shared_ptr& plan) { - return plan->ToString(); -} - -// [[arrow::export]] -std::string ExecPlan_ToStringWithSink( +std::string ExecPlan_ToString( const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, int64_t head = -1) { From 1afcce98340b65cad6fbc7dc3db44b0d328d7dce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 09:17:00 +0100 Subject: [PATCH 43/80] docs --- r/man/show_exec_plan.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/man/show_exec_plan.Rd b/r/man/show_exec_plan.Rd index 63ed4a7ec66..7c4374520a6 100644 --- a/r/man/show_exec_plan.Rd +++ b/r/man/show_exec_plan.Rd @@ -19,7 +19,7 @@ It calls the C++ \code{ExecPlan} object's print method. Functionally, it is similar to \code{dplyr::explain()}. } \examples{ -\dontshow{if (arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (arrow_with_dataset() && requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(dplyr) mtcars \%>\% arrow_table() \%>\% From 1de34471bfcbb5cd85069495ea5090a8cf5f6834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 09:24:02 +0100 Subject: [PATCH 44/80] C++ lint --- r/src/compute-exec.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 56ca0307d5e..a650c32da8f 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -126,10 +126,9 @@ std::shared_ptr ExecNode_output_schema( } // [[arrow::export]] -std::string ExecPlan_ToString( - const std::shared_ptr& plan, - const std::shared_ptr& final_node, cpp11::list sort_options, - int64_t head = -1) { +std::string ExecPlan_ToString(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; From f40e5d289553543ce070ac67bbcf565b289548b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 10:28:23 +0100 Subject: [PATCH 45/80] add `ExecPlan_prepare` helper and update `ExecPlan_ToString` C++ functions --- r/R/arrowExports.R | 4 ++-- r/src/arrowExports.cpp | 9 +++++---- r/src/compute-exec.cpp | 43 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 136bd70988c..fb53308b83d 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -416,8 +416,8 @@ ExecNode_output_schema <- function(node) { .Call(`_arrow_ExecNode_output_schema`, node) } -ExecPlan_ToString <- function(plan, final_node, sort_options, head) { - .Call(`_arrow_ExecPlan_ToString`, plan, final_node, sort_options, head) +ExecPlan_ToString <- function(plan, final_node, sort_options, metadata, head) { + .Call(`_arrow_ExecPlan_ToString`, plan, final_node, sort_options, metadata, head) } ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index ef82afd0291..e168bdf4978 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -898,14 +898,15 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp -std::string ExecPlan_ToString(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, int64_t head); -extern "C" SEXP _arrow_ExecPlan_ToString(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP head_sexp){ +std::string ExecPlan_ToString(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head); +extern "C" SEXP _arrow_ExecPlan_ToString(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP metadata_sexp, SEXP head_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); arrow::r::Input&>::type final_node(final_node_sexp); arrow::r::Input::type sort_options(sort_options_sexp); + arrow::r::Input::type metadata(metadata_sexp); arrow::r::Input::type head(head_sexp); - return cpp11::as_sexp(ExecPlan_ToString(plan, final_node, sort_options, head)); + return cpp11::as_sexp(ExecPlan_ToString(plan, final_node, sort_options, metadata, head)); END_CPP11 } // compute-exec.cpp @@ -5253,7 +5254,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 5}, { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1}, { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1}, - { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 4}, + { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 5}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index a650c32da8f..db41c48ef7d 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -125,10 +125,10 @@ std::shared_ptr ExecNode_output_schema( return node->output_schema(); } -// [[arrow::export]] -std::string ExecPlan_ToString(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, int64_t head = -1) { +std::pair, std::shared_ptr> +ExecPlan_prepare(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; @@ -156,7 +156,40 @@ std::string ExecPlan_ToString(const std::shared_ptr& plan, MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, compute::SinkNodeOptions{&sink_gen}); } - return plan->ToString(); + StopIfNotOk(plan->Validate()); + + // If the generator is destroyed before being completely drained, inform plan + std::shared_ptr stop_producing{nullptr, [plan](...) { + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); + if (not_finished_yet) { + plan->StopProducing(); + } + }}; + // Attach metadata to the schema + auto out_schema = final_node->output_schema(); + if (metadata.size() > 0) { + auto kv = strings_to_kvm(metadata); + out_schema = out_schema->WithMetadata(kv); + } + + std::pair, std::shared_ptr> + out; + out.first = plan; + out.second = compute::MakeGeneratorReader( + out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, + gc_memory_pool()); + return out; +} + +// [[arrow::export]] +std::string ExecPlan_ToString(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { + auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); + return prepared_plan.first->ToString(); } #if defined(ARROW_R_WITH_DATASET) From 8aae3c37bceccb460d77e3aaf9a12aac8e215f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 10:28:50 +0100 Subject: [PATCH 46/80] update ToString R6 ExecPlan method --- r/R/query-engine.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index dc2f532ca98..00d79a416a4 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -285,6 +285,7 @@ ExecPlan <- R6Class("ExecPlan", self, node, sorting, + prepare_key_value_metadata(node$final_metadata()), select_k ) out From fae74aeaa7f4588658f547f5a3d02e11edb7f147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 10:45:07 +0100 Subject: [PATCH 47/80] c++ lint --- r/src/compute-exec.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index db41c48ef7d..472b8692617 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -126,9 +126,9 @@ std::shared_ptr ExecNode_output_schema( } std::pair, std::shared_ptr> -ExecPlan_prepare(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { + ExecPlan_prepare(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; @@ -160,14 +160,14 @@ ExecPlan_prepare(const std::shared_ptr& plan, // If the generator is destroyed before being completely drained, inform plan std::shared_ptr stop_producing{nullptr, [plan](...) { - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); - if (not_finished_yet) { - plan->StopProducing(); - } - }}; + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); + if (not_finished_yet) { + plan->StopProducing(); + } + }}; // Attach metadata to the schema auto out_schema = final_node->output_schema(); if (metadata.size() > 0) { From ebd4a02e7df6cc7da791a3bd728495966fdf2579 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 10:59:37 +0100 Subject: [PATCH 48/80] C++ lint --- r/src/compute-exec.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 472b8692617..b7a3a061825 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -161,9 +161,9 @@ std::pair, std::shared_ptr stop_producing{nullptr, [plan](...) { bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); if (not_finished_yet) { plan->StopProducing(); } @@ -176,18 +176,19 @@ std::pair, std::shared_ptr, std::shared_ptr> - out; + out; out.first = plan; out.second = compute::MakeGeneratorReader( - out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, - gc_memory_pool()); + out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, + gc_memory_pool()); return out; } // [[arrow::export]] std::string ExecPlan_ToString(const std::shared_ptr& plan, const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { + cpp11::list sort_options, cpp11::strings metadata, + int64_t head = -1) { auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); return prepared_plan.first->ToString(); } From 311b70b364349ed0189cf250f21e4ce6337e9a18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 11:18:04 +0100 Subject: [PATCH 49/80] another C++ lint --- r/src/compute-exec.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index b7a3a061825..174cf1d38a2 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -126,9 +126,9 @@ std::shared_ptr ExecNode_output_schema( } std::pair, std::shared_ptr> - ExecPlan_prepare(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { +ExecPlan_prepare(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; From d0bc159f70435ebb10807aa2861cd09f7db43e76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 13:14:10 +0100 Subject: [PATCH 50/80] simpler ExecPlan_prepare --- r/src/compute-exec.cpp | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 174cf1d38a2..27c44408cd3 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -125,7 +125,7 @@ std::shared_ptr ExecNode_output_schema( return node->output_schema(); } -std::pair, std::shared_ptr> +std::shared_ptr ExecPlan_prepare(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { @@ -156,32 +156,7 @@ ExecPlan_prepare(const std::shared_ptr& plan, MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, compute::SinkNodeOptions{&sink_gen}); } - StopIfNotOk(plan->Validate()); - - // If the generator is destroyed before being completely drained, inform plan - std::shared_ptr stop_producing{nullptr, [plan](...) { - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); - if (not_finished_yet) { - plan->StopProducing(); - } - }}; - // Attach metadata to the schema - auto out_schema = final_node->output_schema(); - if (metadata.size() > 0) { - auto kv = strings_to_kvm(metadata); - out_schema = out_schema->WithMetadata(kv); - } - - std::pair, std::shared_ptr> - out; - out.first = plan; - out.second = compute::MakeGeneratorReader( - out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, - gc_memory_pool()); - return out; + return plan; } // [[arrow::export]] @@ -190,7 +165,7 @@ std::string ExecPlan_ToString(const std::shared_ptr& plan, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - return prepared_plan.first->ToString(); + return prepared_plan->ToString(); } #if defined(ARROW_R_WITH_DATASET) From d7834b09f1338d7a758b1c982c291562b76e9b59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 13:44:11 +0100 Subject: [PATCH 51/80] C++ lint --- r/src/compute-exec.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 27c44408cd3..83f9f019933 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -125,10 +125,10 @@ std::shared_ptr ExecNode_output_schema( return node->output_schema(); } -std::shared_ptr -ExecPlan_prepare(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { +std::shared_ptr ExecPlan_prepare( + const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; From 9a026511a9bbd5fdb2e84e5ab4da9b0f6976815c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 13:50:49 +0100 Subject: [PATCH 52/80] more C++ linting --- r/src/compute-exec.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 83f9f019933..ba265d13b4f 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -127,8 +127,8 @@ std::shared_ptr ExecNode_output_schema( std::shared_ptr ExecPlan_prepare( const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { + const std::shared_ptr& final_node, cpp11::list sort_options, + cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; From ac37a9dbfa29972668cfa27d0c0a5fe4ec2aa952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 15:51:55 +0100 Subject: [PATCH 53/80] extend ExecPlan_prepare and use it for ExecPlan_run --- r/src/compute-exec.cpp | 104 ++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 63 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index ba265d13b4f..6c69583225c 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -55,11 +55,10 @@ std::shared_ptr MakeExecNodeOrStop( }); } -// [[arrow::export]] -std::shared_ptr ExecPlan_run( - const std::shared_ptr& plan, - const std::shared_ptr& final_node, cpp11::list sort_options, - cpp11::strings metadata, int64_t head = -1) { +std::pair, std::shared_ptr> +ExecPlan_prepare(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; @@ -69,39 +68,37 @@ std::shared_ptr ExecPlan_run( if (head >= 0) { // Use the SelectK node to take only what we need MakeExecNodeOrStop( - "select_k_sink", plan.get(), {final_node.get()}, - compute::SelectKSinkNodeOptions{ - arrow::compute::SelectKOptions( - head, std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)) - ->sort_keys), - &sink_gen}); + "select_k_sink", plan.get(), {final_node.get()}, + compute::SelectKSinkNodeOptions{ + arrow::compute::SelectKOptions( + head, std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)) + ->sort_keys), + &sink_gen}); } else { MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, compute::OrderBySinkNodeOptions{ - *std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)), - &sink_gen}); + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); } } else { MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, compute::SinkNodeOptions{&sink_gen}); } - StopIfNotOk(plan->Validate()); - StopIfNotOk(plan->StartProducing()); // If the generator is destroyed before being completely drained, inform plan std::shared_ptr stop_producing{nullptr, [plan](...) { - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); - if (not_finished_yet) { - plan->StopProducing(); - } - }}; + if (not_finished_yet) { + plan->StopProducing(); + } + }}; // Attach metadata to the schema auto out_schema = final_node->output_schema(); @@ -109,9 +106,24 @@ std::shared_ptr ExecPlan_run( auto kv = strings_to_kvm(metadata); out_schema = out_schema->WithMetadata(kv); } - return compute::MakeGeneratorReader( - out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, - gc_memory_pool()); + + std::pair, std::shared_ptr> + out; + out.first = plan; + out.second = compute::MakeGeneratorReader( + out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, + gc_memory_pool()); + return out; +} + +// [[arrow::export]] +std::shared_ptr ExecPlan_run( + const std::shared_ptr& plan, + const std::shared_ptr& final_node, cpp11::list sort_options, + cpp11::strings metadata, int64_t head = -1) { + auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); + StopIfNotOk(prepared_plan.first->StartProducing()); + return prepared_plan.second; } // [[arrow::export]] @@ -125,47 +137,13 @@ std::shared_ptr ExecNode_output_schema( return node->output_schema(); } -std::shared_ptr ExecPlan_prepare( - const std::shared_ptr& plan, - const std::shared_ptr& final_node, cpp11::list sort_options, - cpp11::strings metadata, int64_t head = -1) { - // For now, don't require R to construct SinkNodes. - // Instead, just pass the node we should collect as an argument. - arrow::AsyncGenerator> sink_gen; - - // Sorting uses a different sink node; there is no general sort yet - if (sort_options.size() > 0) { - if (head >= 0) { - // Use the SelectK node to take only what we need - MakeExecNodeOrStop( - "select_k_sink", plan.get(), {final_node.get()}, - compute::SelectKSinkNodeOptions{ - arrow::compute::SelectKOptions( - head, std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)) - ->sort_keys), - &sink_gen}); - } else { - MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, - compute::OrderBySinkNodeOptions{ - *std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)), - &sink_gen}); - } - } else { - MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, - compute::SinkNodeOptions{&sink_gen}); - } - return plan; -} - // [[arrow::export]] std::string ExecPlan_ToString(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - return prepared_plan->ToString(); + return prepared_plan.first->ToString(); } #if defined(ARROW_R_WITH_DATASET) From 238e4062c45b800b10e3b33219a7db1dbbe1d5b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 16:03:18 +0100 Subject: [PATCH 54/80] C++ lint --- r/src/compute-exec.cpp | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 6c69583225c..114b51ca755 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -68,19 +68,19 @@ ExecPlan_prepare(const std::shared_ptr& plan, if (head >= 0) { // Use the SelectK node to take only what we need MakeExecNodeOrStop( - "select_k_sink", plan.get(), {final_node.get()}, - compute::SelectKSinkNodeOptions{ - arrow::compute::SelectKOptions( - head, std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)) - ->sort_keys), - &sink_gen}); + "select_k_sink", plan.get(), {final_node.get()}, + compute::SelectKSinkNodeOptions{ + arrow::compute::SelectKOptions( + head, std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)) + ->sort_keys), + &sink_gen}); } else { MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, compute::OrderBySinkNodeOptions{ - *std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)), - &sink_gen}); + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); } } else { MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, @@ -90,15 +90,15 @@ ExecPlan_prepare(const std::shared_ptr& plan, // If the generator is destroyed before being completely drained, inform plan std::shared_ptr stop_producing{nullptr, [plan](...) { - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); - if (not_finished_yet) { - plan->StopProducing(); - } - }}; + if (not_finished_yet) { + plan->StopProducing(); + } + }}; // Attach metadata to the schema auto out_schema = final_node->output_schema(); @@ -108,11 +108,11 @@ ExecPlan_prepare(const std::shared_ptr& plan, } std::pair, std::shared_ptr> - out; + out; out.first = plan; out.second = compute::MakeGeneratorReader( - out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, - gc_memory_pool()); + out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, + gc_memory_pool()); return out; } From 9a34255307522652f14e31c70bcf61c28b8476d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 16:10:32 +0100 Subject: [PATCH 55/80] C++ lint --- r/src/compute-exec.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 114b51ca755..d549eba6538 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -90,15 +90,15 @@ ExecPlan_prepare(const std::shared_ptr& plan, // If the generator is destroyed before being completely drained, inform plan std::shared_ptr stop_producing{nullptr, [plan](...) { - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); - - if (not_finished_yet) { - plan->StopProducing(); - } - }}; + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); + + if (not_finished_yet) { + plan->StopProducing(); + } + }}; // Attach metadata to the schema auto out_schema = final_node->output_schema(); From 1db36b621a73a1574f1f0ea8e15dd779209fe12f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 16:29:01 +0100 Subject: [PATCH 56/80] include safe call into R header file --- r/src/compute-exec.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index d549eba6538..4d356a78afd 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -16,6 +16,7 @@ // under the License. #include "./arrow_types.h" +#include "./safe-call-into-r.h" #include #include From 215345156f35a1fbe50ed4a9fb1fd69b07365b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 17:03:33 +0100 Subject: [PATCH 57/80] revert to old ExecPlan_run --- r/src/compute-exec.cpp | 56 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 4d356a78afd..485f12bc7a2 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -16,7 +16,6 @@ // under the License. #include "./arrow_types.h" -#include "./safe-call-into-r.h" #include #include @@ -122,9 +121,58 @@ std::shared_ptr ExecPlan_run( const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { - auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - StopIfNotOk(prepared_plan.first->StartProducing()); - return prepared_plan.second; + // For now, don't require R to construct SinkNodes. + // Instead, just pass the node we should collect as an argument. + arrow::AsyncGenerator> sink_gen; + + // Sorting uses a different sink node; there is no general sort yet + if (sort_options.size() > 0) { + if (head >= 0) { + // Use the SelectK node to take only what we need + MakeExecNodeOrStop( + "select_k_sink", plan.get(), {final_node.get()}, + compute::SelectKSinkNodeOptions{ + arrow::compute::SelectKOptions( + head, std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)) + ->sort_keys), + &sink_gen}); + } else { + MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, + compute::OrderBySinkNodeOptions{ + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); + } + } else { + MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, + compute::SinkNodeOptions{&sink_gen}); + } + + StopIfNotOk(plan->Validate()); + StopIfNotOk(plan->StartProducing()); + + // If the generator is destroyed before being completely drained, inform plan + std::shared_ptr stop_producing{nullptr, [plan](...) { + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); + + if (not_finished_yet) { + plan->StopProducing(); + } + }}; + + // Attach metadata to the schema + auto out_schema = final_node->output_schema(); + if (metadata.size() > 0) { + auto kv = strings_to_kvm(metadata); + out_schema = out_schema->WithMetadata(kv); + } + return compute::MakeGeneratorReader( + out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, + gc_memory_pool()); } // [[arrow::export]] From 807aa88c3d2f47dce91bac3bcb57a8f6a053f2bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 17:14:53 +0100 Subject: [PATCH 58/80] c++ lint --- r/src/compute-exec.cpp | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 485f12bc7a2..334da56bcf1 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -130,19 +130,19 @@ std::shared_ptr ExecPlan_run( if (head >= 0) { // Use the SelectK node to take only what we need MakeExecNodeOrStop( - "select_k_sink", plan.get(), {final_node.get()}, - compute::SelectKSinkNodeOptions{ - arrow::compute::SelectKOptions( - head, std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)) - ->sort_keys), - &sink_gen}); + "select_k_sink", plan.get(), {final_node.get()}, + compute::SelectKSinkNodeOptions{ + arrow::compute::SelectKOptions( + head, std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)) + ->sort_keys), + &sink_gen}); } else { MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, compute::OrderBySinkNodeOptions{ - *std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)), - &sink_gen}); + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); } } else { MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, @@ -154,15 +154,15 @@ std::shared_ptr ExecPlan_run( // If the generator is destroyed before being completely drained, inform plan std::shared_ptr stop_producing{nullptr, [plan](...) { - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); - if (not_finished_yet) { - plan->StopProducing(); - } - }}; + if (not_finished_yet) { + plan->StopProducing(); + } + }}; // Attach metadata to the schema auto out_schema = final_node->output_schema(); @@ -171,8 +171,8 @@ std::shared_ptr ExecPlan_run( out_schema = out_schema->WithMetadata(kv); } return compute::MakeGeneratorReader( - out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, - gc_memory_pool()); + out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, + gc_memory_pool()); } // [[arrow::export]] From 6141c457f25db2093a43cb96d53bc0e005a39c40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 19 Jul 2022 17:31:03 +0100 Subject: [PATCH 59/80] simplified ExecPlan_prepare --- r/src/compute-exec.cpp | 39 ++++++--------------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 334da56bcf1..1660fec56ee 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -55,10 +55,10 @@ std::shared_ptr MakeExecNodeOrStop( }); } -std::pair, std::shared_ptr> -ExecPlan_prepare(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { +std::shared_ptr ExecPlan_prepare( + const std::shared_ptr& plan, + const std::shared_ptr& final_node, cpp11::list sort_options, + cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; @@ -86,34 +86,7 @@ ExecPlan_prepare(const std::shared_ptr& plan, MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, compute::SinkNodeOptions{&sink_gen}); } - StopIfNotOk(plan->Validate()); - - // If the generator is destroyed before being completely drained, inform plan - std::shared_ptr stop_producing{nullptr, [plan](...) { - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); - - if (not_finished_yet) { - plan->StopProducing(); - } - }}; - - // Attach metadata to the schema - auto out_schema = final_node->output_schema(); - if (metadata.size() > 0) { - auto kv = strings_to_kvm(metadata); - out_schema = out_schema->WithMetadata(kv); - } - - std::pair, std::shared_ptr> - out; - out.first = plan; - out.second = compute::MakeGeneratorReader( - out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, - gc_memory_pool()); - return out; + return plan; } // [[arrow::export]] @@ -192,7 +165,7 @@ std::string ExecPlan_ToString(const std::shared_ptr& plan, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - return prepared_plan.first->ToString(); + return prepared_plan->ToString(); } #if defined(ARROW_R_WITH_DATASET) From 9af14034255cc709013ca6bf83b0481049481250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Jul 2022 13:51:32 +0100 Subject: [PATCH 60/80] `ExecPlan_prepare` to return both the plan and the sink node --- r/src/compute-exec.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 1660fec56ee..2f585b97ec9 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -55,19 +55,20 @@ std::shared_ptr MakeExecNodeOrStop( }); } -std::shared_ptr ExecPlan_prepare( - const std::shared_ptr& plan, - const std::shared_ptr& final_node, cpp11::list sort_options, - cpp11::strings metadata, int64_t head = -1) { +std::pair, std::shared_ptr> +ExecPlan_prepare(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; + std::shared_ptr node; // Sorting uses a different sink node; there is no general sort yet if (sort_options.size() > 0) { if (head >= 0) { // Use the SelectK node to take only what we need - MakeExecNodeOrStop( + node = MakeExecNodeOrStop( "select_k_sink", plan.get(), {final_node.get()}, compute::SelectKSinkNodeOptions{ arrow::compute::SelectKOptions( @@ -76,17 +77,22 @@ std::shared_ptr ExecPlan_prepare( ->sort_keys), &sink_gen}); } else { - MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, + node = MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, compute::OrderBySinkNodeOptions{ *std::dynamic_pointer_cast( make_compute_options("sort_indices", sort_options)), &sink_gen}); } } else { - MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, + node = MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, compute::SinkNodeOptions{&sink_gen}); } - return plan; + + std::pair, std::shared_ptr> + out; + out.first = plan; + out.second = node; + return out; } // [[arrow::export]] @@ -165,7 +171,7 @@ std::string ExecPlan_ToString(const std::shared_ptr& plan, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - return prepared_plan->ToString(); + return prepared_plan.first->ToString(); } #if defined(ARROW_R_WITH_DATASET) From 1f88ff71af85b36cd966dba994673aae80206a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 20 Jul 2022 16:23:30 +0100 Subject: [PATCH 61/80] C++ lint --- r/src/compute-exec.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 2f585b97ec9..f93bf1bd00c 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -77,19 +77,19 @@ ExecPlan_prepare(const std::shared_ptr& plan, ->sort_keys), &sink_gen}); } else { - node = MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, - compute::OrderBySinkNodeOptions{ - *std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)), - &sink_gen}); + node = + MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, + compute::OrderBySinkNodeOptions{ + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); } } else { node = MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, - compute::SinkNodeOptions{&sink_gen}); + compute::SinkNodeOptions{&sink_gen}); } - std::pair, std::shared_ptr> - out; + std::pair, std::shared_ptr> out; out.first = plan; out.second = node; return out; From fbb4c1ee13b303e59e7631ec1770d41785fbed0e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 20 Jul 2022 14:50:51 -0300 Subject: [PATCH 62/80] fix the rabbit hole that I led Dragos down --- r/R/arrowExports.R | 4 +-- r/R/dplyr.R | 2 +- r/R/query-engine.R | 5 ++- r/src/arrowExports.cpp | 8 ++--- r/src/compute-exec.cpp | 69 +++++++++++++----------------------------- 5 files changed, 30 insertions(+), 58 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index fb53308b83d..c73d8fd97bd 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -416,8 +416,8 @@ ExecNode_output_schema <- function(node) { .Call(`_arrow_ExecNode_output_schema`, node) } -ExecPlan_ToString <- function(plan, final_node, sort_options, metadata, head) { - .Call(`_arrow_ExecPlan_ToString`, plan, final_node, sort_options, metadata, head) +ExecPlan_BuildAndShow <- function(plan, final_node, sort_options, metadata, head) { + .Call(`_arrow_ExecPlan_BuildAndShow`, plan, final_node, sort_options, metadata, head) } ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 2ad07c3b869..e888c2a30fd 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -242,7 +242,7 @@ show_exec_plan <- function(x) { adq <- as_adq(x) plan <- ExecPlan$create() final_node <- plan$Build(adq) - cat(plan$ToString(final_node)) + cat(plan$BuildAndShow(final_node)) invisible(x) } diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 00d79a416a4..7b37179687f 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -263,7 +263,7 @@ ExecPlan <- R6Class("ExecPlan", # SinkNodes (involved in arrange and/or head/tail operations) are created in # ExecPlan_run and are not captured by the regular print method. We take a # similar approach to expose them before calling the print method. - ToString = function(node) { + BuildAndShow = function(node) { assert_is(node, "ExecNode") # Sorting and head/tail (if sorted) are handled in the SinkNode, @@ -281,14 +281,13 @@ ExecPlan <- R6Class("ExecPlan", sorting$orders <- as.integer(sorting$orders) } - out <- ExecPlan_ToString( + ExecPlan_BuildAndShow( self, node, sorting, prepare_key_value_metadata(node$final_metadata()), select_k ) - out }, Stop = function() ExecPlan_StopProducing(self) ) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index e168bdf4978..79a614ce4be 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -898,15 +898,15 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp -std::string ExecPlan_ToString(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head); -extern "C" SEXP _arrow_ExecPlan_ToString(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP metadata_sexp, SEXP head_sexp){ +std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head); +extern "C" SEXP _arrow_ExecPlan_BuildAndShow(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP metadata_sexp, SEXP head_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); arrow::r::Input&>::type final_node(final_node_sexp); arrow::r::Input::type sort_options(sort_options_sexp); arrow::r::Input::type metadata(metadata_sexp); arrow::r::Input::type head(head_sexp); - return cpp11::as_sexp(ExecPlan_ToString(plan, final_node, sort_options, metadata, head)); + return cpp11::as_sexp(ExecPlan_BuildAndShow(plan, final_node, sort_options, metadata, head)); END_CPP11 } // compute-exec.cpp @@ -5254,7 +5254,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 5}, { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1}, { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1}, - { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 5}, + { "_arrow_ExecPlan_BuildAndShow", (DL_FUNC) &_arrow_ExecPlan_BuildAndShow, 5}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index f93bf1bd00c..497e0cdc722 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -55,54 +55,13 @@ std::shared_ptr MakeExecNodeOrStop( }); } -std::pair, std::shared_ptr> +std::pair, std::shared_ptr> ExecPlan_prepare(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; - std::shared_ptr node; - - // Sorting uses a different sink node; there is no general sort yet - if (sort_options.size() > 0) { - if (head >= 0) { - // Use the SelectK node to take only what we need - node = MakeExecNodeOrStop( - "select_k_sink", plan.get(), {final_node.get()}, - compute::SelectKSinkNodeOptions{ - arrow::compute::SelectKOptions( - head, std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)) - ->sort_keys), - &sink_gen}); - } else { - node = - MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, - compute::OrderBySinkNodeOptions{ - *std::dynamic_pointer_cast( - make_compute_options("sort_indices", sort_options)), - &sink_gen}); - } - } else { - node = MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, - compute::SinkNodeOptions{&sink_gen}); - } - - std::pair, std::shared_ptr> out; - out.first = plan; - out.second = node; - return out; -} - -// [[arrow::export]] -std::shared_ptr ExecPlan_run( - const std::shared_ptr& plan, - const std::shared_ptr& final_node, cpp11::list sort_options, - cpp11::strings metadata, int64_t head = -1) { - // For now, don't require R to construct SinkNodes. - // Instead, just pass the node we should collect as an argument. - arrow::AsyncGenerator> sink_gen; // Sorting uses a different sink node; there is no general sort yet if (sort_options.size() > 0) { @@ -129,7 +88,6 @@ std::shared_ptr ExecPlan_run( } StopIfNotOk(plan->Validate()); - StopIfNotOk(plan->StartProducing()); // If the generator is destroyed before being completely drained, inform plan std::shared_ptr stop_producing{nullptr, [plan](...) { @@ -149,9 +107,24 @@ std::shared_ptr ExecPlan_run( auto kv = strings_to_kvm(metadata); out_schema = out_schema->WithMetadata(kv); } - return compute::MakeGeneratorReader( + + std::pair, std::shared_ptr> + out; + out.first = plan; + out.second = compute::MakeGeneratorReader( out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, gc_memory_pool()); + return out; +} + +// [[arrow::export]] +std::shared_ptr ExecPlan_run( + const std::shared_ptr& plan, + const std::shared_ptr& final_node, cpp11::list sort_options, + cpp11::strings metadata, int64_t head = -1) { + auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); + StopIfNotOk(prepared_plan.first->StartProducing()); + return prepared_plan.second; } // [[arrow::export]] @@ -166,10 +139,10 @@ std::shared_ptr ExecNode_output_schema( } // [[arrow::export]] -std::string ExecPlan_ToString(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, - int64_t head = -1) { +std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, + int64_t head = -1) { auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); return prepared_plan.first->ToString(); } From da730196fc62b7a1572dc2d40e2550744039a142 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 20 Jul 2022 15:10:56 -0300 Subject: [PATCH 63/80] always call startproducing --- r/src/compute-exec.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 497e0cdc722..8387e1f47e6 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -144,6 +144,7 @@ std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); + arrow::StopIfNotOk(prepared_plan.first->StartProducing()); return prepared_plan.first->ToString(); } From ee90354aff9cdc0240097202661dc3bc04e3e63a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Jul 2022 09:51:35 +0100 Subject: [PATCH 64/80] bump ci From 66dcd221a38485b52fcb6621874ea0c1a3f33289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Jul 2022 17:25:57 +0100 Subject: [PATCH 65/80] updated unit tests + snapshots --- r/tests/testthat/_snaps/dplyr-query.md | 109 +++++++++ r/tests/testthat/test-dplyr-query.R | 313 +++++++++---------------- 2 files changed, 226 insertions(+), 196 deletions(-) create mode 100644 r/tests/testthat/_snaps/dplyr-query.md diff --git a/r/tests/testthat/_snaps/dplyr-query.md b/r/tests/testthat/_snaps/dplyr-query.md new file mode 100644 index 00000000000..371eb1c729d --- /dev/null +++ b/r/tests/testthat/_snaps/dplyr-query.md @@ -0,0 +1,109 @@ +# show_query() and explain() + + Code + mtcars %>% arrow_table() %>% show_query() + Output + ExecPlan with 3 nodes: + 2:SinkNode{} + 1:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} + 0:TableSourceNode{} + +--- + + Code + tbl %>% arrow_table() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% show_query() + Output + ExecPlan with 4 nodes: + 3:SinkNode{} + 2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten": add_checked(cast(int, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), 10)]} + 1:FilterNode{filter=((dbl > 2) and (chr != "e"))} + 0:TableSourceNode{} + +--- + + Code + tbl %>% arrow_table() %>% left_join(example_data %>% arrow_table() %>% mutate( + doubled_dbl = dbl * 2) %>% select(int, doubled_dbl), by = "int") %>% select( + int, verses, doubled_dbl) %>% show_query() + Output + ExecPlan with 7 nodes: + 6:SinkNode{} + 5:ProjectNode{projection=[int, verses, doubled_dbl]} + 4:HashJoinNode{} + 3:ProjectNode{projection=[int, "doubled_dbl": multiply_checked(dbl, 2)]} + 2:TableSourceNode{} + 1:ProjectNode{projection=[int, dbl, dbl2, lgl, false, chr, fct, verses, padded_strings, another_chr]} + 0:TableSourceNode{} + +--- + + Code + tbl %>% arrow_table() %>% left_join(example_data %>% arrow_table() %>% mutate( + doubled_dbl = dbl * 2) %>% select(int, doubled_dbl), by = "int") %>% select( + int, verses, doubled_dbl) %>% explain() + Output + ExecPlan with 7 nodes: + 6:SinkNode{} + 5:ProjectNode{projection=[int, verses, doubled_dbl]} + 4:HashJoinNode{} + 3:ProjectNode{projection=[int, "doubled_dbl": multiply_checked(dbl, 2)]} + 2:TableSourceNode{} + 1:ProjectNode{projection=[int, dbl, dbl2, lgl, false, chr, fct, verses, padded_strings, another_chr]} + 0:TableSourceNode{} + +--- + + Code + mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% + show_query() + Output + ExecPlan with 4 nodes: + 3:OrderBySinkNode{by={sort_keys=[FieldRef.Name(wt) DESC], null_placement=AtEnd}} + 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} + 1:FilterNode{filter=(mpg > 20)} + 0:TableSourceNode{} + +--- + + Code + mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% explain() + Output + ExecPlan with 4 nodes: + 3:OrderBySinkNode{by={sort_keys=[FieldRef.Name(wt) DESC], null_placement=AtEnd}} + 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} + 1:FilterNode{filter=(mpg > 20)} + 0:TableSourceNode{} + +--- + + Code + mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% + show_query() + Output + ExecPlan with 4 nodes: + 3:OrderBySinkNode{by={k=3, sort_keys=[FieldRef.Name(wt) DESC]}} + 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} + 1:FilterNode{filter=(mpg > 20)} + 0:TableSourceNode{} + ExecPlan with 3 nodes: + 2:SinkNode{} + 1:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} + 0:SourceNode{} + +--- + + Code + mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% + explain() + Output + ExecPlan with 4 nodes: + 3:OrderBySinkNode{by={k=3, sort_keys=[FieldRef.Name(wt) DESC]}} + 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} + 1:FilterNode{filter=(mpg > 20)} + 0:TableSourceNode{} + ExecPlan with 3 nodes: + 2:SinkNode{} + 1:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} + 0:SourceNode{} + diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index a2096b6c7ee..3ef9dcfdc3b 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -434,7 +434,7 @@ test_that("query_can_stream()", { ) }) -test_that("show_exec_plan(), show_query() and explain()", { +test_that("show_exec_plan()", { # minimal test - this fails if we don't coerce the input to `show_exec_plan()` # to be an `arrow_dplyr_query` expect_output( @@ -443,30 +443,7 @@ test_that("show_exec_plan(), show_query() and explain()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "TableSourceNode" # entry point - ) - ) - - # minimal test - show_query() - expect_output( - mtcars %>% - arrow_table() %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output new columns - "TableSourceNode" # entry point - ) - ) - - # minimal test - explain() - expect_output( - mtcars %>% - arrow_table() %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # final node "ProjectNode.*", # output columns "TableSourceNode" # entry point ) @@ -482,42 +459,7 @@ test_that("show_exec_plan(), show_query() and explain()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "chr, int, lgl, \"int_plus_ten\".*", # selected columns - "FilterNode.*", # filter node - "(dbl > 2).*", # filter expressions - "chr != \"e\".*", - "TableSourceNode" # entry point - ) - ) - - # arrow_table and mutate - show_query() - expect_output( - tbl %>% - arrow_table() %>% - filter(dbl > 2, chr != "e") %>% - select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "chr, int, lgl, \"int_plus_ten\".*", # selected columns - "FilterNode.*", # filter node - "(dbl > 2).*", # filter expressions - "chr != \"e\".*", - "TableSourceNode" # entry point - ) - ) - - # arrow_table and mutate - explain() - expect_output( - tbl %>% - arrow_table() %>% - filter(dbl > 2, chr != "e") %>% - select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # final node "chr, int, lgl, \"int_plus_ten\".*", # selected columns "FilterNode.*", # filter node "(dbl > 2).*", # filter expressions @@ -536,7 +478,9 @@ test_that("show_exec_plan(), show_query() and explain()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # final node "chr, int, lgl, \"int_plus_ten\".*", # selected columns + "FilterNode.*", # filter node "(dbl > 2).*", # the filter expressions "chr != \"e\".*", "TableSourceNode" # the entry point" @@ -553,6 +497,7 @@ test_that("show_exec_plan(), show_query() and explain()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # final node "ProjectNode.*", # output columns "GroupByNode.*", # the group_by statement "keys=.*lgl.*", # the key for the aggregations @@ -562,44 +507,6 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - # test with group_by and summarise - show_query() - expect_output( - tbl %>% - arrow_table() %>% - group_by(lgl) %>% - summarise(avg = mean(dbl, na.rm = TRUE)) %>% - ungroup() %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "GroupByNode.*", # the group_by statement - "keys=.*lgl.*", # the key for the aggregations - "aggregates=.*hash_mean.*avg.*", # the aggregations - "ProjectNode.*", # the input columns - "TableSourceNode" # the entry point - ) - ) - - # test with group_by and summarise - explain() - expect_output( - tbl %>% - arrow_table() %>% - group_by(lgl) %>% - summarise(avg = mean(dbl, na.rm = TRUE)) %>% - ungroup() %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "GroupByNode.*", # group_by statement - "keys=.*lgl.*", # key for the aggregations - "aggregates=.*hash_mean.*avg.*", # aggregations - "ProjectNode.*", # input columns - "TableSourceNode" # entry point - ) - ) - # test with join expect_output( tbl %>% @@ -615,6 +522,7 @@ test_that("show_exec_plan(), show_query() and explain()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # final node "ProjectNode.*", # output columns "HashJoinNode.*", # the join "ProjectNode.*", # input columns for the second table @@ -625,8 +533,104 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - # test with join - show_query() + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node + "ProjectNode.*", # output columns + "FilterNode.*", # filter node + "TableSourceNode.*" # entry point + ) + ) + + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + head(3) %>% + show_exec_plan(), + # we expect 2 chained ExecPlans here due to arrange being a pipeline breaker + # sink node and head effectively starting a new ExecPlan + # the entry point for the second ExecPlan is a SourceNode and not a TableSourceNode + regexp = paste0( + # the first ExecPlan (up to and including arrange) + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*k=3.*", # the arrange sink node (NB the k is set here, + # not in the head() node) + "ProjectNode.*", # + "FilterNode.*mpg > 20.*", # filter node and filtering expression + "TableSourceNode.*", # entry point + # the second ExecPlan (head) + "SinkNode.*", # final node + "ProjectNode.*", # output columns + "SourceNode.*" # entry point + ) + ) +}) + +test_that(" show_query() and explain()", { + # show_query and explain are identical to show_exec_plan, so will be tested + # with expect_snapshot() to minimise repetition and the size of the test file + + # minimal test + expect_snapshot( + mtcars %>% + arrow_table() %>% + show_query() + ) + + expect_output( + mtcars %>% + arrow_table() %>% + explain() + ) + + # arrow_table + mutate + expect_snapshot( + tbl %>% + arrow_table() %>% + filter(dbl > 2, chr != "e") %>% + select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% + show_query() + ) + + expect_output( + tbl %>% + arrow_table() %>% + filter(dbl > 2, chr != "e") %>% + select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% + explain() + ) + + # test with group by + summarise expect_output( + tbl %>% + arrow_table() %>% + group_by(lgl) %>% + summarise(avg = mean(dbl, na.rm = TRUE)) %>% + ungroup() %>% + show_query() + ) + + expect_output( + tbl %>% + arrow_table() %>% + group_by(lgl) %>% + summarise(avg = mean(dbl, na.rm = TRUE)) %>% + ungroup() %>% + explain() + ) + + # test with join + expect_snapshot( tbl %>% arrow_table() %>% left_join( @@ -637,21 +641,10 @@ test_that("show_exec_plan(), show_query() and explain()", { by = "int" ) %>% select(int, verses, doubled_dbl) %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "HashJoinNode.*", # join - "ProjectNode.*", # input columns for the second table - "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate - "TableSourceNode.*", # second table - "ProjectNode.*", # input columns for the first table - "TableSourceNode" # first table - ) + show_query() ) - # test with join - explain() - expect_output( + expect_snapshot( tbl %>% arrow_table() %>% left_join( @@ -662,114 +655,42 @@ test_that("show_exec_plan(), show_query() and explain()", { by = "int" ) %>% select(int, verses, doubled_dbl) %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "HashJoinNode.*", # join - "ProjectNode.*", # input columns for the second table - "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # mutate - "TableSourceNode.*", # second table - "ProjectNode.*", # input columns for the first table - "TableSourceNode" # first table - ) + explain() ) - expect_output( + # test with filter and arrange + expect_snapshot( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% - show_exec_plan(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node - "ProjectNode.*", # output columns - "FilterNode.*", # filter node - "TableSourceNode.*" # entry point - ) + show_query() ) - expect_output( + expect_snapshot( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node - "ProjectNode.*", # output columns - "FilterNode.*", # filter node - "TableSourceNode.*" # entry point - ) + explain() ) - expect_output( - mtcars %>% - arrow_table() %>% - filter(mpg > 20) %>% - arrange(desc(wt)) %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node - "ProjectNode.*", # output columns - "FilterNode.*", # filter node - "TableSourceNode.*" # entry point - ) - ) - - expect_output( + # test with filter + arrange + head + expect_snapshot( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% - show_exec_plan(), - # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + - # the entry point is now a SourceNode and not a TableSourceNode - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # - "ProjectNode.*", # output columns - "SourceNode.*" # entry point - ) + show_query() ) - expect_output( + expect_snapshot( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% - show_query(), - # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + - # the entry point is now a SourceNode and not a TableSourceNode - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # - "ProjectNode.*", # output columns - "SourceNode.*" # entry point - ) - ) - expect_output( - mtcars %>% - arrow_table() %>% - filter(mpg > 20) %>% - arrange(desc(wt)) %>% - head(3) %>% - explain(), - # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + - # the entry point is now a SourceNode and not a TableSourceNode - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # - "ProjectNode.*", # output columns - "SourceNode.*" # entry point - ) + explain() ) }) From 0482c6ce303985158d53359c2529ba5752497f94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Jul 2022 17:31:13 +0100 Subject: [PATCH 66/80] updated dataset unit tests + snapshots --- r/tests/testthat/_snaps/dataset-dplyr.md | 126 +++++++++++++++++++++++ r/tests/testthat/test-dataset-dplyr.R | 89 +++++++++++++++- 2 files changed, 211 insertions(+), 4 deletions(-) create mode 100644 r/tests/testthat/_snaps/dataset-dplyr.md diff --git a/r/tests/testthat/_snaps/dataset-dplyr.md b/r/tests/testthat/_snaps/dataset-dplyr.md new file mode 100644 index 00000000000..257f2b6384e --- /dev/null +++ b/r/tests/testthat/_snaps/dataset-dplyr.md @@ -0,0 +1,126 @@ +# how_query() and explain() with datasets + + Code + ds %>% show_query() + Output + ExecPlan with 3 nodes: + 2:SinkNode{} + 1:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} + 0:SourceNode{} + +--- + + Code + ds %>% explain() + Output + ExecPlan with 3 nodes: + 2:SinkNode{} + 1:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} + 0:SourceNode{} + +--- + + Code + ds %>% select(string = chr, integer = int, part) %>% filter(integer > 6L & + part == 1) %>% show_query() + Output + ExecPlan with 4 nodes: + 3:SinkNode{} + 2:ProjectNode{projection=["string": chr, "integer": int, part]} + 1:FilterNode{filter=((int > 6) and (cast(part, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}) == 1))} + 0:SourceNode{} + +--- + + Code + ds %>% select(string = chr, integer = int, part) %>% filter(integer > 6L & + part == 1) %>% explain() + Output + ExecPlan with 4 nodes: + 3:SinkNode{} + 2:ProjectNode{projection=["string": chr, "integer": int, part]} + 1:FilterNode{filter=((int > 6) and (cast(part, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}) == 1))} + 0:SourceNode{} + +--- + + Code + ds %>% group_by(part) %>% summarise(avg = mean(int)) %>% show_query() + Output + ExecPlan with 6 nodes: + 5:SinkNode{} + 4:ProjectNode{projection=[part, avg]} + 3:ProjectNode{projection=[part, avg]} + 2:GroupByNode{keys=["part"], aggregates=[ + hash_mean(avg, {skip_nulls=false, min_count=0}), + ]} + 1:ProjectNode{projection=["avg": int, part]} + 0:SourceNode{} + +--- + + Code + ds %>% group_by(part) %>% summarise(avg = mean(int)) %>% explain() + Output + ExecPlan with 6 nodes: + 5:SinkNode{} + 4:ProjectNode{projection=[part, avg]} + 3:ProjectNode{projection=[part, avg]} + 2:GroupByNode{keys=["part"], aggregates=[ + hash_mean(avg, {skip_nulls=false, min_count=0}), + ]} + 1:ProjectNode{projection=["avg": int, part]} + 0:SourceNode{} + +--- + + Code + ds %>% filter(lgl) %>% arrange(chr) %>% show_query() + Output + ExecPlan with 4 nodes: + 3:OrderBySinkNode{by={sort_keys=[FieldRef.Name(chr) ASC], null_placement=AtEnd}} + 2:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} + 1:FilterNode{filter=lgl} + 0:SourceNode{} + +--- + + Code + ds %>% filter(lgl) %>% arrange(chr) %>% explain() + Output + ExecPlan with 4 nodes: + 3:OrderBySinkNode{by={sort_keys=[FieldRef.Name(chr) ASC], null_placement=AtEnd}} + 2:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} + 1:FilterNode{filter=lgl} + 0:SourceNode{} + +--- + + Code + ds %>% filter(lgl) %>% arrange(chr) %>% head() %>% show_query() + Output + ExecPlan with 4 nodes: + 3:OrderBySinkNode{by={k=6, sort_keys=[FieldRef.Name(chr) ASC]}} + 2:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} + 1:FilterNode{filter=lgl} + 0:SourceNode{} + ExecPlan with 3 nodes: + 2:SinkNode{} + 1:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} + 0:SourceNode{} + +--- + + Code + ds %>% filter(lgl) %>% arrange(chr) %>% head() %>% explain() + Output + ExecPlan with 4 nodes: + 3:OrderBySinkNode{by={k=6, sort_keys=[FieldRef.Name(chr) ASC]}} + 2:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} + 1:FilterNode{filter=lgl} + 0:SourceNode{} + ExecPlan with 3 nodes: + 2:SinkNode{} + 1:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} + 0:SourceNode{} + diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index b0bd61653fb..e43551d0bb1 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -341,7 +341,7 @@ test_that("dplyr method not implemented messages", { ) }) -test_that("show_exec_plan(), show_query() and explain() with datasets", { +test_that("show_exec_plan() with datasets", { ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) # minimal test @@ -350,6 +350,7 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # final node "ProjectNode.*", # output columns "SourceNode" # entry point ) @@ -363,6 +364,7 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # final node "ProjectNode.*", # output columns "FilterNode.*", # filter node "int > 6.*cast.*", # filtering expressions + auto-casting of part @@ -378,6 +380,7 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # final node "ProjectNode.*", # output columns "GroupByNode.*", # group by node "keys=.*part.*", # key for aggregations @@ -409,13 +412,91 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { arrange(chr) %>% head() %>% show_exec_plan(), - # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + # we expect 2 chained ExecPlans here due to arrange being a pipeline breaker + # sink node and head effectively starting a new ExecPlan regexp = paste0( + # first ExecPlan (up to and including arrange) "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # + "OrderBySinkNode.*", # the arrange sink node + "ProjectNode.*", # output columns + "FilterNode.*filter=lgl.*", # filter node + "SourceNode.*", # entry point + # second ExecPlan (head) + "SinkNode.*", # the output node "ProjectNode.*", # output columns "SourceNode" # entry point ) ) }) + +test_that("how_query() and explain() with datasets", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + + # minimal test + expect_snapshot( + ds %>% + show_query() + ) + expect_snapshot( + ds %>% + explain() + ) + + # filter and select + expect_snapshot( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6L & part == 1) %>% + show_query() + ) + expect_snapshot( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6L & part == 1) %>% + explain() + ) + + # group_by and summarise + expect_snapshot( + ds %>% + group_by(part) %>% + summarise(avg = mean(int)) %>% + show_query() + ) + expect_snapshot( + ds %>% + group_by(part) %>% + summarise(avg = mean(int)) %>% + explain() + ) + + # filter + arrange + expect_snapshot( + ds %>% + filter(lgl) %>% + arrange(chr) %>% + show_query() + ) + expect_snapshot( + ds %>% + filter(lgl) %>% + arrange(chr) %>% + explain() + ) + + # filter + arrange + head + expect_snapshot( + ds %>% + filter(lgl) %>% + arrange(chr) %>% + head() %>% + show_query() + ) + expect_snapshot( + ds %>% + filter(lgl) %>% + arrange(chr) %>% + head() %>% + explain() + ) +}) From ad5024f4176ece6d5a658a9e3bf97660f72f91e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Jul 2022 17:32:53 +0100 Subject: [PATCH 67/80] remove `BuildAndShow` + add `ToString` --- r/R/arrowExports.R | 4 ++-- r/R/query-engine.R | 44 ++++++++++-------------------------------- r/src/arrowExports.cpp | 12 ++++-------- r/src/compute-exec.cpp | 9 ++------- 4 files changed, 18 insertions(+), 51 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index c73d8fd97bd..6de6ab5bbfa 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -416,8 +416,8 @@ ExecNode_output_schema <- function(node) { .Call(`_arrow_ExecNode_output_schema`, node) } -ExecPlan_BuildAndShow <- function(plan, final_node, sort_options, metadata, head) { - .Call(`_arrow_ExecPlan_BuildAndShow`, plan, final_node, sort_options, metadata, head) +ExecPlan_ToString <- function(plan) { + .Call(`_arrow_ExecPlan_ToString`, plan) } ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 7b37179687f..517845905d7 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -67,7 +67,7 @@ ExecPlan <- R6Class("ExecPlan", out$extras$source_schema <- .data$schema out }, - Build = function(.data) { + Build = function(.data, explain = FALSE) { # This method takes an arrow_dplyr_query and chains together the # ExecNodes that they produce. It does not evaluate them--that is Run(). group_vars <- dplyr::group_vars(.data) @@ -83,10 +83,10 @@ ExecPlan <- R6Class("ExecPlan", # SinkNode, so if there are any steps done after head/tail, we need to # evaluate the query up to then and then do a new query for the rest. # as_record_batch_reader() will build and run an ExecPlan - node <- self$SourceNode(as_record_batch_reader(.data$.data)) + node <- self$SourceNode(as_record_batch_reader(.data$.data, explain = explain)) } else { # Recurse - node <- self$Build(.data$.data) + node <- self$Build(.data$.data, explain = explain) } } else { node <- self$Scan(.data) @@ -192,7 +192,7 @@ ExecPlan <- R6Class("ExecPlan", } node }, - Run = function(node) { + Run = function(node, explain = FALSE) { assert_is(node, "ExecNode") # Sorting and head/tail (if sorted) are handled in the SinkNode, @@ -249,6 +249,10 @@ ExecPlan <- R6Class("ExecPlan", out <- as_record_batch_reader(tab) } + if (explain) { + cat(self$ToString()) + } + out }, Write = function(node, ...) { @@ -260,36 +264,8 @@ ExecPlan <- R6Class("ExecPlan", ... ) }, - # SinkNodes (involved in arrange and/or head/tail operations) are created in - # ExecPlan_run and are not captured by the regular print method. We take a - # similar approach to expose them before calling the print method. - BuildAndShow = function(node) { - assert_is(node, "ExecNode") - - # Sorting and head/tail (if sorted) are handled in the SinkNode, - # created in ExecPlan_run - sorting <- node$extras$sort %||% list() - select_k <- node$extras$head %||% -1L - has_sorting <- length(sorting) > 0 - if (has_sorting) { - if (!is.null(node$extras$tail)) { - # Reverse the sort order and take the top K, then after we'll reverse - # the resulting rows so that it is ordered as expected - sorting$orders <- !sorting$orders - select_k <- node$extras$tail - } - sorting$orders <- as.integer(sorting$orders) - } - - ExecPlan_BuildAndShow( - self, - node, - sorting, - prepare_key_value_metadata(node$final_metadata()), - select_k - ) - }, - Stop = function() ExecPlan_StopProducing(self) + Stop = function() ExecPlan_StopProducing(self), + ToString = function() ExecPlan_ToString(self) ) ) # nolint end. diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 79a614ce4be..6809270d8dc 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -898,15 +898,11 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp -std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head); -extern "C" SEXP _arrow_ExecPlan_BuildAndShow(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP metadata_sexp, SEXP head_sexp){ +std::string ExecPlan_ToString(const std::shared_ptr& plan); +extern "C" SEXP _arrow_ExecPlan_ToString(SEXP plan_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); - arrow::r::Input&>::type final_node(final_node_sexp); - arrow::r::Input::type sort_options(sort_options_sexp); - arrow::r::Input::type metadata(metadata_sexp); - arrow::r::Input::type head(head_sexp); - return cpp11::as_sexp(ExecPlan_BuildAndShow(plan, final_node, sort_options, metadata, head)); + return cpp11::as_sexp(ExecPlan_ToString(plan)); END_CPP11 } // compute-exec.cpp @@ -5254,7 +5250,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 5}, { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1}, { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1}, - { "_arrow_ExecPlan_BuildAndShow", (DL_FUNC) &_arrow_ExecPlan_BuildAndShow, 5}, + { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 1}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 8387e1f47e6..b8d51d8c6fa 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -139,13 +139,8 @@ std::shared_ptr ExecNode_output_schema( } // [[arrow::export]] -std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, - int64_t head = -1) { - auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - arrow::StopIfNotOk(prepared_plan.first->StartProducing()); - return prepared_plan.first->ToString(); +std::string ExecPlan_ToString(const std::shared_ptr& plan) { + return plan->ToString(); } #if defined(ARROW_R_WITH_DATASET) From 20799f9e5b55bb6c144aabd8626ddbd3c9143156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Jul 2022 17:35:50 +0100 Subject: [PATCH 68/80] update `as_record_batch_reader` and use it inside `show_exec_plan()` --- r/R/dplyr.R | 5 +---- r/R/record-batch-reader.R | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index e888c2a30fd..cace986bc03 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -239,10 +239,7 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' mutate(x = gear/carb) %>% #' show_exec_plan() show_exec_plan <- function(x) { - adq <- as_adq(x) - plan <- ExecPlan$create() - final_node <- plan$Build(adq) - cat(plan$BuildAndShow(final_node)) + as_record_batch_reader(as_adq(x), explain = TRUE) invisible(x) } diff --git a/r/R/record-batch-reader.R b/r/R/record-batch-reader.R index 8f6a600dfb1..b6f257cc3dd 100644 --- a/r/R/record-batch-reader.R +++ b/r/R/record-batch-reader.R @@ -236,11 +236,11 @@ as_record_batch_reader.Dataset <- function(x, ...) { #' @rdname as_record_batch_reader #' @export -as_record_batch_reader.arrow_dplyr_query <- function(x, ...) { +as_record_batch_reader.arrow_dplyr_query <- function(x, ..., explain = FALSE) { # See query-engine.R for ExecPlan/Nodes plan <- ExecPlan$create() - final_node <- plan$Build(x) - plan$Run(final_node) + final_node <- plan$Build(x, explain = explain) + plan$Run(final_node, explain = explain) } #' @rdname as_record_batch_reader From 21bc8c32ee6ca6923c1d3ddea06f914b4de657a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Jul 2022 18:27:43 +0100 Subject: [PATCH 69/80] docs --- r/R/record-batch-reader.R | 1 + r/man/as_record_batch_reader.Rd | 4 +++- r/tests/testthat/test-dataset-dplyr.R | 4 ++-- r/tests/testthat/test-dplyr-query.R | 8 ++++---- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/r/R/record-batch-reader.R b/r/R/record-batch-reader.R index b6f257cc3dd..6d6b5cd873d 100644 --- a/r/R/record-batch-reader.R +++ b/r/R/record-batch-reader.R @@ -191,6 +191,7 @@ RecordBatchFileReader$create <- function(file) { #' Convert an object to an Arrow RecordBatchReader #' #' @param x An object to convert to a [RecordBatchReader] +#' @param explain logical. If `TRUE` the `ExecPlan` is printed. #' @param ... Passed to S3 methods #' #' @return A [RecordBatchReader] diff --git a/r/man/as_record_batch_reader.Rd b/r/man/as_record_batch_reader.Rd index e635c0b98bd..f4332dfe53c 100644 --- a/r/man/as_record_batch_reader.Rd +++ b/r/man/as_record_batch_reader.Rd @@ -23,7 +23,7 @@ as_record_batch_reader(x, ...) \method{as_record_batch_reader}{Dataset}(x, ...) -\method{as_record_batch_reader}{arrow_dplyr_query}(x, ...) +\method{as_record_batch_reader}{arrow_dplyr_query}(x, ..., explain = FALSE) \method{as_record_batch_reader}{Scanner}(x, ...) } @@ -31,6 +31,8 @@ as_record_batch_reader(x, ...) \item{x}{An object to convert to a \link{RecordBatchReader}} \item{...}{Passed to S3 methods} + +\item{explain}{logical. If \code{TRUE} the \code{ExecPlan} is printed.} } \value{ A \link{RecordBatchReader} diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index e43551d0bb1..2db03308bfd 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -470,7 +470,7 @@ test_that("how_query() and explain() with datasets", { explain() ) - # filter + arrange + # filter & arrange expect_snapshot( ds %>% filter(lgl) %>% @@ -484,7 +484,7 @@ test_that("how_query() and explain() with datasets", { explain() ) - # filter + arrange + head + # filter & arrange & head expect_snapshot( ds %>% filter(lgl) %>% diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 3ef9dcfdc3b..44ade710bf1 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -591,7 +591,7 @@ test_that(" show_query() and explain()", { explain() ) - # arrow_table + mutate + # arrow_table & mutate expect_snapshot( tbl %>% arrow_table() %>% @@ -610,7 +610,7 @@ test_that(" show_query() and explain()", { explain() ) - # test with group by + summarise + # test with group by & summarise expect_output( tbl %>% arrow_table() %>% @@ -658,7 +658,7 @@ test_that(" show_query() and explain()", { explain() ) - # test with filter and arrange + # test with filter & arrange expect_snapshot( mtcars %>% arrow_table() %>% @@ -675,7 +675,7 @@ test_that(" show_query() and explain()", { explain() ) - # test with filter + arrange + head + # test with filter & arrange & head expect_snapshot( mtcars %>% arrow_table() %>% From 6376bb22e6078f8844ad3e49f8cef51d700bf875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 21 Jul 2022 19:56:59 +0100 Subject: [PATCH 70/80] lints --- r/tests/testthat/test-dataset-dplyr.R | 4 ++-- r/tests/testthat/test-dplyr-query.R | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 2db03308bfd..902f07d3563 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -470,7 +470,7 @@ test_that("how_query() and explain() with datasets", { explain() ) - # filter & arrange + # filter and arrange expect_snapshot( ds %>% filter(lgl) %>% @@ -484,7 +484,7 @@ test_that("how_query() and explain() with datasets", { explain() ) - # filter & arrange & head + # filter and arrange and head expect_snapshot( ds %>% filter(lgl) %>% diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 44ade710bf1..1743dbcf869 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -591,7 +591,7 @@ test_that(" show_query() and explain()", { explain() ) - # arrow_table & mutate + # arrow_table and mutate expect_snapshot( tbl %>% arrow_table() %>% @@ -610,7 +610,7 @@ test_that(" show_query() and explain()", { explain() ) - # test with group by & summarise + # test with group by and summarise expect_output( tbl %>% arrow_table() %>% @@ -658,7 +658,7 @@ test_that(" show_query() and explain()", { explain() ) - # test with filter & arrange + # test with filter and arrange expect_snapshot( mtcars %>% arrow_table() %>% From 5751543601b84ffc6680a499fc302bc5b2115899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 09:06:36 +0100 Subject: [PATCH 71/80] revert to da730196fc62b7a1572dc2d40e2550744039a142 --- r/R/arrowExports.R | 4 +- r/R/dplyr.R | 5 +- r/R/query-engine.R | 44 +++- r/R/record-batch-reader.R | 7 +- r/man/as_record_batch_reader.Rd | 4 +- r/src/arrowExports.cpp | 12 +- r/src/compute-exec.cpp | 9 +- r/tests/testthat/_snaps/dataset-dplyr.md | 126 --------- r/tests/testthat/_snaps/dplyr-query.md | 109 -------- r/tests/testthat/test-dataset-dplyr.R | 89 +------ r/tests/testthat/test-dplyr-query.R | 313 ++++++++++++++--------- 11 files changed, 259 insertions(+), 463 deletions(-) delete mode 100644 r/tests/testthat/_snaps/dataset-dplyr.md delete mode 100644 r/tests/testthat/_snaps/dplyr-query.md diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 6de6ab5bbfa..c73d8fd97bd 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -416,8 +416,8 @@ ExecNode_output_schema <- function(node) { .Call(`_arrow_ExecNode_output_schema`, node) } -ExecPlan_ToString <- function(plan) { - .Call(`_arrow_ExecPlan_ToString`, plan) +ExecPlan_BuildAndShow <- function(plan, final_node, sort_options, metadata, head) { + .Call(`_arrow_ExecPlan_BuildAndShow`, plan, final_node, sort_options, metadata, head) } ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { diff --git a/r/R/dplyr.R b/r/R/dplyr.R index cace986bc03..e888c2a30fd 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -239,7 +239,10 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' mutate(x = gear/carb) %>% #' show_exec_plan() show_exec_plan <- function(x) { - as_record_batch_reader(as_adq(x), explain = TRUE) + adq <- as_adq(x) + plan <- ExecPlan$create() + final_node <- plan$Build(adq) + cat(plan$BuildAndShow(final_node)) invisible(x) } diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 517845905d7..7b37179687f 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -67,7 +67,7 @@ ExecPlan <- R6Class("ExecPlan", out$extras$source_schema <- .data$schema out }, - Build = function(.data, explain = FALSE) { + Build = function(.data) { # This method takes an arrow_dplyr_query and chains together the # ExecNodes that they produce. It does not evaluate them--that is Run(). group_vars <- dplyr::group_vars(.data) @@ -83,10 +83,10 @@ ExecPlan <- R6Class("ExecPlan", # SinkNode, so if there are any steps done after head/tail, we need to # evaluate the query up to then and then do a new query for the rest. # as_record_batch_reader() will build and run an ExecPlan - node <- self$SourceNode(as_record_batch_reader(.data$.data, explain = explain)) + node <- self$SourceNode(as_record_batch_reader(.data$.data)) } else { # Recurse - node <- self$Build(.data$.data, explain = explain) + node <- self$Build(.data$.data) } } else { node <- self$Scan(.data) @@ -192,7 +192,7 @@ ExecPlan <- R6Class("ExecPlan", } node }, - Run = function(node, explain = FALSE) { + Run = function(node) { assert_is(node, "ExecNode") # Sorting and head/tail (if sorted) are handled in the SinkNode, @@ -249,10 +249,6 @@ ExecPlan <- R6Class("ExecPlan", out <- as_record_batch_reader(tab) } - if (explain) { - cat(self$ToString()) - } - out }, Write = function(node, ...) { @@ -264,8 +260,36 @@ ExecPlan <- R6Class("ExecPlan", ... ) }, - Stop = function() ExecPlan_StopProducing(self), - ToString = function() ExecPlan_ToString(self) + # SinkNodes (involved in arrange and/or head/tail operations) are created in + # ExecPlan_run and are not captured by the regular print method. We take a + # similar approach to expose them before calling the print method. + BuildAndShow = function(node) { + assert_is(node, "ExecNode") + + # Sorting and head/tail (if sorted) are handled in the SinkNode, + # created in ExecPlan_run + sorting <- node$extras$sort %||% list() + select_k <- node$extras$head %||% -1L + has_sorting <- length(sorting) > 0 + if (has_sorting) { + if (!is.null(node$extras$tail)) { + # Reverse the sort order and take the top K, then after we'll reverse + # the resulting rows so that it is ordered as expected + sorting$orders <- !sorting$orders + select_k <- node$extras$tail + } + sorting$orders <- as.integer(sorting$orders) + } + + ExecPlan_BuildAndShow( + self, + node, + sorting, + prepare_key_value_metadata(node$final_metadata()), + select_k + ) + }, + Stop = function() ExecPlan_StopProducing(self) ) ) # nolint end. diff --git a/r/R/record-batch-reader.R b/r/R/record-batch-reader.R index 6d6b5cd873d..8f6a600dfb1 100644 --- a/r/R/record-batch-reader.R +++ b/r/R/record-batch-reader.R @@ -191,7 +191,6 @@ RecordBatchFileReader$create <- function(file) { #' Convert an object to an Arrow RecordBatchReader #' #' @param x An object to convert to a [RecordBatchReader] -#' @param explain logical. If `TRUE` the `ExecPlan` is printed. #' @param ... Passed to S3 methods #' #' @return A [RecordBatchReader] @@ -237,11 +236,11 @@ as_record_batch_reader.Dataset <- function(x, ...) { #' @rdname as_record_batch_reader #' @export -as_record_batch_reader.arrow_dplyr_query <- function(x, ..., explain = FALSE) { +as_record_batch_reader.arrow_dplyr_query <- function(x, ...) { # See query-engine.R for ExecPlan/Nodes plan <- ExecPlan$create() - final_node <- plan$Build(x, explain = explain) - plan$Run(final_node, explain = explain) + final_node <- plan$Build(x) + plan$Run(final_node) } #' @rdname as_record_batch_reader diff --git a/r/man/as_record_batch_reader.Rd b/r/man/as_record_batch_reader.Rd index f4332dfe53c..e635c0b98bd 100644 --- a/r/man/as_record_batch_reader.Rd +++ b/r/man/as_record_batch_reader.Rd @@ -23,7 +23,7 @@ as_record_batch_reader(x, ...) \method{as_record_batch_reader}{Dataset}(x, ...) -\method{as_record_batch_reader}{arrow_dplyr_query}(x, ..., explain = FALSE) +\method{as_record_batch_reader}{arrow_dplyr_query}(x, ...) \method{as_record_batch_reader}{Scanner}(x, ...) } @@ -31,8 +31,6 @@ as_record_batch_reader(x, ...) \item{x}{An object to convert to a \link{RecordBatchReader}} \item{...}{Passed to S3 methods} - -\item{explain}{logical. If \code{TRUE} the \code{ExecPlan} is printed.} } \value{ A \link{RecordBatchReader} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 6809270d8dc..79a614ce4be 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -898,11 +898,15 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp -std::string ExecPlan_ToString(const std::shared_ptr& plan); -extern "C" SEXP _arrow_ExecPlan_ToString(SEXP plan_sexp){ +std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head); +extern "C" SEXP _arrow_ExecPlan_BuildAndShow(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP metadata_sexp, SEXP head_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); - return cpp11::as_sexp(ExecPlan_ToString(plan)); + arrow::r::Input&>::type final_node(final_node_sexp); + arrow::r::Input::type sort_options(sort_options_sexp); + arrow::r::Input::type metadata(metadata_sexp); + arrow::r::Input::type head(head_sexp); + return cpp11::as_sexp(ExecPlan_BuildAndShow(plan, final_node, sort_options, metadata, head)); END_CPP11 } // compute-exec.cpp @@ -5250,7 +5254,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 5}, { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1}, { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1}, - { "_arrow_ExecPlan_ToString", (DL_FUNC) &_arrow_ExecPlan_ToString, 1}, + { "_arrow_ExecPlan_BuildAndShow", (DL_FUNC) &_arrow_ExecPlan_BuildAndShow, 5}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index b8d51d8c6fa..8387e1f47e6 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -139,8 +139,13 @@ std::shared_ptr ExecNode_output_schema( } // [[arrow::export]] -std::string ExecPlan_ToString(const std::shared_ptr& plan) { - return plan->ToString(); +std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, cpp11::strings metadata, + int64_t head = -1) { + auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); + arrow::StopIfNotOk(prepared_plan.first->StartProducing()); + return prepared_plan.first->ToString(); } #if defined(ARROW_R_WITH_DATASET) diff --git a/r/tests/testthat/_snaps/dataset-dplyr.md b/r/tests/testthat/_snaps/dataset-dplyr.md deleted file mode 100644 index 257f2b6384e..00000000000 --- a/r/tests/testthat/_snaps/dataset-dplyr.md +++ /dev/null @@ -1,126 +0,0 @@ -# how_query() and explain() with datasets - - Code - ds %>% show_query() - Output - ExecPlan with 3 nodes: - 2:SinkNode{} - 1:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} - 0:SourceNode{} - ---- - - Code - ds %>% explain() - Output - ExecPlan with 3 nodes: - 2:SinkNode{} - 1:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} - 0:SourceNode{} - ---- - - Code - ds %>% select(string = chr, integer = int, part) %>% filter(integer > 6L & - part == 1) %>% show_query() - Output - ExecPlan with 4 nodes: - 3:SinkNode{} - 2:ProjectNode{projection=["string": chr, "integer": int, part]} - 1:FilterNode{filter=((int > 6) and (cast(part, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}) == 1))} - 0:SourceNode{} - ---- - - Code - ds %>% select(string = chr, integer = int, part) %>% filter(integer > 6L & - part == 1) %>% explain() - Output - ExecPlan with 4 nodes: - 3:SinkNode{} - 2:ProjectNode{projection=["string": chr, "integer": int, part]} - 1:FilterNode{filter=((int > 6) and (cast(part, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}) == 1))} - 0:SourceNode{} - ---- - - Code - ds %>% group_by(part) %>% summarise(avg = mean(int)) %>% show_query() - Output - ExecPlan with 6 nodes: - 5:SinkNode{} - 4:ProjectNode{projection=[part, avg]} - 3:ProjectNode{projection=[part, avg]} - 2:GroupByNode{keys=["part"], aggregates=[ - hash_mean(avg, {skip_nulls=false, min_count=0}), - ]} - 1:ProjectNode{projection=["avg": int, part]} - 0:SourceNode{} - ---- - - Code - ds %>% group_by(part) %>% summarise(avg = mean(int)) %>% explain() - Output - ExecPlan with 6 nodes: - 5:SinkNode{} - 4:ProjectNode{projection=[part, avg]} - 3:ProjectNode{projection=[part, avg]} - 2:GroupByNode{keys=["part"], aggregates=[ - hash_mean(avg, {skip_nulls=false, min_count=0}), - ]} - 1:ProjectNode{projection=["avg": int, part]} - 0:SourceNode{} - ---- - - Code - ds %>% filter(lgl) %>% arrange(chr) %>% show_query() - Output - ExecPlan with 4 nodes: - 3:OrderBySinkNode{by={sort_keys=[FieldRef.Name(chr) ASC], null_placement=AtEnd}} - 2:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} - 1:FilterNode{filter=lgl} - 0:SourceNode{} - ---- - - Code - ds %>% filter(lgl) %>% arrange(chr) %>% explain() - Output - ExecPlan with 4 nodes: - 3:OrderBySinkNode{by={sort_keys=[FieldRef.Name(chr) ASC], null_placement=AtEnd}} - 2:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} - 1:FilterNode{filter=lgl} - 0:SourceNode{} - ---- - - Code - ds %>% filter(lgl) %>% arrange(chr) %>% head() %>% show_query() - Output - ExecPlan with 4 nodes: - 3:OrderBySinkNode{by={k=6, sort_keys=[FieldRef.Name(chr) ASC]}} - 2:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} - 1:FilterNode{filter=lgl} - 0:SourceNode{} - ExecPlan with 3 nodes: - 2:SinkNode{} - 1:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} - 0:SourceNode{} - ---- - - Code - ds %>% filter(lgl) %>% arrange(chr) %>% head() %>% explain() - Output - ExecPlan with 4 nodes: - 3:OrderBySinkNode{by={k=6, sort_keys=[FieldRef.Name(chr) ASC]}} - 2:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} - 1:FilterNode{filter=lgl} - 0:SourceNode{} - ExecPlan with 3 nodes: - 2:SinkNode{} - 1:ProjectNode{projection=[int, dbl, lgl, chr, fct, ts, part]} - 0:SourceNode{} - diff --git a/r/tests/testthat/_snaps/dplyr-query.md b/r/tests/testthat/_snaps/dplyr-query.md deleted file mode 100644 index 371eb1c729d..00000000000 --- a/r/tests/testthat/_snaps/dplyr-query.md +++ /dev/null @@ -1,109 +0,0 @@ -# show_query() and explain() - - Code - mtcars %>% arrow_table() %>% show_query() - Output - ExecPlan with 3 nodes: - 2:SinkNode{} - 1:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} - 0:TableSourceNode{} - ---- - - Code - tbl %>% arrow_table() %>% filter(dbl > 2, chr != "e") %>% select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% show_query() - Output - ExecPlan with 4 nodes: - 3:SinkNode{} - 2:ProjectNode{projection=[chr, int, lgl, "int_plus_ten": add_checked(cast(int, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), 10)]} - 1:FilterNode{filter=((dbl > 2) and (chr != "e"))} - 0:TableSourceNode{} - ---- - - Code - tbl %>% arrow_table() %>% left_join(example_data %>% arrow_table() %>% mutate( - doubled_dbl = dbl * 2) %>% select(int, doubled_dbl), by = "int") %>% select( - int, verses, doubled_dbl) %>% show_query() - Output - ExecPlan with 7 nodes: - 6:SinkNode{} - 5:ProjectNode{projection=[int, verses, doubled_dbl]} - 4:HashJoinNode{} - 3:ProjectNode{projection=[int, "doubled_dbl": multiply_checked(dbl, 2)]} - 2:TableSourceNode{} - 1:ProjectNode{projection=[int, dbl, dbl2, lgl, false, chr, fct, verses, padded_strings, another_chr]} - 0:TableSourceNode{} - ---- - - Code - tbl %>% arrow_table() %>% left_join(example_data %>% arrow_table() %>% mutate( - doubled_dbl = dbl * 2) %>% select(int, doubled_dbl), by = "int") %>% select( - int, verses, doubled_dbl) %>% explain() - Output - ExecPlan with 7 nodes: - 6:SinkNode{} - 5:ProjectNode{projection=[int, verses, doubled_dbl]} - 4:HashJoinNode{} - 3:ProjectNode{projection=[int, "doubled_dbl": multiply_checked(dbl, 2)]} - 2:TableSourceNode{} - 1:ProjectNode{projection=[int, dbl, dbl2, lgl, false, chr, fct, verses, padded_strings, another_chr]} - 0:TableSourceNode{} - ---- - - Code - mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% - show_query() - Output - ExecPlan with 4 nodes: - 3:OrderBySinkNode{by={sort_keys=[FieldRef.Name(wt) DESC], null_placement=AtEnd}} - 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} - 1:FilterNode{filter=(mpg > 20)} - 0:TableSourceNode{} - ---- - - Code - mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% explain() - Output - ExecPlan with 4 nodes: - 3:OrderBySinkNode{by={sort_keys=[FieldRef.Name(wt) DESC], null_placement=AtEnd}} - 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} - 1:FilterNode{filter=(mpg > 20)} - 0:TableSourceNode{} - ---- - - Code - mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% - show_query() - Output - ExecPlan with 4 nodes: - 3:OrderBySinkNode{by={k=3, sort_keys=[FieldRef.Name(wt) DESC]}} - 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} - 1:FilterNode{filter=(mpg > 20)} - 0:TableSourceNode{} - ExecPlan with 3 nodes: - 2:SinkNode{} - 1:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} - 0:SourceNode{} - ---- - - Code - mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% - explain() - Output - ExecPlan with 4 nodes: - 3:OrderBySinkNode{by={k=3, sort_keys=[FieldRef.Name(wt) DESC]}} - 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} - 1:FilterNode{filter=(mpg > 20)} - 0:TableSourceNode{} - ExecPlan with 3 nodes: - 2:SinkNode{} - 1:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb]} - 0:SourceNode{} - diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 902f07d3563..b0bd61653fb 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -341,7 +341,7 @@ test_that("dplyr method not implemented messages", { ) }) -test_that("show_exec_plan() with datasets", { +test_that("show_exec_plan(), show_query() and explain() with datasets", { ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) # minimal test @@ -350,7 +350,6 @@ test_that("show_exec_plan() with datasets", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # final node "ProjectNode.*", # output columns "SourceNode" # entry point ) @@ -364,7 +363,6 @@ test_that("show_exec_plan() with datasets", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # final node "ProjectNode.*", # output columns "FilterNode.*", # filter node "int > 6.*cast.*", # filtering expressions + auto-casting of part @@ -380,7 +378,6 @@ test_that("show_exec_plan() with datasets", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # final node "ProjectNode.*", # output columns "GroupByNode.*", # group by node "keys=.*part.*", # key for aggregations @@ -412,91 +409,13 @@ test_that("show_exec_plan() with datasets", { arrange(chr) %>% head() %>% show_exec_plan(), - # we expect 2 chained ExecPlans here due to arrange being a pipeline breaker - # sink node and head effectively starting a new ExecPlan + # for some reason the FilterNode disappears when head/tail are involved + + # we do not have additional information regarding the SinkNode regexp = paste0( - # first ExecPlan (up to and including arrange) "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "OrderBySinkNode.*", # the arrange sink node - "ProjectNode.*", # output columns - "FilterNode.*filter=lgl.*", # filter node - "SourceNode.*", # entry point - # second ExecPlan (head) - "SinkNode.*", # the output node + "SinkNode.*", # "ProjectNode.*", # output columns "SourceNode" # entry point ) ) }) - -test_that("how_query() and explain() with datasets", { - ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) - - # minimal test - expect_snapshot( - ds %>% - show_query() - ) - expect_snapshot( - ds %>% - explain() - ) - - # filter and select - expect_snapshot( - ds %>% - select(string = chr, integer = int, part) %>% - filter(integer > 6L & part == 1) %>% - show_query() - ) - expect_snapshot( - ds %>% - select(string = chr, integer = int, part) %>% - filter(integer > 6L & part == 1) %>% - explain() - ) - - # group_by and summarise - expect_snapshot( - ds %>% - group_by(part) %>% - summarise(avg = mean(int)) %>% - show_query() - ) - expect_snapshot( - ds %>% - group_by(part) %>% - summarise(avg = mean(int)) %>% - explain() - ) - - # filter and arrange - expect_snapshot( - ds %>% - filter(lgl) %>% - arrange(chr) %>% - show_query() - ) - expect_snapshot( - ds %>% - filter(lgl) %>% - arrange(chr) %>% - explain() - ) - - # filter and arrange and head - expect_snapshot( - ds %>% - filter(lgl) %>% - arrange(chr) %>% - head() %>% - show_query() - ) - expect_snapshot( - ds %>% - filter(lgl) %>% - arrange(chr) %>% - head() %>% - explain() - ) -}) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 1743dbcf869..a2096b6c7ee 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -434,7 +434,7 @@ test_that("query_can_stream()", { ) }) -test_that("show_exec_plan()", { +test_that("show_exec_plan(), show_query() and explain()", { # minimal test - this fails if we don't coerce the input to `show_exec_plan()` # to be an `arrow_dplyr_query` expect_output( @@ -443,7 +443,30 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # final node + "ProjectNode.*", # output columns + "TableSourceNode" # entry point + ) + ) + + # minimal test - show_query() + expect_output( + mtcars %>% + arrow_table() %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output new columns + "TableSourceNode" # entry point + ) + ) + + # minimal test - explain() + expect_output( + mtcars %>% + arrow_table() %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "ProjectNode.*", # output columns "TableSourceNode" # entry point ) @@ -459,7 +482,42 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # final node + "chr, int, lgl, \"int_plus_ten\".*", # selected columns + "FilterNode.*", # filter node + "(dbl > 2).*", # filter expressions + "chr != \"e\".*", + "TableSourceNode" # entry point + ) + ) + + # arrow_table and mutate - show_query() + expect_output( + tbl %>% + arrow_table() %>% + filter(dbl > 2, chr != "e") %>% + select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "chr, int, lgl, \"int_plus_ten\".*", # selected columns + "FilterNode.*", # filter node + "(dbl > 2).*", # filter expressions + "chr != \"e\".*", + "TableSourceNode" # entry point + ) + ) + + # arrow_table and mutate - explain() + expect_output( + tbl %>% + arrow_table() %>% + filter(dbl > 2, chr != "e") %>% + select(chr, int, lgl) %>% + mutate(int_plus_ten = int + 10) %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan "chr, int, lgl, \"int_plus_ten\".*", # selected columns "FilterNode.*", # filter node "(dbl > 2).*", # filter expressions @@ -478,9 +536,7 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # final node "chr, int, lgl, \"int_plus_ten\".*", # selected columns - "FilterNode.*", # filter node "(dbl > 2).*", # the filter expressions "chr != \"e\".*", "TableSourceNode" # the entry point" @@ -497,7 +553,6 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # final node "ProjectNode.*", # output columns "GroupByNode.*", # the group_by statement "keys=.*lgl.*", # the key for the aggregations @@ -507,6 +562,44 @@ test_that("show_exec_plan()", { ) ) + # test with group_by and summarise - show_query() + expect_output( + tbl %>% + arrow_table() %>% + group_by(lgl) %>% + summarise(avg = mean(dbl, na.rm = TRUE)) %>% + ungroup() %>% + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "GroupByNode.*", # the group_by statement + "keys=.*lgl.*", # the key for the aggregations + "aggregates=.*hash_mean.*avg.*", # the aggregations + "ProjectNode.*", # the input columns + "TableSourceNode" # the entry point + ) + ) + + # test with group_by and summarise - explain() + expect_output( + tbl %>% + arrow_table() %>% + group_by(lgl) %>% + summarise(avg = mean(dbl, na.rm = TRUE)) %>% + ungroup() %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "GroupByNode.*", # group_by statement + "keys=.*lgl.*", # key for the aggregations + "aggregates=.*hash_mean.*avg.*", # aggregations + "ProjectNode.*", # input columns + "TableSourceNode" # entry point + ) + ) + # test with join expect_output( tbl %>% @@ -522,7 +615,6 @@ test_that("show_exec_plan()", { show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # final node "ProjectNode.*", # output columns "HashJoinNode.*", # the join "ProjectNode.*", # input columns for the second table @@ -533,104 +625,8 @@ test_that("show_exec_plan()", { ) ) + # test with join - show_query() expect_output( - mtcars %>% - arrow_table() %>% - filter(mpg > 20) %>% - arrange(desc(wt)) %>% - show_exec_plan(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node - "ProjectNode.*", # output columns - "FilterNode.*", # filter node - "TableSourceNode.*" # entry point - ) - ) - - expect_output( - mtcars %>% - arrow_table() %>% - filter(mpg > 20) %>% - arrange(desc(wt)) %>% - head(3) %>% - show_exec_plan(), - # we expect 2 chained ExecPlans here due to arrange being a pipeline breaker - # sink node and head effectively starting a new ExecPlan - # the entry point for the second ExecPlan is a SourceNode and not a TableSourceNode - regexp = paste0( - # the first ExecPlan (up to and including arrange) - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "OrderBySinkNode.*k=3.*", # the arrange sink node (NB the k is set here, - # not in the head() node) - "ProjectNode.*", # - "FilterNode.*mpg > 20.*", # filter node and filtering expression - "TableSourceNode.*", # entry point - # the second ExecPlan (head) - "SinkNode.*", # final node - "ProjectNode.*", # output columns - "SourceNode.*" # entry point - ) - ) -}) - -test_that(" show_query() and explain()", { - # show_query and explain are identical to show_exec_plan, so will be tested - # with expect_snapshot() to minimise repetition and the size of the test file - - # minimal test - expect_snapshot( - mtcars %>% - arrow_table() %>% - show_query() - ) - - expect_output( - mtcars %>% - arrow_table() %>% - explain() - ) - - # arrow_table and mutate - expect_snapshot( - tbl %>% - arrow_table() %>% - filter(dbl > 2, chr != "e") %>% - select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% - show_query() - ) - - expect_output( - tbl %>% - arrow_table() %>% - filter(dbl > 2, chr != "e") %>% - select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% - explain() - ) - - # test with group by and summarise - expect_output( - tbl %>% - arrow_table() %>% - group_by(lgl) %>% - summarise(avg = mean(dbl, na.rm = TRUE)) %>% - ungroup() %>% - show_query() - ) - - expect_output( - tbl %>% - arrow_table() %>% - group_by(lgl) %>% - summarise(avg = mean(dbl, na.rm = TRUE)) %>% - ungroup() %>% - explain() - ) - - # test with join - expect_snapshot( tbl %>% arrow_table() %>% left_join( @@ -641,10 +637,21 @@ test_that(" show_query() and explain()", { by = "int" ) %>% select(int, verses, doubled_dbl) %>% - show_query() + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "HashJoinNode.*", # join + "ProjectNode.*", # input columns for the second table + "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate + "TableSourceNode.*", # second table + "ProjectNode.*", # input columns for the first table + "TableSourceNode" # first table + ) ) - expect_snapshot( + # test with join - explain() + expect_output( tbl %>% arrow_table() %>% left_join( @@ -655,42 +662,114 @@ test_that(" show_query() and explain()", { by = "int" ) %>% select(int, verses, doubled_dbl) %>% - explain() + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "ProjectNode.*", # output columns + "HashJoinNode.*", # join + "ProjectNode.*", # input columns for the second table + "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # mutate + "TableSourceNode.*", # second table + "ProjectNode.*", # input columns for the first table + "TableSourceNode" # first table + ) ) - # test with filter and arrange - expect_snapshot( + expect_output( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% - show_query() + show_exec_plan(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node + "ProjectNode.*", # output columns + "FilterNode.*", # filter node + "TableSourceNode.*" # entry point + ) ) - expect_snapshot( + expect_output( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% - explain() + show_query(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node + "ProjectNode.*", # output columns + "FilterNode.*", # filter node + "TableSourceNode.*" # entry point + ) ) - # test with filter & arrange & head - expect_snapshot( + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + explain(), + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node + "ProjectNode.*", # output columns + "FilterNode.*", # filter node + "TableSourceNode.*" # entry point + ) + ) + + expect_output( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% - show_query() + show_exec_plan(), + # for some reason the FilterNode disappears when head/tail are involved + + # we do not have additional information regarding the SinkNode + + # the entry point is now a SourceNode and not a TableSourceNode + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # + "ProjectNode.*", # output columns + "SourceNode.*" # entry point + ) ) - expect_snapshot( + expect_output( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% - explain() + show_query(), + # for some reason the FilterNode disappears when head/tail are involved + + # we do not have additional information regarding the SinkNode + + # the entry point is now a SourceNode and not a TableSourceNode + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # + "ProjectNode.*", # output columns + "SourceNode.*" # entry point + ) + ) + expect_output( + mtcars %>% + arrow_table() %>% + filter(mpg > 20) %>% + arrange(desc(wt)) %>% + head(3) %>% + explain(), + # for some reason the FilterNode disappears when head/tail are involved + + # we do not have additional information regarding the SinkNode + + # the entry point is now a SourceNode and not a TableSourceNode + regexp = paste0( + "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan + "SinkNode.*", # + "ProjectNode.*", # output columns + "SourceNode.*" # entry point + ) ) }) From e29a835bcbe0fb3d4b45890213ebf78e581862c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 09:23:44 +0100 Subject: [PATCH 72/80] don't start producing the plan --- r/src/compute-exec.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 8387e1f47e6..497e0cdc722 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -144,7 +144,6 @@ std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - arrow::StopIfNotOk(prepared_plan.first->StartProducing()); return prepared_plan.first->ToString(); } From 1309e0f0ce7a1d24bc11c34a490543949e4a8f70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 10:31:36 +0100 Subject: [PATCH 73/80] remove separate tests for `show_query()` and `explain()` --- r/tests/testthat/test-dplyr-query.R | 216 +--------------------------- 1 file changed, 3 insertions(+), 213 deletions(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index a2096b6c7ee..4384ff5f188 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -435,6 +435,9 @@ test_that("query_can_stream()", { }) test_that("show_exec_plan(), show_query() and explain()", { + # show_query() and explain() are wrappers around show_exec_plan() and are not + # tested separately + # minimal test - this fails if we don't coerce the input to `show_exec_plan()` # to be an `arrow_dplyr_query` expect_output( @@ -448,30 +451,6 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - # minimal test - show_query() - expect_output( - mtcars %>% - arrow_table() %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output new columns - "TableSourceNode" # entry point - ) - ) - - # minimal test - explain() - expect_output( - mtcars %>% - arrow_table() %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "TableSourceNode" # entry point - ) - ) - # arrow_table and mutate expect_output( tbl %>% @@ -490,42 +469,6 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - # arrow_table and mutate - show_query() - expect_output( - tbl %>% - arrow_table() %>% - filter(dbl > 2, chr != "e") %>% - select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "chr, int, lgl, \"int_plus_ten\".*", # selected columns - "FilterNode.*", # filter node - "(dbl > 2).*", # filter expressions - "chr != \"e\".*", - "TableSourceNode" # entry point - ) - ) - - # arrow_table and mutate - explain() - expect_output( - tbl %>% - arrow_table() %>% - filter(dbl > 2, chr != "e") %>% - select(chr, int, lgl) %>% - mutate(int_plus_ten = int + 10) %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "chr, int, lgl, \"int_plus_ten\".*", # selected columns - "FilterNode.*", # filter node - "(dbl > 2).*", # filter expressions - "chr != \"e\".*", - "TableSourceNode" # entry point - ) - ) - # record_batch and mutate expect_output( tbl %>% @@ -562,44 +505,6 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - # test with group_by and summarise - show_query() - expect_output( - tbl %>% - arrow_table() %>% - group_by(lgl) %>% - summarise(avg = mean(dbl, na.rm = TRUE)) %>% - ungroup() %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "GroupByNode.*", # the group_by statement - "keys=.*lgl.*", # the key for the aggregations - "aggregates=.*hash_mean.*avg.*", # the aggregations - "ProjectNode.*", # the input columns - "TableSourceNode" # the entry point - ) - ) - - # test with group_by and summarise - explain() - expect_output( - tbl %>% - arrow_table() %>% - group_by(lgl) %>% - summarise(avg = mean(dbl, na.rm = TRUE)) %>% - ungroup() %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "GroupByNode.*", # group_by statement - "keys=.*lgl.*", # key for the aggregations - "aggregates=.*hash_mean.*avg.*", # aggregations - "ProjectNode.*", # input columns - "TableSourceNode" # entry point - ) - ) - # test with join expect_output( tbl %>% @@ -625,56 +530,6 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - # test with join - show_query() - expect_output( - tbl %>% - arrow_table() %>% - left_join( - example_data %>% - arrow_table() %>% - mutate(doubled_dbl = dbl * 2) %>% - select(int, doubled_dbl), - by = "int" - ) %>% - select(int, verses, doubled_dbl) %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "HashJoinNode.*", # join - "ProjectNode.*", # input columns for the second table - "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate - "TableSourceNode.*", # second table - "ProjectNode.*", # input columns for the first table - "TableSourceNode" # first table - ) - ) - - # test with join - explain() - expect_output( - tbl %>% - arrow_table() %>% - left_join( - example_data %>% - arrow_table() %>% - mutate(doubled_dbl = dbl * 2) %>% - select(int, doubled_dbl), - by = "int" - ) %>% - select(int, verses, doubled_dbl) %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "ProjectNode.*", # output columns - "HashJoinNode.*", # join - "ProjectNode.*", # input columns for the second table - "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # mutate - "TableSourceNode.*", # second table - "ProjectNode.*", # input columns for the first table - "TableSourceNode" # first table - ) - ) - expect_output( mtcars %>% arrow_table() %>% @@ -690,36 +545,6 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - expect_output( - mtcars %>% - arrow_table() %>% - filter(mpg > 20) %>% - arrange(desc(wt)) %>% - show_query(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node - "ProjectNode.*", # output columns - "FilterNode.*", # filter node - "TableSourceNode.*" # entry point - ) - ) - - expect_output( - mtcars %>% - arrow_table() %>% - filter(mpg > 20) %>% - arrange(desc(wt)) %>% - explain(), - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node - "ProjectNode.*", # output columns - "FilterNode.*", # filter node - "TableSourceNode.*" # entry point - ) - ) - expect_output( mtcars %>% arrow_table() %>% @@ -737,39 +562,4 @@ test_that("show_exec_plan(), show_query() and explain()", { "SourceNode.*" # entry point ) ) - - expect_output( - mtcars %>% - arrow_table() %>% - filter(mpg > 20) %>% - arrange(desc(wt)) %>% - head(3) %>% - show_query(), - # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + - # the entry point is now a SourceNode and not a TableSourceNode - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # - "ProjectNode.*", # output columns - "SourceNode.*" # entry point - ) - ) - expect_output( - mtcars %>% - arrow_table() %>% - filter(mpg > 20) %>% - arrange(desc(wt)) %>% - head(3) %>% - explain(), - # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + - # the entry point is now a SourceNode and not a TableSourceNode - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # - "ProjectNode.*", # output columns - "SourceNode.*" # entry point - ) - ) }) From 5f890dc514f0c634df0ad84b793ebd0f8a58a634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 11:11:13 +0100 Subject: [PATCH 74/80] revert to fb2a5a00fc24d16da8e0a596256e0c18b02f5289 and use the `BuildAndShow` name --- r/R/arrowExports.R | 4 +-- r/R/query-engine.R | 3 +- r/man/show_exec_plan.Rd | 2 +- r/src/arrowExports.cpp | 9 +++--- r/src/compute-exec.cpp | 65 ++++++++++++++++++++++++----------------- 5 files changed, 47 insertions(+), 36 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index c73d8fd97bd..d424502bb12 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -416,8 +416,8 @@ ExecNode_output_schema <- function(node) { .Call(`_arrow_ExecNode_output_schema`, node) } -ExecPlan_BuildAndShow <- function(plan, final_node, sort_options, metadata, head) { - .Call(`_arrow_ExecPlan_BuildAndShow`, plan, final_node, sort_options, metadata, head) +ExecPlan_BuildAndShow <- function(plan, final_node, sort_options, head) { + .Call(`_arrow_ExecPlan_BuildAndShow`, plan, final_node, sort_options, head) } ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 7b37179687f..af008487e02 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -261,7 +261,7 @@ ExecPlan <- R6Class("ExecPlan", ) }, # SinkNodes (involved in arrange and/or head/tail operations) are created in - # ExecPlan_run and are not captured by the regular print method. We take a + # ExecPlan_run and are not captured by the regulat print method. We take a # similar approach to expose them before calling the print method. BuildAndShow = function(node) { assert_is(node, "ExecNode") @@ -285,7 +285,6 @@ ExecPlan <- R6Class("ExecPlan", self, node, sorting, - prepare_key_value_metadata(node$final_metadata()), select_k ) }, diff --git a/r/man/show_exec_plan.Rd b/r/man/show_exec_plan.Rd index 7c4374520a6..63ed4a7ec66 100644 --- a/r/man/show_exec_plan.Rd +++ b/r/man/show_exec_plan.Rd @@ -19,7 +19,7 @@ It calls the C++ \code{ExecPlan} object's print method. Functionally, it is similar to \code{dplyr::explain()}. } \examples{ -\dontshow{if (arrow_with_dataset() && requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(dplyr) mtcars \%>\% arrow_table() \%>\% diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 79a614ce4be..956b88861c7 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -898,15 +898,14 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp -std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head); -extern "C" SEXP _arrow_ExecPlan_BuildAndShow(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP metadata_sexp, SEXP head_sexp){ +std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, int64_t head); +extern "C" SEXP _arrow_ExecPlan_BuildAndShow(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP head_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); arrow::r::Input&>::type final_node(final_node_sexp); arrow::r::Input::type sort_options(sort_options_sexp); - arrow::r::Input::type metadata(metadata_sexp); arrow::r::Input::type head(head_sexp); - return cpp11::as_sexp(ExecPlan_BuildAndShow(plan, final_node, sort_options, metadata, head)); + return cpp11::as_sexp(ExecPlan_BuildAndShow(plan, final_node, sort_options, head)); END_CPP11 } // compute-exec.cpp @@ -5254,7 +5253,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 5}, { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1}, { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1}, - { "_arrow_ExecPlan_BuildAndShow", (DL_FUNC) &_arrow_ExecPlan_BuildAndShow, 5}, + { "_arrow_ExecPlan_BuildAndShow", (DL_FUNC) &_arrow_ExecPlan_BuildAndShow, 4}, { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 497e0cdc722..43722c8d9a6 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -55,10 +55,11 @@ std::shared_ptr MakeExecNodeOrStop( }); } -std::pair, std::shared_ptr> -ExecPlan_prepare(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { +// [[arrow::export]] +std::shared_ptr ExecPlan_run( + const std::shared_ptr& plan, + const std::shared_ptr& final_node, cpp11::list sort_options, + cpp11::strings metadata, int64_t head = -1) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; @@ -88,6 +89,7 @@ ExecPlan_prepare(const std::shared_ptr& plan, } StopIfNotOk(plan->Validate()); + StopIfNotOk(plan->StartProducing()); // If the generator is destroyed before being completely drained, inform plan std::shared_ptr stop_producing{nullptr, [plan](...) { @@ -107,24 +109,9 @@ ExecPlan_prepare(const std::shared_ptr& plan, auto kv = strings_to_kvm(metadata); out_schema = out_schema->WithMetadata(kv); } - - std::pair, std::shared_ptr> - out; - out.first = plan; - out.second = compute::MakeGeneratorReader( + return compute::MakeGeneratorReader( out_schema, [stop_producing, plan, sink_gen] { return sink_gen(); }, gc_memory_pool()); - return out; -} - -// [[arrow::export]] -std::shared_ptr ExecPlan_run( - const std::shared_ptr& plan, - const std::shared_ptr& final_node, cpp11::list sort_options, - cpp11::strings metadata, int64_t head = -1) { - auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - StopIfNotOk(prepared_plan.first->StartProducing()); - return prepared_plan.second; } // [[arrow::export]] @@ -139,12 +126,38 @@ std::shared_ptr ExecNode_output_schema( } // [[arrow::export]] -std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, - const std::shared_ptr& final_node, - cpp11::list sort_options, cpp11::strings metadata, - int64_t head = -1) { - auto prepared_plan = ExecPlan_prepare(plan, final_node, sort_options, metadata, head); - return prepared_plan.first->ToString(); +std::string ExecPlan_BuildAndShow( + const std::shared_ptr& plan, + const std::shared_ptr& final_node, cpp11::list sort_options, + int64_t head = -1) { + // For now, don't require R to construct SinkNodes. + // Instead, just pass the node we should collect as an argument. + arrow::AsyncGenerator> sink_gen; + + // Sorting uses a different sink node; there is no general sort yet + if (sort_options.size() > 0) { + if (head >= 0) { + // Use the SelectK node to take only what we need + MakeExecNodeOrStop( + "select_k_sink", plan.get(), {final_node.get()}, + compute::SelectKSinkNodeOptions{ + arrow::compute::SelectKOptions( + head, std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)) + ->sort_keys), + &sink_gen}); + } else { + MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()}, + compute::OrderBySinkNodeOptions{ + *std::dynamic_pointer_cast( + make_compute_options("sort_indices", sort_options)), + &sink_gen}); + } + } else { + MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, + compute::SinkNodeOptions{&sink_gen}); + } + return plan->ToString(); } #if defined(ARROW_R_WITH_DATASET) From c4912a41799dcf7d80cf6792fcf10ca27d51ae43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 11:22:22 +0100 Subject: [PATCH 75/80] docs + comments to indicate the duplicate code in: * `ExecPlan_run` and `ExecPlan_BuildAndShow` C++ functions * `$Run()` and `$BuildAndShow()` R6 methods --- r/R/dplyr.R | 3 ++- r/R/query-engine.R | 9 +++++++++ r/man/show_exec_plan.Rd | 5 +++-- r/src/compute-exec.cpp | 11 +++++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index e888c2a30fd..fc0aafb46ed 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -224,7 +224,8 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' This is a function which gives more details about the logical query plan #' that will be executed when evaluating an `arrow_dplyr_query` object. #' It calls the C++ `ExecPlan` object's print method. -#' Functionally, it is similar to `dplyr::explain()`. +#' Functionally, it is similar to `dplyr::explain()`. This function is used as +#' the `dplyr::explain()` and `dplyr::show_query()` methods. #' #' @param x an `arrow_dplyr_query` to print the `ExecPlan` for. #' diff --git a/r/R/query-engine.R b/r/R/query-engine.R index af008487e02..222d2290a5e 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -193,6 +193,8 @@ ExecPlan <- R6Class("ExecPlan", node }, Run = function(node) { + # a section of this code is used by `BuildAndShow()` too - the 2 need to be in sync + # Start of chunk used in `BuildAndShow()` assert_is(node, "ExecNode") # Sorting and head/tail (if sorted) are handled in the SinkNode, @@ -210,6 +212,8 @@ ExecPlan <- R6Class("ExecPlan", sorting$orders <- as.integer(sorting$orders) } + # End of chunk used in `BuildAndShow()` + out <- ExecPlan_run( self, node, @@ -264,6 +268,9 @@ ExecPlan <- R6Class("ExecPlan", # ExecPlan_run and are not captured by the regulat print method. We take a # similar approach to expose them before calling the print method. BuildAndShow = function(node) { + # a section of this code is copied from `Run()` - the 2 need to be in sync + # Start of chunk copied from `Run()` + assert_is(node, "ExecNode") # Sorting and head/tail (if sorted) are handled in the SinkNode, @@ -281,6 +288,8 @@ ExecPlan <- R6Class("ExecPlan", sorting$orders <- as.integer(sorting$orders) } + # End of chunk copied from `Run()` + ExecPlan_BuildAndShow( self, node, diff --git a/r/man/show_exec_plan.Rd b/r/man/show_exec_plan.Rd index 63ed4a7ec66..c020838b2ed 100644 --- a/r/man/show_exec_plan.Rd +++ b/r/man/show_exec_plan.Rd @@ -16,10 +16,11 @@ show_exec_plan(x) This is a function which gives more details about the logical query plan that will be executed when evaluating an \code{arrow_dplyr_query} object. It calls the C++ \code{ExecPlan} object's print method. -Functionally, it is similar to \code{dplyr::explain()}. +Functionally, it is similar to \code{dplyr::explain()}. This function is used as +the \code{dplyr::explain()} and \code{dplyr::show_query()} methods. } \examples{ -\dontshow{if (arrow_with_dataset() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (arrow_with_dataset() && requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(dplyr) mtcars \%>\% arrow_table() \%>\% diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 43722c8d9a6..9fcd0b5d478 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -60,6 +60,9 @@ std::shared_ptr ExecPlan_run( const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { + // a section of this code is used by BuildAndShow too - the 2 need to be in sync + // Start of chunk used in ExecPlan_BuildAndShow + // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; @@ -88,6 +91,8 @@ std::shared_ptr ExecPlan_run( compute::SinkNodeOptions{&sink_gen}); } + // End of chunk used in ExecPlan_BuildAndShow + StopIfNotOk(plan->Validate()); StopIfNotOk(plan->StartProducing()); @@ -130,6 +135,9 @@ std::string ExecPlan_BuildAndShow( const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, int64_t head = -1) { + // a section of this code is copied from ExecPlan_run - the 2 need to be in sync + // Start of chunk copied from ExecPlan_run + // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; @@ -157,6 +165,9 @@ std::string ExecPlan_BuildAndShow( MakeExecNodeOrStop("sink", plan.get(), {final_node.get()}, compute::SinkNodeOptions{&sink_gen}); } + + // End of chunk copied from ExecPlan_run + return plan->ToString(); } From 5aaa3cdd45dd5784de1582d789ae59c1330d5bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 11:36:57 +0100 Subject: [PATCH 76/80] removed `ungroup()` call --- r/tests/testthat/test-dplyr-query.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index 4384ff5f188..a34ec9b5a96 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -492,7 +492,6 @@ test_that("show_exec_plan(), show_query() and explain()", { arrow_table() %>% group_by(lgl) %>% summarise(avg = mean(dbl, na.rm = TRUE)) %>% - ungroup() %>% show_exec_plan(), regexp = paste0( "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan From ce8b17e1134148203236fd1530071b6301931b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 12:06:55 +0100 Subject: [PATCH 77/80] clang lint --- r/src/compute-exec.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 9fcd0b5d478..5401019c3f9 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -131,10 +131,9 @@ std::shared_ptr ExecNode_output_schema( } // [[arrow::export]] -std::string ExecPlan_BuildAndShow( - const std::shared_ptr& plan, - const std::shared_ptr& final_node, cpp11::list sort_options, - int64_t head = -1) { +std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, + const std::shared_ptr& final_node, + cpp11::list sort_options, int64_t head = -1) { // a section of this code is copied from ExecPlan_run - the 2 need to be in sync // Start of chunk copied from ExecPlan_run From 0ffa30a46820955a020128944a0d1bffc8a2eb1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 12:26:59 +0100 Subject: [PATCH 78/80] comment --- r/tests/testthat/test-dataset-dplyr.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index b0bd61653fb..44a086393e2 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -342,6 +342,9 @@ test_that("dplyr method not implemented messages", { }) test_that("show_exec_plan(), show_query() and explain() with datasets", { + # show_query() and explain() are wrappers around show_exec_plan() and are not + # tested separately + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) # minimal test From 6a8c75346325cc6e47e1c0bcef2aaf3c82bb488e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 14:35:48 +0100 Subject: [PATCH 79/80] warn and don't build & print the ExecPlan when we have a nested query --- r/R/dplyr.R | 7 +++++++ r/tests/testthat/test-dataset-dplyr.R | 13 ++++--------- r/tests/testthat/test-dplyr-query.R | 14 ++++---------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/r/R/dplyr.R b/r/R/dplyr.R index fc0aafb46ed..e66fe6cb1e2 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -242,6 +242,13 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { show_exec_plan <- function(x) { adq <- as_adq(x) plan <- ExecPlan$create() + # do not show the plan if we have a nested query (as this will force the + # evaluation of the inner query/queries) + # TODO see if we can remove after ARROW-16628 + if (is_collapsed(x) && has_head_tail(x$.data)) { + warn("The `ExecPlan` cannot be printed for a nested query.") + return(invisible(x)) + } final_node <- plan$Build(adq) cat(plan$BuildAndShow(final_node)) invisible(x) diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 44a086393e2..b6982939ee3 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -406,19 +406,14 @@ test_that("show_exec_plan(), show_query() and explain() with datasets", { ) ) - expect_output( + # printing the ExecPlan for a nested query would currently force the + # evaluation of the inner one(s), which we want to avoid => no output + expect_warning( ds %>% filter(lgl) %>% arrange(chr) %>% head() %>% show_exec_plan(), - # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # - "ProjectNode.*", # output columns - "SourceNode" # entry point - ) + "The `ExecPlan` cannot be printed for a nested query." ) }) diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index a34ec9b5a96..0c87224c422 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -544,21 +544,15 @@ test_that("show_exec_plan(), show_query() and explain()", { ) ) - expect_output( + # printing the ExecPlan for a nested query would currently force the + # evaluation of the inner one(s), which we want to avoid => no output + expect_warning( mtcars %>% arrow_table() %>% filter(mpg > 20) %>% arrange(desc(wt)) %>% head(3) %>% show_exec_plan(), - # for some reason the FilterNode disappears when head/tail are involved + - # we do not have additional information regarding the SinkNode + - # the entry point is now a SourceNode and not a TableSourceNode - regexp = paste0( - "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan - "SinkNode.*", # - "ProjectNode.*", # output columns - "SourceNode.*" # entry point - ) + "The `ExecPlan` cannot be printed for a nested query." ) }) From 35e9ef8c4c4a218e4b04b501090afc96fad9b621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Jul 2022 14:50:57 +0100 Subject: [PATCH 80/80] comments in `ExecPlan_prepare` --- r/src/compute-exec.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index e095793eb09..91d646f0a3c 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -60,6 +60,10 @@ std::pair, std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) { + // a section of this code is copied and used in ExecPlan_BuildAndShow - the 2 need + // to be in sync + // Start of chunk used in ExecPlan_BuildAndShow + // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. arrow::AsyncGenerator> sink_gen; @@ -88,6 +92,8 @@ ExecPlan_prepare(const std::shared_ptr& plan, compute::SinkNodeOptions{&sink_gen}); } + // End of chunk used in ExecPlan_BuildAndShow + StopIfNotOk(plan->Validate()); // If the generator is destroyed before being completely drained, inform plan @@ -159,8 +165,8 @@ std::shared_ptr ExecNode_output_schema( std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options, int64_t head = -1) { - // a section of this code is copied from ExecPlan_run - the 2 need to be in sync - // Start of chunk copied from ExecPlan_run + // a section of this code is copied from ExecPlan_prepare - the 2 need to be in sync + // Start of chunk copied from ExecPlan_prepare // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. @@ -190,7 +196,7 @@ std::string ExecPlan_BuildAndShow(const std::shared_ptr& plan compute::SinkNodeOptions{&sink_gen}); } - // End of chunk copied from ExecPlan_run + // End of chunk copied from ExecPlan_prepare return plan->ToString(); }