From 785ab5f3c3aa4e5f2e30655d4765ca729bee7aa2 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 22 Dec 2022 19:19:36 +0000 Subject: [PATCH 1/7] Keep right join keys initially --- r/R/dplyr-join.R | 13 ++++++++++++- r/tests/testthat/test-dplyr-join.R | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-join.R b/r/R/dplyr-join.R index cb27c478b65..2fe328b938d 100644 --- a/r/R/dplyr-join.R +++ b/r/R/dplyr-join.R @@ -72,7 +72,18 @@ right_join.arrow_dplyr_query <- function(x, suffix = c(".x", ".y"), ..., keep = FALSE) { - do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "RIGHT_OUTER") + + # Initially keep join keys so we can coalesce them after when keep=FALSE + query <- do_join(x, y, by, copy, suffix, ..., keep = TRUE, join_type = "RIGHT_OUTER") + + # If we are doing a right outer join and not keeping the join keys of + # both sides, we need to coalesce. Otherwise, rows that exist in the + # RHS will have NAs for the join keys. + if (!keep) { + query$selected_columns <- post_join_projection(names(x), names(y), handle_join_by(by, x, y), suffix) + } + + query } right_join.Dataset <- right_join.ArrowTabular <- right_join.RecordBatchReader <- right_join.arrow_dplyr_query diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R index f0d3c8605dc..903e91813d2 100644 --- a/r/tests/testthat/test-dplyr-join.R +++ b/r/tests/testthat/test-dplyr-join.R @@ -371,3 +371,24 @@ test_that("full joins handle keep", { ) } }) + +test_that("right_join correctly coalesces keys", { + tbl1 <- tibble::tibble(x = 1:10, y = letters[1:10]) + tbl2 <- tibble::tibble(x = 1:3, z = letters[11:13]) + + compare_dplyr_binding( + .input %>% + right_join(tbl1, by = "x") %>% + arrange(x) %>% + collect(), + tbl2 + ) + + compare_dplyr_binding( + .input %>% + right_join(tbl1, by = "x", keep = TRUE) %>% + arrange(x.x) %>% + collect(), + tbl2 + ) +}) From 816a7c8feb36ffee739a1704c076535cf35b4dc3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 3 Jan 2023 08:13:51 +0000 Subject: [PATCH 2/7] Remove for loops from tests --- r/tests/testthat/test-dplyr-join.R | 107 ++++++++++++++++------------- 1 file changed, 60 insertions(+), 47 deletions(-) diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R index 903e91813d2..941ed6975ec 100644 --- a/r/tests/testthat/test-dplyr-join.R +++ b/r/tests/testthat/test-dplyr-join.R @@ -141,47 +141,74 @@ test_that("Error handling", { # TODO: casting: int and float columns? test_that("right_join", { - for (keep in c(TRUE, FALSE)) { - compare_dplyr_binding( + + compare_dplyr_binding( .input %>% - right_join(to_join, by = "some_grouping", keep = !!keep) %>% + right_join(to_join, by = "some_grouping", keep = TRUE) %>% collect(), left ) - } -}) -test_that("inner_join", { - for (keep in c(TRUE, FALSE)) { - compare_dplyr_binding( + compare_dplyr_binding( .input %>% - inner_join(to_join, by = "some_grouping", keep = !!keep) %>% + right_join(to_join, by = "some_grouping", keep = FALSE) %>% collect(), left ) - } +}) + +test_that("inner_join", { + + compare_dplyr_binding( + .input %>% + inner_join(to_join, by = "some_grouping", keep = TRUE) %>% + collect(), + left + ) + + compare_dplyr_binding( + .input %>% + inner_join(to_join, by = "some_grouping", keep = FALSE) %>% + collect(), + left + ) + }) test_that("full_join", { - for (keep in c(TRUE, FALSE)) { - compare_dplyr_binding( - .input %>% - full_join(to_join, by = "some_grouping", keep = !!keep) %>% - collect(), - left - ) - } + + compare_dplyr_binding( + .input %>% + full_join(to_join, by = "some_grouping", keep = TRUE) %>% + collect(), + left + ) + + compare_dplyr_binding( + .input %>% + full_join(to_join, by = "some_grouping", keep = FALSE) %>% + collect(), + left + ) + }) test_that("semi_join", { - for (keep in c(TRUE, FALSE)) { - compare_dplyr_binding( - .input %>% - semi_join(to_join, by = "some_grouping", keep = !!keep) %>% - collect(), - left - ) - } + + compare_dplyr_binding( + .input %>% + semi_join(to_join, by = "some_grouping", keep = TRUE) %>% + collect(), + left + ) + + compare_dplyr_binding( + .input %>% + semi_join(to_join, by = "some_grouping", keep = FALSE) %>% + collect(), + left + ) + }) test_that("anti_join", { @@ -361,34 +388,20 @@ test_that("full joins handle keep", { z = 6:10 ) - for (keep in c(TRUE, FALSE)) { - compare_dplyr_binding( - .input %>% - full_join(full_data_df, by = c("y", "x"), keep = !!keep) %>% - arrange(index) %>% - collect(), - small_dataset_df - ) - } -}) - -test_that("right_join correctly coalesces keys", { - tbl1 <- tibble::tibble(x = 1:10, y = letters[1:10]) - tbl2 <- tibble::tibble(x = 1:3, z = letters[11:13]) - compare_dplyr_binding( .input %>% - right_join(tbl1, by = "x") %>% - arrange(x) %>% + full_join(full_data_df, by = c("y", "x"), keep = TRUE) %>% + arrange(index) %>% collect(), - tbl2 + small_dataset_df ) compare_dplyr_binding( .input %>% - right_join(tbl1, by = "x", keep = TRUE) %>% - arrange(x.x) %>% + full_join(full_data_df, by = c("y", "x"), keep = FALSE) %>% + arrange(index) %>% collect(), - tbl2 + small_dataset_df ) + }) From a24c5fe323a1fa00d5d9e47a3f47a6bbe17a8ec8 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 3 Jan 2023 08:14:15 +0000 Subject: [PATCH 3/7] Run styler --- r/tests/testthat/test-dplyr-join.R | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R index 941ed6975ec..9f22e42f480 100644 --- a/r/tests/testthat/test-dplyr-join.R +++ b/r/tests/testthat/test-dplyr-join.R @@ -141,24 +141,22 @@ test_that("Error handling", { # TODO: casting: int and float columns? test_that("right_join", { - compare_dplyr_binding( - .input %>% - right_join(to_join, by = "some_grouping", keep = TRUE) %>% - collect(), - left - ) + .input %>% + right_join(to_join, by = "some_grouping", keep = TRUE) %>% + collect(), + left + ) compare_dplyr_binding( - .input %>% - right_join(to_join, by = "some_grouping", keep = FALSE) %>% - collect(), - left - ) + .input %>% + right_join(to_join, by = "some_grouping", keep = FALSE) %>% + collect(), + left + ) }) test_that("inner_join", { - compare_dplyr_binding( .input %>% inner_join(to_join, by = "some_grouping", keep = TRUE) %>% @@ -172,11 +170,9 @@ test_that("inner_join", { collect(), left ) - }) test_that("full_join", { - compare_dplyr_binding( .input %>% full_join(to_join, by = "some_grouping", keep = TRUE) %>% @@ -190,11 +186,9 @@ test_that("full_join", { collect(), left ) - }) test_that("semi_join", { - compare_dplyr_binding( .input %>% semi_join(to_join, by = "some_grouping", keep = TRUE) %>% @@ -208,7 +202,6 @@ test_that("semi_join", { collect(), left ) - }) test_that("anti_join", { @@ -403,5 +396,4 @@ test_that("full joins handle keep", { collect(), small_dataset_df ) - }) From 7bbfa753568f899c844a61d47ff5f49e48083280 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 3 Jan 2023 08:15:54 +0000 Subject: [PATCH 4/7] Ensure join keys don't fully overlap --- r/tests/testthat/test-dplyr-join.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R index 9f22e42f480..4dad3e0a05c 100644 --- a/r/tests/testthat/test-dplyr-join.R +++ b/r/tests/testthat/test-dplyr-join.R @@ -21,8 +21,8 @@ left <- example_data left$some_grouping <- rep(c(1, 2), 5) to_join <- tibble::tibble( - some_grouping = c(1, 2), - capital_letters = c("A", "B"), + some_grouping = c(1, 2, 3), + capital_letters = c("A", "B", "C"), another_column = TRUE ) From 65275ba33f84a0094935e109b480bcb0da0edc99 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 3 Jan 2023 09:24:57 +0000 Subject: [PATCH 5/7] Try with suggestions from code review --- r/R/dplyr-join.R | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/r/R/dplyr-join.R b/r/R/dplyr-join.R index 2fe328b938d..6470f7f6d98 100644 --- a/r/R/dplyr-join.R +++ b/r/R/dplyr-join.R @@ -35,8 +35,13 @@ do_join <- function(x, # For outer joins, we need to output the join keys on both sides so we # can coalesce them afterwards. - left_output <- names(x) - right_output <- if (keep || join_type == "FULL_OUTER") { + left_output <- if (join_type == "RIGHT_OUTER") { + setdiff(names(x), by) + } else { + names(x) + } + + right_output <- if (keep || join_type %in% c("FULL_OUTER", "RIGHT_OUTER")) { names(y) } else { setdiff(names(y), by) @@ -73,17 +78,8 @@ right_join.arrow_dplyr_query <- function(x, ..., keep = FALSE) { - # Initially keep join keys so we can coalesce them after when keep=FALSE - query <- do_join(x, y, by, copy, suffix, ..., keep = TRUE, join_type = "RIGHT_OUTER") - - # If we are doing a right outer join and not keeping the join keys of - # both sides, we need to coalesce. Otherwise, rows that exist in the - # RHS will have NAs for the join keys. - if (!keep) { - query$selected_columns <- post_join_projection(names(x), names(y), handle_join_by(by, x, y), suffix) - } + do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "RIGHT_OUTER") - query } right_join.Dataset <- right_join.ArrowTabular <- right_join.RecordBatchReader <- right_join.arrow_dplyr_query From 2143d451b8be970ece96d2a6ed5918abe4d60f62 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 4 Jan 2023 09:00:13 +0000 Subject: [PATCH 6/7] Add !keep --- r/R/dplyr-join.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-join.R b/r/R/dplyr-join.R index 6470f7f6d98..f57e2bfec9d 100644 --- a/r/R/dplyr-join.R +++ b/r/R/dplyr-join.R @@ -35,7 +35,7 @@ do_join <- function(x, # For outer joins, we need to output the join keys on both sides so we # can coalesce them afterwards. - left_output <- if (join_type == "RIGHT_OUTER") { + left_output <- if (!keep && join_type == "RIGHT_OUTER") { setdiff(names(x), by) } else { names(x) From e79f657fda998603cbe6715c44321bede52bb414 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 4 Jan 2023 09:05:33 +0000 Subject: [PATCH 7/7] Run styler --- r/R/dplyr-join.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/R/dplyr-join.R b/r/R/dplyr-join.R index f57e2bfec9d..bdc8a2940db 100644 --- a/r/R/dplyr-join.R +++ b/r/R/dplyr-join.R @@ -77,9 +77,7 @@ right_join.arrow_dplyr_query <- function(x, suffix = c(".x", ".y"), ..., keep = FALSE) { - do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "RIGHT_OUTER") - } right_join.Dataset <- right_join.ArrowTabular <- right_join.RecordBatchReader <- right_join.arrow_dplyr_query