apache · nealrichardson · Sep 14, 2021 · Sep 20, 2021 · Sep 29, 2021 · Sep 30, 2021
diff --git a/cpp/src/arrow/compute/exec/options.h b/cpp/src/arrow/compute/exec/options.h
@@ -155,9 +155,9 @@ class ARROW_EXPORT HashJoinNodeOptions : public ExecNodeOptions {
         output_all(true),
         output_prefix_for_left(std::move(output_prefix_for_left)),
         output_prefix_for_right(std::move(output_prefix_for_right)) {
-    key_cmp.resize(left_keys.size());
-    for (size_t i = 0; i < left_keys.size(); ++i) {
-      key_cmp[i] = JoinKeyCmp::EQ;
+    this->key_cmp.resize(this->left_keys.size());
+    for (size_t i = 0; i < this->left_keys.size(); ++i) {
+      this->key_cmp[i] = JoinKeyCmp::EQ;
     }
   }
   HashJoinNodeOptions(
@@ -174,9 +174,9 @@ class ARROW_EXPORT HashJoinNodeOptions : public ExecNodeOptions {
         right_output(std::move(right_output)),
         output_prefix_for_left(std::move(output_prefix_for_left)),
         output_prefix_for_right(std::move(output_prefix_for_right)) {
-    key_cmp.resize(left_keys.size());
-    for (size_t i = 0; i < left_keys.size(); ++i) {
-      key_cmp[i] = JoinKeyCmp::EQ;
+    this->key_cmp.resize(this->left_keys.size());
+    for (size_t i = 0; i < this->left_keys.size(); ++i) {
+      this->key_cmp[i] = JoinKeyCmp::EQ;
     }
   }
   HashJoinNodeOptions(

diff --git a/r/DESCRIPTION b/r/DESCRIPTION
@@ -92,6 +92,7 @@ Collate:
     'expression.R'
     'dplyr-functions.R'
     'dplyr-group-by.R'
+    'dplyr-join.R'
     'dplyr-mutate.R'
     'dplyr-select.R'
     'dplyr-summarize.R'

diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -142,6 +142,7 @@ export(HivePartitioning)
 export(HivePartitioningFactory)
 export(InMemoryDataset)
 export(IpcFileFormat)
+export(JoinType)
 export(JsonParseOptions)
 export(JsonReadOptions)
 export(JsonTableReader)

diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
@@ -36,7 +36,8 @@
       "select", "filter", "collect", "summarise", "group_by", "groups",
       "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
       "arrange", "rename", "pull", "relocate", "compute", "collapse",
-      "distinct"
+      "distinct", "left_join", "right_join", "inner_join", "full_join",
+      "semi_join", "anti_join"
     )
   )
   for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) {

diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
@@ -89,6 +89,14 @@ implicit_schema <- function(.data) {
 
   if (is.null(.data$aggregations)) {
     new_fields <- map(.data$selected_columns, ~ .$type(old_schm))
+    if (!is.null(.data$join) && !(.data$join$type %in% JoinType[1:4])) {
+      # Add cols from right side, except for semi/anti joins
+      right_cols <- .data$join$right_data$selected_columns
+      new_fields <- c(new_fields, map(
+        right_cols[setdiff(names(right_cols), .data$join$by)],
+        ~ .$type(.data$join$right_data$.data$schema)
+      ))
+    }
   } else {
     new_fields <- map(summarize_projection(.data), ~ .$type(old_schm))
     # * Put group_by_vars first (this can't be done by summarize,

diff --git a/r/R/dplyr-join.R b/r/R/dplyr-join.R
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+# Shared implementation behind the dplyr join methods defined in this file.
+# Both inputs are coerced with as_adq(), the join keys are resolved with
+# handle_join_by(), and the join spec (type, right-hand data, keys) is stored
+# on the left-hand query before that query is collapsed.
+# `join_type` is the name of a JoinType entry, e.g. "LEFT_OUTER".
+do_join <- function(x,
+                    y,
+                    by = NULL,
+                    copy = FALSE,
+                    suffix = c(".x", ".y"),
+                    ...,
+                    keep = FALSE,
+                    na_matches,
+                    join_type) {
+  # TODO: handle `copy` arg: ignore?
+  # TODO: handle `suffix` arg: Arrow does prefix
+  # TODO: handle `keep` arg: "Should the join keys from both ‘x’ and ‘y’ be preserved in the output?"
+  # TODO: handle `na_matches` arg
+  left <- as_adq(x)
+  right <- as_adq(y)
+  # Keys are resolved against the coerced queries, before the spec is built
+  join_keys <- handle_join_by(by, left, right)
+
+  left$join <- list(
+    type = JoinType[[join_type]],
+    right_data = right,
+    by = join_keys
+  )
+  collapse.arrow_dplyr_query(left)
+}
+
+# left_join() method: forwards its arguments to do_join() with join type
+# "LEFT_OUTER". The same function is reused for ArrowTabular and Dataset.
+left_join.arrow_dplyr_query <- function(x, y, by = NULL, copy = FALSE,
+                                        suffix = c(".x", ".y"), ...,
+                                        keep = FALSE) {
+  do_join(
+    x, y, by, copy, suffix, ...,
+    keep = keep,
+    join_type = "LEFT_OUTER"
+  )
+}
+left_join.ArrowTabular <- left_join.arrow_dplyr_query
+left_join.Dataset <- left_join.arrow_dplyr_query
+
+# right_join() method: forwards its arguments to do_join() with join type
+# "RIGHT_OUTER". The same function is reused for ArrowTabular and Dataset.
+right_join.arrow_dplyr_query <- function(x, y, by = NULL, copy = FALSE,
+                                         suffix = c(".x", ".y"), ...,
+                                         keep = FALSE) {
+  do_join(
+    x, y, by, copy, suffix, ...,
+    keep = keep,
+    join_type = "RIGHT_OUTER"
+  )
+}
+right_join.ArrowTabular <- right_join.arrow_dplyr_query
+right_join.Dataset <- right_join.arrow_dplyr_query
+
+# inner_join() method: forwards its arguments to do_join() with join type
+# "INNER". The same function is reused for ArrowTabular and Dataset.
+inner_join.arrow_dplyr_query <- function(x, y, by = NULL, copy = FALSE,
+                                         suffix = c(".x", ".y"), ...,
+                                         keep = FALSE) {
+  do_join(
+    x, y, by, copy, suffix, ...,
+    keep = keep,
+    join_type = "INNER"
+  )
+}
+inner_join.ArrowTabular <- inner_join.arrow_dplyr_query
+inner_join.Dataset <- inner_join.arrow_dplyr_query
+
+# full_join() method: forwards its arguments to do_join() with join type
+# "FULL_OUTER". The same function is reused for ArrowTabular and Dataset.
+full_join.arrow_dplyr_query <- function(x, y, by = NULL, copy = FALSE,
+                                        suffix = c(".x", ".y"), ...,
+                                        keep = FALSE) {
+  do_join(
+    x, y, by, copy, suffix, ...,
+    keep = keep,
+    join_type = "FULL_OUTER"
+  )
+}
+full_join.ArrowTabular <- full_join.arrow_dplyr_query
+full_join.Dataset <- full_join.arrow_dplyr_query
+
+# semi_join() method: forwards its arguments to do_join() with join type
+# "LEFT_SEMI". The same function is reused for ArrowTabular and Dataset.
+semi_join.arrow_dplyr_query <- function(x, y, by = NULL, copy = FALSE,
+                                        suffix = c(".x", ".y"), ...,
+                                        keep = FALSE) {
+  do_join(
+    x, y, by, copy, suffix, ...,
+    keep = keep,
+    join_type = "LEFT_SEMI"
+  )
+}
+semi_join.ArrowTabular <- semi_join.arrow_dplyr_query
+semi_join.Dataset <- semi_join.arrow_dplyr_query
+
+# anti_join() method: forwards its arguments to do_join() with join type
+# "LEFT_ANTI". The same function is reused for ArrowTabular and Dataset.
+anti_join.arrow_dplyr_query <- function(x, y, by = NULL, copy = FALSE,
+                                        suffix = c(".x", ".y"), ...,
+                                        keep = FALSE) {
+  do_join(
+    x, y, by, copy, suffix, ...,
+    keep = keep,
+    join_type = "LEFT_ANTI"
+  )
+}
+anti_join.ArrowTabular <- anti_join.arrow_dplyr_query
+anti_join.Dataset <- anti_join.arrow_dplyr_query
+
+# Normalize the `by` argument of a join into a named character vector whose
+# names are columns of `x` (left side) and whose values are columns of `y`
+# (right side).
+#
+# * `by = NULL`: join on every column name that `x` and `y` have in common.
+# * unnamed character vector: the same column names are used on both sides.
+# * named character vector, e.g. c(a = "b"): left column `a` matches right
+#   column `b`. Elements without a name, as in c("a", b = "c"), use their
+#   value on both sides, matching how dplyr treats a partially named `by`.
+#
+# Stops if `by` is not a character vector or if it refers to columns that are
+# not present in `x` (names) or `y` (values).
+handle_join_by <- function(by, x, y) {
+  if (is.null(by)) {
+    return(set_names(intersect(names(x), names(y))))
+  }
+  stopifnot(is.character(by))
+  if (is.null(names(by))) {
+    by <- set_names(by)
+  } else {
+    # A partially named vector carries "" as the name of its unnamed elements;
+    # fill those in from the values so they pass the names(x) check below
+    unnamed <- !nzchar(names(by))
+    names(by)[unnamed] <- by[unnamed]
+  }
+  # TODO: nicer messages?
+  stopifnot(
+    all(names(by) %in% names(x)),
+    all(by %in% names(y))
+  )
+  by
+}
diff --git a/r/R/enums.R b/r/R/enums.R
@@ -163,3 +163,16 @@ RoundMode <- enum("RoundMode",
   HALF_TO_EVEN = 8L,
   HALF_TO_ODD = 9L
 )
+
+#' @export
+#' @rdname enums
+JoinType <- enum("JoinType",
+  # Order matters: the four semi/anti joins come first, and implicit_schema()
+  # picks them out positionally with JoinType[1:4] to decide whether the
+  # right-hand columns are part of the output schema.
+  # NOTE(review): the integer values presumably mirror the C++ join type enum
+  # used by the hash join node -- confirm before adding or reordering entries.
+  LEFT_SEMI = 0L,
+  RIGHT_SEMI = 1L,
+  LEFT_ANTI = 2L,
+  RIGHT_ANTI = 3L,
+  INNER = 4L,
+  LEFT_OUTER = 5L,
+  RIGHT_OUTER = 6L,
+  FULL_OUTER = 7L
+)
diff --git a/r/R/query-engine.R b/r/R/query-engine.R
@@ -143,7 +143,15 @@ ExecPlan <- R6Class("ExecPlan",
             )
           }
         }
-      } else {
+      } else if (!is.null(.data$join)) {
+        node <- node$Join(
+          type = .data$join$type,
+          right_node = self$Build(.data$join$right_data),
+          by = .data$join$by,
+          left_output = names(.data),
+          right_output = setdiff(names(.data$join$right_data), .data$join$by)
+        )
+      } else if (length(node$schema)) {
         # If any columns are derived, reordered, or renamed we need to Project
         # If there are aggregations, the projection was already handled above
         # We have to project at least once to eliminate some junk columns
@@ -206,6 +214,22 @@ ExecNode <- R6Class("ExecNode",
       self$preserve_sort(
         ExecNode_Aggregate(self, options, target_names, out_field_names, key_names)
       )
+    },
+    # Create a join node with this node as the left input and `right_node` as
+    # the right input, by calling ExecNode_Join() and passing its result
+    # through self$preserve_sort().
+    #  * type: join type; the caller in ExecPlan$Build passes a JoinType value
+    #  * by: named character vector; its names are used as the left keys and
+    #    its values as the right keys
+    #  * left_output, right_output: forwarded unchanged to ExecNode_Join()
+    #    (presumably the column names to emit from each side -- confirm
+    #    against the C++ binding)
+    Join = function(type, right_node, by, left_output, right_output) {
+      self$preserve_sort(
+        ExecNode_Join(
+          self,
+          type,
+          right_node,
+          left_keys = names(by),
+          right_keys = by,
+          left_output = left_output,
+          right_output = right_output
+        )
+      )
     }
+  ),
+  active = list(
+    schema = function() ExecNode_output_schema(self)
   )
 )
diff --git a/r/man/enums.Rd b/r/man/enums.Rd
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp