Rdatatable · mattdowle · May 22, 2019 · May 18, 2019 · May 18, 2019 · May 20, 2019
@@ -80,6 +80,8 @@
 
 9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code.
 
+10. It is now possible to join two tables on their common columns, a so-called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Use `on=.NATURAL` or `options("datatable.naturaljoin"=TRUE)`. The latter only takes effect when `on` is missing and `x` has no key; if a key is present, the key columns are used to join as before. Thanks to David Kulp for the request.
+
 #### BUG FIXES
 
 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting.

@@ -481,8 +481,23 @@ replace_dot_alias <- function(e) {
       i = as.data.table(i)
     }
     if (is.data.table(i)) {
-      if (!haskey(x) && missing(on)) {
-        stop("When i is a data.table (or character vector), the columns to join by must be specified either using 'on=' argument (see ?data.table) or by keying x (i.e. sorted, and, marked as sorted, see ?setkey). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
+      naturaljoin = FALSE
+      if (missing(on)) {
+        if (!haskey(x)) {
+          if (getOption("datatable.naturaljoin")) naturaljoin = TRUE
+          else stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
+        }
+      } else if (identical(substitute(on), as.name(".NATURAL"))) naturaljoin = TRUE
+      if (naturaljoin) { # natural join #629
+        common_names = intersect(names(x), names(i))
+        len_common_names = length(common_names)
+        if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables")
+        if (verbose) {
+          which_cols_msg = if (len_common_names == length(x)) " all 'x' columns"
+          else paste(":", brackify(common_names))
+          cat("Joining but 'x' has no key, natural join using", which_cols_msg, "\n", sep = "")
+        }
+        on = common_names
       }
       if (!missing(on)) {
         # on = .() is now possible, #1257

@@ -58,6 +58,7 @@
        "datatable.use.index"="TRUE",           # global switch to address #1422
        "datatable.prettyprint.char" = NULL,     # FR #1091
        "datatable.old.unique.by.key" = "FALSE"  # TODO: change warnings in duplicated.R to error on or after May 2019 then remove a year after that.
+       ,"datatable.naturaljoin" = "FALSE"      # natural join, when set to TRUE then `on` defaults to `.NATURAL`
        )
   for (i in setdiff(names(opts),names(options()))) {
     eval(parse(text=paste0("options(",i,"=",opts[i],")")))

@@ -249,7 +249,7 @@ test <- function(num,x,y=TRUE,error=NULL,warning=NULL,output=NULL,message=NULL)
   } else {
     memtest = FALSE          # nocov
     filename = NA_character_ # nocov
-    foreign = FALSE          # nocov
+    foreign = FALSE          # nocov ; assumes users of 'cc(F); test(...)' has LANGUAGE=en
   }
   if (!missing(error) && !missing(y))
     stop("Test ",numStr," is invalid: when error= is provided it does not make sense to pass y as well")  # nocov

@@ -94,7 +94,8 @@ oldOptions = options(
   datatable.verbose = FALSE,
   datatable.alloccol = 1024L,
   datatable.print.class = FALSE,  #  This is TRUE in cc.R and we like TRUE. But output= tests need to be updated (they assume FALSE currently)
-  datatable.rbindlist.check = NULL
+  datatable.rbindlist.check = NULL,
+  datatable.naturaljoin = FALSE
 )
 # some tests (e.g. 1066, 1293) rely on capturing output that will be garbled with small width
 if (getOption('width') < 80L) options(width = 80L)
@@ -364,7 +365,7 @@ TESTDT = data.table(NULL)
 test(122, TESTDT[1], TESTDT)
 test(123, TESTDT[0], TESTDT)
 test(124, TESTDT[1:10], TESTDT)
-test(125, TESTDT["k"], error="the columns to join by must be specified either using")
+test(125, TESTDT["k"], error="the columns to join by must be specified using")
 # test 126 no longer needed now that test() has 'error' argument
 
 TESTDT = data.table(a=3L,v=2L,key="a")  # testing 1-row table
@@ -1207,7 +1208,7 @@ test(415, x:=1, error="defined for use in j, once only and in particular ways")
 
 # Somehow never tested that X[Y] is error if X is unkeyed.
 DT = data.table(a=1:3,b=4:6)
-test(416, DT[J(2)], error="the columns to join by must be specified either using")
+test(416, DT[J(2)], error="the columns to join by must be specified using")
 
 # Test shallow copy verbose message from := adding a column, and (TO DO) only when X is NAMED.
 DT = data.table(a=1:3,b=4:6)
@@ -14739,6 +14740,35 @@ test(2044.84, dt1[dt2, on="b==a",             verbose=TRUE], data.table(a=NA_rea
 test(2044.85, dt1[dt2, on="b==a", nomatch=0L, verbose=TRUE], data.table(a=double(), b=integer(), i.b=logical()),
               output=msg)
 
+# natural join #629
+d1 = data.table(id1=rep(1L,3), id2=2:4, v1=1:3)
+d2 = data.table(id1=rep(1L,3), id2=3:5, v2=3:1)
+ans = data.table(id1=rep(1L, 3), id2=3:5, v1=c(2:3,NA_integer_), v2=3:1)
+test(2045.01, d1[d2], error="columns to join by must be specified")
+test(2045.02, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]")
+options(datatable.naturaljoin=TRUE)
+test(2045.03, d1[d2, on=.(id1,id2)], ans)
+test(2045.04, d1[d2, on=.(id1,id2), nomatch=NULL], ans[1:2])
+test(2045.05, d1[d2, verbose=TRUE], ans, output="natural join using: [id1, id2]")
+test(2045.06, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]")
+test(2045.07, d1[d2, nomatch=NULL, verbose=TRUE], ans[1:2], output="natural join using: [id1, id2]")
+setkey(d1, id1)
+test(2045.08, nrow(d1[d2, allow.cartesian=TRUE]), 9L) # join
+test(2045.09, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") # ignore key when on=.NATURAL
+setkey(d1, NULL)
+setnames(d2, c("a","b","c"))
+test(2045.10, d1[d2], error="Attempting to do natural join but no common columns in provided tables")
+test(2045.11, d1[d2, on=.NATURAL], error="Attempting to do natural join but no common columns in provided tables")
+d2 = data.table(id1=2:4, id2=letters[3:5], v2=3:1)
+test(2045.12, d1[d2, on=.(id1,id2)], error="Incompatible join types: x.id2 (integer) and i.id2 (character)")
+test(2045.13, d1[d2, verbose=TRUE], output="natural join", error="Incompatible join types: x.id2 (integer) and i.id2 (character)")
+test(2045.14, d1[d1, verbose=TRUE], d1, output="natural join using all 'x' columns")
+d1 = setDT(replicate(20L, 1L, simplify = FALSE))
+d2 = copy(d1[ , 1:15])
+setnames(d2, 1L, 'X1')
+test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural join using: \\[.*[.]{3}\\]")
+options(datatable.naturaljoin=FALSE)
+
 
 ###################################
 #  Add new tests above this line  #

@@ -154,7 +154,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
 
   \item{drop}{ Never used by \code{data.table}. Do not use. It needs to be here because \code{data.table} inherits from \code{data.frame}. See \href{vignettes/datatable-faq.html}{datatable-faq}.}
 
-  \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. There are multiple ways of specifying the \code{on} argument:
+  \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. When the \code{.NATURAL} keyword is provided, a \emph{natural join} is performed (a join on the columns common to \code{x} and \code{i}). Optionally, when option \code{"datatable.naturaljoin"=TRUE} is set, \code{on} is missing and \code{x} has no key, \code{on} defaults to \code{.NATURAL}. There are multiple ways of specifying the \code{on} argument:
         \itemize{
             \item{As an unnamed character vector, e.g., \code{X[Y, on=c("a", "b")]}, used when columns \code{a} and \code{b} are common to both \code{X} and \code{Y}.}
             \item{\emph{Foreign key joins}: As a \emph{named} character vector when the join columns have different names in \code{X} and \code{Y}.