diff --git a/NEWS.md b/NEWS.md index d553038998..f1e99577ca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -80,6 +80,8 @@ 9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. +10. It is now possible to join two tables on their common columns, a so-called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Use `on=.NATURAL` or `options("datatable.naturaljoin"=TRUE)`. The latter works only when `x` has no key; if a key is present, the key columns are used to join as before. Thanks to David Kulp for the request. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. diff --git a/R/data.table.R b/R/data.table.R index 324b5583c0..b4d0346c9d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -481,8 +481,23 @@ replace_dot_alias <- function(e) { i = as.data.table(i) } if (is.data.table(i)) { - if (!haskey(x) && missing(on)) { - stop("When i is a data.table (or character vector), the columns to join by must be specified either using 'on=' argument (see ?data.table) or by keying x (i.e. sorted, and, marked as sorted, see ?setkey). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") + naturaljoin = FALSE + if (missing(on)) { + if (!haskey(x)) { + if (getOption("datatable.naturaljoin")) naturaljoin = TRUE + else stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. 
sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") + } + } else if (identical(substitute(on), as.name(".NATURAL"))) naturaljoin = TRUE + if (naturaljoin) { # natural join #629 + common_names = intersect(names(x), names(i)) + len_common_names = length(common_names) + if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables") + if (verbose) { + which_cols_msg = if (len_common_names == length(x)) " all 'x' columns" + else paste(":", brackify(common_names)) + cat("Joining but 'x' has no key, natural join using", which_cols_msg, "\n", sep = "") + } + on = common_names } if (!missing(on)) { # on = .() is now possible, #1257 diff --git a/R/onLoad.R b/R/onLoad.R index 82fa26d95f..dd41f994cf 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -58,6 +58,7 @@ "datatable.use.index"="TRUE", # global switch to address #1422 "datatable.prettyprint.char" = NULL, # FR #1091 "datatable.old.unique.by.key" = "FALSE" # TODO: change warnings in duplicated.R to error on or after May 2019 then remove a year after that. 
+ ,"datatable.naturaljoin" = "FALSE" # natural join, when set to TRUE then `on` defaults to `.NATURAL` ) for (i in setdiff(names(opts),names(options()))) { eval(parse(text=paste0("options(",i,"=",opts[i],")"))) diff --git a/R/test.data.table.R b/R/test.data.table.R index 4dcc5b2798..3674937563 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -249,7 +249,7 @@ test <- function(num,x,y=TRUE,error=NULL,warning=NULL,output=NULL,message=NULL) } else { memtest = FALSE # nocov filename = NA_character_ # nocov - foreign = FALSE # nocov + foreign = FALSE # nocov ; assumes users of 'cc(F); test(...)' has LANGUAGE=en } if (!missing(error) && !missing(y)) stop("Test ",numStr," is invalid: when error= is provided it does not make sense to pass y as well") # nocov diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 52c7bd3376..812a970c87 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -94,7 +94,8 @@ oldOptions = options( datatable.verbose = FALSE, datatable.alloccol = 1024L, datatable.print.class = FALSE, # This is TRUE in cc.R and we like TRUE. But output= tests need to be updated (they assume FALSE currently) - datatable.rbindlist.check = NULL + datatable.rbindlist.check = NULL, + datatable.naturaljoin = FALSE ) # some tests (e.g. 1066, 1293) rely on capturing output that will be garbled with small width if (getOption('width') < 80L) options(width = 80L) @@ -364,7 +365,7 @@ TESTDT = data.table(NULL) test(122, TESTDT[1], TESTDT) test(123, TESTDT[0], TESTDT) test(124, TESTDT[1:10], TESTDT) -test(125, TESTDT["k"], error="the columns to join by must be specified either using") +test(125, TESTDT["k"], error="the columns to join by must be specified using") # test 126 no longer needed now that test() has 'error' argument TESTDT = data.table(a=3L,v=2L,key="a") # testing 1-row table @@ -1207,7 +1208,7 @@ test(415, x:=1, error="defined for use in j, once only and in particular ways") # Somehow never tested that X[Y] is error if X is unkeyed. 
DT = data.table(a=1:3,b=4:6) -test(416, DT[J(2)], error="the columns to join by must be specified either using") +test(416, DT[J(2)], error="the columns to join by must be specified using") # Test shallow copy verbose message from := adding a column, and (TO DO) only when X is NAMED. DT = data.table(a=1:3,b=4:6) @@ -14739,6 +14740,35 @@ test(2044.84, dt1[dt2, on="b==a", verbose=TRUE], data.table(a=NA_rea test(2044.85, dt1[dt2, on="b==a", nomatch=0L, verbose=TRUE], data.table(a=double(), b=integer(), i.b=logical()), output=msg) +# natural join #629 +d1 = data.table(id1=rep(1L,3), id2=2:4, v1=1:3) +d2 = data.table(id1=rep(1L,3), id2=3:5, v2=3:1) +ans = data.table(id1=rep(1L, 3), id2=3:5, v1=c(2:3,NA_integer_), v2=3:1) +test(2045.01, d1[d2], error="columns to join by must be specified") +test(2045.02, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") +options(datatable.naturaljoin=TRUE) +test(2045.03, d1[d2, on=.(id1,id2)], ans) +test(2045.04, d1[d2, on=.(id1,id2), nomatch=NULL], ans[1:2]) +test(2045.05, d1[d2, verbose=TRUE], ans, output="natural join using: [id1, id2]") +test(2045.06, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") +test(2045.07, d1[d2, nomatch=NULL, verbose=TRUE], ans[1:2], output="natural join using: [id1, id2]") +setkey(d1, id1) +test(2045.08, nrow(d1[d2, allow.cartesian=TRUE]), 9L) # join +test(2045.09, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") # ignore key when on=.NATURAL +setkey(d1, NULL) +setnames(d2, c("a","b","c")) +test(2045.10, d1[d2], error="Attempting to do natural join but no common columns in provided tables") +test(2045.11, d1[d2, on=.NATURAL], error="Attempting to do natural join but no common columns in provided tables") +d2 = data.table(id1=2:4, id2=letters[3:5], v2=3:1) +test(2045.12, d1[d2, on=.(id1,id2)], error="Incompatible join types: x.id2 (integer) and i.id2 (character)") +test(2045.13, d1[d2, verbose=TRUE], 
output="natural join", error="Incompatible join types: x.id2 (integer) and i.id2 (character)") +test(2045.14, d1[d1, verbose=TRUE], d1, output="natural join using all 'x' columns") +d1 = setDT(replicate(20L, 1L, simplify = FALSE)) +d2 = copy(d1[ , 1:15]) +setnames(d2, 1L, 'X1') +test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural join using: \\[.*[.]{3}\\]") +options(datatable.naturaljoin=FALSE) + ################################### # Add new tests above this line # diff --git a/man/data.table.Rd b/man/data.table.Rd index 16cb39cd3b..87d302ecf6 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -154,7 +154,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{drop}{ Never used by \code{data.table}. Do not use. It needs to be here because \code{data.table} inherits from \code{data.frame}. See \href{vignettes/datatable-faq.html}{datatable-faq}.} - \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. There are multiple ways of specifying the \code{on} argument: + \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. When the \code{.NATURAL} keyword is provided, a \emph{natural join} is made (a join on the columns common to \code{x} and \code{i}). Optionally, when option \code{"datatable.naturaljoin"=TRUE} is set, \code{on} is missing and \code{x} has no key, \code{on} defaults to \code{.NATURAL}. 
There are multiple ways of specifying the \code{on} argument: \itemize{ \item{As an unnamed character vector, e.g., \code{X[Y, on=c("a", "b")]}, used when columns \code{a} and \code{b} are common to both \code{X} and \code{Y}.} \item{\emph{Foreign key joins}: As a \emph{named} character vector when the join columns have different names in \code{X} and \code{Y}.