From de6184529e82dabc2b89a202a3ecb720ae282ff3 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 18 May 2019 16:24:24 +0530 Subject: [PATCH 1/4] add natural join support, closes #629 --- NEWS.md | 2 ++ R/data.table.R | 4 ++++ R/test.data.table.R | 1 + inst/tests/tests.Rraw | 12 ++++++++++++ man/data.table.Rd | 2 +- 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 719c9b52dc..9e1b08f99f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -80,6 +80,8 @@ 9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. +10. When doing join but `x` data.table has no key and `on` argument is missing, it will attempt to do _natural join_ thus set `on` to common columns across both data.tables, [#629](https://github.com/Rdatatable/data.table/issues/629). Thanks to David Kulp for request. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. 
diff --git a/R/data.table.R b/R/data.table.R index 0419d7bc47..877434787b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -483,6 +483,10 @@ replace_dot_alias <- function(e) { i = as.data.table(i) } if (is.data.table(i)) { + if (!haskey(x) && missing(on) && length(common_names <- intersect(names(x), names(i)))) { + if (verbose) cat("Joining but 'x' has no key and 'on' is missing, natural join using common columns\n") + on = common_names # natural join #629 + } if (!haskey(x) && missing(on) && is.null(xo)) { stop("When i is a data.table (or character vector), the columns to join by must be specified either using 'on=' argument (see ?data.table) or by keying x (i.e. sorted, and, marked as sorted, see ?setkey). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") } diff --git a/R/test.data.table.R b/R/test.data.table.R index 7f161b3bdc..d2573aa4cb 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -249,6 +249,7 @@ test <- function(num,x,y=TRUE,error=NULL,warning=NULL,output=NULL,message=NULL) } else { memtest = FALSE # nocov filename = NA_character_ # nocov + foreign = FALSE # nocov # assumes users of 'cc(F); test(...)' has LANGUAGE=en } if (!missing(error) && !missing(y)) stop("Test ",numStr," is invalid: when error= is provided it does not make sense to pass y as well") # nocov diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 749fd54f9c..ed019723a6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14613,6 +14613,18 @@ test(2043.4, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd z = 1:4 test(2043.5, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z +# natural join #629 +d1 = data.table(id1=1:3, id2=2:4, v1=1:3) +d2 = data.table(id1=2:4, id2=3:5, v2=3:1) +ans = data.table(id1=2:4, id2=3:5, v1=c(2:3,NA_integer_), v2=3:1) +test(2044.1, d1[d2, on=.(id1,id2)], ans) +test(2044.2, d1[d2, 
on=.(id1,id2), nomatch=NULL], ans[1:2]) +test(2044.3, d1[d2, verbose=TRUE], ans, output="natural join") +test(2044.4, d1[d2, nomatch=NULL, verbose=TRUE], ans[1:2], output="natural join") +d2 = data.table(id1=2:4, id2=letters[3:5], v2=3:1) +test(2044.5, d1[d2, on=.(id1,id2)], error="typeof.*integer.*typeof.*character") +test(2044.6, d1[d2, verbose=TRUE], output="natural join", error="typeof.*integer.*typeof.*character") + ################################### # Add new tests above this line # diff --git a/man/data.table.Rd b/man/data.table.Rd index 16cb39cd3b..065418c0c3 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -154,7 +154,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{drop}{ Never used by \code{data.table}. Do not use. It needs to be here because \code{data.table} inherits from \code{data.frame}. See \href{vignettes/datatable-faq.html}{datatable-faq}.} - \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. There are multiple ways of specifying the \code{on} argument: + \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. When missing and \code{x} has no key then it will attempt to perform \emph{natural join} thus set \code{on} to columns that are common in both tables. 
There are multiple ways of specifying the \code{on} argument: \itemize{ \item{As an unnamed character vector, e.g., \code{X[Y, on=c("a", "b")]}, used when columns \code{a} and \code{b} are common to both \code{X} and \code{Y}.} \item{\emph{Foreign key joins}: As a \emph{named} character vector when the join columns have different names in \code{X} and \code{Y}. From 448ade86cea4b715023a77fa908d389bab57d8df Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 19 May 2019 00:35:23 +0800 Subject: [PATCH 2/4] Fine-tune messaging around natural joins --- R/data.table.R | 8 ++++++-- inst/tests/tests.Rraw | 13 +++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 877434787b..942f1a4725 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -484,11 +484,15 @@ replace_dot_alias <- function(e) { } if (is.data.table(i)) { if (!haskey(x) && missing(on) && length(common_names <- intersect(names(x), names(i)))) { - if (verbose) cat("Joining but 'x' has no key and 'on' is missing, natural join using common columns\n") + if (verbose) { + which_cols_msg = if (length(common_names) == length(x)) " all 'x' columns" + else paste(":", brackify(common_names)) + cat("Joining but 'x' has no key and 'on' is missing, defaulting to natural join using", which_cols_msg, "\n", sep = "") + } on = common_names # natural join #629 } if (!haskey(x) && missing(on) && is.null(xo)) { - stop("When i is a data.table (or character vector), the columns to join by must be specified either using 'on=' argument (see ?data.table) or by keying x (i.e. sorted, and, marked as sorted, see ?setkey). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") + stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. 
sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") } if (!missing(on)) { # on = .() is now possible, #1257 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ed019723a6..c55f8b9d8f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -364,7 +364,7 @@ TESTDT = data.table(NULL) test(122, TESTDT[1], TESTDT) test(123, TESTDT[0], TESTDT) test(124, TESTDT[1:10], TESTDT) -test(125, TESTDT["k"], error="the columns to join by must be specified either using") +test(125, TESTDT["k"], error="the columns to join by must be specified using") # test 126 no longer needed now that test() has 'error' argument TESTDT = data.table(a=3L,v=2L,key="a") # testing 1-row table @@ -1207,7 +1207,7 @@ test(415, x:=1, error="defined for use in j, once only and in particular ways") # Somehow never tested that X[Y] is error if X is unkeyed. DT = data.table(a=1:3,b=4:6) -test(416, DT[J(2)], error="the columns to join by must be specified either using") +test(416, DT[J(2)], error="the columns to join by must be specified using") # Test shallow copy verbose message from := adding a column, and (TO DO) only when X is NAMED. 
DT = data.table(a=1:3,b=4:6) @@ -14619,12 +14619,17 @@ d2 = data.table(id1=2:4, id2=3:5, v2=3:1) ans = data.table(id1=2:4, id2=3:5, v1=c(2:3,NA_integer_), v2=3:1) test(2044.1, d1[d2, on=.(id1,id2)], ans) test(2044.2, d1[d2, on=.(id1,id2), nomatch=NULL], ans[1:2]) -test(2044.3, d1[d2, verbose=TRUE], ans, output="natural join") +test(2044.3, d1[d2, verbose=TRUE], ans, output="natural join using: [id1, id2]") test(2044.4, d1[d2, nomatch=NULL, verbose=TRUE], ans[1:2], output="natural join") d2 = data.table(id1=2:4, id2=letters[3:5], v2=3:1) test(2044.5, d1[d2, on=.(id1,id2)], error="typeof.*integer.*typeof.*character") test(2044.6, d1[d2, verbose=TRUE], output="natural join", error="typeof.*integer.*typeof.*character") - +test(2044.7, d1[d1, verbose = TRUE], d1, output="natural join using all 'x' columns") +d1 = setDT(replicate(20L, 1L, simplify = FALSE)) +d2 = copy(d1[ , 1:15]) +setnames(d2, 1L, 'X1') +test(2044.8, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), + output="natural join using: \\[.*[.]{3}\\]") ################################### # Add new tests above this line # From 1ff1b584c13bc3ad15e5ba35f5e4ff569b11a39e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 May 2019 19:19:42 +0530 Subject: [PATCH 3/4] natural join only by .NATURAL keyword or option --- NEWS.md | 2 +- R/data.table.R | 21 ++++++++++++++------- R/onLoad.R | 1 + inst/tests/tests.Rraw | 35 ++++++++++++++++++++++++----------- man/data.table.Rd | 2 +- 5 files changed, 41 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9e1b08f99f..5f614aa8e0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -80,7 +80,7 @@ 9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. -10. 
When doing join but `x` data.table has no key and `on` argument is missing, it will attempt to do _natural join_ thus set `on` to common columns across both data.tables, [#629](https://github.com/Rdatatable/data.table/issues/629). Thanks to David Kulp for request. +10. It is now possible to join two tables on their common columns, a so-called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Use `on=.NATURAL` or `options("datatable.naturaljoin"=TRUE)`. The latter applies only when `on` is missing and `x` has no key; if a key is present, the key columns are used to join as before. Thanks to David Kulp for the request. #### BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index 942f1a4725..d1fde119a0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -483,16 +483,23 @@ replace_dot_alias <- function(e) { i = as.data.table(i) } if (is.data.table(i)) { - if (!haskey(x) && missing(on) && length(common_names <- intersect(names(x), names(i)))) { + naturaljoin = FALSE + if (missing(on)) { + if (!haskey(x)) { + if (getOption("datatable.naturaljoin")) naturaljoin = TRUE + else if (is.null(xo)) stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). 
Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") + } + } else if (identical(substitute(on), as.name(".NATURAL"))) naturaljoin = TRUE + if (naturaljoin) { # natural join #629 + common_names = intersect(names(x), names(i)) + len_common_names = length(common_names) + if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables") if (verbose) { - which_cols_msg = if (length(common_names) == length(x)) " all 'x' columns" + which_cols_msg = if (len_common_names == length(x)) " all 'x' columns" else paste(":", brackify(common_names)) - cat("Joining but 'x' has no key and 'on' is missing, defaulting to natural join using", which_cols_msg, "\n", sep = "") + cat("Joining but 'x' has no key, natural join using", which_cols_msg, "\n", sep = "") } - on = common_names # natural join #629 - } - if (!haskey(x) && missing(on) && is.null(xo)) { - stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") + on = common_names } if (!missing(on)) { # on = .() is now possible, #1257 diff --git a/R/onLoad.R b/R/onLoad.R index 82fa26d95f..dd41f994cf 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -58,6 +58,7 @@ "datatable.use.index"="TRUE", # global switch to address #1422 "datatable.prettyprint.char" = NULL, # FR #1091 "datatable.old.unique.by.key" = "FALSE" # TODO: change warnings in duplicated.R to error on or after May 2019 then remove a year after that. 
+ ,"datatable.naturaljoin" = "FALSE" # natural join, when set to TRUE then `on` defaults to `.NATURAL` ) for (i in setdiff(names(opts),names(options()))) { eval(parse(text=paste0("options(",i,"=",opts[i],")"))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c55f8b9d8f..69d15c3e2f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14614,22 +14614,35 @@ z = 1:4 test(2043.5, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z # natural join #629 -d1 = data.table(id1=1:3, id2=2:4, v1=1:3) -d2 = data.table(id1=2:4, id2=3:5, v2=3:1) -ans = data.table(id1=2:4, id2=3:5, v1=c(2:3,NA_integer_), v2=3:1) -test(2044.1, d1[d2, on=.(id1,id2)], ans) -test(2044.2, d1[d2, on=.(id1,id2), nomatch=NULL], ans[1:2]) -test(2044.3, d1[d2, verbose=TRUE], ans, output="natural join using: [id1, id2]") -test(2044.4, d1[d2, nomatch=NULL, verbose=TRUE], ans[1:2], output="natural join") +d1 = data.table(id1=rep(1L,3), id2=2:4, v1=1:3) +d2 = data.table(id1=rep(1L,3), id2=3:5, v2=3:1) +ans = data.table(id1=rep(1L, 3), id2=3:5, v1=c(2:3,NA_integer_), v2=3:1) +test(2044.01, d1[d2], error="columns to join by must be specified") +test(2044.02, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") +options("datatable.naturaljoin"=TRUE) +test(2044.03, d1[d2, on=.(id1,id2)], ans) +test(2044.04, d1[d2, on=.(id1,id2), nomatch=NULL], ans[1:2]) +test(2044.05, d1[d2, verbose=TRUE], ans, output="natural join using: [id1, id2]") +test(2044.06, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") +test(2044.07, d1[d2, nomatch=NULL, verbose=TRUE], ans[1:2], output="natural join using: [id1, id2]") +setkey(d1, id1) +test(2044.08, nrow(d1[d2, allow.cartesian=TRUE]), 9L) # join +test(2044.09, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") # ignore key when on=.NATURAL +setkey(d1, NULL) +setnames(d2, c("a","b","c")) +test(2044.10, 
d1[d2], error="Attempting to do natural join but no common columns in provided tables") +test(2044.11, d1[d2, on=.NATURAL], error="Attempting to do natural join but no common columns in provided tables") d2 = data.table(id1=2:4, id2=letters[3:5], v2=3:1) -test(2044.5, d1[d2, on=.(id1,id2)], error="typeof.*integer.*typeof.*character") -test(2044.6, d1[d2, verbose=TRUE], output="natural join", error="typeof.*integer.*typeof.*character") -test(2044.7, d1[d1, verbose = TRUE], d1, output="natural join using all 'x' columns") +test(2044.12, d1[d2, on=.(id1,id2)], error="typeof.*integer.*typeof.*character") +test(2044.13, d1[d2, verbose=TRUE], output="natural join", error="typeof.*integer.*typeof.*character") +test(2044.14, d1[d1, verbose=TRUE], d1, output="natural join using all 'x' columns") d1 = setDT(replicate(20L, 1L, simplify = FALSE)) d2 = copy(d1[ , 1:15]) setnames(d2, 1L, 'X1') -test(2044.8, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), +test(2044.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural join using: \\[.*[.]{3}\\]") +options("datatable.naturaljoin"=FALSE) + ################################### # Add new tests above this line # diff --git a/man/data.table.Rd b/man/data.table.Rd index 065418c0c3..87d302ecf6 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -154,7 +154,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{drop}{ Never used by \code{data.table}. Do not use. It needs to be here because \code{data.table} inherits from \code{data.frame}. See \href{vignettes/datatable-faq.html}{datatable-faq}.} - \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. 
When missing and \code{x} has no key then it will attempt to perform \emph{natural join} thus set \code{on} to columns that are common in both tables. There are multiple ways of specifying the \code{on} argument: + \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. When the \code{.NATURAL} keyword is provided, a \emph{natural join} is performed (join on the columns common to both tables). Alternatively, when option \code{"datatable.naturaljoin"=TRUE} is set, \code{on} is missing, and \code{x} has no key, \code{on} defaults to \code{.NATURAL}. There are multiple ways of specifying the \code{on} argument: \itemize{ \item{As an unnamed character vector, e.g., \code{X[Y, on=c("a", "b")]}, used when columns \code{a} and \code{b} are common to both \code{X} and \code{Y}.} \item{\emph{Foreign key joins}: As a \emph{named} character vector when the join columns have different names in \code{X} and \code{Y}. From 93d9436bbf74a40e37a376c55b81cac850220746 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 21 May 2019 19:43:07 -0700 Subject: [PATCH 4/4] tests had slightly different verbose output in master after last PR --- inst/tests/tests.Rraw | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4269f16756..812a970c87 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -94,7 +94,8 @@ oldOptions = options( datatable.verbose = FALSE, datatable.alloccol = 1024L, datatable.print.class = FALSE, # This is TRUE in cc.R and we like TRUE. But output= tests need to be updated (they assume FALSE currently) - datatable.rbindlist.check = NULL + datatable.rbindlist.check = NULL, + datatable.naturaljoin = FALSE ) # some tests (e.g. 
1066, 1293) rely on capturing output that will be garbled with small width if (getOption('width') < 80L) options(width = 80L) @@ -14745,7 +14746,7 @@ d2 = data.table(id1=rep(1L,3), id2=3:5, v2=3:1) ans = data.table(id1=rep(1L, 3), id2=3:5, v1=c(2:3,NA_integer_), v2=3:1) test(2045.01, d1[d2], error="columns to join by must be specified") test(2045.02, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") -options("datatable.naturaljoin"=TRUE) +options(datatable.naturaljoin=TRUE) test(2045.03, d1[d2, on=.(id1,id2)], ans) test(2045.04, d1[d2, on=.(id1,id2), nomatch=NULL], ans[1:2]) test(2045.05, d1[d2, verbose=TRUE], ans, output="natural join using: [id1, id2]") @@ -14759,14 +14760,14 @@ setnames(d2, c("a","b","c")) test(2045.10, d1[d2], error="Attempting to do natural join but no common columns in provided tables") test(2045.11, d1[d2, on=.NATURAL], error="Attempting to do natural join but no common columns in provided tables") d2 = data.table(id1=2:4, id2=letters[3:5], v2=3:1) -test(2045.12, d1[d2, on=.(id1,id2)], error="typeof.*integer.*typeof.*character") -test(2045.13, d1[d2, verbose=TRUE], output="natural join", error="typeof.*integer.*typeof.*character") +test(2045.12, d1[d2, on=.(id1,id2)], error="Incompatible join types: x.id2 (integer) and i.id2 (character)") +test(2045.13, d1[d2, verbose=TRUE], output="natural join", error="Incompatible join types: x.id2 (integer) and i.id2 (character)") test(2045.14, d1[d1, verbose=TRUE], d1, output="natural join using all 'x' columns") d1 = setDT(replicate(20L, 1L, simplify = FALSE)) d2 = copy(d1[ , 1:15]) setnames(d2, 1L, 'X1') test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural join using: \\[.*[.]{3}\\]") -options("datatable.naturaljoin"=FALSE) +options(datatable.naturaljoin=FALSE) ###################################