From bdf379b871cbb78d9dc2152d8c381070b32c0b74 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 30 Jul 2019 16:11:13 +0200 Subject: [PATCH 1/2] natural join using X[on=Y], closes #3621 --- NEWS.md | 2 +- R/data.table.R | 11 +++++++++-- inst/tests/tests.Rraw | 15 +++++++++++---- vignettes/datatable-importing.Rmd | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 411a589857..2654f42d5a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -82,7 +82,7 @@ 9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. -10. `on=.NATURAL` (TODO: `X[on=Y]`) joins two tables on their common column names, so called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Thanks to David Kulp for request. As before, when `on=` is not provided, `X` must have a key and the key columns are used to join (like rownames, but multi-column and multi-type). +10. `on=.NATURAL` (or alternatively `X[on=Y]` [#3621](https://github.com/Rdatatable/data.table/issues/3621)) joins two tables on their common column names, so called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Thanks to David Kulp for request. As before, when `on=` is not provided, `X` must have a key and the key columns are used to join (like rownames, but multi-column and multi-type). 11. `as.data.table` gains `key` argument mirroring its use in `setDT` and `data.table`, [#890](https://github.com/Rdatatable/data.table/issues/890). As a byproduct, the arguments of `as.data.table.array` have changed order, which could affect code relying on positional arguments to this method. Thanks @cooldome for the suggestion and @MichaelChirico for implementation. 
diff --git a/R/data.table.R b/R/data.table.R index a7e085a5fd..6e4b6ab8e1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -176,6 +176,11 @@ replace_order = function(isub, verbose, env) { } bynull = !missingby && is.null(by) #3530 byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI" + naturaljoin = FALSE + if (missing(i) && !missing(on)) { + i = eval.parent(.massagei(substitute(on))) + naturaljoin = TRUE + } if (missing(i) && missing(j)) { tt_isub = substitute(i) tt_jsub = substitute(j) @@ -413,13 +418,15 @@ replace_order = function(isub, verbose, env) { isnull_inames = is.null(names(i)) i = as.data.table(i) } + if (is.data.table(i)) { - naturaljoin = FALSE if (missing(on)) { if (!haskey(x)) { stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") } - } else if (identical(substitute(on), as.name(".NATURAL"))) naturaljoin = TRUE + } else if (identical(substitute(on), as.name(".NATURAL"))) { + naturaljoin = TRUE + } if (naturaljoin) { # natural join #629 common_names = intersect(names(x), names(i)) len_common_names = length(common_names) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dbfb740e69..863796275f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12942,10 +12942,10 @@ test(1948.14, DT[i, on = 1L], error = "'on' argument should be a named atomic ve # helpful error when on= is provided but not i, rather than silently ignoring on= DT = data.table(A=1:3) -test(1949.1, DT[,,on=A], DT, warning="i and j are both missing so ignoring the other arguments") -test(1949.2, DT[,1,on=A], DT, warning="ignoring on= because it is only relevant to i but i is not provided") -test(1949.3, DT[on=A], DT, warning="i and j are both 
missing so ignoring the other arguments") -test(1949.4, DT[,on=A], DT, warning="i and j are both missing so ignoring the other arguments") +test(1949.1, DT[,,on=A], error="object 'A' not found") # tests .1 to .4 amended after #3621 +test(1949.2, DT[,1,on=A], error="object 'A' not found") +test(1949.3, DT[on=A], error="object 'A' not found") +test(1949.4, DT[,on=A], error="object 'A' not found") test(1949.5, DT[1,,with=FALSE], error="j must be provided when with=FALSE") test(1949.6, DT[], output="A.*1.*2.*3") # no error test(1949.7, DT[,], output="A.*1.*2.*3") # no error, #3163 @@ -15428,6 +15428,13 @@ test(2071.10, dcast(data.table(a=1, b=1, l=list(list(1))), a ~ b, value.var='l') test(2071.11, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), data.table(a=1, `2`=3, key='a')) +# natural join using X[on=Y] #3621 +X = data.table(a=1:2, b=1:2) +test(2072.01, X[on=.(a=2:3, d=2:1)], data.table(a=2:3, b=c(2L,NA_integer_), d=2:1)) +Y = data.table(a=2:3, d=2:1) +test(2072.02, X[on=Y], data.table(a=2:3, b=c(2L,NA_integer_), d=2:1)) + + ################################### # Add new tests above this line # ################################### diff --git a/vignettes/datatable-importing.Rmd b/vignettes/datatable-importing.Rmd index 16a3cb39d2..63436b6b65 100644 --- a/vignettes/datatable-importing.Rmd +++ b/vignettes/datatable-importing.Rmd @@ -126,7 +126,7 @@ If you don't mind having `id` and `grp` registered as variables globally in your Common practice by R packages is to provide customization options set by `options(name=val)` and fetched using `getOption("name", default)`. Function arguments often specify a call to `getOption()` so that the user knows (from `?fun` or `args(fun)`) the name of the option controlling the default for that parameter; e.g. `fun(..., verbose=getOption("datatable.verbose", FALSE))`. All `data.table` options start with `datatable.` so as to not conflict with options in other packages. 
A user simply calls `options(datatable.verbose=TRUE)` to turn on verbosity. This affects all calls to `fun()` other the ones which have been provided `verbose=` explicity; e.g. `fun(..., verbose=FALSE)`. -The option mechanism in R is _global_. Meaning that if a user sets a `data.table` option for their own use, that setting also affects code inside any package that is using `data.table` too. For an option like `datatable.verbose`, this is exactly the desired behavior since the desire is to trace and log all `data.table` operations from wherever they originate; turning on verbosity does not affect the results. Another unique-to-R and excellent-for-production option is R's `options(warn=2)` which turns all warnings into errors. Again, the desire is to affect any warning in any package so as to not missing any warnings in production. There are 6 `datatable.print.*` options and 3 optimization options which do not affect the result of operations, either. However, there is one `data.table` option that does and is now a concern: `datatable.nomatch`. This option changes the default join from outer to inner. [Aside, the default join is outer because outer is safer; it doesn't drop missing data silently.] Some users prefer inner join to be the default and we provided this option for them. However, a user setting this option can unintentionally change the behavior of joins inside packages that use `data.table`. Accordingly, in v1.12.4, we have started the process to deprecate the `datatable.nomatch` option. It is the only `data.table` option with this concern. +The option mechanism in R is _global_. Meaning that if a user sets a `data.table` option for their own use, that setting also affects code inside any package that is using `data.table` too. For an option like `datatable.verbose`, this is exactly the desired behavior since the desire is to trace and log all `data.table` operations from wherever they originate; turning on verbosity does not affect the results. 
Another unique-to-R and excellent-for-production option is R's `options(warn=2)` which turns all warnings into errors. Again, the desire is to affect any warning in any package so as not to miss any warnings in production. There are 6 `datatable.print.*` options and 3 optimization options which do not affect the result of operations, either. However, there is one `data.table` option that does and is now a concern: `datatable.nomatch`. This option changes the default join from outer to inner. [Aside, the default join is outer because outer is safer; it doesn't drop missing data silently; moreover, it is consistent with the base R way of matching by names and indices.] Some users prefer inner join to be the default and we provided this option for them. However, a user setting this option can unintentionally change the behavior of joins inside packages that use `data.table`. Accordingly, in v1.12.4, we have started the process to deprecate the `datatable.nomatch` option. It is the only `data.table` option with this concern. 
## Troubleshooting From 2fb2e4124c7d43d70ad59e16aef6cfc05b45b561 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 13 Aug 2019 15:03:40 -0700 Subject: [PATCH 2/2] fit test number into increasing order after merge --- inst/tests/tests.Rraw | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 894e52a2c9..323b0368c3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15649,12 +15649,11 @@ test(2074.41, fread('a\n1', na.strings='9', verbose=TRUE), output='One or more o # cbind 0 cols, #3334 test(2075, data.table(data.table(a=1), data.table()), data.table(data.table(a=1))) - -# natural join using X[on=Y] #3621 +# natural join using X[on=Y], #3621 X = data.table(a=1:2, b=1:2) -test(2072.01, X[on=.(a=2:3, d=2:1)], data.table(a=2:3, b=c(2L,NA_integer_), d=2:1)) +test(2076.01, X[on=.(a=2:3, d=2:1)], data.table(a=2:3, b=c(2L,NA_integer_), d=2:1)) Y = data.table(a=2:3, d=2:1) -test(2072.02, X[on=Y], data.table(a=2:3, b=c(2L,NA_integer_), d=2:1)) +test(2076.02, X[on=Y], data.table(a=2:3, b=c(2L,NA_integer_), d=2:1)) ###################################