From 4746b97ee039ca2393e740c4cc3da9bab1cee3bd Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Sat, 23 Oct 2021 11:02:19 +0200
Subject: [PATCH 1/3] add incomparables arg to merge.data.table

---
 NEWS.md               |  2 ++
 R/merge.R             | 23 +++++++++++++++++++++--
 inst/tests/tests.Rraw | 13 +++++++++++++
 man/merge.Rd          |  9 ++++++++-
 4 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/NEWS.md b/NEWS.md
index 5faf40723f..b9a8fff98f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -451,6 +451,8 @@
     
 47. `tables()` failed with `argument "..." is missing` when called from within a function taking `...`; e.g. `function(...) { tables() }`, [#5197](https://github.com/Rdatatable/data.table/issues/5197). Thanks @greg-minshall for the report and @michaelchirico for the fix.
 
+48. `merge.data.table` silently ignored `incomparables` argument because `...` is not further processed, [#2587](https://github.com/Rdatatable/data.table/issues/2587). `merge.data.table(x, y, incomparables = VEC)` produces now the same result as `merge.data.frame(setDF(x), setDF(y), incomparables = VEC)` and further also warns the user about unused arguments passed to `merge`. Thanks to @GBsuperman for the report and @ben-schwen for the fix.
+
 ## NOTES
 
 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example :
diff --git a/R/merge.R b/R/merge.R
index 683a6d08a4..a27f0259ef 100644
--- a/R/merge.R
+++ b/R/merge.R
@@ -1,5 +1,5 @@
 merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
-               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
+               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) {
   if (!sort %in% c(TRUE, FALSE))
     stopf("Argument 'sort' should be logical TRUE/FALSE")
   if (!no.dups %in% c(TRUE, FALSE))
@@ -14,7 +14,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   x0 = length(x)==0L
   y0 = length(y)==0L
   if (x0 || y0) {
-    if (x0 && y0) 
+    if (x0 && y0)
       warningf("Neither of the input data.tables to join have columns.")
     else if (x0)
       warningf("Input data.table '%s' has no columns.", "x")
@@ -54,6 +54,15 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
     by = unname(by)
     by.x = by.y = by
   }
+
+  # warn about unused arguments #2587
+  if (length(list(...))) {
+    ell = as.list(substitute(list(...)))[-1L]
+    for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n)
+    unnamed_n = length(ell) - sum(names(ell) != "")
+    if (unnamed_n)
+      warningf("Passed %d unknown and unnamed arguments.", unnamed_n)
+  }
   # with i. prefix in v1.9.3, this goes away. Left here for now ...
   ## sidestep the auto-increment column number feature-leading-to-bug by
   ## ensuring no names end in ".1", see unit test
@@ -72,6 +81,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
     end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L])
   }
 
+  # implement incomparables argument #2587
+  # %fin% to be replaced when #5232 is implemented/closed
+  "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table
+  if (!is.null(incomparables)) {
+    xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by)
+    yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by)
+    # subset both so later steps still work
+    x = x[xind]
+    y = y[yind]
+  }
   dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian]   # includes JIS columns (with a i. prefix if conflict with x names)
 
   if (all.y && nrow(y)) {  # If y does not have any rows, no need to proceed
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 6382a13a85..e0a0510577 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18348,3 +18348,16 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec
 test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')),
              groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species')))
 
+# merge.data.table ignores incomparables without warnings. #2587
+x <- data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5)
+y <- data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5)
+DT = as.data.table(x)
+test(2226.1, setDF(merge(DT, y, by = "k2", incomparables = NA)),         merge(x, y, by="k2", incomparables = NA))
+test(2226.2, setDF(merge(DT, y, by = "k2", incomparables = c(NA,4))),    merge(x, y, by="k2", incomparables = c(NA,4)))
+test(2226.3, setDF(merge(DT, y, by = "k2", incomparables = c(4,5))),     merge(x, y, by="k2", incomparables = c(4,5)))
+test(2226.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1, NA, 4, 5)))
+test(2226.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA, 3, 4, 5)))
+test(2226.6, merge(DT, y, by = "k2", unk=1), merge(DT, y, by = "k2"), warning="Unknown argument 'unk' has been passed.")
+test(2226.7, merge(DT, y, by = "k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L),
+  merge(DT, y, by = "k2"), warning = c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
+
diff --git a/man/merge.Rd b/man/merge.Rd
index 6fcbc10866..d8246668c3 100644
--- a/man/merge.Rd
+++ b/man/merge.Rd
@@ -21,7 +21,7 @@ Use the \code{by}, \code{by.x} and \code{by.y} arguments explicitly to override
 \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE,
 all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE,
 allow.cartesian=getOption("datatable.allow.cartesian"),  # default FALSE
-\dots)
+incomparables = NULL, \dots)
 }
 
 \arguments{
@@ -51,6 +51,7 @@ fashion as the \code{\link{merge.data.frame}} method does.}
 non-\code{by.y} column names in \code{y} when they have the same column name
 as any \code{by.x}.}
 \item{allow.cartesian}{See \code{allow.cartesian} in \code{\link{[.data.table}}.}
+\item{incomparables}{values which cannot be matched and therefore are excluded from by columns.}
 \item{\dots}{Not used at this time.}
 }
 
@@ -125,6 +126,12 @@ setnames(d2, "a", "b")
 merge(d1, d2, by.x="a", by.y="b")
 merge(d1, d2, by.x="a", by.y="b", all=TRUE)
 merge(d2, d1, by.x="b", by.y="a")
+
+# using incomparables values
+d1 <- data.table(a=c(1,2,NA,NA,3,1), z=1:6)
+d2 <- data.table(a=c(1,2,NA), z=10:12)
+merge(d1, d2, by="a")
+merge(d1, d2, by="a", incomparables=NA)
 }
 
 \keyword{ data }

From bfeb71bf949aca0e1334c229c1f2a055b4fedba9 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Tue, 30 Nov 2021 14:51:54 -0700
Subject: [PATCH 2/3] move %fin% inside scope where it is used

---
 NEWS.md   | 2 +-
 R/merge.R | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 66ac0c1af7..c92bf00240 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -518,7 +518,7 @@
 
 50. `fwrite()` could produce not-ISO-compliant timestamps such as `2023-03-08T17:22:32.:00Z` when under a whole second by less than numerical tolerance of one microsecond, [#5238](https://github.com/Rdatatable/data.table/issues/5238). Thanks to @avraam-inside for the report and Václav Tlapák for the fix.
 
-51. `merge.data.table` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). Thanks to @GBsuperman for the report and @ben-schwen for the fix.
+51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). Thanks to @GBsuperman for the report and @ben-schwen for the fix.
 
 ## NOTES
 
diff --git a/R/merge.R b/R/merge.R
index b479bb3354..cbc9b9e291 100644
--- a/R/merge.R
+++ b/R/merge.R
@@ -82,9 +82,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   }
 
   # implement incomparables argument #2587
-  # %fin% to be replaced when #5232 is implemented/closed
-  "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table
   if (!is.null(incomparables)) {
+    # %fin% to be replaced when #5232 is implemented/closed
+    "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table
     xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by)
     yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by)
     # subset both so later steps still work

From 3b49262731381d4700c25a61a45502017fa954a3 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Tue, 30 Nov 2021 15:01:23 -0700
Subject: [PATCH 3/3] news item tweak

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index c92bf00240..61759c24d8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -518,7 +518,7 @@
 
 50. `fwrite()` could produce not-ISO-compliant timestamps such as `2023-03-08T17:22:32.:00Z` when under a whole second by less than numerical tolerance of one microsecond, [#5238](https://github.com/Rdatatable/data.table/issues/5238). Thanks to @avraam-inside for the report and Václav Tlapák for the fix.
 
-51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). Thanks to @GBsuperman for the report and @ben-schwen for the fix.
+51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). It is now implemented and any other ignored arguments (e.g. misspellings) are now warned about. Thanks to @GBsuperman for the report and @ben-schwen for the fix.
 
 ## NOTES