diff --git a/NEWS.md b/NEWS.md index 2e09efbc3a..61759c24d8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -518,6 +518,8 @@ 50. `fwrite()` could produce not-ISO-compliant timestamps such as `2023-03-08T17:22:32.:00Z` when under a whole second by less than numerical tolerance of one microsecond, [#5238](https://github.com/Rdatatable/data.table/issues/5238). Thanks to @avraam-inside for the report and Václav Tlapák for the fix. +51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). It is now implemented and any other ignored arguments (e.g. misspellings) are now warned about. Thanks to @GBsuperman for the report and @ben-schwen for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/merge.R b/R/merge.R index f237bcbf32..cbc9b9e291 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,5 +1,5 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, - all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) { + all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) { if (!sort %in% c(TRUE, FALSE)) stopf("Argument 'sort' should be logical TRUE/FALSE") if (!no.dups %in% c(TRUE, FALSE)) @@ -14,7 +14,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL x0 = length(x)==0L y0 = length(y)==0L if (x0 || y0) { - if (x0 && y0) + if (x0 && y0) warningf("Neither of the input data.tables to join have columns.") else if (x0) warningf("Input data.table '%s' has no columns.", "x") @@ -54,6 +54,15 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by = unname(by) by.x = by.y = by } + + # warn about unused arguments #2587 + if (length(list(...))) { + ell = as.list(substitute(list(...)))[-1L] + for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) + unnamed_n = length(ell) - sum(names(ell) != "") + if (unnamed_n) + warningf("Passed %d unknown and unnamed arguments.", unnamed_n) + } # with i. prefix in v1.9.3, this goes away. Left here for now ... ## sidestep the auto-increment column number feature-leading-to-bug by ## ensuring no names end in ".1", see unit test @@ -72,6 +81,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } + # implement incomparables argument #2587 + if (!is.null(incomparables)) { + # %fin% to be replaced when #5232 is implemented/closed + "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table + xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by) + yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by) + # subset both so later steps still work + x = x[xind] + y = y[yind] + } dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names) if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e489f73461..ffa0a95ac3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18458,3 +18458,16 @@ unlink(f) # not supporting multi file zips yet test(2229.6, fread(testDir("multi-file.zip")), error="Compressed files containing more than 1 file are currently not supported.") +# merge.data.table ignored incomparables argument without warning, #2587 +x = data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5) +y = data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5) +DT = as.data.table(x) +test(2230.1, setDF(merge(DT, y, by="k2", incomparables=NA)), merge(x, y, by="k2", incomparables=NA)) +test(2230.2, setDF(merge(DT, y, by="k2", incomparables=c(NA,4))), merge(x, y, by="k2", incomparables=c(NA,4))) +test(2230.3, setDF(merge(DT, y, by="k2", incomparables=c(4,5))), merge(x, y, by="k2", incomparables=c(4,5))) +test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1,NA,4,5))) +test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5))) +test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.") +test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L), + merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments.")) + diff --git a/man/merge.Rd b/man/merge.Rd index 6fcbc10866..d8246668c3 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -21,7 +21,7 @@ Use the \code{by}, \code{by.x} and \code{by.y} arguments explicitly to override \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), # default FALSE -\dots) +incomparables = NULL, \dots) } \arguments{ @@ -51,6 +51,7 @@ fashion as the \code{\link{merge.data.frame}} method does.} non-\code{by.y} column names in \code{y} when they have the same column name as any \code{by.x}.} \item{allow.cartesian}{See \code{allow.cartesian} in \code{\link{[.data.table}}.} +\item{incomparables}{values which cannot be matched and therefore are excluded from by columns.} \item{\dots}{Not used at this time.} } @@ -125,6 +126,12 @@ setnames(d2, "a", "b") merge(d1, d2, by.x="a", by.y="b") merge(d1, d2, by.x="a", by.y="b", all=TRUE) merge(d2, d1, by.x="b", by.y="a") + +# using incomparables values +d1 <- data.table(a=c(1,2,NA,NA,3,1), z=1:6) +d2 <- data.table(a=c(1,2,NA), z=10:12) +merge(d1, d2, by="a") +merge(d1, d2, by="a", incomparables=NA) } \keyword{ data }