From 4746b97ee039ca2393e740c4cc3da9bab1cee3bd Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sat, 23 Oct 2021 11:02:19 +0200 Subject: [PATCH 1/3] add incomparables arg to merge.data.table --- NEWS.md | 2 ++ R/merge.R | 23 +++++++++++++++++++++-- inst/tests/tests.Rraw | 13 +++++++++++++ man/merge.Rd | 9 ++++++++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5faf40723f..b9a8fff98f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -451,6 +451,8 @@ 47. `tables()` failed with `argument "..." is missing` when called from within a function taking `...`; e.g. `function(...) { tables() }`, [#5197](https://github.com/Rdatatable/data.table/issues/5197). Thanks @greg-minshall for the report and @michaelchirico for the fix. +48. `merge.data.table` silently ignored `incomparables` argument because `...` is not further processed, [#2587](https://github.com/Rdatatable/data.table/issues/2587). `merge.data.table(x, y, incomparables = VEC)` produces now the same result as `merge.data.frame(setDF(x), setDF(y), incomparables = VEC)` and further also warns the user about unused arguments passed to `merge`. Thanks to @GBsuperman for the report and @ben-schwen for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/merge.R b/R/merge.R index 683a6d08a4..a27f0259ef 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,5 +1,5 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, - all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) { + all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) { if (!sort %in% c(TRUE, FALSE)) stopf("Argument 'sort' should be logical TRUE/FALSE") if (!no.dups %in% c(TRUE, FALSE)) @@ -14,7 +14,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL x0 = length(x)==0L y0 = length(y)==0L if (x0 || y0) { - if (x0 && y0) + if (x0 && y0) warningf("Neither of the input data.tables to join have columns.") else if (x0) warningf("Input data.table '%s' has no columns.", "x") @@ -54,6 +54,15 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by = unname(by) by.x = by.y = by } + + # warn about unused arguments #2587 + if (length(list(...))) { + ell = as.list(substitute(list(...)))[-1L] + for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) + unnamed_n = length(ell) - sum(names(ell) != "") + if (unnamed_n) + warningf("Passed %d unknown and unnamed arguments.", unnamed_n) + } # with i. prefix in v1.9.3, this goes away. Left here for now ... ## sidestep the auto-increment column number feature-leading-to-bug by ## ensuring no names end in ".1", see unit test @@ -72,6 +81,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } + # implement incomparables argument #2587 + # %fin% to be replaced when #5232 is implemented/closed + "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table + if (!is.null(incomparables)) { + xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by) + yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by) + # subset both so later steps still work + x = x[xind] + y = y[yind] + } dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names) if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6382a13a85..e0a0510577 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18348,3 +18348,16 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) +# merge.data.table ignores incomparables without warnings. #2587 +x <- data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5) +y <- data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5) +DT = as.data.table(x) +test(2226.1, setDF(merge(DT, y, by = "k2", incomparables = NA)), merge(x, y, by="k2", incomparables = NA)) +test(2226.2, setDF(merge(DT, y, by = "k2", incomparables = c(NA,4))), merge(x, y, by="k2", incomparables = c(NA,4))) +test(2226.3, setDF(merge(DT, y, by = "k2", incomparables = c(4,5))), merge(x, y, by="k2", incomparables = c(4,5))) +test(2226.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1, NA, 4, 5))) +test(2226.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA, 3, 4, 5))) +test(2226.6, merge(DT, y, by = "k2", unk=1), merge(DT, y, by = "k2"), warning="Unknown argument 'unk' has been passed.") +test(2226.7, merge(DT, y, by = "k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L), + merge(DT, y, by = "k2"), warning = c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments.")) + diff --git a/man/merge.Rd b/man/merge.Rd index 6fcbc10866..d8246668c3 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -21,7 +21,7 @@ Use the \code{by}, \code{by.x} and \code{by.y} arguments explicitly to override \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), # default FALSE -\dots) +incomparables = NULL, \dots) } \arguments{ @@ -51,6 +51,7 @@ fashion as the \code{\link{merge.data.frame}} method does.} non-\code{by.y} column names in \code{y} when they have the same column name as any \code{by.x}.} \item{allow.cartesian}{See \code{allow.cartesian} in \code{\link{[.data.table}}.} +\item{incomparables}{values which cannot be matched and therefore are excluded from by columns.} \item{\dots}{Not used at this time.} } @@ -125,6 +126,12 @@ setnames(d2, "a", "b") merge(d1, d2, by.x="a", by.y="b") merge(d1, d2, by.x="a", by.y="b", all=TRUE) merge(d2, d1, by.x="b", by.y="a") + +# using incomparables values +d1 <- data.table(a=c(1,2,NA,NA,3,1), z=1:6) +d2 <- data.table(a=c(1,2,NA), z=10:12) +merge(d1, d2, by="a") +merge(d1, d2, by="a", incomparables=NA) } \keyword{ data } From bfeb71bf949aca0e1334c229c1f2a055b4fedba9 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 30 Nov 2021 14:51:54 -0700 Subject: [PATCH 2/3] move %fin% inside scope where it is used --- NEWS.md | 2 +- R/merge.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 66ac0c1af7..c92bf00240 100644 --- a/NEWS.md +++ b/NEWS.md @@ -518,7 +518,7 @@ 50. `fwrite()` could produce not-ISO-compliant timestamps such as `2023-03-08T17:22:32.:00Z` when under a whole second by less than numerical tolerance of one microsecond, [#5238](https://github.com/Rdatatable/data.table/issues/5238). Thanks to @avraam-inside for the report and Václav Tlapák for the fix. -51. `merge.data.table` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). Thanks to @GBsuperman for the report and @ben-schwen for the fix. +51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). Thanks to @GBsuperman for the report and @ben-schwen for the fix. ## NOTES diff --git a/R/merge.R b/R/merge.R index b479bb3354..cbc9b9e291 100644 --- a/R/merge.R +++ b/R/merge.R @@ -82,9 +82,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } # implement incomparables argument #2587 - # %fin% to be replaced when #5232 is implemented/closed - "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table if (!is.null(incomparables)) { + # %fin% to be replaced when #5232 is implemented/closed + "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by) yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by) # subset both so later steps still work From 3b49262731381d4700c25a61a45502017fa954a3 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 30 Nov 2021 15:01:23 -0700 Subject: [PATCH 3/3] news item tweak --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c92bf00240..61759c24d8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -518,7 +518,7 @@ 50. `fwrite()` could produce not-ISO-compliant timestamps such as `2023-03-08T17:22:32.:00Z` when under a whole second by less than numerical tolerance of one microsecond, [#5238](https://github.com/Rdatatable/data.table/issues/5238). Thanks to @avraam-inside for the report and Václav Tlapák for the fix. -51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). Thanks to @GBsuperman for the report and @ben-schwen for the fix. +51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). It is now implemented and any other ignored arguments (e.g. misspellings) are now warned about. Thanks to @GBsuperman for the report and @ben-schwen for the fix. ## NOTES