From 05465b04de88b4f68377ee2184a44b7fdc262eae Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 17 Feb 2018 11:40:14 +1100 Subject: [PATCH 01/14] Fixed duplicate column names in merge when by.x in names(y) --- R/merge.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/merge.R b/R/merge.R index ceb87c75e9..fec29b3786 100644 --- a/R/merge.R +++ b/R/merge.R @@ -51,6 +51,13 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA start[chmatch(dupnames, start, 0L)] = paste(dupnames, suffixes[1L], sep="") end[chmatch(dupnames, end, 0L)] = paste(dupnames, suffixes[2L], sep="") } + # If by.x != by.y then the 'by' column(s) are named as 'by.x' - we need + # to also handle cases where the 'by.x' column names are in 'end' + dupkeyx = intersect(by.x, end) + if (length(dupkeyx)) { + by.x[chmatch(dupkeyx, by.x, 0L)] = paste(dupkeyx, suffixes[1L], sep="") + end[chmatch(dupkeyx, end, 0L)] = paste(dupkeyx, suffixes[2L], sep="") + } dt = y[x,nomatch = if (all.x) NA else 0L,on=by,allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names) From 79653e7f2dceb2e1b33f6f03cb66bff30e474936 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 17 Feb 2018 12:00:50 +1100 Subject: [PATCH 02/14] added test --- inst/tests/tests.Rraw | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c9537b91b4..75136e1fdb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11671,6 +11671,14 @@ unlink(f) test(1876, fread("http://hkhfsk\nhttp://fhdkf\nhttp://kjfhskd\nhttp://hfkjf", header=FALSE), # data not a download, #2531 data.table(V1=c("http://hkhfsk","http://fhdkf","http://kjfhskd","http://hfkjf"))) +# PR #2631 - Fix duplicated names arising in merge when by.x in names(y) +# Test should fail in there are any duplicate names after a join +parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43)) +children = data.table(parent=c("Sarah", "Max", "Max"), + name=c("Oliver", "Sebastian", "Michelle"), + sex=c("M", "M", "F"), age=c(5,8,7)) +joined = merge(parents, children, by.x="name", by.y="parent") +test(1877, length(names(joined)), length(unique(names(joined)))) ########################## @@ -11716,4 +11724,3 @@ cat("\n",plat,"\n\nAll ",ntest," tests in inst/tests/tests.Rraw completed ok in # date() is included so we can tell when CRAN checks were run (in particular if they have been rerun since # an update to Rdevel itself; data.table doesn't have any other dependency) since there appears to be no other # way to see the timestamp that CRAN checks were run. Some CRAN machines lag by several days. - From 091251d787ec97d8390934cdcf01cadce82a7b23 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 17 Feb 2018 12:14:00 +1100 Subject: [PATCH 03/14] added NEWS entry --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 55fa516c90..880645c6e2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -138,6 +138,9 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat 32. `x.` prefixes during joins sometimes resulted in a "column not found" error. This is now fixed. Closes [#2313](https://github.com/Rdatatable/data.table/issues/2313). Thanks to @franknarf1 for the MRE. +33. Fixed bug where result of `merge()` would contain duplicate column names if `by.x` was also in `names(y)`. +Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) + #### NOTES 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. From b6aaa98c3e6f13e24345b2833bd1bf4a1eb943ec Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 17 Feb 2018 12:28:36 +1100 Subject: [PATCH 04/14] merge throws warning when duplicate names in result --- NEWS.md | 3 ++- R/merge.R | 9 +++++++++ inst/tests/tests.Rraw | 4 +++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 880645c6e2..b7d8fdec82 100644 --- a/NEWS.md +++ b/NEWS.md @@ -139,7 +139,8 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat 32. `x.` prefixes during joins sometimes resulted in a "column not found" error. This is now fixed. Closes [#2313](https://github.com/Rdatatable/data.table/issues/2313). Thanks to @franknarf1 for the MRE. 33. Fixed bug where result of `merge()` would contain duplicate column names if `by.x` was also in `names(y)`. -Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) +Where there are duplicate column names (i.e. `suffixes = c("", "")`) `merge()` will throw a warning to match +the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631). #### NOTES diff --git a/R/merge.R b/R/merge.R index fec29b3786..64b626295f 100644 --- a/R/merge.R +++ b/R/merge.R @@ -85,6 +85,15 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA if (nrow(dt) > 0L) { setkeyv(dt, if (sort) by.x else NULL) } + + # Throw warning if there are duplicate column names in 'dt' (i.e. if + # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) + resultdupnames <- names(dt)[duplicated(names(dt))] + if (length(resultdupnames)) { + warning("column names ", paste0("'", resultdupnames, "'", collapse=", "), + " are duplicated in the result") + } + # merge resets class, #1378. X[Y] is quite clear that X is being *subset* by Y, # makes sense to therefore retain X's class, unlike `merge`. Hard to tell what # class to retain for *full join* for example. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 75136e1fdb..92b1453485 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11678,7 +11678,9 @@ children = data.table(parent=c("Sarah", "Max", "Max"), name=c("Oliver", "Sebastian", "Michelle"), sex=c("M", "M", "F"), age=c(5,8,7)) joined = merge(parents, children, by.x="name", by.y="parent") -test(1877, length(names(joined)), length(unique(names(joined)))) +test(1877.1, length(names(joined)), length(unique(names(joined)))) +test(1877.2, merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""), + warning = "column names 'name', 'sex', 'age' are duplicated in the result"), ########################## From f9c1380a3ae853c0b23caec32ea08be264fe2d12 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 17 Feb 2018 14:01:39 +1100 Subject: [PATCH 05/14] syntax typo --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 92b1453485..6a801ce17e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11679,8 +11679,8 @@ children = data.table(parent=c("Sarah", "Max", "Max"), sex=c("M", "M", "F"), age=c(5,8,7)) joined = merge(parents, children, by.x="name", by.y="parent") test(1877.1, length(names(joined)), length(unique(names(joined)))) -test(1877.2, merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""), - warning = "column names 'name', 'sex', 'age' are duplicated in the result"), +test(1877.2, merge(parents, children, by.x="name", by.y="parent", suffixes=c("","")), + warning = "column names 'name', 'sex', 'age' are duplicated in the result") ########################## From 560b39029684264b8b7453bdc0da9ba5a99c17be Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sun, 18 Feb 2018 12:54:13 +1100 Subject: [PATCH 06/14] warning test --- inst/tests/tests.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6a801ce17e..96bb82fcc4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11680,7 +11680,8 @@ children = data.table(parent=c("Sarah", "Max", "Max"), joined = merge(parents, children, by.x="name", by.y="parent") test(1877.1, length(names(joined)), length(unique(names(joined)))) test(1877.2, merge(parents, children, by.x="name", by.y="parent", suffixes=c("","")), - warning = "column names 'name', 'sex', 'age' are duplicated in the result") + merge(parents, children, by.x="name", by.y="parent", suffixes=c("","")), + warning = "column names.*are duplicated in the result") ########################## From f81eaa5dd2662d12d3120c9526075cfe325b9818 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sun, 18 Feb 2018 13:14:09 +1100 Subject: [PATCH 07/14] warning test output comparison no longer generates warning --- inst/tests/tests.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9d4ab50c2a..d87872116a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11677,15 +11677,15 @@ test(1877.2, attr(setattr(data.table(x = 1:10), "class", character()), "class"), test(1877.3, attr(setattr(data.table(x = 1:10), "test", character()), "test"), character(0)) # ok before # PR #2631 - Fix duplicated names arising in merge when by.x in names(y) -# Test should fail in there are any duplicate names after a join +# 1878.1 should fail in there are any duplicate names after a join +# 1878.2 should fail if a warning is not thrown when suffixes leads to duplicate names parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43)) children = data.table(parent=c("Sarah", "Max", "Max"), name=c("Oliver", "Sebastian", "Michelle"), sex=c("M", "M", "F"), age=c(5,8,7)) joined = merge(parents, children, by.x="name", by.y="parent") test(1878.1, length(names(joined)), length(unique(names(joined)))) -test(1878.2, merge(parents, children, by.x="name", by.y="parent", suffixes=c("","")), - merge(parents, children, by.x="name", by.y="parent", suffixes=c("","")), +test(1878.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3, warning = "column names.*are duplicated in the result") ########################## From 2ec45262a285c44a0c9ed38e9d6a5bd9aec8cec4 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sun, 18 Feb 2018 13:24:10 +1100 Subject: [PATCH 08/14] integer vs double failed check fixed --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d87872116a..9b370943bb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11685,7 +11685,7 @@ children = data.table(parent=c("Sarah", "Max", "Max"), sex=c("M", "M", "F"), age=c(5,8,7)) joined = merge(parents, children, by.x="name", by.y="parent") test(1878.1, length(names(joined)), length(unique(names(joined)))) -test(1878.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3, +test(1878.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L, warning = "column names.*are duplicated in the result") ########################## From 2006b14a059302367cfab0a0d8f836fc3155f0e7 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sun, 18 Feb 2018 13:29:49 +1100 Subject: [PATCH 09/14] Removed test 1543.3 This test now fails because the output of merge.data.table does not match the output of merge.data.frame because base::merge.data.frame still leads to duplicate column names where by.x is in names(y). --- inst/tests/tests.Rraw | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9b370943bb..ae181504b0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7154,15 +7154,12 @@ ans2 <- setDF(merge(setDT(d1), setDT(d2), all=TRUE, by="A")) test(1542.2, ans1, ans2) # test duplicate name cases setnames(d2, c("A", "Y"), c("B", "A")) -ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by.x="A", by.y="B")) -ans2 <- setDF(merge(setDT(d1), setDT(d2), by.x="A", by.y="B")) -test(1543.3, ans1, ans2) ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), by.x="B", by.y="A")) ans2 <- setDF(merge(setDT(d2), setDT(d1), by.x="B", by.y="A")) -test(1543.4, ans1, ans2) +test(1543.3, ans1, ans2) ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A")) ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A")) -test(1543.5, ans1, ans2) +test(1543.4, ans1, ans2) # test for sort=FALSE argument, #1282 set.seed(1L) From 12fb8ac46a280eb6c55fd815801d433adbe4a4fa Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Mon, 19 Feb 2018 07:32:47 +1100 Subject: [PATCH 10/14] Updated merge documentation --- man/merge.Rd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/man/merge.Rd b/man/merge.Rd index a2da1fde3c..07e851bcf0 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -14,6 +14,10 @@ very similarly to that of \code{data.frame}s except that, by default, it attempt } Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to override this default. + +In addition, if any column names provided to \code{by.x} also occur in \code{names(y)} (and +not in \code{by.y}) then this function will add the \code{suffixes} to those column names, +while the \code{data.frame} method will not (leading to duplicate column names in the result). } \usage{ From b3d287a49cf85085782f8f2e27542bf1dc899918 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 22 Feb 2018 16:15:54 -0800 Subject: [PATCH 11/14] Minor wording: 'function' => 'data.table method' --- man/merge.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/merge.Rd b/man/merge.Rd index 07e851bcf0..b1811c7773 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -16,7 +16,7 @@ very similarly to that of \code{data.frame}s except that, by default, it attempt Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to override this default. In addition, if any column names provided to \code{by.x} also occur in \code{names(y)} (and -not in \code{by.y}) then this function will add the \code{suffixes} to those column names, +not in \code{by.y}) then this \code{data.table} method will add the \code{suffixes} to those column names, while the \code{data.frame} method will not (leading to duplicate column names in the result). } From 3f79e7d8ac593738fb42104ed4fef99c06768b7f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 22 Feb 2018 16:35:39 -0800 Subject: [PATCH 12/14] Added link and status to merge.Rd --- man/merge.Rd | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/man/merge.Rd b/man/merge.Rd index b1811c7773..9c2ccf9e2d 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -15,9 +15,11 @@ very similarly to that of \code{data.frame}s except that, by default, it attempt Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to override this default. -In addition, if any column names provided to \code{by.x} also occur in \code{names(y)} (and -not in \code{by.y}) then this \code{data.table} method will add the \code{suffixes} to those column names, -while the \code{data.frame} method will not (leading to duplicate column names in the result). +In addition, if any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y}) + then this \code{data.table} method will add the \code{suffixes} to those column names. As of + R v3.4.3, the \code{data.frame} method will not (leading to duplicate column names in the result) but a patch has + been proposed (see thread \url{http://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html} + (here)) which looks likely to be accepted for a future version of R. } \usage{ From a483996f67bbbcaabc5cad3f9ce44ad9c28b4fea Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 22 Feb 2018 16:46:57 -0800 Subject: [PATCH 13/14] Fixed my bracket typo --- man/merge.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/merge.Rd b/man/merge.Rd index 9c2ccf9e2d..95573a970c 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -15,11 +15,11 @@ very similarly to that of \code{data.frame}s except that, by default, it attempt Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to override this default. -In addition, if any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y}) +In addition, if any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y}, then this \code{data.table} method will add the \code{suffixes} to those column names. As of R v3.4.3, the \code{data.frame} method will not (leading to duplicate column names in the result) but a patch has been proposed (see thread \url{http://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html} - (here)) which looks likely to be accepted for a future version of R. + (here)) which is looking likely to be accepted for a future version of R. } \usage{ From 8ad024251dedc15e7a0033a3175a7488a92a3008 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 22 Feb 2018 17:03:31 -0800 Subject: [PATCH 14/14] Moved new paragraph to Details section. --- man/merge.Rd | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/man/merge.Rd b/man/merge.Rd index 95573a970c..0032a9e849 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -14,12 +14,6 @@ very similarly to that of \code{data.frame}s except that, by default, it attempt } Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to override this default. - -In addition, if any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y}, - then this \code{data.table} method will add the \code{suffixes} to those column names. As of - R v3.4.3, the \code{data.frame} method will not (leading to duplicate column names in the result) but a patch has - been proposed (see thread \url{http://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html} - (here)) which is looking likely to be accepted for a future version of R. } \usage{ @@ -72,6 +66,12 @@ control for providing the columns to merge on with the help of the newly impleme For a more \code{data.table}-centric way of merging two \code{data.table}s, see \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. See FAQ 1.12 for a detailed comparison of \code{merge} and \code{x[y, ...]}. + +If any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y}, +then this \code{data.table} method will add the \code{suffixes} to those column names. As of +R v3.4.3, the \code{data.frame} method will not (leading to duplicate column names in the result) but a patch has +been proposed (see thread \href{http://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html}{here}) +which is looking likely to be accepted for a future version of R. } \value{