diff --git a/NEWS.md b/NEWS.md index 28f5780152..47da1d2c8b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -139,6 +139,10 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat 33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing. +34. Fixed bug where result of `merge()` would contain duplicate column names if `by.x` was also in `names(y)`. +Where there are duplicate column names (i.e. `suffixes = c("", "")`) `merge()` will throw a warning to match +the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631). + #### NOTES 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. diff --git a/R/merge.R b/R/merge.R index ceb87c75e9..64b626295f 100644 --- a/R/merge.R +++ b/R/merge.R @@ -51,6 +51,13 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA start[chmatch(dupnames, start, 0L)] = paste(dupnames, suffixes[1L], sep="") end[chmatch(dupnames, end, 0L)] = paste(dupnames, suffixes[2L], sep="") } + # If by.x != by.y then the 'by' column(s) are named as 'by.x' - we need + # to also handle cases where the 'by.x' column names are in 'end' + dupkeyx = intersect(by.x, end) + if (length(dupkeyx)) { + by.x[chmatch(dupkeyx, by.x, 0L)] = paste(dupkeyx, suffixes[1L], sep="") + end[chmatch(dupkeyx, end, 0L)] = paste(dupkeyx, suffixes[2L], sep="") + } dt = y[x,nomatch = if (all.x) NA else 0L,on=by,allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names) @@ -78,6 +85,15 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA if (nrow(dt) > 0L) { setkeyv(dt, if (sort) by.x else NULL) } + + # Throw warning if there are duplicate column names in 'dt' (i.e. if + # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) + resultdupnames <- names(dt)[duplicated(names(dt))] + if (length(resultdupnames)) { + warning("column names ", paste0("'", resultdupnames, "'", collapse=", "), + " are duplicated in the result") + } + # merge resets class, #1378. X[Y] is quite clear that X is being *subset* by Y, # makes sense to therefore retain X's class, unlike `merge`. Hard to tell what # class to retain for *full join* for example. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 31bc4ec14b..93ea0f50cc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7163,15 +7163,12 @@ ans2 <- setDF(merge(setDT(d1), setDT(d2), all=TRUE, by="A")) test(1542.2, ans1, ans2) # test duplicate name cases setnames(d2, c("A", "Y"), c("B", "A")) -ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by.x="A", by.y="B")) -ans2 <- setDF(merge(setDT(d1), setDT(d2), by.x="A", by.y="B")) -test(1543.3, ans1, ans2) ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), by.x="B", by.y="A")) ans2 <- setDF(merge(setDT(d2), setDT(d1), by.x="B", by.y="A")) -test(1543.4, ans1, ans2) +test(1543.3, ans1, ans2) ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A")) ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A")) -test(1543.5, ans1, ans2) +test(1543.4, ans1, ans2) # test for sort=FALSE argument, #1282 set.seed(1L) @@ -11717,6 +11714,18 @@ test(1879.6, fread(f, verbose=TRUE), DT, sep = '.*')) unlink(f) +# Fix duplicated names arising in merge when by.x in names(y), PR#2631 +# 1880.1 should fail in there are any duplicate names after a join +# 1880.2 should fail if a warning is not thrown when suffixes leads to duplicate names +parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43)) +children = data.table(parent=c("Sarah", "Max", "Max"), + name=c("Oliver", "Sebastian", "Michelle"), + sex=c("M", "M", "F"), age=c(5,8,7)) +joined = merge(parents, children, by.x="name", by.y="parent") +test(1880.1, length(names(joined)), length(unique(names(joined)))) +test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L, + warning = "column names.*are duplicated in the result") + ################################### # Add new tests above this line # @@ -11737,4 +11746,3 @@ if (nfail > 0) { cat("\n",plat,"\n\nAll ",ntest," tests in inst/tests/tests.Rraw completed ok in ",timetaken(started.at)," on ",date(),"\n",sep="") # date() is included so we can tell exactly when these tests ran on CRAN. Sometimes a CRAN log can show error but that can be just # stale due to not updating yet since a fix in R-devel, for example. - diff --git a/man/merge.Rd b/man/merge.Rd index a2da1fde3c..0032a9e849 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -66,6 +66,12 @@ control for providing the columns to merge on with the help of the newly impleme For a more \code{data.table}-centric way of merging two \code{data.table}s, see \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. See FAQ 1.12 for a detailed comparison of \code{merge} and \code{x[y, ...]}. + +If any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y}, +then this \code{data.table} method will add the \code{suffixes} to those column names. As of +R v3.4.3, the \code{data.frame} method will not (leading to duplicate column names in the result) but a patch has +been proposed (see thread \href{http://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html}{here}) +which is looking likely to be accepted for a future version of R. } \value{