Rdatatable · mattdowle · Mar 5, 2018 · Mar 3, 2018 · Mar 3, 2018 · Mar 3, 2018
@@ -151,9 +151,10 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat
 
 33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing.
 
-34. Fixed bug where result of `merge()` would contain duplicate column names if `by.x` was also in `names(y)`.
-Where there are duplicate column names (i.e. `suffixes = c("", "")`) `merge()` will throw a warning to match
-the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631).
+34. Fixed cases where the result of `merge.data.table()` would contain duplicate column names if `by.x` was also in `names(y)`.
+`merge.data.table()` gains the `no.dups` argument (default TRUE) to match the corresponding patched behaviour in `base:::merge.data.frame()`. Now, when `by.x` is also in `names(y)` the column name from `y` has the corresponding `suffixes` added to it. `by.x` remains unchanged for backwards compatibility reasons.
+In addition, where duplicate column names arise anyway (i.e. `suffixes = c("", "")`) `merge.data.table()` will now throw a warning to match the behaviour of `base:::merge.data.frame()`. 
+Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) and [PR#2653](https://github.com/Rdatatable/data.table/pull/2653).
 
 35. `CJ()` now fails with proper error message when results would exceed max integer, [#2636](https://github.com/Rdatatable/data.table/issues/2636).
 

@@ -1,7 +1,9 @@
 merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
-               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
+               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
   if (!sort %in% c(TRUE, FALSE))
     stop("Argument 'sort' should be logical TRUE/FALSE")
+  if (!no.dups %in% c(TRUE, FALSE))
+    stop("Argument 'no.dups' should be logical TRUE/FALSE")
   if (!is.data.table(y)) {
     y = as.data.table(y)
     if (missing(by) && missing(by.x)) {
@@ -51,11 +53,10 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
     start[chmatch(dupnames, start, 0L)] = paste(dupnames, suffixes[1L], sep="")
     end[chmatch(dupnames, end, 0L)] = paste(dupnames, suffixes[2L], sep="")
   }
-  # If by.x != by.y then the 'by' column(s) are named as 'by.x' - we need
-  # to also handle cases where the 'by.x' column names are in 'end'
+  # If no.dups = TRUE we also need to add the suffix to columns in y
+  # that share a name with by.x
   dupkeyx = intersect(by.x, end)
-  if (length(dupkeyx)) {
-    by.x[chmatch(dupkeyx, by.x, 0L)] = paste(dupkeyx, suffixes[1L], sep="")
+  if (no.dups && length(dupkeyx)) {
     end[chmatch(dupkeyx, end, 0L)] = paste(dupkeyx, suffixes[2L], sep="")
   }
 

@@ -11743,9 +11743,10 @@ test(1879.6, fread(f, verbose=TRUE), DT,
                   sep = '.*'))
 unlink(f)
 
-# Fix duplicated names arising in merge when by.x in names(y), PR#2631
+# Fix duplicated names arising in merge when by.x in names(y), PR#2631, PR#2653
 # 1880.1 should fail if there are any duplicate names after a join
 # 1880.2 should fail if a warning is not thrown when suffixes leads to duplicate names
+# 1880.3 tests no.dups = FALSE, where names should be duplicated after the join
 parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43))
 children = data.table(parent=c("Sarah", "Max", "Max"),
                        name=c("Oliver", "Sebastian", "Michelle"),
@@ -11754,6 +11755,8 @@ joined = merge(parents, children, by.x="name", by.y="parent")
 test(1880.1, length(names(joined)), length(unique(names(joined))))
 test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L,
              warning = "column names.*are duplicated in the result")
+joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE))
+test(1880.3, any(duplicated(names(joined))), TRUE)             
 
 # out-of-sample quote rule bump, #2265
 DT = data.table(A=rep("abc", 10000), B="def")

@@ -18,7 +18,7 @@ Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to overri
 
 \usage{
 \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL,
-all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"),
+all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE,
 allow.cartesian=getOption("datatable.allow.cartesian"),  # default FALSE
 ...)
 }
@@ -44,6 +44,9 @@ result is not sorted.}
 \item{suffixes}{A \code{character(2)} specifying the suffixes to be used for
 making non-\code{by} column names unique. The suffix behaviour works in a similar
 fashion as the \code{\link{merge.data.frame}} method does.}
+\item{no.dups}{logical; if \code{TRUE} (the default), the second element of
+\code{suffixes} is also appended to non-\code{by.y} column names in \code{y}
+that have the same name as any \code{by.x} column.}
 \item{allow.cartesian}{See \code{allow.cartesian} in \code{\link{[.data.table}}.}
 \item{\dots}{Not used at this time.}
 }