From eb73d1ca90c269ff8c85241858d3781042bc1350 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Sat, 3 Mar 2018 16:44:26 +1100
Subject: [PATCH 1/4] Added no.dups to merge to match base R

merge.data.frame has now added an additional argument, no.dups,
to handle cases where by.x != by.y && any(by.x %in% names(y)).

The current version of merge.data.table was updated so that the
'suffixes' were added to both the by.x and y column in the joined
data.table. However, base R have decided to only add the suffix
to the column in Y for backwards compatibility.
---
 R/merge.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/R/merge.R b/R/merge.R
index 64b626295f..d91df55e8b 100644
--- a/R/merge.R
+++ b/R/merge.R
@@ -1,7 +1,9 @@
 merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
-               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
+               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
   if (!sort %in% c(TRUE, FALSE))
     stop("Argument 'sort' should be logical TRUE/FALSE")
+  if (!no.dups %in% c(TRUE, FALSE))
+    stop("Argument 'no.dups' should be logical TRUE/FALSE")
   if (!is.data.table(y)) {
     y = as.data.table(y)
     if (missing(by) && missing(by.x)) {
@@ -51,11 +53,10 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
     start[chmatch(dupnames, start, 0L)] = paste(dupnames, suffixes[1L], sep="")
     end[chmatch(dupnames, end, 0L)] = paste(dupnames, suffixes[2L], sep="")
   }
-  # If by.x != by.y then the 'by' column(s) are named as 'by.x' - we need
-  # to also handle cases where the 'by.x' column names are in 'end'
+  # If no.dups = TRUE we also need to added the suffix to columns in y
+  # that share a name with by.x
   dupkeyx = intersect(by.x, end)
-  if (length(dupkeyx)) {
-    by.x[chmatch(dupkeyx, by.x, 0L)] = paste(dupkeyx, suffixes[1L], sep="")
+  if (no.dups && length(dupkeyx)) {
     end[chmatch(dupkeyx, end, 0L)] = paste(dupkeyx, suffixes[2L], sep="")
   }
 

From 784a9d6686790b435e2a366aa881e031ffe5fb2b Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Sat, 3 Mar 2018 16:53:53 +1100
Subject: [PATCH 2/4] updated tests

---
 inst/tests/tests.Rraw | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 28b236f203..0b93d1bf89 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11743,9 +11743,10 @@ test(1879.6, fread(f, verbose=TRUE), DT,
                   sep = '.*'))
 unlink(f)
 
-# Fix duplicated names arising in merge when by.x in names(y), PR#2631
+# Fix duplicated names arising in merge when by.x in names(y), PR#2631, PR#2653
 # 1880.1 should fail in there are any duplicate names after a join
 # 1880.2 should fail if a warning is not thrown when suffixes leads to duplicate names
+# 1880.3 tests no.dups = FALSE, where names should be duplicated after the join
 parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43))
 children = data.table(parent=c("Sarah", "Max", "Max"),
                        name=c("Oliver", "Sebastian", "Michelle"),
@@ -11754,6 +11755,8 @@ joined = merge(parents, children, by.x="name", by.y="parent")
 test(1880.1, length(names(joined)), length(unique(names(joined))))
 test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L,
              warning = "column names.*are duplicated in the result")
+joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE))
+test(1880.3, any(duplicated(names(joined))), TRUE)             
 
 # out-of-sample quote rule bump, #2265
 DT = data.table(A=rep("abc", 10000), B="def")

From e96e271b5827d0521f63186fa5d0ca3ac82a5c72 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Sat, 3 Mar 2018 17:00:05 +1100
Subject: [PATCH 3/4] Update documentation for merge

---
 man/merge.Rd | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/man/merge.Rd b/man/merge.Rd
index 0032a9e849..f72fd5aa12 100644
--- a/man/merge.Rd
+++ b/man/merge.Rd
@@ -18,7 +18,7 @@ Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to overri
 
 \usage{
 \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL,
-all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"),
+all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE,
 allow.cartesian=getOption("datatable.allow.cartesian"),  # default FALSE
 ...)
 }
@@ -44,6 +44,9 @@ result is not sorted.}
 \item{suffixes}{A \code{character(2)} specifying the suffixes to be used for
 making non-\code{by} column names unique. The suffix behaviour works in a similar
 fashion as the \code{\link{merge.data.frame}} method does.}
+\item{no.dups}{logical indicating that \code{suffixes} are also appended to
+non-\code{by.y} column names in \code{y} when they have the same column name
+as any \code{by.x}.}
 \item{allow.cartesian}{See \code{allow.cartesian} in \code{\link{[.data.table}}.}
 \item{\dots}{Not used at this time.}
 }

From 02f46cf50b6a037ebd90ec6cbe58588e38eaa298 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Sat, 3 Mar 2018 17:12:17 +1100
Subject: [PATCH 4/4] Updated NEWS entry

---
 NEWS.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 5b9d92507c..88479a6105 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -151,9 +151,10 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat
 
 33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing.
 
-34. Fixed bug where result of `merge()` would contain duplicate column names if `by.x` was also in `names(y)`.
-Where there are duplicate column names (i.e. `suffixes = c("", "")`) `merge()` will throw a warning to match
-the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631).
+34. Fixed cases where the result of `merge.data.table()` would contain duplicate column names if `by.x` was also in `names(y)`.
+`merge.data.table()` gains the `no.dups` argument (default TRUE) to match the correpsonding patched behaviour in `base:::merge.data.frame()`. Now, when `by.x` is also in `names(y)` the column name from `y` has the corresponding `suffixes` added to it. `by.x` remains unchanged for backwards compatibility reasons.
+In addition, where duplicate column names arise anyway (i.e. `suffixes = c("", "")`) `merge.data.table()` will now throw a warning to match the behaviour of `base:::merge.data.frame()`. 
+Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) and [PR#2653](https://github.com/Rdatatable/data.table/pull/2653)
 
 35. `CJ()` now fails with proper error message when results would exceed max integer, [#2636](https://github.com/Rdatatable/data.table/issues/2636).