From eb73d1ca90c269ff8c85241858d3781042bc1350 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 3 Mar 2018 16:44:26 +1100 Subject: [PATCH 1/4] Added no.dups to merge to match base R merge.data.frame has now added an additional argument, no.dups, to handle cases where by.x != by.y && any(by.x %in% names(y)). The current version of merge.data.table was updated so that the 'suffixes' were added to both the by.x and y column in the joined data.table. However, base R have decided to only add the suffix to the column in Y for backwards compatibility. --- R/merge.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/merge.R b/R/merge.R index 64b626295f..d91df55e8b 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,7 +1,9 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, - all.y = all, sort = TRUE, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) { + all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) { if (!sort %in% c(TRUE, FALSE)) stop("Argument 'sort' should be logical TRUE/FALSE") + if (!no.dups %in% c(TRUE, FALSE)) + stop("Argument 'no.dups' should be logical TRUE/FALSE") if (!is.data.table(y)) { y = as.data.table(y) if (missing(by) && missing(by.x)) { @@ -51,11 +53,10 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA start[chmatch(dupnames, start, 0L)] = paste(dupnames, suffixes[1L], sep="") end[chmatch(dupnames, end, 0L)] = paste(dupnames, suffixes[2L], sep="") } - # If by.x != by.y then the 'by' column(s) are named as 'by.x' - we need - # to also handle cases where the 'by.x' column names are in 'end' + # If no.dups = TRUE we also need to added the suffix to columns in y + # that share a name with by.x dupkeyx = intersect(by.x, end) - if (length(dupkeyx)) { - by.x[chmatch(dupkeyx, by.x, 0L)] = paste(dupkeyx, suffixes[1L], sep="") + if (no.dups && length(dupkeyx)) { end[chmatch(dupkeyx, end, 0L)] = paste(dupkeyx, suffixes[2L], sep="") } From 784a9d6686790b435e2a366aa881e031ffe5fb2b Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 3 Mar 2018 16:53:53 +1100 Subject: [PATCH 2/4] updated tests --- inst/tests/tests.Rraw | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 28b236f203..0b93d1bf89 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11743,9 +11743,10 @@ test(1879.6, fread(f, verbose=TRUE), DT, sep = '.*')) unlink(f) -# Fix duplicated names arising in merge when by.x in names(y), PR#2631 +# Fix duplicated names arising in merge when by.x in names(y), PR#2631, PR#2653 # 1880.1 should fail in there are any duplicate names after a join # 1880.2 should fail if a warning is not thrown when suffixes leads to duplicate names +# 1880.3 tests no.dups = FALSE, where names should be duplicated after the join parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43)) children = data.table(parent=c("Sarah", "Max", "Max"), name=c("Oliver", "Sebastian", "Michelle"), @@ -11754,6 +11755,8 @@ joined = merge(parents, children, by.x="name", by.y="parent") test(1880.1, length(names(joined)), length(unique(names(joined)))) test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L, warning = "column names.*are duplicated in the result") +joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE)) +test(1880.3, any(duplicated(names(joined))), TRUE) # out-of-sample quote rule bump, #2265 DT = data.table(A=rep("abc", 10000), B="def") From e96e271b5827d0521f63186fa5d0ca3ac82a5c72 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 3 Mar 2018 17:00:05 +1100 Subject: [PATCH 3/4] Update documentation for merge --- man/merge.Rd | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/man/merge.Rd b/man/merge.Rd index 0032a9e849..f72fd5aa12 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -18,7 +18,7 @@ Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to overri \usage{ \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL, -all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), +all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), # default FALSE ...) } @@ -44,6 +44,9 @@ result is not sorted.} \item{suffixes}{A \code{character(2)} specifying the suffixes to be used for making non-\code{by} column names unique. The suffix behaviour works in a similar fashion as the \code{\link{merge.data.frame}} method does.} +\item{no.dups}{logical indicating that \code{suffixes} are also appended to +non-\code{by.y} column names in \code{y} when they have the same column name +as any \code{by.x}.} \item{allow.cartesian}{See \code{allow.cartesian} in \code{\link{[.data.table}}.} \item{\dots}{Not used at this time.} } From 02f46cf50b6a037ebd90ec6cbe58588e38eaa298 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sat, 3 Mar 2018 17:12:17 +1100 Subject: [PATCH 4/4] Updated NEWS entry --- NEWS.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5b9d92507c..88479a6105 100644 --- a/NEWS.md +++ b/NEWS.md @@ -151,9 +151,10 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat 33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing. -34. Fixed bug where result of `merge()` would contain duplicate column names if `by.x` was also in `names(y)`. -Where there are duplicate column names (i.e. `suffixes = c("", "")`) `merge()` will throw a warning to match -the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631). +34. Fixed cases where the result of `merge.data.table()` would contain duplicate column names if `by.x` was also in `names(y)`. +`merge.data.table()` gains the `no.dups` argument (default TRUE) to match the correpsonding patched behaviour in `base:::merge.data.frame()`. Now, when `by.x` is also in `names(y)` the column name from `y` has the corresponding `suffixes` added to it. `by.x` remains unchanged for backwards compatibility reasons. +In addition, where duplicate column names arise anyway (i.e. `suffixes = c("", "")`) `merge.data.table()` will now throw a warning to match the behaviour of `base:::merge.data.frame()`. +Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) and [PR#2653](https://github.com/Rdatatable/data.table/pull/2653) 35. `CJ()` now fails with proper error message when results would exceed max integer, [#2636](https://github.com/Rdatatable/data.table/issues/2636).