Rdatatable · mattdowle · Sep 9, 2019 · Sep 8, 2019 · Sep 8, 2019
@@ -2,24 +2,26 @@
 bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
 {
   callersi = i
-  # Just so that when a double in i which contains integers stored as double, is joined to an integer column
-  # in x, the i column is returned as integer in the result. Just before the call to bmerge() in [.data.table
-  # there is a shallow() copy of i to prevent this change of type affecting the user's object by reference.
-  # There is only one use of callersi here.
-  # Useful for ad hoc joins when the L postfix is often forgotten.
-  # Otherwise, the type of the i column is always returned.
-
   i = shallow(i)
-  # merge on .SD in i fails _sometimes_ because of set() being done here, #1926
+  # Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here
+  # by bmerge changing the type of the user's input object by reference. We now shallow copy i again. If we then
+  # coerce a column in i only, we are just changing the temporary coercion used for the merge operation. If we
+  # set callersi too then we are keeping that coerced i column in the merge result returned to user.
+  # The type of the i column is always returned (i.e. just i set not callersi too), other than:
+  #   i) to convert int-as-double to int, useful for ad hoc joins when the L postfix is often forgotten.
+  #  ii) to coerce i.factor to character when joining to x.character
+  # So those are the only two uses of callersi below.
+  # Careful to only use plonk syntax (full column) on i and x from now on, otherwise user's i and x would
+  # change. This is why shallow() is very importantly internal only, currently.
+
+  # Using .SD in j to join could fail due to being locked and set() being used here, #1926
   .Call(C_unlock, i)
   x = shallow(x)
   .Call(C_unlock, x)
   if (.Call(C_islocked, callersi)) {
     .Call(C_unlock, callersi)
     on.exit(.Call(C_lock, callersi))
   }
-  # careful to only plonk syntax (full column) on i/x from now on otherwise user's i and x would change;
-  #   this is why shallow() is very importantly internal only, currently.
 
   supported = c(ORDERING_TYPES, "factor", "integer64")
 
@@ -53,7 +55,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       } else {
         if (xclass=="character") {
           if (verbose) cat("Coercing factor column i.",names(i)[ic]," to type character to match type of x.",names(x)[xc],".\n",sep="")
-          set(i, j=ic, value=as.character(i[[ic]]))
+          set(i, j=ic, value=val<-as.character(i[[ic]]))
+          set(callersi, j=ic, value=val)  # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
           next
         } else if (iclass=="character") {
           if (verbose) cat("Matching character column i.",names(i)[ic]," to factor levels in x.",names(x)[xc],".\n",sep="")

@@ -2400,6 +2400,7 @@ setnames = function(x,old,new,skip_absent=FALSE) {
   # Sets by reference, maintains truelength, no copy of table at all.
   # But also more convenient than names(DT)[i]="newname"  because we can also do setnames(DT,"oldname","newname")
   # without an onerous match() ourselves. old can be positions, too, but we encourage by name for robustness.
+  # duplicates are permitted to be created without warning; e.g. in revdeps and for example, and setting spacer columns all with ""
   if (!is.data.frame(x)) stop("x is not a data.table or data.frame")
   ncol = length(x)
   if (length(names(x)) != ncol) stop("x has ",ncol," columns but its names are length ",length(names(x)))
@@ -2408,7 +2409,6 @@ setnames = function(x,old,new,skip_absent=FALSE) {
     # for setnames(DT,new); e.g., setnames(DT,c("A","B")) where ncol(DT)==2
     if (!is.character(old)) stop("Passed a vector of type '",typeof(old),"'. Needs to be type 'character'.")
     if (length(old) != ncol) stop("Can't assign ",length(old)," names to a ",ncol," column data.table")
-    # note that duplicate names are permitted to be created in this usage only
     if (anyNA(names(x))) {
       # if x somehow has some NA names, which() needs help to return them, #2475
       w = which((names(x) != old) | (is.na(names(x)) & !is.na(old)))
@@ -2421,7 +2421,7 @@ setnames = function(x,old,new,skip_absent=FALSE) {
   } else {
     if (missing(old)) stop("When 'new' is provided, 'old' must be provided too")
     if (!is.character(new)) stop("'new' is not a character vector")
-    if (anyDuplicated(new)) stop("Some duplicates exist in 'new': ", brackify(new[duplicated(new)]))
+    #  if (anyDuplicated(new)) warning("Some duplicates exist in 'new': ", brackify(new[duplicated(new)]))  # dups allowed without warning; warn if and when the dup causes an ambiguity
     if (anyNA(new)) stop("NA in 'new' at positions ", brackify(which(is.na(new))))
     if (anyDuplicated(old)) stop("Some duplicates exist in 'old': ", brackify(old[duplicated(old)]))
     if (is.numeric(old)) i = old = seq_along(x)[old]  # leave it to standard R to manipulate bounds and negative numbers

@@ -1675,8 +1675,8 @@ test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","
 # Test factor to character join when factor contains unused and reverse order levels :
 X = data.table(a=LETTERS[1:4],v=1:4,key="a")
 Y = data.table(a=factor(c("D","B"),levels=rev(LETTERS)),key="a")
-test(578, X[Y,verbose=TRUE], output="Coercing factor column i.a to type character to match type of x.a")
-test(579, X[Y], data.table(a=Y$a, v=c(4L,2L), key="a"))
+test(578, X[Y,verbose=TRUE], data.table(a=c("D","B"), v=c(4L,2L)),  # no key because "D">"B", consistent with v1.12.2 and before
+          output="Coercing factor column i.a to type character to match type of x.a")
 
 # Test that logical i in set() returns helpful error
 DT = data.table(a=1:3,b=4:6)
@@ -13592,7 +13592,7 @@ test(1967.59, setnames(x, 1:2, c(8L, 9L)), error = "'new' is not a character")
 test(1967.60, setnames(x, -1:1, c('hey', 'you')), error = "mixed.*negative")
 test(1967.61, setnames(x, 1+3i, 'cplx'), error = "'old' is type complex")
 test(1967.62, setnames(x, 1, c('d', 'e')), error = "'old' is length 1 but 'new'")
-test(1967.621, setnames(x, 1:2, c("a","a")), error = "Some duplicates exist in 'new': [a]")
+test(1967.621, setnames(x, 1:2, c("a","a")), data.table(a=1:5, a=6:10))
 test(1967.622, setnames(x, 1:2, c("a",NA)), error = "NA in 'new' at positions [2]")
 test(1967.63, setcolorder(x, c(1, 1)), error = 'specify duplicated column')
 test(1967.64, setcolorder(x, 1+3i), error = 'must be character or numeric')
@@ -14898,9 +14898,9 @@ test(2044.62, dt1[dt2, ..cols, on="doubleInt==int", verbose=TRUE],
 test(2044.63, dt1[dt2, ..cols, on="realDouble==int", verbose=TRUE],
               data.table(x.bool=c(rep(FALSE,4),TRUE), x.int=INT(2,4,6,8,10), x.doubleInt=c(2,4,6,8,10), i.bool=TRUE, i.int=1:5, i.doubleInt=as.double(1:5), i.char=letters[1:5]),
               output="Coerced integer column i.int to type double for join to match type of x.realDouble")
-cols = c("x.int","x.char","x.fact","i.int","i.char","i.fact")
+cols = c("x.int","x.char","x.fact","i.int","i.char","i.char")
 test(2044.64, dt1[dt2, ..cols, on="char==fact", verbose=TRUE],
-              ans<-data.table(x.int=1:5, x.char=letters[1:5], x.fact=factor(letters[1:5]), i.int=1:5, i.char=letters[1:5], i.fact=factor(letters[1:5])),
+              ans<-data.table(x.int=1:5, x.char=letters[1:5], x.fact=factor(letters[1:5]), i.int=1:5, i.char=letters[1:5], i.char=letters[1:5]),
               output="Coercing factor column i.fact to type character to match type of x.char")
 test(2044.65, dt1[dt2, ..cols, on="fact==char", verbose=TRUE],
               ans,