diff --git a/R/bmerge.R b/R/bmerge.R index d0c5c63a8e..3f5c19f36d 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -2,15 +2,19 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose) { callersi = i - # Just so that when a double in i which contains integers stored as double, is joined to an integer column - # in x, the i column is returned as integer in the result. Just before the call to bmerge() in [.data.table - # there is a shallow() copy of i to prevent this change of type affecting the user's object by reference. - # There is only one use of callersi here. - # Useful for ad hoc joins when the L postfix is often forgotten. - # Otherwise, the type of the i column is always returned. - i = shallow(i) - # merge on .SD in i fails _sometimes_ because of set() being done here, #1926 + # Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here + # by bmerge changing the type of the user's input object by reference. We now shallow copy i again. If we then + # coerce a column in i only, we are just changing the temporary coercion used for the merge operation. If we + # set callersi too then we are keeping that coerced i column in the merge result returned to user. + # The type of the i column is always returned (i.e. just i set not callersi too), other than: + # i) to convert int-as-double to int, useful for ad hoc joins when the L postfix is often forgotten. + # ii) to coerce i.factor to character when joining to x.character + # So those are the only two uses of callersi below. + # Careful to only use plonk syntax (full column) on i and x from now on, otherwise user's i and x would + # change. This is why shallow() is very importantly internal only, currently. + + # Using .SD in j to join could fail due to being locked and set() being used here, #1926 .Call(C_unlock, i) x = shallow(x) .Call(C_unlock, x) @@ -18,8 +22,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos .Call(C_unlock, callersi) on.exit(.Call(C_lock, callersi)) } - # careful to only plonk syntax (full column) on i/x from now on otherwise user's i and x would change; - # this is why shallow() is very importantly internal only, currently. supported = c(ORDERING_TYPES, "factor", "integer64") @@ -53,7 +55,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } else { if (xclass=="character") { if (verbose) cat("Coercing factor column i.",names(i)[ic]," to type character to match type of x.",names(x)[xc],".\n",sep="") - set(i, j=ic, value=as.character(i[[ic]])) + set(i, j=ic, value=val<-as.character(i[[ic]])) + set(callersi, j=ic, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 next } else if (iclass=="character") { if (verbose) cat("Matching character column i.",names(i)[ic]," to factor levels in x.",names(x)[xc],".\n",sep="") diff --git a/R/data.table.R b/R/data.table.R index 91ede5d212..3f97544372 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2400,6 +2400,7 @@ setnames = function(x,old,new,skip_absent=FALSE) { # Sets by reference, maintains truelength, no copy of table at all. # But also more convenient than names(DT)[i]="newname" because we can also do setnames(DT,"oldname","newname") # without an onerous match() ourselves. old can be positions, too, but we encourage by name for robustness. + # duplicates are permitted to be created without warning; e.g. in revdeps and for example, and setting spacer columns all with "" if (!is.data.frame(x)) stop("x is not a data.table or data.frame") ncol = length(x) if (length(names(x)) != ncol) stop("x has ",ncol," columns but its names are length ",length(names(x))) @@ -2408,7 +2409,6 @@ setnames = function(x,old,new,skip_absent=FALSE) { # for setnames(DT,new); e.g., setnames(DT,c("A","B")) where ncol(DT)==2 if (!is.character(old)) stop("Passed a vector of type '",typeof(old),"'. Needs to be type 'character'.") if (length(old) != ncol) stop("Can't assign ",length(old)," names to a ",ncol," column data.table") - # note that duplicate names are permitted to be created in this usage only if (anyNA(names(x))) { # if x somehow has some NA names, which() needs help to return them, #2475 w = which((names(x) != old) | (is.na(names(x)) & !is.na(old))) @@ -2421,7 +2421,7 @@ setnames = function(x,old,new,skip_absent=FALSE) { } else { if (missing(old)) stop("When 'new' is provided, 'old' must be provided too") if (!is.character(new)) stop("'new' is not a character vector") - if (anyDuplicated(new)) stop("Some duplicates exist in 'new': ", brackify(new[duplicated(new)])) + # if (anyDuplicated(new)) warning("Some duplicates exist in 'new': ", brackify(new[duplicated(new)])) # dups allowed without warning; warn if and when the dup causes an ambiguity if (anyNA(new)) stop("NA in 'new' at positions ", brackify(which(is.na(new)))) if (anyDuplicated(old)) stop("Some duplicates exist in 'old': ", brackify(old[duplicated(old)])) if (is.numeric(old)) i = old = seq_along(x)[old] # leave it to standard R to manipulate bounds and negative numbers diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 93aa828bf2..d705901860 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1675,8 +1675,8 @@ test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b"," # Test factor to character join when factor contains unused and reverse order levels : X = data.table(a=LETTERS[1:4],v=1:4,key="a") Y = data.table(a=factor(c("D","B"),levels=rev(LETTERS)),key="a") -test(578, X[Y,verbose=TRUE], output="Coercing factor column i.a to type character to match type of x.a") -test(579, X[Y], data.table(a=Y$a, v=c(4L,2L), key="a")) +test(578, X[Y,verbose=TRUE], data.table(a=c("D","B"), v=c(4L,2L)), # no key because "D">"B", consistent with v1.12.2 and before + output="Coercing factor column i.a to type character to match type of x.a") # Test that logical i in set() returns helpful error DT = data.table(a=1:3,b=4:6) @@ -13592,7 +13592,7 @@ test(1967.59, setnames(x, 1:2, c(8L, 9L)), error = "'new' is not a character") test(1967.60, setnames(x, -1:1, c('hey', 'you')), error = "mixed.*negative") test(1967.61, setnames(x, 1+3i, 'cplx'), error = "'old' is type complex") test(1967.62, setnames(x, 1, c('d', 'e')), error = "'old' is length 1 but 'new'") -test(1967.621, setnames(x, 1:2, c("a","a")), error = "Some duplicates exist in 'new': [a]") +test(1967.621, setnames(x, 1:2, c("a","a")), data.table(a=1:5, a=6:10)) test(1967.622, setnames(x, 1:2, c("a",NA)), error = "NA in 'new' at positions [2]") test(1967.63, setcolorder(x, c(1, 1)), error = 'specify duplicated column') test(1967.64, setcolorder(x, 1+3i), error = 'must be character or numeric') @@ -14898,9 +14898,9 @@ test(2044.62, dt1[dt2, ..cols, on="doubleInt==int", verbose=TRUE], test(2044.63, dt1[dt2, ..cols, on="realDouble==int", verbose=TRUE], data.table(x.bool=c(rep(FALSE,4),TRUE), x.int=INT(2,4,6,8,10), x.doubleInt=c(2,4,6,8,10), i.bool=TRUE, i.int=1:5, i.doubleInt=as.double(1:5), i.char=letters[1:5]), output="Coerced integer column i.int to type double for join to match type of x.realDouble") -cols = c("x.int","x.char","x.fact","i.int","i.char","i.fact") +cols = c("x.int","x.char","x.fact","i.int","i.char","i.char") test(2044.64, dt1[dt2, ..cols, on="char==fact", verbose=TRUE], - ans<-data.table(x.int=1:5, x.char=letters[1:5], x.fact=factor(letters[1:5]), i.int=1:5, i.char=letters[1:5], i.fact=factor(letters[1:5])), + ans<-data.table(x.int=1:5, x.char=letters[1:5], x.fact=factor(letters[1:5]), i.int=1:5, i.char=letters[1:5], i.char=letters[1:5]), output="Coercing factor column i.fact to type character to match type of x.char") test(2044.65, dt1[dt2, ..cols, on="fact==char", verbose=TRUE], ans,