diff --git a/NEWS.md b/NEWS.md index 21ba02f96c..bbcad04707 100644 --- a/NEWS.md +++ b/NEWS.md @@ -234,6 +234,8 @@ 30. `groupingsets` functions now properly handle alone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report. +31. A `data.table` created using `setDT()` on a `data.frame` containing identical columns referencing each other would cause `setkey()` to return incorrect results, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). Thanks @kirillmayantsev and @alex46015 for reporting, and @jaapwalhout and @Atrebas for helping to debug and isolate the issue. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/data.table.R b/R/data.table.R index 47c6729320..3885eef2ed 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1274,9 +1274,9 @@ replace_order = function(isub, verbose, env) { if (jcpy) jval = copy(jval) } else if (address(jval) == address(SDenv$.SD)) { jval = copy(jval) - } else if ( length(jcpy <- which(vapply_1c(jval, address) %in% vapply_1c(SDenv, address))) ) { + } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) { for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]]) - } else if (is.call(jsub) && jsub[[1L]] == "get" && is.list(jval)) { + } else if (is.call(jsub) && jsub[[1L]] == "get") { jval = copy(jval) # fix for #1212 } } else { @@ -1285,6 +1285,7 @@ replace_order = function(isub, verbose, env) { if (!truelength(jval)) alloc.col(jval) } } + if (!is.null(lhs)) { # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above. .Call(Cassign,x,irows,cols,newnames,jval) diff --git a/R/setkey.R b/R/setkey.R index fbf8accbe8..0ffdf23a46 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -41,6 +41,12 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU setattr(x,"index",NULL) # setkey(DT,NULL) also clears secondary keys. setindex(DT,NULL) just clears secondary keys. return(invisible(x)) } + if (!missing(verbose)) { + stopifnot(isTRUEorFALSE(verbose)) + # set the global verbose option because that is fetched from C code without having to pass it through + oldverbose = options(datatable.verbose=verbose) + on.exit(options(oldverbose)) + } if (!is.data.table(x)) stop("x is not a data.table") if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.") if (physical && identical(attr(x, ".data.table.locked", exact=TRUE),TRUE)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.") @@ -102,12 +108,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. if (length(o)) { - if (verbose) { - tt = suppressMessages(system.time(.Call(Creorder,x,o))) - cat("reorder took", tt["user.self"]+tt["sys.self"], "sec\n") - } else { - .Call(Creorder,x,o) - } + if (verbose) { last.started.at = proc.time() } + .Call(Creorder,x,o) + if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() } } else { if (verbose) cat("x is already ordered by these columns, no need to call reorder\n") } # else empty integer() from forderv means x is already ordered by those cols, nothing to do. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a84042353d..de3fe0282e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15771,6 +15771,22 @@ test(2086.04, DT[ , sum(a), keyby = list()], data.table(V1=55L)) test(2086.05, DT[ , sum(a), by = character()], data.table(V1=55L)) test(2086.06, DT[ , sum(a), keyby = character()], data.table(V1=55L)) +# simple queries can create tables with columns sharing the same address, #3766 +x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1) +test(2087.1, x[a == 1L, .(b, b2=b)][ , identical(address(b), address(b2))]) +# setkey detects and copies shared address columns, #3496 +x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) +x$b = x$a +setDT(x) +test(2087.2, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), key="a"), + output='Found and copied 1 column with a shared memory address') +x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) +x$b = x$a +x$c = x$a +setDT(x) +test(2087.3, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), c=paste0(1:2), key="a"), + output='Found and copied 2 columns with a shared memory address') + ################################### # Add new tests above this line # diff --git a/src/data.table.h b/src/data.table.h index 183d90bee8..469b9e76e7 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -216,4 +216,5 @@ void coerceFill(SEXP fill, double *dfill, int32_t *ifill, int64_t *i64fill); SEXP coerceFillR(SEXP fill); bool INHERITS(SEXP x, SEXP char_); bool Rinherits(SEXP x, SEXP char_); +void copySharedColumns(SEXP x); diff --git a/src/reorder.c b/src/reorder.c index e65a58f26c..ec3824efa0 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -21,6 +21,7 @@ SEXP reorder(SEXP x, SEXP order) maxSize=SIZEOF(v); if (ALTREP(v)) SET_VECTOR_ELT(x,i,duplicate(v)); // expand compact vector in place ready for reordering by reference } + copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16) error("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%d)", type2char(TYPEOF(x)), SIZEOF(x)); @@ -34,7 +35,6 @@ SEXP reorder(SEXP x, SEXP order) int nprotect = 0; if (ALTREP(order)) { order=PROTECT(duplicate(order)); nprotect++; } // TODO: how to fetch range of ALTREP compact vector - const int *restrict idx = INTEGER(order); int i=0; while (i1?"s":""); + // GetVerbose() (slightly expensive call of all options) called here only when needed + } +} +