From 3b3239c4b238c78fd09ad35b2d24a704922a9bea Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 16 Aug 2019 00:19:44 +0800 Subject: [PATCH 1/6] Closes #3496 and #3766 -- detect & block setkey from double-reordering identical columns vestigial --- NEWS.md | 2 ++ R/data.table.R | 18 +++++++++--------- R/setkey.R | 13 ++++++++----- inst/tests/tests.Rraw | 10 ++++++++++ 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index bc6b599c35..867664e00c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -228,6 +228,8 @@ 27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report. +28. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Thanks @kirillmayantsev and @alex46015 for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/data.table.R b/R/data.table.R index 47c6729320..e926612f4b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1266,25 +1266,25 @@ replace_order = function(isub, verbose, env) { jval = eval(jsub, SDenv, parent.frame()) # copy 'jval' when required - # More speedup - only check + copy if irows is NULL # Temp fix for #921 - check address and copy *after* evaluating 'jval' - if (is.null(irows)) { - if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. - jcpy = address(jval) %in% vapply_1c(SDenv$.SD, address) # %chin% errors when RHS is list() - if (jcpy) jval = copy(jval) - } else if (address(jval) == address(SDenv$.SD)) { + # need to check if copy needed regardless of irows status (#3766) + if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. + jcpy = address(jval) %chin% vapply_1c(SDenv$.SD, address) + if (jcpy) jval = copy(jval) + } else { + if (address(jval) == address(SDenv$.SD)) { jval = copy(jval) - } else if ( length(jcpy <- which(vapply_1c(jval, address) %in% vapply_1c(SDenv, address))) ) { + } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) { for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]]) - } else if (is.call(jsub) && jsub[[1L]] == "get" && is.list(jval)) { + } else if (is.call(jsub) && jsub[[1L]] == "get") { jval = copy(jval) # fix for #1212 } - } else { if (is.data.table(jval)) { setattr(jval, '.data.table.locked', NULL) # fix for #1341 if (!truelength(jval)) alloc.col(jval) } } + if (!is.null(lhs)) { # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above. .Call(Cassign,x,irows,cols,newnames,jval) diff --git a/R/setkey.R b/R/setkey.R index fbf8accbe8..0037a9ddb0 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -102,12 +102,15 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. if (length(o)) { - if (verbose) { - tt = suppressMessages(system.time(.Call(Creorder,x,o))) - cat("reorder took", tt["user.self"]+tt["sys.self"], "sec\n") - } else { - .Call(Creorder,x,o) + # table has deep-duplicate columns, e.g. #3496 + if (any(idx <- duplicated(vapply_1c(x, address)))) { + if (verbose) { last.started.at=proc.time(); cat('Deep duplicate columns (identical address in memory) detected... copying', sum(idx), 'columns ...'); flush.console() } + for (j in which(idx)) set(x, NULL, j, copy(x[[j]])) + if (verbose) { cat("done in", timetaken(last.started.at), "\n"); flush.console() } } + if (verbose) { last.started.at = proc.time() } + .Call(Creorder,x,o) + if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() } } else { if (verbose) cat("x is already ordered by these columns, no need to call reorder\n") } # else empty integer() from forderv means x is already ordered by those cols, nothing to do. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 58c3c63c4a..503bdc8a75 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15671,6 +15671,16 @@ if (test_bit64) { test(2077.06, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g])) } +# #3766 simple queries can create tables with deep-copy-identical columns +x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1) +test(2078.1, x[a == 1L, .(b, b2=b)][ , !identical(address(b), address(b2))]) +# #3496 generally need to protect against deep-copy-idential columns in setkey +x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) +x$b = x$a +setDT(x) +test(2078.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns') +test(2078.3, x, data.table(a=paste0(1:2), b=paste0(1:2), key='a')) + ################################### # Add new tests above this line # From 9d4e929fb318a488be9d754f77bcfb1872500f69 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 18 Aug 2019 23:28:43 +0800 Subject: [PATCH 2/6] also closed #2245; test added --- NEWS.md | 2 +- inst/tests/tests.Rraw | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 867664e00c..f02e967a30 100644 --- a/NEWS.md +++ b/NEWS.md @@ -228,7 +228,7 @@ 27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report. -28. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Thanks @kirillmayantsev and @alex46015 for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue. +28. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Finally, `[` might leave its output with `.data.table.locked`, preventing downstream use of `:=`, [#2245](https://github.com/Rdatatable/data.table/issues/2245). Thanks @kirillmayantsev, @alex46015, and @grayskripko for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue. #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 503bdc8a75..ace84c3166 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15680,6 +15680,9 @@ x$b = x$a setDT(x) test(2078.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns') test(2078.3, x, data.table(a=paste0(1:2), b=paste0(1:2), key='a')) +## #2245 unset .data.table.locked even if is.null(irows) +x <- data.table(a=c(0.85, -0.38, 1.19), b=c(0.56, 0.63, -1.30)) +test(2078.4, x[ , round(.SD, 1)][ , c := 8], data.table(a=c(.8, -.4, 1.2), b=c(.6,.6,-1.3), c=8)) ################################### From df3aed31da5e141a686f0df2dfbbe139be080c31 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 26 Aug 2019 15:35:48 -0700 Subject: [PATCH 3/6] test number typo --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b412fad704..3f6b68b7f4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15778,7 +15778,7 @@ test(2087.1, x[a == 1L, .(b, b2=b)][ , !identical(address(b), address(b2))]) x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) x$b = x$a setDT(x) -test(2078.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns') +test(2087.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns') test(2087.3, x, data.table(a=paste0(1:2), b=paste0(1:2), key='a')) ## #2245 unset .data.table.locked even if is.null(irows) x <- data.table(a=c(0.85, -0.38, 1.19), b=c(0.56, 0.63, -1.30)) From 7637f5130d52037de54fbc230e7ebbb379efa428 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 26 Aug 2019 18:46:59 -0700 Subject: [PATCH 4/6] created copySharedColumns at C level, called from reorder.c at root cause. Faster too by avoiding address() at R level which creates character strings for addresses and avoiding hash table of duplicated(); only noticeable for very many columns. --- R/setkey.R | 12 ++++++------ inst/tests/tests.Rraw | 10 ++++++++-- src/data.table.h | 1 + src/reorder.c | 2 +- src/utils.c | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 52 insertions(+), 9 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 0037a9ddb0..0ffdf23a46 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -41,6 +41,12 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU setattr(x,"index",NULL) # setkey(DT,NULL) also clears secondary keys. setindex(DT,NULL) just clears secondary keys. return(invisible(x)) } + if (!missing(verbose)) { + stopifnot(isTRUEorFALSE(verbose)) + # set the global verbose option because that is fetched from C code without having to pass it through + oldverbose = options(datatable.verbose=verbose) + on.exit(options(oldverbose)) + } if (!is.data.table(x)) stop("x is not a data.table") if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.") if (physical && identical(attr(x, ".data.table.locked", exact=TRUE),TRUE)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.") @@ -102,12 +108,6 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. if (length(o)) { - # table has deep-duplicate columns, e.g. #3496 - if (any(idx <- duplicated(vapply_1c(x, address)))) { - if (verbose) { last.started.at=proc.time(); cat('Deep duplicate columns (identical address in memory) detected... copying', sum(idx), 'columns ...'); flush.console() } - for (j in which(idx)) set(x, NULL, j, copy(x[[j]])) - if (verbose) { cat("done in", timetaken(last.started.at), "\n"); flush.console() } - } if (verbose) { last.started.at = proc.time() } .Call(Creorder,x,o) if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3f6b68b7f4..2755fe2205 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15778,8 +15778,14 @@ test(2087.1, x[a == 1L, .(b, b2=b)][ , !identical(address(b), address(b2))]) x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) x$b = x$a setDT(x) -test(2087.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns') -test(2087.3, x, data.table(a=paste0(1:2), b=paste0(1:2), key='a')) +test(2087.2, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), key="a"), + output='Found and copied 1 column with a shared memory address') +x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) +x$b = x$a +x$c = x$a +setDT(x) +test(2087.3, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), c=paste0(1:2), key="a"), + output='Found and copied 2 columns with a shared memory address') ## #2245 unset .data.table.locked even if is.null(irows) x <- data.table(a=c(0.85, -0.38, 1.19), b=c(0.56, 0.63, -1.30)) test(2087.4, x[ , round(.SD, 1)][ , c := 8], data.table(a=c(.8, -.4, 1.2), b=c(.6,.6,-1.3), c=8)) diff --git a/src/data.table.h b/src/data.table.h index 183d90bee8..469b9e76e7 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -216,4 +216,5 @@ void coerceFill(SEXP fill, double *dfill, int32_t *ifill, int64_t *i64fill); SEXP coerceFillR(SEXP fill); bool INHERITS(SEXP x, SEXP char_); bool Rinherits(SEXP x, SEXP char_); +void copySharedColumns(SEXP x); diff --git a/src/reorder.c b/src/reorder.c index e65a58f26c..ec3824efa0 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -21,6 +21,7 @@ SEXP reorder(SEXP x, SEXP order) maxSize=SIZEOF(v); if (ALTREP(v)) SET_VECTOR_ELT(x,i,duplicate(v)); // expand compact vector in place ready for reordering by reference } + copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16) error("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%d)", type2char(TYPEOF(x)), SIZEOF(x)); @@ -34,7 +35,6 @@ SEXP reorder(SEXP x, SEXP order) int nprotect = 0; if (ALTREP(order)) { order=PROTECT(duplicate(order)); nprotect++; } // TODO: how to fetch range of ALTREP compact vector - const int *restrict idx = INTEGER(order); int i=0; while (i1?"s":""); + // GetVerbose() (slightly expensive call of all options) called here only when needed + } +} + From 5b59f6e17b8f1cfbcd4fdcefc603eb432d71e61a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 27 Aug 2019 16:13:45 +0800 Subject: [PATCH 5/6] remove ancillary issue fix to be filed independently --- NEWS.md | 2 +- R/data.table.R | 13 +++++++------ inst/tests/tests.Rraw | 8 +------- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8b29a90967..7baa5af5ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -234,7 +234,7 @@ 30. `groupingsets` functions now properly handle alone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report. -31. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Finally, `[` might leave its output with `.data.table.locked`, preventing downstream use of `:=`, [#2245](https://github.com/Rdatatable/data.table/issues/2245). Thanks @kirillmayantsev, @alex46015, and @grayskripko for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue. +31. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Thanks @kirillmayantsev and @alex46015 for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue. #### NOTES diff --git a/R/data.table.R b/R/data.table.R index e926612f4b..3885eef2ed 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1266,19 +1266,20 @@ replace_order = function(isub, verbose, env) { jval = eval(jsub, SDenv, parent.frame()) # copy 'jval' when required + # More speedup - only check + copy if irows is NULL # Temp fix for #921 - check address and copy *after* evaluating 'jval' - # need to check if copy needed regardless of irows status (#3766) - if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. - jcpy = address(jval) %chin% vapply_1c(SDenv$.SD, address) - if (jcpy) jval = copy(jval) - } else { - if (address(jval) == address(SDenv$.SD)) { + if (is.null(irows)) { + if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. + jcpy = address(jval) %in% vapply_1c(SDenv$.SD, address) # %chin% errors when RHS is list() + if (jcpy) jval = copy(jval) + } else if (address(jval) == address(SDenv$.SD)) { jval = copy(jval) } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) { for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]]) } else if (is.call(jsub) && jsub[[1L]] == "get") { jval = copy(jval) # fix for #1212 } + } else { if (is.data.table(jval)) { setattr(jval, '.data.table.locked', NULL) # fix for #1341 if (!truelength(jval)) alloc.col(jval) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2755fe2205..1e874a2461 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15771,10 +15771,7 @@ test(2086.04, DT[ , sum(a), keyby = list()], data.table(V1=55L)) test(2086.05, DT[ , sum(a), by = character()], data.table(V1=55L)) test(2086.06, DT[ , sum(a), keyby = character()], data.table(V1=55L)) -# #3766 simple queries can create tables with deep-copy-identical columns -x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1) -test(2087.1, x[a == 1L, .(b, b2=b)][ , !identical(address(b), address(b2))]) -# #3496 generally need to protect against deep-copy-idential columns in setkey +# #3496/#3766 setkey re-reorders columns which are deep-copy-identical x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) x$b = x$a setDT(x) @@ -15786,9 +15783,6 @@ x$c = x$a setDT(x) test(2087.3, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), c=paste0(1:2), key="a"), output='Found and copied 2 columns with a shared memory address') -## #2245 unset .data.table.locked even if is.null(irows) -x <- data.table(a=c(0.85, -0.38, 1.19), b=c(0.56, 0.63, -1.30)) -test(2087.4, x[ , round(.SD, 1)][ , c := 8], data.table(a=c(.8, -.4, 1.2), b=c(.6,.6,-1.3), c=8)) ################################### From f7031ba848bd4d074090779d45ccf4d6540c703b Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 27 Aug 2019 12:06:49 -0700 Subject: [PATCH 6/6] tweaks to news item and test --- NEWS.md | 2 +- inst/tests/tests.Rraw | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7baa5af5ab..bbcad04707 100644 --- a/NEWS.md +++ b/NEWS.md @@ -234,7 +234,7 @@ 30. `groupingsets` functions now properly handle alone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report. -31. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Thanks @kirillmayantsev and @alex46015 for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue. +31. A `data.table` created using `setDT()` on a `data.frame` containing identical columns referencing each other would cause `setkey()` to return incorrect results, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). Thanks @kirillmayantsev and @alex46015 for reporting, and @jaapwalhout and @Atrebas for helping to debug and isolate the issue. #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1e874a2461..de3fe0282e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15771,7 +15771,10 @@ test(2086.04, DT[ , sum(a), keyby = list()], data.table(V1=55L)) test(2086.05, DT[ , sum(a), by = character()], data.table(V1=55L)) test(2086.06, DT[ , sum(a), keyby = character()], data.table(V1=55L)) -# #3496/#3766 setkey re-reorders columns which are deep-copy-identical +# simple queries can create tables with columns sharing the same address, #3766 +x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1) +test(2087.1, x[a == 1L, .(b, b2=b)][ , identical(address(b), address(b2))]) +# setkey detects and copies shared address columns, #3496 x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) x$b = x$a setDT(x)