From 08b4b44a631ee4e752c02dff8a5fed712a94f6ec Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 19 Aug 2019 11:36:27 +0800 Subject: [PATCH 1/5] Closes #1470 -- streamline loop in GForce j optimization --- NEWS.md | 2 ++ R/data.table.R | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index bc6b599c35..0f6e58f701 100644 --- a/NEWS.md +++ b/NEWS.md @@ -333,6 +333,8 @@ 19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object which has columns (such as a `data.frame` or `matrix`). Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are `data.frame` to support this use case. +20. `data.table` optimization much more efficient for many as-is columns in `j`, [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. + #### NOTES 1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background. diff --git a/R/data.table.R b/R/data.table.R index 47c6729320..a20bdc9775 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1461,6 +1461,7 @@ replace_order = function(isub, verbose, env) { lockBinding(".GRP",SDenv) lockBinding(".iSD",SDenv) + #browser() GForce = FALSE if ( getOption("datatable.optimize")>=1 && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[[1L]] %chin% c(".SD",".N"))) ) { # Ability to turn off if problems or to benchmark the benefit # Optimization to reduce overhead of calling lapply over and over for each group @@ -1612,9 +1613,9 @@ replace_order = function(isub, verbose, env) { } if (verbose) { if (!identical(oldjsub, jsub)) - cat("lapply optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub,width.cutoff=200L),"'\n",sep="") + cat("lapply optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") else - cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L),"'\n",sep="") + cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") } dotN = function(x) is.name(x) && x == ".N" # For #5760 # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with @@ -1644,7 +1645,7 @@ replace_order = function(isub, verbose, env) { } if (jsub[[1L]]=="list") { GForce = TRUE - for (ii in seq_along(jsub)[-1L]) if (!.ok(jsub[[ii]])) GForce = FALSE + for (ii in seq_along(jsub)[-1L]) if (!.ok(jsub[[ii]])) {GForce = FALSE; break} } else GForce = .ok(jsub) if (GForce) { if (jsub[[1L]]=="list") @@ -1666,13 +1667,12 @@ replace_order = function(isub, verbose, env) { nomeanopt=FALSE # to be set by .optmean() using <<- inside it oldjsub = jsub if (jsub[[1L]]=="list") { - for (ii in seq_along(jsub)[-1L]) { - this_jsub = jsub[[ii]] - if (dotN(this_jsub)) next; # For #5760 - # Addressing #1369, #2949 and #1974. Added is.symbol() check to handle cases where expanded function definition is used instead of function names. #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table. + # Addressing #1369, #2949 and #1974. Added is.symbol() check to handle cases where expanded function definition is used instead of function names. #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table. + # earlier, did a for loop and replaced only as necessary but this was slow with huge (e.g. 30K elements) j, #1470 + jsub = as.call(c(quote(list), lapply(jsub[-1L], function(this_jsub) { if (is.call(this_jsub) && is.symbol(this_jsub[[1L]]) && this_jsub[[1L]]=="mean") - jsub[[ii]] = .optmean(this_jsub) - } + .optmean(this_jsub) else this_jsub + }))) } else if (jsub[[1L]]=="mean") { jsub = .optmean(jsub) } From 55f813c21a937f1798b7e8aeb5d21d049d992525 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 28 Aug 2019 13:40:59 -0700 Subject: [PATCH 2/5] moved news item up to the right version --- NEWS.md | 4 ++-- R/data.table.R | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1cb78e1dda..75c6419be9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -162,6 +162,8 @@ 23. Added a `data.table` method for `utils::edit` to ensure a `data.table` is returned, for convenience, [#593](https://github.com/Rdatatable/data.table/issues/593). +24. More efficient optimization of as-is columns in `j`, [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. @@ -358,8 +360,6 @@ 19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object which has columns (such as a `data.frame` or `matrix`). Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are `data.frame` to support this use case. -20. `data.table` optimization much more efficient for many as-is columns in `j`, [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. - #### NOTES 1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background. diff --git a/R/data.table.R b/R/data.table.R index da7a458b40..f3f875a0c3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1461,7 +1461,6 @@ replace_order = function(isub, verbose, env) { lockBinding(".GRP",SDenv) lockBinding(".iSD",SDenv) - #browser() GForce = FALSE if ( getOption("datatable.optimize")>=1 && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[[1L]] %chin% c(".SD",".N"))) ) { # Ability to turn off if problems or to benchmark the benefit # Optimization to reduce overhead of calling lapply over and over for each group From ad3903e3296a5ef1f2d833509e96bdec3ba079bb Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 28 Aug 2019 14:36:06 -0700 Subject: [PATCH 3/5] used seq.int to avoid [-1L] copy, and similary sapply all of jsub to avoid jsub[-1L] --- R/data.table.R | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index f3f875a0c3..e57efa5f30 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1643,7 +1643,9 @@ replace_order = function(isub, verbose, env) { } if (jsub[[1L]]=="list") { GForce = TRUE - for (ii in seq_along(jsub)[-1L]) if (!.ok(jsub[[ii]])) {GForce = FALSE; break} + for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { + if (!.ok(jsub[[ii]])) {GForce = FALSE; break} + } } else GForce = .ok(jsub) if (GForce) { if (jsub[[1L]]=="list") @@ -1667,10 +1669,13 @@ replace_order = function(isub, verbose, env) { if (jsub[[1L]]=="list") { # Addressing #1369, #2949 and #1974. Added is.symbol() check to handle cases where expanded function definition is used instead of function names. #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table. # earlier, did a for loop and replaced only as necessary but this was slow with huge (e.g. 30K elements) j, #1470 - jsub = as.call(c(quote(list), lapply(jsub[-1L], function(this_jsub) { - if (is.call(this_jsub) && is.symbol(this_jsub[[1L]]) && this_jsub[[1L]]=="mean") - .optmean(this_jsub) else this_jsub - }))) + whichMean = which(sapply(jsub, function(x) { + is.call(x) && is.symbol(x[[1L]]) && x[[1L]]=="mean" + # jsub[[1]]=="list" so the first item will always be FALSE + })) + if (length(whichMean)) { + jsub[whichMean] = lapply(jsub[whichMean], .optmean) + } } else if (jsub[[1L]]=="mean") { jsub = .optmean(jsub) } From 8d0419d5d84f4d7875938da18b54ad27543c295e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 28 Aug 2019 15:39:38 -0700 Subject: [PATCH 4/5] more comments and removed the if(length(whichMean)) --- NEWS.md | 2 +- R/data.table.R | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 75c6419be9..0b762ca3db 100644 --- a/NEWS.md +++ b/NEWS.md @@ -162,7 +162,7 @@ 23. Added a `data.table` method for `utils::edit` to ensure a `data.table` is returned, for convenience, [#593](https://github.com/Rdatatable/data.table/issues/593). -24. More efficient optimization of as-is columns in `j`, [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. +24. More efficient optimization of many columns in `j` (e.g. from `.SD`), [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. #### BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index e57efa5f30..41702d4829 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1615,7 +1615,7 @@ replace_order = function(isub, verbose, env) { else cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") } - dotN = function(x) is.name(x) && x == ".N" # For #5760 + dotN = function(x) is.name(x) && x==".N" # For #5760. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with # nomatch=0L even now.. but not switching it on yet, will deal it separately. if (getOption("datatable.optimize")>=2 && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { @@ -1667,15 +1667,13 @@ replace_order = function(isub, verbose, env) { nomeanopt=FALSE # to be set by .optmean() using <<- inside it oldjsub = jsub if (jsub[[1L]]=="list") { - # Addressing #1369, #2949 and #1974. Added is.symbol() check to handle cases where expanded function definition is used instead of function names. #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table. - # earlier, did a for loop and replaced only as necessary but this was slow with huge (e.g. 30K elements) j, #1470 + # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been dotN() or the for-looped if() whichMean = which(sapply(jsub, function(x) { is.call(x) && is.symbol(x[[1L]]) && x[[1L]]=="mean" # jsub[[1]]=="list" so the first item will always be FALSE + # is.symbol() for when expanded function definition is used instead of function names. #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table. })) - if (length(whichMean)) { - jsub[whichMean] = lapply(jsub[whichMean], .optmean) - } + jsub[whichMean] = lapply(jsub[whichMean], .optmean) } else if (jsub[[1L]]=="mean") { jsub = .optmean(jsub) } From ebb611ae253418623182392e5d5b70350a28dfc3 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 28 Aug 2019 15:57:51 -0700 Subject: [PATCH 5/5] which() looked a bit early. changed more for readability and for coverage reasons --- R/data.table.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 41702d4829..43541e9afb 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1667,13 +1667,16 @@ replace_order = function(isub, verbose, env) { nomeanopt=FALSE # to be set by .optmean() using <<- inside it oldjsub = jsub if (jsub[[1L]]=="list") { - # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been dotN() or the for-looped if() - whichMean = which(sapply(jsub, function(x) { + # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been dotN() and/or the for-looped if() + todo = sapply(jsub, function(x) { is.call(x) && is.symbol(x[[1L]]) && x[[1L]]=="mean" # jsub[[1]]=="list" so the first item will always be FALSE - # is.symbol() for when expanded function definition is used instead of function names. #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table. - })) - jsub[whichMean] = lapply(jsub[whichMean], .optmean) + # is.symbol() for when expanded function definition is used instead of function names; #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table + }) + if (any(todo)) { + w = which(todo) + jsub[w] = lapply(jsub[w], .optmean) + } } else if (jsub[[1L]]=="mean") { jsub = .optmean(jsub) }