From bbe734fa200ad5dd4e12439b7ac82544b874c4be Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 24 Aug 2019 05:04:47 +0800 Subject: [PATCH 1/6] Closes #3209 -- g[[ --- R/data.table.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 47c6729320..0d79ed2bdf 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1512,7 +1512,7 @@ replace_order = function(isub, verbose, env) { jvnames = ansvarsnew } } else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with - subopt = length(jsub) == 3L && jsub[[1L]] == "[" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") + subopt = length(jsub) == 3L && (jsub[[1L]] == "[" || jsub[[1L]] == "[[") && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail" firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030 if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") && @@ -1628,7 +1628,7 @@ replace_order = function(isub, verbose, env) { } } else { # Apply GForce - gfuns = c("sum", "prod", "mean", "median", "var", "sd", ".N", "min", "max", "head", "last", "first", "tail", "[") # added .N for #5760 + gfuns = c("sum", "prod", "mean", "median", "var", "sd", ".N", "min", "max", "head", "last", "first", "tail", "[", "[[") # added .N for #5760 .ok = function(q) { if (dotN(q)) return(TRUE) # For #5760 # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD @@ -1640,7 +1640,7 @@ replace_order = function(isub, verbose, env) { # otherwise there must be three arguments, and only in two cases: # 1) head/tail(x, 1) or 2) x[n], n>0 length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && - ( (q1c %chin% c("head", "tail") && q3==1L) || (q1c == "[" && q3>0L) ) + ( (q1c %chin% c("head", "tail") && q3==1L) || ((q1c == "[" || q1c == "[[") && q3>0L) ) } if (jsub[[1L]]=="list") { GForce = TRUE @@ -2790,7 +2790,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { } # GForce functions -`g[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. +`g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment gfirst = function(x) .Call(Cgfirst, x) From d8dfc0df679d3f5f82b4b8cf57181e672ee38caa Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 24 Aug 2019 05:11:04 +0800 Subject: [PATCH 2/6] tidy code to simplify adding GForce functions gforce_ok back to local scope --- R/data.table.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 0d79ed2bdf..b540532b68 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1628,8 +1628,7 @@ replace_order = function(isub, verbose, env) { } } else { # Apply GForce - gfuns = c("sum", "prod", "mean", "median", "var", "sd", ".N", "min", "max", "head", "last", "first", "tail", "[", "[[") # added .N for #5760 - .ok = function(q) { + .gforce_ok = function(q) { if (dotN(q)) return(TRUE) # For #5760 # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD # is.symbol() is for #1369, #1974 and #2949 @@ -1644,8 +1643,8 @@ replace_order = function(isub, verbose, env) { } if (jsub[[1L]]=="list") { GForce = TRUE - for (ii in seq_along(jsub)[-1L]) if (!.ok(jsub[[ii]])) GForce = FALSE - } else GForce = .ok(jsub) + for (ii in seq_along(jsub)[-1L]) if (!.gforce_ok(jsub[[ii]])) {GForce = FALSE; break} + } else GForce = .gforce_ok(jsub) if (GForce) { if (jsub[[1L]]=="list") for (ii in seq_along(jsub)[-1L]) { @@ -2790,6 +2789,12 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { } # GForce functions +# to add a new function to GForce (from the R side -- the easy part!): +# (1) add it to gfuns +# (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function +# (3) define the gfun = function() R wrapper +gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", + "median", "min", "max", "var", "sd", ".N") # added .N for #5760 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment From 653d9ce3596c1f39b35a8dd23a4ed4d49f09c623 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 24 Aug 2019 22:50:24 +0800 Subject: [PATCH 3/6] add test, NEWS --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 6b6999c7a6..b3e582d2ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -232,6 +232,8 @@ 29. `integer64` defined on a subset of a new column would leave "gibberish" on the remaining rows, [#3723](https://github.com/Rdatatable/data.table/issues/3723). A bug in `rbindlist` with the same root cause was also fixed, [#1459](https://github.com/Rdatatable/data.table/issues/1459). Thanks @shrektan and @jangorecki for the reports. +30. `[[` calls by group _on vectors_ now optimized with `GForce`, [#3209](https://github.com/Rdatatable/data.table/issues/3209). Thanks @renkun-ken for the suggestion. Optimization for list columns has been skipped for now; please file a feature request with your use case if this would benefit you. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index aade8300e6..8709e76ec8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7977,6 +7977,16 @@ test(1581.11, ans2 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], test(1581.12, ans1, ans2) options(datatable.optimize = Inf) +# #3209 g[[ +options(datatable.optimize=1L) +test(1581.13, ans1 <- dt[x %in% letters[15:20], i1[[2]], by=x, verbose=TRUE], + output = "(GForce FALSE)") +options(datatable.optimize=2L) +test(1581.14, ans2 <- dt[x %in% letters[15:20], i1[[2]], by=x, verbose=TRUE], + output = "GForce optimized j") +test(1581.15, ans1, ans2) +options(datatable.optimize = Inf) + # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) @@ -15738,7 +15748,6 @@ if (test_bit64) { test(2083.2, rbind(data.table(a=1:2, b=as.integer64(c(1,NA))), data.table(a=3L), fill=TRUE)$b, as.integer64(c(1, NA, NA))) } - ################################### # Add new tests above this line # ################################### From ea47e4bb262296778d5d974cb0f7e8abbc3722b7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 24 Aug 2019 23:15:14 +0800 Subject: [PATCH 4/6] opt=Inf needed --- inst/tests/tests.Rraw | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8709e76ec8..7a6e17b160 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7981,11 +7981,10 @@ options(datatable.optimize = Inf) options(datatable.optimize=1L) test(1581.13, ans1 <- dt[x %in% letters[15:20], i1[[2]], by=x, verbose=TRUE], output = "(GForce FALSE)") -options(datatable.optimize=2L) +options(datatable.optimize=Inf) test(1581.14, ans2 <- dt[x %in% letters[15:20], i1[[2]], by=x, verbose=TRUE], output = "GForce optimized j") test(1581.15, ans1, ans2) -options(datatable.optimize = Inf) # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) From 77115fc506548446866a20d65b2fa25c8559b75f Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 30 Aug 2019 11:06:38 -0700 Subject: [PATCH 5/6] d1 not i1 because the i1 was removed from dt on line above between test 1580 and 1581.1 --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e84ecf7ead..f1e1bf7718 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7979,10 +7979,10 @@ options(datatable.optimize = Inf) # #3209 g[[ options(datatable.optimize=1L) -test(1581.13, ans1 <- dt[x %in% letters[15:20], i1[[2]], by=x, verbose=TRUE], +test(1581.13, ans1 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output = "(GForce FALSE)") options(datatable.optimize=Inf) -test(1581.14, ans2 <- dt[x %in% letters[15:20], i1[[2]], by=x, verbose=TRUE], +test(1581.14, ans2 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output = "GForce optimized j") test(1581.15, ans1, ans2) From dca85aec1d860885e0d4e1b3b4eeab990b03c2d0 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 30 Aug 2019 11:18:56 -0700 Subject: [PATCH 6/6] reinstated request in news item to file FR for this optimization on list columns --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4cc891b682..5300aaa0eb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -166,7 +166,7 @@ 25. `setnames(DT, old, new)` now omits any `old==new` to save redundant key and index name updates, [#3783](https://github.com/Rdatatable/data.table/issues/3783). `setnames(DT, new)` (i.e. not providing `old`) already omitted any column name updates where `names(DT)==new`; e.g. `setnames(DT, gsub('^_', '', names(DT)))` exits early if no columns start with `_`. -26. `[[` by group is now optimized, [#3209](https://github.com/Rdatatable/data.table/issues/3209). Thanks @renkun-ken for the suggestion. `[` by group was already optimized. +26. `[[` by group is now optimized for regular vectors (not type list), [#3209](https://github.com/Rdatatable/data.table/issues/3209). Thanks @renkun-ken for the suggestion. `[` by group was already optimized. Please file a feature request if you would like this optimization for list columns. #### BUG FIXES