From 5fabd648886e091a1da95890753692afd20cdd18 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sat, 30 Oct 2021 01:55:07 +0200 Subject: [PATCH 1/9] init commit --- R/data.table.R | 6 ++++-- src/gsumm.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/init.c | 2 ++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index e020ea3e3d..931a5752a5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1749,6 +1749,7 @@ replace_dot_alias = function(e) { q_named = match.call(shift, q) if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE) } # add gshift support + if (length(q)>=3 && q[[1L]] == "weighted.mean") return(TRUE) # ^^ base::startWith errors on NULL unfortunately # head-tail uses default value n=6 which as of now should not go gforce ... ^^ # otherwise there must be three arguments, and only in two cases: @@ -1773,7 +1774,7 @@ replace_dot_alias = function(e) { # adding argument to ghead/gtail if none is supplied to g-optimized head/tail if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) - if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 + if (length(jsub)==3L && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } else if (verbose) catf("GForce is on, left j unchanged\n"); @@ -2987,7 +2988,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { # (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function # (3) define the gfun = function() R wrapper gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", - "median", "min", "max", "var", "sd", ".N", "shift") # added .N for #334 + "median", "min", "max", "var", "sd", ".N", "shift", "weighted.mean") # added .N for #334 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment @@ -2995,6 +2996,7 @@ gfirst = function(x) .Call(Cgfirst, x) glast = function(x) .Call(Cglast, x) gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm) gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm) +gweighted.mean = function(x, w, mode=1L, na.rm=FALSE) { if (mode==2L) gsum(x*w)/gsum(w) else .Call(CgweightedMean, x, w, na.rm) } gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm) gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm) gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm) diff --git a/src/gsumm.c b/src/gsumm.c index 4964de8b6e..5119202e53 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -717,6 +717,62 @@ SEXP gmean(SEXP x, SEXP narmArg) return(ans); } + + +SEXP gweightedMean(SEXP x, SEXP w, SEXP narmArg) +{ + if (inherits(x, "factor")) + error(_("%s is not meaningful for factors."), "gweighted.mean"); + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("%s must be TRUE or FALSE"), "na.rm"); + const bool narm = LOGICAL(narmArg)[0]; + const bool nosubset = irowslen==-1; + const int n = nosubset ? length(x) : irowslen; + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gweighted.mean"); + + SEXP ans; + int protecti=0; + + switch(TYPEOF(x)) { + case LGLSXP: case INTSXP: + x = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(FALSE))); protecti++; + w = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(FALSE))); protecti++; + case REALSXP: { + if (INHERITS(x, char_integer64)) { + x = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(TRUE))); protecti++; + } + if (INHERITS(w, char_integer64)) { + w = PROTECT(coerceAs(w, ScalarReal(1), ScalarLogical(TRUE))); protecti++; + } + const double *restrict xd = REAL(x); + const double *restrict wd = REAL(w); + ans = PROTECT(allocVector(REALSXP, ngrp)); protecti++; + double *ansd = REAL(ans); + + for (int i=0; i Date: Mon, 1 Nov 2021 15:55:29 +0100 Subject: [PATCH 2/9] redirect gweighted.mean to gsum since 2*gsum is faster --- R/data.table.R | 3 ++- src/gsumm.c | 56 -------------------------------------------------- src/init.c | 2 -- 3 files changed, 2 insertions(+), 59 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 931a5752a5..ad35d628f8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1749,6 +1749,7 @@ replace_dot_alias = function(e) { q_named = match.call(shift, q) if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE) } # add gshift support + # weighted.mean #3977 if (length(q)>=3 && q[[1L]] == "weighted.mean") return(TRUE) # ^^ base::startWith errors on NULL unfortunately # head-tail uses default value n=6 which as of now should not go gforce ... ^^ @@ -2996,7 +2997,7 @@ gfirst = function(x) .Call(Cgfirst, x) glast = function(x) .Call(Cglast, x) gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm) gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm) -gweighted.mean = function(x, w, mode=1L, na.rm=FALSE) { if (mode==2L) gsum(x*w)/gsum(w) else .Call(CgweightedMean, x, w, na.rm) } +gweighted.mean = function(x, w, na.rm=FALSE) { if (missing(w)) gmean(x, na.rm) else gsum(x*w, na.rm)/gsum(w, na.rm) } gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm) gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm) gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm) diff --git a/src/gsumm.c b/src/gsumm.c index 5119202e53..4964de8b6e 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -717,62 +717,6 @@ SEXP gmean(SEXP x, SEXP narmArg) return(ans); } - - -SEXP gweightedMean(SEXP x, SEXP w, SEXP narmArg) -{ - if (inherits(x, "factor")) - error(_("%s is not meaningful for factors."), "gweighted.mean"); - if (!IS_TRUE_OR_FALSE(narmArg)) - error(_("%s must be TRUE or FALSE"), "na.rm"); - const bool narm = LOGICAL(narmArg)[0]; - const bool nosubset = irowslen==-1; - const int n = nosubset ? length(x) : irowslen; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gweighted.mean"); - - SEXP ans; - int protecti=0; - - switch(TYPEOF(x)) { - case LGLSXP: case INTSXP: - x = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(FALSE))); protecti++; - w = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(FALSE))); protecti++; - case REALSXP: { - if (INHERITS(x, char_integer64)) { - x = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(TRUE))); protecti++; - } - if (INHERITS(w, char_integer64)) { - w = PROTECT(coerceAs(w, ScalarReal(1), ScalarLogical(TRUE))); protecti++; - } - const double *restrict xd = REAL(x); - const double *restrict wd = REAL(w); - ans = PROTECT(allocVector(REALSXP, ngrp)); protecti++; - double *ansd = REAL(ans); - - for (int i=0; i Date: Mon, 1 Nov 2021 17:01:10 +0100 Subject: [PATCH 3/9] add news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5faf40723f..3bbc1c4b43 100644 --- a/NEWS.md +++ b/NEWS.md @@ -207,6 +207,8 @@ # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` +31. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. From 1343fa54426675587feebcecf6c3bf9d982a07e9 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Mon, 1 Nov 2021 17:15:58 +0100 Subject: [PATCH 4/9] fix jsub argument eval --- R/data.table.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index ad35d628f8..f40736df2a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1750,7 +1750,7 @@ replace_dot_alias = function(e) { if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE) } # add gshift support # weighted.mean #3977 - if (length(q)>=3 && q[[1L]] == "weighted.mean") return(TRUE) + if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE) # ^^ base::startWith errors on NULL unfortunately # head-tail uses default value n=6 which as of now should not go gforce ... ^^ # otherwise there must be three arguments, and only in two cases: @@ -1775,7 +1775,7 @@ replace_dot_alias = function(e) { # adding argument to ghead/gtail if none is supplied to g-optimized head/tail if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) - if (length(jsub)==3L && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 + if (length(jsub)==3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars) && exists(jsub[[3L]], parent.frame())) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } else if (verbose) catf("GForce is on, left j unchanged\n"); From f202f74c45ef289e13f46e275908cac9577c5d47 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Mon, 1 Nov 2021 17:28:45 +0100 Subject: [PATCH 5/9] add basic tests --- inst/tests/tests.Rraw | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6382a13a85..0df442f17e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18348,3 +18348,8 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) +# weighted.mean GForce optimized #3977 +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), b=1) +test(2226.1, DT[, weighted.mean(x, w), b, verbose=TRUE], DT[, stats::weighted.mean(x, w), b], output="GForce TRUE") +test(2226.2, DT[, weighted.mean(w, x), b, verbose=TRUE], DT[, stats::weighted.mean(w, x), b], output="GForce TRUE") +test(2226.3, DT[, weighted.mean(x), b, verbose=TRUE], DT[, mean(x), b], output="GForce TRUE") From 4484a6f0cc2410f78a6fc13653f719382643b1bc Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Mon, 1 Nov 2021 17:56:50 +0100 Subject: [PATCH 6/9] update tests --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index f40736df2a..52a28a1a18 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1769,7 +1769,7 @@ replace_dot_alias = function(e) { for (ii in seq_along(jsub)[-1L]) { if (dotN(jsub[[ii]])) next; # For #334 jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]])) - if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 + if (length(jsub[[ii]])==3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars) && exists(jsub[[ii]][[3L]], parent.frame())) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { # adding argument to ghead/gtail if none is supplied to g-optimized head/tail diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0df442f17e..80088282da 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18350,6 +18350,6 @@ test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Spe # weighted.mean GForce optimized #3977 DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), b=1) -test(2226.1, DT[, weighted.mean(x, w), b, verbose=TRUE], DT[, stats::weighted.mean(x, w), b], output="GForce TRUE") -test(2226.2, DT[, weighted.mean(w, x), b, verbose=TRUE], DT[, stats::weighted.mean(w, x), b], output="GForce TRUE") -test(2226.3, DT[, weighted.mean(x), b, verbose=TRUE], DT[, mean(x), b], output="GForce TRUE") +test(2226.1, DT[, weighted.mean(x, w), b, verbose=TRUE], DT[, stats::weighted.mean(x, w), b], output="GForce optimized j to") +test(2226.2, DT[, weighted.mean(w, x), b, verbose=TRUE], DT[, stats::weighted.mean(w, x), b], output="GForce optimized j to") +test(2226.3, DT[, weighted.mean(x), b, verbose=TRUE], DT[, mean(x), b], output="GForce optimized j to") From 3304f183da6b47c7ba40375df23ceee57cdf4404 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 5 Nov 2021 19:47:51 +0100 Subject: [PATCH 7/9] fix NA/NaN and more tests --- R/data.table.R | 12 ++++++- inst/tests/tests.Rraw | 75 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 52a28a1a18..9a9b9277d1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2997,7 +2997,17 @@ gfirst = function(x) .Call(Cgfirst, x) glast = function(x) .Call(Cglast, x) gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm) gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm) -gweighted.mean = function(x, w, na.rm=FALSE) { if (missing(w)) gmean(x, na.rm) else gsum(x*w, na.rm)/gsum(w, na.rm) } +gweighted.mean = function(x, w, na.rm=FALSE) { + if (missing(w)) gmean(x, na.rm) + else { + if (na.rm) { # take those indices out of the equation by setting them to 0 + ix <- is.na(x) + x[ix] <- 0 + w[ix] <- 0 + } + gsum((w!=0)*x*w, na.rm=FALSE)/gsum(w, na.rm=FALSE) + } +} gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm) gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm) gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 80088282da..1cb79c9e5c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18349,7 +18349,74 @@ test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Spe groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) # weighted.mean GForce optimized #3977 -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), b=1) -test(2226.1, DT[, weighted.mean(x, w), b, verbose=TRUE], DT[, stats::weighted.mean(x, w), b], output="GForce optimized j to") -test(2226.2, DT[, weighted.mean(w, x), b, verbose=TRUE], DT[, stats::weighted.mean(w, x), b], output="GForce optimized j to") -test(2226.3, DT[, weighted.mean(x), b, verbose=TRUE], DT[, mean(x), b], output="GForce optimized j to") +options(datatable.optimize=1L) +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2226.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE") +test(2226.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE") +test(2226.03, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce FALSE") +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2226.04, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") +test(2226.05, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") +test(2226.06, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce FALSE") +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2226.07, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") +test(2226.08, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce FALSE") +test(2226.09, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2226.10, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2226.11, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce FALSE") +test(2226.12, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") +test(2226.13, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce FALSE") +test(2226.14, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2226.15, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") +test(2226.16, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") +test(2226.17, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2226.18, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2226.19, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2226.20, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2226.21, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2226.22, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# same as previous test cases but now GForce optimized +options(datatable.optimize=2L) +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2226.31, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce optimized j to") +test(2226.32, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce optimized j to") +test(2226.33, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2226.34, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") +test(2226.35, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") +test(2226.36, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce optimized j to") +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2226.37, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") +test(2226.38, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce optimized j to") +test(2226.39, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2226.40, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2226.41, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce optimized j to") +test(2226.42, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") +test(2226.43, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce optimized j to") +test(2226.44, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2226.45, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") +test(2226.46, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") +test(2226.47, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2226.48, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2226.49, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2226.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2226.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2226.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") From d1e9213e34c747ea9f2c6f50191ce8bef839cea1 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 5 Nov 2021 19:51:46 +0100 Subject: [PATCH 8/9] restore options after tests --- inst/tests/tests.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1cb79c9e5c..04240c0dff 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18349,7 +18349,7 @@ test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Spe groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) # weighted.mean GForce optimized #3977 -options(datatable.optimize=1L) +old = options(datatable.optimize=1L) DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) test(2226.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE") test(2226.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE") @@ -18420,3 +18420,4 @@ test(2226.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) test(2226.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") test(2226.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +options(old) From fae5952ee2ed413da7f5575b26da85678027c200 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 3 Dec 2021 13:20:27 -0700 Subject: [PATCH 9/9] move/delete dangling ^^ comments from #5089 --- R/data.table.R | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index bf14392760..13ce66e233 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1737,16 +1737,13 @@ replace_dot_alias = function(e) { if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE) + # ^^ base::startWith errors on NULL unfortunately if (length(q)>=2L && q[[1L]] == "shift") { q_named = match.call(shift, q) if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE) - } # add gshift support - # weighted.mean #3977 - if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE) - # ^^ base::startWith errors on NULL unfortunately - # head-tail uses default value n=6 which as of now should not go gforce ... ^^ - # otherwise there must be three arguments, and only in two cases: - # 1) head/tail(x, 1) or 2) x[n], n>0 + } + if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE) #3977 + # otherwise there must be three arguments length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) }