From 5fabd648886e091a1da95890753692afd20cdd18 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Sat, 30 Oct 2021 01:55:07 +0200
Subject: [PATCH 1/9] init commit

---
 R/data.table.R |  6 ++++--
 src/gsumm.c    | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/init.c     |  2 ++
 3 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index e020ea3e3d..931a5752a5 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1749,6 +1749,7 @@ replace_dot_alias = function(e) {
             q_named = match.call(shift, q)
             if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE)
           } # add gshift support
+          if (length(q)>=3 && q[[1L]] == "weighted.mean") return(TRUE)
           #                       ^^ base::startWith errors on NULL unfortunately
           #        head-tail uses default value n=6 which as of now should not go gforce ... ^^
           # otherwise there must be three arguments, and only in two cases:
@@ -1773,7 +1774,7 @@ replace_dot_alias = function(e) {
             # adding argument to ghead/gtail if none is supplied to g-optimized head/tail
             if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L
             jsub[[1L]] = as.name(paste0("g", jsub[[1L]]))
-            if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame())   # tests 1187.3 & 1187.5
+            if (length(jsub)==3L && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame())   # tests 1187.3 & 1187.5
           }
           if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L))
         } else if (verbose) catf("GForce is on, left j unchanged\n");
@@ -2987,7 +2988,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) {
 #     (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function
 #     (3) define the gfun = function() R wrapper
 gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod",
-          "median", "min", "max", "var", "sd", ".N", "shift") # added .N for #334
+          "median", "min", "max", "var", "sd", ".N", "shift", "weighted.mean") # added .N for #334
 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here.
 ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment
 gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment
@@ -2995,6 +2996,7 @@ gfirst = function(x) .Call(Cgfirst, x)
 glast = function(x) .Call(Cglast, x)
 gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
 gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
+gweighted.mean = function(x, w, mode=1L, na.rm=FALSE) { if (mode==2L) gsum(x*w)/gsum(w) else .Call(CgweightedMean, x, w, na.rm) }
 gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm)
 gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm)
 gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm)
diff --git a/src/gsumm.c b/src/gsumm.c
index 4964de8b6e..5119202e53 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -717,6 +717,62 @@ SEXP gmean(SEXP x, SEXP narmArg)
   return(ans);
 }
 
+
+
+SEXP gweightedMean(SEXP x, SEXP w, SEXP narmArg)
+{
+  if (inherits(x, "factor"))
+    error(_("%s is not meaningful for factors."), "gweighted.mean");
+  if (!IS_TRUE_OR_FALSE(narmArg))
+    error(_("%s must be TRUE or FALSE"), "na.rm");
+  const bool narm = LOGICAL(narmArg)[0];
+  const bool nosubset = irowslen==-1;
+  const int n = nosubset ? length(x) : irowslen;
+  if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gweighted.mean");
+
+  SEXP ans;
+  int protecti=0;
+
+  switch(TYPEOF(x)) {
+  case LGLSXP: case INTSXP:
+    x = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(FALSE))); protecti++;
+    w = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(FALSE))); protecti++;
+  case REALSXP: {
+    if (INHERITS(x, char_integer64)) {
+      x = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(TRUE))); protecti++;
+    }
+    if (INHERITS(w, char_integer64)) {
+      w = PROTECT(coerceAs(w, ScalarReal(1), ScalarLogical(TRUE))); protecti++;
+    }
+    const double *restrict xd = REAL(x);
+    const double *restrict wd = REAL(w);
+    ans = PROTECT(allocVector(REALSXP, ngrp)); protecti++;
+    double *ansd = REAL(ans);
+
+    for (int i=0; i<ngrp; ++i) {
+      const int thisgrpsize = grpsize[i];
+      long double sum=0., wsum=0.;
+      for (int j=0; j<thisgrpsize; ++j) {
+        int ix = ff[i]+j-1;
+        if (isunsorted) ix = oo[ix]-1;
+        if (nosubset ? ISNAN(xd[ix]) || ISNAN(wd[ix]) : (irows[ix]==NA_INTEGER || (ix=irows[ix]-1,ISNAN(xd[ix])||ISNAN(wd[ix])))) {
+          if (narm) continue; else break;
+        }
+        sum += xd[ix] * wd[ix];
+        wsum += wd[ix];
+      }
+      if (wsum == 0.) { ansd[i]=NA_REAL; continue; }
+      ansd[i] = (double)(sum/wsum);
+    }
+  } break;
+  default:
+    error(_("Type '%s' not supported by GForce weighted.mean (gweighted.mean). Either add the prefix stats::weighted.mean(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)));
+  }
+  copyMostAttrib(x, ans);
+  UNPROTECT(protecti);
+  return(ans);
+}
+
 static SEXP gminmax(SEXP x, SEXP narm, const bool min)
 {
   if (!IS_TRUE_OR_FALSE(narm))
diff --git a/src/init.c b/src/init.c
index 814ada375d..c137ba9f5f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -77,6 +77,7 @@ SEXP issorted();
 SEXP gforce();
 SEXP gsum();
 SEXP gmean();
+SEXP gweightedMean();
 SEXP gmin();
 SEXP gmax();
 SEXP isOrderedSubset();
@@ -167,6 +168,7 @@ R_CallMethodDef callMethods[] = {
 {"Cgforce", (DL_FUNC) &gforce, -1},
 {"Cgsum", (DL_FUNC) &gsum, -1},
 {"Cgmean", (DL_FUNC) &gmean, -1},
+{"CgweightedMean", (DL_FUNC) &gweightedMean, -1},
 {"Cgmin", (DL_FUNC) &gmin, -1},
 {"Cgmax", (DL_FUNC) &gmax, -1},
 {"CisOrderedSubset", (DL_FUNC) &isOrderedSubset, -1},

From 4a642aa1e1db3833c7260d30482b255cace70480 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Mon, 1 Nov 2021 15:55:29 +0100
Subject: [PATCH 2/9] redirect gweighted.mean to gsum since 2*gsum is faster

---
 R/data.table.R |  3 ++-
 src/gsumm.c    | 56 --------------------------------------------------
 src/init.c     |  2 --
 3 files changed, 2 insertions(+), 59 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index 931a5752a5..ad35d628f8 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1749,6 +1749,7 @@ replace_dot_alias = function(e) {
             q_named = match.call(shift, q)
             if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE)
           } # add gshift support
+          # weighted.mean #3977
           if (length(q)>=3 && q[[1L]] == "weighted.mean") return(TRUE)
           #                       ^^ base::startWith errors on NULL unfortunately
           #        head-tail uses default value n=6 which as of now should not go gforce ... ^^
@@ -2996,7 +2997,7 @@ gfirst = function(x) .Call(Cgfirst, x)
 glast = function(x) .Call(Cglast, x)
 gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
 gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
-gweighted.mean = function(x, w, mode=1L, na.rm=FALSE) { if (mode==2L) gsum(x*w)/gsum(w) else .Call(CgweightedMean, x, w, na.rm) }
+gweighted.mean = function(x, w, na.rm=FALSE) { if (missing(w)) gmean(x, na.rm) else gsum(x*w, na.rm)/gsum(w, na.rm) }
 gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm)
 gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm)
 gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm)
diff --git a/src/gsumm.c b/src/gsumm.c
index 5119202e53..4964de8b6e 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -717,62 +717,6 @@ SEXP gmean(SEXP x, SEXP narmArg)
   return(ans);
 }
 
-
-
-SEXP gweightedMean(SEXP x, SEXP w, SEXP narmArg)
-{
-  if (inherits(x, "factor"))
-    error(_("%s is not meaningful for factors."), "gweighted.mean");
-  if (!IS_TRUE_OR_FALSE(narmArg))
-    error(_("%s must be TRUE or FALSE"), "na.rm");
-  const bool narm = LOGICAL(narmArg)[0];
-  const bool nosubset = irowslen==-1;
-  const int n = nosubset ? length(x) : irowslen;
-  if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gweighted.mean");
-
-  SEXP ans;
-  int protecti=0;
-
-  switch(TYPEOF(x)) {
-  case LGLSXP: case INTSXP:
-    x = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(FALSE))); protecti++;
-    w = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(FALSE))); protecti++;
-  case REALSXP: {
-    if (INHERITS(x, char_integer64)) {
-      x = PROTECT(coerceAs(x, ScalarReal(1), ScalarLogical(TRUE))); protecti++;
-    }
-    if (INHERITS(w, char_integer64)) {
-      w = PROTECT(coerceAs(w, ScalarReal(1), ScalarLogical(TRUE))); protecti++;
-    }
-    const double *restrict xd = REAL(x);
-    const double *restrict wd = REAL(w);
-    ans = PROTECT(allocVector(REALSXP, ngrp)); protecti++;
-    double *ansd = REAL(ans);
-
-    for (int i=0; i<ngrp; ++i) {
-      const int thisgrpsize = grpsize[i];
-      long double sum=0., wsum=0.;
-      for (int j=0; j<thisgrpsize; ++j) {
-        int ix = ff[i]+j-1;
-        if (isunsorted) ix = oo[ix]-1;
-        if (nosubset ? ISNAN(xd[ix]) || ISNAN(wd[ix]) : (irows[ix]==NA_INTEGER || (ix=irows[ix]-1,ISNAN(xd[ix])||ISNAN(wd[ix])))) {
-          if (narm) continue; else break;
-        }
-        sum += xd[ix] * wd[ix];
-        wsum += wd[ix];
-      }
-      if (wsum == 0.) { ansd[i]=NA_REAL; continue; }
-      ansd[i] = (double)(sum/wsum);
-    }
-  } break;
-  default:
-    error(_("Type '%s' not supported by GForce weighted.mean (gweighted.mean). Either add the prefix stats::weighted.mean(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)));
-  }
-  copyMostAttrib(x, ans);
-  UNPROTECT(protecti);
-  return(ans);
-}
-
 static SEXP gminmax(SEXP x, SEXP narm, const bool min)
 {
   if (!IS_TRUE_OR_FALSE(narm))
diff --git a/src/init.c b/src/init.c
index c137ba9f5f..814ada375d 100644
--- a/src/init.c
+++ b/src/init.c
@@ -77,7 +77,6 @@ SEXP issorted();
 SEXP gforce();
 SEXP gsum();
 SEXP gmean();
-SEXP gweightedMean();
 SEXP gmin();
 SEXP gmax();
 SEXP isOrderedSubset();
@@ -168,7 +167,6 @@ R_CallMethodDef callMethods[] = {
 {"Cgforce", (DL_FUNC) &gforce, -1},
 {"Cgsum", (DL_FUNC) &gsum, -1},
 {"Cgmean", (DL_FUNC) &gmean, -1},
-{"CgweightedMean", (DL_FUNC) &gweightedMean, -1},
 {"Cgmin", (DL_FUNC) &gmin, -1},
 {"Cgmax", (DL_FUNC) &gmax, -1},
 {"CisOrderedSubset", (DL_FUNC) &isOrderedSubset, -1},

From 2178dfae77f3b669545c2ce024eb0cbb10a01f3e Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Mon, 1 Nov 2021 17:01:10 +0100
Subject: [PATCH 3/9] add news

---
 NEWS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 5faf40723f..3bbc1c4b43 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -207,6 +207,8 @@
     #  v1.14.4  0.4826  0.5586  0.6586  0.6329  0.7348  1.318   100
     ```
 
+31. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR.
+
 ## BUG FIXES
 
 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.

From 1343fa54426675587feebcecf6c3bf9d982a07e9 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Mon, 1 Nov 2021 17:15:58 +0100
Subject: [PATCH 4/9] fix jsub argument eval

---
 R/data.table.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index ad35d628f8..f40736df2a 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1750,7 +1750,7 @@ replace_dot_alias = function(e) {
             if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE)
           } # add gshift support
           # weighted.mean #3977
-          if (length(q)>=3 && q[[1L]] == "weighted.mean") return(TRUE)
+          if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE)
           #                       ^^ base::startWith errors on NULL unfortunately
           #        head-tail uses default value n=6 which as of now should not go gforce ... ^^
           # otherwise there must be three arguments, and only in two cases:
@@ -1775,7 +1775,7 @@ replace_dot_alias = function(e) {
             # adding argument to ghead/gtail if none is supplied to g-optimized head/tail
             if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L
             jsub[[1L]] = as.name(paste0("g", jsub[[1L]]))
-            if (length(jsub)==3L && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame())   # tests 1187.3 & 1187.5
+            if (length(jsub)==3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars) && exists(jsub[[3L]], parent.frame())) jsub[[3L]] = eval(jsub[[3L]], parent.frame())   # tests 1187.3 & 1187.5
           }
           if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L))
         } else if (verbose) catf("GForce is on, left j unchanged\n");

From f202f74c45ef289e13f46e275908cac9577c5d47 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Mon, 1 Nov 2021 17:28:45 +0100
Subject: [PATCH 5/9] add basic tests

---
 inst/tests/tests.Rraw | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 6382a13a85..0df442f17e 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18348,3 +18348,8 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec
 test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')),
              groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species')))
 
+# weighted.mean GForce optimized #3977
+DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), b=1)
+test(2226.1, DT[, weighted.mean(x, w), b, verbose=TRUE], DT[, stats::weighted.mean(x, w), b], output="GForce TRUE")
+test(2226.2, DT[, weighted.mean(w, x), b, verbose=TRUE], DT[, stats::weighted.mean(w, x), b], output="GForce TRUE")
+test(2226.3, DT[, weighted.mean(x), b, verbose=TRUE], DT[, mean(x), b], output="GForce TRUE")

From 4484a6f0cc2410f78a6fc13653f719382643b1bc Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Mon, 1 Nov 2021 17:56:50 +0100
Subject: [PATCH 6/9] update tests

---
 R/data.table.R        | 2 +-
 inst/tests/tests.Rraw | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index f40736df2a..52a28a1a18 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1769,7 +1769,7 @@ replace_dot_alias = function(e) {
             for (ii in seq_along(jsub)[-1L]) {
               if (dotN(jsub[[ii]])) next; # For #334
               jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]]))
-              if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame())  # tests 1187.2 & 1187.4
+              if (length(jsub[[ii]])==3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars) && exists(jsub[[ii]][[3L]], parent.frame())) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame())  # tests 1187.2 & 1187.4
             }
           else {
             # adding argument to ghead/gtail if none is supplied to g-optimized head/tail
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 0df442f17e..80088282da 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18350,6 +18350,6 @@ test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Spe
 
 # weighted.mean GForce optimized #3977
 DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), b=1)
-test(2226.1, DT[, weighted.mean(x, w), b, verbose=TRUE], DT[, stats::weighted.mean(x, w), b], output="GForce TRUE")
-test(2226.2, DT[, weighted.mean(w, x), b, verbose=TRUE], DT[, stats::weighted.mean(w, x), b], output="GForce TRUE")
-test(2226.3, DT[, weighted.mean(x), b, verbose=TRUE], DT[, mean(x), b], output="GForce TRUE")
+test(2226.1, DT[, weighted.mean(x, w), b, verbose=TRUE], DT[, stats::weighted.mean(x, w), b], output="GForce optimized j to")
+test(2226.2, DT[, weighted.mean(w, x), b, verbose=TRUE], DT[, stats::weighted.mean(w, x), b], output="GForce optimized j to")
+test(2226.3, DT[, weighted.mean(x), b, verbose=TRUE], DT[, mean(x), b], output="GForce optimized j to")

From 3304f183da6b47c7ba40375df23ceee57cdf4404 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Fri, 5 Nov 2021 19:47:51 +0100
Subject: [PATCH 7/9] fix NA/NaN and more tests

---
 R/data.table.R        | 12 ++++++-
 inst/tests/tests.Rraw | 75 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index 52a28a1a18..9a9b9277d1 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -2997,7 +2997,17 @@ gfirst = function(x) .Call(Cgfirst, x)
 glast = function(x) .Call(Cglast, x)
 gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
 gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
-gweighted.mean = function(x, w, na.rm=FALSE) { if (missing(w)) gmean(x, na.rm) else gsum(x*w, na.rm)/gsum(w, na.rm) }
+gweighted.mean = function(x, w, na.rm=FALSE) {
+  if (missing(w)) gmean(x, na.rm)
+  else {
+    if (na.rm) { # take those indices out of the equation by setting them to 0
+      ix <- is.na(x)
+      x[ix] <- 0
+      w[ix] <- 0
+    }
+    gsum((w!=0)*x*w, na.rm=FALSE)/gsum(w, na.rm=FALSE)
+  }
+}
 gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm)
 gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm)
 gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 80088282da..1cb79c9e5c 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18349,7 +18349,74 @@ test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Spe
              groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species')))
 
 # weighted.mean GForce optimized #3977
-DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), b=1)
-test(2226.1, DT[, weighted.mean(x, w), b, verbose=TRUE], DT[, stats::weighted.mean(x, w), b], output="GForce optimized j to")
-test(2226.2, DT[, weighted.mean(w, x), b, verbose=TRUE], DT[, stats::weighted.mean(w, x), b], output="GForce optimized j to")
-test(2226.3, DT[, weighted.mean(x), b, verbose=TRUE], DT[, mean(x), b], output="GForce optimized j to")
+options(datatable.optimize=1L)
+DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L)
+test(2226.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE")
+test(2226.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE")
+test(2226.03, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce FALSE")
+# multiple groups
+DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L))
+test(2226.04, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE")
+test(2226.05, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE")
+test(2226.06, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce FALSE")
+# (only x XOR w) containing NA
+DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L))
+test(2226.07, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE")
+test(2226.08, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce FALSE")
+test(2226.09, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE")
+test(2226.10, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE")
+# (only x XOR w) containing NaN
+DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L))
+test(2226.11, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce FALSE")
+test(2226.12, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE")
+test(2226.13, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce FALSE")
+test(2226.14, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE")
+# (only x XOR w) containing NA and NaN
+DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L))
+test(2226.15, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE")
+test(2226.16, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE")
+test(2226.17, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE")
+test(2226.18, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE")
+# (x and w) containing NA and NaN
+DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L))
+test(2226.19, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE")
+test(2226.20, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE")
+DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L))
+test(2226.21, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE")
+test(2226.22, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE")
+# same as previous test cases but now GForce optimized
+options(datatable.optimize=2L)
+DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L)
+test(2226.31, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce optimized j to")
+test(2226.32, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce optimized j to")
+test(2226.33, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to")
+# multiple groups
+DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L))
+test(2226.34, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to")
+test(2226.35, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to")
+test(2226.36, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce optimized j to")
+# (only x XOR w) containing NA
+DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L))
+test(2226.37, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to")
+test(2226.38, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce optimized j to")
+test(2226.39, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to")
+test(2226.40, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to")
+# (only x XOR w) containing NaN
+DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L))
+test(2226.41, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce optimized j to")
+test(2226.42, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to")
+test(2226.43, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce optimized j to")
+test(2226.44, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to")
+# (only x XOR w) containing NA and NaN
+DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L))
+test(2226.45, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to")
+test(2226.46, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to")
+test(2226.47, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to")
+test(2226.48, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to")
+# (x and w) containing NA and NaN
+DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L))
+test(2226.49, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to")
+test(2226.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to")
+DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L))
+test(2226.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to")
+test(2226.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to")

From d1e9213e34c747ea9f2c6f50191ce8bef839cea1 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Fri, 5 Nov 2021 19:51:46 +0100
Subject: [PATCH 8/9] restore options after tests

---
 inst/tests/tests.Rraw | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 1cb79c9e5c..04240c0dff 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18349,7 +18349,7 @@ test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Spe
              groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species')))
 
 # weighted.mean GForce optimized #3977
-options(datatable.optimize=1L)
+old = options(datatable.optimize=1L)
 DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L)
 test(2226.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE")
 test(2226.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE")
@@ -18420,3 +18420,4 @@ test(2226.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table
 DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L))
 test(2226.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to")
 test(2226.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to")
+options(old)

From fae5952ee2ed413da7f5575b26da85678027c200 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Fri, 3 Dec 2021 13:20:27 -0700
Subject: [PATCH 9/9] move/delete dangling ^^ comments from #5089

---
 R/data.table.R | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index bf14392760..13ce66e233 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1737,16 +1737,13 @@ replace_dot_alias = function(e) {
           if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE)
           if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE)  # 875
           if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE)
+          #                       ^^ base::startWith errors on NULL unfortunately
           if (length(q)>=2L && q[[1L]] == "shift") {
             q_named = match.call(shift, q)
             if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE)
-          } # add gshift support
-          # weighted.mean #3977
-          if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE)
-          #                       ^^ base::startWith errors on NULL unfortunately
-          #        head-tail uses default value n=6 which as of now should not go gforce ... ^^
-          # otherwise there must be three arguments, and only in two cases:
-          #   1) head/tail(x, 1) or 2) x[n], n>0
+          }
+          if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE)  #3977
+          # otherwise there must be three arguments
           length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) &&
             ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) )
         }