Rdatatable · mattdowle · Dec 15, 2018 · Feb 7, 2018 · Mar 16, 2018 · Apr 17, 2018
@@ -46,6 +46,7 @@ export(rollup)
 S3method(groupingsets, data.table)
 S3method(cube, data.table)
 S3method(rollup, data.table)
+export(frollmean)
 
 S3method("[", data.table)
 S3method("[<-", data.table)

@@ -34,6 +34,8 @@
 
 8. `DT[..., .SDcols=]` now accepts `patterns()`; e.g. `DT[..., .SDcols=patterns("^V")]`, for filtering columns according to a pattern (as in `melt.data.table`), [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples.
 
+9. New `frollmean` has been added to calculate _rolling mean_. Function name and arguments are experimental. Related to [#2778](https://github.com/Rdatatable/data.table/issues/2778) (and [#624](https://github.com/Rdatatable/data.table/issues/624), [#626](https://github.com/Rdatatable/data.table/issues/626), [#1855](https://github.com/Rdatatable/data.table/issues/1855)). Other rolling statistics will follow.
+
 
 #### BUG FIXES
 

@@ -0,0 +1,11 @@
+# Shared R-level entry point for the froll* family: checks `fun`, resolves
+# `algo` and `align` to a single choice each, then hands every argument to C.
+froll <- function(fun, x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE, verbose=getOption("datatable.verbose")) {
+  # `fun` must be supplied as exactly one non-NA string
+  stopifnot(
+    !missing(fun),
+    is.character(fun),
+    length(fun)==1L,
+    !is.na(fun)
+  )
+  # match.arg picks the first choice when the default vector is passed through
+  algo = match.arg(algo)
+  align = match.arg(align)
+  # the value of the C call is returned as-is
+  .Call(CfrollfunR, fun, x, n, fill, algo, align, na.rm, hasNA, adaptive, verbose)
+}
+
+# Rolling mean: forwards every argument unchanged to froll() with fun="mean".
+frollmean <- function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE, verbose=getOption("datatable.verbose")) {
+  froll(
+    fun="mean",
+    x=x, n=n, fill=fill,
+    algo=algo, align=align,
+    na.rm=na.rm, hasNA=hasNA,
+    adaptive=adaptive, verbose=verbose
+  )
+}
@@ -0,0 +1,147 @@
+\name{roll}
+\alias{roll}
+\alias{froll}
+\alias{rolling}
+\alias{sliding}
+\alias{moving}
+\alias{frollmean}
+\alias{frollsum}
+\title{Rolling functions}
+\description{
+  Fast rolling functions to calculate aggregates on a sliding window.
+}
+
+\usage{
+frollmean(x, n, fill=NA, algo=c("fast", "exact"), align=c("right",
+  "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE,
+  verbose=getOption("datatable.verbose"))
+}
+\arguments{
+  \item{x}{ vector, list, data.frame or data.table of numeric fields. }
+  \item{n}{ integer vector, for adaptive rolling function also list of
+    integer vectors, rolling window size. }
+  \item{fill}{ numeric, value to pad by, default \code{NA}. }
+  \item{algo}{ character, default \code{"fast"}. When set to \code{"exact"}
+    then a slower algorithm is used. It suffers less from floating point
+    rounding error, performs an extra pass to correct rounding error
+    and carefully handles all non-finite values. If available
+    it will use multiple cores. See details for more information. }
+  \item{align}{ character, define if window frame covers preceding rows
+    \code{"right"}, following rows \code{"left"} or centered
+    \code{"center"}, default \code{"right"}. }
+  \item{na.rm}{ logical, should missing values be removed when
+    calculating window, default \code{FALSE}. For details on handling
+    other non finite values see details below. }
+  \item{hasNA}{ logical, if it is known that \code{x} contains \code{NA}
+    then setting to \code{TRUE} will speed up, default \code{NA}. }
+  \item{adaptive}{ logical, should adaptive rolling function be
+    calculated, default \code{FALSE}. See details below. }
+  \item{verbose}{ logical, default \code{getOption("datatable.verbose")},
+    \code{TRUE} turns on status and information messages to the console,
+    it also disables parallel processing. }
+}
+\details{
+  \code{froll*} functions accept vectors, lists, data.frames or
+  data.tables. They always return a list except when the input is a
+  \code{vector} and \code{length(n)==1} in which case a \code{vector}
+  is returned, for convenience. This is so that it can be used
+  conveniently within data.table's syntax.
+
+  Argument \code{n} allows multiple values to calculate multiple rolling
+  windows or if \code{adaptive=TRUE} then it expects a list, each list
+  element must be integer vector of window size corresponding to every
+  \code{column[row]} from \code{x}.
+
+  When \code{algo="fast"} is used then any \code{NaN, +Inf, -Inf} is
+  treated as \code{NA}. For precise handling of non-finite values use
+  \code{algo="exact"}.
+  Argument \code{algo="exact"} will make rolling functions perform extra
+  computation for floating point rounding error correction. This is useful
+  mostly when input data has distant outliers. It also handles
+  \code{NaN, +Inf, -Inf} consistently with base R.
+
+  Adaptive rolling functions are special cases where each single
+  observation has its own corresponding rolling window width. Due to the logic
+  of that function the following restrictions apply:
+  \itemize{
+    \item{ \code{align} only \code{"right"}. }
+    \item{ if list of integer vectors is passed to \code{x} then all
+      list vectors must have equal length. }
+  }
+
+  When multiple columns or multiple window widths are provided then they
+  are run in parallel. Eventually nested parallelism occurs when
+  \code{algo="exact"}, see examples.
+}
+\value{
+  A list except when the input is a \code{vector} and
+  \code{length(n)==1} in which case a \code{vector} is returned.
+}
+\note{
+  Users coming from \code{zoo}, the most popular package for rolling
+  functions, should be aware of the following differences in the
+  \code{data.table} implementation.
+  \itemize{
+    \item{ rolling function will always return same length of results
+      as provided input. }
+    \item{ \code{fill} by default \code{NA}. }
+    \item{ \code{fill} accept only constant values, no support for
+      \emph{na.locf} or other functions. }
+    \item{ \code{align} is by default \code{"right"}. }
+    \item{ \code{na.rm} is respected, no need to use other function
+      when having \code{NA} values. }
+    \item{ integers are always coerced to double. }
+    \item{ when \code{adaptive=FALSE} (default) then \code{n} must be a
+      numeric vector, list is not accepted. }
+    \item{ when \code{adaptive=TRUE} then \code{n} must be vector of
+      length equal to \code{nrow(x)}, or list of such vectors. }
+    \item{ there is no \code{partial} window support. }
+  }
+}
+\examples{
+d = as.data.table(list(1:6/2, 3:8/4))
+# rolling mean of a single vector with a single window
+frollmean(d[, V1], 3)
+# multiple columns at once
+frollmean(d, 3)
+# multiple windows at once
+frollmean(d[, .(V1)], c(3, 4))
+# multiple columns and multiple windows at once
+frollmean(d, c(3, 4))
+## the three calls above are embarrassingly parallel using openmp
+
+# performance vs exactness
+set.seed(108)
+x = sample(c(rnorm(1e3, 1e6, 5e5), 5e9, 5e-9)) # normal draws plus one very large and one very small value, shuffled
+n = 15 # window width used by every implementation compared below
+# Reference rolling mean (right-aligned window of width n) written as a plain
+# R loop; it is the baseline the faster implementations are compared against.
+# Returns a double vector of length(x); positions before the first complete
+# window stay NA.
+ma = function(x, n, na.rm=FALSE) {
+  ans = rep(NA_real_, nx<-length(x))
+  # window wider than the input: there is no complete window, and n:nx would
+  # count downwards, growing `ans` and then failing on mixed-sign subscripts
+  if (n > nx) return(ans)
+  for (i in n:nx) ans[i] = mean(x[(i-n+1):i], na.rm=na.rm)
+  ans
+}
+# Cumulative-sum rolling mean: each window sum is the difference between two
+# running totals that are n positions apart. Supplying na.rm at all is an error.
+fastma = function(x, n, na.rm) {
+  if (!missing(na.rm)) stop("NAs are unsupported, wrongly propagated by cumsum")
+  running = cumsum(x)
+  lagged = shift(running, n)
+  lagged[n] = 0  # the first complete window has no earlier total to subtract
+  window_sum = running-lagged
+  as.double(window_sum/n)
+}
+# time each implementation on the same input; ans1 (plain R loop) is the reference
+system.time(ans1<-ma(x, n))
+system.time(ans2<-fastma(x, n))
+system.time(ans3<-frollmean(x, n, algo="exact")) # parallel using openmp again
+system.time(ans4<-frollmean(x, n)) # default algo="fast"
+# difference of each implementation from the reference, labelled after the
+# `algo` value that produced it
+anserr = list(
+  froll_fast = ans4-ans1,
+  froll_exact = ans3-ans1,
+  fastma = ans2-ans1
+)
+# total absolute difference per implementation, NA positions ignored
+errs = sapply(lapply(anserr, abs), sum, na.rm=TRUE)
+sapply(errs, format, scientific=FALSE) # roundoff
+}
+\seealso{
+  \code{\link{shift}}, \code{\link{data.table}}
+}
+\references{
+  \href{https://en.wikipedia.org/wiki/Round-off_error}{Round-off error}
+}
+\keyword{ data }
@@ -6,6 +6,7 @@
 #include <stdint.h> // for uint64_t rather than unsigned long long
 #include <stdbool.h>
 #include "myomp.h"
+#include "types.h"
 
 // data.table depends on R>=3.0.0 when R_xlen_t was introduced
 // Before R 3.0.0, RLEN used to be switched to R_len_t as R_xlen_t wasn't available.
@@ -152,3 +153,15 @@ double wallclock();
 int getDTthreads();
 void avoid_openmp_hang_within_fork();
 
+// froll.c: rolling mean using one window width k for the whole input; only frollmean takes algo and align (presumably it dispatches to the Fast/Exact variants - confirm in froll.c)
+void frollmean(unsigned int algo, double *x, uint_fast64_t nx, double_ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose);
+void frollmeanFast(double *x, uint_fast64_t nx, double_ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose);
+void frollmeanExact(double *x, uint_fast64_t nx, double_ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose);
+
+// frolladaptive.c: adaptive rolling mean; k is a pointer to window widths (one per observation according to the Rd documentation) and there is no align argument
+void fadaptiverollmean(unsigned int algo, double *x, uint_fast64_t nx, double_ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose);
+void fadaptiverollmeanFast(double *x, uint_fast64_t nx, double_ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose);
+void fadaptiverollmeanExact(double *x, uint_fast64_t nx, double_ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose);
+
+// frollR.c: entry point taking R objects; R's froll() passes ten arguments in this same order via .Call(CfrollfunR, ...) (presumably the registered name of this function - confirm in init.c)
+SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasNA, SEXP adaptive, SEXP verbose);
Original file line number	Diff line number	Diff line change
Expand Up		@@ -34,6 +34,8 @@

		8. `DT[..., .SDcols=]` now accepts `patterns()`; e.g. `DT[..., .SDcols=patterns("^V")]`, for filtering columns according to a pattern (as in `melt.data.table`), [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples.

		9. New `frollmean` has been added to calculate _rolling mean_. Function name and arguments are experimental. Related to [#2778](https://github.com/Rdatatable/data.table/issues/2778) (and [#624](https://github.com/Rdatatable/data.table/issues/624), [#626](https://github.com/Rdatatable/data.table/issues/626), [#1855](https://github.com/Rdatatable/data.table/issues/1855)). Other rolling statistics will follow.


		#### BUG FIXES

Expand Down