diff --git a/NEWS.md b/NEWS.md index 00a9b5be1b..aad41a0ccb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -276,6 +276,22 @@ 38. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. +39. `.I` is now available in `by` for rowwise operations, [#1732](https://github.com/Rdatatable/data.table/issues/1732). Thanks to Rafael H. M. Pereira for requesting, and Benjamin Schwendinger for the PR. + + ```R + DT + # V1 V2 + # <int> <int> + # 1: 3 5 + # 2: 4 6 + + DT[, sum(.SD), by=.I] + # I V1 + # <int> <int> + # 1: 1 8 + # 2: 2 10 + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/data.table.R b/R/data.table.R index dcb18ad1f9..e671a208df 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -766,6 +766,12 @@ replace_dot_alias = function(e) { # may evaluate to NULL | character() | "" | list(), likely a result of a user expression where no-grouping is one case being loop'd through bysubl = as.list.default(bysub) bysuborig = bysub + if (".I" %in% bysubl) { #1732 + if (!is.symbol(bysub) && (length(bysubl)!=2L || !is.symbol(bysubl[[2L]]) || !(bysubl[[1L]] %chin% c(".","c","list")))) + stopf("'by' contains .I but only the following are currently supported: by=.I, by=.(.I), by=c(.I), by=list(.I)") + bysub = if (is.null(irows)) seq_len(nrow(x)) else irows + bysuborig = as.symbol("I") + } if (is.name(bysub) && !(bysub %chin% names_x)) { # TO DO: names(x),names(i),and i. and x. 
prefixes bysub = eval(bysub, parent.frame(), parent.frame()) # fix for # 5106 - http://stackoverflow.com/questions/19983423/why-by-on-a-vector-not-from-a-data-table-column-is-very-slow diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e510b3292c..fc7e14f753 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18688,3 +18688,17 @@ ydt[, ret := rnorm(.N)] f = shift test(2233.33, copy(ydt)[, (ycols) := shift(ret, yn, type = "lead"), by = symbol, verbose=TRUE], copy(ydt)[, (ycols) := f(ret, yn, type = "lead"), by = symbol], output="GForce optimized j to") +# support by=.I; #1732 +DT = data.table(V1=1:5, V2=3:7, V3=5:1) +test(2234.1, DT[, min(.SD), by=.I], setnames(DT[, min(.SD), by=1:nrow(DT)], "nrow", "I")) +test(2234.2, DT[, min(.SD), by=.I], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +# works also with i +test(2234.3, DT[c(1,3,5), min(.SD), by=.I], data.table(I=c(1L, 3L, 5L), V1=c(1L, 3L, 1L))) +test(2234.4, DT[c(4, NA), min(.SD), by=.I], data.table(I=c(4L, NA), V1=c(2L, NA))) +# other writing styles of by=.I +test(2234.5, DT[, min(.SD), by=.(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.6, DT[, min(.SD), by=list(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.7, DT[, min(.SD), by=c(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.8, DT[, min(.SD), by=.I%%2L], error="by.*contains .I.*supported") # would be nice to support in future; i.e. by odd/even rows, and by=(.I+1L)%/%2L for pairs of rows; i.e. any expression of .I +test(2234.9, DT[, min(.SD), by=somefun(.I)], error="by.*contains .I.*supported") + diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index 1f4e1615c0..c96cbef5c4 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -10,7 +10,7 @@ \alias{.NGRP} \title{ Special symbols } \description{ - \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. 
\code{.N} can be used in \code{i} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. + \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. \code{.I} can be used in \code{by} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. \code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}. } \details{ @@ -22,13 +22,13 @@ \item{\code{.SD} is a \code{data.table} containing the \bold{S}ubset of \code{x}'s \bold{D}ata for each group, excluding any columns used in \code{by} (or \code{keyby}).} \item{\code{.BY} is a \code{list} containing a length 1 vector for each item in \code{by}. This can be useful when \code{by} is not known in advance. The \code{by} variables are also available to \code{j} directly by name; useful for example for titles of graphs if \code{j} is a plot command, or to branch with \code{if()} depending on the value of a group variable.} \item{\code{.N} is an integer, length 1, containing the number of rows in the group. This may be useful when the column names are not known in advance and for convenience generally. When grouping by \code{i}, \code{.N} is the number of rows in \code{x} matched to, for each row of \code{i}, regardless of whether \code{nomatch} is \code{NA} or \code{NULL}. It is renamed to \code{N} (no dot) in the result (otherwise a column called \code{".N"} could conflict with the \code{.N} variable, see FAQ 4.6 for more details and example), unless it is explicitly named; e.g., \code{DT[,list(total=.N),by=a]}.} - \item{\code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}.} + \item{\code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. 
While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}. When used in \code{by} (i.e. \code{by=.I}), each row forms its own group so that \code{j} is evaluated rowwise; the grouping column is named \code{I} (no dot) in the result.} \item{\code{.GRP} is an integer, length 1, containing a simple group counter. 1 for the 1st group, 2 for the 2nd, etc.} \item{\code{.NGRP} is an integer, length 1, containing the number of groups. } } \code{.EACHI} is defined as \code{NULL} but its value is not used. Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details. - + Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples. } \seealso{ @@ -58,5 +58,9 @@ X[, DT[.BY, y, on="x"], by=x] # join within each group # .N can be different in i and j DT[{cat(sprintf('in i, .N is \%d\n', .N)); a < .N/2}, {cat(sprintf('in j, .N is \%d\n', .N)); mean(a)}] + +# .I can be different in j and by, enabling rowwise operations in by +DT[, .(.I, min(.SD[,-1]))] +DT[, .(min(.SD[,-1])), by=.I] } \keyword{ data }