diff --git a/NEWS.md b/NEWS.md index 29cfad7509..5e47f7cd66 100644 --- a/NEWS.md +++ b/NEWS.md @@ -65,10 +65,22 @@ 13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR. -14. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). +14. `uniqueN()` is now faster on logical vectors. Thanks to Hugh Parsonage for [PR#2648](https://github.com/Rdatatable/data.table/pull/2648). + ``` + N = 1e9 + was now + x = c(TRUE,FALSE,NA,rep(TRUE,N)) + uniqueN(x) == 3 5.4s 0.00s + x = c(TRUE,rep(FALSE,N), NA) + uniqueN(x,na.rm=TRUE) == 2 5.4s 0.00s + x = c(rep(TRUE,N),FALSE,NA) + uniqueN(x) == 3 6.7s 0.38s + ``` + +15. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementation. -15. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation. +16. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation. 
#### BUG FIXES diff --git a/R/duplicated.R b/R/duplicated.R index 1f182870d8..6e56b5aba1 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -142,7 +142,10 @@ uniqueN <- function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) if (is.null(x)) return(0L) if (!is.atomic(x) && !is.data.frame(x)) stop("x must be an atomic vector or data.frames/data.tables") - if (is.atomic(x)) x = as_list(x) + if (is.atomic(x)) { + if (is.logical(x)) return(.Call(CuniqueNlogical, x, na.rm=na.rm)) + x = as_list(x) + } if (is.null(by)) by = seq_along(x) o = forderv(x, by=by, retGrp=TRUE, na.last=if (!na.rm) FALSE else NA) starts = attr(o, 'starts') diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dafb9c85f6..0325874d45 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6486,6 +6486,22 @@ DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6)) test(1475.1, uniqueN(DT), 10L) test(1475.2, DT[, .(uN=uniqueN(.SD)), by=A], data.table(A=1:3, uN=c(3L,4L,3L))) +# specialized uniqueN for logical vectors, PR#2648 +test(1475.3, uniqueN(c(NA, TRUE, FALSE)), 3L) +test(1475.4, uniqueN(c(NA, TRUE, FALSE), na.rm = TRUE), 2L) +test(1475.5, uniqueN(c(TRUE, FALSE), na.rm = TRUE), 2L) +test(1475.6, uniqueN(c(TRUE, FALSE)), 2L) +test(1475.7, uniqueN(c(TRUE, NA)), 2L) +test(1475.8, uniqueN(c(TRUE, NA), na.rm=TRUE), 1L) +test(1475.9, uniqueN(c(FALSE, NA)), 2L) +test(1475.11, uniqueN(c(FALSE, NA), na.rm=TRUE), 1L) +test(1475.12, uniqueN(c(NA,NA)), 1L) +test(1475.13, uniqueN(c(NA,NA), na.rm=TRUE), 0L) +test(1475.14, uniqueN(NA), 1L) +test(1475.15, uniqueN(NA, na.rm=TRUE), 0L) +test(1475.16, uniqueN(logical()), 0L) +test(1475.17, uniqueN(logical(), na.rm=TRUE), 0L) + # preserve class attribute in GForce mean (and sum) DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day"))) test(1476.1, DT[, .(y=mean(y)), x], setDT(aggregate(y ~ x, DT, mean))) diff --git a/src/data.table.h b/src/data.table.h index 
1779045b9c..d10cdd3329 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -4,6 +4,7 @@ // #include <signal.h> // the debugging machinery + breakpoint aidee // raise(SIGINT); #include <stdint.h> // for uint64_t rather than unsigned long long +#include <stdbool.h> #include "myomp.h" // data.table depends on R>=3.0.0 when R_xlen_t was introduced diff --git a/src/init.c b/src/init.c index 9f67ff3b7a..9b5c5aa272 100644 --- a/src/init.c +++ b/src/init.c @@ -76,6 +76,7 @@ SEXP fsort(); SEXP inrange(); SEXP between(); SEXP hasOpenMP(); +SEXP uniqueNlogical(); // .Externals SEXP fastmean(); @@ -154,6 +155,7 @@ R_CallMethodDef callMethods[] = { {"Cinrange", (DL_FUNC) &inrange, -1}, {"Cbetween", (DL_FUNC) &between, -1}, {"ChasOpenMP", (DL_FUNC) &hasOpenMP, -1}, +{"CuniqueNlogical", (DL_FUNC) &uniqueNlogical, -1}, {NULL, NULL, 0} }; diff --git a/src/uniqlist.c b/src/uniqlist.c index 3958be99f7..478e04776e 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -228,3 +228,28 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul UNPROTECT(1); return(ans); } + +SEXP uniqueNlogical(SEXP x, SEXP narmArg) { + // single pass; short-circuit and return as soon as all 3 values are found + if (!isLogical(x)) error("x is not a logical vector"); + if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error("na.rm must be TRUE or FALSE"); + bool narm = LOGICAL(narmArg)[0]==1; + const R_xlen_t n = xlength(x); + if (n==0) + return ScalarInteger(0); // empty vector + Rboolean first = LOGICAL(x)[0]; + R_xlen_t i=0; + while (++i